bpf: add initial support for splitting map dscp value into ingress and egress
[project/qosify.git] / qosify-bpf.c
// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2021 Felix Fietkau <nbd@nbd.name>
 */
#define KBUILD_MODNAME "foo"
#include <uapi/linux/bpf.h>
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>
#include <uapi/linux/filter.h>
#include <uapi/linux/pkt_cls.h>
#include <linux/ip.h>
#include <net/ipv6.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
#include "qosify-bpf.h"

#define INET_ECN_MASK 3

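/*
 * Internal time is bpf_ktime_get_ns() >> 24, i.e. one unit is
 * 2^24 ns ~= 16.8 ms. At that scale, FLOW_CHECK_INTERVAL
 * (1e9 >> 24 = 59 units) is roughly one second and FLOW_TIMEOUT
 * (30e9 >> 24 = 1788 units) roughly 30 seconds.
 */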
#define FLOW_CHECK_INTERVAL	((u32)((1000000000ULL) >> 24))
#define FLOW_TIMEOUT		((u32)((30ULL * 1000000000ULL) >> 24))
#define FLOW_BULK_TIMEOUT	5

#define EWMA_SHIFT		12

const volatile static uint32_t module_flags = 0;

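/*
 * Per-flow state, keyed by skb hash. last_update is in the 16.8 ms
 * units above (0 means "never updated"), pkt_len_avg is an EWMA with
 * EWMA_SHIFT fractional bits, and dscp caches the detected value
 * (0xff = none).
 */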
struct flow_bucket {
	__u32 last_update;
	__u32 pkt_len_avg;
	__u16 pkt_count;
	__u8 dscp;
	__u8 bulk_timeout;
};

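/*
 * All maps are pinned so the userspace daemon can fill and update
 * them: a single-entry config array, one 64K-entry DSCP array per
 * TCP/UDP port space, an LRU hash for per-flow state, and hash maps
 * for per-address IPv4/IPv6 overrides.
 */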
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(pinning, 1);
	__type(key, __u32);
	__type(value, struct qosify_config);
	__uint(max_entries, 1);
} config SEC(".maps");

typedef struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(pinning, 1);
	__type(key, __u32);
	__type(value, struct qosify_dscp_val);
	__uint(max_entries, 1 << 16);
} port_array_t;

struct {
	__uint(type, BPF_MAP_TYPE_LRU_HASH);
	__uint(pinning, 1);
	__type(key, __u32);
	__uint(value_size, sizeof(struct flow_bucket));
	__uint(max_entries, QOSIFY_FLOW_BUCKETS);
} flow_map SEC(".maps");

port_array_t tcp_ports SEC(".maps");
port_array_t udp_ports SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(pinning, 1);
	__uint(key_size, sizeof(struct in_addr));
	__type(value, struct qosify_ip_map_val);
	__uint(max_entries, 100000);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} ipv4_map SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(pinning, 1);
	__uint(key_size, sizeof(struct in6_addr));
	__type(value, struct qosify_ip_map_val);
	__uint(max_entries, 100000);
	__uint(map_flags, BPF_F_NO_PREALLOC);
} ipv6_map SEC(".maps");

static struct qosify_config *get_config(void)
{
	__u32 key = 0;

	return bpf_map_lookup_elem(&config, &key);
}

static __always_inline int proto_is_vlan(__u16 h_proto)
{
	return !!(h_proto == bpf_htons(ETH_P_8021Q) ||
		  h_proto == bpf_htons(ETH_P_8021AD));
}

static __always_inline int proto_is_ip(__u16 h_proto)
{
	return !!(h_proto == bpf_htons(ETH_P_IP) ||
		  h_proto == bpf_htons(ETH_P_IPV6));
}

static __always_inline void *skb_ptr(struct __sk_buff *skb, __u32 offset)
{
	void *start = (void *)(unsigned long long)skb->data;

	return start + offset;
}

static __always_inline void *skb_end_ptr(struct __sk_buff *skb)
{
	return (void *)(unsigned long long)skb->data_end;
}

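/*
 * Bounds check for direct packet access: the verifier only accepts
 * loads through a packet pointer after it has been compared against
 * skb->data_end.
 */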
static __always_inline int skb_check(struct __sk_buff *skb, void *ptr)
{
	if (ptr > skb_end_ptr(skb))
		return -1;

	return 0;
}

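/*
 * Returns the current time in 16.8 ms units, mapping 0 to 1 so that
 * a zero flow_bucket::last_update can mean "never updated".
 */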
static __always_inline __u32 cur_time(void)
{
	__u32 val = bpf_ktime_get_ns() >> 24;

	if (!val)
		val = 1;

	return val;
}

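/*
 * Fixed-point EWMA with EWMA_SHIFT fractional bits:
 * avg' = 3/4 * avg + 1/4 * val. The stored average keeps the
 * fractional bits; the return value is scaled back to input units.
 */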
static __always_inline __u32 ewma(__u32 *avg, __u32 val)
{
	if (*avg)
		*avg = (*avg * 3) / 4 + (val << EWMA_SHIFT) / 4;
	else
		*avg = val << EWMA_SHIFT;

	return *avg >> EWMA_SHIFT;
}

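/*
 * Core of this change: each map entry now carries a separate DSCP
 * value for ingress and egress, selected by the direction the
 * classifier instance was loaded for.
 */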
static __always_inline __u8 dscp_val(struct qosify_dscp_val *val, bool ingress)
{
	__u8 ival = val->ingress;
	__u8 eval = val->egress;

	return ingress ? ival : eval;
}

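/*
 * Rewrites the IPv4 TOS field, keeping the bits in @mask (the ECN
 * bits here), and incrementally patches the header checksum the same
 * way as the kernel's ipv4_change_dsfield(). Unless @force is set,
 * a packet that already has any masked bits set is left alone.
 */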
static __always_inline void
ipv4_change_dsfield(struct iphdr *iph, __u8 mask, __u8 value, bool force)
{
	__u32 check = bpf_ntohs(iph->check);
	__u8 dsfield;

	if ((iph->tos & mask) && !force)
		return;

	dsfield = (iph->tos & mask) | value;
	if (iph->tos == dsfield)
		return;

	check += iph->tos;
	if ((check + 1) >> 16)
		check = (check + 1) & 0xffff;
	check -= dsfield;
	check += check >> 16;
	iph->check = bpf_htons(check);
	iph->tos = dsfield;
}

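/*
 * IPv6 counterpart: the traffic class spans bits 4-11 of the first
 * 16-bit word (version is the top nibble, flow label follows), so
 * the htons'ed mask (mask << 4) | 0xf00f preserves version, flow
 * label and the masked traffic-class bits while the new value is
 * shifted into place. The early check converts from network byte
 * order so it tests the masked bits on either endianness.
 */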
static __always_inline void
ipv6_change_dsfield(struct ipv6hdr *ipv6h, __u8 mask, __u8 value, bool force)
{
	__u16 *p = (__u16 *)ipv6h;
	__u16 val;

	if (((bpf_ntohs(*p) >> 4) & mask) && !force)
		return;

	val = (*p & bpf_htons((((__u16)mask << 4) | 0xf00f))) |
	      bpf_htons((__u16)value << 4);
	if (val == *p)
		return;

	*p = val;
}

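/*
 * Walks the Ethernet header and up to two stacked VLAN tags (the
 * loop must be unrolled for the verifier) and returns the L3
 * ethertype, or -1 if the packet is too short.
 */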
static __always_inline int
parse_ethernet(struct __sk_buff *skb, __u32 *offset)
{
	struct ethhdr *eth;
	__u16 h_proto;
	int i;

	eth = skb_ptr(skb, *offset);
	if (skb_check(skb, eth + 1))
		return -1;

	h_proto = eth->h_proto;
	*offset += sizeof(*eth);

#pragma unroll
	for (i = 0; i < 2; i++) {
		struct vlan_hdr *vlh = skb_ptr(skb, *offset);

		if (!proto_is_vlan(h_proto))
			break;

		if (skb_check(skb, vlh + 1))
			return -1;

		h_proto = vlh->h_vlan_encapsulated_proto;
		*offset += sizeof(*vlh);
	}

	return h_proto;
}

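/*
 * Looks up a DSCP value for the L4 header: ICMP gets the configured
 * dscp_icmp, TCP/UDP are keyed by source port on ingress and
 * destination port on egress, and any other protocol falls back to
 * the port-0 entry of the UDP table. Port keys are taken in network
 * byte order.
 */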
static void
parse_l4proto(struct qosify_config *config, struct __sk_buff *skb,
	      __u32 offset, __u8 proto, __u8 *dscp_out, bool ingress)
{
	struct udphdr *udp;
	__u32 key;
	struct qosify_dscp_val *value;

	udp = skb_ptr(skb, offset);
	if (skb_check(skb, &udp->len))
		return;

	if (config && (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)) {
		*dscp_out = dscp_val(&config->dscp_icmp, ingress);
		return;
	}

	if (ingress)
		key = udp->source;
	else
		key = udp->dest;

	if (proto == IPPROTO_TCP) {
		value = bpf_map_lookup_elem(&tcp_ports, &key);
	} else {
		if (proto != IPPROTO_UDP)
			key = 0;

		value = bpf_map_lookup_elem(&udp_ports, &key);
	}

	if (!value)
		return;

	*dscp_out = dscp_val(value, ingress);
}

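/*
 * Flow-based detection, only run when the current DSCP came from a
 * default entry (QOSIFY_DSCP_DEFAULT_FLAG). Roughly once per second
 * a flow is classified as bulk once it exceeds bulk_trigger_pps
 * packets per second (sticky for bulk_trigger_timeout intervals), or
 * as priority traffic when its average packet length stays below
 * prio_max_avg_pkt_len.
 */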
static __always_inline void
check_flow(struct qosify_config *config, struct __sk_buff *skb,
	   uint8_t *dscp, bool ingress)
{
	struct flow_bucket flow_data;
	struct flow_bucket *flow;
	__s32 delta;
	__u32 hash;
	__u32 time;

	if (!(*dscp & QOSIFY_DSCP_DEFAULT_FLAG))
		return;

	if (!config)
		return;

	if (!config->bulk_trigger_pps &&
	    !config->prio_max_avg_pkt_len)
		return;

	time = cur_time();
	hash = bpf_get_hash_recalc(skb);
	flow = bpf_map_lookup_elem(&flow_map, &hash);
	if (!flow) {
		memset(&flow_data, 0, sizeof(flow_data));
		bpf_map_update_elem(&flow_map, &hash, &flow_data, BPF_ANY);
		flow = bpf_map_lookup_elem(&flow_map, &hash);
		if (!flow)
			return;
	}

	if (!flow->last_update)
		goto reset;

	delta = time - flow->last_update;
	if ((u32)delta > FLOW_TIMEOUT)
		goto reset;

	if (delta >= FLOW_CHECK_INTERVAL) {
		if (flow->bulk_timeout) {
			flow->bulk_timeout--;
			if (!flow->bulk_timeout)
				flow->dscp = 0xff;
		}

		goto clear;
	}

	if (flow->pkt_count < 0xffff)
		flow->pkt_count++;

	if (config->bulk_trigger_pps &&
	    flow->pkt_count > config->bulk_trigger_pps) {
		flow->dscp = dscp_val(&config->dscp_bulk, ingress);
		flow->bulk_timeout = config->bulk_trigger_timeout;
	}

out:
	if (config->prio_max_avg_pkt_len &&
	    flow->dscp != dscp_val(&config->dscp_bulk, ingress)) {
		if (ewma(&flow->pkt_len_avg, skb->len) <
		    config->prio_max_avg_pkt_len)
			flow->dscp = dscp_val(&config->dscp_prio, ingress);
		else
			flow->dscp = 0xff;
	}

	if (flow->dscp != 0xff)
		*dscp = flow->dscp;

	return;

reset:
	flow->dscp = 0xff;
	flow->pkt_len_avg = 0;
clear:
	flow->pkt_count = 1;
	flow->last_update = time;

	goto out;
}

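/*
 * Classification order for IPv4: L4 port lookup first, then a
 * per-address override from ipv4_map, then the UDP port-0 fallback.
 * Source address/port is used on ingress, destination on egress.
 * Unless the winning entry has QOSIFY_DSCP_FALLBACK_FLAG set, the
 * resulting DSCP (shifted past the ECN bits) is written
 * unconditionally.
 */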
static __always_inline void
parse_ipv4(struct __sk_buff *skb, __u32 *offset, bool ingress)
{
	struct qosify_config *config;
	struct qosify_ip_map_val *ip_val;
	struct qosify_dscp_val *value;
	const __u32 zero_port = 0;
	struct iphdr *iph;
	__u8 dscp = 0xff;
	__u8 ipproto;
	int hdr_len;
	void *key;
	bool force;

	config = get_config();

	iph = skb_ptr(skb, *offset);
	if (skb_check(skb, iph + 1))
		return;

	hdr_len = iph->ihl * 4;
	if (bpf_skb_pull_data(skb, *offset + hdr_len + sizeof(struct udphdr)))
		return;

	iph = skb_ptr(skb, *offset);
	*offset += hdr_len;

	if (skb_check(skb, (void *)(iph + 1)))
		return;

	ipproto = iph->protocol;
	parse_l4proto(config, skb, *offset, ipproto, &dscp, ingress);

	if (ingress)
		key = &iph->saddr;
	else
		key = &iph->daddr;

	ip_val = bpf_map_lookup_elem(&ipv4_map, key);
	if (ip_val) {
		if (!ip_val->seen)
			ip_val->seen = 1;
		dscp = dscp_val(&ip_val->dscp, ingress);
	} else if (dscp == 0xff) {
		/* use udp port 0 entry as fallback for non-tcp/udp */
		value = bpf_map_lookup_elem(&udp_ports, &zero_port);
		if (value)
			dscp = dscp_val(value, ingress);
	}

	check_flow(config, skb, &dscp, ingress);

	force = !(dscp & QOSIFY_DSCP_FALLBACK_FLAG);
	dscp &= GENMASK(5, 0);

	ipv4_change_dsfield(iph, INET_ECN_MASK, dscp << 2, force);
}

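/*
 * Same flow as parse_ipv4, with a fixed 40-byte header and ipv6_map
 * keyed by the full 128-bit address. Extension headers are not
 * walked; the port lookup uses whatever immediately follows the
 * fixed header.
 */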
static __always_inline void
parse_ipv6(struct __sk_buff *skb, __u32 *offset, bool ingress)
{
	struct qosify_config *config;
	struct qosify_ip_map_val *ip_val;
	struct qosify_dscp_val *value;
	const __u32 zero_port = 0;
	struct ipv6hdr *iph;
	__u8 dscp = 0xff;
	__u8 ipproto;
	void *key;
	bool force;

	config = get_config();

	if (bpf_skb_pull_data(skb, *offset + sizeof(*iph) + sizeof(struct udphdr)))
		return;

	iph = skb_ptr(skb, *offset);
	*offset += sizeof(*iph);

	if (skb_check(skb, (void *)(iph + 1)))
		return;

	ipproto = iph->nexthdr;
	if (ingress)
		key = &iph->saddr;
	else
		key = &iph->daddr;

	parse_l4proto(config, skb, *offset, ipproto, &dscp, ingress);

	ip_val = bpf_map_lookup_elem(&ipv6_map, key);
	if (ip_val) {
		if (!ip_val->seen)
			ip_val->seen = 1;
		dscp = dscp_val(&ip_val->dscp, ingress);
	} else if (dscp == 0xff) {
		/* use udp port 0 entry as fallback for non-tcp/udp */
		value = bpf_map_lookup_elem(&udp_ports, &zero_port);
		if (value)
			dscp = dscp_val(value, ingress);
	}

	check_flow(config, skb, &dscp, ingress);

	force = !(dscp & QOSIFY_DSCP_FALLBACK_FLAG);
	dscp &= GENMASK(5, 0);

	ipv6_change_dsfield(iph, INET_ECN_MASK, dscp << 2, force);
}

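/*
 * TC entry point. module_flags is filled in by the loader:
 * QOSIFY_INGRESS selects which half of each DSCP map entry applies,
 * QOSIFY_IP_ONLY skips Ethernet parsing for L3 interfaces. The
 * verdict is always TC_ACT_OK; the program only rewrites the DSCP
 * field.
 */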
SEC("classifier")
int classify(struct __sk_buff *skb)
{
	bool ingress = module_flags & QOSIFY_INGRESS;
	__u32 offset = 0;
	int type;

	if (module_flags & QOSIFY_IP_ONLY)
		type = skb->protocol;
	else
		type = parse_ethernet(skb, &offset);

	if (type == bpf_htons(ETH_P_IP))
		parse_ipv4(skb, &offset, ingress);
	else if (type == bpf_htons(ETH_P_IPV6))
		parse_ipv6(skb, &offset, ingress);

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";