kernel: improve GRO performance
[openwrt/staging/pepe2k.git] / target / linux / generic / pending-6.1 / 680-net-add-TCP-fraglist-GRO-support.patch
1 From: Felix Fietkau <nbd@nbd.name>
2 Date: Tue, 23 Apr 2024 11:23:03 +0200
3 Subject: [PATCH] net: add TCP fraglist GRO support
4
5 When forwarding TCP after GRO, software segmentation is very expensive,
6 especially when the checksum needs to be recalculated.
7 One case where that's currently unavoidable is when routing packets over
8 PPPoE. Performance improves significantly when using fraglist GRO
9 implemented in the same way as for UDP.
10
11 Here's a measurement of running 2 TCP streams through a MediaTek MT7622
12 device (2-core Cortex-A53), which runs NAT with flow offload enabled from
13 one ethernet port to PPPoE on another ethernet port + cake qdisc set to
14 1Gbps.
15
16 rx-gro-list off: 630 Mbit/s, CPU 35% idle
17 rx-gro-list on: 770 Mbit/s, CPU 40% idle
18
19 Signed-off-by: Felix Fietkau <nbd@nbd.name>
20 ---
21
22 --- a/include/net/gro.h
23 +++ b/include/net/gro.h
24 @@ -424,6 +424,7 @@ static inline __wsum ip6_gro_compute_pse
25 }
26
27 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
28 +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
29
30 /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
31 static inline void gro_normal_list(struct napi_struct *napi)
32 @@ -446,5 +447,48 @@ static inline void gro_normal_one(struct
33 gro_normal_list(napi);
34 }
35
36 +/* This function is the alternative of 'inet_iif' and 'inet_sdif'
37 + * functions in case we can not rely on fields of IPCB.
38 + *
39 + * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized.
40 + * The caller must hold the RCU read lock.
41 + */
42 +static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif)
43 +{
44 + *iif = inet_iif(skb) ?: skb->dev->ifindex;
45 + *sdif = 0;
46 +
47 +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
48 + if (netif_is_l3_slave(skb->dev)) {
49 + struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev);
50 +
51 + *sdif = *iif;
52 + *iif = master ? master->ifindex : 0;
53 + }
54 +#endif
55 +}
56 +
57 +/* This function is the alternative of 'inet6_iif' and 'inet6_sdif'
58 + * functions in case we can not rely on fields of IP6CB.
59 + *
60 + * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized.
61 + * The caller must hold the RCU read lock.
62 + */
63 +static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif)
64 +{
65 + /* using skb->dev->ifindex because skb_dst(skb) is not initialized */
66 + *iif = skb->dev->ifindex;
67 + *sdif = 0;
68 +
69 +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
70 + if (netif_is_l3_slave(skb->dev)) {
71 + struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev);
72 +
73 + *sdif = *iif;
74 + *iif = master ? master->ifindex : 0;
75 + }
76 +#endif
77 +}
78 +
79
80 #endif /* _NET_IPV6_GRO_H */
81 --- a/include/net/tcp.h
82 +++ b/include/net/tcp.h
83 @@ -2057,7 +2057,10 @@ void tcp_v4_destroy_sock(struct sock *sk
84
85 struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
86 netdev_features_t features);
87 -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
88 +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb);
89 +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
90 +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
91 + struct tcphdr *th);
92 INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
93 INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
94 INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
95 --- a/net/core/gro.c
96 +++ b/net/core/gro.c
97 @@ -290,6 +290,33 @@ done:
98 return 0;
99 }
100
101 +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
102 +{
103 + if (unlikely(p->len + skb->len >= 65536))
104 + return -E2BIG;
105 +
106 + if (NAPI_GRO_CB(p)->last == p)
107 + skb_shinfo(p)->frag_list = skb;
108 + else
109 + NAPI_GRO_CB(p)->last->next = skb;
110 +
111 + skb_pull(skb, skb_gro_offset(skb));
112 +
113 + NAPI_GRO_CB(p)->last = skb;
114 + NAPI_GRO_CB(p)->count++;
115 + p->data_len += skb->len;
116 +
117 + /* sk ownership - if any - completely transferred to the aggregated packet */
118 + skb->destructor = NULL;
119 + skb->sk = NULL;
120 + p->truesize += skb->truesize;
121 + p->len += skb->len;
122 +
123 + NAPI_GRO_CB(skb)->same_flow = 1;
124 +
125 + return 0;
126 +}
127 +
128
129 static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
130 {
131 --- a/net/ipv4/tcp_offload.c
132 +++ b/net/ipv4/tcp_offload.c
133 @@ -27,6 +27,68 @@ static void tcp_gso_tstamp(struct sk_buf
134 }
135 }
136
137 +static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
138 + __be32 *oldip, __be32 *newip,
139 + __be16 *oldport, __be16 *newport)
140 +{
141 + struct tcphdr *th;
142 + struct iphdr *iph;
143 +
144 + if (*oldip == *newip && *oldport == *newport)
145 + return;
146 +
147 + th = tcp_hdr(seg);
148 + iph = ip_hdr(seg);
149 +
150 + inet_proto_csum_replace4(&th->check, seg, *oldip, *newip, true);
151 + inet_proto_csum_replace2(&th->check, seg, *oldport, *newport, false);
152 + *oldport = *newport;
153 +
154 + csum_replace4(&iph->check, *oldip, *newip);
155 + *oldip = *newip;
156 +}
157 +
158 +static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
159 +{
160 + struct sk_buff *seg;
161 + struct tcphdr *th, *th2;
162 + struct iphdr *iph, *iph2;
163 +
164 + seg = segs;
165 + th = tcp_hdr(seg);
166 + iph = ip_hdr(seg);
167 + th2 = tcp_hdr(seg->next);
168 + iph2 = ip_hdr(seg->next);
169 +
170 + if (!(*(u32 *)&th->source ^ *(u32 *)&th2->source) &&
171 + iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
172 + return segs;
173 +
174 + while ((seg = seg->next)) {
175 + th2 = tcp_hdr(seg);
176 + iph2 = ip_hdr(seg);
177 +
178 + __tcpv4_gso_segment_csum(seg,
179 + &iph2->saddr, &iph->saddr,
180 + &th2->source, &th->source);
181 + __tcpv4_gso_segment_csum(seg,
182 + &iph2->daddr, &iph->daddr,
183 + &th2->dest, &th->dest);
184 + }
185 +
186 + return segs;
187 +}
188 +
189 +static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
190 + netdev_features_t features)
191 +{
192 + skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
193 + if (IS_ERR(skb))
194 + return skb;
195 +
196 + return __tcpv4_gso_segment_list_csum(skb);
197 +}
198 +
199 static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
200 netdev_features_t features)
201 {
202 @@ -36,6 +98,9 @@ static struct sk_buff *tcp4_gso_segment(
203 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
204 return ERR_PTR(-EINVAL);
205
206 + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
207 + return __tcp4_gso_segment_list(skb, features);
208 +
209 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
210 const struct iphdr *iph = ip_hdr(skb);
211 struct tcphdr *th = tcp_hdr(skb);
212 @@ -177,61 +242,76 @@ out:
213 return segs;
214 }
215
216 -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
217 +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
218 {
219 - struct sk_buff *pp = NULL;
220 + struct tcphdr *th2;
221 struct sk_buff *p;
222 +
223 + list_for_each_entry(p, head, list) {
224 + if (!NAPI_GRO_CB(p)->same_flow)
225 + continue;
226 +
227 + th2 = tcp_hdr(p);
228 + if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
229 + NAPI_GRO_CB(p)->same_flow = 0;
230 + continue;
231 + }
232 +
233 + return p;
234 + }
235 +
236 + return NULL;
237 +}
238 +
239 +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
240 +{
241 + unsigned int thlen, hlen, off;
242 struct tcphdr *th;
243 - struct tcphdr *th2;
244 - unsigned int len;
245 - unsigned int thlen;
246 - __be32 flags;
247 - unsigned int mss = 1;
248 - unsigned int hlen;
249 - unsigned int off;
250 - int flush = 1;
251 - int i;
252
253 off = skb_gro_offset(skb);
254 hlen = off + sizeof(*th);
255 th = skb_gro_header(skb, hlen, off);
256 if (unlikely(!th))
257 - goto out;
258 + return NULL;
259
260 thlen = th->doff * 4;
261 if (thlen < sizeof(*th))
262 - goto out;
263 + return NULL;
264
265 hlen = off + thlen;
266 if (skb_gro_header_hard(skb, hlen)) {
267 th = skb_gro_header_slow(skb, hlen, off);
268 if (unlikely(!th))
269 - goto out;
270 + return NULL;
271 }
272
273 skb_gro_pull(skb, thlen);
274
275 - len = skb_gro_len(skb);
276 - flags = tcp_flag_word(th);
277 -
278 - list_for_each_entry(p, head, list) {
279 - if (!NAPI_GRO_CB(p)->same_flow)
280 - continue;
281 + return th;
282 +}
283
284 - th2 = tcp_hdr(p);
285 +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
286 + struct tcphdr *th)
287 +{
288 + unsigned int thlen = th->doff * 4;
289 + struct sk_buff *pp = NULL;
290 + struct sk_buff *p;
291 + struct tcphdr *th2;
292 + unsigned int len;
293 + __be32 flags;
294 + unsigned int mss = 1;
295 + int flush = 1;
296 + int i;
297
298 - if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
299 - NAPI_GRO_CB(p)->same_flow = 0;
300 - continue;
301 - }
302 + len = skb_gro_len(skb);
303 + flags = tcp_flag_word(th);
304
305 - goto found;
306 - }
307 - p = NULL;
308 - goto out_check_final;
309 + p = tcp_gro_lookup(head, th);
310 + if (!p)
311 + goto out_check_final;
312
313 -found:
314 /* Include the IP ID check below from the inner most IP hdr */
315 + th2 = tcp_hdr(p);
316 flush = NAPI_GRO_CB(p)->flush;
317 flush |= (__force int)(flags & TCP_FLAG_CWR);
318 flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
319 @@ -268,6 +348,19 @@ found:
320 flush |= p->decrypted ^ skb->decrypted;
321 #endif
322
323 + if (NAPI_GRO_CB(p)->is_flist) {
324 + flush |= (__force int)(flags ^ tcp_flag_word(th2));
325 + flush |= skb->ip_summed != p->ip_summed;
326 + flush |= skb->csum_level != p->csum_level;
327 + flush |= !pskb_may_pull(skb, skb_gro_offset(skb));
328 + flush |= NAPI_GRO_CB(p)->count >= 64;
329 +
330 + if (flush || skb_gro_receive_list(p, skb))
331 + mss = 1;
332 +
333 + goto out_check_final;
334 + }
335 +
336 if (flush || skb_gro_receive(p, skb)) {
337 mss = 1;
338 goto out_check_final;
339 @@ -289,7 +382,6 @@ out_check_final:
340 if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
341 pp = p;
342
343 -out:
344 NAPI_GRO_CB(skb)->flush |= (flush != 0);
345
346 return pp;
347 @@ -315,18 +407,56 @@ int tcp_gro_complete(struct sk_buff *skb
348 }
349 EXPORT_SYMBOL(tcp_gro_complete);
350
351 +static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
352 + struct tcphdr *th)
353 +{
354 + const struct iphdr *iph = skb_gro_network_header(skb);
355 + struct net *net = dev_net(skb->dev);
356 + struct sk_buff *p;
357 + struct sock *sk;
358 + int iif, sdif;
359 +
360 + if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
361 + return;
362 +
363 + p = tcp_gro_lookup(head, th);
364 + if (p) {
365 + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
366 + return;
367 + }
368 +
369 + inet_get_iif_sdif(skb, &iif, &sdif);
370 + sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
371 + iph->saddr, th->source,
372 + iph->daddr, ntohs(th->dest),
373 + iif, sdif);
374 + NAPI_GRO_CB(skb)->is_flist = !sk;
375 + if (sk)
376 + sock_put(sk);
377 +}
378 +
379 INDIRECT_CALLABLE_SCOPE
380 struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
381 {
382 + struct tcphdr *th;
383 +
384 /* Don't bother verifying checksum if we're going to flush anyway. */
385 if (!NAPI_GRO_CB(skb)->flush &&
386 skb_gro_checksum_validate(skb, IPPROTO_TCP,
387 - inet_gro_compute_pseudo)) {
388 - NAPI_GRO_CB(skb)->flush = 1;
389 - return NULL;
390 - }
391 + inet_gro_compute_pseudo))
392 + goto flush;
393 +
394 + th = tcp_gro_pull_header(skb);
395 + if (!th)
396 + goto flush;
397
398 - return tcp_gro_receive(head, skb);
399 + tcp4_check_fraglist_gro(head, skb, th);
400 +
401 + return tcp_gro_receive(head, skb, th);
402 +
403 +flush:
404 + NAPI_GRO_CB(skb)->flush = 1;
405 + return NULL;
406 }
407
408 INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
409 @@ -334,6 +464,15 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_com
410 const struct iphdr *iph = ip_hdr(skb);
411 struct tcphdr *th = tcp_hdr(skb);
412
413 + if (NAPI_GRO_CB(skb)->is_flist) {
414 + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
415 + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
416 +
417 + __skb_incr_checksum_unnecessary(skb);
418 +
419 + return 0;
420 + }
421 +
422 th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
423 iph->daddr, 0);
424 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
425 --- a/net/ipv4/udp_offload.c
426 +++ b/net/ipv4/udp_offload.c
427 @@ -425,33 +425,6 @@ out:
428 return segs;
429 }
430
431 -static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
432 -{
433 - if (unlikely(p->len + skb->len >= 65536))
434 - return -E2BIG;
435 -
436 - if (NAPI_GRO_CB(p)->last == p)
437 - skb_shinfo(p)->frag_list = skb;
438 - else
439 - NAPI_GRO_CB(p)->last->next = skb;
440 -
441 - skb_pull(skb, skb_gro_offset(skb));
442 -
443 - NAPI_GRO_CB(p)->last = skb;
444 - NAPI_GRO_CB(p)->count++;
445 - p->data_len += skb->len;
446 -
447 - /* sk ownership - if any - completely transferred to the aggregated packet */
448 - skb->destructor = NULL;
449 - skb->sk = NULL;
450 - p->truesize += skb->truesize;
451 - p->len += skb->len;
452 -
453 - NAPI_GRO_CB(skb)->same_flow = 1;
454 -
455 - return 0;
456 -}
457 -
458
459 #define UDP_GRO_CNT_MAX 64
460 static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
461 --- a/net/ipv6/tcpv6_offload.c
462 +++ b/net/ipv6/tcpv6_offload.c
463 @@ -7,24 +7,65 @@
464 */
465 #include <linux/indirect_call_wrapper.h>
466 #include <linux/skbuff.h>
467 +#include <net/inet6_hashtables.h>
468 #include <net/gro.h>
469 #include <net/protocol.h>
470 #include <net/tcp.h>
471 #include <net/ip6_checksum.h>
472 #include "ip6_offload.h"
473
474 +static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
475 + struct tcphdr *th)
476 +{
477 +#if IS_ENABLED(CONFIG_IPV6)
478 + const struct ipv6hdr *hdr = skb_gro_network_header(skb);
479 + struct net *net = dev_net(skb->dev);
480 + struct sk_buff *p;
481 + struct sock *sk;
482 + int iif, sdif;
483 +
484 + if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
485 + return;
486 +
487 + p = tcp_gro_lookup(head, th);
488 + if (p) {
489 + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
490 + return;
491 + }
492 +
493 + inet6_get_iif_sdif(skb, &iif, &sdif);
494 + sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
495 + &hdr->saddr, th->source,
496 + &hdr->daddr, ntohs(th->dest),
497 + iif, sdif);
498 + NAPI_GRO_CB(skb)->is_flist = !sk;
499 + if (sk)
500 + sock_put(sk);
501 +#endif /* IS_ENABLED(CONFIG_IPV6) */
502 +}
503 +
504 INDIRECT_CALLABLE_SCOPE
505 struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
506 {
507 + struct tcphdr *th;
508 +
509 /* Don't bother verifying checksum if we're going to flush anyway. */
510 if (!NAPI_GRO_CB(skb)->flush &&
511 skb_gro_checksum_validate(skb, IPPROTO_TCP,
512 - ip6_gro_compute_pseudo)) {
513 - NAPI_GRO_CB(skb)->flush = 1;
514 - return NULL;
515 - }
516 + ip6_gro_compute_pseudo))
517 + goto flush;
518
519 - return tcp_gro_receive(head, skb);
520 + th = tcp_gro_pull_header(skb);
521 + if (!th)
522 + goto flush;
523 +
524 + tcp6_check_fraglist_gro(head, skb, th);
525 +
526 + return tcp_gro_receive(head, skb, th);
527 +
528 +flush:
529 + NAPI_GRO_CB(skb)->flush = 1;
530 + return NULL;
531 }
532
533 INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
534 @@ -32,6 +73,15 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_com
535 const struct ipv6hdr *iph = ipv6_hdr(skb);
536 struct tcphdr *th = tcp_hdr(skb);
537
538 + if (NAPI_GRO_CB(skb)->is_flist) {
539 + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6;
540 + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
541 +
542 + __skb_incr_checksum_unnecessary(skb);
543 +
544 + return 0;
545 + }
546 +
547 th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
548 &iph->daddr, 0);
549 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
550 @@ -50,6 +100,9 @@ static struct sk_buff *tcp6_gso_segment(
551 if (!pskb_may_pull(skb, sizeof(*th)))
552 return ERR_PTR(-EINVAL);
553
554 + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
555 + return skb_segment_list(skb, features, skb_mac_header_len(skb));
556 +
557 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
558 const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
559 struct tcphdr *th = tcp_hdr(skb);