target/linux/generic/pending-6.6/680-net-add-TCP-fraglist-GRO-support.patch
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 23 Apr 2024 11:23:03 +0200
Subject: [PATCH] net: add TCP fraglist GRO support

When forwarding TCP after GRO, software segmentation is very expensive,
especially when the checksum needs to be recalculated.
One case where that's currently unavoidable is when routing packets over
PPPoE. Performance improves significantly when using fraglist GRO
implemented in the same way as for UDP.

Here's a measurement of running 2 TCP streams through a MediaTek MT7622
device (2-core Cortex-A53), which runs NAT with flow offload enabled from
one ethernet port to PPPoE on another ethernet port + cake qdisc set to
1Gbps.

rx-gro-list off: 630 Mbit/s, CPU 35% idle
rx-gro-list on:  770 Mbit/s, CPU 40% idle
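
The fraglist GRO path is only taken when NETIF_F_GRO_FRAGLIST is set on
the receiving device; for testing it can be toggled at runtime with
ethtool ("eth0" here is a placeholder for the ingress interface):

  ethtool -K eth0 rx-gro-list on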

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

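Why segmentation of a fraglist skb is cheap: skb_segment_list() just
unchains the original frames, so __tcpv4_gso_segment_list_csum() below
only needs to patch addresses/ports (which NAT may have rewritten) and
can update the checksums incrementally instead of re-summing the whole
payload. A minimal userspace sketch of the RFC 1624 update performed by
csum_replace4(); csum_replace4_demo is an illustrative name, not kernel
API:

	#include <stdint.h>

	/* Patch a one's complement checksum (stored inverted in the header)
	 * when a 32-bit field changes from 'from' to 'to':
	 * HC' = ~(~HC + ~m + m')   (RFC 1624, eqn. 3)
	 */
	static uint16_t csum_replace4_demo(uint16_t check, uint32_t from,
					   uint32_t to)
	{
		uint32_t sum = (uint16_t)~check;	/* ~HC */

		sum += (~from >> 16) & 0xffff;		/* ~m, high word */
		sum += ~from & 0xffff;			/* ~m, low word */
		sum += to >> 16;			/* m', high word */
		sum += to & 0xffff;			/* m', low word */
		while (sum >> 16)			/* fold carries */
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)~sum;
	}
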
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -430,6 +430,7 @@ static inline __wsum ip6_gro_compute_pse
 }
 
 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
 
 /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
 static inline void gro_normal_list(struct napi_struct *napi)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2082,7 +2082,10 @@ void tcp_v4_destroy_sock(struct sock *sk
 
 struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 				netdev_features_t features);
-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
+struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb);
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+				struct tcphdr *th);
 INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
 INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
 INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -233,6 +233,33 @@ done:
 	return 0;
 }
 
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{
+	if (unlikely(p->len + skb->len >= 65536))
+		return -E2BIG;
+
+	if (NAPI_GRO_CB(p)->last == p)
+		skb_shinfo(p)->frag_list = skb;
+	else
+		NAPI_GRO_CB(p)->last->next = skb;
+
+	skb_pull(skb, skb_gro_offset(skb));
+
+	NAPI_GRO_CB(p)->last = skb;
+	NAPI_GRO_CB(p)->count++;
+	p->data_len += skb->len;
+
+	/* sk ownership - if any - completely transferred to the aggregated packet */
+	skb->destructor = NULL;
+	skb->sk = NULL;
+	p->truesize += skb->truesize;
+	p->len += skb->len;
+
+	NAPI_GRO_CB(skb)->same_flow = 1;
+
+	return 0;
+}
+
 
 static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
 {
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -28,6 +28,70 @@ static void tcp_gso_tstamp(struct sk_buf
 	}
 }
 
+static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
+				     __be32 *oldip, __be32 newip,
+				     __be16 *oldport, __be16 newport)
+{
+	struct tcphdr *th;
+	struct iphdr *iph;
+
+	if (*oldip == newip && *oldport == newport)
+		return;
+
+	th = tcp_hdr(seg);
+	iph = ip_hdr(seg);
+
+	inet_proto_csum_replace4(&th->check, seg, *oldip, newip, true);
+	inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
+	*oldport = newport;
+
+	csum_replace4(&iph->check, *oldip, newip);
+	*oldip = newip;
+}
+
+static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
+{
+	const struct tcphdr *th;
+	const struct iphdr *iph;
+	struct sk_buff *seg;
+	struct tcphdr *th2;
+	struct iphdr *iph2;
+
+	seg = segs;
+	th = tcp_hdr(seg);
+	iph = ip_hdr(seg);
+	th2 = tcp_hdr(seg->next);
+	iph2 = ip_hdr(seg->next);
+
+	if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
+	    iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
+		return segs;
+
+	while ((seg = seg->next)) {
+		th2 = tcp_hdr(seg);
+		iph2 = ip_hdr(seg);
+
+		__tcpv4_gso_segment_csum(seg,
+					 &iph2->saddr, iph->saddr,
+					 &th2->source, th->source);
+		__tcpv4_gso_segment_csum(seg,
+					 &iph2->daddr, iph->daddr,
+					 &th2->dest, th->dest);
+	}
+
+	return segs;
+}
+
+static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
+					       netdev_features_t features)
+{
+	skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+	if (IS_ERR(skb))
+		return skb;
+
+	return __tcpv4_gso_segment_list_csum(skb);
+}
+
 static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
 					netdev_features_t features)
 {
@@ -37,6 +101,9 @@ static struct sk_buff *tcp4_gso_segment(
 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
 		return ERR_PTR(-EINVAL);
 
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
+		return __tcp4_gso_segment_list(skb, features);
+
 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
 		const struct iphdr *iph = ip_hdr(skb);
 		struct tcphdr *th = tcp_hdr(skb);
@@ -178,61 +245,76 @@ out:
 	return segs;
 }
 
-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
 {
-	struct sk_buff *pp = NULL;
+	struct tcphdr *th2;
 	struct sk_buff *p;
+
+	list_for_each_entry(p, head, list) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		th2 = tcp_hdr(p);
+		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		return p;
+	}
+
+	return NULL;
+}
+
+struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
+{
+	unsigned int thlen, hlen, off;
 	struct tcphdr *th;
-	struct tcphdr *th2;
-	unsigned int len;
-	unsigned int thlen;
-	__be32 flags;
-	unsigned int mss = 1;
-	unsigned int hlen;
-	unsigned int off;
-	int flush = 1;
-	int i;
 
 	off = skb_gro_offset(skb);
 	hlen = off + sizeof(*th);
 	th = skb_gro_header(skb, hlen, off);
 	if (unlikely(!th))
-		goto out;
+		return NULL;
 
 	thlen = th->doff * 4;
 	if (thlen < sizeof(*th))
-		goto out;
+		return NULL;
 
 	hlen = off + thlen;
 	if (skb_gro_header_hard(skb, hlen)) {
 		th = skb_gro_header_slow(skb, hlen, off);
 		if (unlikely(!th))
-			goto out;
+			return NULL;
 	}
 
 	skb_gro_pull(skb, thlen);
 
-	len = skb_gro_len(skb);
-	flags = tcp_flag_word(th);
-
-	list_for_each_entry(p, head, list) {
-		if (!NAPI_GRO_CB(p)->same_flow)
-			continue;
+	return th;
+}
 
-		th2 = tcp_hdr(p);
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+				struct tcphdr *th)
+{
+	unsigned int thlen = th->doff * 4;
+	struct sk_buff *pp = NULL;
+	struct sk_buff *p;
+	struct tcphdr *th2;
+	unsigned int len;
+	__be32 flags;
+	unsigned int mss = 1;
+	int flush = 1;
+	int i;
 
-		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
-			NAPI_GRO_CB(p)->same_flow = 0;
-			continue;
-		}
+	len = skb_gro_len(skb);
+	flags = tcp_flag_word(th);
 
-		goto found;
-	}
-	p = NULL;
-	goto out_check_final;
+	p = tcp_gro_lookup(head, th);
+	if (!p)
+		goto out_check_final;
 
-found:
 	/* Include the IP ID check below from the inner most IP hdr */
+	th2 = tcp_hdr(p);
 	flush = NAPI_GRO_CB(p)->flush;
 	flush |= (__force int)(flags & TCP_FLAG_CWR);
 	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
@@ -269,6 +351,18 @@ found:
 	flush |= p->decrypted ^ skb->decrypted;
 #endif
 
+	if (unlikely(NAPI_GRO_CB(p)->is_flist)) {
+		flush |= (__force int)(flags ^ tcp_flag_word(th2));
+		flush |= skb->ip_summed != p->ip_summed;
+		flush |= skb->csum_level != p->csum_level;
+		flush |= NAPI_GRO_CB(p)->count >= 64;
+
+		if (flush || skb_gro_receive_list(p, skb))
+			mss = 1;
+
+		goto out_check_final;
+	}
+
 	if (flush || skb_gro_receive(p, skb)) {
 		mss = 1;
 		goto out_check_final;
@@ -290,7 +384,6 @@ out_check_final:
 	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
 		pp = p;
 
-out:
 	NAPI_GRO_CB(skb)->flush |= (flush != 0);
 
 	return pp;
@@ -314,18 +407,58 @@ void tcp_gro_complete(struct sk_buff *sk
 }
 EXPORT_SYMBOL(tcp_gro_complete);
 
+static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+				    struct tcphdr *th)
+{
+	const struct iphdr *iph;
+	struct sk_buff *p;
+	struct sock *sk;
+	struct net *net;
+	int iif, sdif;
+
+	if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
+		return;
+
+	p = tcp_gro_lookup(head, th);
+	if (p) {
+		NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+		return;
+	}
+
+	inet_get_iif_sdif(skb, &iif, &sdif);
+	iph = skb_gro_network_header(skb);
+	net = dev_net(skb->dev);
+	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+				       iph->saddr, th->source,
+				       iph->daddr, ntohs(th->dest),
+				       iif, sdif);
+	NAPI_GRO_CB(skb)->is_flist = !sk;
+	if (sk)
+		sock_put(sk);
+}
+
 INDIRECT_CALLABLE_SCOPE
 struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
+	struct tcphdr *th;
+
 	/* Don't bother verifying checksum if we're going to flush anyway. */
 	if (!NAPI_GRO_CB(skb)->flush &&
 	    skb_gro_checksum_validate(skb, IPPROTO_TCP,
-				      inet_gro_compute_pseudo)) {
-		NAPI_GRO_CB(skb)->flush = 1;
-		return NULL;
-	}
+				      inet_gro_compute_pseudo))
+		goto flush;
+
+	th = tcp_gro_pull_header(skb);
+	if (!th)
+		goto flush;
 
-	return tcp_gro_receive(head, skb);
+	tcp4_check_fraglist_gro(head, skb, th);
+
+	return tcp_gro_receive(head, skb, th);
+
+flush:
+	NAPI_GRO_CB(skb)->flush = 1;
+	return NULL;
 }
 
 INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
@@ -333,6 +466,15 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_com
 	const struct iphdr *iph = ip_hdr(skb);
 	struct tcphdr *th = tcp_hdr(skb);
 
+	if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
+		skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
+		skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+		__skb_incr_checksum_unnecessary(skb);
+
+		return 0;
+	}
+
 	th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
 				  iph->daddr, 0);
 	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -433,33 +433,6 @@ out:
 	return segs;
 }
 
-static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
-{
-	if (unlikely(p->len + skb->len >= 65536))
-		return -E2BIG;
-
-	if (NAPI_GRO_CB(p)->last == p)
-		skb_shinfo(p)->frag_list = skb;
-	else
-		NAPI_GRO_CB(p)->last->next = skb;
-
-	skb_pull(skb, skb_gro_offset(skb));
-
-	NAPI_GRO_CB(p)->last = skb;
-	NAPI_GRO_CB(p)->count++;
-	p->data_len += skb->len;
-
-	/* sk ownership - if any - completely transferred to the aggregated packet */
-	skb->destructor = NULL;
-	skb->sk = NULL;
-	p->truesize += skb->truesize;
-	p->len += skb->len;
-
-	NAPI_GRO_CB(skb)->same_flow = 1;
-
-	return 0;
-}
-
 
 #define UDP_GRO_CNT_MAX 64
 static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -7,24 +7,67 @@
  */
 #include <linux/indirect_call_wrapper.h>
 #include <linux/skbuff.h>
+#include <net/inet6_hashtables.h>
 #include <net/gro.h>
 #include <net/protocol.h>
 #include <net/tcp.h>
 #include <net/ip6_checksum.h>
 #include "ip6_offload.h"
 
+static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+				    struct tcphdr *th)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	const struct ipv6hdr *hdr;
+	struct sk_buff *p;
+	struct sock *sk;
+	struct net *net;
+	int iif, sdif;
+
+	if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
+		return;
+
+	p = tcp_gro_lookup(head, th);
+	if (p) {
+		NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+		return;
+	}
+
+	inet6_get_iif_sdif(skb, &iif, &sdif);
+	hdr = skb_gro_network_header(skb);
+	net = dev_net(skb->dev);
+	sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+					&hdr->saddr, th->source,
+					&hdr->daddr, ntohs(th->dest),
+					iif, sdif);
+	NAPI_GRO_CB(skb)->is_flist = !sk;
+	if (sk)
+		sock_put(sk);
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+}
+
 INDIRECT_CALLABLE_SCOPE
 struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
+	struct tcphdr *th;
+
 	/* Don't bother verifying checksum if we're going to flush anyway. */
 	if (!NAPI_GRO_CB(skb)->flush &&
 	    skb_gro_checksum_validate(skb, IPPROTO_TCP,
-				      ip6_gro_compute_pseudo)) {
-		NAPI_GRO_CB(skb)->flush = 1;
-		return NULL;
-	}
+				      ip6_gro_compute_pseudo))
+		goto flush;
 
-	return tcp_gro_receive(head, skb);
+	th = tcp_gro_pull_header(skb);
+	if (!th)
+		goto flush;
+
+	tcp6_check_fraglist_gro(head, skb, th);
+
+	return tcp_gro_receive(head, skb, th);
+
+flush:
+	NAPI_GRO_CB(skb)->flush = 1;
+	return NULL;
 }
 
 INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
@@ -32,6 +75,15 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_com
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct tcphdr *th = tcp_hdr(skb);
 
+	if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
+		skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6;
+		skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+		__skb_incr_checksum_unnecessary(skb);
+
+		return 0;
+	}
+
 	th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
 				  &iph->daddr, 0);
 	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
@@ -40,6 +92,61 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_com
 	return 0;
 }
 
+static void __tcpv6_gso_segment_csum(struct sk_buff *seg,
+				     __be16 *oldport, __be16 newport)
+{
+	struct tcphdr *th;
+
+	if (*oldport == newport)
+		return;
+
+	th = tcp_hdr(seg);
+	inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
+	*oldport = newport;
+}
+
+static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs)
+{
+	const struct tcphdr *th;
+	const struct ipv6hdr *iph;
+	struct sk_buff *seg;
+	struct tcphdr *th2;
+	struct ipv6hdr *iph2;
+
+	seg = segs;
+	th = tcp_hdr(seg);
+	iph = ipv6_hdr(seg);
+	th2 = tcp_hdr(seg->next);
+	iph2 = ipv6_hdr(seg->next);
+
+	if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
+	    ipv6_addr_equal(&iph->saddr, &iph2->saddr) &&
+	    ipv6_addr_equal(&iph->daddr, &iph2->daddr))
+		return segs;
+
+	while ((seg = seg->next)) {
+		th2 = tcp_hdr(seg);
+		iph2 = ipv6_hdr(seg);
+
+		iph2->saddr = iph->saddr;
+		iph2->daddr = iph->daddr;
+		__tcpv6_gso_segment_csum(seg, &th2->source, th->source);
+		__tcpv6_gso_segment_csum(seg, &th2->dest, th->dest);
+	}
+
+	return segs;
+}
+
+static struct sk_buff *__tcp6_gso_segment_list(struct sk_buff *skb,
+					       netdev_features_t features)
+{
+	skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+	if (IS_ERR(skb))
+		return skb;
+
+	return __tcpv6_gso_segment_list_csum(skb);
+}
+
 static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb,
 					netdev_features_t features)
 {
@@ -51,6 +158,9 @@ static struct sk_buff *tcp6_gso_segment(
 	if (!pskb_may_pull(skb, sizeof(*th)))
 		return ERR_PTR(-EINVAL);
 
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
+		return __tcp6_gso_segment_list(skb, features);
+
 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
 		const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 		struct tcphdr *th = tcp_hdr(skb);