From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 23 Apr 2024 11:23:03 +0200
Subject: [PATCH] net: add TCP fraglist GRO support

When forwarding TCP after GRO, software segmentation is very expensive,
especially when the checksum needs to be recalculated.
One case where that's currently unavoidable is when routing packets over
PPPoE. Performance improves significantly when using fraglist GRO
implemented in the same way as for UDP.

Here's a measurement from running 2 TCP streams through a MediaTek MT7622
device (2-core Cortex-A53), which runs NAT with flow offload enabled from
one ethernet port to PPPoE on another ethernet port, with a cake qdisc set
to 1 Gbit/s:

rx-gro-list off: 630 Mbit/s, CPU 35% idle
rx-gro-list on:  770 Mbit/s, CPU 40% idle

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
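Note: fraglist GRO is gated per device by the rx-gro-list feature flag
(ethtool -K <dev> rx-gro-list on). With it enabled, the
tcp4_check_fraglist_gro()/tcp6_check_fraglist_gro() helpers added below
look up an established local socket for each new flow; packets that match
none are assumed to be forwarded and are chained on a fraglist instead of
having their payload merged, so segmentation on output can simply unchain
the original frames. A user-space sketch of that decision, with
hypothetical names standing in for the kernel logic:

/* Hypothetical model of the per-packet GRO mode decision; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

enum gro_mode { GRO_CLASSIC, GRO_FRAGLIST };

static enum gro_mode tcp_gro_mode(bool dev_gro_fraglist, bool local_sock)
{
	if (!dev_gro_fraglist)
		return GRO_CLASSIC;	/* feature off: behave as before */
	if (local_sock)
		return GRO_CLASSIC;	/* locally terminated: merge payload */
	return GRO_FRAGLIST;		/* forwarded: chain original frames */
}

int main(void)
{
	printf("forwarded flow -> %s\n",
	       tcp_gro_mode(true, false) == GRO_FRAGLIST ? "fraglist" : "classic");
	printf("local flow     -> %s\n",
	       tcp_gro_mode(true, true) == GRO_FRAGLIST ? "fraglist" : "classic");
	return 0;
}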
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -430,6 +430,7 @@ static inline __wsum ip6_gro_compute_pse
 }
 
 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
 
 /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
 static inline void gro_normal_list(struct napi_struct *napi)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2082,7 +2082,10 @@ void tcp_v4_destroy_sock(struct sock *sk
 
 struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 				netdev_features_t features);
-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
+struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb);
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+				struct tcphdr *th);
 INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
 INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
 INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -233,6 +233,33 @@ done:
 	return 0;
 }
 
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{
+	if (unlikely(p->len + skb->len >= 65536))
+		return -E2BIG;
+
+	if (NAPI_GRO_CB(p)->last == p)
+		skb_shinfo(p)->frag_list = skb;
+	else
+		NAPI_GRO_CB(p)->last->next = skb;
+
+	skb_pull(skb, skb_gro_offset(skb));
+
+	NAPI_GRO_CB(p)->last = skb;
+	NAPI_GRO_CB(p)->count++;
+	p->data_len += skb->len;
+
+	/* sk ownership - if any - completely transferred to the aggregated packet */
+	skb->destructor = NULL;
+	skb->sk = NULL;
+	p->truesize += skb->truesize;
+	p->len += skb->len;
+
+	NAPI_GRO_CB(skb)->same_flow = 1;
+
+	return 0;
+}
+
 
 static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
 {
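skb_gro_receive_list() above is moved verbatim from net/ipv4/udp_offload.c
(see the removal further down) so TCP can share it with UDP. A compilable
user-space model of the tail-append it performs, with hypothetical struct
names standing in for sk_buff and NAPI_GRO_CB; the real code chains
segments on skb_shinfo(p)->frag_list, tracks the tail in
NAPI_GRO_CB(p)->last, and leaves the 64-segment cap to the TCP caller:

/* Hypothetical model of fraglist tail-append aggregation; not kernel code. */
#include <stdio.h>
#include <stddef.h>

struct seg {
	struct seg *next;	/* frag_list-style chaining */
	size_t len;
};

struct pkt {
	struct seg head;	/* first received segment */
	struct seg *tail;	/* analogue of NAPI_GRO_CB(p)->last */
	size_t len;		/* total aggregated length */
	unsigned int count;	/* number of aggregated segments */
};

static void pkt_init(struct pkt *p, size_t first_len)
{
	p->head.next = NULL;
	p->head.len = first_len;
	p->tail = &p->head;	/* the "last == p" state in the kernel code */
	p->len = first_len;
	p->count = 1;
}

/* Append s unless the merged packet would exceed IPv4's 64 KiB total
 * length, mirroring the -E2BIG check in skb_gro_receive_list(). */
static int pkt_append(struct pkt *p, struct seg *s)
{
	if (p->len + s->len >= 65536)
		return -1;
	p->tail->next = s;
	p->tail = s;
	p->len += s->len;
	p->count++;
	return 0;
}

int main(void)
{
	struct pkt p;
	struct seg a = { .len = 1448 }, b = { .len = 1448 };

	pkt_init(&p, 1448);
	pkt_append(&p, &a);
	pkt_append(&p, &b);
	printf("segments=%u total=%zu\n", p.count, p.len);
	return 0;
}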
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -28,6 +28,68 @@ static void tcp_gso_tstamp(struct sk_buf
 	}
 }
 
+static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
+				     __be32 *oldip, __be32 *newip,
+				     __be16 *oldport, __be16 *newport)
+{
+	struct tcphdr *th;
+	struct iphdr *iph;
+
+	if (*oldip == *newip && *oldport == *newport)
+		return;
+
+	th = tcp_hdr(seg);
+	iph = ip_hdr(seg);
+
+	inet_proto_csum_replace4(&th->check, seg, *oldip, *newip, true);
+	inet_proto_csum_replace2(&th->check, seg, *oldport, *newport, false);
+	*oldport = *newport;
+
+	csum_replace4(&iph->check, *oldip, *newip);
+	*oldip = *newip;
+}
+
+static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
+{
+	struct sk_buff *seg;
+	struct tcphdr *th, *th2;
+	struct iphdr *iph, *iph2;
+
+	seg = segs;
+	th = tcp_hdr(seg);
+	iph = ip_hdr(seg);
+	th2 = tcp_hdr(seg->next);
+	iph2 = ip_hdr(seg->next);
+
+	if (!(*(u32 *)&th->source ^ *(u32 *)&th2->source) &&
+	    iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
+		return segs;
+
+	while ((seg = seg->next)) {
+		th2 = tcp_hdr(seg);
+		iph2 = ip_hdr(seg);
+
+		__tcpv4_gso_segment_csum(seg,
+					 &iph2->saddr, &iph->saddr,
+					 &th2->source, &th->source);
+		__tcpv4_gso_segment_csum(seg,
+					 &iph2->daddr, &iph->daddr,
+					 &th2->dest, &th->dest);
+	}
+
+	return segs;
+}
+
+static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
+					       netdev_features_t features)
+{
+	skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+	if (IS_ERR(skb))
+		return skb;
+
+	return __tcpv4_gso_segment_list_csum(skb);
+}
+
 static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
 					netdev_features_t features)
 {
@@ -37,6 +99,9 @@ static struct sk_buff *tcp4_gso_segment(
 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
 		return ERR_PTR(-EINVAL);
 
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
+		return __tcp4_gso_segment_list(skb, features);
+
 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
 		const struct iphdr *iph = ip_hdr(skb);
 		struct tcphdr *th = tcp_hdr(skb);
@@ -178,61 +243,76 @@ out:
 	return segs;
 }
 
-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
 {
-	struct sk_buff *pp = NULL;
+	struct tcphdr *th2;
 	struct sk_buff *p;
+
+	list_for_each_entry(p, head, list) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		th2 = tcp_hdr(p);
+		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		return p;
+	}
+
+	return NULL;
+}
+
+struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
+{
+	unsigned int thlen, hlen, off;
 	struct tcphdr *th;
-	struct tcphdr *th2;
-	unsigned int len;
-	unsigned int thlen;
-	__be32 flags;
-	unsigned int mss = 1;
-	unsigned int hlen;
-	unsigned int off;
-	int flush = 1;
-	int i;
 
 	off = skb_gro_offset(skb);
 	hlen = off + sizeof(*th);
 	th = skb_gro_header(skb, hlen, off);
 	if (unlikely(!th))
-		goto out;
+		return NULL;
 
 	thlen = th->doff * 4;
 	if (thlen < sizeof(*th))
-		goto out;
+		return NULL;
 
 	hlen = off + thlen;
 	if (skb_gro_header_hard(skb, hlen)) {
 		th = skb_gro_header_slow(skb, hlen, off);
 		if (unlikely(!th))
-			goto out;
+			return NULL;
 	}
 
 	skb_gro_pull(skb, thlen);
 
-	len = skb_gro_len(skb);
-	flags = tcp_flag_word(th);
-
-	list_for_each_entry(p, head, list) {
-		if (!NAPI_GRO_CB(p)->same_flow)
-			continue;
+	return th;
+}
 
-		th2 = tcp_hdr(p);
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+				struct tcphdr *th)
+{
+	unsigned int thlen = th->doff * 4;
+	struct sk_buff *pp = NULL;
+	struct sk_buff *p;
+	struct tcphdr *th2;
+	unsigned int len;
+	__be32 flags;
+	unsigned int mss = 1;
+	int flush = 1;
+	int i;
 
-		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
-			NAPI_GRO_CB(p)->same_flow = 0;
-			continue;
-		}
+	len = skb_gro_len(skb);
+	flags = tcp_flag_word(th);
 
-		goto found;
-	}
-	p = NULL;
-	goto out_check_final;
+	p = tcp_gro_lookup(head, th);
+	if (!p)
+		goto out_check_final;
 
-found:
 	/* Include the IP ID check below from the inner most IP hdr */
+	th2 = tcp_hdr(p);
 	flush = NAPI_GRO_CB(p)->flush;
 	flush |= (__force int)(flags & TCP_FLAG_CWR);
 	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
@@ -269,6 +349,19 @@ found:
 	flush |= p->decrypted ^ skb->decrypted;
 #endif
 
+	if (NAPI_GRO_CB(p)->is_flist) {
+		flush |= (__force int)(flags ^ tcp_flag_word(th2));
+		flush |= skb->ip_summed != p->ip_summed;
+		flush |= skb->csum_level != p->csum_level;
+		flush |= !pskb_may_pull(skb, skb_gro_offset(skb));
+		flush |= NAPI_GRO_CB(p)->count >= 64;
+
+		if (flush || skb_gro_receive_list(p, skb))
+			mss = 1;
+
+		goto out_check_final;
+	}
+
 	if (flush || skb_gro_receive(p, skb)) {
 		mss = 1;
 		goto out_check_final;
@@ -290,7 +383,6 @@ out_check_final:
 	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
 		pp = p;
 
-out:
 	NAPI_GRO_CB(skb)->flush |= (flush != 0);
 
 	return pp;
@@ -314,18 +406,56 @@ void tcp_gro_complete(struct sk_buff *sk
 }
 EXPORT_SYMBOL(tcp_gro_complete);
 
+static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+				    struct tcphdr *th)
+{
+	const struct iphdr *iph = skb_gro_network_header(skb);
+	struct net *net = dev_net(skb->dev);
+	struct sk_buff *p;
+	struct sock *sk;
+	int iif, sdif;
+
+	if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
+		return;
+
+	p = tcp_gro_lookup(head, th);
+	if (p) {
+		NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+		return;
+	}
+
+	inet_get_iif_sdif(skb, &iif, &sdif);
+	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+				       iph->saddr, th->source,
+				       iph->daddr, ntohs(th->dest),
+				       iif, sdif);
+	NAPI_GRO_CB(skb)->is_flist = !sk;
+	if (sk)
+		sock_put(sk);
+}
+
 INDIRECT_CALLABLE_SCOPE
 struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
+	struct tcphdr *th;
+
 	/* Don't bother verifying checksum if we're going to flush anyway. */
 	if (!NAPI_GRO_CB(skb)->flush &&
 	    skb_gro_checksum_validate(skb, IPPROTO_TCP,
-				      inet_gro_compute_pseudo)) {
-		NAPI_GRO_CB(skb)->flush = 1;
-		return NULL;
-	}
+				      inet_gro_compute_pseudo))
+		goto flush;
+
+	th = tcp_gro_pull_header(skb);
+	if (!th)
+		goto flush;
 
-	return tcp_gro_receive(head, skb);
+	tcp4_check_fraglist_gro(head, skb, th);
+
+	return tcp_gro_receive(head, skb, th);
+
+flush:
+	NAPI_GRO_CB(skb)->flush = 1;
+	return NULL;
 }
 
 INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
@@ -333,6 +463,15 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_com
 	const struct iphdr *iph = ip_hdr(skb);
 	struct tcphdr *th = tcp_hdr(skb);
 
+	if (NAPI_GRO_CB(skb)->is_flist) {
+		skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
+		skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+		__skb_incr_checksum_unnecessary(skb);
+
+		return 0;
+	}
+
 	th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
 				  iph->daddr, 0);
 	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
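The NAT fixup above relies on incremental checksum updates (RFC 1624,
HC' = ~(~HC + ~m + m')): when the head segment's addresses or ports were
rewritten, each chained segment patches its IP and TCP checksums for just
the changed words instead of re-summing the payload. A stand-alone,
host-endian sketch of the arithmetic behind csum_replace4() (simplified;
the kernel operates on __be32/__wsum, and inet_proto_csum_replace4() also
handles pseudo-header and skb->csum updates):

/* Demo of RFC 1624 incremental checksum update; illustrative only. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* ones'-complement 16-bit addition with end-around carry */
static uint16_t csum16_add(uint16_t a, uint16_t b)
{
	uint32_t s = (uint32_t)a + b;
	return (uint16_t)((s & 0xffff) + (s >> 16));
}

/* full checksum over 16-bit words, used as the reference result */
static uint16_t csum(const uint16_t *w, int n)
{
	uint16_t s = 0;
	while (n--)
		s = csum16_add(s, *w++);
	return (uint16_t)~s;
}

/* incremental update for one 32-bit field changing from -> to */
static uint16_t csum_replace4(uint16_t check, uint32_t from, uint32_t to)
{
	uint16_t s = (uint16_t)~check;

	s = csum16_add(s, (uint16_t)~(from >> 16));
	s = csum16_add(s, (uint16_t)~(from & 0xffff));
	s = csum16_add(s, (uint16_t)(to >> 16));
	s = csum16_add(s, (uint16_t)(to & 0xffff));
	return (uint16_t)~s;
}

int main(void)
{
	uint16_t hdr[] = { 0xc0a8, 0x0001, 0x0a00, 0x0001 };
	uint16_t old = csum(hdr, 4);
	/* NAT-style rewrite of the first address: 192.168.0.1 -> 10.0.0.2 */
	uint16_t inc = csum_replace4(old, 0xc0a80001, 0x0a000002);

	hdr[0] = 0x0a00;
	hdr[1] = 0x0002;
	assert(inc == csum(hdr, 4));	/* incremental == full recompute */
	printf("ok: 0x%04x\n", inc);
	return 0;
}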
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -433,33 +433,6 @@ out:
 	return segs;
 }
 
-static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
-{
-	if (unlikely(p->len + skb->len >= 65536))
-		return -E2BIG;
-
-	if (NAPI_GRO_CB(p)->last == p)
-		skb_shinfo(p)->frag_list = skb;
-	else
-		NAPI_GRO_CB(p)->last->next = skb;
-
-	skb_pull(skb, skb_gro_offset(skb));
-
-	NAPI_GRO_CB(p)->last = skb;
-	NAPI_GRO_CB(p)->count++;
-	p->data_len += skb->len;
-
-	/* sk ownership - if any - completely transferred to the aggregated packet */
-	skb->destructor = NULL;
-	skb->sk = NULL;
-	p->truesize += skb->truesize;
-	p->len += skb->len;
-
-	NAPI_GRO_CB(skb)->same_flow = 1;
-
-	return 0;
-}
-
 
 #define UDP_GRO_CNT_MAX 64
 static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -7,24 +7,65 @@
  */
 #include <linux/indirect_call_wrapper.h>
 #include <linux/skbuff.h>
+#include <net/inet6_hashtables.h>
 #include <net/gro.h>
 #include <net/protocol.h>
 #include <net/tcp.h>
 #include <net/ip6_checksum.h>
 #include "ip6_offload.h"
 
+static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+				    struct tcphdr *th)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	const struct ipv6hdr *hdr = skb_gro_network_header(skb);
+	struct net *net = dev_net(skb->dev);
+	struct sk_buff *p;
+	struct sock *sk;
+	int iif, sdif;
+
+	if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
+		return;
+
+	p = tcp_gro_lookup(head, th);
+	if (p) {
+		NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+		return;
+	}
+
+	inet6_get_iif_sdif(skb, &iif, &sdif);
+	sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+					&hdr->saddr, th->source,
+					&hdr->daddr, ntohs(th->dest),
+					iif, sdif);
+	NAPI_GRO_CB(skb)->is_flist = !sk;
+	if (sk)
+		sock_put(sk);
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+}
+
 INDIRECT_CALLABLE_SCOPE
 struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
+	struct tcphdr *th;
+
 	/* Don't bother verifying checksum if we're going to flush anyway. */
 	if (!NAPI_GRO_CB(skb)->flush &&
 	    skb_gro_checksum_validate(skb, IPPROTO_TCP,
-				      ip6_gro_compute_pseudo)) {
-		NAPI_GRO_CB(skb)->flush = 1;
-		return NULL;
-	}
+				      ip6_gro_compute_pseudo))
+		goto flush;
 
-	return tcp_gro_receive(head, skb);
+	th = tcp_gro_pull_header(skb);
+	if (!th)
+		goto flush;
+
+	tcp6_check_fraglist_gro(head, skb, th);
+
+	return tcp_gro_receive(head, skb, th);
+
+flush:
+	NAPI_GRO_CB(skb)->flush = 1;
+	return NULL;
 }
 
 INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
@@ -32,6 +73,15 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_com
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct tcphdr *th = tcp_hdr(skb);
 
+	if (NAPI_GRO_CB(skb)->is_flist) {
+		skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6;
+		skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+		__skb_incr_checksum_unnecessary(skb);
+
+		return 0;
+	}
+
 	th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
 				  &iph->daddr, 0);
 	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
@@ -51,6 +101,9 @@ static struct sk_buff *tcp6_gso_segment(
 	if (!pskb_may_pull(skb, sizeof(*th)))
 		return ERR_PTR(-EINVAL);
 
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
+		return skb_segment_list(skb, features, skb_mac_header_len(skb));
+
 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
 		const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 		struct tcphdr *th = tcp_hdr(skb);