1 From: Felix Fietkau <nbd@nbd.name>
2 Date: Tue, 23 Apr 2024 11:23:03 +0200
3 Subject: [PATCH] net: add TCP fraglist GRO support
5 When forwarding TCP after GRO, software segmentation is very expensive,
6 especially when the checksum needs to be recalculated.
7 One case where that's currently unavoidable is when routing packets over
8 PPPoE. Performance improves significantly when using fraglist GRO
9 implemented in the same way as for UDP.
11 Here's a measurement of running 2 TCP streams through a MediaTek MT7622
12 device (2-core Cortex-A53), which runs NAT with flow offload enabled from
13 one ethernet port to PPPoE on another ethernet port + cake qdisc set to
16 rx-gro-list off: 630 Mbit/s, CPU 35% idle
17 rx-gro-list on: 770 Mbit/s, CPU 40% idle
19 Signed-off-by: Felix Fietkau <nbd@nbd.name>
22 --- a/include/net/gro.h
23 +++ b/include/net/gro.h
24 @@ -424,6 +424,7 @@ static inline __wsum ip6_gro_compute_pse
27 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
28 +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
30 /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
31 static inline void gro_normal_list(struct napi_struct *napi)
32 @@ -446,5 +447,48 @@ static inline void gro_normal_one(struct
33 gro_normal_list(napi);
36 +/* This function is the alternative of 'inet_iif' and 'inet_sdif'
37 + * functions in case we can not rely on fields of IPCB.
39 + * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized.
40 + * The caller must hold the RCU read lock.
42 +static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif)
44 + *iif = inet_iif(skb) ?: skb->dev->ifindex;
47 +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
48 + if (netif_is_l3_slave(skb->dev)) {
49 + struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev);
52 + *iif = master ? master->ifindex : 0;
57 +/* This function is the alternative of 'inet6_iif' and 'inet6_sdif'
58 + * functions in case we can not rely on fields of IP6CB.
60 + * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized.
61 + * The caller must hold the RCU read lock.
63 +static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif)
65 + /* using skb->dev->ifindex because skb_dst(skb) is not initialized */
66 + *iif = skb->dev->ifindex;
69 +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
70 + if (netif_is_l3_slave(skb->dev)) {
71 + struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev);
74 + *iif = master ? master->ifindex : 0;
80 #endif /* _NET_IPV6_GRO_H */
81 --- a/include/net/tcp.h
82 +++ b/include/net/tcp.h
83 @@ -2057,7 +2057,10 @@ void tcp_v4_destroy_sock(struct sock *sk
85 struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
86 netdev_features_t features);
87 -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
88 +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb);
89 +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
90 +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
92 INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
93 INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
94 INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
97 @@ -290,6 +290,33 @@ done:
101 +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
103 + if (unlikely(p->len + skb->len >= 65536))
106 + if (NAPI_GRO_CB(p)->last == p)
107 + skb_shinfo(p)->frag_list = skb;
109 + NAPI_GRO_CB(p)->last->next = skb;
111 + skb_pull(skb, skb_gro_offset(skb));
113 + NAPI_GRO_CB(p)->last = skb;
114 + NAPI_GRO_CB(p)->count++;
115 + p->data_len += skb->len;
117 + /* sk ownership - if any - completely transferred to the aggregated packet */
118 + skb->destructor = NULL;
120 + p->truesize += skb->truesize;
121 + p->len += skb->len;
123 + NAPI_GRO_CB(skb)->same_flow = 1;
129 static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
131 --- a/net/ipv4/tcp_offload.c
132 +++ b/net/ipv4/tcp_offload.c
133 @@ -27,6 +27,70 @@ static void tcp_gso_tstamp(struct sk_buf
137 +static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
138 + __be32 *oldip, __be32 newip,
139 + __be16 *oldport, __be16 newport)
144 + if (*oldip == newip && *oldport == newport)
150 + inet_proto_csum_replace4(&th->check, seg, *oldip, newip, true);
151 + inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
152 + *oldport = newport;
154 + csum_replace4(&iph->check, *oldip, newip);
158 +static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
160 + const struct tcphdr *th;
161 + const struct iphdr *iph;
162 + struct sk_buff *seg;
163 + struct tcphdr *th2;
164 + struct iphdr *iph2;
169 + th2 = tcp_hdr(seg->next);
170 + iph2 = ip_hdr(seg->next);
172 + if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
173 + iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
176 + while ((seg = seg->next)) {
177 + th2 = tcp_hdr(seg);
178 + iph2 = ip_hdr(seg);
180 + __tcpv4_gso_segment_csum(seg,
181 + &iph2->saddr, iph->saddr,
182 + &th2->source, th->source);
183 + __tcpv4_gso_segment_csum(seg,
184 + &iph2->daddr, iph->daddr,
185 + &th2->dest, th->dest);
191 +static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
192 + netdev_features_t features)
194 + skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
198 + return __tcpv4_gso_segment_list_csum(skb);
201 static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
202 netdev_features_t features)
204 @@ -36,6 +100,9 @@ static struct sk_buff *tcp4_gso_segment(
205 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
206 return ERR_PTR(-EINVAL);
208 + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
209 + return __tcp4_gso_segment_list(skb, features);
211 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
212 const struct iphdr *iph = ip_hdr(skb);
213 struct tcphdr *th = tcp_hdr(skb);
214 @@ -177,61 +244,76 @@ out:
218 -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
219 +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
221 - struct sk_buff *pp = NULL;
222 + struct tcphdr *th2;
225 + list_for_each_entry(p, head, list) {
226 + if (!NAPI_GRO_CB(p)->same_flow)
230 + if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
231 + NAPI_GRO_CB(p)->same_flow = 0;
241 +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
243 + unsigned int thlen, hlen, off;
245 - struct tcphdr *th2;
247 - unsigned int thlen;
249 - unsigned int mss = 1;
255 off = skb_gro_offset(skb);
256 hlen = off + sizeof(*th);
257 th = skb_gro_header(skb, hlen, off);
262 thlen = th->doff * 4;
263 if (thlen < sizeof(*th))
268 if (skb_gro_header_hard(skb, hlen)) {
269 th = skb_gro_header_slow(skb, hlen, off);
275 skb_gro_pull(skb, thlen);
277 - len = skb_gro_len(skb);
278 - flags = tcp_flag_word(th);
280 - list_for_each_entry(p, head, list) {
281 - if (!NAPI_GRO_CB(p)->same_flow)
287 +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
290 + unsigned int thlen = th->doff * 4;
291 + struct sk_buff *pp = NULL;
293 + struct tcphdr *th2;
296 + unsigned int mss = 1;
300 - if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
301 - NAPI_GRO_CB(p)->same_flow = 0;
304 + len = skb_gro_len(skb);
305 + flags = tcp_flag_word(th);
310 - goto out_check_final;
311 + p = tcp_gro_lookup(head, th);
313 + goto out_check_final;
316 /* Include the IP ID check below from the inner most IP hdr */
318 flush = NAPI_GRO_CB(p)->flush;
319 flush |= (__force int)(flags & TCP_FLAG_CWR);
320 flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
321 @@ -268,6 +350,19 @@ found:
322 flush |= p->decrypted ^ skb->decrypted;
325 + if (unlikely(NAPI_GRO_CB(p)->is_flist)) {
326 + flush |= (__force int)(flags ^ tcp_flag_word(th2));
327 + flush |= skb->ip_summed != p->ip_summed;
328 + flush |= skb->csum_level != p->csum_level;
329 + flush |= !pskb_may_pull(skb, skb_gro_offset(skb));
330 + flush |= NAPI_GRO_CB(p)->count >= 64;
332 + if (flush || skb_gro_receive_list(p, skb))
335 + goto out_check_final;
338 if (flush || skb_gro_receive(p, skb)) {
340 goto out_check_final;
341 @@ -289,7 +384,6 @@ out_check_final:
342 if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
346 NAPI_GRO_CB(skb)->flush |= (flush != 0);
349 @@ -315,18 +409,58 @@ int tcp_gro_complete(struct sk_buff *skb
351 EXPORT_SYMBOL(tcp_gro_complete);
353 +static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
356 + const struct iphdr *iph;
362 + if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
365 + p = tcp_gro_lookup(head, th);
367 + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
371 + inet_get_iif_sdif(skb, &iif, &sdif);
372 + iph = skb_gro_network_header(skb);
373 + net = dev_net(skb->dev);
374 + sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
375 + iph->saddr, th->source,
376 + iph->daddr, ntohs(th->dest),
378 + NAPI_GRO_CB(skb)->is_flist = !sk;
383 INDIRECT_CALLABLE_SCOPE
384 struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
388 /* Don't bother verifying checksum if we're going to flush anyway. */
389 if (!NAPI_GRO_CB(skb)->flush &&
390 skb_gro_checksum_validate(skb, IPPROTO_TCP,
391 - inet_gro_compute_pseudo)) {
392 - NAPI_GRO_CB(skb)->flush = 1;
395 + inet_gro_compute_pseudo))
398 + th = tcp_gro_pull_header(skb);
402 - return tcp_gro_receive(head, skb);
403 + tcp4_check_fraglist_gro(head, skb, th);
405 + return tcp_gro_receive(head, skb, th);
408 + NAPI_GRO_CB(skb)->flush = 1;
412 INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
413 @@ -334,6 +468,15 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_com
414 const struct iphdr *iph = ip_hdr(skb);
415 struct tcphdr *th = tcp_hdr(skb);
417 + if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
418 + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
419 + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
421 + __skb_incr_checksum_unnecessary(skb);
426 th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
428 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
429 --- a/net/ipv4/udp_offload.c
430 +++ b/net/ipv4/udp_offload.c
431 @@ -425,33 +425,6 @@ out:
435 -static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
437 - if (unlikely(p->len + skb->len >= 65536))
440 - if (NAPI_GRO_CB(p)->last == p)
441 - skb_shinfo(p)->frag_list = skb;
443 - NAPI_GRO_CB(p)->last->next = skb;
445 - skb_pull(skb, skb_gro_offset(skb));
447 - NAPI_GRO_CB(p)->last = skb;
448 - NAPI_GRO_CB(p)->count++;
449 - p->data_len += skb->len;
451 - /* sk ownership - if any - completely transferred to the aggregated packet */
452 - skb->destructor = NULL;
454 - p->truesize += skb->truesize;
455 - p->len += skb->len;
457 - NAPI_GRO_CB(skb)->same_flow = 1;
463 #define UDP_GRO_CNT_MAX 64
464 static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
465 --- a/net/ipv6/tcpv6_offload.c
466 +++ b/net/ipv6/tcpv6_offload.c
469 #include <linux/indirect_call_wrapper.h>
470 #include <linux/skbuff.h>
471 +#include <net/inet6_hashtables.h>
473 #include <net/protocol.h>
475 #include <net/ip6_checksum.h>
476 #include "ip6_offload.h"
478 +static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
481 +#if IS_ENABLED(CONFIG_IPV6)
482 + const struct ipv6hdr *hdr;
488 + if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
491 + p = tcp_gro_lookup(head, th);
493 + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
497 + inet6_get_iif_sdif(skb, &iif, &sdif);
498 + hdr = skb_gro_network_header(skb);
499 + net = dev_net(skb->dev);
500 + sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
501 + &hdr->saddr, th->source,
502 + &hdr->daddr, ntohs(th->dest),
504 + NAPI_GRO_CB(skb)->is_flist = !sk;
507 +#endif /* IS_ENABLED(CONFIG_IPV6) */
510 INDIRECT_CALLABLE_SCOPE
511 struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
515 /* Don't bother verifying checksum if we're going to flush anyway. */
516 if (!NAPI_GRO_CB(skb)->flush &&
517 skb_gro_checksum_validate(skb, IPPROTO_TCP,
518 - ip6_gro_compute_pseudo)) {
519 - NAPI_GRO_CB(skb)->flush = 1;
522 + ip6_gro_compute_pseudo))
525 - return tcp_gro_receive(head, skb);
526 + th = tcp_gro_pull_header(skb);
530 + tcp6_check_fraglist_gro(head, skb, th);
532 + return tcp_gro_receive(head, skb, th);
535 + NAPI_GRO_CB(skb)->flush = 1;
539 INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
540 @@ -32,6 +75,15 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_com
541 const struct ipv6hdr *iph = ipv6_hdr(skb);
542 struct tcphdr *th = tcp_hdr(skb);
544 + if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
545 + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6;
546 + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
548 + __skb_incr_checksum_unnecessary(skb);
553 th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
555 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
556 @@ -39,6 +91,61 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_com
557 return tcp_gro_complete(skb);
560 +static void __tcpv6_gso_segment_csum(struct sk_buff *seg,
561 + __be16 *oldport, __be16 newport)
565 + if (*oldport == newport)
569 + inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
570 + *oldport = newport;
573 +static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs)
575 + const struct tcphdr *th;
576 + const struct ipv6hdr *iph;
577 + struct sk_buff *seg;
578 + struct tcphdr *th2;
579 + struct ipv6hdr *iph2;
583 + iph = ipv6_hdr(seg);
584 + th2 = tcp_hdr(seg->next);
585 + iph2 = ipv6_hdr(seg->next);
587 + if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
588 + ipv6_addr_equal(&iph->saddr, &iph2->saddr) &&
589 + ipv6_addr_equal(&iph->daddr, &iph2->daddr))
592 + while ((seg = seg->next)) {
593 + th2 = tcp_hdr(seg);
594 + iph2 = ipv6_hdr(seg);
596 + iph2->saddr = iph->saddr;
597 + iph2->daddr = iph->daddr;
598 + __tcpv6_gso_segment_csum(seg, &th2->source, th->source);
599 + __tcpv6_gso_segment_csum(seg, &th2->dest, th->dest);
605 +static struct sk_buff *__tcp6_gso_segment_list(struct sk_buff *skb,
606 + netdev_features_t features)
608 + skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
612 + return __tcpv6_gso_segment_list_csum(skb);
615 static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb,
616 netdev_features_t features)
618 @@ -50,6 +157,9 @@ static struct sk_buff *tcp6_gso_segment(
619 if (!pskb_may_pull(skb, sizeof(*th)))
620 return ERR_PTR(-EINVAL);
622 + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
623 + return __tcp6_gso_segment_list(skb, features);
625 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
626 const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
627 struct tcphdr *th = tcp_hdr(skb);