From: Pavel Kubelun Date: Tue, 25 Jul 2017 07:57:31 +0000 (-0400) Subject: generic: net: tcp: backport tcp tx performance patches X-Git-Url: http://git.openwrt.org/?p=openwrt%2Fstaging%2Fjow.git;a=commitdiff_plain;h=b2ea46fe236ae02d845f94cf984cbc1786870333 generic: net: tcp: backport tcp tx performance patches An overall throughput gain of 22 % for heavy TCP use over a single TX queue. Original patchset comment https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v4.13&id=3f4888adae7c1619b990d98a9b967536f71822b8 Signed-off-by: Pavel Kubelun --- diff --git a/target/linux/generic/backport-4.9/024-1-tcp-tsq-add-tsq_flags-tsq_enum.patch b/target/linux/generic/backport-4.9/024-1-tcp-tsq-add-tsq_flags-tsq_enum.patch new file mode 100644 index 0000000000..20311498aa --- /dev/null +++ b/target/linux/generic/backport-4.9/024-1-tcp-tsq-add-tsq_flags-tsq_enum.patch @@ -0,0 +1,90 @@ +From 40fc3423b983b864bf70b03199191260ae9b2ea6 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Sat, 3 Dec 2016 11:14:50 -0800 +Subject: [PATCH 01/10] tcp: tsq: add tsq_flags / tsq_enum + +This is a cleanup, to ease code review of following patches. + +Old 'enum tsq_flags' is renamed, and a new enumeration is added +with the flags used in cmpxchg() operations as opposed to +single bit operations. + +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +--- + include/linux/tcp.h | 11 ++++++++++- + net/ipv4/tcp_output.c | 16 ++++++++-------- + 2 files changed, 18 insertions(+), 9 deletions(-) + +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -367,7 +367,7 @@ struct tcp_sock { + u32 *saved_syn; + }; + +-enum tsq_flags { ++enum tsq_enum { + TSQ_THROTTLED, + TSQ_QUEUED, + TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */ +@@ -378,6 +378,15 @@ enum tsq_flags { + */ + }; + ++enum tsq_flags { ++ TSQF_THROTTLED = (1UL << TSQ_THROTTLED), ++ TSQF_QUEUED = (1UL << TSQ_QUEUED), ++ TCPF_TSQ_DEFERRED = (1UL << TCP_TSQ_DEFERRED), ++ TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED), ++ TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED), ++ TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED), ++}; ++ + static inline struct tcp_sock *tcp_sk(const struct sock *sk) + { + return (struct tcp_sock *)sk; +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -784,10 +784,10 @@ static void tcp_tasklet_func(unsigned lo + } + } + +-#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ +- (1UL << TCP_WRITE_TIMER_DEFERRED) | \ +- (1UL << TCP_DELACK_TIMER_DEFERRED) | \ +- (1UL << TCP_MTU_REDUCED_DEFERRED)) ++#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ ++ TCPF_WRITE_TIMER_DEFERRED | \ ++ TCPF_DELACK_TIMER_DEFERRED | \ ++ TCPF_MTU_REDUCED_DEFERRED) + /** + * tcp_release_cb - tcp release_sock() callback + * @sk: socket +@@ -808,7 +808,7 @@ void tcp_release_cb(struct sock *sk) + nflags = flags & ~TCP_DEFERRED_ALL; + } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); + +- if (flags & (1UL << TCP_TSQ_DEFERRED)) ++ if (flags & TCPF_TSQ_DEFERRED) + tcp_tsq_handler(sk); + + /* Here begins the tricky part : +@@ -822,15 +822,15 @@ void tcp_release_cb(struct sock *sk) + */ + sock_release_ownership(sk); + +- if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { ++ if (flags & TCPF_WRITE_TIMER_DEFERRED) { + tcp_write_timer_handler(sk); + __sock_put(sk); + } +- if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { ++ if (flags & TCPF_DELACK_TIMER_DEFERRED) { + tcp_delack_timer_handler(sk); + __sock_put(sk); + } +- if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { ++ if (flags & TCPF_MTU_REDUCED_DEFERRED) { + inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); + __sock_put(sk); + } diff --git a/target/linux/generic/backport-4.9/024-2-tcp-tsq-remove-one-locked-operation-in-tcp_wfree.patch b/target/linux/generic/backport-4.9/024-2-tcp-tsq-remove-one-locked-operation-in-tcp_wfree.patch new file mode 100644 index 0000000000..914be607e7 --- /dev/null +++ b/target/linux/generic/backport-4.9/024-2-tcp-tsq-remove-one-locked-operation-in-tcp_wfree.patch @@ -0,0 +1,48 @@ +From 408f0a6c21e124cc4f6c7aa370b38aa47e55428d Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Sat, 3 Dec 2016 11:14:51 -0800 +Subject: [PATCH 02/10] tcp: tsq: remove one locked operation in tcp_wfree() + +Instead of atomically clear TSQ_THROTTLED and atomically set TSQ_QUEUED +bits, use one cmpxchg() to perform a single locked operation. + +Since the following patch will also set TCP_TSQ_DEFERRED here, +this cmpxchg() will make this addition free. + +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +--- + net/ipv4/tcp_output.c | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -860,6 +860,7 @@ void tcp_wfree(struct sk_buff *skb) + { + struct sock *sk = skb->sk; + struct tcp_sock *tp = tcp_sk(sk); ++ unsigned long flags, nval, oval; + int wmem; + + /* Keep one reference on sk_wmem_alloc. +@@ -877,11 +878,17 @@ void tcp_wfree(struct sk_buff *skb) + if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) + goto out; + +- if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && +- !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { +- unsigned long flags; ++ for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) { + struct tsq_tasklet *tsq; + ++ if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) ++ goto out; ++ ++ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED; ++ nval = cmpxchg(&tp->tsq_flags, oval, nval); ++ if (nval != oval) ++ continue; ++ + /* queue this socket to tasklet queue */ + local_irq_save(flags); + tsq = this_cpu_ptr(&tsq_tasklet); diff --git a/target/linux/generic/backport-4.9/024-3-tcp-tsq-add-shortcut-in-tcp_tasklet_func.patch b/target/linux/generic/backport-4.9/024-3-tcp-tsq-add-shortcut-in-tcp_tasklet_func.patch new file mode 100644 index 0000000000..8f45c23e3f --- /dev/null +++ b/target/linux/generic/backport-4.9/024-3-tcp-tsq-add-shortcut-in-tcp_tasklet_func.patch @@ -0,0 +1,71 @@ +From b223feb9de2a65c533ff95c08e834fa732906ea5 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Sat, 3 Dec 2016 11:14:52 -0800 +Subject: [PATCH 03/10] tcp: tsq: add shortcut in tcp_tasklet_func() + +Under high stress, I've seen tcp_tasklet_func() consuming +~700 usec, handling ~150 tcp sockets. + +By setting TCP_TSQ_DEFERRED in tcp_wfree(), we give a chance +for other cpus/threads entering tcp_write_xmit() to grab it, +allowing tcp_tasklet_func() to skip sockets that already did +an xmit cycle. + +In the future, we might give to ACK processing an increased +budget to reduce even more tcp_tasklet_func() amount of work. + +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +--- + net/ipv4/tcp_output.c | 22 ++++++++++++---------- + 1 file changed, 12 insertions(+), 10 deletions(-) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -767,19 +767,19 @@ static void tcp_tasklet_func(unsigned lo + list_for_each_safe(q, n, &list) { + tp = list_entry(q, struct tcp_sock, tsq_node); + list_del(&tp->tsq_node); ++ clear_bit(TSQ_QUEUED, &tp->tsq_flags); + + sk = (struct sock *)tp; +- bh_lock_sock(sk); +- +- if (!sock_owned_by_user(sk)) { +- tcp_tsq_handler(sk); +- } else { +- /* defer the work to tcp_release_cb() */ +- set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); ++ if (!sk->sk_lock.owned && ++ test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) { ++ bh_lock_sock(sk); ++ if (!sock_owned_by_user(sk)) { ++ clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); ++ tcp_tsq_handler(sk); ++ } ++ bh_unlock_sock(sk); + } +- bh_unlock_sock(sk); + +- clear_bit(TSQ_QUEUED, &tp->tsq_flags); + sk_free(sk); + } + } +@@ -884,7 +884,7 @@ void tcp_wfree(struct sk_buff *skb) + if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) + goto out; + +- nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED; ++ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED; + nval = cmpxchg(&tp->tsq_flags, oval, nval); + if (nval != oval) + continue; +@@ -2179,6 +2179,8 @@ static bool tcp_write_xmit(struct sock * + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + break; + ++ if (test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) ++ clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); + if (tcp_small_queue_check(sk, skb, 0)) + break; + diff --git a/target/linux/generic/backport-4.9/024-4-tcp-tsq-avoid-one-atomic-in-tcp_wfree.patch b/target/linux/generic/backport-4.9/024-4-tcp-tsq-avoid-one-atomic-in-tcp_wfree.patch new file mode 100644 index 0000000000..a25cdb5717 --- /dev/null +++ b/target/linux/generic/backport-4.9/024-4-tcp-tsq-avoid-one-atomic-in-tcp_wfree.patch @@ -0,0 +1,38 @@ +From a9b204d1564702b704ad6fe74f10a102c7b87ba3 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Sat, 3 Dec 2016 11:14:53 -0800 +Subject: [PATCH 04/10] tcp: tsq: avoid one atomic in tcp_wfree() + +Under high load, tcp_wfree() has an atomic operation trying +to schedule a tasklet over and over. + +We can schedule it only if our per cpu list was empty. + +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +--- + net/ipv4/tcp_output.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -880,6 +880,7 @@ void tcp_wfree(struct sk_buff *skb) + + for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) { + struct tsq_tasklet *tsq; ++ bool empty; + + if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) + goto out; +@@ -892,8 +893,10 @@ void tcp_wfree(struct sk_buff *skb) + /* queue this socket to tasklet queue */ + local_irq_save(flags); + tsq = this_cpu_ptr(&tsq_tasklet); ++ empty = list_empty(&tsq->head); + list_add(&tp->tsq_node, &tsq->head); +- tasklet_schedule(&tsq->tasklet); ++ if (empty) ++ tasklet_schedule(&tsq->tasklet); + local_irq_restore(flags); + return; + } diff --git a/target/linux/generic/backport-4.9/024-5-tcp-tsq-add-a-shortcut-in-tcp_small_queue_check.patch b/target/linux/generic/backport-4.9/024-5-tcp-tsq-add-a-shortcut-in-tcp_small_queue_check.patch new file mode 100644 index 0000000000..dd1a37eb21 --- /dev/null +++ b/target/linux/generic/backport-4.9/024-5-tcp-tsq-add-a-shortcut-in-tcp_small_queue_check.patch @@ -0,0 +1,37 @@ +From 75eefc6c59fd2c5f1ab95a3a113c217237d12a31 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Sat, 3 Dec 2016 11:14:54 -0800 +Subject: [PATCH 05/10] tcp: tsq: add a shortcut in tcp_small_queue_check() + +Always allow the two first skbs in write queue to be sent, +regardless of sk_wmem_alloc/sk_pacing_rate values. + +This helps a lot in situations where TX completions are delayed either +because of driver latencies or softirq latencies. + +Test is done with no cache line misses. + +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +--- + net/ipv4/tcp_output.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2084,6 +2084,15 @@ static bool tcp_small_queue_check(struct + limit <<= factor; + + if (atomic_read(&sk->sk_wmem_alloc) > limit) { ++ /* Always send the 1st or 2nd skb in write queue. ++ * No need to wait for TX completion to call us back, ++ * after softirq/tasklet schedule. ++ * This helps when TX completions are delayed too much. ++ */ ++ if (skb == sk->sk_write_queue.next || ++ skb->prev == sk->sk_write_queue.next) ++ return false; ++ + set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags); + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED, so we must diff --git a/target/linux/generic/backport-4.9/024-6-tcp-tcp_mtu_probe-is-likely-to-exit-early.patch b/target/linux/generic/backport-4.9/024-6-tcp-tcp_mtu_probe-is-likely-to-exit-early.patch new file mode 100644 index 0000000000..f0ae2a938e --- /dev/null +++ b/target/linux/generic/backport-4.9/024-6-tcp-tcp_mtu_probe-is-likely-to-exit-early.patch @@ -0,0 +1,55 @@ +From 12a59abc22d6664f7d3944f625ceefee92de8820 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Sat, 3 Dec 2016 11:14:55 -0800 +Subject: [PATCH 06/10] tcp: tcp_mtu_probe() is likely to exit early + +Adding a likely() in tcp_mtu_probe() moves its code which used to +be inlined in front of tcp_write_xmit() + +We still have a cache line miss to access icsk->icsk_mtup.enabled, +we will probably have to reorganize fields to help data locality. + +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +--- + net/ipv4/tcp_output.c | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -1925,26 +1925,26 @@ static inline void tcp_mtu_check_reprobe + */ + static int tcp_mtu_probe(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); ++ struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb, *nskb, *next; + struct net *net = sock_net(sk); +- int len; + int probe_size; + int size_needed; +- int copy; ++ int copy, len; + int mss_now; + int interval; + + /* Not currently probing/verifying, + * not in recovery, + * have enough cwnd, and +- * not SACKing (the variable headers throw things off) */ +- if (!icsk->icsk_mtup.enabled || +- icsk->icsk_mtup.probe_size || +- inet_csk(sk)->icsk_ca_state != TCP_CA_Open || +- tp->snd_cwnd < 11 || +- tp->rx_opt.num_sacks || tp->rx_opt.dsack) ++ * not SACKing (the variable headers throw things off) ++ */ ++ if (likely(!icsk->icsk_mtup.enabled || ++ icsk->icsk_mtup.probe_size || ++ inet_csk(sk)->icsk_ca_state != TCP_CA_Open || ++ tp->snd_cwnd < 11 || ++ tp->rx_opt.num_sacks || tp->rx_opt.dsack)) + return -1; + + /* Use binary search for probe_size between tcp_mss_base, diff --git a/target/linux/generic/backport-4.9/024-7-net-reorganize-struct-sock-for-better-data-locality.patch b/target/linux/generic/backport-4.9/024-7-net-reorganize-struct-sock-for-better-data-locality.patch new file mode 100644 index 0000000000..e8c1915e18 --- /dev/null +++ b/target/linux/generic/backport-4.9/024-7-net-reorganize-struct-sock-for-better-data-locality.patch @@ -0,0 +1,157 @@ +From 9115e8cd2a0c6eaaa900c462721f12e1d45f326c Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Sat, 3 Dec 2016 11:14:56 -0800 +Subject: [PATCH 07/10] net: reorganize struct sock for better data locality + +Group fields used in TX path, and keep some cache lines mostly read +to permit sharing among cpus. + +Gained two 4 bytes holes on 64bit arches. + +Added a place holder for tcp tsq_flags, next to sk_wmem_alloc +to speed up tcp_wfree() in the following patch. + +I have not added ____cacheline_aligned_in_smp, this might be done later. +I prefer doing this once inet and tcp/udp sockets reorg is also done. + +Tested with both TCP and UDP. + +UDP receiver performance under flood increased by ~20 % : +Accessing sk_filter/sk_wq/sk_napi_id no longer stalls because sk_drops +was moved away from a critical cache line, now mostly read and shared. + + /* --- cacheline 4 boundary (256 bytes) --- */ + unsigned int sk_napi_id; /* 0x100 0x4 */ + int sk_rcvbuf; /* 0x104 0x4 */ + struct sk_filter * sk_filter; /* 0x108 0x8 */ + union { + struct socket_wq * sk_wq; /* 0x8 */ + struct socket_wq * sk_wq_raw; /* 0x8 */ + }; /* 0x110 0x8 */ + struct xfrm_policy * sk_policy[2]; /* 0x118 0x10 */ + struct dst_entry * sk_rx_dst; /* 0x128 0x8 */ + struct dst_entry * sk_dst_cache; /* 0x130 0x8 */ + atomic_t sk_omem_alloc; /* 0x138 0x4 */ + int sk_sndbuf; /* 0x13c 0x4 */ + /* --- cacheline 5 boundary (320 bytes) --- */ + int sk_wmem_queued; /* 0x140 0x4 */ + atomic_t sk_wmem_alloc; /* 0x144 0x4 */ + long unsigned int sk_tsq_flags; /* 0x148 0x8 */ + struct sk_buff * sk_send_head; /* 0x150 0x8 */ + struct sk_buff_head sk_write_queue; /* 0x158 0x18 */ + __s32 sk_peek_off; /* 0x170 0x4 */ + int sk_write_pending; /* 0x174 0x4 */ + long int sk_sndtimeo; /* 0x178 0x8 */ + +Signed-off-by: Eric Dumazet +Tested-by: Paolo Abeni +Signed-off-by: David S. Miller +--- + include/net/sock.h | 51 +++++++++++++++++++++++++++------------------------ + 1 file changed, 27 insertions(+), 24 deletions(-) + +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -343,6 +343,9 @@ struct sock { + #define sk_rxhash __sk_common.skc_rxhash + + socket_lock_t sk_lock; ++ atomic_t sk_drops; ++ int sk_rcvlowat; ++ struct sk_buff_head sk_error_queue; + struct sk_buff_head sk_receive_queue; + /* + * The backlog queue is special, it is always used with +@@ -359,14 +362,13 @@ struct sock { + struct sk_buff *tail; + } sk_backlog; + #define sk_rmem_alloc sk_backlog.rmem_alloc +- int sk_forward_alloc; + +- __u32 sk_txhash; ++ int sk_forward_alloc; + #ifdef CONFIG_NET_RX_BUSY_POLL +- unsigned int sk_napi_id; + unsigned int sk_ll_usec; ++ /* ===== mostly read cache line ===== */ ++ unsigned int sk_napi_id; + #endif +- atomic_t sk_drops; + int sk_rcvbuf; + + struct sk_filter __rcu *sk_filter; +@@ -379,11 +381,30 @@ struct sock { + #endif + struct dst_entry *sk_rx_dst; + struct dst_entry __rcu *sk_dst_cache; +- /* Note: 32bit hole on 64bit arches */ +- atomic_t sk_wmem_alloc; + atomic_t sk_omem_alloc; + int sk_sndbuf; ++ ++ /* ===== cache line for TX ===== */ ++ int sk_wmem_queued; ++ atomic_t sk_wmem_alloc; ++ unsigned long sk_tsq_flags; ++ struct sk_buff *sk_send_head; + struct sk_buff_head sk_write_queue; ++ __s32 sk_peek_off; ++ int sk_write_pending; ++ long sk_sndtimeo; ++ struct timer_list sk_timer; ++ __u32 sk_priority; ++ __u32 sk_mark; ++ u32 sk_pacing_rate; /* bytes per second */ ++ u32 sk_max_pacing_rate; ++ struct page_frag sk_frag; ++ netdev_features_t sk_route_caps; ++ netdev_features_t sk_route_nocaps; ++ int sk_gso_type; ++ unsigned int sk_gso_max_size; ++ gfp_t sk_allocation; ++ __u32 sk_txhash; + + /* + * Because of non atomicity rules, all +@@ -399,41 +420,23 @@ struct sock { + #define SK_PROTOCOL_MAX U8_MAX + kmemcheck_bitfield_end(flags); + +- int sk_wmem_queued; +- gfp_t sk_allocation; +- u32 sk_pacing_rate; /* bytes per second */ +- u32 sk_max_pacing_rate; +- netdev_features_t sk_route_caps; +- netdev_features_t sk_route_nocaps; +- int sk_gso_type; +- unsigned int sk_gso_max_size; + u16 sk_gso_max_segs; +- int sk_rcvlowat; + unsigned long sk_lingertime; +- struct sk_buff_head sk_error_queue; + struct proto *sk_prot_creator; + rwlock_t sk_callback_lock; + int sk_err, + sk_err_soft; + u32 sk_ack_backlog; + u32 sk_max_ack_backlog; +- __u32 sk_priority; +- __u32 sk_mark; + struct pid *sk_peer_pid; + const struct cred *sk_peer_cred; + long sk_rcvtimeo; +- long sk_sndtimeo; +- struct timer_list sk_timer; + ktime_t sk_stamp; + u16 sk_tsflags; + u8 sk_shutdown; + u32 sk_tskey; + struct socket *sk_socket; + void *sk_user_data; +- struct page_frag sk_frag; +- struct sk_buff *sk_send_head; +- __s32 sk_peek_off; +- int sk_write_pending; + #ifdef CONFIG_SECURITY + void *sk_security; + #endif diff --git a/target/linux/generic/backport-4.9/024-8-tcp-tsq-move-tsq_flags-close-to-sk_wmem_alloc.patch b/target/linux/generic/backport-4.9/024-8-tcp-tsq-move-tsq_flags-close-to-sk_wmem_alloc.patch new file mode 100644 index 0000000000..545fe60c1c --- /dev/null +++ b/target/linux/generic/backport-4.9/024-8-tcp-tsq-move-tsq_flags-close-to-sk_wmem_alloc.patch @@ -0,0 +1,176 @@ +From 7aa5470c2c09265902b5e4289afa82e4e7c2987e Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Sat, 3 Dec 2016 11:14:57 -0800 +Subject: [PATCH 08/10] tcp: tsq: move tsq_flags close to sk_wmem_alloc + +tsq_flags being in the same cache line than sk_wmem_alloc +makes a lot of sense. Both fields are changed from tcp_wfree() +and more generally by various TSQ related functions. + +Prior patch made room in struct sock and added sk_tsq_flags, +this patch deletes tsq_flags from struct tcp_sock. + +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +--- + include/linux/tcp.h | 1 - + net/ipv4/tcp.c | 4 ++-- + net/ipv4/tcp_ipv4.c | 2 +- + net/ipv4/tcp_output.c | 24 +++++++++++------------- + net/ipv4/tcp_timer.c | 4 ++-- + net/ipv6/tcp_ipv6.c | 2 +- + 6 files changed, 17 insertions(+), 20 deletions(-) + +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -192,7 +192,6 @@ struct tcp_sock { + u32 tsoffset; /* timestamp offset */ + + struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ +- unsigned long tsq_flags; + + /* Data for direct copy to user */ + struct { +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -665,9 +665,9 @@ static void tcp_push(struct sock *sk, in + if (tcp_should_autocork(sk, skb, size_goal)) { + + /* avoid atomic op if TSQ_THROTTLED bit is already set */ +- if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { ++ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); +- set_bit(TSQ_THROTTLED, &tp->tsq_flags); ++ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); + } + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED. +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -446,7 +446,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb + if (!sock_owned_by_user(sk)) { + tcp_v4_mtu_reduced(sk); + } else { +- if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) ++ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) + sock_hold(sk); + } + goto out; +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -767,14 +767,15 @@ static void tcp_tasklet_func(unsigned lo + list_for_each_safe(q, n, &list) { + tp = list_entry(q, struct tcp_sock, tsq_node); + list_del(&tp->tsq_node); +- clear_bit(TSQ_QUEUED, &tp->tsq_flags); + + sk = (struct sock *)tp; ++ clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); ++ + if (!sk->sk_lock.owned && +- test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) { ++ test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) { + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { +- clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); ++ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); + tcp_tsq_handler(sk); + } + bh_unlock_sock(sk); +@@ -797,16 +798,15 @@ static void tcp_tasklet_func(unsigned lo + */ + void tcp_release_cb(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + unsigned long flags, nflags; + + /* perform an atomic operation only if at least one flag is set */ + do { +- flags = tp->tsq_flags; ++ flags = sk->sk_tsq_flags; + if (!(flags & TCP_DEFERRED_ALL)) + return; + nflags = flags & ~TCP_DEFERRED_ALL; +- } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); ++ } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); + + if (flags & TCPF_TSQ_DEFERRED) + tcp_tsq_handler(sk); +@@ -878,7 +878,7 @@ void tcp_wfree(struct sk_buff *skb) + if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) + goto out; + +- for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) { ++ for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) { + struct tsq_tasklet *tsq; + bool empty; + +@@ -886,7 +886,7 @@ void tcp_wfree(struct sk_buff *skb) + goto out; + + nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED; +- nval = cmpxchg(&tp->tsq_flags, oval, nval); ++ nval = cmpxchg(&sk->sk_tsq_flags, oval, nval); + if (nval != oval) + continue; + +@@ -2093,7 +2093,7 @@ static bool tcp_small_queue_check(struct + skb->prev == sk->sk_write_queue.next) + return false; + +- set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags); ++ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED, so we must + * test again the condition. +@@ -2191,8 +2191,8 @@ static bool tcp_write_xmit(struct sock * + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + break; + +- if (test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) +- clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); ++ if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) ++ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); + if (tcp_small_queue_check(sk, skb, 0)) + break; + +@@ -3495,8 +3495,6 @@ void tcp_send_ack(struct sock *sk) + /* We do not want pure acks influencing TCP Small Queues or fq/pacing + * too much. + * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 +- * We also avoid tcp_wfree() overhead (cache line miss accessing +- * tp->tsq_flags) by using regular sock_wfree() + */ + skb_set_tcp_pure_ack(buff); + +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -311,7 +311,7 @@ static void tcp_delack_timer(unsigned lo + inet_csk(sk)->icsk_ack.blocked = 1; + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + /* deleguate our work to tcp_release_cb() */ +- if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) ++ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) + sock_hold(sk); + } + bh_unlock_sock(sk); +@@ -594,7 +594,7 @@ static void tcp_write_timer(unsigned lon + tcp_write_timer_handler(sk); + } else { + /* delegate our work to tcp_release_cb() */ +- if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) ++ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags)) + sock_hold(sk); + } + bh_unlock_sock(sk); +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -404,7 +404,7 @@ static void tcp_v6_err(struct sk_buff *s + if (!sock_owned_by_user(sk)) + tcp_v6_mtu_reduced(sk); + else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, +- &tp->tsq_flags)) ++ &sk->sk_tsq_flags)) + sock_hold(sk); + goto out; + } diff --git a/target/linux/generic/backport-4.9/024-9-tcp-add-a-missing-barrier-in-tcp_tasklet_func.patch b/target/linux/generic/backport-4.9/024-9-tcp-add-a-missing-barrier-in-tcp_tasklet_func.patch new file mode 100644 index 0000000000..d2b8de6a04 --- /dev/null +++ b/target/linux/generic/backport-4.9/024-9-tcp-add-a-missing-barrier-in-tcp_tasklet_func.patch @@ -0,0 +1,40 @@ +From 0a9648f1293966c838dc570da73c15a76f4c89d6 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Wed, 21 Dec 2016 05:42:43 -0800 +Subject: [PATCH 09/10] tcp: add a missing barrier in tcp_tasklet_func() + +Madalin reported crashes happening in tcp_tasklet_func() on powerpc64 + +Before TSQ_QUEUED bit is cleared, we must ensure the changes done +by list_del(&tp->tsq_node); are committed to memory, otherwise +corruption might happen, as an other cpu could catch TSQ_QUEUED +clearance too soon. + +We can notice that old kernels were immune to this bug, because +TSQ_QUEUED was cleared after a bh_lock_sock(sk)/bh_unlock_sock(sk) +section, but they could have missed a kick to write additional bytes, +when NIC interrupts for a given flow are spread to multiple cpus. + +Affected TCP flows would need an incoming ACK or RTO timer to add more +packets to the pipe. So overall situation should be better now. + +Fixes: b223feb9de2a ("tcp: tsq: add shortcut in tcp_tasklet_func()") +Signed-off-by: Eric Dumazet +Reported-by: Madalin Bucur +Tested-by: Madalin Bucur +Tested-by: Xing Lei +Signed-off-by: David S. Miller +--- + net/ipv4/tcp_output.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -769,6 +769,7 @@ static void tcp_tasklet_func(unsigned lo + list_del(&tp->tsq_node); + + sk = (struct sock *)tp; ++ smp_mb__before_atomic(); + clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); + + if (!sk->sk_lock.owned &&