generic: net: tcp: backport tcp tx performance patches
author Pavel Kubelun <be.dissent@gmail.com>
Tue, 25 Jul 2017 07:57:31 +0000 (03:57 -0400)
committer Stijn Tintel <stijn@linux-ipv6.be>
Sun, 8 Oct 2017 12:19:07 +0000 (15:19 +0300)
An overall throughput gain of 22 % for heavy TCP use over a single TX queue.

Original patchset comment
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v4.13&id=3f4888adae7c1619b990d98a9b967536f71822b8

Signed-off-by: Pavel Kubelun <be.dissent@gmail.com>
target/linux/generic/backport-4.9/024-1-tcp-tsq-add-tsq_flags-tsq_enum.patch [new file with mode: 0644]
target/linux/generic/backport-4.9/024-2-tcp-tsq-remove-one-locked-operation-in-tcp_wfree.patch [new file with mode: 0644]
target/linux/generic/backport-4.9/024-3-tcp-tsq-add-shortcut-in-tcp_tasklet_func.patch [new file with mode: 0644]
target/linux/generic/backport-4.9/024-4-tcp-tsq-avoid-one-atomic-in-tcp_wfree.patch [new file with mode: 0644]
target/linux/generic/backport-4.9/024-5-tcp-tsq-add-a-shortcut-in-tcp_small_queue_check.patch [new file with mode: 0644]
target/linux/generic/backport-4.9/024-6-tcp-tcp_mtu_probe-is-likely-to-exit-early.patch [new file with mode: 0644]
target/linux/generic/backport-4.9/024-7-net-reorganize-struct-sock-for-better-data-locality.patch [new file with mode: 0644]
target/linux/generic/backport-4.9/024-8-tcp-tsq-move-tsq_flags-close-to-sk_wmem_alloc.patch [new file with mode: 0644]
target/linux/generic/backport-4.9/024-9-tcp-add-a-missing-barrier-in-tcp_tasklet_func.patch [new file with mode: 0644]

diff --git a/target/linux/generic/backport-4.9/024-1-tcp-tsq-add-tsq_flags-tsq_enum.patch b/target/linux/generic/backport-4.9/024-1-tcp-tsq-add-tsq_flags-tsq_enum.patch
new file mode 100644 (file)
index 0000000..2031149
--- /dev/null
@@ -0,0 +1,90 @@
+From 40fc3423b983b864bf70b03199191260ae9b2ea6 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 3 Dec 2016 11:14:50 -0800
+Subject: [PATCH 01/10] tcp: tsq: add tsq_flags / tsq_enum
+
+This is a cleanup, to ease code review of following patches.
+
+Old 'enum tsq_flags' is renamed, and a new enumeration is added
+with the flags used in cmpxchg() operations as opposed to
+single bit operations.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+---
+ include/linux/tcp.h   | 11 ++++++++++-
+ net/ipv4/tcp_output.c | 16 ++++++++--------
+ 2 files changed, 18 insertions(+), 9 deletions(-)
+
+--- a/include/linux/tcp.h
++++ b/include/linux/tcp.h
+@@ -367,7 +367,7 @@ struct tcp_sock {
+       u32     *saved_syn;
+ };
+-enum tsq_flags {
++enum tsq_enum {
+       TSQ_THROTTLED,
+       TSQ_QUEUED,
+       TCP_TSQ_DEFERRED,          /* tcp_tasklet_func() found socket was owned */
+@@ -378,6 +378,15 @@ enum tsq_flags {
+                                   */
+ };
++enum tsq_flags {
++      TSQF_THROTTLED                  = (1UL << TSQ_THROTTLED),
++      TSQF_QUEUED                     = (1UL << TSQ_QUEUED),
++      TCPF_TSQ_DEFERRED               = (1UL << TCP_TSQ_DEFERRED),
++      TCPF_WRITE_TIMER_DEFERRED       = (1UL << TCP_WRITE_TIMER_DEFERRED),
++      TCPF_DELACK_TIMER_DEFERRED      = (1UL << TCP_DELACK_TIMER_DEFERRED),
++      TCPF_MTU_REDUCED_DEFERRED       = (1UL << TCP_MTU_REDUCED_DEFERRED),
++};
++
+ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
+ {
+       return (struct tcp_sock *)sk;
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -784,10 +784,10 @@ static void tcp_tasklet_func(unsigned lo
+       }
+ }
+-#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |         \
+-                        (1UL << TCP_WRITE_TIMER_DEFERRED) |   \
+-                        (1UL << TCP_DELACK_TIMER_DEFERRED) |  \
+-                        (1UL << TCP_MTU_REDUCED_DEFERRED))
++#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |         \
++                        TCPF_WRITE_TIMER_DEFERRED |   \
++                        TCPF_DELACK_TIMER_DEFERRED |  \
++                        TCPF_MTU_REDUCED_DEFERRED)
+ /**
+  * tcp_release_cb - tcp release_sock() callback
+  * @sk: socket
+@@ -808,7 +808,7 @@ void tcp_release_cb(struct sock *sk)
+               nflags = flags & ~TCP_DEFERRED_ALL;
+       } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+-      if (flags & (1UL << TCP_TSQ_DEFERRED))
++      if (flags & TCPF_TSQ_DEFERRED)
+               tcp_tsq_handler(sk);
+       /* Here begins the tricky part :
+@@ -822,15 +822,15 @@ void tcp_release_cb(struct sock *sk)
+        */
+       sock_release_ownership(sk);
+-      if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
++      if (flags & TCPF_WRITE_TIMER_DEFERRED) {
+               tcp_write_timer_handler(sk);
+               __sock_put(sk);
+       }
+-      if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
++      if (flags & TCPF_DELACK_TIMER_DEFERRED) {
+               tcp_delack_timer_handler(sk);
+               __sock_put(sk);
+       }
+-      if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
++      if (flags & TCPF_MTU_REDUCED_DEFERRED) {
+               inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
+               __sock_put(sk);
+       }
diff --git a/target/linux/generic/backport-4.9/024-2-tcp-tsq-remove-one-locked-operation-in-tcp_wfree.patch b/target/linux/generic/backport-4.9/024-2-tcp-tsq-remove-one-locked-operation-in-tcp_wfree.patch
new file mode 100644 (file)
index 0000000..914be60
--- /dev/null
@@ -0,0 +1,48 @@
+From 408f0a6c21e124cc4f6c7aa370b38aa47e55428d Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 3 Dec 2016 11:14:51 -0800
+Subject: [PATCH 02/10] tcp: tsq: remove one locked operation in tcp_wfree()
+
+Instead of atomically clearing TSQ_THROTTLED and atomically setting TSQ_QUEUED
+bits, use one cmpxchg() to perform a single locked operation.
+
+Since the following patch will also set TCP_TSQ_DEFERRED here,
+this cmpxchg() will make this addition free.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+---
+ net/ipv4/tcp_output.c | 13 ++++++++++---
+ 1 file changed, 10 insertions(+), 3 deletions(-)
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -860,6 +860,7 @@ void tcp_wfree(struct sk_buff *skb)
+ {
+       struct sock *sk = skb->sk;
+       struct tcp_sock *tp = tcp_sk(sk);
++      unsigned long flags, nval, oval;
+       int wmem;
+       /* Keep one reference on sk_wmem_alloc.
+@@ -877,11 +878,17 @@ void tcp_wfree(struct sk_buff *skb)
+       if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+               goto out;
+-      if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+-          !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+-              unsigned long flags;
++      for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
+               struct tsq_tasklet *tsq;
++              if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
++                      goto out;
++
++              nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
++              nval = cmpxchg(&tp->tsq_flags, oval, nval);
++              if (nval != oval)
++                      continue;
++
+               /* queue this socket to tasklet queue */
+               local_irq_save(flags);
+               tsq = this_cpu_ptr(&tsq_tasklet);
diff --git a/target/linux/generic/backport-4.9/024-3-tcp-tsq-add-shortcut-in-tcp_tasklet_func.patch b/target/linux/generic/backport-4.9/024-3-tcp-tsq-add-shortcut-in-tcp_tasklet_func.patch
new file mode 100644 (file)
index 0000000..8f45c23
--- /dev/null
@@ -0,0 +1,71 @@
+From b223feb9de2a65c533ff95c08e834fa732906ea5 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 3 Dec 2016 11:14:52 -0800
+Subject: [PATCH 03/10] tcp: tsq: add shortcut in tcp_tasklet_func()
+
+Under high stress, I've seen tcp_tasklet_func() consuming
+~700 usec, handling ~150 tcp sockets.
+
+By setting TCP_TSQ_DEFERRED in tcp_wfree(), we give a chance
+for other cpus/threads entering tcp_write_xmit() to grab it,
+allowing tcp_tasklet_func() to skip sockets that already did
+an xmit cycle.
+
+In the future, we might give ACK processing an increased
+budget to reduce tcp_tasklet_func()'s amount of work even more.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+---
+ net/ipv4/tcp_output.c | 22 ++++++++++++----------
+ 1 file changed, 12 insertions(+), 10 deletions(-)
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -767,19 +767,19 @@ static void tcp_tasklet_func(unsigned lo
+       list_for_each_safe(q, n, &list) {
+               tp = list_entry(q, struct tcp_sock, tsq_node);
+               list_del(&tp->tsq_node);
++              clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+               sk = (struct sock *)tp;
+-              bh_lock_sock(sk);
+-
+-              if (!sock_owned_by_user(sk)) {
+-                      tcp_tsq_handler(sk);
+-              } else {
+-                      /* defer the work to tcp_release_cb() */
+-                      set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
++              if (!sk->sk_lock.owned &&
++                  test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) {
++                      bh_lock_sock(sk);
++                      if (!sock_owned_by_user(sk)) {
++                              clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
++                              tcp_tsq_handler(sk);
++                      }
++                      bh_unlock_sock(sk);
+               }
+-              bh_unlock_sock(sk);
+-              clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+               sk_free(sk);
+       }
+ }
+@@ -884,7 +884,7 @@ void tcp_wfree(struct sk_buff *skb)
+               if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+                       goto out;
+-              nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
++              nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+               nval = cmpxchg(&tp->tsq_flags, oval, nval);
+               if (nval != oval)
+                       continue;
+@@ -2179,6 +2179,8 @@ static bool tcp_write_xmit(struct sock *
+                   unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+                       break;
++              if (test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags))
++                      clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+               if (tcp_small_queue_check(sk, skb, 0))
+                       break;
diff --git a/target/linux/generic/backport-4.9/024-4-tcp-tsq-avoid-one-atomic-in-tcp_wfree.patch b/target/linux/generic/backport-4.9/024-4-tcp-tsq-avoid-one-atomic-in-tcp_wfree.patch
new file mode 100644 (file)
index 0000000..a25cdb5
--- /dev/null
@@ -0,0 +1,38 @@
+From a9b204d1564702b704ad6fe74f10a102c7b87ba3 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 3 Dec 2016 11:14:53 -0800
+Subject: [PATCH 04/10] tcp: tsq: avoid one atomic in tcp_wfree()
+
+Under high load, tcp_wfree() has an atomic operation trying
+to schedule a tasklet over and over.
+
+We can schedule it only if our per cpu list was empty.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+---
+ net/ipv4/tcp_output.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -880,6 +880,7 @@ void tcp_wfree(struct sk_buff *skb)
+       for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
+               struct tsq_tasklet *tsq;
++              bool empty;
+               if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+                       goto out;
+@@ -892,8 +893,10 @@ void tcp_wfree(struct sk_buff *skb)
+               /* queue this socket to tasklet queue */
+               local_irq_save(flags);
+               tsq = this_cpu_ptr(&tsq_tasklet);
++              empty = list_empty(&tsq->head);
+               list_add(&tp->tsq_node, &tsq->head);
+-              tasklet_schedule(&tsq->tasklet);
++              if (empty)
++                      tasklet_schedule(&tsq->tasklet);
+               local_irq_restore(flags);
+               return;
+       }
diff --git a/target/linux/generic/backport-4.9/024-5-tcp-tsq-add-a-shortcut-in-tcp_small_queue_check.patch b/target/linux/generic/backport-4.9/024-5-tcp-tsq-add-a-shortcut-in-tcp_small_queue_check.patch
new file mode 100644 (file)
index 0000000..dd1a37e
--- /dev/null
@@ -0,0 +1,37 @@
+From 75eefc6c59fd2c5f1ab95a3a113c217237d12a31 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 3 Dec 2016 11:14:54 -0800
+Subject: [PATCH 05/10] tcp: tsq: add a shortcut in tcp_small_queue_check()
+
+Always allow the two first skbs in write queue to be sent,
+regardless of sk_wmem_alloc/sk_pacing_rate values.
+
+This helps a lot in situations where TX completions are delayed either
+because of driver latencies or softirq latencies.
+
+Test is done with no cache line misses.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+---
+ net/ipv4/tcp_output.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -2084,6 +2084,15 @@ static bool tcp_small_queue_check(struct
+       limit <<= factor;
+       if (atomic_read(&sk->sk_wmem_alloc) > limit) {
++              /* Always send the 1st or 2nd skb in write queue.
++               * No need to wait for TX completion to call us back,
++               * after softirq/tasklet schedule.
++               * This helps when TX completions are delayed too much.
++               */
++              if (skb == sk->sk_write_queue.next ||
++                  skb->prev == sk->sk_write_queue.next)
++                      return false;
++
+               set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+               /* It is possible TX completion already happened
+                * before we set TSQ_THROTTLED, so we must
diff --git a/target/linux/generic/backport-4.9/024-6-tcp-tcp_mtu_probe-is-likely-to-exit-early.patch b/target/linux/generic/backport-4.9/024-6-tcp-tcp_mtu_probe-is-likely-to-exit-early.patch
new file mode 100644 (file)
index 0000000..f0ae2a9
--- /dev/null
@@ -0,0 +1,55 @@
+From 12a59abc22d6664f7d3944f625ceefee92de8820 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 3 Dec 2016 11:14:55 -0800
+Subject: [PATCH 06/10] tcp: tcp_mtu_probe() is likely to exit early
+
+Adding a likely() in tcp_mtu_probe() moves its code which used to
+be inlined in front of tcp_write_xmit()
+
+We still have a cache line miss to access icsk->icsk_mtup.enabled,
+we will probably have to reorganize fields to help data locality.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+---
+ net/ipv4/tcp_output.c | 18 +++++++++---------
+ 1 file changed, 9 insertions(+), 9 deletions(-)
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -1925,26 +1925,26 @@ static inline void tcp_mtu_check_reprobe
+  */
+ static int tcp_mtu_probe(struct sock *sk)
+ {
+-      struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
++      struct tcp_sock *tp = tcp_sk(sk);
+       struct sk_buff *skb, *nskb, *next;
+       struct net *net = sock_net(sk);
+-      int len;
+       int probe_size;
+       int size_needed;
+-      int copy;
++      int copy, len;
+       int mss_now;
+       int interval;
+       /* Not currently probing/verifying,
+        * not in recovery,
+        * have enough cwnd, and
+-       * not SACKing (the variable headers throw things off) */
+-      if (!icsk->icsk_mtup.enabled ||
+-          icsk->icsk_mtup.probe_size ||
+-          inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+-          tp->snd_cwnd < 11 ||
+-          tp->rx_opt.num_sacks || tp->rx_opt.dsack)
++       * not SACKing (the variable headers throw things off)
++       */
++      if (likely(!icsk->icsk_mtup.enabled ||
++                 icsk->icsk_mtup.probe_size ||
++                 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
++                 tp->snd_cwnd < 11 ||
++                 tp->rx_opt.num_sacks || tp->rx_opt.dsack))
+               return -1;
+       /* Use binary search for probe_size between tcp_mss_base,
diff --git a/target/linux/generic/backport-4.9/024-7-net-reorganize-struct-sock-for-better-data-locality.patch b/target/linux/generic/backport-4.9/024-7-net-reorganize-struct-sock-for-better-data-locality.patch
new file mode 100644 (file)
index 0000000..e8c1915
--- /dev/null
@@ -0,0 +1,157 @@
+From 9115e8cd2a0c6eaaa900c462721f12e1d45f326c Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 3 Dec 2016 11:14:56 -0800
+Subject: [PATCH 07/10] net: reorganize struct sock for better data locality
+
+Group fields used in TX path, and keep some cache lines mostly read
+to permit sharing among cpus.
+
+Gained two 4-byte holes on 64-bit arches.
+
+Added a place holder for tcp tsq_flags, next to sk_wmem_alloc
+to speed up tcp_wfree() in the following patch.
+
+I have not added ____cacheline_aligned_in_smp, this might be done later.
+I prefer doing this once inet and tcp/udp sockets reorg is also done.
+
+Tested with both TCP and UDP.
+
+UDP receiver performance under flood increased by ~20 % :
+Accessing sk_filter/sk_wq/sk_napi_id no longer stalls because sk_drops
+was moved away from a critical cache line, now mostly read and shared.
+
+       /* --- cacheline 4 boundary (256 bytes) --- */
+       unsigned int               sk_napi_id;           /* 0x100   0x4 */
+       int                        sk_rcvbuf;            /* 0x104   0x4 */
+       struct sk_filter *         sk_filter;            /* 0x108   0x8 */
+       union {
+               struct socket_wq * sk_wq;                /*         0x8 */
+               struct socket_wq * sk_wq_raw;            /*         0x8 */
+       };                                               /* 0x110   0x8 */
+       struct xfrm_policy *       sk_policy[2];         /* 0x118  0x10 */
+       struct dst_entry *         sk_rx_dst;            /* 0x128   0x8 */
+       struct dst_entry *         sk_dst_cache;         /* 0x130   0x8 */
+       atomic_t                   sk_omem_alloc;        /* 0x138   0x4 */
+       int                        sk_sndbuf;            /* 0x13c   0x4 */
+       /* --- cacheline 5 boundary (320 bytes) --- */
+       int                        sk_wmem_queued;       /* 0x140   0x4 */
+       atomic_t                   sk_wmem_alloc;        /* 0x144   0x4 */
+       long unsigned int          sk_tsq_flags;         /* 0x148   0x8 */
+       struct sk_buff *           sk_send_head;         /* 0x150   0x8 */
+       struct sk_buff_head        sk_write_queue;       /* 0x158  0x18 */
+       __s32                      sk_peek_off;          /* 0x170   0x4 */
+       int                        sk_write_pending;     /* 0x174   0x4 */
+       long int                   sk_sndtimeo;          /* 0x178   0x8 */
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Tested-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+---
+ include/net/sock.h | 51 +++++++++++++++++++++++++++------------------------
+ 1 file changed, 27 insertions(+), 24 deletions(-)
+
+--- a/include/net/sock.h
++++ b/include/net/sock.h
+@@ -343,6 +343,9 @@ struct sock {
+ #define sk_rxhash             __sk_common.skc_rxhash
+       socket_lock_t           sk_lock;
++      atomic_t                sk_drops;
++      int                     sk_rcvlowat;
++      struct sk_buff_head     sk_error_queue;
+       struct sk_buff_head     sk_receive_queue;
+       /*
+        * The backlog queue is special, it is always used with
+@@ -359,14 +362,13 @@ struct sock {
+               struct sk_buff  *tail;
+       } sk_backlog;
+ #define sk_rmem_alloc sk_backlog.rmem_alloc
+-      int                     sk_forward_alloc;
+-      __u32                   sk_txhash;
++      int                     sk_forward_alloc;
+ #ifdef CONFIG_NET_RX_BUSY_POLL
+-      unsigned int            sk_napi_id;
+       unsigned int            sk_ll_usec;
++      /* ===== mostly read cache line ===== */
++      unsigned int            sk_napi_id;
+ #endif
+-      atomic_t                sk_drops;
+       int                     sk_rcvbuf;
+       struct sk_filter __rcu  *sk_filter;
+@@ -379,11 +381,30 @@ struct sock {
+ #endif
+       struct dst_entry        *sk_rx_dst;
+       struct dst_entry __rcu  *sk_dst_cache;
+-      /* Note: 32bit hole on 64bit arches */
+-      atomic_t                sk_wmem_alloc;
+       atomic_t                sk_omem_alloc;
+       int                     sk_sndbuf;
++
++      /* ===== cache line for TX ===== */
++      int                     sk_wmem_queued;
++      atomic_t                sk_wmem_alloc;
++      unsigned long           sk_tsq_flags;
++      struct sk_buff          *sk_send_head;
+       struct sk_buff_head     sk_write_queue;
++      __s32                   sk_peek_off;
++      int                     sk_write_pending;
++      long                    sk_sndtimeo;
++      struct timer_list       sk_timer;
++      __u32                   sk_priority;
++      __u32                   sk_mark;
++      u32                     sk_pacing_rate; /* bytes per second */
++      u32                     sk_max_pacing_rate;
++      struct page_frag        sk_frag;
++      netdev_features_t       sk_route_caps;
++      netdev_features_t       sk_route_nocaps;
++      int                     sk_gso_type;
++      unsigned int            sk_gso_max_size;
++      gfp_t                   sk_allocation;
++      __u32                   sk_txhash;
+       /*
+        * Because of non atomicity rules, all
+@@ -399,41 +420,23 @@ struct sock {
+ #define SK_PROTOCOL_MAX U8_MAX
+       kmemcheck_bitfield_end(flags);
+-      int                     sk_wmem_queued;
+-      gfp_t                   sk_allocation;
+-      u32                     sk_pacing_rate; /* bytes per second */
+-      u32                     sk_max_pacing_rate;
+-      netdev_features_t       sk_route_caps;
+-      netdev_features_t       sk_route_nocaps;
+-      int                     sk_gso_type;
+-      unsigned int            sk_gso_max_size;
+       u16                     sk_gso_max_segs;
+-      int                     sk_rcvlowat;
+       unsigned long           sk_lingertime;
+-      struct sk_buff_head     sk_error_queue;
+       struct proto            *sk_prot_creator;
+       rwlock_t                sk_callback_lock;
+       int                     sk_err,
+                               sk_err_soft;
+       u32                     sk_ack_backlog;
+       u32                     sk_max_ack_backlog;
+-      __u32                   sk_priority;
+-      __u32                   sk_mark;
+       struct pid              *sk_peer_pid;
+       const struct cred       *sk_peer_cred;
+       long                    sk_rcvtimeo;
+-      long                    sk_sndtimeo;
+-      struct timer_list       sk_timer;
+       ktime_t                 sk_stamp;
+       u16                     sk_tsflags;
+       u8                      sk_shutdown;
+       u32                     sk_tskey;
+       struct socket           *sk_socket;
+       void                    *sk_user_data;
+-      struct page_frag        sk_frag;
+-      struct sk_buff          *sk_send_head;
+-      __s32                   sk_peek_off;
+-      int                     sk_write_pending;
+ #ifdef CONFIG_SECURITY
+       void                    *sk_security;
+ #endif
diff --git a/target/linux/generic/backport-4.9/024-8-tcp-tsq-move-tsq_flags-close-to-sk_wmem_alloc.patch b/target/linux/generic/backport-4.9/024-8-tcp-tsq-move-tsq_flags-close-to-sk_wmem_alloc.patch
new file mode 100644 (file)
index 0000000..545fe60
--- /dev/null
@@ -0,0 +1,176 @@
+From 7aa5470c2c09265902b5e4289afa82e4e7c2987e Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 3 Dec 2016 11:14:57 -0800
+Subject: [PATCH 08/10] tcp: tsq: move tsq_flags close to sk_wmem_alloc
+
+tsq_flags being in the same cache line as sk_wmem_alloc
+makes a lot of sense. Both fields are changed from tcp_wfree()
+and more generally by various TSQ related functions.
+
+Prior patch made room in struct sock and added sk_tsq_flags,
+this patch deletes tsq_flags from struct tcp_sock.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+---
+ include/linux/tcp.h   |  1 -
+ net/ipv4/tcp.c        |  4 ++--
+ net/ipv4/tcp_ipv4.c   |  2 +-
+ net/ipv4/tcp_output.c | 24 +++++++++++-------------
+ net/ipv4/tcp_timer.c  |  4 ++--
+ net/ipv6/tcp_ipv6.c   |  2 +-
+ 6 files changed, 17 insertions(+), 20 deletions(-)
+
+--- a/include/linux/tcp.h
++++ b/include/linux/tcp.h
+@@ -192,7 +192,6 @@ struct tcp_sock {
+       u32     tsoffset;       /* timestamp offset */
+       struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
+-      unsigned long   tsq_flags;
+       /* Data for direct copy to user */
+       struct {
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -665,9 +665,9 @@ static void tcp_push(struct sock *sk, in
+       if (tcp_should_autocork(sk, skb, size_goal)) {
+               /* avoid atomic op if TSQ_THROTTLED bit is already set */
+-              if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
++              if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
+                       NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
+-                      set_bit(TSQ_THROTTLED, &tp->tsq_flags);
++                      set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+               }
+               /* It is possible TX completion already happened
+                * before we set TSQ_THROTTLED.
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -446,7 +446,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb
+                       if (!sock_owned_by_user(sk)) {
+                               tcp_v4_mtu_reduced(sk);
+                       } else {
+-                              if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
++                              if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
+                                       sock_hold(sk);
+                       }
+                       goto out;
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -767,14 +767,15 @@ static void tcp_tasklet_func(unsigned lo
+       list_for_each_safe(q, n, &list) {
+               tp = list_entry(q, struct tcp_sock, tsq_node);
+               list_del(&tp->tsq_node);
+-              clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+               sk = (struct sock *)tp;
++              clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
++
+               if (!sk->sk_lock.owned &&
+-                  test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) {
++                  test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
+                       bh_lock_sock(sk);
+                       if (!sock_owned_by_user(sk)) {
+-                              clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
++                              clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
+                               tcp_tsq_handler(sk);
+                       }
+                       bh_unlock_sock(sk);
+@@ -797,16 +798,15 @@ static void tcp_tasklet_func(unsigned lo
+  */
+ void tcp_release_cb(struct sock *sk)
+ {
+-      struct tcp_sock *tp = tcp_sk(sk);
+       unsigned long flags, nflags;
+       /* perform an atomic operation only if at least one flag is set */
+       do {
+-              flags = tp->tsq_flags;
++              flags = sk->sk_tsq_flags;
+               if (!(flags & TCP_DEFERRED_ALL))
+                       return;
+               nflags = flags & ~TCP_DEFERRED_ALL;
+-      } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
++      } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
+       if (flags & TCPF_TSQ_DEFERRED)
+               tcp_tsq_handler(sk);
+@@ -878,7 +878,7 @@ void tcp_wfree(struct sk_buff *skb)
+       if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+               goto out;
+-      for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
++      for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
+               struct tsq_tasklet *tsq;
+               bool empty;
+@@ -886,7 +886,7 @@ void tcp_wfree(struct sk_buff *skb)
+                       goto out;
+               nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+-              nval = cmpxchg(&tp->tsq_flags, oval, nval);
++              nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+               if (nval != oval)
+                       continue;
+@@ -2093,7 +2093,7 @@ static bool tcp_small_queue_check(struct
+                   skb->prev == sk->sk_write_queue.next)
+                       return false;
+-              set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
++              set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+               /* It is possible TX completion already happened
+                * before we set TSQ_THROTTLED, so we must
+                * test again the condition.
+@@ -2191,8 +2191,8 @@ static bool tcp_write_xmit(struct sock *
+                   unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+                       break;
+-              if (test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags))
+-                      clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
++              if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
++                      clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
+               if (tcp_small_queue_check(sk, skb, 0))
+                       break;
+@@ -3495,8 +3495,6 @@ void tcp_send_ack(struct sock *sk)
+       /* We do not want pure acks influencing TCP Small Queues or fq/pacing
+        * too much.
+        * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
+-       * We also avoid tcp_wfree() overhead (cache line miss accessing
+-       * tp->tsq_flags) by using regular sock_wfree()
+        */
+       skb_set_tcp_pure_ack(buff);
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -311,7 +311,7 @@ static void tcp_delack_timer(unsigned lo
+               inet_csk(sk)->icsk_ack.blocked = 1;
+               __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+               /* deleguate our work to tcp_release_cb() */
+-              if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
++              if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
+                       sock_hold(sk);
+       }
+       bh_unlock_sock(sk);
+@@ -594,7 +594,7 @@ static void tcp_write_timer(unsigned lon
+               tcp_write_timer_handler(sk);
+       } else {
+               /* delegate our work to tcp_release_cb() */
+-              if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
++              if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
+                       sock_hold(sk);
+       }
+       bh_unlock_sock(sk);
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -404,7 +404,7 @@ static void tcp_v6_err(struct sk_buff *s
+               if (!sock_owned_by_user(sk))
+                       tcp_v6_mtu_reduced(sk);
+               else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
+-                                         &tp->tsq_flags))
++                                         &sk->sk_tsq_flags))
+                       sock_hold(sk);
+               goto out;
+       }
diff --git a/target/linux/generic/backport-4.9/024-9-tcp-add-a-missing-barrier-in-tcp_tasklet_func.patch b/target/linux/generic/backport-4.9/024-9-tcp-add-a-missing-barrier-in-tcp_tasklet_func.patch
new file mode 100644 (file)
index 0000000..d2b8de6
--- /dev/null
@@ -0,0 +1,40 @@
+From 0a9648f1293966c838dc570da73c15a76f4c89d6 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 21 Dec 2016 05:42:43 -0800
+Subject: [PATCH 09/10] tcp: add a missing barrier in tcp_tasklet_func()
+
+Madalin reported crashes happening in tcp_tasklet_func() on powerpc64
+
+Before TSQ_QUEUED bit is cleared, we must ensure the changes done
+by list_del(&tp->tsq_node); are committed to memory, otherwise
+corruption might happen, as another cpu could catch TSQ_QUEUED
+clearance too soon.
+
+We can notice that old kernels were immune to this bug, because
+TSQ_QUEUED was cleared after a bh_lock_sock(sk)/bh_unlock_sock(sk)
+section, but they could have missed a kick to write additional bytes,
+when NIC interrupts for a given flow are spread to multiple cpus.
+
+Affected TCP flows would need an incoming ACK or RTO timer to add more
+packets to the pipe. So overall situation should be better now.
+
+Fixes: b223feb9de2a ("tcp: tsq: add shortcut in tcp_tasklet_func()")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Madalin Bucur <madalin.bucur@nxp.com>
+Tested-by: Madalin Bucur <madalin.bucur@nxp.com>
+Tested-by: Xing Lei <xing.lei@nxp.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+---
+ net/ipv4/tcp_output.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -769,6 +769,7 @@ static void tcp_tasklet_func(unsigned lo
+               list_del(&tp->tsq_node);
+               sk = (struct sock *)tp;
++              smp_mb__before_atomic();
+               clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+               if (!sk->sk_lock.owned &&