From 39771b127b412377d6354893c7d43ee8f2edecfd Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Sat, 2 Apr 2016 23:08:06 -0400
Subject: sock: break up sock_cmsg_snd into __sock_cmsg_snd and loop

To process cmsg's of the SOL_SOCKET level in addition to cmsgs of
another level, protocols can call sock_cmsg_send(). This causes a
double walk on the cmsghdr list, one for SOL_SOCKET and one for the
other level.

Extract the inner demultiplex logic from the loop that walks the list,
to allow having this called directly from a walker in the protocol
specific code.

Signed-off-by: Willem de Bruijn
Signed-off-by: Soheil Hassas Yeganeh
Signed-off-by: David S. Miller
---
 net/core/sock.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index b67b9aedb230..66976f88566b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1866,27 +1866,38 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 }
 EXPORT_SYMBOL(sock_alloc_send_skb);
 
+int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
+		     struct sockcm_cookie *sockc)
+{
+	switch (cmsg->cmsg_type) {
+	case SO_MARK:
+		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+			return -EPERM;
+		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+			return -EINVAL;
+		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(__sock_cmsg_send);
+
 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
 		   struct sockcm_cookie *sockc)
 {
 	struct cmsghdr *cmsg;
+	int ret;
 
 	for_each_cmsghdr(cmsg, msg) {
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
 		if (cmsg->cmsg_level != SOL_SOCKET)
 			continue;
-		switch (cmsg->cmsg_type) {
-		case SO_MARK:
-			if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
-				return -EPERM;
-			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
-				return -EINVAL;
-			sockc->mark = *(u32 *)CMSG_DATA(cmsg);
-			break;
-		default:
-			return -EINVAL;
-		}
+		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
+		if (ret)
+			return ret;
 	}
 	return 0;
 }
-- cgit
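For reference, the switch above is reached when a sender attaches a SOL_SOCKET control message to sendmsg(). The sketch below is illustrative only: it is not part of the patch, it assumes a protocol whose sendmsg path feeds its cmsg list to sock_cmsg_send(), and send_with_mark() is a hypothetical helper name.

/* Hypothetical userspace helper: attach SO_MARK to one sendmsg() call.
 * Mirrors the kernel checks above: exactly a u32 payload, and the
 * caller needs CAP_NET_ADMIN in the socket's network namespace.
 */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t send_with_mark(int fd, void *buf, size_t len,
			      struct sockaddr *dst, socklen_t dlen,
			      uint32_t mark)
{
	union {				/* force cmsghdr alignment */
		char buf[CMSG_SPACE(sizeof(uint32_t))];
		struct cmsghdr align;
	} u = { 0 };
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_name = dst, .msg_namelen = dlen,
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type  = SO_MARK;
	cm->cmsg_len   = CMSG_LEN(sizeof(mark));
	memcpy(CMSG_DATA(cm), &mark, sizeof(mark));

	return sendmsg(fd, &msg, 0);
}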
From 6db8b963a7a31047573f229492ff6fc0f51cc377 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh
Date: Sat, 2 Apr 2016 23:08:07 -0400
Subject: tcp: accept SOF_TIMESTAMPING_OPT_ID for passive TFO

SOF_TIMESTAMPING_OPT_ID is set to get data-independent IDs to
associate timestamps with send calls. For TCP connections,
tp->snd_una is used as the starting point to calculate relative IDs.

This socket option will fail if set before the handshake on a passive
TCP fast open connection with data in SYN or SYN/ACK, since setsockopt
requires the connection to be in the ESTABLISHED state.

To address this, instead of limiting the option to the ESTABLISHED
state, accept the SOF_TIMESTAMPING_OPT_ID option as long as the
connection is not in LISTEN or CLOSE states.

Signed-off-by: Soheil Hassas Yeganeh
Acked-by: Willem de Bruijn
Acked-by: Yuchung Cheng
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/sock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 66976f88566b..0a64fe20ce5a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -832,7 +832,8 @@ set_rcvbuf:
 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 			if (sk->sk_protocol == IPPROTO_TCP &&
 			    sk->sk_type == SOCK_STREAM) {
-				if (sk->sk_state != TCP_ESTABLISHED) {
+				if ((1 << sk->sk_state) &
+				    (TCPF_CLOSE | TCPF_LISTEN)) {
 					ret = -EINVAL;
 					break;
 				}
-- cgit

From 3dd17e63f5131bf2528f34aa5e3e57758175af92 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh
Date: Sat, 2 Apr 2016 23:08:09 -0400
Subject: sock: accept SO_TIMESTAMPING flags in socket cmsg

Accept SO_TIMESTAMPING in control messages of the SOL_SOCKET level
as a basis to accept timestamping requests per write.

This implementation only accepts TX recording flags (i.e.,
SOF_TIMESTAMPING_TX_HARDWARE, SOF_TIMESTAMPING_TX_SOFTWARE,
SOF_TIMESTAMPING_TX_SCHED, and SOF_TIMESTAMPING_TX_ACK) in control
messages. Users need to set reporting flags (e.g.,
SOF_TIMESTAMPING_OPT_ID) per socket via socket options.

This commit adds a tsflags field in sockcm_cookie which is set in
__sock_cmsg_send. It only overrides the SOF_TIMESTAMPING_TX_* bits in
sockcm_cookie.tsflags, allowing the control message to override the
recording behavior per write, yet maintaining the value of other
flags.

This patch implements validating the control message and setting
tsflags in struct sockcm_cookie. Later commits in this series will
actually implement timestamping per write for different protocols.

Signed-off-by: Soheil Hassas Yeganeh
Acked-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/core/sock.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 0a64fe20ce5a..315f5e57fffe 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1870,6 +1870,8 @@ EXPORT_SYMBOL(sock_alloc_send_skb);
 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
 		     struct sockcm_cookie *sockc)
 {
+	u32 tsflags;
+
 	switch (cmsg->cmsg_type) {
 	case SO_MARK:
 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
@@ -1878,6 +1880,17 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
 			return -EINVAL;
 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
 		break;
+	case SO_TIMESTAMPING:
+		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+			return -EINVAL;
+
+		tsflags = *(u32 *)CMSG_DATA(cmsg);
+		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
+			return -EINVAL;
+
+		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
+		sockc->tsflags |= tsflags;
+		break;
 	default:
 		return -EINVAL;
 	}
-- cgit
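To illustrate the intended use of the new SO_TIMESTAMPING case, a userspace write could request TX timestamps for a single call roughly as follows. This is a sketch, not part of the patch; send_timestamped() is a hypothetical helper, and the per-socket reporting flags (e.g. SOF_TIMESTAMPING_SOFTWARE and SOF_TIMESTAMPING_OPT_ID) are assumed to have been enabled with setsockopt() beforehand, as the commit message requires.

/* Hypothetical userspace helper: request TX timestamps for this write
 * only. Only the SOF_TIMESTAMPING_TX_* recording flags are accepted in
 * the cmsg; reporting flags remain per-socket.
 */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/net_tstamp.h>

static ssize_t send_timestamped(int fd, const void *buf, size_t len)
{
	uint32_t tsflags = SOF_TIMESTAMPING_TX_SCHED |
			   SOF_TIMESTAMPING_TX_SOFTWARE |
			   SOF_TIMESTAMPING_TX_ACK;
	union {				/* force cmsghdr alignment */
		char buf[CMSG_SPACE(sizeof(uint32_t))];
		struct cmsghdr align;
	} u = { 0 };
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {		/* connected socket assumed */
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type  = SO_TIMESTAMPING;
	cm->cmsg_len   = CMSG_LEN(sizeof(tsflags));
	memcpy(CMSG_DATA(cm), &tsflags, sizeof(tsflags));

	return sendmsg(fd, &msg, 0);
}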
From a4298e4522d687a79af8f8fbb7eca68399ab2d81 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 1 Apr 2016 08:52:12 -0700
Subject: net: add SOCK_RCU_FREE socket flag

We want a generic way to insert an RCU grace period before socket
freeing for cases where SLAB_DESTROY_BY_RCU is adding too much
overhead.

SLAB_DESTROY_BY_RCU strict rules force us to take a reference on the
socket sk_refcnt, and it is a performance problem for UDP
encapsulation, or TCP synflood behavior, as many CPUs might attempt
the atomic operations on a shared sk_refcnt.

UDP sockets and TCP listeners can set SOCK_RCU_FREE so that their
lookup can use traditional RCU rules, without refcount changes.
They can set the flag only once hashed and visible to other CPUs.

Signed-off-by: Eric Dumazet
Cc: Tom Herbert
Tested-by: Tom Herbert
Signed-off-by: David S. Miller
---
 net/core/sock.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 315f5e57fffe..7a6a063b28b3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1419,8 +1419,12 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 }
 EXPORT_SYMBOL(sk_alloc);
 
-void sk_destruct(struct sock *sk)
+/* Sockets having SOCK_RCU_FREE will call this function after one RCU
+ * grace period. This is the case for UDP sockets and TCP listeners.
+ */
+static void __sk_destruct(struct rcu_head *head)
 {
+	struct sock *sk = container_of(head, struct sock, sk_rcu);
 	struct sk_filter *filter;
 
 	if (sk->sk_destruct)
@@ -1449,6 +1453,14 @@ void sk_destruct(struct sock *sk)
 	sk_prot_free(sk->sk_prot_creator, sk);
 }
 
+void sk_destruct(struct sock *sk)
+{
+	if (sock_flag(sk, SOCK_RCU_FREE))
+		call_rcu(&sk->sk_rcu, __sk_destruct);
+	else
+		__sk_destruct(&sk->sk_rcu);
+}
+
 static void __sk_free(struct sock *sk)
 {
 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
-- cgit

From 9caad864151e525929d323de96cad382da49c3b2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 1 Apr 2016 08:52:20 -0700
Subject: tcp: increment sk_drops for listeners

Goal: packets dropped by a listener are accounted for.

This adds a tcp_listendrop() helper, and clears sk_drops in
sk_clone_lock() so that children do not inherit their parent drop
count.

Note that we no longer increment the LINUX_MIB_LISTENDROPS counter
when sending a SYNCOOKIE, since the SYN packet generated a SYNACK.
We already have a separate LINUX_MIB_SYNCOOKIESSENT counter.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/sock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 7a6a063b28b3..2f517ea56786 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1525,6 +1525,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_dst_cache = NULL;
 		newsk->sk_wmem_queued = 0;
 		newsk->sk_forward_alloc = 0;
+		atomic_set(&newsk->sk_drops, 0);
 		newsk->sk_send_head = NULL;
 		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
 
-- cgit
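For context on the SOCK_RCU_FREE flag introduced above, the intended usage pattern is that a protocol sets the flag before publishing the socket into its hash table, after which lookups can stay refcount-free under rcu_read_lock(). The following is a kernel-side sketch only, with hypothetical function names; it is not code from these patches.

/* Sketch only: my_proto_publish()/my_proto_port_in_use() and "slot"
 * are hypothetical stand-ins for protocol hash-table code; only
 * sock_set_flag()/SOCK_RCU_FREE come from this series.
 */
static void my_proto_publish(struct sock *sk, struct hlist_head *slot)
{
	sock_set_flag(sk, SOCK_RCU_FREE);	/* freed via call_rcu() */
	sk_add_node_rcu(sk, slot);		/* now visible to readers */
}

static bool my_proto_port_in_use(struct hlist_head *slot, __be16 port)
{
	struct sock *sk;
	bool ret = false;

	rcu_read_lock();
	sk_for_each_rcu(sk, slot) {
		if (inet_sk(sk)->inet_num == ntohs(port)) {
			ret = true;	/* sk used without sock_hold() */
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}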
From e6afc8ace6dd5cef5e812f26c72579da8806f5ac Mon Sep 17 00:00:00 2001
From: samanthakumar
Date: Tue, 5 Apr 2016 12:41:15 -0400
Subject: udp: remove headers from UDP packets before queueing

Remove UDP transport headers before queueing packets for reception.
This change simplifies a follow-up patch to add MSG_PEEK support.

Signed-off-by: Sam Kumar
Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/core/sock.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 2f517ea56786..e12197b359fd 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -402,9 +402,8 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 	}
 }
 
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
-	int err;
 	unsigned long flags;
 	struct sk_buff_head *list = &sk->sk_receive_queue;
 
@@ -414,10 +413,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		return -ENOMEM;
 	}
 
-	err = sk_filter(sk, skb);
-	if (err)
-		return err;
-
 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 		atomic_inc(&sk->sk_drops);
 		return -ENOBUFS;
@@ -440,6 +435,18 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		sk->sk_data_ready(sk);
 	return 0;
 }
+EXPORT_SYMBOL(__sock_queue_rcv_skb);
+
+int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int err;
+
+	err = sk_filter(sk, skb);
+	if (err)
+		return err;
+
+	return __sock_queue_rcv_skb(sk, skb);
+}
 EXPORT_SYMBOL(sock_queue_rcv_skb);
 
 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
-- cgit

From 627d2d6b550094d88f9e518e15967e7bf906ebbf Mon Sep 17 00:00:00 2001
From: samanthakumar
Date: Tue, 5 Apr 2016 12:41:16 -0400
Subject: udp: enable MSG_PEEK at non-zero offset

Enable peeking at UDP datagrams at the offset specified with socket
option SOL_SOCKET/SO_PEEK_OFF. Peek at any datagram in the queue, up
to the end of the given datagram.

Implement the SO_PEEK_OFF semantics introduced in commit ef64a54f6e55
("sock: Introduce the SO_PEEK_OFF sock option"). Increase the offset
on peek, decrease it on regular reads.

When peeking, always checksum the packet immediately, to avoid
recomputation on subsequent peeks and final read.

The socket lock is not held for the duration of udp_recvmsg, so peek
and read operations can run concurrently. Only the last store to
sk_peek_off is preserved.

Signed-off-by: Sam Kumar
Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/core/sock.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index e12197b359fd..2ce76e82857f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2187,6 +2187,15 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
 
+int sk_set_peek_off(struct sock *sk, int val)
+{
+	if (val < 0)
+		return -EINVAL;
+
+	sk->sk_peek_off = val;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sk_set_peek_off);
 
 /*
  * Set of default routines for initialising struct proto_ops when
-- cgit
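A userspace view of the new semantics (a sketch, not from the patch): enabling SO_PEEK_OFF on a UDP socket makes successive MSG_PEEK reads walk forward through the receive queue instead of re-reading the same data, while a normal read consumes data and decreases the offset.

/* Userspace sketch: peek a UDP socket's queue at a kernel-maintained
 * offset, then consume. Assumes a kernel with this series applied
 * (previously SO_PEEK_OFF only took effect for AF_UNIX sockets).
 */
#include <stdio.h>
#include <sys/socket.h>

static void peek_then_read(int udp_fd)
{
	char buf[2048];
	int off = 0;		/* enable peeking with offset, start at 0 */
	ssize_t n;

	if (setsockopt(udp_fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off)))
		perror("SO_PEEK_OFF");

	/* Each peek advances the offset, so repeated peeks observe
	 * later data in the queue rather than the same bytes again.
	 */
	n = recv(udp_fd, buf, 512, MSG_PEEK);
	printf("peeked %zd bytes\n", n);
	n = recv(udp_fd, buf, 512, MSG_PEEK);
	printf("peeked %zd more bytes\n", n);

	/* A regular read consumes data and decreases the offset. */
	n = recv(udp_fd, buf, sizeof(buf), 0);
	printf("read %zd bytes\n", n);
}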
From 61881cfb5ad80c1d0a46ca6d08b7e271892b2ff6 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Tue, 5 Apr 2016 17:10:14 +0200
Subject: sock: fix lockdep annotation in release_sock

During release_sock we use callbacks to finish the processing of
outstanding skbs on the socket. We actually are still locked,
sk->sk_lock.owned == 1, but we already told lockdep that the mutex
is released.

This could lead to false positives in lockdep for
lockdep_sock_is_held (we don't hold the slock spinlock during
processing the outstanding skbs).

I took over this patch from Eric Dumazet and tested it.

Signed-off-by: Eric Dumazet
Signed-off-by: Hannes Frederic Sowa
Signed-off-by: David S. Miller
---
 net/core/sock.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 2ce76e82857f..152274d188ef 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2483,11 +2483,6 @@ EXPORT_SYMBOL(lock_sock_nested);
 
 void release_sock(struct sock *sk)
 {
-	/*
-	 * The sk_lock has mutex_unlock() semantics:
-	 */
-	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
-
 	spin_lock_bh(&sk->sk_lock.slock);
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
-- cgit

From 5413d1babe8f10de13d72496c12b862eef8ba613 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 29 Apr 2016 14:16:52 -0700
Subject: net: do not block BH while processing socket backlog

Socket backlog processing is a major latency source.

With current TCP socket sk_rcvbuf limits, I have sampled
__release_sock() holding the cpu for more than 5 ms, and packets
being dropped by the NIC once the ring buffer is filled.

All users are now ready to be called from process context; we can
unblock BH and let interrupts be serviced faster.

cond_resched_softirq() could be removed, as it has no more users.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 net/core/sock.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index e16a5db853c6..70744dbb6c3f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2019,33 +2019,27 @@ static void __release_sock(struct sock *sk)
 	__releases(&sk->sk_lock.slock)
 	__acquires(&sk->sk_lock.slock)
 {
-	struct sk_buff *skb = sk->sk_backlog.head;
+	struct sk_buff *skb, *next;
 
-	do {
+	while ((skb = sk->sk_backlog.head) != NULL) {
 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
-		bh_unlock_sock(sk);
 
-		do {
-			struct sk_buff *next = skb->next;
+		spin_unlock_bh(&sk->sk_lock.slock);
 
+		do {
+			next = skb->next;
 			prefetch(next);
 			WARN_ON_ONCE(skb_dst_is_noref(skb));
 			skb->next = NULL;
 			sk_backlog_rcv(sk, skb);
 
-			/*
-			 * We are in process context here with softirqs
-			 * disabled, use cond_resched_softirq() to preempt.
-			 * This is safe to do because we've taken the backlog
-			 * queue private:
-			 */
-			cond_resched_softirq();
+			cond_resched();
 
 			skb = next;
 		} while (skb != NULL);
 
-		bh_lock_sock(sk);
-	} while ((skb = sk->sk_backlog.head) != NULL);
+		spin_lock_bh(&sk->sk_lock.slock);
+	}
 
 	/*
 	 * Doing the zeroing here guarantee we can not loop forever
-- cgit
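For context, the backlog drained above is filled from the softirq receive path while a process owns the socket. The pattern is roughly the following kernel-side sketch, modeled loosely on tcp_v4_rcv(); it is simplified, not part of these patches, and my_proto_rcv_one() is a hypothetical name.

/* Sketch: how a protocol's softirq path feeds the backlog that
 * __release_sock() later drains in process context.
 */
static int my_proto_rcv_one(struct sock *sk, struct sk_buff *skb)
{
	int ret = 0;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		ret = sk_backlog_rcv(sk, skb);		/* process now */
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);		/* backlog full */
		kfree_skb(skb);
		return -ENOBUFS;
	}
	bh_unlock_sock(sk);
	return ret;
}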
From d41a69f1d390fa3f2546498103cdcd78b30676ff Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 29 Apr 2016 14:16:53 -0700
Subject: tcp: make tcp_sendmsg() aware of socket backlog

Large sendmsg()/write() calls hold the socket lock for the duration
of the call, unless the sk->sk_sndbuf limit is hit. This is bad
because incoming packets are parked into the socket backlog for a
long time. Critical decisions like fast retransmit might be delayed.
Receivers have to maintain a big out-of-order queue, with additional
cpu overhead, and also possible stalls in TX once windows are full.

Bidirectional flows are particularly hurt since the backlog can
become quite big if the copy from user space triggers IO (page
faults).

Some applications learnt to use sendmsg() (or sendmmsg()) with small
chunks to avoid this issue.

The kernel should know better, right?

Add a generic sk_flush_backlog() helper and use it right before a new
skb is allocated. Typically we put 64KB of payload per skb (unless
MSG_EOR is requested) and checking the socket backlog every 64KB
gives good results.

As a matter of fact, tests with TSO/GSO disabled give very nice
results, as we manage to keep a small write queue and smaller
perceived rtt.

Note that sk_flush_backlog() maintains socket ownership, so it is not
equivalent to a {release_sock(sk); lock_sock(sk);} pair, in order to
preserve the implicit atomicity rules that sendmsg() was giving to
(possibly buggy) applications.

In this simple implementation, I chose to not call tcp_release_cb(),
but we might consider this later.

Signed-off-by: Eric Dumazet
Cc: Alexei Starovoitov
Cc: Marcelo Ricardo Leitner
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: David S. Miller
---
 net/core/sock.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 70744dbb6c3f..f615e9391170 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2048,6 +2048,13 @@ static void __release_sock(struct sock *sk)
 	sk->sk_backlog.len = 0;
 }
 
+void __sk_flush_backlog(struct sock *sk)
+{
+	spin_lock_bh(&sk->sk_lock.slock);
+	__release_sock(sk);
+	spin_unlock_bh(&sk->sk_lock.slock);
+}
+
 /**
  * sk_wait_data - wait for data to arrive at sk_receive_queue
  * @sk: sock to wait on
-- cgit

From 1d2077ac0165c0d173a2255e37cf4dc5033d92c7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 2 May 2016 10:56:27 -0700
Subject: net: add __sock_wfree() helper

Hosts sending a lot of ACK packets exhibit high sock_wfree() cost
because of a cache line miss to test SOCK_USE_WRITE_QUEUE.

We could move this flag close to sk_wmem_alloc, but it is better to
perform the atomic_sub_and_test() on a clean cache line, as it avoids
one extra bus transaction.

skb_orphan_partial() can also have a fast track for packets that
either are TCP acks, or already went through another
skb_orphan_partial().

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/sock.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index f615e9391170..08bf97eceeb3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1655,6 +1655,17 @@ void sock_wfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_wfree);
 
+/* This variant of sock_wfree() is used by TCP,
+ * since it sets SOCK_USE_WRITE_QUEUE.
+ */
+void __sock_wfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
+		__sk_free(sk);
+}
+
 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 {
 	skb_orphan(skb);
@@ -1677,8 +1688,21 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 }
 EXPORT_SYMBOL(skb_set_owner_w);
 
+/* This helper is used by netem, as it can hold packets in its
+ * delay queue. We want to allow the owner socket to send more
+ * packets, as if they were already TX completed by a typical driver.
+ * But we also want to keep skb->sk set because some packet schedulers
+ * rely on it (sch_fq for example). So we set skb->truesize to a small
+ * amount (1) and decrease sk_wmem_alloc accordingly.
+ */
 void skb_orphan_partial(struct sk_buff *skb)
 {
+	/* If this skb is a TCP pure ACK or already went here,
+	 * we have nothing to do. 2 is already a very small truesize.
+	 */
+	if (skb->truesize <= 2)
+		return;
+
 	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
 	 * so we do not completely orphan skb, but transfert all
 	 * accounted bytes but one, to avoid unexpected reorders.
-- cgit
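The tcp_sendmsg() change described above relies on the __sk_flush_backlog() helper added in this log. A hedged sketch of the intended caller pattern follows; the real hunk lives in tcp_sendmsg(), outside this net/core/sock.c-limited log, its exact control flow may differ, and queue_one_chunk() is a hypothetical placeholder.

/* Sketch of a sendmsg-style loop that drains the backlog before each
 * new skb allocation, so peer ACK processing is not postponed for the
 * whole (possibly huge) call.
 */
static int copy_in_chunks(struct sock *sk, struct msghdr *msg)
{
	int copied = 0;

	while (msg_data_left(msg)) {
		struct sk_buff *skb;

		/* Drain whatever the softirq side parked on the backlog
		 * while we owned the socket; checking roughly every 64KB
		 * skb keeps the cost low.
		 */
		if (unlikely(sk->sk_backlog.tail))
			__sk_flush_backlog(sk);

		skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
		if (!skb)
			return copied ? copied : -ENOMEM;

		copied += queue_one_chunk(sk, skb, msg);	/* placeholder */
	}
	return copied;
}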