From 27da6d37e207e7ad9c692d3d0924f09e63a98e38 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Fri, 1 Dec 2017 12:52:32 -0800
Subject: tcp: Enable 2nd listener hashtable in TCP

Enable the second listener hashtable in TCP.  The scale is the same
as UDP, which is one slot per 2MB.

Signed-off-by: Martin KaFai Lau
Reviewed-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bf97317e6c97..180311636023 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3577,6 +3577,9 @@ void __init tcp_init(void)
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
 	inet_hashinfo_init(&tcp_hashinfo);
+	inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+			    thash_entries, 21,  /* one slot per 2 MB*/
+			    0, 64 * 1024);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
-- 
cgit

From 563e0bb0dc74b3ca888e24f8c08f0239fe4016b0 Mon Sep 17 00:00:00 2001
From: Yafang Shao
Date: Wed, 20 Dec 2017 11:12:51 +0800
Subject: net: tracepoint: replace tcp_set_state tracepoint with
 inet_sock_set_state tracepoint

sk_state is a common field of struct sock, so the state transition
tracepoint should not be a TCP-specific feature.  It currently traces
all AF_INET state transitions, so rename the tracepoint to
inet_sock_set_state, with some minor changes, and move it into
trace/events/sock.h.  We don't need to create a file named
trace/events/inet_sock.h for this one single tracepoint.

Two helpers are introduced to trace sk_state transitions:
  - void inet_sk_state_store(struct sock *sk, int newstate);
  - void inet_sk_set_state(struct sock *sk, int state);

As trace headers should not be included in other header files, these
helpers are defined in sock.c.  Protocols such as SCTP may be compiled
as modules, hence inet_sk_set_state() is exported.

Signed-off-by: Yafang Shao
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c470fec9062f..d408fb41c804 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,8 +283,6 @@
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
 
-#include <trace/events/tcp.h>
-
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
@@ -2040,8 +2038,6 @@ void tcp_set_state(struct sock *sk, int state)
 {
 	int oldstate = sk->sk_state;
 
-	trace_tcp_set_state(sk, oldstate, state);
-
 	switch (state) {
 	case TCP_ESTABLISHED:
 		if (oldstate != TCP_ESTABLISHED)
@@ -2065,7 +2061,7 @@ void tcp_set_state(struct sock *sk, int state)
 	/* Change state AFTER socket is unhashed to avoid closed
 	 * socket sitting in hash tables.
 	 */
-	sk_state_store(sk, state);
+	inet_sk_state_store(sk, state);
 
 #ifdef STATE_TRACE
 	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk,
-- 
cgit

From 986ffdfd08dbaae721e82720e6bfc2c307e732dd Mon Sep 17 00:00:00 2001
From: Yafang Shao
Date: Wed, 20 Dec 2017 11:12:52 +0800
Subject: net: sock: replace sk_state_load with inet_sk_state_load and remove
 sk_state_store

sk_state_load is only used by AF_INET/AF_INET6, so rename it to
inet_sk_state_load and move it into inet_sock.h.

sk_state_store is removed, as it is not used any more.

Signed-off-by: Yafang Shao
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d408fb41c804..67d39b79c801 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -502,7 +502,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 	sock_poll_wait(file, sk_sleep(sk), wait);
 
-	state = sk_state_load(sk);
+	state = inet_sk_state_load(sk);
 	if (state == TCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
@@ -2916,7 +2916,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	if (sk->sk_type != SOCK_STREAM)
 		return;
 
-	info->tcpi_state = sk_state_load(sk);
+	info->tcpi_state = inet_sk_state_load(sk);
 
 	/* Report meaningful fields for all TCP states, including listeners */
 	rate = READ_ONCE(sk->sk_pacing_rate);
-- 
cgit
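For reference alongside the two tracepoint commits above: the helpers they name
do not appear in this log because it is limited to net/ipv4/tcp.c (the real
definitions land in net/core/sock.c and include/net/inet_sock.h).  The
following is a minimal sketch inferred from the commit messages, not the
authoritative code:

/* Sketch only -- real definitions live outside net/ipv4/tcp.c.
 * inet_sk_set_state() does a plain store for callers that hold the
 * socket lock; inet_sk_state_store() pairs a release store with the
 * acquire load below for lockless readers.
 */
void inet_sk_set_state(struct sock *sk, int state)
{
	trace_inet_sock_set_state(sk, sk->sk_state, state);
	sk->sk_state = state;
}
EXPORT_SYMBOL(inet_sk_set_state);	/* SCTP may be built as a module */

void inet_sk_state_store(struct sock *sk, int newstate)
{
	trace_inet_sock_set_state(sk, sk->sk_state, newstate);
	smp_store_release(&sk->sk_state, newstate);
}

/* Reader side, paired with the release store above. */
static inline int inet_sk_state_load(const struct sock *sk)
{
	return smp_load_acquire(&sk->sk_state);
}

The store/load pairing matters because tcp_poll() and tcp_get_info(), as seen
in the third diff above, read sk_state without the socket lock.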
From 111856c758d9a06145da446e0db8f71988eebf02 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Fri, 22 Dec 2017 19:00:18 -0500
Subject: tcp: push full zerocopy packets

Skbs that reach MAX_SKB_FRAGS cannot be extended further.  Do the same
for zerocopy frags as for non-zerocopy frags and set the PSH bit.  This
improves GRO assembly.

Suggested-by: Eric Dumazet
Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 67d39b79c801..44102484a76f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1371,8 +1371,10 @@ new_segment:
 			pfrag->offset += copy;
 		} else {
 			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
-			if (err == -EMSGSIZE || err == -EEXIST)
+			if (err == -EMSGSIZE || err == -EEXIST) {
+				tcp_mark_push(tp, skb);
 				goto new_segment;
+			}
 			if (err < 0)
 				goto do_error;
 			copy = err;
-- 
cgit

From 02583adeff12fa0a4d72558853a926867afb226c Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Fri, 22 Dec 2017 19:00:19 -0500
Subject: tcp: place all zerocopy payload in frags

This avoids an unnecessary copy of 1-2KB and improves tso_fragment,
which has to fall back to tcp_fragment if skb->len != skb->data_len.

It also avoids a surprising inconsistency in notifications: zerocopy
packets sent over loopback have their frags copied, so
SO_EE_CODE_ZEROCOPY_COPIED is set in the notification.  But this
currently does not happen for small packets, because when all data
fits in the linear fragment, data is not copied in
skb_orphan_frags_rx.

Reported-by: Tom Deseyn
Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 44102484a76f..947348872c3e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1186,7 +1186,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 	int flags, err, copied = 0;
 	int mss_now = 0, size_goal, copied_syn = 0;
 	bool process_backlog = false;
-	bool sg;
+	bool sg, zc = false;
 	long timeo;
 
 	flags = msg->msg_flags;
@@ -1204,7 +1204,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			goto out_err;
 		}
 
-		if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
+		zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG;
+		if (!zc)
 			uarg->zerocopy = 0;
 	}
 
@@ -1325,13 +1326,13 @@ new_segment:
 			copy = msg_data_left(msg);
 
 		/* Where to copy to? */
-		if (skb_availroom(skb) > 0) {
+		if (skb_availroom(skb) > 0 && !zc) {
 			/* We have some space in skb head. Superb! */
 			copy = min_t(int, copy, skb_availroom(skb));
 			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
 			if (err)
 				goto do_fault;
-		} else if (!uarg || !uarg->zerocopy) {
+		} else if (!zc) {
 			bool merge = true;
 			int i = skb_shinfo(skb)->nr_frags;
 			struct page_frag *pfrag = sk_page_frag(sk);
-- 
cgit
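Both zerocopy patches above change behavior that userspace observes through
MSG_ZEROCOPY completion notifications.  The sketch below is illustrative only
(standard names from linux/errqueue.h; error handling and cmsg-level checks
are elided): it reads one completion off the error queue and tests the
SO_EE_CODE_ZEROCOPY_COPIED fallback flag mentioned in the second commit
message.

#include <linux/errqueue.h>
#include <sys/socket.h>

/* Read one MSG_ZEROCOPY completion and report whether the kernel fell
 * back to copying the payload (e.g. over loopback, as noted above).
 * Returns 1 if copied, 0 if true zerocopy, -1 on error.
 */
static int zerocopy_completed_with_copy(int fd)
{
	char control[128];
	struct msghdr msg = { .msg_control = control,
			      .msg_controllen = sizeof(control) };
	struct cmsghdr *cm;
	struct sock_extended_err *serr;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) == -1)
		return -1;

	cm = CMSG_FIRSTHDR(&msg);
	if (!cm)
		return -1;

	serr = (struct sock_extended_err *)CMSG_DATA(cm);
	if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
		return -1;

	/* serr->ee_info..serr->ee_data is the completed send range */
	return !!(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED);
}

After the second patch, small sends also land in frags, so the COPIED flag is
reported consistently regardless of payload size.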
From 8ddab50839e29e965460b2cf794fd2b06a946893 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Fri, 22 Dec 2017 19:00:20 -0500
Subject: tcp: do not allocate linear memory for zerocopy skbs

Zerocopy payload is now always stored in frags, and space for headers
is reserved, so this memory is unused.

Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 947348872c3e..7ac583a2b9fe 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1104,12 +1104,15 @@ static int linear_payload_sz(bool first_skb)
 	return 0;
 }
 
-static int select_size(const struct sock *sk, bool sg, bool first_skb)
+static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int tmp = tp->mss_cache;
 
 	if (sg) {
+		if (zc)
+			return 0;
+
 		if (sk_can_gso(sk)) {
 			tmp = linear_payload_sz(first_skb);
 		} else {
@@ -1282,6 +1285,7 @@ restart:
 
 		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
 			bool first_skb;
+			int linear;
 
 new_segment:
 			/* Allocate new segment. If the interface is SG,
@@ -1295,9 +1299,8 @@ new_segment:
 				goto restart;
 			}
 			first_skb = tcp_rtx_and_write_queues_empty(sk);
-			skb = sk_stream_alloc_skb(sk,
-						  select_size(sk, sg, first_skb),
-						  sk->sk_allocation,
+			linear = select_size(sk, sg, first_skb, zc);
+			skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
 						  first_skb);
 			if (!skb)
 				goto wait_for_memory;
-- 
cgit

From 0a38806f31729c8931383d2ce944115312855931 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh
Date: Wed, 3 Jan 2018 21:47:11 -0500
Subject: net: revert "Update RFS target at poll for tcp/udp"

On multi-threaded processes, one common architecture is to have one
(or a small number of) threads polling sockets, and a considerably
larger pool of threads reading from and writing to the sockets.  When
we set the RPS core in tcp_poll() or udp_poll(), we essentially steer
all packets of all the polled FDs to one (or a small number of) cores,
creating a bottleneck and/or RPS misprediction.

Another common architecture is to shard FDs among threads pinned to
cores.  In such a setting, setting the RPS core in tcp_poll() and
udp_poll() is redundant, because the RFS core is correctly set in
recvmsg and sendmsg.

Thus, revert the following commit:
c3f1dbaf6e28 ("net: Update RFS target at poll for tcp/udp").

Signed-off-by: Soheil Hassas Yeganeh
Signed-off-by: Willem de Bruijn
Signed-off-by: Eric Dumazet
Signed-off-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7ac583a2b9fe..f68cb33d50d1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -498,8 +498,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int state;
 
-	sock_rps_record_flow(sk);
-
 	sock_poll_wait(file, sk_sleep(sk), wait);
 
 	state = inet_sk_state_load(sk);
-- 
cgit
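For context on the revert above, here is a simplified sketch of the
flow-recording helper that remains in the send/receive path (based on
include/net/sock.h of that era; the static-key fast-path guard of the real
kernel is omitted):

/* Simplified sketch.  inet_sendmsg()/inet_recvmsg() call this, so the
 * flow-to-CPU mapping is still refreshed by the threads that actually
 * touch the data; removing the call from tcp_poll() merely stops
 * poller threads from overriding that steering decision.
 */
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
	/* Record "flow sk->sk_rxhash last ran on this CPU" in the
	 * global rps_sock_flow_table that RFS consults on receive.
	 */
	sock_rps_record_flow_hash(sk->sk_rxhash);
#endif
}

In the sharded-FD design the commit message describes, this means RX packets
follow the worker thread's CPU rather than the poller's.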
From 809a79e913eed4ca02bfe5f78d3b7a56c7d8d7a6 Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Wed, 10 Jan 2018 07:43:15 +0000
Subject: tcp: make local function tcp_recv_timestamp static

Fixes the following sparse warning:

net/ipv4/tcp.c:1736:6: warning: symbol 'tcp_recv_timestamp' was not
declared. Should it be static?

Signed-off-by: Wei Yongjun
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f68cb33d50d1..d7cf861bf699 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1733,8 +1733,8 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb,
 }
 
 /* Similar to __sock_recv_timestamp, but does not require an skb */
-void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
-			struct scm_timestamping *tss)
+static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+			       struct scm_timestamping *tss)
 {
 	struct timeval tv;
 	bool has_timestamping = false;
-- 
cgit

From de525be2ca2734865d29c4b67ddd29913b214906 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo
Date: Thu, 25 Jan 2018 16:14:09 -0800
Subject: bpf: Support passing args to sock_ops bpf function

Adds support for passing up to 4 arguments to sock_ops BPF functions.
It reuses the reply union, so the bpf_sock_ops structures are not
increased in size.

Signed-off-by: Lawrence Brakmo
Signed-off-by: Alexei Starovoitov
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d7cf861bf699..88b62441e7e9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -463,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
 	tcp_mtup_init(sk);
 	icsk->icsk_af_ops->rebuild_header(sk);
 	tcp_init_metrics(sk);
-	tcp_call_bpf(sk, bpf_op);
+	tcp_call_bpf(sk, bpf_op, 0, NULL);
 	tcp_init_congestion_control(sk);
 	tcp_init_buffer_space(sk);
 }
-- 
cgit
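The reworked tcp_call_bpf() above takes an argument count and a u32 array.
A sketch of the kind of convenience wrapper this enables, consistent with the
tcp_call_bpf_2arg() name used by the next commit (the exact wrappers live in
include/net/tcp.h and may differ in detail):

/* Sketch of the argument-passing convention: the caller builds a small
 * u32 array on the stack, and the sock_ops BPF program reads the values
 * back through bpf_sock_ops->args[].
 */
static inline int tcp_call_bpf_2arg(struct sock *sk, int op,
				    u32 arg1, u32 arg2)
{
	u32 args[2] = {arg1, arg2};

	return tcp_call_bpf(sk, op, 2, args);
}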
From d44874910a26f3a8f81edf873a2473363f07f660 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo
Date: Thu, 25 Jan 2018 16:14:15 -0800
Subject: bpf: Add BPF_SOCK_OPS_STATE_CB

Adds support for calling sock_ops BPF programs when there is a TCP
state change.  Two arguments are used: one for the old state and
another for the new state.

There is a new enum in include/uapi/linux/bpf.h that exports the TCP
states, prepending BPF_ to the current TCP state names.  If it is ever
necessary to change the internal TCP state values (other than adding
more to the end), then it will become necessary to convert from the
internal TCP state value to the BPF value before calling the BPF
sock_ops function.  A set of compile-time checks is added in tcp.c to
detect if the internal and BPF values ever differ, so we can make the
necessary fixes.

New op: BPF_SOCK_OPS_STATE_CB.

Signed-off-by: Lawrence Brakmo
Signed-off-by: Alexei Starovoitov
---
 net/ipv4/tcp.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 88b62441e7e9..f013ddc191e0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2042,6 +2042,30 @@ void tcp_set_state(struct sock *sk, int state)
 {
 	int oldstate = sk->sk_state;
 
+	/* We defined a new enum for TCP states that are exported in BPF
+	 * so as not force the internal TCP states to be frozen. The
+	 * following checks will detect if an internal state value ever
+	 * differs from the BPF value. If this ever happens, then we will
+	 * need to remap the internal value to the BPF value before calling
+	 * tcp_call_bpf_2arg.
+	 */
+	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
+	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
+	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
+	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
+
 	switch (state) {
 	case TCP_ESTABLISHED:
 		if (oldstate != TCP_ESTABLISHED)
-- 
cgit

From 9b42d55a66d388e4dd5550107df051a9637564fc Mon Sep 17 00:00:00 2001
From: Li RongQing
Date: Fri, 26 Jan 2018 16:40:41 +0800
Subject: tcp: release sk_frag.page in tcp_disconnect

A socket can be disconnected and transformed back into a listening
socket.  If sk_frag.page is not released, it will be cloned into a new
socket by sk_clone_lock, but the reference count of this page is not
increased, leading to a use-after-free or double-free issue.

Signed-off-by: Li RongQing
Cc: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c8ed3a04b504..874c9317b8df 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2458,6 +2458,12 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
 
+	if (sk->sk_frag.page) {
+		put_page(sk->sk_frag.page);
+		sk->sk_frag.page = NULL;
+		sk->sk_frag.offset = 0;
+	}
+
 	sk->sk_error_report(sk);
 	return err;
 }
-- 
cgit
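A hypothetical sock_ops program consuming the new BPF_SOCK_OPS_STATE_CB op
could look like the sketch below.  This is a libbpf-style illustration, not
code from the patch series: the program name and the choice to opt in at
connection setup are assumptions, while the op, flag, helper, and args[]
fields come from the commits above and their companions.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int log_state_changes(struct bpf_sock_ops *skops)
{
	/* State callbacks are off by default; opt in per connection. */
	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
	    skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
		bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG);

	if (skops->op == BPF_SOCK_OPS_STATE_CB) {
		__u32 old_state = skops->args[0];	/* BPF_TCP_* values */
		__u32 new_state = skops->args[1];

		bpf_printk("tcp state %d -> %d\n", old_state, new_state);
	}
	return 1;
}

char _license[] SEC("license") = "GPL";

Finally, the tcp_disconnect() fix in the last commit is reachable from
userspace via the AF_UNSPEC connect trick.  An illustrative sequence of the
disconnect-to-listener transformation described in that commit message:

#include <string.h>
#include <sys/socket.h>

/* Illustrative only: an established socket is disconnected with
 * AF_UNSPEC and turned back into a listener, after which each accepted
 * child is cloned from it by sk_clone_lock().
 */
static void disconnect_and_listen(int fd, const char *buf, size_t len)
{
	struct sockaddr sa;

	send(fd, buf, len, 0);		/* may leave a page in sk_frag */

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;
	connect(fd, &sa, sizeof(sa));	/* ends up in tcp_disconnect() */

	listen(fd, 16);			/* socket is a listener again */
}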