From 27da6d37e207e7ad9c692d3d0924f09e63a98e38 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Fri, 1 Dec 2017 12:52:32 -0800
Subject: tcp: Enable 2nd listener hashtable in TCP

Enable the second listener hashtable in TCP.  The scale is the same
as UDP, which is one slot per 2MB.

Signed-off-by: Martin KaFai Lau
Reviewed-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bf97317e6c97..180311636023 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3577,6 +3577,9 @@ void __init tcp_init(void)
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
 	inet_hashinfo_init(&tcp_hashinfo);
+	inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+			    thash_entries, 21,  /* one slot per 2 MB*/
+			    0, 64 * 1024);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
-- 
cgit

From 563e0bb0dc74b3ca888e24f8c08f0239fe4016b0 Mon Sep 17 00:00:00 2001
From: Yafang Shao
Date: Wed, 20 Dec 2017 11:12:51 +0800
Subject: net: tracepoint: replace tcp_set_state tracepoint with
 inet_sock_set_state tracepoint

sk_state is a common field of struct sock, so the state transition
tracepoint should not be a TCP-specific feature.  It currently traces
all AF_INET state transitions, so rename the tracepoint to
inet_sock_set_state, with some minor changes, and move it into
trace/events/sock.h.  We don't need to create a file named
trace/events/inet_sock.h for this one single tracepoint.

Two helpers are introduced to trace sk_state transitions:
  - void inet_sk_state_store(struct sock *sk, int newstate);
  - void inet_sk_set_state(struct sock *sk, int state);

As trace headers should not be included in other header files, these
helpers are defined in sock.c.  Protocols such as SCTP may be compiled
as modules, hence inet_sk_set_state() is exported.

Signed-off-by: Yafang Shao
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c470fec9062f..d408fb41c804 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,8 +283,6 @@
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
 
-#include <trace/events/tcp.h>
-
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
@@ -2040,8 +2038,6 @@ void tcp_set_state(struct sock *sk, int state)
 {
 	int oldstate = sk->sk_state;
 
-	trace_tcp_set_state(sk, oldstate, state);
-
 	switch (state) {
 	case TCP_ESTABLISHED:
 		if (oldstate != TCP_ESTABLISHED)
@@ -2065,7 +2061,7 @@ void tcp_set_state(struct sock *sk, int state)
 	/* Change state AFTER socket is unhashed to avoid closed
 	 * socket sitting in hash tables.
 	 */
-	sk_state_store(sk, state);
+	inet_sk_state_store(sk, state);
 
 #ifdef STATE_TRACE
 	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk,
-- 
cgit

From 986ffdfd08dbaae721e82720e6bfc2c307e732dd Mon Sep 17 00:00:00 2001
From: Yafang Shao
Date: Wed, 20 Dec 2017 11:12:52 +0800
Subject: net: sock: replace sk_state_load with inet_sk_state_load and remove
 sk_state_store

sk_state_load is only used by AF_INET/AF_INET6, so rename it to
inet_sk_state_load and move it into inet_sock.h.

sk_state_store is removed, as it is not used any more.

Signed-off-by: Yafang Shao
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d408fb41c804..67d39b79c801 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -502,7 +502,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 	sock_poll_wait(file, sk_sleep(sk), wait);
 
-	state = sk_state_load(sk);
+	state = inet_sk_state_load(sk);
 	if (state == TCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
@@ -2916,7 +2916,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	if (sk->sk_type != SOCK_STREAM)
 		return;
 
-	info->tcpi_state = sk_state_load(sk);
+	info->tcpi_state = inet_sk_state_load(sk);
 
 	/* Report meaningful fields for all TCP states, including listeners */
 	rate = READ_ONCE(sk->sk_pacing_rate);
-- 
cgit
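For reference alongside the two tracepoint commits above: the helpers they name
do not appear in this log because it is limited to net/ipv4/tcp.c (the real
definitions land in net/core/sock.c and include/net/inet_sock.h).  The
following is a minimal sketch inferred from the commit messages, not the
authoritative code:

/* Sketch only -- real definitions live outside net/ipv4/tcp.c.
 * inet_sk_set_state() does a plain store for callers that hold the
 * socket lock; inet_sk_state_store() pairs a release store with the
 * acquire load below for lockless readers.
 */
void inet_sk_set_state(struct sock *sk, int state)
{
	trace_inet_sock_set_state(sk, sk->sk_state, state);
	sk->sk_state = state;
}
EXPORT_SYMBOL(inet_sk_set_state);	/* SCTP may be built as a module */

void inet_sk_state_store(struct sock *sk, int newstate)
{
	trace_inet_sock_set_state(sk, sk->sk_state, newstate);
	smp_store_release(&sk->sk_state, newstate);
}

/* Reader side, paired with the release store above. */
static inline int inet_sk_state_load(const struct sock *sk)
{
	return smp_load_acquire(&sk->sk_state);
}

The store/load pairing matters because tcp_poll() and tcp_get_info(), as seen
in the third diff above, read sk_state without the socket lock.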
From 111856c758d9a06145da446e0db8f71988eebf02 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Fri, 22 Dec 2017 19:00:18 -0500
Subject: tcp: push full zerocopy packets

Skbs that reach MAX_SKB_FRAGS cannot be extended further.  Do the same
for zerocopy frags as for non-zerocopy frags and set the PSH bit.  This
improves GRO assembly.

Suggested-by: Eric Dumazet
Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 67d39b79c801..44102484a76f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1371,8 +1371,10 @@ new_segment:
 			pfrag->offset += copy;
 		} else {
 			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
-			if (err == -EMSGSIZE || err == -EEXIST)
+			if (err == -EMSGSIZE || err == -EEXIST) {
+				tcp_mark_push(tp, skb);
 				goto new_segment;
+			}
 			if (err < 0)
 				goto do_error;
 			copy = err;
-- 
cgit

From 02583adeff12fa0a4d72558853a926867afb226c Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Fri, 22 Dec 2017 19:00:19 -0500
Subject: tcp: place all zerocopy payload in frags

This avoids an unnecessary copy of 1-2KB and improves tso_fragment,
which has to fall back to tcp_fragment if skb->len != skb->data_len.

It also avoids a surprising inconsistency in notifications: zerocopy
packets sent over loopback have their frags copied, so
SO_EE_CODE_ZEROCOPY_COPIED is set in the notification.  But this
currently does not happen for small packets, because when all data
fits in the linear fragment, data is not copied in
skb_orphan_frags_rx.

Reported-by: Tom Deseyn
Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 44102484a76f..947348872c3e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1186,7 +1186,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 	int flags, err, copied = 0;
 	int mss_now = 0, size_goal, copied_syn = 0;
 	bool process_backlog = false;
-	bool sg;
+	bool sg, zc = false;
 	long timeo;
 
 	flags = msg->msg_flags;
@@ -1204,7 +1204,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			goto out_err;
 		}
 
-		if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
+		zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG;
+		if (!zc)
 			uarg->zerocopy = 0;
 	}
 
@@ -1325,13 +1326,13 @@ new_segment:
 			copy = msg_data_left(msg);
 
 		/* Where to copy to? */
-		if (skb_availroom(skb) > 0) {
+		if (skb_availroom(skb) > 0 && !zc) {
 			/* We have some space in skb head. Superb! */
 			copy = min_t(int, copy, skb_availroom(skb));
 			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
 			if (err)
 				goto do_fault;
-		} else if (!uarg || !uarg->zerocopy) {
+		} else if (!zc) {
 			bool merge = true;
 			int i = skb_shinfo(skb)->nr_frags;
 			struct page_frag *pfrag = sk_page_frag(sk);
-- 
cgit
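Both zerocopy patches above change behavior that userspace observes through
MSG_ZEROCOPY completion notifications.  The sketch below is illustrative only
(standard names from linux/errqueue.h; error handling and cmsg-level checks
are elided): it reads one completion off the error queue and tests the
SO_EE_CODE_ZEROCOPY_COPIED fallback flag mentioned in the second commit
message.

#include <linux/errqueue.h>
#include <sys/socket.h>

/* Read one MSG_ZEROCOPY completion and report whether the kernel fell
 * back to copying the payload (e.g. over loopback, as noted above).
 * Returns 1 if copied, 0 if true zerocopy, -1 on error.
 */
static int zerocopy_completed_with_copy(int fd)
{
	char control[128];
	struct msghdr msg = { .msg_control = control,
			      .msg_controllen = sizeof(control) };
	struct cmsghdr *cm;
	struct sock_extended_err *serr;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) == -1)
		return -1;

	cm = CMSG_FIRSTHDR(&msg);
	if (!cm)
		return -1;

	serr = (struct sock_extended_err *)CMSG_DATA(cm);
	if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
		return -1;

	/* serr->ee_info..serr->ee_data is the completed send range */
	return !!(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED);
}

After the second patch, small sends also land in frags, so the COPIED flag is
reported consistently regardless of payload size.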
From 8ddab50839e29e965460b2cf794fd2b06a946893 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Fri, 22 Dec 2017 19:00:20 -0500
Subject: tcp: do not allocate linear memory for zerocopy skbs

Zerocopy payload is now always stored in frags, and space for headers
is reserved, so this memory is unused.

Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 947348872c3e..7ac583a2b9fe 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1104,12 +1104,15 @@ static int linear_payload_sz(bool first_skb)
 	return 0;
 }
 
-static int select_size(const struct sock *sk, bool sg, bool first_skb)
+static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int tmp = tp->mss_cache;
 
 	if (sg) {
+		if (zc)
+			return 0;
+
 		if (sk_can_gso(sk)) {
 			tmp = linear_payload_sz(first_skb);
 		} else {
@@ -1282,6 +1285,7 @@ restart:
 
 		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
 			bool first_skb;
+			int linear;
 
 new_segment:
 			/* Allocate new segment. If the interface is SG,
@@ -1295,9 +1299,8 @@ new_segment:
 				goto restart;
 			}
 			first_skb = tcp_rtx_and_write_queues_empty(sk);
-			skb = sk_stream_alloc_skb(sk,
-						  select_size(sk, sg, first_skb),
-						  sk->sk_allocation,
+			linear = select_size(sk, sg, first_skb, zc);
+			skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
 						  first_skb);
 			if (!skb)
 				goto wait_for_memory;
-- 
cgit

From 0a38806f31729c8931383d2ce944115312855931 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh
Date: Wed, 3 Jan 2018 21:47:11 -0500
Subject: net: revert "Update RFS target at poll for tcp/udp"

On multi-threaded processes, one common architecture is to have one
(or a small number of) threads polling sockets, and a considerably
larger pool of threads reading from and writing to the sockets.  When
we set the RPS core in tcp_poll() or udp_poll(), we essentially steer
all packets of all the polled FDs to one (or a small number of) cores,
creating a bottleneck and/or RPS misprediction.

Another common architecture is to shard FDs among threads pinned to
cores.  In such a setting, setting the RPS core in tcp_poll() and
udp_poll() is redundant, because the RFS core is correctly set in
recvmsg and sendmsg.

Thus, revert the following commit:
c3f1dbaf6e28 ("net: Update RFS target at poll for tcp/udp").

Signed-off-by: Soheil Hassas Yeganeh
Signed-off-by: Willem de Bruijn
Signed-off-by: Eric Dumazet
Signed-off-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7ac583a2b9fe..f68cb33d50d1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -498,8 +498,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int state;
 
-	sock_rps_record_flow(sk);
-
 	sock_poll_wait(file, sk_sleep(sk), wait);
 
 	state = inet_sk_state_load(sk);
-- 
cgit
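For context on the revert above, here is a simplified sketch of the
flow-recording helper that remains in the send/receive path (based on
include/net/sock.h of that era; the static-key fast-path guard of the real
kernel is omitted):

/* Simplified sketch.  inet_sendmsg()/inet_recvmsg() call this, so the
 * flow-to-CPU mapping is still refreshed by the threads that actually
 * touch the data; removing the call from tcp_poll() merely stops
 * poller threads from overriding that steering decision.
 */
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
	/* Record "flow sk->sk_rxhash last ran on this CPU" in the
	 * global rps_sock_flow_table that RFS consults on receive.
	 */
	sock_rps_record_flow_hash(sk->sk_rxhash);
#endif
}

In the sharded-FD design the commit message describes, this means RX packets
follow the worker thread's CPU rather than the poller's.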
From 809a79e913eed4ca02bfe5f78d3b7a56c7d8d7a6 Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Wed, 10 Jan 2018 07:43:15 +0000
Subject: tcp: make local function tcp_recv_timestamp static

Fixes the following sparse warning:

net/ipv4/tcp.c:1736:6: warning: symbol 'tcp_recv_timestamp' was not
declared. Should it be static?

Signed-off-by: Wei Yongjun
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f68cb33d50d1..d7cf861bf699 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1733,8 +1733,8 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb,
 }
 
 /* Similar to __sock_recv_timestamp, but does not require an skb */
-void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
-			struct scm_timestamping *tss)
+static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+			       struct scm_timestamping *tss)
 {
 	struct timeval tv;
 	bool has_timestamping = false;
-- 
cgit

From de525be2ca2734865d29c4b67ddd29913b214906 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo
Date: Thu, 25 Jan 2018 16:14:09 -0800
Subject: bpf: Support passing args to sock_ops bpf function

Adds support for passing up to 4 arguments to sock_ops BPF functions.
It reuses the reply union, so the bpf_sock_ops structures are not
increased in size.

Signed-off-by: Lawrence Brakmo
Signed-off-by: Alexei Starovoitov
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d7cf861bf699..88b62441e7e9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -463,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
 	tcp_mtup_init(sk);
 	icsk->icsk_af_ops->rebuild_header(sk);
 	tcp_init_metrics(sk);
-	tcp_call_bpf(sk, bpf_op);
+	tcp_call_bpf(sk, bpf_op, 0, NULL);
 	tcp_init_congestion_control(sk);
 	tcp_init_buffer_space(sk);
 }
-- 
cgit
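The reworked tcp_call_bpf() above takes an argument count and a u32 array.
A sketch of the kind of convenience wrapper this enables, consistent with the
tcp_call_bpf_2arg() name used by the next commit (the exact wrappers live in
include/net/tcp.h and may differ in detail):

/* Sketch of the argument-passing convention: the caller builds a small
 * u32 array on the stack, and the sock_ops BPF program reads the values
 * back through bpf_sock_ops->args[].
 */
static inline int tcp_call_bpf_2arg(struct sock *sk, int op,
				    u32 arg1, u32 arg2)
{
	u32 args[2] = {arg1, arg2};

	return tcp_call_bpf(sk, op, 2, args);
}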
From d44874910a26f3a8f81edf873a2473363f07f660 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo
Date: Thu, 25 Jan 2018 16:14:15 -0800
Subject: bpf: Add BPF_SOCK_OPS_STATE_CB

Adds support for calling sock_ops BPF programs when there is a TCP
state change.  Two arguments are used: one for the old state and
another for the new state.

There is a new enum in include/uapi/linux/bpf.h that exports the TCP
states, prepending BPF_ to the current TCP state names.  If it is ever
necessary to change the internal TCP state values (other than adding
more to the end), then it will become necessary to convert from the
internal TCP state value to the BPF value before calling the BPF
sock_ops function.  A set of compile-time checks is added in tcp.c to
detect if the internal and BPF values ever differ, so we can make the
necessary fixes.

New op: BPF_SOCK_OPS_STATE_CB.

Signed-off-by: Lawrence Brakmo
Signed-off-by: Alexei Starovoitov
---
 net/ipv4/tcp.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 88b62441e7e9..f013ddc191e0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2042,6 +2042,30 @@ void tcp_set_state(struct sock *sk, int state)
 {
 	int oldstate = sk->sk_state;
 
+	/* We defined a new enum for TCP states that are exported in BPF
+	 * so as not force the internal TCP states to be frozen. The
+	 * following checks will detect if an internal state value ever
+	 * differs from the BPF value. If this ever happens, then we will
+	 * need to remap the internal value to the BPF value before calling
+	 * tcp_call_bpf_2arg.
+	 */
+	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
+	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
+	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
+	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
+
 	switch (state) {
 	case TCP_ESTABLISHED:
 		if (oldstate != TCP_ESTABLISHED)
-- 
cgit

From 9b42d55a66d388e4dd5550107df051a9637564fc Mon Sep 17 00:00:00 2001
From: Li RongQing
Date: Fri, 26 Jan 2018 16:40:41 +0800
Subject: tcp: release sk_frag.page in tcp_disconnect

A socket can be disconnected and transformed back into a listening
socket.  If sk_frag.page is not released, it will be cloned into a new
socket by sk_clone_lock, but the reference count of this page is not
increased, leading to a use-after-free or double-free issue.

Signed-off-by: Li RongQing
Cc: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c8ed3a04b504..874c9317b8df 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2458,6 +2458,12 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
 
+	if (sk->sk_frag.page) {
+		put_page(sk->sk_frag.page);
+		sk->sk_frag.page = NULL;
+		sk->sk_frag.offset = 0;
+	}
+
 	sk->sk_error_report(sk);
 	return err;
 }
-- 
cgit
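A hypothetical sock_ops program consuming the new BPF_SOCK_OPS_STATE_CB op
could look like the sketch below.  This is a libbpf-style illustration, not
code from the patch series: the program name and the choice to opt in at
connection setup are assumptions, while the op, flag, helper, and args[]
fields come from the commits above and their companions.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int log_state_changes(struct bpf_sock_ops *skops)
{
	/* State callbacks are off by default; opt in per connection. */
	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
	    skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
		bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG);

	if (skops->op == BPF_SOCK_OPS_STATE_CB) {
		__u32 old_state = skops->args[0];	/* BPF_TCP_* values */
		__u32 new_state = skops->args[1];

		bpf_printk("tcp state %d -> %d\n", old_state, new_state);
	}
	return 1;
}

char _license[] SEC("license") = "GPL";

Finally, the tcp_disconnect() fix in the last commit is reachable from
userspace via the AF_UNSPEC connect trick.  An illustrative sequence of the
disconnect-to-listener transformation described in that commit message:

#include <string.h>
#include <sys/socket.h>

/* Illustrative only: an established socket is disconnected with
 * AF_UNSPEC and turned back into a listener, after which each accepted
 * child is cloned from it by sk_clone_lock().
 */
static void disconnect_and_listen(int fd, const char *buf, size_t len)
{
	struct sockaddr sa;

	send(fd, buf, len, 0);		/* may leave a page in sk_frag */

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;
	connect(fd, &sa, sizeof(sa));	/* ends up in tcp_disconnect() */

	listen(fd, 16);			/* socket is a listener again */
}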