From 9d691539eea2d977e3eb86766c389a19a9c13146 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Feb 2016 21:03:08 -0800 Subject: tcp: do not enqueue skb with SYN flag If we remove the SYN flag from the skbs that tcp_fastopen_add_skb() places in socket receive queue, then we can remove the test that tcp_recvmsg() has to perform in fast path. All we have to do is to adjust SEQ in the slow path. For the moment, we place an unlikely() and output a message if we find an skb having SYN flag set. Goal would be to get rid of the test completely. Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 19746b3fcbbe..c5075779e017 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1466,8 +1466,10 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; + } if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; @@ -1657,8 +1659,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, break; offset = *seq - TCP_SKB_CB(skb)->seq; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; + } if (offset < skb->len) goto found_ok_skb; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) -- cgit From 6fa251663069e05daadd1666cbf3b658bf840ea4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:49 +0200 Subject: ipv4: Namespaceify tcp syn retries sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c5075779e017..3dbb3637bb4b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2731,6 +2731,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); int val, len; if (get_user(len, optlen)) @@ -2765,7 +2766,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = keepalive_probes(tp); break; case TCP_SYNCNT: - val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; break; case TCP_LINGER2: val = tp->linger2; -- cgit From 1043e25ff96a1efc7bd34d11f5f32203a28a3bd7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:52 +0200 Subject: ipv4: Namespaceify tcp reordering sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3dbb3637bb4b..f4db6b04cdb4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -406,7 +406,7 @@ void tcp_init_sock(struct sock *sk) tp->mss_cache = TCP_MSS_DEFAULT; u64_stats_init(&tp->syncp); - tp->reordering = sysctl_tcp_reordering; + tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; tcp_enable_early_retrans(tp); tcp_assign_congestion_control(sk); -- cgit From 1e579caa18b96f9eb18f4f5416658cd15f37c062 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:56 +0200 Subject: ipv4: Namespaceify tcp_fin_timeout sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f4db6b04cdb4..014f18e2f7b3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -282,8 +282,6 @@ #include #include -int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; - int sysctl_tcp_min_tso_segs __read_mostly = 2; int sysctl_tcp_autocorking __read_mostly = 1; @@ -2330,6 +2328,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); int val; int err = 0; @@ -2526,7 +2525,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_LINGER2: if (val < 0) tp->linger2 = -1; - else if (val > sysctl_tcp_fin_timeout / HZ) + else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ) tp->linger2 = 0; else tp->linger2 = val * HZ; @@ -2771,7 +2770,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_LINGER2: val = tp->linger2; if (val >= 0) - val = (val ? : sysctl_tcp_fin_timeout) / HZ; + val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, -- cgit From cd9b266095f422267bddbec88f9098b48ea548fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Feb 2016 22:02:53 -0800 Subject: tcp: add tcpi_min_rtt and tcpi_notsent_bytes to tcp_info tcpi_min_rtt reports the minimal rtt observed by TCP stack for the flow, in usec unit. Might be ~0U if not yet known. tcpi_notsent_bytes reports the amount of bytes in the write queue that were not yet sent. This is done in a single patch to not add a temporary 32bit padding hole in tcp_info. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 014f18e2f7b3..f93150d15199 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2642,6 +2642,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; unsigned int start; + int notsent_bytes; u64 rate64; u32 rate; @@ -2722,6 +2723,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; + + notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); + info->tcpi_notsent_bytes = max(0, notsent_bytes); + + info->tcpi_min_rtt = tcp_min_rtt(tp); } EXPORT_SYMBOL_GPL(tcp_get_info); -- cgit From 473bd239b808a8af5241ce9996a16d283d88ddff Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:05 -0800 Subject: tcp: Add tcp_inq to get available receive bytes on socket Create a common kernel function to get the number of bytes available on a TCP socket. This is based on code in INQ getsockopt and we now call the function for that getsockopt. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f9faadb42485..a265f00b9df9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -556,20 +556,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return -EINVAL; slow = lock_sock_fast(sk); - if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) - answ = 0; - else if (sock_flag(sk, SOCK_URGINLINE) || - !tp->urg_data || - before(tp->urg_seq, tp->copied_seq) || - !before(tp->urg_seq, tp->rcv_nxt)) { - - answ = tp->rcv_nxt - tp->copied_seq; - - /* Subtract 1, if FIN was received */ - if (answ && sock_flag(sk, SOCK_DONE)) - answ--; - } else - answ = tp->urg_seq - tp->copied_seq; + answ = tcp_inq(sk); unlock_sock_fast(sk, slow); break; case SIOCATMARK: -- cgit From a44d6eacdaf56f74fad699af7f4925a5f5ac0e7f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 14 Mar 2016 10:52:15 -0700 Subject: tcp: Add RFC4898 tcpEStatsPerfDataSegsOut/In Per RFC4898, they count segments sent/received containing a positive length data segment (that includes retransmission segments carrying data). Unlike tcpi_segs_out/in, tcpi_data_segs_out/in excludes segments carrying no data (e.g. pure ack). The patch also updates the segs_in in tcp_fastopen_add_skb() so that segs_in >= data_segs_in property is kept. Together with retransmission data, tcpi_data_segs_out gives a better signal on the rxmit rate. v6: Rebase on the latest net-next v5: Eric pointed out that checking skb->len is still needed in tcp_fastopen_add_skb() because skb can carry a FIN without data. Hence, instead of open coding segs_in and data_segs_in, tcp_segs_in() helper is used. Comment is added to the fastopen case to explain why segs_in has to be reset and tcp_segs_in() has to be called before __skb_pull(). v4: Add comment to the changes in tcp_fastopen_add_skb() and also add remark on this case in the commit message. v3: Add const modifier to the skb parameter in tcp_segs_in() v2: Rework based on recent fix by Eric: commit a9d99ce28ed3 ("tcp: fix tcpi_segs_in after connection establishment") Signed-off-by: Martin KaFai Lau Cc: Chris Rapier Cc: Eric Dumazet Cc: Marcelo Ricardo Leitner Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a265f00b9df9..992b3103ec3e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2715,6 +2715,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_notsent_bytes = max(0, notsent_bytes); info->tcpi_min_rtt = tcp_min_rtt(tp); + info->tcpi_data_segs_in = tp->data_segs_in; + info->tcpi_data_segs_out = tp->data_segs_out; } EXPORT_SYMBOL_GPL(tcp_get_info); -- cgit