From 4a5ab4e224288403b0b4b6b8c4d339323150c312 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Feb 2014 15:57:10 -0800 Subject: tcp: remove 1ms offset in srtt computation TCP pacing depends on an accurate srtt estimation. Current srtt estimation is using jiffie resolution, and has an artificial offset of at least 1 ms, which can produce slowdowns when FQ/pacing is used, especially in DC world, where typical rtt is below 1 ms. We are planning a switch to usec resolution for linux-3.15, but in the meantime, this patch removes the 1 ms offset. All we need is to have tp->srtt minimal value of 1 to differentiate the case of srtt being initialized or not, not 8. The problematic behavior was observed on a 40Gbit testbed, where 32 concurrent netperf were reaching 12Gbps of aggregate speed, instead of line speed. This patch also has the effect of reporting more accurate srtt and send rates to iproute2 ss command as in : $ ss -i dst cca2 Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port tcp ESTAB 0 0 10.244.129.1:56984 10.244.129.2:12865 cubic wscale:6,6 rto:200 rtt:0.25/0.25 ato:40 mss:1448 cwnd:10 send 463.4Mbps rcv_rtt:1 rcv_space:29200 tcp ESTAB 0 390960 10.244.129.1:60247 10.244.129.2:50204 cubic wscale:6,6 rto:200 rtt:0.875/0.75 mss:1448 cwnd:73 ssthresh:51 send 966.4Mbps unacked:73 retrans:0/121 rcv_space:29200 Reported-by: Vytautas Valancius Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 03d26b85eab8..10435b3b9d0f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1977,7 +1977,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out || + if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out || !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) return false; -- cgit From bf06200e732de613a1277984bf34d1a21c2de03d Mon Sep 17 00:00:00 2001 From: John Ogness Date: Sun, 9 Feb 2014 18:40:11 -0800 Subject: tcp: tsq: fix nonagle handling Commit 46d3ceabd8d9 ("tcp: TCP Small Queues") introduced a possible regression for applications using TCP_NODELAY. If TCP session is throttled because of tsq, we should consult tp->nonagle when TX completion is done and allow us to send additional segment, especially if this segment is not a full MSS. Otherwise this segment is sent after an RTO. [edumazet] : Cooked the changelog, added another fix about testing sk_wmem_alloc twice because TX completion can happen right before setting TSQ_THROTTLED bit. This problem is particularly visible with recent auto corking, but might also be triggered with low tcp_limit_output_bytes values or NIC drivers delaying TX completion by hundred of usec, and very low rtt. Thomas Glanzmann for example reported an iscsi regression, caused by tcp auto corking making this bug quite visible. Fixes: 46d3ceabd8d9 ("tcp: TCP Small Queues") Signed-off-by: John Ogness Signed-off-by: Eric Dumazet Reported-by: Thomas Glanzmann Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 10435b3b9d0f..3be16727f058 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -698,7 +698,8 @@ static void tcp_tsq_handler(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) - tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); + tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, + 0, GFP_ATOMIC); } /* * One tasklet per cpu tries to send more skbs. @@ -1904,7 +1905,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (atomic_read(&sk->sk_wmem_alloc) > limit) { set_bit(TSQ_THROTTLED, &tp->tsq_flags); - break; + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED, so we must + * test again the condition. + * We abuse smp_mb__after_clear_bit() because + * there is no smp_mb__after_set_bit() yet + */ + smp_mb__after_clear_bit(); + if (atomic_read(&sk->sk_wmem_alloc) > limit) + break; } limit = mss_now; -- cgit From f5ddcbbb40aa0ba7fbfe22355d287603dbeeaaac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Feb 2014 10:09:18 -0800 Subject: net-tcp: fastopen: fix high order allocations This patch fixes two bugs in fastopen : 1) The tcp_sendmsg(..., @size) argument was ignored. Code was relying on user not fooling the kernel with iovec mismatches 2) When MTU is about 64KB, tcp_send_syn_data() attempts order-5 allocations, which are likely to fail when memory gets fragmented. Fixes: 783237e8daf13 ("net-tcp: Fast Open client - sending SYN-data") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Acked-by: Yuchung Cheng Tested-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3be16727f058..09805817627b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2908,7 +2908,12 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; - syn_data = skb_copy_expand(syn, skb_headroom(syn), space, + space = min_t(size_t, space, fo->size); + + /* limit to order-0 allocations */ + space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); + + syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space, sk->sk_allocation); if (syn_data == NULL) goto fallback; -- cgit From 9a9bfd032f0207dd6e63c84b7676b58f160af04b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Feb 2014 04:31:03 -0800 Subject: net: tcp: use NET_INC_STATS() While LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES can only be incremented in tcp_transmit_skb() from softirq (incoming message or timer activation), it is better to use NET_INC_STATS() instead of NET_INC_STATS_BH() as tcp_transmit_skb() can be called from process context. This will avoid copy/paste confusion when/if we want to add other SNMP counters in tcp_transmit_skb() Signed-off-by: Eric Dumazet Cc: Hannes Frederic Sowa Cc: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 09805817627b..d718482fd11c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -864,8 +864,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (unlikely(skb->fclone == SKB_FCLONE_ORIG && fclone->fclone == SKB_FCLONE_CLONE)) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); -- cgit From c84a57113f59486e6688be1cd443b96e3118efa0 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 28 Feb 2014 16:42:26 -0800 Subject: tcp: fix bogus RTT on special retransmission RTT may be bogus with tall loss probe (TLP) when a packet is retransmitted and latter (s)acked without TCPCB_SACKED_RETRANS flag. For example, TLP calls __tcp_retransmit_skb() instead of tcp_retransmit_skb(). The skb timestamps are updated but the sacked flag is not marked with TCPCB_SACKED_RETRANS. As a result we'll get bogus RTT in tcp_clean_rtx_queue() or in tcp_sacktag_one() on spurious retransmission. The fix is to apply the sticky flag TCP_EVER_RETRANS to enforce Karn's check on RTT sampling. However this will disable F-RTO if timeout occurs after TLP, by resetting undo_marker in tcp_enter_loss(). We relax this check to only if any pending retransmists are still in-flight. Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Nandita Dukkipati Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d718482fd11c..f0eb4e337ec8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2337,6 +2337,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); unsigned int cur_mss; + int err; /* Inconslusive MTU probe */ if (icsk->icsk_mtup.probe_size) { @@ -2400,11 +2401,15 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; } else { - return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + + if (likely(!err)) + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + return err; } int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) -- cgit