From 662051215c758ae8545451628816204ed6cd372d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Jun 2020 20:37:07 -0700 Subject: tcp: grow window for OOO packets only for SACK flows Back in 2013, we made a change that broke fast retransmit for non SACK flows. Indeed, for these flows, a sender needs to receive three duplicate ACK before starting fast retransmit. Sending ACK with different receive window do not count. Even if enabling SACK is strongly recommended these days, there still are some cases where it has to be disabled. Not increasing the window seems better than having to rely on RTO. After the fix, following packetdrill test gives : // Initialize connection 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 < S 0:0(0) win 32792 +0 > S. 0:0(0) ack 1 +0 < . 1:1(0) ack 1 win 514 +0 accept(3, ..., ...) = 4 +0 < . 1:1001(1000) ack 1 win 514 // Quick ack +0 > . 1:1(0) ack 1001 win 264 +0 < . 2001:3001(1000) ack 1 win 514 // DUPACK : Normally we should not change the window +0 > . 1:1(0) ack 1001 win 264 +0 < . 3001:4001(1000) ack 1 win 514 // DUPACK : Normally we should not change the window +0 > . 1:1(0) ack 1001 win 264 +0 < . 4001:5001(1000) ack 1 win 514 // DUPACK : Normally we should not change the window +0 > . 1:1(0) ack 1001 win 264 +0 < . 1001:2001(1000) ack 1 win 514 // Hole is repaired. +0 > . 1:1(0) ack 5001 win 272 Fixes: 4e4f1fc22681 ("tcp: properly increase rcv_ssthresh for ofo packets") Signed-off-by: Eric Dumazet Reported-by: Venkat Venkatsubra Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 83330a6cb242..12fda8f27b08 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4605,7 +4605,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: - tcp_grow_window(sk, skb); + /* For non sack flows, do not grow window to force DUPACK + * and trigger fast retransmit. + */ + if (tcp_is_sack(tp)) + tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); skb = NULL; goto add_sack; @@ -4689,7 +4693,11 @@ add_sack: tcp_sack_new_ofo_skb(sk, seq, end_seq); end: if (skb) { - tcp_grow_window(sk, skb); + /* For non sack flows, do not grow window to force DUPACK + * and trigger fast retransmit. + */ + if (tcp_is_sack(tp)) + tcp_grow_window(sk, skb); skb_condense(skb); skb_set_owner_r(skb, sk); } -- cgit From 2570284060b48f3f79d8f1a2698792f36c385e9a Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Thu, 25 Jun 2020 14:51:06 +0300 Subject: tcp: don't ignore ECN CWR on pure ACK there is a problem with the CWR flag set in an incoming ACK segment and it leads to the situation when the ECE flag is latched forever the following packetdrill script shows what happens: // Stack receives incoming segments with CE set +0.1 <[ect0] . 11001:12001(1000) ack 1001 win 65535 +0.0 <[ce] . 12001:13001(1000) ack 1001 win 65535 +0.0 <[ect0] P. 13001:14001(1000) ack 1001 win 65535 // Stack repsonds with ECN ECHO +0.0 >[noecn] . 1001:1001(0) ack 12001 +0.0 >[noecn] E. 1001:1001(0) ack 13001 +0.0 >[noecn] E. 1001:1001(0) ack 14001 // Write a packet +0.1 write(3, ..., 1000) = 1000 +0.0 >[ect0] PE. 1001:2001(1000) ack 14001 // Pure ACK received +0.01 <[noecn] W. 14001:14001(0) ack 2001 win 65535 // Since CWR was sent, this packet should NOT have ECE set +0.1 write(3, ..., 1000) = 1000 +0.0 >[ect0] P. 2001:3001(1000) ack 14001 // but Linux will still keep ECE latched here, with packetdrill // flagging a missing ECE flag, expecting // >[ect0] PE. 2001:3001(1000) ack 14001 // in the script In the situation above we will continue to send ECN ECHO packets and trigger the peer to reduce the congestion window. To avoid that we can check CWR on pure ACKs received. v3: - Add a sequence check to avoid sending an ACK to an ACK v2: - Adjusted the comment - move CWR check before checking for unacknowledged packets Signed-off-by: Denis Kirjanov Acked-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 12fda8f27b08..f3a0eb139b76 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -261,7 +261,8 @@ static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) * cwnd may be very low (even just 1 packet), so we should ACK * immediately. */ - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } } @@ -3665,6 +3666,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_in_ack_event(sk, ack_ev_flags); } + /* This is a deviation from RFC3168 since it states that: + * "When the TCP data sender is ready to set the CWR bit after reducing + * the congestion window, it SHOULD set the CWR bit only on the first + * new data packet that it transmits." + * We accept CWR on pure ACKs to be more robust + * with widely-deployed TCP implementations that do this. + */ + tcp_ecn_accept_cwr(sk, skb); + /* We passed data and got it acked, remove any soft error * log. Something worked... */ @@ -4800,8 +4810,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); - tcp_ecn_accept_cwr(sk, skb); - tp->rx_opt.dsack = 0; /* Queue data for delivery to the user. -- cgit From ba3bb0e76ccd464bb66665a1941fabe55dadb3ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Jun 2020 13:51:28 -0700 Subject: tcp: fix SO_RCVLOWAT possible hangs under high mem pressure Whenever tcp_try_rmem_schedule() returns an error, we are under trouble and should make sure to wakeup readers so that they can drain socket queues and eventually make room. Fixes: 03f45c883c6f ("tcp: avoid extra wakeups for SO_RCVLOWAT users") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f3a0eb139b76..9615e72656d1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4582,6 +4582,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); + sk->sk_data_ready(sk); tcp_drop(sk, skb); return; } @@ -4828,6 +4829,7 @@ queue_and_out: sk_forced_mem_schedule(sk, skb->truesize); else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); + sk->sk_data_ready(sk); goto drop; } -- cgit From 76be93fc0702322179bb0ea87295d820ee46ad14 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 23 Jul 2020 12:00:06 -0700 Subject: tcp: allow at most one TLP probe per flight Previously TLP may send multiple probes of new data in one flight. This happens when the sender is cwnd limited. After the initial TLP containing new data is sent, the sender receives another ACK that acks partial inflight. It may re-arm another TLP timer to send more, if no further ACK returns before the next TLP timeout (PTO) expires. The sender may send in theory a large amount of TLP until send queue is depleted. This only happens if the sender sees such irregular uncommon ACK pattern. But it is generally undesirable behavior during congestion especially. The original TLP design restrict only one TLP probe per inflight as published in "Reducing Web Latency: the Virtue of Gentle Aggression", SIGCOMM 2013. This patch changes TLP to send at most one probe per inflight. Note that if the sender is app-limited, TLP retransmits old data and did not have this issue. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9615e72656d1..518f04355fbf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3488,10 +3488,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) } } -/* This routine deals with acks during a TLP episode. - * We mark the end of a TLP episode on receiving TLP dupack or when - * ack is after tlp_high_seq. - * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. +/* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack */ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { @@ -3500,7 +3498,10 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) if (before(ack, tp->tlp_high_seq)) return; - if (flag & FLAG_DSACKING_ACK) { + if (!tp->tlp_retrans) { + /* TLP of new data has been acknowledged */ + tp->tlp_high_seq = 0; + } else if (flag & FLAG_DSACKING_ACK) { /* This DSACK means original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } else if (after(ack, tp->tlp_high_seq)) { -- cgit