From 7e901ee7b6ab0b7c1a5e29b8513af23709285a29 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 30 Oct 2020 18:34:12 -0700 Subject: tcp: avoid slow start during fast recovery on new losses During TCP fast recovery, the congestion control in charge is by default the Proportional Rate Reduction (PRR) unless the congestion control module specified otherwise (e.g. BBR). Previously when tcp_packets_in_flight() is below snd_ssthresh PRR would slow start upon receiving an ACK that 1) cumulatively acknowledges retransmitted data and 2) does not detect further lost retransmission Such conditions indicate the repair is in good steady progress after the first round trip of recovery. Otherwise PRR adopts the packet conservation principle to send only the amount that was newly delivered (indicated by this ACK). This patch generalizes the previous design principle to include also the newly sent data beside retransmission: as long as the delivery is making good progress, both retransmission and new data should be accounted to make PRR more cautious in slow starting. Suggested-by: Matt Mathis Suggested-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20201031013412.1973112-1-ycheng@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 389d1b340248..fb3a7750f623 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2546,7 +2546,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) * 1) If the packets in flight is larger than ssthresh, PRR spreads the * cwnd reductions across a full RTT. * 2) Otherwise PRR uses packet conservation to send as much as delivered. - * But when the retransmits are acked without further losses, PRR + * But when SND_UNA is acked without further losses, * slow starts cwnd up to ssthresh to speed up the recovery. */ static void tcp_init_cwnd_reduction(struct sock *sk) @@ -2563,7 +2563,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tcp_ecn_queue_cwr(tp); } -void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2577,8 +2577,7 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == - FLAG_RETRANS_DATA_ACKED) { + } else if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) { sndcnt = min_t(int, delta, max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked) + 1); @@ -3419,7 +3418,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, if (tcp_in_cwnd_reduction(sk)) { /* Reduce cwnd if state mandates */ - tcp_cwnd_reduction(sk, acked_sacked, flag); + tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag); } else if (tcp_may_raise_cwnd(sk, flag)) { /* Advance cwnd if state allows */ tcp_cong_avoid(sk, ack, acked_sacked); -- cgit From 7ea851d19b23593d7601ecb8091d529f4f475bf5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 30 Nov 2020 16:36:30 +0100 Subject: tcp: merge 'init_req' and 'route_req' functions The Multipath-TCP standard (RFC 8684) says that an MPTCP host should send a TCP reset if the token in a MP_JOIN request is unknown. At this time we don't do this, the 3whs completes and the 'new subflow' is reset afterwards. There are two ways to allow MPTCP to send the reset. 1. override 'send_synack' callback and emit the rst from there. The drawback is that the request socket gets inserted into the listeners queue just to get removed again right away. 2. Send the reset from the 'route_req' function instead. This avoids the 'add&remove request socket', but route_req lacks the skb that is required to send the TCP reset. Instead of just adding the skb to that function for MPTCP sake alone, Paolo suggested to merge init_req and route_req functions. This saves one indirection from syn processing path and provides the skb to the merged function at the same time. 'send reset on unknown mptcp join token' is added in next patch. Suggested-by: Paolo Abeni Cc: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb3a7750f623..9e8a6c1aa019 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6799,18 +6799,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); - af_ops->init_req(req, sk, skb); - - if (security_inet_conn_request(sk, skb, req)) + dst = af_ops->route_req(sk, skb, &fl, req); + if (!dst) goto drop_and_free; if (tmp_opt.tstamp_ok) tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); - dst = af_ops->route_req(sk, &fl, req); - if (!dst) - goto drop_and_free; - if (!want_cookie && !isn) { /* Kill the following clause, if you dislike this way. */ if (!net->ipv4.sysctl_tcp_syncookies && -- cgit From 049fe386d35353398eee40ba8d76ab62cb5a24e5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 10 Dec 2020 14:25:03 -0800 Subject: tcp: parse mptcp options contained in reset packets Because TCP-level resets only affect the subflow, there is a MPTCP option to indicate that the MPTCP-level connection should be closed immediately without a mptcp-level fin exchange. This is the 'MPTCP fast close option'. It can be carried on ack segments or TCP resets. In the latter case, its needed to parse mptcp options also for reset packets so that MPTCP can act accordingly. Next patch will add receive side fastclose support in MPTCP. Acked-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6ad3b5c38e7..48ee476aa031 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4218,10 +4218,13 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) } /* When we get a reset we do this. */ -void tcp_reset(struct sock *sk) +void tcp_reset(struct sock *sk, struct sk_buff *skb) { trace_tcp_receive_reset(sk); + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb); + /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: @@ -5604,7 +5607,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); } else if (tcp_reset_check(sk, skb)) { - tcp_reset(sk); + tcp_reset(sk, skb); } goto discard; } @@ -5640,7 +5643,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } if (rst_seq_match) - tcp_reset(sk); + tcp_reset(sk, skb); else { /* Disable TFO if RST is out-of-order * and no data has been received @@ -6077,7 +6080,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, */ if (th->rst) { - tcp_reset(sk); + tcp_reset(sk, skb); goto discard; } @@ -6519,7 +6522,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - tcp_reset(sk); + tcp_reset(sk, skb); return 1; } } -- cgit From c31b70c9968fe9c4194d1b5d06d07596a3b680de Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Sat, 12 Dec 2020 12:31:24 -0800 Subject: tcp: Add logic to check for SYN w/ data in tcp_simple_retransmit There are cases where a fastopen SYN may trigger either a ICMP_TOOBIG message in the case of IPv6 or a fragmentation request in the case of IPv4. This results in the socket stalling for a second or more as it does not respond to the message by retransmitting the SYN frame. Normally a SYN frame should not be able to trigger a ICMP_TOOBIG or ICMP_FRAG_NEEDED however in the case of fastopen we can have a frame that makes use of the entire MSS. In the case of fastopen it does, and an additional complication is that the retransmit queue doesn't contain the original frames. As a result when tcp_simple_retransmit is called and walks the list of frames in the queue it may not mark the frames as lost because both the SYN and the data packet each individually are smaller than the MSS size after the adjustment. This results in the socket being stalled until the retransmit timer kicks in and forces the SYN frame out again without the data attached. In order to resolve this we can reduce the MSS the packets are compared to in tcp_simple_retransmit to -1 for cases where we are still in the TCP_SYN_SENT state for a fastopen socket. Doing this we will mark all of the packets related to the fastopen SYN as lost. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Link: https://lore.kernel.org/r/160780498125.3272.15437756269539236825.stgit@localhost.localdomain Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 48ee476aa031..c7e16b0ed791 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2689,7 +2689,22 @@ void tcp_simple_retransmit(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss = tcp_current_mss(sk); + int mss; + + /* A fastopen SYN request is stored as two separate packets within + * the retransmit queue, this is done by tcp_send_syn_data(). + * As a result simply checking the MSS of the frames in the queue + * will not work for the SYN packet. + * + * Us being here is an indication of a path MTU issue so we can + * assume that the fastopen SYN was lost and just mark all the + * frames in the retransmit queue as lost. We will use an MSS of + * -1 to mark all frames as lost, otherwise compute the current MSS. + */ + if (tp->syn_data && sk->sk_state == TCP_SYN_SENT) + mss = -1; + else + mss = tcp_current_mss(sk); skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { if (tcp_skb_seglen(skb) > mss) -- cgit