tcp: higher throughput under reordering with adaptive RACK reordering wnd

Currently TCP RACK loss detection does not work well if packets are being reordered beyond its static reordering window (min_rtt/4).Under such reordering it may falsely trigger loss recoveries and reduce TCP throughput significantly. This patch improves that by increasing and reducing the reordering window based on DSACK, which is now supported in major TCP implementations. It makes RACK's reo_wnd adaptive based on DSACK and no. of recoveries. - If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded by srtt), since there is possibility that spurious retransmission was due to reordering delay longer than reo_wnd. - Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) no. of successful recoveries (accounts for full DSACK-based loss recovery undo). After that, reset it to default (min_rtt/4). - At max, reo_wnd is incremented only once per rtt. So that the new DSACK on which we are reacting, is due to the spurious retx (approx) after the reo_wnd has been updated last time. - reo_wnd is tracked in terms of steps (of min_rtt/4), rather than absolute value to account for change in rtt. In our internal testing, we observed significant increase in throughput, in scenarios where reordering exceeds min_rtt/4 (previous static value). Signed-off-by: Priyaranjan Jha <priyarjha@google.com> Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Priyaranjan Jha <priyarjha@google.com> 2017-11-03 16:38:48 -0700
committer: David S. Miller <davem@davemloft.net> 2017-11-05 23:15:42 +0900
commit: 1f2556916d974cfb62b6af51660186b5f58bd869 (patch)
tree: fb63694954f33401a8641c794e118f845ca61057 /net/ipv4
parent: 6c49b5e26004eef86e7a47093a53be290554351c (diff)
4 files changed, 58 insertions, 2 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a7a0f316eb86..c4cb19ed4628 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -447,6 +447,7 @@ void tcp_init_sock(struct sock *sk)
 	tcp_assign_congestion_control(sk);
 
 	tp->tsoffset = 0;
+	tp->rack.reo_wnd_steps = 1;
 
 	sk->sk_state = TCP_CLOSE;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8393b405ea98..0ada8bfc2ebd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -856,6 +856,7 @@ void tcp_disable_fack(struct tcp_sock *tp)
 static void tcp_dsack_seen(struct tcp_sock *tp)
 {
 	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
+	tp->rack.dsack_seen = 1;
 }
 
 static void tcp_update_reordering(struct sock *sk, const int metric,
@@ -2408,6 +2409,8 @@ static bool tcp_try_undo_recovery(struct sock *sk)
 			mib_idx = LINUX_MIB_TCPFULLUNDO;
 
 		NET_INC_STATS(sock_net(sk), mib_idx);
+	} else if (tp->rack.reo_wnd_persist) {
+		tp->rack.reo_wnd_persist--;
 	}
 	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
 		/* Hold old state until something *above* high_seq
@@ -2427,6 +2430,8 @@ static bool tcp_try_undo_dsack(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tp->undo_marker && !tp->undo_retrans) {
+		tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
+					       tp->rack.reo_wnd_persist + 1);
 		DBGUNDO(sk, "D-SACK");
 		tcp_undo_cwnd_reduction(sk, false);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
@@ -3644,6 +3649,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
 				    &sack_state);
 
+	tcp_rack_update_reo_wnd(sk, &rs);
+
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 	/* If needed, reset TLP/RTO timer; RACK may later override this. */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3c65c1a3f944..4bb86580decd 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -551,6 +551,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->syn_data_acked = 0;
 		newtp->rack.mstamp = 0;
 		newtp->rack.advanced = 0;
+		newtp->rack.reo_wnd_steps = 1;
+		newtp->rack.last_delivered = 0;
+		newtp->rack.reo_wnd_persist = 0;
+		newtp->rack.dsack_seen = 0;
 
 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 	}
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index ac3e9c6d3a3d..d3ea89020c69 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -44,6 +44,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 min_rtt = tcp_min_rtt(tp);
 	struct sk_buff *skb, *n;
 	u32 reo_wnd;
 
@@ -54,8 +55,10 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 	 * to queuing or delayed ACKs.
 	 */
 	reo_wnd = 1000;
-	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
-		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+	if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) {
+		reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
+		reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
+	}
 
 	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
 				 tcp_tsorted_anchor) {
@@ -160,3 +163,44 @@ void tcp_rack_reo_timeout(struct sock *sk)
 	if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
 		tcp_rearm_rto(sk);
 }
+
+/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
+ *
+ * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded
+ * by srtt), since there is possibility that spurious retransmission was
+ * due to reordering delay longer than reo_wnd.
+ *
+ * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
+ * no. of successful recoveries (accounts for full DSACK-based loss
+ * recovery undo). After that, reset it to default (min_rtt/4).
+ *
+ * At max, reo_wnd is incremented only once per rtt. So that the new
+ * DSACK on which we are reacting, is due to the spurious retx (approx)
+ * after the reo_wnd has been updated last time.
+ *
+ * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
+ * absolute value to account for change in rtt.
+ */
+void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND ||
+	    !rs->prior_delivered)
+		return;
+
+	/* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
+	if (before(rs->prior_delivered, tp->rack.last_delivered))
+		tp->rack.dsack_seen = 0;
+
+	/* Adjust the reo_wnd if update is pending */
+	if (tp->rack.dsack_seen) {
+		tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
+					       tp->rack.reo_wnd_steps + 1);
+		tp->rack.dsack_seen = 0;
+		tp->rack.last_delivered = tp->delivered;
+		tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
+	} else if (!tp->rack.reo_wnd_persist) {
+		tp->rack.reo_wnd_steps = 1;
+	}
+}
author	Priyaranjan Jha <priyarjha@google.com>	2017-11-03 16:38:48 -0700
committer	David S. Miller <davem@davemloft.net>	2017-11-05 23:15:42 +0900
commit	1f2556916d974cfb62b6af51660186b5f58bd869 (patch)
tree	fb63694954f33401a8641c794e118f845ca61057 /net/ipv4
parent	6c49b5e26004eef86e7a47093a53be290554351c (diff)