summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c80
1 files changed, 53 insertions, 27 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 734cfc8ff76e..cfa51cfd2d99 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -97,6 +97,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
+#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -508,9 +509,6 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
u32 new_sample = tp->rcv_rtt_est.rtt_us;
long m = sample;
- if (m == 0)
- m = 1;
-
if (new_sample != 0) {
/* If we sample in larger samples in the non-timestamp
* case, we could grossly overestimate the RTT especially
@@ -547,6 +545,8 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
return;
delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
+ if (!delta_us)
+ delta_us = 1;
tcp_rcv_rtt_update(tp, delta_us, 1);
new_measure:
@@ -563,8 +563,11 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
(TCP_SKB_CB(skb)->end_seq -
TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) {
u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
- u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+ u32 delta_us;
+ if (!delta)
+ delta = 1;
+ delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
tcp_rcv_rtt_update(tp, delta_us, 0);
}
}
@@ -576,9 +579,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 copied;
int time;
- int copied;
+ tcp_mstamp_refresh(tp);
time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
return;
@@ -599,38 +603,31 @@ void tcp_rcv_space_adjust(struct sock *sk)
if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int rcvwin, rcvmem, rcvbuf;
+ int rcvmem, rcvbuf;
+ u64 rcvwin, grow;
/* minimal window to cope with packet losses, assuming
* steady state. Add some cushion because of small variations.
*/
- rcvwin = (copied << 1) + 16 * tp->advmss;
+ rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
- /* If rate increased by 25%,
- * assume slow start, rcvwin = 3 * copied
- * If rate increased by 50%,
- * assume sender can use 2x growth, rcvwin = 4 * copied
- */
- if (copied >=
- tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
- if (copied >=
- tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
- rcvwin <<= 1;
- else
- rcvwin += (rcvwin >> 1);
- }
+ /* Accommodate for sender rate increase (eg. slow start) */
+ grow = rcvwin * (copied - tp->rcvq_space.space);
+ do_div(grow, tp->rcvq_space.space);
+ rcvwin += (grow << 1);
rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
rcvmem += 128;
- rcvbuf = min(rcvwin / tp->advmss * rcvmem,
- sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+ do_div(rcvwin, tp->advmss);
+ rcvbuf = min_t(u64, rcvwin * rcvmem,
+ sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
if (rcvbuf > sk->sk_rcvbuf) {
sk->sk_rcvbuf = rcvbuf;
/* Make the window clamp follow along. */
- tp->window_clamp = rcvwin;
+ tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
}
}
tp->rcvq_space.space = copied;
@@ -1941,6 +1938,8 @@ void tcp_enter_loss(struct sock *sk)
if (is_reneg) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
tp->sacked_out = 0;
+ /* Mark SACK reneging until we recover from this loss event. */
+ tp->is_sack_reneg = 1;
}
tcp_clear_all_retrans_hints(tp);
@@ -2326,6 +2325,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
}
tp->snd_cwnd_stamp = tcp_jiffies32;
tp->undo_marker = 0;
+ tp->rack.advanced = 1; /* Force RACK to re-exam losses */
}
static inline bool tcp_may_undo(const struct tcp_sock *tp)
@@ -2364,6 +2364,7 @@ static bool tcp_try_undo_recovery(struct sock *sk)
return true;
}
tcp_set_ca_state(sk, TCP_CA_Open);
+ tp->is_sack_reneg = 0;
return false;
}
@@ -2397,8 +2398,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPSPURIOUSRTOS);
inet_csk(sk)->icsk_retransmits = 0;
- if (frto_undo || tcp_is_sack(tp))
+ if (frto_undo || tcp_is_sack(tp)) {
tcp_set_ca_state(sk, TCP_CA_Open);
+ tp->is_sack_reneg = 0;
+ }
return true;
}
return false;
@@ -2855,11 +2858,18 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
*rexmit = REXMIT_LOST;
}
-static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
+static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
{
u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
struct tcp_sock *tp = tcp_sk(sk);
+ if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
+ /* If the remote keeps returning delayed ACKs, eventually
+ * the min filter would pick it up and overestimate the
+ * prop. delay when it expires. Skip suspected delayed ACKs.
+ */
+ return;
+ }
minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
rtt_us ? : jiffies_to_usecs(1));
}
@@ -2899,7 +2909,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* always taken together with ACK, SACK, or TS-opts. Any negative
* values will be skipped with the seq_rtt_us < 0 check above.
*/
- tcp_update_rtt_min(sk, ca_rtt_us);
+ tcp_update_rtt_min(sk, ca_rtt_us, flag);
tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk);
@@ -3123,6 +3133,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
+
+ if (pkts_acked == 1 && last_in_flight < tp->mss_cache &&
+ last_in_flight && !prior_sacked && fully_acked &&
+ sack->rate->prior_delivered + 1 == tp->delivered &&
+ !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
+ /* Conservatively mark a delayed ACK. It's typically
+ * from a lone runt packet over the round trip to
+ * a receiver w/o out-of-order or CE events.
+ */
+ flag |= FLAG_ACK_MAYBE_DELAYED;
+ }
}
if (sack->first_sackt) {
sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
@@ -3495,6 +3516,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
struct tcp_sacktag_state sack_state;
struct rate_sample rs = { .prior_delivered = 0 };
u32 prior_snd_una = tp->snd_una;
+ bool is_sack_reneg = tp->is_sack_reneg;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
@@ -3611,7 +3633,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
lost = tp->lost - lost; /* freshly marked lost */
- tcp_rate_gen(sk, delivered, lost, sack_state.rate);
+ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
+ tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit);
return 1;
@@ -5296,6 +5319,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
unsigned int len = skb->len;
struct tcp_sock *tp = tcp_sk(sk);
+ /* TCP congestion window tracking */
+ trace_tcp_probe(sk, skb);
+
tcp_mstamp_refresh(tp);
if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);