summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c227
1 files changed, 60 insertions, 167 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 68bc79eb9019..71b76e98371a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1451,11 +1451,6 @@ static u8 tcp_sacktag_one(struct sock *sk,
tp->sacked_out += pcount;
/* Out-of-order packets delivered */
state->sack_delivered += pcount;
-
- /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
- if (tp->lost_skb_hint &&
- before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
- tp->lost_cnt_hint += pcount;
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1496,9 +1491,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
- if (skb == tp->lost_skb_hint)
- tp->lost_cnt_hint += pcount;
-
TCP_SKB_CB(prev)->end_seq += shifted;
TCP_SKB_CB(skb)->seq += shifted;
@@ -1531,10 +1523,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = prev;
- if (skb == tp->lost_skb_hint) {
- tp->lost_skb_hint = prev;
- tp->lost_cnt_hint -= tcp_skb_pcount(prev);
- }
TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
@@ -2151,12 +2139,6 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
tp->undo_retrans = -1;
}
-static bool tcp_is_rack(const struct sock *sk)
-{
- return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
- TCP_RACK_LOSS_DETECTION;
-}
-
/* If we detect SACK reneging, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
@@ -2182,8 +2164,7 @@ static void tcp_timeout_mark_lost(struct sock *sk)
skb_rbtree_walk_from(skb) {
if (is_reneg)
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
- else if (tcp_is_rack(sk) && skb != head &&
- tcp_rack_skb_timeout(tp, skb, 0) > 0)
+ else if (skb != head && tcp_rack_skb_timeout(tp, skb, 0) > 0)
continue; /* Don't mark recently sent ones lost yet */
tcp_mark_skb_lost(sk, skb);
}
@@ -2264,22 +2245,6 @@ static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag)
return false;
}
-/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
- * counter when SACK is enabled (without SACK, sacked_out is used for
- * that purpose).
- *
- * With reordering, holes may still be in flight, so RFC3517 recovery
- * uses pure sacked_out (total number of SACKed segments) even though
- * it violates the RFC that uses duplicate ACKs, often these are equal
- * but when e.g. out-of-window ACKs or packet duplication occurs,
- * they differ. Since neither occurs due to loss, TCP should really
- * ignore them.
- */
-static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
-{
- return tp->sacked_out + 1;
-}
-
/* Linux NewReno/SACK/ECN state machine.
* --------------------------------------
*
@@ -2332,13 +2297,7 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
*
* If the receiver supports SACK:
*
- * RFC6675/3517: It is the conventional algorithm. A packet is
- * considered lost if the number of higher sequence packets
- * SACKed is greater than or equal the DUPACK thoreshold
- * (reordering). This is implemented in tcp_mark_head_lost and
- * tcp_update_scoreboard.
- *
- * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ * RACK (RFC8985): RACK is a newer loss detection algorithm
* (2017-) that checks timing instead of counting DUPACKs.
* Essentially a packet is considered lost if it's not S/ACKed
* after RTT + reordering_window, where both metrics are
@@ -2353,8 +2312,8 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
* is lost (NewReno). This heuristics are the same in NewReno
* and SACK.
*
- * Really tricky (and requiring careful tuning) part of algorithm
- * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
+ * The really tricky (and requiring careful tuning) part of the algorithm
+ * is hidden in the RACK code in tcp_recovery.c and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
* hence, slow down forward transmission. In fact, it determines the moment
* when we decide that hole is caused by loss, rather than by a reorder.
@@ -2377,83 +2336,10 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
-static bool tcp_time_to_recover(struct sock *sk, int flag)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- /* Trick#1: The loss is proven. */
- if (tp->lost_out)
- return true;
-
- /* Not-A-Trick#2 : Classic rule... */
- if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
- return true;
-
- return false;
-}
-
-/* Detect loss in event "A" above by marking head of queue up as lost.
- * For RFC3517 SACK, a segment is considered lost if it
- * has at least tp->reordering SACKed seqments above it; "packets" refers to
- * the maximum SACKed segments to pass before reaching this limit.
- */
-static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- int cnt;
- /* Use SACK to deduce losses of new sequences sent during recovery */
- const u32 loss_high = tp->snd_nxt;
-
- WARN_ON(packets > tp->packets_out);
- skb = tp->lost_skb_hint;
- if (skb) {
- /* Head already handled? */
- if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
- return;
- cnt = tp->lost_cnt_hint;
- } else {
- skb = tcp_rtx_queue_head(sk);
- cnt = 0;
- }
-
- skb_rbtree_walk_from(skb) {
- /* TODO: do this better */
- /* this is not the most efficient way to do this... */
- tp->lost_skb_hint = skb;
- tp->lost_cnt_hint = cnt;
-
- if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
- break;
-
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
- cnt += tcp_skb_pcount(skb);
-
- if (cnt > packets)
- break;
-
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
- tcp_mark_skb_lost(sk, skb);
-
- if (mark_head)
- break;
- }
- tcp_verify_left_out(tp);
-}
-
-/* Account newly detected lost packet(s) */
-
-static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
+static bool tcp_time_to_recover(const struct tcp_sock *tp)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (tcp_is_sack(tp)) {
- int sacked_upto = tp->sacked_out - tp->reordering;
- if (sacked_upto >= 0)
- tcp_mark_head_lost(sk, sacked_upto, 0);
- else if (fast_rexmit)
- tcp_mark_head_lost(sk, 1, 1);
- }
+ /* Has loss detection marked at least one packet lost? */
+ return tp->lost_out != 0;
}
static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
@@ -2894,8 +2780,6 @@ void tcp_simple_retransmit(struct sock *sk)
tcp_mark_skb_lost(sk, skb);
}
- tcp_clear_retrans_hints_partial(tp);
-
if (!tp->lost_out)
return;
@@ -3003,17 +2887,8 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
*rexmit = REXMIT_LOST;
}
-static bool tcp_force_fast_retransmit(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- return after(tcp_highest_sack_seq(tp),
- tp->snd_una + tp->reordering * tp->mss_cache);
-}
-
/* Undo during fast recovery after partial ACK. */
-static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
- bool *do_lost)
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -3038,9 +2913,6 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
tcp_undo_cwnd_reduction(sk, true);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
tcp_try_keep_open(sk);
- } else {
- /* Partial ACK arrived. Force fast retransmit. */
- *do_lost = tcp_force_fast_retransmit(sk);
}
return false;
}
@@ -3054,7 +2926,7 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
if (unlikely(tcp_is_reno(tp))) {
tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
- } else if (tcp_is_rack(sk)) {
+ } else {
u32 prior_retrans = tp->retrans_out;
if (tcp_rack_mark_lost(sk))
@@ -3081,10 +2953,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int fast_rexmit = 0, flag = *ack_flag;
+ int flag = *ack_flag;
bool ece_ack = flag & FLAG_ECE;
- bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
- tcp_force_fast_retransmit(sk));
if (!tp->packets_out && tp->sacked_out)
tp->sacked_out = 0;
@@ -3133,7 +3003,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
if (tcp_is_reno(tp))
tcp_add_reno_sack(sk, num_dupack, ece_ack);
- } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
+ } else if (tcp_try_undo_partial(sk, prior_snd_una))
return;
if (tcp_try_undo_dsack(sk))
@@ -3141,7 +3011,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tcp_identify_packet_loss(sk, ack_flag);
if (icsk->icsk_ca_state != TCP_CA_Recovery) {
- if (!tcp_time_to_recover(sk, flag))
+ if (!tcp_time_to_recover(tp))
return;
/* Undo reverts the recovery state. If loss is evident,
* starts a new recovery (e.g. reordering then loss);
@@ -3170,7 +3040,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tcp_try_undo_dsack(sk);
tcp_identify_packet_loss(sk, ack_flag);
- if (!tcp_time_to_recover(sk, flag)) {
+ if (!tcp_time_to_recover(tp)) {
tcp_try_to_open(sk, flag);
return;
}
@@ -3188,11 +3058,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
/* Otherwise enter Recovery state */
tcp_enter_recovery(sk, ece_ack);
- fast_rexmit = 1;
}
- if (!tcp_is_rack(sk) && do_lost)
- tcp_update_scoreboard(sk, fast_rexmit);
*rexmit = REXMIT_LOST;
}
@@ -3448,8 +3315,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
next = skb_rb_next(skb);
if (unlikely(skb == tp->retransmit_skb_hint))
tp->retransmit_skb_hint = NULL;
- if (unlikely(skb == tp->lost_skb_hint))
- tp->lost_skb_hint = NULL;
tcp_highest_sack_replace(sk, skb, next);
tcp_rtx_queue_unlink_and_free(skb, sk);
}
@@ -3507,14 +3372,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (flag & FLAG_RETRANS_DATA_ACKED)
flag &= ~FLAG_ORIG_SACK_ACKED;
} else {
- int delta;
-
/* Non-retransmitted hole got filled? That's reordering */
if (before(reord, prior_fack))
tcp_check_sack_reordering(sk, reord, 0);
-
- delta = prior_sacked - tp->sacked_out;
- tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
@@ -3854,7 +3714,7 @@ static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
}
/* This routine deals with acks during a TLP episode and ends an episode by
- * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
+ * resetting tlp_high_seq. Ref: TLP algorithm in RFC8985
*/
static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
{
@@ -4531,14 +4391,22 @@ static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk,
* (borrowed from freebsd)
*/
-static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
+static enum skb_drop_reason tcp_sequence(const struct sock *sk,
u32 seq, u32 end_seq)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
if (before(end_seq, tp->rcv_wup))
return SKB_DROP_REASON_TCP_OLD_SEQUENCE;
- if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
- return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
+ if (after(end_seq, tp->rcv_nxt + tcp_receive_window(tp))) {
+ if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
+ return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
+
+ /* Only accept this packet if receive queue is empty. */
+ if (skb_queue_len(&sk->sk_receive_queue))
+ return SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE;
+ }
return SKB_NOT_DROPPED_YET;
}
@@ -4985,8 +4853,9 @@ static void tcp_ofo_queue(struct sock *sk)
if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
__u32 dsack = dsack_high;
+
if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
- dsack_high = TCP_SKB_CB(skb)->end_seq;
+ dsack = TCP_SKB_CB(skb)->end_seq;
tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
}
p = rb_next(p);
@@ -5019,10 +4888,20 @@ static void tcp_ofo_queue(struct sock *sk)
static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
-static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+/* Check if this incoming skb can be added to socket receive queues
+ * while satisfying sk->sk_rcvbuf limit.
+ */
+static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb)
+{
+ unsigned int new_mem = atomic_read(&sk->sk_rmem_alloc) + skb->truesize;
+
+ return new_mem <= sk->sk_rcvbuf;
+}
+
+static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
unsigned int size)
{
- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ if (!tcp_can_ingest(sk, skb) ||
!sk_rmem_schedule(sk, skb, size)) {
if (tcp_prune_queue(sk, skb) < 0)
@@ -5054,6 +4933,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
return;
}
+ tcp_measure_rcv_mss(sk, skb);
/* Disable header prediction. */
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
@@ -5637,7 +5517,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
tp->ooo_last_skb = rb_to_skb(prev);
if (!prev || goal <= 0) {
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+ if (tcp_can_ingest(sk, in_skb) &&
!tcp_under_memory_pressure(sk))
break;
goal = sk->sk_rcvbuf >> 3;
@@ -5669,14 +5549,18 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ /* Do nothing if our queues are empty. */
+ if (!atomic_read(&sk->sk_rmem_alloc))
+ return -1;
+
NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
- if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ if (!tcp_can_ingest(sk, in_skb))
tcp_clamp_window(sk);
else if (tcp_under_memory_pressure(sk))
tcp_adjust_rcv_ssthresh(sk);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ if (tcp_can_ingest(sk, in_skb))
return 0;
tcp_collapse_ofo_queue(sk);
@@ -5686,7 +5570,7 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
NULL,
tp->copied_seq, tp->rcv_nxt);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ if (tcp_can_ingest(sk, in_skb))
return 0;
/* Collapsing did not help, destructive actions follow.
@@ -5694,7 +5578,7 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
tcp_prune_ofo_queue(sk, in_skb);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ if (tcp_can_ingest(sk, in_skb))
return 0;
/* If we are really being abused, tell the caller to silently
@@ -6020,7 +5904,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
step1:
/* Step 1: check sequence number */
- reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+ reason = tcp_sequence(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
if (reason) {
/* RFC793, page 37: "In all states except SYN-SENT, all reset
* (RST) segments are validated by checking their SEQ-fields."
@@ -6031,6 +5915,11 @@ step1:
if (!th->rst) {
if (th->syn)
goto syn_challenge;
+
+ if (reason == SKB_DROP_REASON_TCP_INVALID_SEQUENCE ||
+ reason == SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_BEYOND_WINDOW);
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDSEQ,
&tp->last_oow_ack_time))
@@ -6249,6 +6138,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (tcp_checksum_complete(skb))
goto csum_error;
+ if (after(TCP_SKB_CB(skb)->end_seq,
+ tp->rcv_nxt + tcp_receive_window(tp)))
+ goto validate;
+
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
@@ -6304,7 +6197,7 @@ slow_path:
/*
* Standard slow path.
*/
-
+validate:
if (!tcp_validate_incoming(sk, skb, th, 1))
return;