summary | refs | log | tree | commit | diff
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- net/ipv4/tcp_input.c 396
1 files changed, 363 insertions, 33 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d77c0d29e239..d67b6e9cc540 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1002,7 +1002,8 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
}
}
-void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
+static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
+ struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
@@ -1241,26 +1242,47 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
* aligned portion of it that matches. Therefore we might need to fragment
* which may fail and creates some hassle (caller must handle error case
* returns).
+ *
+ * FIXME: this could be merged to shift decision code
*/
static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
u32 start_seq, u32 end_seq)
{
int in_sack, err;
unsigned int pkt_len;
+ unsigned int mss;
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
if (tcp_skb_pcount(skb) > 1 && !in_sack &&
after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
-
+ mss = tcp_skb_mss(skb);
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
- if (!in_sack)
+ if (!in_sack) {
pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
- else
+ if (pkt_len < mss)
+ pkt_len = mss;
+ } else {
pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
- err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size);
+ if (pkt_len < mss)
+ return -EINVAL;
+ }
+
+ /* Round if necessary so that SACKs cover only full MSSes
+ * and/or the remaining small portion (if present)
+ */
+ if (pkt_len > mss) {
+ unsigned int new_len = (pkt_len / mss) * mss;
+ if (!in_sack && new_len < pkt_len) {
+ new_len += mss;
+ if (new_len > skb->len)
+ return 0;
+ }
+ pkt_len = new_len;
+ }
+ err = tcp_fragment(sk, skb, pkt_len, mss);
if (err < 0)
return err;
}
@@ -1269,7 +1291,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
}
static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
- int *reord, int dup_sack, int fack_count)
+ int *reord, int dup_sack, int fack_count,
+ u8 *sackedto, int pcount)
{
struct tcp_sock *tp = tcp_sk(sk);
u8 sacked = TCP_SKB_CB(skb)->sacked;
@@ -1294,10 +1317,9 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
* that retransmission is still in flight.
*/
if (sacked & TCPCB_LOST) {
- TCP_SKB_CB(skb)->sacked &=
- ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
- tp->lost_out -= tcp_skb_pcount(skb);
- tp->retrans_out -= tcp_skb_pcount(skb);
+ *sackedto &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+ tp->lost_out -= pcount;
+ tp->retrans_out -= pcount;
}
} else {
if (!(sacked & TCPCB_RETRANS)) {
@@ -1314,48 +1336,280 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
}
if (sacked & TCPCB_LOST) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
- tp->lost_out -= tcp_skb_pcount(skb);
+ *sackedto &= ~TCPCB_LOST;
+ tp->lost_out -= pcount;
}
}
- TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
+ *sackedto |= TCPCB_SACKED_ACKED;
flag |= FLAG_DATA_SACKED;
- tp->sacked_out += tcp_skb_pcount(skb);
+ tp->sacked_out += pcount;
- fack_count += tcp_skb_pcount(skb);
+ fack_count += pcount;
/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->lost_skb_hint)->seq))
- tp->lost_cnt_hint += tcp_skb_pcount(skb);
+ tp->lost_cnt_hint += pcount;
if (fack_count > tp->fackets_out)
tp->fackets_out = fack_count;
-
- if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
- tcp_advance_highest_sack(sk, skb);
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
* frames and clear it. undo_retrans is decreased above, L|R frames
* are accounted above as well.
*/
- if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
+ if (dup_sack && (*sackedto & TCPCB_SACKED_RETRANS)) {
+ *sackedto &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= pcount;
}
return flag;
}
+/* Do the bookkeeping after @shifted sequence bytes (@pcount segments) have
+ * been moved from the start of @skb to the end of @prev.
+ *
+ * @sk:         socket owning both skbs
+ * @prev:       skb that received the data
+ * @skb:        skb the data was taken from
+ * @pcount:     number of segments moved; must be non-zero
+ * @shifted:    number of sequence bytes moved
+ * @fack_count: forwarded to tcp_sacktag_one()
+ * @reord:      forwarded to tcp_sacktag_one()
+ * @flag:       FLAG_* bits returned by tcp_sacktag_one() are OR-ed in here
+ * @mss:        used to initialise @prev's gso_size when that is still zero
+ *
+ * Returns 1 when @skb became empty and was unlinked and freed, 0 when @skb
+ * still holds data.
+ */
+static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+			   struct sk_buff *skb, unsigned int pcount,
+			   int shifted, int fack_count, int *reord,
+			   int *flag, int mss)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u8 dummy_sacked = TCP_SKB_CB(skb)->sacked;	/* We discard results */
+
+	BUG_ON(!pcount);
+
+	/* Tweak before seqno plays */
+	if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint &&
+	    !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq))
+		tp->lost_cnt_hint += pcount;
+
+	/* Move the sequence-space boundary between the two skbs */
+	TCP_SKB_CB(prev)->end_seq += shifted;
+	TCP_SKB_CB(skb)->seq += shifted;
+
+	/* Move the segment accounting along with the data */
+	skb_shinfo(prev)->gso_segs += pcount;
+	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
+	skb_shinfo(skb)->gso_segs -= pcount;
+
+	/* When we're adding to gso_segs == 1, gso_size will be zero,
+	 * in theory this shouldn't be necessary but as long as DSACK
+	 * code can come after this skb later on it's better to keep
+	 * setting gso_size to something.
+	 */
+	if (!skb_shinfo(prev)->gso_size) {
+		skb_shinfo(prev)->gso_size = mss;
+		skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+	}
+
+	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
+	if (skb_shinfo(skb)->gso_segs <= 1) {
+		skb_shinfo(skb)->gso_size = 0;
+		skb_shinfo(skb)->gso_type = 0;
+	}
+
+	/* Only the counters are wanted here: the tag bits are written to a
+	 * throw-away copy instead of skb's own ->sacked.
+	 */
+	*flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked,
+				 pcount);
+
+	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
+	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+	if (skb->len > 0) {
+		BUG_ON(!tcp_skb_pcount(skb));
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
+		return 0;
+	}
+
+	/* Whole SKB was eaten :-) */
+
+	/* Re-point every hint that still references the skb being freed */
+	if (skb == tp->retransmit_skb_hint)
+		tp->retransmit_skb_hint = prev;
+	if (skb == tp->scoreboard_skb_hint)
+		tp->scoreboard_skb_hint = prev;
+	if (skb == tp->lost_skb_hint) {
+		tp->lost_skb_hint = prev;
+		tp->lost_cnt_hint -= tcp_skb_pcount(prev);
+	}
+
+	/* skb is freed below, so its flags must be folded into prev (the
+	 * surviving skb); OR-ing them into skb would simply lose them.
+	 * NOTE(review): if a FIN-carrying skb can reach this point, prev's
+	 * end_seq may also need to cover the FIN's sequence number - confirm.
+	 */
+	TCP_SKB_CB(prev)->flags |= TCP_SKB_CB(skb)->flags;
+	if (skb == tcp_highest_sack(sk))
+		tcp_advance_highest_sack(sk, skb);
+
+	tcp_unlink_write_queue(skb, sk);
+	sk_wmem_free_skb(sk, skb);
+
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
+
+	return 1;
+}
+
+/* Return the MSS to use when deciding whether @skb can be shifted.
+ *
+ * tcp_skb_mss() may report zero (gso_size has a something-or-zero
+ * initialization); in that case the skb's own length is used instead.
+ */
+static int tcp_shift_mss(struct sk_buff *skb)
+{
+	int seg_size = tcp_skb_mss(skb);
+
+	return seg_size ? seg_size : (int)skb->len;
+}
+
+/* Shifting pages past head area doesn't work, so an skb qualifies only
+ * when it has no head data at all and is non-linear.
+ * Returns 1 when @skb qualifies, 0 otherwise.
+ */
+static int skb_can_shift(struct sk_buff *skb)
+{
+	if (skb_headlen(skb))
+		return 0;
+
+	return !!skb_is_nonlinear(skb);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ *
+ * The SACK block is [start_seq, end_seq). The part of skb it covers is
+ * shifted into the previous skb in the write queue, provided that one is
+ * already tagged as plain SACKed.
+ *
+ * Returns the previous skb when data was shifted into it, skb itself when
+ * there is nothing to do for this skb (noop), or NULL when shifting was not
+ * attempted or did not succeed (fallback). Every NULL return increments
+ * LINUX_MIB_SACKSHIFTFALLBACK. When the previous skb is returned,
+ * *fack_count has been advanced by the number of segments moved.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+ u32 start_seq, u32 end_seq,
+ int dup_sack, int *fack_count,
+ int *reord, int *flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *prev;
+ int mss;
+ int pcount = 0;
+ int len;
+ int in_sack;
+
+ if (!sk_can_gso(sk))
+ goto fallback;
+
+ /* Normally R but no L won't result in plain S */
+ if (!dup_sack &&
+ (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
+ goto fallback;
+ if (!skb_can_shift(skb))
+ goto fallback;
+ /* This frame is about to be dropped (was ACKed). */
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ goto fallback;
+
+ /* Can only happen with delayed DSACK + discard craziness */
+ if (unlikely(skb == tcp_write_queue_head(sk)))
+ goto fallback;
+ prev = tcp_write_queue_prev(sk, skb);
+
+ /* Only merge into an skb whose tag bits are exactly SACKED_ACKED */
+ if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+ goto fallback;
+
+ /* Does the SACK block cover this skb completely? */
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+ if (in_sack) {
+ /* Fully covered: shift the whole skb */
+ len = skb->len;
+ pcount = tcp_skb_pcount(skb);
+ mss = tcp_shift_mss(skb);
+
+ /* TODO: Fix DSACKs to not fragment already SACKed and we can
+ * drop this restriction as unnecessary
+ */
+ if (mss != tcp_shift_mss(prev))
+ goto fallback;
+ } else {
+ /* skb ends at or before the start of the SACK block */
+ if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+ goto noop;
+ /* CHECKME: This is non-MSS split case only?, this will
+ * cause skipped skbs due to advancing loop btw, original
+ * has that feature too
+ */
+ if (tcp_skb_pcount(skb) <= 1)
+ goto noop;
+
+ /* From here on in_sack means: the block covers the start of skb */
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+ if (!in_sack) {
+ /* TODO: head merge to next could be attempted here
+ * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+ * though it might not be worth the additional hassle
+ *
+ * ...we can probably just fallback to what was done
+ * previously. We could try merging non-SACKed ones
+ * as well but it probably isn't going to buy off
+ * because later SACKs might again split them, and
+ * it would make skb timestamp tracking considerably
+ * harder problem.
+ */
+ goto fallback;
+ }
+
+ /* Number of bytes at the start of skb covered by the block */
+ len = end_seq - TCP_SKB_CB(skb)->seq;
+ BUG_ON(len < 0);
+ BUG_ON(len > skb->len);
+
+ /* MSS boundaries should be honoured or else pcount will
+ * severely break even though it makes things bit trickier.
+ * Optimize common case to avoid most of the divides
+ */
+ mss = tcp_skb_mss(skb);
+
+ /* TODO: Fix DSACKs to not fragment already SACKed and we can
+ * drop this restriction as unnecessary
+ */
+ if (mss != tcp_shift_mss(prev))
+ goto fallback;
+
+ /* Round len down to a whole number of MSS-sized segments;
+ * less than one full segment is not shifted at all.
+ */
+ if (len == mss) {
+ pcount = 1;
+ } else if (len < mss) {
+ goto noop;
+ } else {
+ pcount = len / mss;
+ len = pcount * mss;
+ }
+ }
+
+ /* A zero return from skb_shift() is treated as failure */
+ if (!skb_shift(prev, skb, len))
+ goto fallback;
+ /* Zero return: skb still holds data, so there is nothing more to merge */
+ if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
+ flag, mss))
+ goto out;
+
+ /* Hole filled allows collapsing with the next as well, this is very
+ * useful when hole on every nth skb pattern happens
+ */
+ if (prev == tcp_write_queue_tail(sk))
+ goto out;
+ skb = tcp_write_queue_next(sk, prev);
+
+ if (!skb_can_shift(skb))
+ goto out;
+ if (skb == tcp_send_head(sk))
+ goto out;
+ if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+ goto out;
+
+ /* Try to pull the whole following skb into prev as well */
+ len = skb->len;
+ if (skb_shift(prev, skb, len)) {
+ pcount += tcp_skb_pcount(skb);
+ tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
+ *fack_count, reord, flag, mss);
+ }
+
+out:
+ /* Data was shifted: account for the moved segments and hand back prev */
+ *fack_count += pcount;
+ return prev;
+
+noop:
+ /* Nothing to do for this skb */
+ return skb;
+
+fallback:
+ /* Shifting not possible: the caller sees NULL */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
+ return NULL;
+}
+
static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
struct tcp_sack_block *next_dup,
u32 start_seq, u32 end_seq,
int dup_sack_in, int *fack_count,
int *reord, int *flag)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *tmp;
+
tcp_for_write_queue_from(skb, sk) {
int in_sack = 0;
int dup_sack = dup_sack_in;
@@ -1376,15 +1630,41 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
dup_sack = 1;
}
- if (in_sack <= 0)
- in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
- end_seq);
+ /* skb reference here is a bit tricky to get right, since
+ * shifting can eat and free both this skb and the next,
+ * so not even _safe variant of the loop is enough.
+ */
+ if (in_sack <= 0) {
+ tmp = tcp_shift_skb_data(sk, skb, start_seq,
+ end_seq, dup_sack,
+ fack_count, reord, flag);
+ if (tmp != NULL) {
+ if (tmp != skb) {
+ skb = tmp;
+ continue;
+ }
+
+ in_sack = 0;
+ } else {
+ in_sack = tcp_match_skb_to_sack(sk, skb,
+ start_seq,
+ end_seq);
+ }
+ }
+
if (unlikely(in_sack < 0))
break;
- if (in_sack)
+ if (in_sack) {
*flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
- *fack_count);
+ *fack_count,
+ &(TCP_SKB_CB(skb)->sacked),
+ tcp_skb_pcount(skb));
+
+ if (!before(TCP_SKB_CB(skb)->seq,
+ tcp_highest_sack_seq(tp)))
+ tcp_advance_highest_sack(sk, skb);
+ }
*fack_count += tcp_skb_pcount(skb);
}
@@ -1401,7 +1681,7 @@ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
if (skb == tcp_send_head(sk))
break;
- if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
+ if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
break;
*fack_count += tcp_skb_pcount(skb);
@@ -1660,7 +1940,7 @@ out:
/* Limits sacked_out so that sum with lost_out isn't ever larger than
* packets_out. Returns zero if sacked_out adjustement wasn't necessary.
*/
-int tcp_limit_reno_sacked(struct tcp_sock *tp)
+static int tcp_limit_reno_sacked(struct tcp_sock *tp)
{
u32 holes;
@@ -2336,9 +2616,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)
struct inet_sock *inet = inet_sk(sk);
if (sk->sk_family == AF_INET) {
- printk(KERN_DEBUG "Undo %s " NIPQUAD_FMT "/%u c%u l%u ss%u/%u p%u\n",
+ printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
msg,
- NIPQUAD(inet->daddr), ntohs(inet->dport),
+ &inet->daddr, ntohs(inet->dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
@@ -2346,9 +2626,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- printk(KERN_DEBUG "Undo %s " NIP6_FMT "/%u c%u l%u ss%u/%u p%u\n",
+ printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
msg,
- NIP6(np->daddr), ntohs(inet->dport),
+ &np->daddr, ntohs(inet->dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
@@ -2559,6 +2839,56 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
+/* Retransmit without going through the backoff mechanisms in tcp_timer.
+ * Used for path mtu discovery; the socket is already locked by the caller.
+ *
+ * Every skb before the send head that is longer than the current MSS and
+ * not SACKed is marked lost (dropping its SACKED_RETRANS tag first).
+ * If that changed lost_out, the retransmit queue is kicked.
+ */
+void tcp_simple_retransmit(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	unsigned int cur_mss = tcp_current_mss(sk, 0);
+	u32 lost_before = tp->lost_out;
+
+	tcp_for_write_queue(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
+
+		/* Fits into the current MSS: nothing to do */
+		if (skb->len <= cur_mss)
+			continue;
+		/* Already SACKed: leave it alone */
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+			continue;
+
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+			tp->retrans_out -= tcp_skb_pcount(skb);
+		}
+		tcp_skb_mark_lost_uncond_verify(tp, skb);
+	}
+
+	tcp_clear_retrans_hints_partial(tp);
+
+	/* Nothing was newly marked lost */
+	if (tp->lost_out == lost_before)
+		return;
+
+	if (tcp_is_reno(tp))
+		tcp_limit_reno_sacked(tp);
+
+	tcp_verify_left_out(tp);
+
+	/* Don't muck with the congestion window here.
+	 * Reason is that we do not increase amount of _data_
+	 * in network, but units changed and effective
+	 * cwnd/ssthresh really reduced now.
+	 */
+	if (icsk->icsk_ca_state != TCP_CA_Loss) {
+		tp->high_seq = tp->snd_nxt;
+		tp->snd_ssthresh = tcp_current_ssthresh(sk);
+		tp->prior_ssthresh = 0;
+		tp->undo_marker = 0;
+		tcp_set_ca_state(sk, TCP_CA_Loss);
+	}
+
+	tcp_xmit_retransmit_queue(sk);
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and