Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--   net/ipv4/tcp_input.c | 511
1 file changed, 417 insertions, 94 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d77c0d29e239..99b7ecbe8893 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -701,13 +701,10 @@ static inline void tcp_set_rto(struct sock *sk)  	 *    all the algo is pure shit and should be replaced  	 *    with correct one. It is exactly, which we pretend to do.  	 */ -} -/* NOTE: clamping at TCP_RTO_MIN is not required, current algo - * guarantees that rto is higher. - */ -static inline void tcp_bound_rto(struct sock *sk) -{ +	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo +	 * guarantees that rto is higher. +	 */  	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)  		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;  } @@ -928,7 +925,6 @@ static void tcp_init_metrics(struct sock *sk)  		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));  	}  	tcp_set_rto(sk); -	tcp_bound_rto(sk);  	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)  		goto reset;  	tp->snd_cwnd = tcp_init_cwnd(tp, dst); @@ -1002,7 +998,8 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)  	}  } -void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, +					    struct sk_buff *skb)  {  	tcp_verify_retransmit_hint(tp, skb); @@ -1236,31 +1233,58 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,  	return dup_sack;  } +struct tcp_sacktag_state { +	int reord; +	int fack_count; +	int flag; +}; +  /* Check if skb is fully within the SACK block. In presence of GSO skbs,   * the incoming SACK may not exactly match but we can find smaller MSS   * aligned portion of it that matches. Therefore we might need to fragment   * which may fail and creates some hassle (caller must handle error case   * returns). 
+ * + * FIXME: this could be merged to shift decision code   */  static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,  				 u32 start_seq, u32 end_seq)  {  	int in_sack, err;  	unsigned int pkt_len; +	unsigned int mss;  	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&  		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);  	if (tcp_skb_pcount(skb) > 1 && !in_sack &&  	    after(TCP_SKB_CB(skb)->end_seq, start_seq)) { - +		mss = tcp_skb_mss(skb);  		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); -		if (!in_sack) +		if (!in_sack) {  			pkt_len = start_seq - TCP_SKB_CB(skb)->seq; -		else +			if (pkt_len < mss) +				pkt_len = mss; +		} else {  			pkt_len = end_seq - TCP_SKB_CB(skb)->seq; -		err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size); +			if (pkt_len < mss) +				return -EINVAL; +		} + +		/* Round if necessary so that SACKs cover only full MSSes +		 * and/or the remaining small portion (if present) +		 */ +		if (pkt_len > mss) { +			unsigned int new_len = (pkt_len / mss) * mss; +			if (!in_sack && new_len < pkt_len) { +				new_len += mss; +				if (new_len > skb->len) +					return 0; +			} +			pkt_len = new_len; +		} +		err = tcp_fragment(sk, skb, pkt_len, mss);  		if (err < 0)  			return err;  	} @@ -1268,24 +1292,25 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,  	return in_sack;  } -static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, -			   int *reord, int dup_sack, int fack_count) +static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, +			  struct tcp_sacktag_state *state, +			  int dup_sack, int pcount)  {  	struct tcp_sock *tp = tcp_sk(sk);  	u8 sacked = TCP_SKB_CB(skb)->sacked; -	int flag = 0; +	int fack_count = state->fack_count;  	/* Account D-SACK for retransmitted packet. */  	if (dup_sack && (sacked & TCPCB_RETRANS)) {  		if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))  			tp->undo_retrans--;  		if (sacked & TCPCB_SACKED_ACKED) -			*reord = min(fack_count, *reord); +			state->reord = min(fack_count, state->reord);  	}  	/* Nothing to do; acked frame is about to be dropped (was ACKed). */  	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) -		return flag; +		return sacked;  	if (!(sacked & TCPCB_SACKED_ACKED)) {  		if (sacked & TCPCB_SACKED_RETRANS) { @@ -1294,10 +1319,9 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,  			 * that retransmission is still in flight.  			 
*/  			if (sacked & TCPCB_LOST) { -				TCP_SKB_CB(skb)->sacked &= -					~(TCPCB_LOST|TCPCB_SACKED_RETRANS); -				tp->lost_out -= tcp_skb_pcount(skb); -				tp->retrans_out -= tcp_skb_pcount(skb); +				sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); +				tp->lost_out -= pcount; +				tp->retrans_out -= pcount;  			}  		} else {  			if (!(sacked & TCPCB_RETRANS)) { @@ -1306,56 +1330,280 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,  				 */  				if (before(TCP_SKB_CB(skb)->seq,  					   tcp_highest_sack_seq(tp))) -					*reord = min(fack_count, *reord); +					state->reord = min(fack_count, +							   state->reord);  				/* SACK enhanced F-RTO (RFC4138; Appendix B) */  				if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) -					flag |= FLAG_ONLY_ORIG_SACKED; +					state->flag |= FLAG_ONLY_ORIG_SACKED;  			}  			if (sacked & TCPCB_LOST) { -				TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; -				tp->lost_out -= tcp_skb_pcount(skb); +				sacked &= ~TCPCB_LOST; +				tp->lost_out -= pcount;  			}  		} -		TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; -		flag |= FLAG_DATA_SACKED; -		tp->sacked_out += tcp_skb_pcount(skb); +		sacked |= TCPCB_SACKED_ACKED; +		state->flag |= FLAG_DATA_SACKED; +		tp->sacked_out += pcount; -		fack_count += tcp_skb_pcount(skb); +		fack_count += pcount;  		/* Lost marker hint past SACKed? Tweak RFC3517 cnt */  		if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&  		    before(TCP_SKB_CB(skb)->seq,  			   TCP_SKB_CB(tp->lost_skb_hint)->seq)) -			tp->lost_cnt_hint += tcp_skb_pcount(skb); +			tp->lost_cnt_hint += pcount;  		if (fack_count > tp->fackets_out)  			tp->fackets_out = fack_count; - -		if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) -			tcp_advance_highest_sack(sk, skb);  	}  	/* D-SACK. We can detect redundant retransmission in S|R and plain R  	 * frames and clear it. undo_retrans is decreased above, L|R frames  	 * are accounted above as well.  	 */ -	if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { -		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; -		tp->retrans_out -= tcp_skb_pcount(skb); +	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) { +		sacked &= ~TCPCB_SACKED_RETRANS; +		tp->retrans_out -= pcount;  	} -	return flag; +	return sacked; +} + +static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, +			   struct tcp_sacktag_state *state, +			   unsigned int pcount, int shifted, int mss) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	struct sk_buff *prev = tcp_write_queue_prev(sk, skb); + +	BUG_ON(!pcount); + +	/* Tweak before seqno plays */ +	if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint && +	    !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) +		tp->lost_cnt_hint += pcount; + +	TCP_SKB_CB(prev)->end_seq += shifted; +	TCP_SKB_CB(skb)->seq += shifted; + +	skb_shinfo(prev)->gso_segs += pcount; +	BUG_ON(skb_shinfo(skb)->gso_segs < pcount); +	skb_shinfo(skb)->gso_segs -= pcount; + +	/* When we're adding to gso_segs == 1, gso_size will be zero, +	 * in theory this shouldn't be necessary but as long as DSACK +	 * code can come after this skb later on it's better to keep +	 * setting gso_size to something. +	 */ +	if (!skb_shinfo(prev)->gso_size) { +		skb_shinfo(prev)->gso_size = mss; +		skb_shinfo(prev)->gso_type = sk->sk_gso_type; +	} + +	/* CHECKME: To clear or not to clear? 
Mimics normal skb currently */ +	if (skb_shinfo(skb)->gso_segs <= 1) { +		skb_shinfo(skb)->gso_size = 0; +		skb_shinfo(skb)->gso_type = 0; +	} + +	/* We discard results */ +	tcp_sacktag_one(skb, sk, state, 0, pcount); + +	/* Difference in this won't matter, both ACKed by the same cumul. ACK */ +	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); + +	if (skb->len > 0) { +		BUG_ON(!tcp_skb_pcount(skb)); +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); +		return 0; +	} + +	/* Whole SKB was eaten :-) */ + +	if (skb == tp->retransmit_skb_hint) +		tp->retransmit_skb_hint = prev; +	if (skb == tp->scoreboard_skb_hint) +		tp->scoreboard_skb_hint = prev; +	if (skb == tp->lost_skb_hint) { +		tp->lost_skb_hint = prev; +		tp->lost_cnt_hint -= tcp_skb_pcount(prev); +	} + +	TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags; +	if (skb == tcp_highest_sack(sk)) +		tcp_advance_highest_sack(sk, skb); + +	tcp_unlink_write_queue(skb, sk); +	sk_wmem_free_skb(sk, skb); + +	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); + +	return 1; +} + +/* I wish gso_size would have a bit more sane initialization than + * something-or-zero which complicates things + */ +static int tcp_skb_seglen(struct sk_buff *skb) +{ +	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); +} + +/* Shifting pages past head area doesn't work */ +static int skb_can_shift(struct sk_buff *skb) +{ +	return !skb_headlen(skb) && skb_is_nonlinear(skb); +} + +/* Try collapsing SACK blocks spanning across multiple skbs to a single + * skb. + */ +static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, +					  struct tcp_sacktag_state *state, +					  u32 start_seq, u32 end_seq, +					  int dup_sack) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	struct sk_buff *prev; +	int mss; +	int pcount = 0; +	int len; +	int in_sack; + +	if (!sk_can_gso(sk)) +		goto fallback; + +	/* Normally R but no L won't result in plain S */ +	if (!dup_sack && +	    (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) +		goto fallback; +	if (!skb_can_shift(skb)) +		goto fallback; +	/* This frame is about to be dropped (was ACKed). */ +	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) +		goto fallback; + +	/* Can only happen with delayed DSACK + discard craziness */ +	if (unlikely(skb == tcp_write_queue_head(sk))) +		goto fallback; +	prev = tcp_write_queue_prev(sk, skb); + +	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) +		goto fallback; + +	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && +		  !before(end_seq, TCP_SKB_CB(skb)->end_seq); + +	if (in_sack) { +		len = skb->len; +		pcount = tcp_skb_pcount(skb); +		mss = tcp_skb_seglen(skb); + +		/* TODO: Fix DSACKs to not fragment already SACKed and we can +		 * drop this restriction as unnecessary +		 */ +		if (mss != tcp_skb_seglen(prev)) +			goto fallback; +	} else { +		if (!after(TCP_SKB_CB(skb)->end_seq, start_seq)) +			goto noop; +		/* CHECKME: This is non-MSS split case only?, this will +		 * cause skipped skbs due to advancing loop btw, original +		 * has that feature too +		 */ +		if (tcp_skb_pcount(skb) <= 1) +			goto noop; + +		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); +		if (!in_sack) { +			/* TODO: head merge to next could be attempted here +			 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)), +			 * though it might not be worth of the additional hassle +			 * +			 * ...we can probably just fallback to what was done +			 * previously. 
We could try merging non-SACKed ones +			 * as well but it probably isn't going to buy off +			 * because later SACKs might again split them, and +			 * it would make skb timestamp tracking considerably +			 * harder problem. +			 */ +			goto fallback; +		} + +		len = end_seq - TCP_SKB_CB(skb)->seq; +		BUG_ON(len < 0); +		BUG_ON(len > skb->len); + +		/* MSS boundaries should be honoured or else pcount will +		 * severely break even though it makes things bit trickier. +		 * Optimize common case to avoid most of the divides +		 */ +		mss = tcp_skb_mss(skb); + +		/* TODO: Fix DSACKs to not fragment already SACKed and we can +		 * drop this restriction as unnecessary +		 */ +		if (mss != tcp_skb_seglen(prev)) +			goto fallback; + +		if (len == mss) { +			pcount = 1; +		} else if (len < mss) { +			goto noop; +		} else { +			pcount = len / mss; +			len = pcount * mss; +		} +	} + +	if (!skb_shift(prev, skb, len)) +		goto fallback; +	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss)) +		goto out; + +	/* Hole filled allows collapsing with the next as well, this is very +	 * useful when hole on every nth skb pattern happens +	 */ +	if (prev == tcp_write_queue_tail(sk)) +		goto out; +	skb = tcp_write_queue_next(sk, prev); + +	if (!skb_can_shift(skb) || +	    (skb == tcp_send_head(sk)) || +	    ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || +	    (mss != tcp_skb_seglen(skb))) +		goto out; + +	len = skb->len; +	if (skb_shift(prev, skb, len)) { +		pcount += tcp_skb_pcount(skb); +		tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss); +	} + +out: +	state->fack_count += pcount; +	return prev; + +noop: +	return skb; + +fallback: +	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); +	return NULL;  }  static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,  					struct tcp_sack_block *next_dup, +					struct tcp_sacktag_state *state,  					u32 start_seq, u32 end_seq, -					int dup_sack_in, int *fack_count, -					int *reord, int *flag) +					int dup_sack_in)  { +	struct tcp_sock *tp = tcp_sk(sk); +	struct sk_buff *tmp; +  	tcp_for_write_queue_from(skb, sk) {  		int in_sack = 0;  		int dup_sack = dup_sack_in; @@ -1376,17 +1624,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,  				dup_sack = 1;  		} -		if (in_sack <= 0) -			in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, -							end_seq); +		/* skb reference here is a bit tricky to get right, since +		 * shifting can eat and free both this skb and the next, +		 * so not even _safe variant of the loop is enough. 
+		 */ +		if (in_sack <= 0) { +			tmp = tcp_shift_skb_data(sk, skb, state, +						 start_seq, end_seq, dup_sack); +			if (tmp != NULL) { +				if (tmp != skb) { +					skb = tmp; +					continue; +				} + +				in_sack = 0; +			} else { +				in_sack = tcp_match_skb_to_sack(sk, skb, +								start_seq, +								end_seq); +			} +		} +  		if (unlikely(in_sack < 0))  			break; -		if (in_sack) -			*flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, -						 *fack_count); +		if (in_sack) { +			TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk, +								  state, +								  dup_sack, +								  tcp_skb_pcount(skb)); + +			if (!before(TCP_SKB_CB(skb)->seq, +				    tcp_highest_sack_seq(tp))) +				tcp_advance_highest_sack(sk, skb); +		} -		*fack_count += tcp_skb_pcount(skb); +		state->fack_count += tcp_skb_pcount(skb);  	}  	return skb;  } @@ -1395,16 +1668,17 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,   * a normal way   */  static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, -					u32 skip_to_seq, int *fack_count) +					struct tcp_sacktag_state *state, +					u32 skip_to_seq)  {  	tcp_for_write_queue_from(skb, sk) {  		if (skb == tcp_send_head(sk))  			break; -		if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) +		if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))  			break; -		*fack_count += tcp_skb_pcount(skb); +		state->fack_count += tcp_skb_pcount(skb);  	}  	return skb;  } @@ -1412,18 +1686,17 @@ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,  static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,  						struct sock *sk,  						struct tcp_sack_block *next_dup, -						u32 skip_to_seq, -						int *fack_count, int *reord, -						int *flag) +						struct tcp_sacktag_state *state, +						u32 skip_to_seq)  {  	if (next_dup == NULL)  		return skb;  	if (before(next_dup->start_seq, skip_to_seq)) { -		skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count); -		skb = tcp_sacktag_walk(skb, sk, NULL, -				     next_dup->start_seq, next_dup->end_seq, -				     1, fack_count, reord, flag); +		skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq); +		skb = tcp_sacktag_walk(skb, sk, NULL, state, +				       next_dup->start_seq, next_dup->end_seq, +				       1);  	}  	return skb; @@ -1445,16 +1718,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,  	struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);  	struct tcp_sack_block sp[TCP_NUM_SACKS];  	struct tcp_sack_block *cache; +	struct tcp_sacktag_state state;  	struct sk_buff *skb;  	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);  	int used_sacks; -	int reord = tp->packets_out; -	int flag = 0;  	int found_dup_sack = 0; -	int fack_count;  	int i, j;  	int first_sack_index; +	state.flag = 0; +	state.reord = tp->packets_out; +  	if (!tp->sacked_out) {  		if (WARN_ON(tp->fackets_out))  			tp->fackets_out = 0; @@ -1464,7 +1738,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,  	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,  					 num_sacks, prior_snd_una);  	if (found_dup_sack) -		flag |= FLAG_DSACKING_ACK; +		state.flag |= FLAG_DSACKING_ACK;  	/* Eliminate too old ACKs, but take into  	 * account more or less fresh ones, they can @@ -1533,7 +1807,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,  	}  	skb = tcp_write_queue_head(sk); -	fack_count = 0; +	state.fack_count = 0;  	i = 0;  	if (!tp->sacked_out) { @@ -1558,7 +1832,7 @@ 
tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,  		/* Event "B" in the comment above. */  		if (after(end_seq, tp->high_seq)) -			flag |= FLAG_DATA_LOST; +			state.flag |= FLAG_DATA_LOST;  		/* Skip too early cached blocks */  		while (tcp_sack_cache_ok(tp, cache) && @@ -1571,13 +1845,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,  			/* Head todo? */  			if (before(start_seq, cache->start_seq)) { -				skb = tcp_sacktag_skip(skb, sk, start_seq, -						       &fack_count); +				skb = tcp_sacktag_skip(skb, sk, &state, +						       start_seq);  				skb = tcp_sacktag_walk(skb, sk, next_dup, +						       &state,  						       start_seq,  						       cache->start_seq, -						       dup_sack, &fack_count, -						       &reord, &flag); +						       dup_sack);  			}  			/* Rest of the block already fully processed? */ @@ -1585,9 +1859,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,  				goto advance_sp;  			skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, -						       cache->end_seq, -						       &fack_count, &reord, -						       &flag); +						       &state, +						       cache->end_seq);  			/* ...tail remains todo... */  			if (tcp_highest_sack_seq(tp) == cache->end_seq) { @@ -1595,13 +1868,12 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,  				skb = tcp_highest_sack(sk);  				if (skb == NULL)  					break; -				fack_count = tp->fackets_out; +				state.fack_count = tp->fackets_out;  				cache++;  				goto walk;  			} -			skb = tcp_sacktag_skip(skb, sk, cache->end_seq, -					       &fack_count); +			skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);  			/* Check overlap against next cached too (past this one already) */  			cache++;  			continue; @@ -1611,20 +1883,20 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,  			skb = tcp_highest_sack(sk);  			if (skb == NULL)  				break; -			fack_count = tp->fackets_out; +			state.fack_count = tp->fackets_out;  		} -		skb = tcp_sacktag_skip(skb, sk, start_seq, &fack_count); +		skb = tcp_sacktag_skip(skb, sk, &state, start_seq);  walk: -		skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq, -				       dup_sack, &fack_count, &reord, &flag); +		skb = tcp_sacktag_walk(skb, sk, next_dup, &state, +				       start_seq, end_seq, dup_sack);  advance_sp:  		/* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct  		 * due to in-order walk  		 */  		if (after(end_seq, tp->frto_highmark)) -			flag &= ~FLAG_ONLY_ORIG_SACKED; +			state.flag &= ~FLAG_ONLY_ORIG_SACKED;  		i++;  	} @@ -1641,10 +1913,10 @@ advance_sp:  	tcp_verify_left_out(tp); -	if ((reord < tp->fackets_out) && +	if ((state.reord < tp->fackets_out) &&  	    ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) &&  	    (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) -		tcp_update_reordering(sk, tp->fackets_out - reord, 0); +		tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);  out: @@ -1654,13 +1926,13 @@ out:  	WARN_ON((int)tp->retrans_out < 0);  	WARN_ON((int)tcp_packets_in_flight(tp) < 0);  #endif -	return flag; +	return state.flag;  }  /* Limits sacked_out so that sum with lost_out isn't ever larger than   * packets_out. Returns zero if sacked_out adjustement wasn't necessary.   
*/ -int tcp_limit_reno_sacked(struct tcp_sock *tp) +static int tcp_limit_reno_sacked(struct tcp_sock *tp)  {  	u32 holes; @@ -2336,9 +2608,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)  	struct inet_sock *inet = inet_sk(sk);  	if (sk->sk_family == AF_INET) { -		printk(KERN_DEBUG "Undo %s " NIPQUAD_FMT "/%u c%u l%u ss%u/%u p%u\n", +		printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",  		       msg, -		       NIPQUAD(inet->daddr), ntohs(inet->dport), +		       &inet->daddr, ntohs(inet->dport),  		       tp->snd_cwnd, tcp_left_out(tp),  		       tp->snd_ssthresh, tp->prior_ssthresh,  		       tp->packets_out); @@ -2346,9 +2618,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)  #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)  	else if (sk->sk_family == AF_INET6) {  		struct ipv6_pinfo *np = inet6_sk(sk); -		printk(KERN_DEBUG "Undo %s " NIP6_FMT "/%u c%u l%u ss%u/%u p%u\n", +		printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",  		       msg, -		       NIP6(np->daddr), ntohs(inet->dport), +		       &np->daddr, ntohs(inet->dport),  		       tp->snd_cwnd, tcp_left_out(tp),  		       tp->snd_ssthresh, tp->prior_ssthresh,  		       tp->packets_out); @@ -2559,6 +2831,56 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)  	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);  } +/* Do a simple retransmit without using the backoff mechanisms in + * tcp_timer. This is used for path mtu discovery. + * The socket is already locked here. + */ +void tcp_simple_retransmit(struct sock *sk) +{ +	const struct inet_connection_sock *icsk = inet_csk(sk); +	struct tcp_sock *tp = tcp_sk(sk); +	struct sk_buff *skb; +	unsigned int mss = tcp_current_mss(sk, 0); +	u32 prior_lost = tp->lost_out; + +	tcp_for_write_queue(skb, sk) { +		if (skb == tcp_send_head(sk)) +			break; +		if (tcp_skb_seglen(skb) > mss && +		    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { +			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { +				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; +				tp->retrans_out -= tcp_skb_pcount(skb); +			} +			tcp_skb_mark_lost_uncond_verify(tp, skb); +		} +	} + +	tcp_clear_retrans_hints_partial(tp); + +	if (prior_lost == tp->lost_out) +		return; + +	if (tcp_is_reno(tp)) +		tcp_limit_reno_sacked(tp); + +	tcp_verify_left_out(tp); + +	/* Don't muck with the congestion window here. +	 * Reason is that we do not increase amount of _data_ +	 * in network, but units changed and effective +	 * cwnd/ssthresh really reduced now. +	 */ +	if (icsk->icsk_ca_state != TCP_CA_Loss) { +		tp->high_seq = tp->snd_nxt; +		tp->snd_ssthresh = tcp_current_ssthresh(sk); +		tp->prior_ssthresh = 0; +		tp->undo_marker = 0; +		tcp_set_ca_state(sk, TCP_CA_Loss); +	} +	tcp_xmit_retransmit_queue(sk); +} +  /* Process an event, which can update packets-in-flight not trivially.   * Main goal of this function is to calculate new estimate for left_out,   * taking into account both packets sitting in receiver's buffer and @@ -2730,6 +3052,13 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)  	tcp_xmit_retransmit_queue(sk);  } +static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) +{ +	tcp_rtt_estimator(sk, seq_rtt); +	tcp_set_rto(sk); +	inet_csk(sk)->icsk_backoff = 0; +} +  /* Read draft-ietf-tcplw-high-performance before mucking   * with this code. (Supersedes RFC1323)   */ @@ -2751,11 +3080,8 @@ static void tcp_ack_saw_tstamp(struct sock *sk, int flag)  	 * in window is lost... Voila.	 			
--ANK (010210)  	 */  	struct tcp_sock *tp = tcp_sk(sk); -	const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; -	tcp_rtt_estimator(sk, seq_rtt); -	tcp_set_rto(sk); -	inet_csk(sk)->icsk_backoff = 0; -	tcp_bound_rto(sk); + +	tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);  }  static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) @@ -2772,10 +3098,7 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)  	if (flag & FLAG_RETRANS_DATA_ACKED)  		return; -	tcp_rtt_estimator(sk, seq_rtt); -	tcp_set_rto(sk); -	inet_csk(sk)->icsk_backoff = 0; -	tcp_bound_rto(sk); +	tcp_valid_rtt_meas(sk, seq_rtt);  }  static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,  | 
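
One behavioural detail worth calling out from the tcp_match_skb_to_sack() hunk above: when an incoming SACK block only partially covers a GSO skb, the split point handed to tcp_fragment() is now rounded to an MSS boundary, so the SACKed side keeps full-sized segments (which is what later allows the new skb-shifting code to merge them back together). Below is a minimal userspace sketch of just that rounding rule, kept deliberately close to the arithmetic in the hunk; sack_round_to_mss() is an illustrative name, not a kernel symbol, and returning 0 stands in for the kernel's "give up on splitting" path.

#include <stdio.h>

/*
 * Sketch of the MSS rounding added to tcp_match_skb_to_sack().
 * pkt_len is the proposed split offset inside the skb, in_sack tells
 * whether the head of the skb lies inside the SACK block (as in the
 * kernel code), skb_len is the total skb length.
 */
static unsigned int sack_round_to_mss(unsigned int pkt_len, unsigned int mss,
				      unsigned int skb_len, int in_sack)
{
	unsigned int new_len;

	if (pkt_len <= mss)
		return pkt_len;			/* nothing to round */

	new_len = (pkt_len / mss) * mss;	/* round down to an MSS multiple */
	if (!in_sack && new_len < pkt_len) {
		/* Head of the skb is outside the SACK block: round up so
		 * the un-SACKed head stays a whole number of segments; if
		 * that runs past the end of the skb, don't split at all.
		 */
		new_len += mss;
		if (new_len > skb_len)
			return 0;
	}
	return new_len;
}

int main(void)
{
	/* 5840-byte GSO skb, 1460-byte MSS, SACK starting 2000 bytes in */
	printf("%u\n", sack_round_to_mss(2000, 1460, 5840, 0));	/* 2920 */
	/* same skb, SACK covering the head and ending 3000 bytes in */
	printf("%u\n", sack_round_to_mss(3000, 1460, 5840, 1));	/* 2920 */
	return 0;
}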
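
A much smaller but easy-to-miss change sits in tcp_sacktag_skip(): the stop condition moves from !before(end_seq, skip_to_seq) to after(end_seq, skip_to_seq), so an skb that ends exactly at skip_to_seq is now skipped rather than handed back to the caller. The toy program below, assuming the usual wraparound-safe sequence comparisons from include/net/tcp.h (reimplemented here so the example builds standalone), shows the boundary case where the two conditions differ.

#include <stdio.h>
#include <stdint.h>

/* Wraparound-safe TCP sequence comparisons, modelled on before()/after()
 * from include/net/tcp.h; local copies so the example builds standalone.
 */
static int seq_before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

static int seq_after(uint32_t seq1, uint32_t seq2)
{
	return seq_before(seq2, seq1);
}

int main(void)
{
	uint32_t end_seq = 1000, skip_to_seq = 1000;	/* skb ends exactly at the skip point */

	/* Old stop condition: breaks out of the skip loop, leaving this
	 * fully-consumed skb to be looked at again by the caller.
	 */
	printf("old condition stops: %d\n", !seq_before(end_seq, skip_to_seq));	/* 1 */

	/* New stop condition: keeps skipping past it. */
	printf("new condition stops: %d\n", seq_after(end_seq, skip_to_seq));	/* 0 */

	return 0;
}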
