Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r-- | net/ipv4/tcp_output.c | 181
1 file changed, 86 insertions(+), 95 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 03d26b85eab8..d46f2143305c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -363,17 +363,15 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
  */
 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 {
-        struct skb_shared_info *shinfo = skb_shinfo(skb);
-
         skb->ip_summed = CHECKSUM_PARTIAL;
         skb->csum = 0;
 
         TCP_SKB_CB(skb)->tcp_flags = flags;
         TCP_SKB_CB(skb)->sacked = 0;
 
-        shinfo->gso_segs = 1;
-        shinfo->gso_size = 0;
-        shinfo->gso_type = 0;
+        skb_shinfo(skb)->gso_segs = 1;
+        skb_shinfo(skb)->gso_size = 0;
+        skb_shinfo(skb)->gso_type = 0;
 
         TCP_SKB_CB(skb)->seq = seq;
         if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -408,7 +406,7 @@ struct tcp_out_options {
  * Beware: Something in the Internet is very sensitive to the ordering of
  * TCP options, we learned this through the hard way, so be careful here.
  * Luckily we can at least blame others for their non-compliance but from
- * inter-operability perspective it seems that we're somewhat stuck with
+ * inter-operatibility perspective it seems that we're somewhat stuck with
 * the ordering which we have been using if we want to keep working with
 * those broken things (not that it currently hurts anybody as there isn't
 * particular reason why the ordering would need to be changed).
@@ -681,7 +679,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
  *
  * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
  * needs to be reallocated in a driver.
- * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
+ * The invariant being skb->truesize substracted from sk->sk_wmem_alloc
  *
  * Since transmit from skb destructor is forbidden, we use a tasklet
  * to process all sockets that eventually need to send more skbs.
@@ -701,9 +699,9 @@ static void tcp_tsq_handler(struct sock *sk)
                 tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
 }
 /*
- * One tasklet per cpu tries to send more skbs.
+ * One tasklest per cpu tries to send more skbs.
  * We run in tasklet context but need to disable irqs when
- * transferring tsq->head because tcp_wfree() might
+ * transfering tsq->head because tcp_wfree() might
  * interrupt us (non NAPI drivers)
  */
 static void tcp_tasklet_func(unsigned long data)
@@ -797,7 +795,7 @@ void __init tcp_tasklet_init(void)
 
 /*
  * Write buffer destructor automatically called from kfree_skb.
- * We can't xmit new skbs from this context, as we might already
+ * We cant xmit new skbs from this context, as we might already
  * hold qdisc lock.
  */
 void tcp_wfree(struct sk_buff *skb)
@@ -852,14 +850,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
         BUG_ON(!skb || !tcp_skb_pcount(skb));
 
-        if (clone_it) {
-                const struct sk_buff *fclone = skb + 1;
+        /* If congestion control is doing timestamping, we must
+         * take such a timestamp before we potentially clone/copy.
+         */
+        if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
+                __net_timestamp(skb);
 
-                /* If congestion control is doing timestamping, we must
-                 * take such a timestamp before we potentially clone/copy.
-                 */
-                if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
-                        __net_timestamp(skb);
+        if (likely(clone_it)) {
+                const struct sk_buff *fclone = skb + 1;
 
                 if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
                              fclone->fclone == SKB_FCLONE_CLONE))
@@ -988,8 +986,6 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
                                  unsigned int mss_now)
 {
-        struct skb_shared_info *shinfo = skb_shinfo(skb);
-
         /* Make sure we own this skb before messing gso_size/gso_segs */
         WARN_ON_ONCE(skb_cloned(skb));
 
@@ -997,13 +993,13 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
                 /* Avoid the costly divide in the normal
                  * non-TSO case.
                  */
-                shinfo->gso_segs = 1;
-                shinfo->gso_size = 0;
-                shinfo->gso_type = 0;
+                skb_shinfo(skb)->gso_segs = 1;
+                skb_shinfo(skb)->gso_size = 0;
+                skb_shinfo(skb)->gso_type = 0;
         } else {
-                shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
-                shinfo->gso_size = mss_now;
-                shinfo->gso_type = sk->sk_gso_type;
+                skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
+                skb_shinfo(skb)->gso_size = mss_now;
+                skb_shinfo(skb)->gso_type = sk->sk_gso_type;
         }
 }
 
@@ -1150,7 +1146,6 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
  */
 static void __pskb_trim_head(struct sk_buff *skb, int len)
 {
-        struct skb_shared_info *shinfo;
         int i, k, eat;
 
         eat = min_t(int, len, skb_headlen(skb));
@@ -1162,24 +1157,23 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
         }
         eat = len;
         k = 0;
-        shinfo = skb_shinfo(skb);
-        for (i = 0; i < shinfo->nr_frags; i++) {
-                int size = skb_frag_size(&shinfo->frags[i]);
+        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
 
                 if (size <= eat) {
                         skb_frag_unref(skb, i);
                         eat -= size;
                 } else {
-                        shinfo->frags[k] = shinfo->frags[i];
+                        skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
                         if (eat) {
-                                shinfo->frags[k].page_offset += eat;
-                                skb_frag_size_sub(&shinfo->frags[k], eat);
+                                skb_shinfo(skb)->frags[k].page_offset += eat;
+                                skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
                                 eat = 0;
                         }
                         k++;
                 }
         }
-        shinfo->nr_frags = k;
+        skb_shinfo(skb)->nr_frags = k;
 
         skb_reset_tail_pointer(skb);
         skb->data_len -= len;
@@ -1384,51 +1378,23 @@ static void tcp_cwnd_validate(struct sock *sk)
         }
 }
 
-/* Minshall's variant of the Nagle send check. */
-static bool tcp_minshall_check(const struct tcp_sock *tp)
-{
-        return after(tp->snd_sml, tp->snd_una) &&
-               !after(tp->snd_sml, tp->snd_nxt);
-}
-
-/* Update snd_sml if this skb is under mss
- * Note that a TSO packet might end with a sub-mss segment
- * The test is really :
- * if ((skb->len % mss) != 0)
- *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
- * But we can avoid doing the divide again given we already have
- *  skb_pcount = skb->len / mss_now
- */
-static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
-                                const struct sk_buff *skb)
-{
-        if (skb->len < tcp_skb_pcount(skb) * mss_now)
-                tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
-}
-
-/* Return false, if packet can be sent now without violation Nagle's rules:
- * 1. It is full sized. (provided by caller in %partial bool)
- * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
- * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- *    With Minshall's modification: all sent small packets are ACKed.
+/* Returns the portion of skb which can be sent right away without
+ * introducing MSS oddities to segment boundaries. In rare cases where
+ * mss_now != mss_cache, we will request caller to create a small skb
+ * per input skb which could be mostly avoided here (if desired).
+ *
+ * We explicitly want to create a request for splitting write queue tail
+ * to a small skb for Nagle purposes while avoiding unnecessary modulos,
+ * thus all the complexity (cwnd_len is always MSS multiple which we
+ * return whenever allowed by the other factors). Basically we need the
+ * modulo only when the receiver window alone is the limiting factor or
+ * when we would be allowed to send the split-due-to-Nagle skb fully.
  */
-static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
-                            unsigned int mss_now, int nonagle)
-{
-        return partial &&
-                ((nonagle & TCP_NAGLE_CORK) ||
-                 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
-}
-/* Returns the portion of skb which can be sent right away */
-static unsigned int tcp_mss_split_point(const struct sock *sk,
-                                        const struct sk_buff *skb,
-                                        unsigned int mss_now,
-                                        unsigned int max_segs,
-                                        int nonagle)
+static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
                                        unsigned int mss_now, unsigned int max_segs)
 {
         const struct tcp_sock *tp = tcp_sk(sk);
-        u32 partial, needed, window, max_len;
+        u32 needed, window, max_len;
 
         window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
         max_len = mss_now * max_segs;
@@ -1441,15 +1407,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
         if (max_len <= needed)
                 return max_len;
 
-        partial = needed % mss_now;
-        /* If last segment is not a full MSS, check if Nagle rules allow us
-         * to include this last segment in this skb.
-         * Otherwise, we'll split the skb at last MSS boundary
-         */
-        if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle))
-                return needed - partial;
-
-        return needed;
+        return needed - needed % mss_now;
 }
 
 /* Can at least one segment of SKB be sent right now, according to the
@@ -1489,6 +1447,28 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
         return tso_segs;
 }
 
+/* Minshall's variant of the Nagle send check. */
+static inline bool tcp_minshall_check(const struct tcp_sock *tp)
+{
+        return after(tp->snd_sml, tp->snd_una) &&
+               !after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Return false, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized.
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ *    With Minshall's modification: all sent small packets are ACKed.
+ */
+static inline bool tcp_nagle_check(const struct tcp_sock *tp,
+                                   const struct sk_buff *skb,
+                                   unsigned int mss_now, int nonagle)
+{
+        return skb->len < mss_now &&
+               ((nonagle & TCP_NAGLE_CORK) ||
+                (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
 /* Return true if the Nagle test allows this packet to be
  * sent now.
@@ -1509,7 +1489,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
         if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
                 return true;
 
-        if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle))
+        if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
                 return true;
 
         return false;
@@ -1895,12 +1875,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                  *  - better RTT estimation and ACK scheduling
                  *  - faster recovery
                  *  - high rates
-                 * Alas, some drivers / subsystems require a fair amount
-                 * of queued bytes to ensure line rate.
-                 * One example is wifi aggregation (802.11 AMPDU)
                  */
-                limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
-                              sk->sk_pacing_rate >> 10);
+                limit = max(skb->truesize, sk->sk_pacing_rate >> 10);
 
                 if (atomic_read(&sk->sk_wmem_alloc) > limit) {
                         set_bit(TSQ_THROTTLED, &tp->tsq_flags);
@@ -1912,8 +1888,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                 limit = tcp_mss_split_point(sk, skb, mss_now,
                                             min_t(unsigned int,
                                                   cwnd_quota,
-                                                  sk->sk_gso_max_segs),
-                                            nonagle);
+                                                  sk->sk_gso_max_segs));
 
                 if (skb->len > limit &&
                     unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
@@ -2378,6 +2353,21 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
         }
 
         tcp_retrans_try_collapse(sk, skb, cur_mss);
 
+        /* Some Solaris stacks overoptimize and ignore the FIN on a
+         * retransmit when old data is attached. So strip it off
+         * since it is cheap to do so and saves bytes on the network.
+         */
+        if (skb->len > 0 &&
+            (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
+            tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
+                if (!pskb_trim(skb, 0)) {
+                        /* Reuse, even though it does some unnecessary work */
+                        tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
+                                             TCP_SKB_CB(skb)->tcp_flags);
+                        skb->ip_summed = CHECKSUM_NONE;
+                }
+        }
+
         /* Make a copy, if the first transmission SKB clone we made
          * is still in somebody's hands, else make a clone.
          */
@@ -2746,8 +2736,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
         th->syn = 1;
         th->ack = 1;
         TCP_ECN_make_synack(req, th);
-        th->source = htons(ireq->ir_num);
-        th->dest = ireq->ir_rmt_port;
+        th->source = ireq->loc_port;
+        th->dest = ireq->rmt_port;
         /* Setting of flags are superfluous here for callers (and ECE is
          * not even correctly set)
          */
@@ -2777,7 +2767,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 EXPORT_SYMBOL(tcp_make_synack);
 
 /* Do all connect socket setups that can be done AF independent. */
-static void tcp_connect_init(struct sock *sk)
+void tcp_connect_init(struct sock *sk)
 {
         const struct dst_entry *dst = __sk_dst_get(sk);
         struct tcp_sock *tp = tcp_sk(sk);
@@ -3118,6 +3108,7 @@ void tcp_send_window_probe(struct sock *sk)
 {
         if (sk->sk_state == TCP_ESTABLISHED) {
                 tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+                tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
                 tcp_xmit_probe_skb(sk, 0);
         }
 }
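For readers skimming the Nagle-related hunks above: the logic moved back into tcp_nagle_check()/tcp_minshall_check() is small enough to exercise standalone. The sketch below mirrors the "+" side; struct tp_stub, the after() helper, and the sample sequence numbers are simplified stand-ins for illustration, not the kernel's definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TCP_NAGLE_CORK 2                /* mirrors the kernel's flag value */

/* Wrap-safe "seq1 is after seq2" on 32-bit sequence numbers. */
static bool after(uint32_t seq1, uint32_t seq2)
{
        return (int32_t)(seq1 - seq2) > 0;
}

struct tp_stub {                        /* hypothetical stand-in for tcp_sock */
        uint32_t snd_sml;               /* end seq of the last small packet */
        uint32_t snd_una;               /* oldest unacknowledged byte */
        uint32_t snd_nxt;               /* next byte to be sent */
        uint32_t packets_out;
};

/* A small packet is still in flight iff snd_sml lies in (snd_una, snd_nxt]. */
static bool tcp_minshall_check(const struct tp_stub *tp)
{
        return after(tp->snd_sml, tp->snd_una) &&
               !after(tp->snd_sml, tp->snd_nxt);
}

/* True when Nagle forbids sending this sub-MSS packet right now. */
static bool tcp_nagle_check(const struct tp_stub *tp, uint32_t skb_len,
                            uint32_t mss_now, int nonagle)
{
        return skb_len < mss_now &&
               ((nonagle & TCP_NAGLE_CORK) ||
                (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
}

int main(void)
{
        struct tp_stub tp = {
                .snd_sml = 1100, .snd_una = 1000, .snd_nxt = 1100,
                .packets_out = 1,
        };

        /* A 200-byte segment with a 1460-byte MSS while an earlier small
         * packet is still unacked: Nagle holds the new segment back.
         */
        printf("blocked: %d\n", tcp_nagle_check(&tp, 200, 1460, 0));

        tp.snd_una = 1100;      /* the small packet gets ACKed... */
        printf("blocked: %d\n", tcp_nagle_check(&tp, 200, 1460, 0));
        return 0;
}

Built with a plain cc, this prints "blocked: 1" then "blocked: 0", matching rule 4 in the restored comment: once all small packets are ACKed, the segment may go out.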
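The restored tcp_mss_split_point() likewise reduces to window/MSS arithmetic: send a full TSO burst when it fits, otherwise round the sendable amount down to an MSS multiple so no odd-sized segment is created mid-stream. A sketch under assumed values follows; mss_split_point() and its flattened parameter list are stand-ins for the kernel function, which reads this state from sk and skb.

#include <stdint.h>
#include <stdio.h>

/* window plays the role of tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq, and
 * is_tail that of skb == tcp_write_queue_tail(sk): a tail skb may still
 * grow, so it does not take the fast path.
 */
static uint32_t mss_split_point(uint32_t skb_len, uint32_t window,
                                uint32_t mss_now, uint32_t max_segs,
                                int is_tail)
{
        uint32_t max_len = mss_now * max_segs;
        uint32_t needed;

        /* Fast path: a full TSO burst fits into the receiver window. */
        if (max_len <= window && !is_tail)
                return max_len;

        needed = skb_len < window ? skb_len : window;

        if (max_len <= needed)
                return max_len;

        /* The receiver window alone is the limiting factor: take the
         * modulo and split at the last MSS boundary.
         */
        return needed - needed % mss_now;
}

int main(void)
{
        /* 3600 bytes queued, 2500 bytes of window, MSS 1000: send 2000
         * now and keep the odd 500 until it can ride in a full segment.
         */
        printf("%u\n", (unsigned)mss_split_point(3600, 2500, 1000, 4, 1));
        return 0;
}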
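Finally, the tcp_write_xmit() hunk replaces the sysctl_tcp_limit_output_bytes floor with max(skb->truesize, sk->sk_pacing_rate >> 10). Since sk_pacing_rate is in bytes per second, the shift by 10 lets roughly one millisecond of data (rate / 1024) sit in qdisc/device queues, while skb->truesize keeps the limit from ever dropping below a single packet. A toy calculation with assumed numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t pacing_rate = 12500000;   /* assumed: 100 Mbit/s in bytes/s */
        uint64_t truesize = 2304;          /* assumed: one skb incl. overhead */

        uint64_t limit = pacing_rate >> 10;     /* ~1.024 ms of data: 12207 */
        if (limit < truesize)
                limit = truesize;               /* never below a single skb */

        printf("TSQ limit: %llu bytes\n", (unsigned long long)limit);
        return 0;
}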