diff options
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r-- | net/ipv4/tcp_output.c | 1213 |
1 files changed, 898 insertions, 315 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5200aab0ca97..d46f2143305c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -34,9 +34,12 @@ * */ +#define pr_fmt(fmt) "TCP: " fmt + #include <net/tcp.h> #include <linux/compiler.h> +#include <linux/gfp.h> #include <linux/module.h> /* People can turn this off for buggy TCP's found in printers etc. */ @@ -47,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; */ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; +/* Default TSQ limit of two TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 131072; + /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. @@ -54,28 +60,32 @@ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; int sysctl_tcp_tso_win_divisor __read_mostly = 3; int sysctl_tcp_mtu_probing __read_mostly = 0; -int sysctl_tcp_base_mss __read_mostly = 512; +int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; +EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); + +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp); + /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - /* Don't override Nagle indefinately with F-RTO */ - if (tp->frto_counter == 2) - tp->frto_counter = 3; - tp->packets_out += tcp_skb_pcount(skb); - if (!prior_packets) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + tcp_rearm_rto(sk); + } } /* SND.NXT, if window was not shrunk. @@ -84,9 +94,9 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) * Anything in between SND.UNA...SND.UNA+SND.WND also can be already * invalid. OK, let's make this for now: */ -static inline __u32 tcp_acceptable_seq(struct sock *sk) +static inline __u32 tcp_acceptable_seq(const struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); if (!before(tcp_wnd_end(tp), tp->snd_nxt)) return tp->snd_nxt; @@ -111,12 +121,16 @@ static inline __u32 tcp_acceptable_seq(struct sock *sk) static __u16 tcp_advertise_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); + const struct dst_entry *dst = __sk_dst_get(sk); int mss = tp->advmss; - if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { - mss = dst_metric(dst, RTAX_ADVMSS); - tp->advmss = mss; + if (dst) { + unsigned int metric = dst_metric_advmss(dst); + + if (metric < mss) { + mss = metric; + tp->advmss = mss; + } } return (__u16)mss; @@ -124,7 +138,7 @@ static __u16 tcp_advertise_mss(struct sock *sk) /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ -static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) +static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst) { struct tcp_sock *tp = tcp_sk(sk); s32 delta = tcp_time_stamp - tp->lsndtime; @@ -145,10 +159,11 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) /* Congestion state accounting after a packet has been sent. */ static void tcp_event_data_sent(struct tcp_sock *tp, - struct sk_buff *skb, struct sock *sk) + struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; + const struct dst_entry *dst = __sk_dst_get(sk); if (sysctl_tcp_slow_start_after_idle && (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) @@ -159,8 +174,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp, /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) - icsk->icsk_ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && + (!dst || !dst_metric(dst, RTAX_QUICKACK))) + icsk->icsk_ack.pingpong = 1; } /* Account for an ACK we sent. */ @@ -170,6 +186,21 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } + +u32 tcp_default_init_rwnd(u32 mss) +{ + /* Initial receive window should be twice of TCP_INIT_CWND to + * enable proper sending of new unsent data during fast recovery + * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a + * limit when mss is larger than 1460. + */ + u32 init_rwnd = TCP_INIT_CWND * 2; + + if (mss > 1460) + init_rwnd = max((1460 * init_rwnd) / mss, 2U); + return init_rwnd; +} + /* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. @@ -179,7 +210,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) */ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, - int wscale_ok, __u8 *rcv_wscale) + int wscale_ok, __u8 *rcv_wscale, + __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); @@ -218,23 +250,16 @@ void tcp_select_initial_window(int __space, __u32 mss, } } - /* Set initial window to value enough for senders, - * following RFC2414. Senders, not following this RFC, - * will be satisfied with 2. - */ if (mss > (1 << *rcv_wscale)) { - int init_cwnd = 4; - if (mss > 1460 * 3) - init_cwnd = 2; - else if (mss > 1460) - init_cwnd = 3; - if (*rcv_wnd > init_cwnd * mss) - *rcv_wnd = init_cwnd * mss; + if (!init_rcv_wnd) /* Use default unless specified otherwise */ + init_rcv_wnd = tcp_default_init_rwnd(mss); + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); } /* Set the clamp no higher than max representable value */ (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); } +EXPORT_SYMBOL(tcp_select_initial_window); /* Chose a new window to advertise, update state in tcp_sock for the * socket, and return result with RFC1323 scaling applied. The return @@ -280,11 +305,11 @@ static u16 tcp_select_window(struct sock *sk) } /* Packet ECN state for a SYN-ACK */ -static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) +static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) { - TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) - TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; } /* Packet ECN state for a SYN. */ @@ -293,14 +318,14 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); tp->ecn_flags = 0; - if (sysctl_tcp_ecn == 1) { - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR; + if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; } } static __inline__ void -TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th) +TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) { if (inet_rsk(req)->ecn_ok) th->ece = 1; @@ -338,9 +363,10 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { + skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; - TCP_SKB_CB(skb)->flags = flags; + TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; skb_shinfo(skb)->gso_segs = 1; @@ -348,12 +374,12 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) skb_shinfo(skb)->gso_type = 0; TCP_SKB_CB(skb)->seq = seq; - if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN)) + if (flags & (TCPHDR_SYN | TCPHDR_FIN)) seq++; TCP_SKB_CB(skb)->end_seq = seq; } -static inline int tcp_urg_mode(const struct tcp_sock *tp) +static inline bool tcp_urg_mode(const struct tcp_sock *tp) { return tp->snd_una != tp->snd_up; } @@ -361,13 +387,18 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_SACK_ADVERTISE (1 << 0) #define OPTION_TS (1 << 1) #define OPTION_MD5 (1 << 2) +#define OPTION_WSCALE (1 << 3) +#define OPTION_FAST_OPEN_COOKIE (1 << 8) struct tcp_out_options { - u8 options; /* bit field of OPTION_* */ + u16 options; /* bit field of OPTION_* */ + u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ - u16 mss; /* 0 to disable */ + u8 hash_size; /* bytes in hash_location */ + __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ + struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ }; /* Write previously computed TCP options to the packet. @@ -384,17 +415,16 @@ struct tcp_out_options { * (but it may well be that other scenarios fail similarly). */ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, - const struct tcp_out_options *opts, - __u8 **md5_hash) { - if (unlikely(OPTION_MD5 & opts->options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - *md5_hash = (__u8 *)ptr; + struct tcp_out_options *opts) +{ + u16 options = opts->options; /* mungable copy */ + + if (unlikely(OPTION_MD5 & options)) { + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); + /* overload cookie hash location */ + opts->hash_location = (__u8 *)ptr; ptr += 4; - } else { - *md5_hash = NULL; } if (unlikely(opts->mss)) { @@ -403,12 +433,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, opts->mss); } - if (likely(OPTION_TS & opts->options)) { - if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { + if (likely(OPTION_TS & options)) { + if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + options &= ~OPTION_SACK_ADVERTISE; } else { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -419,15 +450,14 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } - if (unlikely(OPTION_SACK_ADVERTISE & opts->options && - !(OPTION_TS & opts->options))) { + if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); } - if (unlikely(opts->ws)) { + if (unlikely(OPTION_WSCALE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | @@ -453,22 +483,39 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, tp->rx_opt.dsack = 0; } + + if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { + struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; + + *ptr++ = htonl((TCPOPT_EXP << 24) | + ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | + TCPOPT_FASTOPEN_MAGIC); + + memcpy(ptr, foc->val, foc->len); + if ((foc->len & 3) == 2) { + u8 *align = ((u8 *)ptr) + foc->len; + align[0] = align[1] = TCPOPT_NOP; + } + ptr += (foc->len + 3) >> 2; + } } /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ -static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, - struct tcp_md5sig_key **md5) { + struct tcp_md5sig_key **md5) +{ struct tcp_sock *tp = tcp_sk(sk); - unsigned size = 0; + unsigned int remaining = MAX_TCP_OPTION_SPACE; + struct tcp_fastopen_request *fastopen = tp->fastopen_req; #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { opts->options |= OPTION_MD5; - size += TCPOLEN_MD5SIG_ALIGNED; + remaining -= TCPOLEN_MD5SIG_ALIGNED; } #else *md5 = NULL; @@ -484,88 +531,114 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, * SACKs don't matter, we never delay an ACK when we have any of those * going out. */ opts->mss = tcp_advertise_mss(sk); - size += TCPOLEN_MSS_ALIGNED; + remaining -= TCPOLEN_MSS_ALIGNED; if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; - size += TCPOLEN_TSTAMP_ALIGNED; + remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(sysctl_tcp_window_scaling)) { opts->ws = tp->rx_opt.rcv_wscale; - if (likely(opts->ws)) - size += TCPOLEN_WSCALE_ALIGNED; + opts->options |= OPTION_WSCALE; + remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(sysctl_tcp_sack)) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) - size += TCPOLEN_SACKPERM_ALIGNED; + remaining -= TCPOLEN_SACKPERM_ALIGNED; } - return size; + if (fastopen && fastopen->cookie.len >= 0) { + u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; + need = (need + 3) & ~3U; /* Align to 32 bits */ + if (remaining >= need) { + opts->options |= OPTION_FAST_OPEN_COOKIE; + opts->fastopen_cookie = &fastopen->cookie; + remaining -= need; + tp->syn_fastopen = 1; + } + } + + return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ -static unsigned tcp_synack_options(struct sock *sk, +static unsigned int tcp_synack_options(struct sock *sk, struct request_sock *req, - unsigned mss, struct sk_buff *skb, + unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, - struct tcp_md5sig_key **md5) { - unsigned size = 0; + struct tcp_md5sig_key **md5, + struct tcp_fastopen_cookie *foc) +{ struct inet_request_sock *ireq = inet_rsk(req); - char doing_ts; + unsigned int remaining = MAX_TCP_OPTION_SPACE; #ifdef CONFIG_TCP_MD5SIG *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); if (*md5) { opts->options |= OPTION_MD5; - size += TCPOLEN_MD5SIG_ALIGNED; + remaining -= TCPOLEN_MD5SIG_ALIGNED; + + /* We can't fit any SACK blocks in a packet with MD5 + TS + * options. There was discussion about disabling SACK + * rather than TS in order to fit in better with old, + * buggy kernels, but that was deemed to be unnecessary. + */ + ireq->tstamp_ok &= !ireq->sack_ok; } #else *md5 = NULL; #endif - /* we can't fit any SACK blocks in a packet with MD5 + TS - options. There was discussion about disabling SACK rather than TS in - order to fit in better with old, buggy kernels, but that was deemed - to be unnecessary. */ - doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok); - + /* We always send an MSS option. */ opts->mss = mss; - size += TCPOLEN_MSS_ALIGNED; + remaining -= TCPOLEN_MSS_ALIGNED; if (likely(ireq->wscale_ok)) { opts->ws = ireq->rcv_wscale; - if (likely(opts->ws)) - size += TCPOLEN_WSCALE_ALIGNED; + opts->options |= OPTION_WSCALE; + remaining -= TCPOLEN_WSCALE_ALIGNED; } - if (likely(doing_ts)) { + if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = TCP_SKB_CB(skb)->when; opts->tsecr = req->ts_recent; - size += TCPOLEN_TSTAMP_ALIGNED; + remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { opts->options |= OPTION_SACK_ADVERTISE; - if (unlikely(!doing_ts)) - size += TCPOLEN_SACKPERM_ALIGNED; + if (unlikely(!ireq->tstamp_ok)) + remaining -= TCPOLEN_SACKPERM_ALIGNED; + } + if (foc != NULL) { + u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; + need = (need + 3) & ~3U; /* Align to 32 bits */ + if (remaining >= need) { + opts->options |= OPTION_FAST_OPEN_COOKIE; + opts->fastopen_cookie = foc; + remaining -= need; + } } - return size; + return MAX_TCP_OPTION_SPACE - remaining; } /* Compute TCP options for ESTABLISHED sockets. This is not the * final wire format yet. */ -static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, - struct tcp_md5sig_key **md5) { + struct tcp_md5sig_key **md5) +{ struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); - unsigned size = 0; + unsigned int size = 0; unsigned int eff_sacks; + opts->options = 0; + #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); if (unlikely(*md5)) { @@ -578,16 +651,16 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcb ? tcb->when : 0; + opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { - const unsigned remaining = MAX_TCP_OPTION_SPACE - size; + const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; opts->num_sack_blocks = - min_t(unsigned, eff_sacks, + min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); size += TCPOLEN_SACK_BASE_ALIGNED + @@ -597,6 +670,160 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, return size; } + +/* TCP SMALL QUEUES (TSQ) + * + * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) + * to reduce RTT and bufferbloat. + * We do this using a special skb destructor (tcp_wfree). + * + * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb + * needs to be reallocated in a driver. + * The invariant being skb->truesize substracted from sk->sk_wmem_alloc + * + * Since transmit from skb destructor is forbidden, we use a tasklet + * to process all sockets that eventually need to send more skbs. + * We use one tasklet per cpu, with its own queue of sockets. + */ +struct tsq_tasklet { + struct tasklet_struct tasklet; + struct list_head head; /* queue of tcp sockets */ +}; +static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); + +static void tcp_tsq_handler(struct sock *sk) +{ + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | + TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) + tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); +} +/* + * One tasklest per cpu tries to send more skbs. + * We run in tasklet context but need to disable irqs when + * transfering tsq->head because tcp_wfree() might + * interrupt us (non NAPI drivers) + */ +static void tcp_tasklet_func(unsigned long data) +{ + struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; + LIST_HEAD(list); + unsigned long flags; + struct list_head *q, *n; + struct tcp_sock *tp; + struct sock *sk; + + local_irq_save(flags); + list_splice_init(&tsq->head, &list); + local_irq_restore(flags); + + list_for_each_safe(q, n, &list) { + tp = list_entry(q, struct tcp_sock, tsq_node); + list_del(&tp->tsq_node); + + sk = (struct sock *)tp; + bh_lock_sock(sk); + + if (!sock_owned_by_user(sk)) { + tcp_tsq_handler(sk); + } else { + /* defer the work to tcp_release_cb() */ + set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); + } + bh_unlock_sock(sk); + + clear_bit(TSQ_QUEUED, &tp->tsq_flags); + sk_free(sk); + } +} + +#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ + (1UL << TCP_WRITE_TIMER_DEFERRED) | \ + (1UL << TCP_DELACK_TIMER_DEFERRED) | \ + (1UL << TCP_MTU_REDUCED_DEFERRED)) +/** + * tcp_release_cb - tcp release_sock() callback + * @sk: socket + * + * called from release_sock() to perform protocol dependent + * actions before socket release. + */ +void tcp_release_cb(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned long flags, nflags; + + /* perform an atomic operation only if at least one flag is set */ + do { + flags = tp->tsq_flags; + if (!(flags & TCP_DEFERRED_ALL)) + return; + nflags = flags & ~TCP_DEFERRED_ALL; + } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); + + if (flags & (1UL << TCP_TSQ_DEFERRED)) + tcp_tsq_handler(sk); + + if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { + tcp_write_timer_handler(sk); + __sock_put(sk); + } + if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { + tcp_delack_timer_handler(sk); + __sock_put(sk); + } + if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { + sk->sk_prot->mtu_reduced(sk); + __sock_put(sk); + } +} +EXPORT_SYMBOL(tcp_release_cb); + +void __init tcp_tasklet_init(void) +{ + int i; + + for_each_possible_cpu(i) { + struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); + + INIT_LIST_HEAD(&tsq->head); + tasklet_init(&tsq->tasklet, + tcp_tasklet_func, + (unsigned long)tsq); + } +} + +/* + * Write buffer destructor automatically called from kfree_skb. + * We cant xmit new skbs from this context, as we might already + * hold qdisc lock. + */ +void tcp_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct tcp_sock *tp = tcp_sk(sk); + + if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && + !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { + unsigned long flags; + struct tsq_tasklet *tsq; + + /* Keep a ref on socket. + * This last ref will be released in tcp_tasklet_func() + */ + atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); + + /* queue this socket to tasklet queue */ + local_irq_save(flags); + tsq = &__get_cpu_var(tsq_tasklet); + list_add(&tp->tsq_node, &tsq->head); + tasklet_schedule(&tsq->tasklet); + local_irq_restore(flags); + } else { + sock_wfree(skb); + } +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -616,9 +843,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct tcp_sock *tp; struct tcp_skb_cb *tcb; struct tcp_out_options opts; - unsigned tcp_options_size, tcp_header_size; + unsigned int tcp_options_size, tcp_header_size; struct tcp_md5sig_key *md5; - __u8 *md5_hash_location; struct tcphdr *th; int err; @@ -631,6 +857,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, __net_timestamp(skb); if (likely(clone_it)) { + const struct sk_buff *fclone = skb + 1; + + if (unlikely(skb->fclone == SKB_FCLONE_ORIG && + fclone->fclone == SKB_FCLONE_CLONE)) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); else @@ -644,7 +877,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) + if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); else tcp_options_size = tcp_established_options(sk, skb, &opts, @@ -654,20 +887,29 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); + /* if no packet is in qdisc/device queue, then allow XPS to select + * another queue. + */ + skb->ooo_okay = sk_wmem_alloc_get(sk) == 0; + skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); - skb_set_owner_w(skb, sk); + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = tcp_wfree; + atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. */ th = tcp_hdr(skb); - th->source = inet->sport; - th->dest = inet->dport; + th->source = inet->inet_sport; + th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(tp->rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | - tcb->flags); + tcb->tcp_flags); - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. */ @@ -684,36 +926,37 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { - th->urg_ptr = 0xFFFF; + th->urg_ptr = htons(0xFFFF); th->urg = 1; } } - tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); - if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) + tcp_options_write((__be32 *)(th + 1), tp, &opts); + if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) TCP_ECN_send(sk, skb, tcp_header_size); #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; - tp->af_specific->calc_md5_hash(md5_hash_location, + sk_nocaps_add(sk, NETIF_F_GSO_MASK); + tp->af_specific->calc_md5_hash(opts.hash_location, md5, sk, NULL, skb); } #endif - icsk->icsk_af_ops->send_check(sk, skb->len, skb); + icsk->icsk_af_ops->send_check(sk, skb); - if (likely(tcb->flags & TCPCB_FLAG_ACK)) + if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); if (skb->len != tcp_header_size) - tcp_event_data_sent(tp, skb, sk); + tcp_event_data_sent(tp, sk); if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) - TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); + TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, + tcp_skb_pcount(skb)); - err = icsk->icsk_af_ops->queue_xmit(skb, 0); + err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); if (likely(err <= 0)) return err; @@ -740,11 +983,13 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, +static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { - if (skb->len <= mss_now || !sk_can_gso(sk) || - skb->ip_summed == CHECKSUM_NONE) { + /* Make sure we own this skb before messing gso_size/gso_segs */ + WARN_ON_ONCE(skb_cloned(skb)); + + if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal * non-TSO case. */ @@ -761,7 +1006,7 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, /* When a modification to fackets out becomes necessary, we need to check * skb is counted to fackets_out or not. */ -static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, +static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, int decr) { struct tcp_sock *tp = tcp_sk(sk); @@ -776,7 +1021,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ -static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr) +static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) { struct tcp_sock *tp = tcp_sk(sk); @@ -817,15 +1062,14 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, int nlen; u8 flags; - BUG_ON(len > skb->len); + if (WARN_ON(len > skb->len)) + return -EINVAL; nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; - if (skb_cloned(skb) && - skb_is_nonlinear(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; /* Get a new skb... force flag on. */ @@ -845,9 +1089,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ - flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); - TCP_SKB_CB(buff)->flags = flags; + flags = TCP_SKB_CB(skb)->tcp_flags; + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); + TCP_SKB_CB(buff)->tcp_flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { @@ -904,17 +1148,26 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) { int i, k, eat; + eat = min_t(int, len, skb_headlen(skb)); + if (eat) { + __skb_pull(skb, eat); + len -= eat; + if (!len) + return; + } eat = len; k = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (skb_shinfo(skb)->frags[i].size <= eat) { - put_page(skb_shinfo(skb)->frags[i].page); - eat -= skb_shinfo(skb)->frags[i].size; + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (size <= eat) { + skb_frag_unref(skb, i); + eat -= size; } else { skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; if (eat) { skb_shinfo(skb)->frags[k].page_offset += eat; - skb_shinfo(skb)->frags[k].size -= eat; + skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); eat = 0; } k++; @@ -930,14 +1183,10 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { - if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; - /* If len == headlen, we avoid __skb_pull to preserve alignment. */ - if (unlikely(len < skb_headlen(skb))) - __skb_pull(skb, len); - else - __pskb_trim_head(skb, len - skb_headlen(skb)); + __pskb_trim_head(skb, len); TCP_SKB_CB(skb)->seq += len; skb->ip_summed = CHECKSUM_PARTIAL; @@ -947,20 +1196,18 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) sk_mem_uncharge(sk, len); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); - /* Any change of skb->len requires recalculation of tso - * factor and mss. - */ + /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); return 0; } -/* Calculate MSS. Not accounting for SACKs here. */ -int tcp_mtu_to_mss(struct sock *sk, int pmtu) +/* Calculate MSS not accounting any TCP options. */ +static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) { - struct tcp_sock *tp = tcp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; /* Calculate base mss without TCP options: @@ -968,6 +1215,14 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) */ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); + /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ + if (icsk->icsk_af_ops->net_frag_header_len) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_allfrag(dst)) + mss_now -= icsk->icsk_af_ops->net_frag_header_len; + } + /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) mss_now = tp->rx_opt.mss_clamp; @@ -978,18 +1233,22 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) mss_now = 48; - - /* Now subtract TCP options size, not including SACKs */ - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); - return mss_now; } +/* Calculate MSS. Not accounting for SACKs here. */ +int tcp_mtu_to_mss(struct sock *sk, int pmtu) +{ + /* Subtract TCP options size, not including SACKs */ + return __tcp_mtu_to_mss(sk, pmtu) - + (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); +} + /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) { - struct tcp_sock *tp = tcp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); int mtu; mtu = mss + @@ -997,6 +1256,13 @@ int tcp_mss_to_mtu(struct sock *sk, int mss) icsk->icsk_ext_hdr_len + icsk->icsk_af_ops->net_header_len; + /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ + if (icsk->icsk_af_ops->net_frag_header_len) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_allfrag(dst)) + mtu += icsk->icsk_af_ops->net_frag_header_len; + } return mtu; } @@ -1012,6 +1278,7 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); icsk->icsk_mtup.probe_size = 0; } +EXPORT_SYMBOL(tcp_mtup_init); /* This function synchronize snd mss to current pmtu/exthdr set. @@ -1055,16 +1322,17 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) return mss_now; } +EXPORT_SYMBOL(tcp_sync_mss); /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. */ unsigned int tcp_current_mss(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); + const struct tcp_sock *tp = tcp_sk(sk); + const struct dst_entry *dst = __sk_dst_get(sk); u32 mss_now; - unsigned header_len; + unsigned int header_len; struct tcp_out_options opts; struct tcp_md5sig_key *md5; @@ -1122,22 +1390,22 @@ static void tcp_cwnd_validate(struct sock *sk) * modulo only when the receiver window alone is the limiting factor or * when we would be allowed to send the split-due-to-Nagle skb fully. */ -static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, - unsigned int mss_now, unsigned int cwnd) +static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, + unsigned int mss_now, unsigned int max_segs) { - struct tcp_sock *tp = tcp_sk(sk); - u32 needed, window, cwnd_len; + const struct tcp_sock *tp = tcp_sk(sk); + u32 needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; - cwnd_len = mss_now * cwnd; + max_len = mss_now * max_segs; - if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) - return cwnd_len; + if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) + return max_len; needed = min(skb->len, window); - if (cwnd_len <= needed) - return cwnd_len; + if (max_len <= needed) + return max_len; return needed - needed % mss_now; } @@ -1145,13 +1413,13 @@ static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ -static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, - struct sk_buff *skb) +static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, + const struct sk_buff *skb) { u32 in_flight, cwnd; /* Don't be strict about the congestion window for the final FIN. */ - if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1) return 1; @@ -1163,11 +1431,11 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, return 0; } -/* Intialize TSO state of a skb. +/* Initialize TSO state of a skb. * This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, +static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); @@ -1180,33 +1448,33 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, } /* Minshall's variant of the Nagle send check. */ -static inline int tcp_minshall_check(const struct tcp_sock *tp) +static inline bool tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml, tp->snd_una) && !after(tp->snd_sml, tp->snd_nxt); } -/* Return 0, if packet can be sent now without violation Nagle's rules: +/* Return false, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_NODELAY was set. + * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. */ -static inline int tcp_nagle_check(const struct tcp_sock *tp, +static inline bool tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, - unsigned mss_now, int nonagle) + unsigned int mss_now, int nonagle) { - return (skb->len < mss_now && + return skb->len < mss_now && ((nonagle & TCP_NAGLE_CORK) || - (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); + (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } -/* Return non-zero if the Nagle test allows this packet to be +/* Return true if the Nagle test allows this packet to be * sent now. */ -static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) +static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, + unsigned int cur_mss, int nonagle) { /* Nagle rule does not apply to frames, which sit in the middle of the * write_queue (they have no chances to get new data). @@ -1215,24 +1483,22 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, * argument based upon the location of SKB in the send queue. */ if (nonagle & TCP_NAGLE_PUSH) - return 1; + return true; - /* Don't use the nagle rule for urgent data (or for the final FIN). - * Nagle can be ignored during F-RTO too (see RFC4138). - */ - if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) - return 1; + /* Don't use the nagle rule for urgent data (or for the final FIN). */ + if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + return true; if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) - return 1; + return true; - return 0; + return false; } /* Does at least the first segment of SKB fit into the send window? */ -static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, - unsigned int cur_mss) +static bool tcp_snd_wnd_test(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -1246,10 +1512,10 @@ static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, * should be put on the wire right now. If so, it returns the number of * packets allowed by the congestion window. */ -static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, unsigned int cur_mss, int nonagle) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); unsigned int cwnd_quota; tcp_init_tso_segs(sk, skb, cur_mss); @@ -1265,15 +1531,15 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, } /* Test if sending is allowed right now. */ -int tcp_may_send_now(struct sock *sk) +bool tcp_may_send_now(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = tcp_send_head(sk); - return (skb && + return skb && tcp_snd_test(sk, skb, tcp_current_mss(sk), (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH))); + tp->nonagle : TCP_NAGLE_PUSH)); } /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet @@ -1284,7 +1550,7 @@ int tcp_may_send_now(struct sock *sk) * packet has never been sent out before (and thus is not cloned). */ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, - unsigned int mss_now) + unsigned int mss_now, gfp_t gfp) { struct sk_buff *buff; int nlen = skb->len - len; @@ -1294,7 +1560,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (skb->len != skb->data_len) return tcp_fragment(sk, skb, len, mss_now); - buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC); + buff = sk_stream_alloc_skb(sk, 0, gfp); if (unlikely(buff == NULL)) return -ENOMEM; @@ -1309,9 +1575,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ - flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); - TCP_SKB_CB(buff)->flags = flags; + flags = TCP_SKB_CB(skb)->tcp_flags; + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); + TCP_SKB_CB(buff)->tcp_flags = flags; /* This packet was never sent out yet, so no SACK bits. */ TCP_SKB_CB(buff)->sacked = 0; @@ -1335,13 +1601,14 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * * This algorithm is from John Heffner. */ -static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; + int win_divisor; - if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto send_now; if (icsk->icsk_ca_state != TCP_CA_Open) @@ -1364,20 +1631,22 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= sk->sk_gso_max_size) + if (limit >= min_t(unsigned int, sk->sk_gso_max_size, + tp->xmit_size_goal_segs * tp->mss_cache)) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; - if (sysctl_tcp_tso_win_divisor) { + win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); + if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); /* If at least some fraction of a window is available, * just use it. */ - chunk /= sysctl_tcp_tso_win_divisor; + chunk /= win_divisor; if (limit >= chunk) goto send_now; } else { @@ -1386,18 +1655,21 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) * frame, so if we have space for more than 3 frames * then send now. */ - if (limit > tcp_max_burst(tp) * tp->mss_cache) + if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache) goto send_now; } - /* Ok, it looks like it is advisable to defer. */ - tp->tso_deferred = 1 | (jiffies << 1); + /* Ok, it looks like it is advisable to defer. + * Do not rearm the timer if already set to not break TCP ACK clocking. + */ + if (!tp->tso_deferred) + tp->tso_deferred = 1 | (jiffies << 1); - return 1; + return true; send_now: tp->tso_deferred = 0; - return 0; + return false; } /* Create a new MTU probe if we are ready. @@ -1467,7 +1739,7 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; - TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; TCP_SKB_CB(nskb)->sacked = 0; nskb->csum = 0; nskb->ip_summed = skb->ip_summed; @@ -1487,12 +1759,12 @@ static int tcp_mtu_probe(struct sock *sk) if (skb->len <= copy) { /* We've eaten all the data from this skb. * Throw it away. */ - TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); } else { - TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & - ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & + ~(TCPHDR_FIN|TCPHDR_PSH); if (!skb_shinfo(skb)->nr_frags) { skb_pull(skb, copy); if (skb->ip_summed != CHECKSUM_PARTIAL) @@ -1539,11 +1811,14 @@ static int tcp_mtu_probe(struct sock *sk) * snd_up-64k-mss .. snd_up cannot be large. However, taking into * account rare use of URG, this is not a big flaw. * - * Returns 1, if no segments are in flight and we have queued segments, but - * cannot send anything now because of SWS or another problem. + * Send at most one packet when push_one > 0. Temporarily ignore + * cwnd limit to force at most one packet out when push_one == 2. + + * Returns true, if no segments are in flight and we have queued segments, + * but cannot send anything now because of SWS or another problem. */ -static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - int push_one, gfp_t gfp) +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1557,7 +1832,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, /* Do MTU probing. */ result = tcp_mtu_probe(sk); if (!result) { - return 0; + return false; } else if (result > 0) { sent_pkts = 1; } @@ -1569,9 +1844,17 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); + if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) + goto repair; /* Skip network transmission */ + cwnd_quota = tcp_cwnd_test(tp, skb); - if (!cwnd_quota) - break; + if (!cwnd_quota) { + if (push_one == 2) + /* Force out a loss probe pkt. */ + cwnd_quota = 1; + else + break; + } if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; @@ -1586,13 +1869,29 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } + /* TCP Small Queues : + * Control number of packets in qdisc/devices to two packets / or ~1 ms. + * This allows for : + * - better RTT estimation and ACK scheduling + * - faster recovery + * - high rates + */ + limit = max(skb->truesize, sk->sk_pacing_rate >> 10); + + if (atomic_read(&sk->sk_wmem_alloc) > limit) { + set_bit(TSQ_THROTTLED, &tp->tsq_flags); + break; + } + limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - cwnd_quota); + min_t(unsigned int, + cwnd_quota, + sk->sk_gso_max_segs)); if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now))) + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; TCP_SKB_CB(skb)->when = tcp_time_stamp; @@ -1600,23 +1899,145 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; +repair: /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ tcp_event_new_data_sent(sk, skb); tcp_minshall_update(tp, mss_now, skb); - sent_pkts++; + sent_pkts += tcp_skb_pcount(skb); if (push_one) break; } if (likely(sent_pkts)) { + if (tcp_in_cwnd_reduction(sk)) + tp->prr_out += sent_pkts; + + /* Send one loss probe per tail loss episode. */ + if (push_one != 2) + tcp_schedule_loss_probe(sk); tcp_cwnd_validate(sk); - return 0; + return false; + } + return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); +} + +bool tcp_schedule_loss_probe(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 timeout, tlp_time_stamp, rto_time_stamp; + u32 rtt = tp->srtt >> 3; + + if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) + return false; + /* No consecutive loss probes. */ + if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { + tcp_rearm_rto(sk); + return false; + } + /* Don't do any loss probe on a Fast Open connection before 3WHS + * finishes. + */ + if (sk->sk_state == TCP_SYN_RECV) + return false; + + /* TLP is only scheduled when next timer event is RTO. */ + if (icsk->icsk_pending != ICSK_TIME_RETRANS) + return false; + + /* Schedule a loss probe in 2*RTT for SACK capable connections + * in Open state, that are either limited by cwnd or application. + */ + if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out || + !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) + return false; + + if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && + tcp_send_head(sk)) + return false; + + /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account + * for delayed ack when there's one outstanding packet. + */ + timeout = rtt << 1; + if (tp->packets_out == 1) + timeout = max_t(u32, timeout, + (rtt + (rtt >> 1) + TCP_DELACK_MAX)); + timeout = max_t(u32, timeout, msecs_to_jiffies(10)); + + /* If RTO is shorter, just schedule TLP in its place. */ + tlp_time_stamp = tcp_time_stamp + timeout; + rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; + if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { + s32 delta = rto_time_stamp - tcp_time_stamp; + if (delta > 0) + timeout = delta; } - return !tp->packets_out && tcp_send_head(sk); + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, + TCP_RTO_MAX); + return true; +} + +/* When probe timeout (PTO) fires, send a new segment if one exists, else + * retransmit the last segment. + */ +void tcp_send_loss_probe(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int pcount; + int mss = tcp_current_mss(sk); + int err = -1; + + if (tcp_send_head(sk) != NULL) { + err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + goto rearm_timer; + } + + /* At most one outstanding TLP retransmission. */ + if (tp->tlp_high_seq) + goto rearm_timer; + + /* Retransmit last segment. */ + skb = tcp_write_queue_tail(sk); + if (WARN_ON(!skb)) + goto rearm_timer; + + pcount = tcp_skb_pcount(skb); + if (WARN_ON(!pcount)) + goto rearm_timer; + + if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { + if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss))) + goto rearm_timer; + skb = tcp_write_queue_tail(sk); + } + + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + + /* Probe with zero data doesn't trigger fast recovery. */ + if (skb->len > 0) + err = __tcp_retransmit_skb(sk, skb); + + /* Record snd_nxt for loss detection. */ + if (likely(!err)) + tp->tlp_high_seq = tp->snd_nxt; + +rearm_timer: + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); + + if (likely(!err)) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSSPROBES); + return; } /* Push out any pending frames which were held back due to @@ -1626,11 +2047,6 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle) { - struct sk_buff *skb = tcp_send_head(sk); - - if (!skb) - return; - /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and * all will be happy. @@ -1638,7 +2054,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, if (unlikely(sk->sk_state == TCP_CLOSE)) return; - if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) + if (tcp_write_xmit(sk, cur_mss, nonagle, 0, + sk_gfp_atomic(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -1727,7 +2144,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; - if (tcp_memory_pressure) + if (sk_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -1800,7 +2217,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; /* Merge over control information. This moves PSH/FIN etc. over */ - TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags; + TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; /* All done, get rid of second SKB and account for it so * packet counting does not break. @@ -1818,22 +2235,22 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) } /* Check if coalescing SKBs is legal. */ -static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) +static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) - return 0; + return false; /* TODO: SACK collapsing could be used to remove this condition */ if (skb_shinfo(skb)->nr_frags != 0) - return 0; + return false; if (skb_cloned(skb)) - return 0; + return false; if (skb == tcp_send_head(sk)) - return 0; + return false; /* Some heurestics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) - return 0; + return false; - return 1; + return true; } /* Collapse packets in the retransmit queue to make to create @@ -1844,11 +2261,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = to, *tmp; - int first = 1; + bool first = true; if (!sysctl_tcp_retrans_collapse) return; - if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; tcp_for_write_queue_from_safe(skb, tmp, sk) { @@ -1858,7 +2275,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, space -= skb->len; if (first) { - first = 0; + first = false; continue; } @@ -1867,7 +2284,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, /* Punt if not enough space exists in the first SKB for * the data in the second */ - if (skb->len > skb_tailroom(to)) + if (skb->len > skb_availroom(to)) break; if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) @@ -1881,12 +2298,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, * state updates are done by the caller. Returns non-zero if an * error occurred which prevented the send. */ -int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); unsigned int cur_mss; - int err; /* Inconslusive MTU probe */ if (icsk->icsk_mtup.probe_size) { @@ -1917,8 +2333,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * case, when window is shrunk to zero. In this case * our retransmit serves as a zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) - && TCP_SKB_CB(skb)->seq != tp->snd_una) + if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && + TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; if (skb->len > cur_mss) { @@ -1928,6 +2344,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int oldpcount = tcp_skb_pcount(skb); if (unlikely(oldpcount > 1)) { + if (skb_unclone(skb, GFP_ATOMIC)) + return -ENOMEM; tcp_init_tso_segs(sk, skb, cur_mss); tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); } @@ -1940,12 +2358,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * since it is cheap to do so and saves bytes on the network. */ if (skb->len > 0 && - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { /* Reuse, even though it does some unnecessary work */ tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, - TCP_SKB_CB(skb)->flags); + TCP_SKB_CB(skb)->tcp_flags); skb->ip_summed = CHECKSUM_NONE; } } @@ -1955,7 +2373,25 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + /* make sure skb->data is aligned on arches that require it + * and check if ack-trimming & collapsing extended the headroom + * beyond what csum_start can cover. + */ + if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || + skb_headroom(skb) >= 0xFFFF)) { + struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, + GFP_ATOMIC); + return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } else { + return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + } +} + +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + int err = __tcp_retransmit_skb(sk, skb); if (err == 0) { /* Update global TCP statistics. */ @@ -1965,8 +2401,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { - if (net_ratelimit()) - printk(KERN_DEBUG "retrans_out leaked.\n"); + net_dbg_ratelimited("retrans_out leaked\n"); } #endif if (!tp->retrans_out) @@ -1978,12 +2413,14 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = TCP_SKB_CB(skb)->when; - tp->undo_retrans++; + tp->undo_retrans += tcp_skb_pcount(skb); /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). */ TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; + } else { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } return err; } @@ -1991,18 +2428,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Check if we forward retransmits are possible in the current * window/congestion state. */ -static int tcp_can_forward_retransmit(struct sock *sk) +static bool tcp_can_forward_retransmit(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* Forward retransmissions are possible only during Recovery. */ if (icsk->icsk_ca_state != TCP_CA_Recovery) - return 0; + return false; /* No forward retransmissions in Reno are possible. */ if (tcp_is_reno(tp)) - return 0; + return false; /* Yeah, we have to make difficult choice between forward transmission * and retransmission... Both ways have their merits... @@ -2013,9 +2450,9 @@ static int tcp_can_forward_retransmit(struct sock *sk) */ if (tcp_may_send_now(sk)) - return 0; + return false; - return 1; + return true; } /* This gets called after a retransmit timeout, and the initially @@ -2036,6 +2473,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) int mib_idx; int fwd_rexmitting = 0; + if (!tp->packets_out) + return; + if (!tp->lost_out) tp->retransmit_high = tp->snd_una; @@ -2104,8 +2544,12 @@ begin_fwd: if (tcp_retransmit_skb(sk, skb)) return; + NET_INC_STATS_BH(sock_net(sk), mib_idx); + if (tcp_in_cwnd_reduction(sk)) + tp->prr_out += tcp_skb_pcount(skb); + if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, @@ -2129,7 +2573,7 @@ void tcp_send_fin(struct sock *sk) mss_now = tcp_current_mss(sk); if (tcp_send_head(sk) != NULL) { - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(skb)->end_seq++; tp->write_seq++; } else { @@ -2146,7 +2590,7 @@ void tcp_send_fin(struct sock *sk) skb_reserve(skb, MAX_TCP_HEADER); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, - TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); + TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); @@ -2171,7 +2615,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), - TCPCB_FLAG_ACK | TCPCB_FLAG_RST); + TCPHDR_ACK | TCPHDR_RST); /* Send it off. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) @@ -2191,11 +2635,11 @@ int tcp_send_synack(struct sock *sk) struct sk_buff *skb; skb = tcp_write_queue_head(sk); - if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { - printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); + if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_debug("%s: wrong queue state\n", __func__); return -EFAULT; } - if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) { + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) @@ -2209,37 +2653,47 @@ int tcp_send_synack(struct sock *sk) skb = nskb; } - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; TCP_ECN_send_synack(tcp_sk(sk), skb); } TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } -/* Prepare a SYN-ACK. */ +/** + * tcp_make_synack - Prepare a SYN-ACK. + * sk: listener socket + * dst: dst entry attached to the SYNACK + * req: request_sock pointer + * + * Allocate one skb and build a SYNACK packet. + * @dst is consumed : Caller should not use it again. + */ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, - struct request_sock *req) + struct request_sock *req, + struct tcp_fastopen_cookie *foc) { + struct tcp_out_options opts; struct inet_request_sock *ireq = inet_rsk(req); struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th; - int tcp_header_size; - struct tcp_out_options opts; struct sk_buff *skb; struct tcp_md5sig_key *md5; - __u8 *md5_hash_location; + int tcp_header_size; int mss; skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); - if (skb == NULL) + if (unlikely(!skb)) { + dst_release(dst); return NULL; - + } /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); - skb_dst_set(skb, dst_clone(dst)); + skb_dst_set(skb, dst); + security_skb_owned_by(skb, sk); - mss = dst_metric(dst, RTAX_ADVMSS); + mss = dst_metric_advmss(dst); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) mss = tp->rx_opt.user_mss; @@ -2247,13 +2701,20 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, __u8 rcv_wscale; /* Set this up on the first call only */ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) + req->window_clamp = tcp_full_space(sk); + /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, - &rcv_wscale); + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; } @@ -2264,9 +2725,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, else #endif TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_header_size = tcp_synack_options(sk, req, mss, - skb, &opts, &md5) + - sizeof(struct tcphdr); + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, + foc) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2282,31 +2742,34 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, * not even correctly set) */ tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, - TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); + TCPHDR_SYN | TCPHDR_ACK); + th->seq = htonl(TCP_SKB_CB(skb)->seq); - th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); + /* XXX data is queued and acked as is. No buffer/window check */ + th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); + tcp_options_write((__be32 *)(th + 1), tp, &opts); th->doff = (tcp_header_size >> 2); - TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); + TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ if (md5) { - tcp_rsk(req)->af_specific->calc_md5_hash(md5_hash_location, + tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, md5, NULL, req, skb); } #endif return skb; } +EXPORT_SYMBOL(tcp_make_synack); /* Do all connect socket setups that can be done AF independent. */ -static void tcp_connect_init(struct sock *sk) +void tcp_connect_init(struct sock *sk) { - struct dst_entry *dst = __sk_dst_get(sk); + const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; @@ -2330,18 +2793,24 @@ static void tcp_connect_init(struct sock *sk) if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); - tp->advmss = dst_metric(dst, RTAX_ADVMSS); + tp->advmss = dst_metric_advmss(dst); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) tp->advmss = tp->rx_opt.user_mss; tcp_initialize_rcv_mss(sk); + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) + tp->window_clamp = tcp_full_space(sk); + tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, sysctl_tcp_window_scaling, - &rcv_wscale); + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; @@ -2353,23 +2822,137 @@ static void tcp_connect_init(struct sock *sk) tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; - tp->rcv_nxt = 0; - tp->rcv_wup = 0; - tp->copied_seq = 0; + tp->snd_nxt = tp->write_seq; + + if (likely(!tp->repair)) + tp->rcv_nxt = 0; + else + tp->rcv_tstamp = tcp_time_stamp; + tp->rcv_wup = tp->rcv_nxt; + tp->copied_seq = tp->rcv_nxt; inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } +static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + tcb->end_seq += skb->len; + skb_header_release(skb); + __tcp_add_write_queue_tail(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); + tp->write_seq = tcb->end_seq; + tp->packets_out += tcp_skb_pcount(skb); +} + +/* Build and send a SYN with data and (cached) Fast Open cookie. However, + * queue a data-only packet after the regular SYN, such that regular SYNs + * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges + * only the SYN sequence, the data are retransmitted in the first ACK. + * If cookie is not cached or other error occurs, falls back to send a + * regular SYN with Fast Open cookie request option. + */ +static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_fastopen_request *fo = tp->fastopen_req; + int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; + struct sk_buff *syn_data = NULL, *data; + unsigned long last_syn_loss = 0; + + tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ + tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, + &syn_loss, &last_syn_loss); + /* Recurring FO SYN losses: revert to regular handshake temporarily */ + if (syn_loss > 1 && + time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { + fo->cookie.len = -1; + goto fallback; + } + + if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) + fo->cookie.len = -1; + else if (fo->cookie.len <= 0) + goto fallback; + + /* MSS for SYN-data is based on cached MSS and bounded by PMTU and + * user-MSS. Reserve maximum option space for middleboxes that add + * private TCP options. The cost is reduced data space in SYN :( + */ + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) + tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - + MAX_TCP_OPTION_SPACE; + + syn_data = skb_copy_expand(syn, skb_headroom(syn), space, + sk->sk_allocation); + if (syn_data == NULL) + goto fallback; + + for (i = 0; i < iovlen && syn_data->len < space; ++i) { + struct iovec *iov = &fo->data->msg_iov[i]; + unsigned char __user *from = iov->iov_base; + int len = iov->iov_len; + + if (syn_data->len + len > space) + len = space - syn_data->len; + else if (i + 1 == iovlen) + /* No more data pending in inet_wait_for_connect() */ + fo->data = NULL; + + if (skb_add_data(syn_data, from, len)) + goto fallback; + } + + /* Queue a data-only packet after the regular SYN for retransmission */ + data = pskb_copy(syn_data, sk->sk_allocation); + if (data == NULL) + goto fallback; + TCP_SKB_CB(data)->seq++; + TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; + TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); + tcp_connect_queue_skb(sk, data); + fo->copied = data->len; + + if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { + tp->syn_data = (fo->copied > 0); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + goto done; + } + syn_data = NULL; + +fallback: + /* Send a regular SYN with Fast Open cookie request option */ + if (fo->cookie.len > 0) + fo->cookie.len = 0; + err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); + if (err) + tp->syn_fastopen = 0; + kfree_skb(syn_data); +done: + fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ + return err; +} + /* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; + int err; tcp_connect_init(sk); + if (unlikely(tp->repair)) { + tcp_finish_connect(sk, NULL); + return 0; + } + buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); if (unlikely(buff == NULL)) return -ENOBUFS; @@ -2377,19 +2960,16 @@ int tcp_connect(struct sock *sk) /* Reserve space for headers. */ skb_reserve(buff, MAX_TCP_HEADER); - tp->snd_nxt = tp->write_seq; - tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); + tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); + tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; + tcp_connect_queue_skb(sk, buff); TCP_ECN_send_syn(sk, buff); - /* Send it off. */ - TCP_SKB_CB(buff)->when = tcp_time_stamp; - tp->retrans_stamp = TCP_SKB_CB(buff)->when; - skb_header_release(buff); - __tcp_add_write_queue_tail(sk, buff); - sk->sk_wmem_queued += buff->truesize; - sk_mem_charge(sk, buff->truesize); - tp->packets_out += tcp_skb_pcount(buff); - tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); + /* Send off SYN; include data in Fast Open. */ + err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : + tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); + if (err == -ECONNREFUSED) + return err; /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. @@ -2403,6 +2983,7 @@ int tcp_connect(struct sock *sk) inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; } +EXPORT_SYMBOL(tcp_connect); /* Send out a delayed ack, the caller does the policy checking * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() @@ -2473,7 +3054,7 @@ void tcp_send_ack(struct sock *sk) * tcp_transmit_skb() will set the ownership to this * sock. */ - buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); if (buff == NULL) { inet_csk_schedule_ack(sk); inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; @@ -2484,11 +3065,11 @@ void tcp_send_ack(struct sock *sk) /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK); + tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); + tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } /* This routine sends a packet with an out of date sequence @@ -2508,7 +3089,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. */ - skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); if (skb == NULL) return -1; @@ -2518,11 +3099,20 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) * end to send an ack. Don't queue or clone SKB, just * send it. */ - tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK); + tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } +void tcp_send_window_probe(struct sock *sk) +{ + if (sk->sk_state == TCP_ESTABLISHED) { + tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; + tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq; + tcp_xmit_probe_skb(sk, 0); + } +} + /* Initiate keepalive or window probe from timer. */ int tcp_write_wakeup(struct sock *sk) { @@ -2548,13 +3138,13 @@ int tcp_write_wakeup(struct sock *sk) if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || skb->len > mss) { seg_size = min(seg_size, mss); - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; if (tcp_fragment(sk, skb, seg_size, mss)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb, mss); - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) @@ -2607,10 +3197,3 @@ void tcp_send_probe0(struct sock *sk) TCP_RTO_MAX); } } - -EXPORT_SYMBOL(tcp_select_initial_window); -EXPORT_SYMBOL(tcp_connect); -EXPORT_SYMBOL(tcp_make_synack); -EXPORT_SYMBOL(tcp_simple_retransmit); -EXPORT_SYMBOL(tcp_sync_mss); -EXPORT_SYMBOL(tcp_mtup_init); |