diff options
Diffstat (limited to 'net')
-rw-r--r-- | net/ipv4/tcp.c | 24 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 10 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 23 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 1 |
5 files changed, 52 insertions, 8 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bd0856ac680a..5542e3d778e6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2736,6 +2736,21 @@ static int tcp_repair_options_est(struct sock *sk,
 	return 0;
 }
 
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+EXPORT_SYMBOL(tcp_tx_delay_enabled);
+
+static void tcp_enable_tx_delay(void)
+{
+	if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+		static int __tcp_tx_delay_enabled = 0;
+
+		if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
+			static_branch_enable(&tcp_tx_delay_enabled);
+			pr_info("TCP_TX_DELAY enabled\n");
+		}
+	}
+}
+
 /*
  *	Socket option code for TCP.
  */
@@ -3087,6 +3102,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			tp->recvmsg_inq = val;
 		break;
+	case TCP_TX_DELAY:
+		if (val)
+			tcp_enable_tx_delay();
+		tp->tcp_tx_delay = val;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -3546,6 +3566,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = tp->fastopen_no_cookie;
 		break;
 
+	case TCP_TX_DELAY:
+		val = tp->tcp_tx_delay;
+		break;
+
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp_raw() + tp->tsoffset;
 		break;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f059fbd81a84..1b7e9e1fbd3b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -767,9 +767,11 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
-	if (sk)
+	if (sk) {
 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+		tcp_set_tx_time(skb, sk);
+	}
 	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
@@ -859,9 +861,9 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
-	if (sk)
-		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
-				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+			   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	tcp_set_tx_time(skb, sk);
 	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 11011e8386dc..8bcaf2586b68 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -274,7 +274,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 		tcptw->tw_ts_offset = tp->tsoffset;
 		tcptw->tw_last_oow_ack_time = 0;
-
+		tcptw->tw_tx_delay = tp->tcp_tx_delay;
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f429e856e263..d954ff9069e8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1153,6 +1153,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
 			       sizeof(struct inet6_skb_parm)));
 
+	tcp_add_tx_delay(skb, tp);
+
 	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
 
 	if (unlikely(err > 0)) {
@@ -2234,6 +2236,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 		      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
 	limit <<= factor;
 
+	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
+	    tcp_sk(sk)->tcp_tx_delay) {
+		u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
+
+		/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
+		 * approximate our needs assuming an ~100% skb->truesize overhead.
+		 * USEC_PER_SEC is approximated by 2^20.
+		 * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
+		 */
+		extra_bytes >>= (20 - 1);
+		limit += extra_bytes;
+	}
 	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
 		/* Always send skb if rtx queue is empty.
 		 * No need to wait for TX completion to call us back,
@@ -3212,6 +3226,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	int tcp_header_size;
 	struct tcphdr *th;
 	int mss;
+	u64 now;
 
 	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
 	if (unlikely(!skb)) {
@@ -3243,13 +3258,14 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	memset(&opts, 0, sizeof(opts));
+	now = tcp_clock_ns();
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
 		skb->skb_mstamp_ns = cookie_init_timestamp(req);
 	else
 #endif
 	{
-		skb->skb_mstamp_ns = tcp_clock_ns();
+		skb->skb_mstamp_ns = now;
 		if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
 			tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
 	}
@@ -3292,8 +3308,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	rcu_read_unlock();
 #endif
 
-	/* Do not fool tcpdump (if any), clean our debris */
-	skb->tstamp = 0;
+	skb->skb_mstamp_ns = now;
+	tcp_add_tx_delay(skb, tp);
+
 	return skb;
 }
 EXPORT_SYMBOL(tcp_make_synack);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ad7039137a20..5606b2131b65 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -892,6 +892,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 		} else {
 			mark = sk->sk_mark;
 		}
+		tcp_set_tx_time(buff, sk);
 	}
 	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
 	fl6.fl6_dport = t1->dest;