From 10d3be569243def8d92ac3722395ef5a59c504e6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Apr 2016 10:55:23 -0700 Subject: tcp-tso: do not split TSO packets at retransmit time Linux TCP stack painfully segments all TSO/GSO packets before retransmits. This was fine back in the days when TSO/GSO were emerging, with their bugs, but we believe the dark age is over. Keeping big packets in write queues, but also in stack traversal has a lot of benefits. - Less memory overhead, because write queues have less skbs - Less cpu overhead at ACK processing. - Better SACK processing, as lot of studies mentioned how awful linux was at this ;) - Less cpu overhead to send the rtx packets (IP stack traversal, netfilter traversal, drivers...) - Better latencies in presence of losses. - Smaller spikes in fq like packet schedulers, as retransmits are not constrained by TCP Small Queues. 1 % packet losses are common today, and at 100Gbit speeds, this translates to ~80,000 losses per second. Losses are often correlated, and we see many retransmit events leading to 1-MSS train of packets, at the time hosts are already under stress. Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 49bc474f8e35..373b03e78aaa 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -404,7 +404,7 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk); - tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1); __sk_dst_reset(sk); goto out_reset_timer; } @@ -436,7 +436,7 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); - if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { + if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * do not backoff. */ -- cgit From 02a1d6e7a6bb025a77da77012190e1efc1970f1c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Apr 2016 16:44:39 -0700 Subject: net: rename NET_{ADD|INC}_STATS_BH() Rename NET_INC_STATS_BH() to __NET_INC_STATS() and NET_ADD_STATS_BH() to __NET_ADD_STATS() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 373b03e78aaa..35f643d8ffbb 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -30,7 +30,7 @@ static void tcp_write_err(struct sock *sk) sk->sk_error_report(sk); tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } /* Do not allow orphaned sockets to eat all our resources. @@ -68,7 +68,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } return 0; @@ -162,8 +162,8 @@ static int tcp_write_timeout(struct sock *sk) if (tp->syn_fastopen || tp->syn_data) tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (tp->syn_data && icsk->icsk_retransmits == 1) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; syn_set = true; @@ -178,8 +178,8 @@ static int tcp_write_timeout(struct sock *sk) tp->bytes_acked <= tp->rx_opt.mss_clamp) { tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); } /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -228,7 +228,7 @@ void tcp_delack_timer_handler(struct sock *sk) if (!skb_queue_empty(&tp->ucopy.prequeue)) { struct sk_buff *skb; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) sk_backlog_rcv(sk, skb); @@ -248,7 +248,7 @@ void tcp_delack_timer_handler(struct sock *sk) icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_send_ack(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } out: @@ -265,7 +265,7 @@ static void tcp_delack_timer(unsigned long data) tcp_delack_timer_handler(sk); } else { inet_csk(sk)->icsk_ack.blocked = 1; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) sock_hold(sk); @@ -431,7 +431,7 @@ void tcp_retransmit_timer(struct sock *sk) } else { mib_idx = LINUX_MIB_TCPTIMEOUTS; } - NET_INC_STATS_BH(sock_net(sk), mib_idx); + __NET_INC_STATS(sock_net(sk), mib_idx); } tcp_enter_loss(sk); @@ -549,7 +549,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req) { struct net *net = read_pnet(&inet_rsk(req)->ireq_net); - NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS); + __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS); } EXPORT_SYMBOL(tcp_syn_ack_timeout); -- cgit From c10d9310edf5aa4a676991139d1a43ec7d87e56b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Apr 2016 14:16:47 -0700 Subject: tcp: do not assume TCP code is non preemptible We want to to make TCP stack preemptible, as draining prequeue and backlog queues can take lot of time. Many SNMP updates were assuming that BH (and preemption) was disabled. Need to convert some __NET_INC_STATS() calls to NET_INC_STATS() and some __TCP_INC_STATS() to TCP_INC_STATS() Before using this_cpu_ptr(net->ipv4.tcp_sk) in tcp_v4_send_reset() and tcp_v4_send_ack(), we add an explicit preempt disabled section. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 35f643d8ffbb..debdd8b33e69 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -162,8 +162,8 @@ static int tcp_write_timeout(struct sock *sk) if (tp->syn_fastopen || tp->syn_data) tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (tp->syn_data && icsk->icsk_retransmits == 1) - __NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; syn_set = true; @@ -178,8 +178,8 @@ static int tcp_write_timeout(struct sock *sk) tp->bytes_acked <= tp->rx_opt.mss_clamp) { tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) - __NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); } /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -209,6 +209,7 @@ static int tcp_write_timeout(struct sock *sk) return 0; } +/* Called with BH disabled */ void tcp_delack_timer_handler(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -493,6 +494,7 @@ out_reset_timer: out:; } +/* Called with BH disabled */ void tcp_write_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); -- cgit