From 7c85af8810448d8ef59331be51e482413b5f503d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 24 Sep 2015 17:16:05 -0700 Subject: tcp: avoid reorders for TFO passive connections We found that a TCP Fast Open passive connection was vulnerable to reorders, as the exchange might look like [1] C -> S S [2] S -> C S. ack request [3] S -> C . packets [2] and [3] can be generated at almost the same time. If C receives the 3rd packet before the 2nd, it will drop it as the socket is in SYN_SENT state and expects a SYNACK. S will have to retransmit the answer. Current OOO avoidance in linux is defeated because SYNACK packets are attached to the LISTEN socket, while DATA packets are attached to the children. They might be sent by different cpus, and different TX queues might be selected. It turns out that for TFO, we created a child, which is a full blown socket in TCP_SYN_RECV state, and we simply can attach the SYNACK packet to this socket. This means that at the time tcp_sendmsg() pushes DATA packet, skb->ooo_okay will be set iff the SYNACK packet had been sent and TX completed. This removes the reorder source at the host level. We also removed the export of tcp_try_fastopen(), as it is no longer called from IPv6. Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'net/ipv4/tcp_fastopen.c') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f9c0fb84e435..db43c6286cf7 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -124,10 +124,10 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, return false; } -static bool tcp_fastopen_create_child(struct sock *sk, - struct sk_buff *skb, - struct dst_entry *dst, - struct request_sock *req) +static struct sock *tcp_fastopen_create_child(struct sock *sk, + struct sk_buff *skb, + struct dst_entry *dst, + struct request_sock *req) { struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; @@ -140,7 +140,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); if (!child) - return false; + return NULL; spin_lock(&queue->fastopenq->lock); queue->fastopenq->qlen++; @@ -216,9 +216,11 @@ static bool tcp_fastopen_create_child(struct sock *sk, tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; sk->sk_data_ready(sk); bh_unlock_sock(child); - sock_put(child); + /* Note: sock_put(child) will be done by tcp_conn_request() + * after SYNACK packet is sent. + */ WARN_ON(!req->sk); - return true; + return child; } static bool tcp_fastopen_queue_check(struct sock *sk) @@ -261,13 +263,14 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). */ -bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct dst_entry *dst) +struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct dst_entry *dst) { struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); @@ -276,7 +279,7 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; - return false; + return NULL; } if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) @@ -296,11 +299,12 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, * data in SYN_RECV state. */ fastopen: - if (tcp_fastopen_create_child(sk, skb, dst, req)) { + child = tcp_fastopen_create_child(sk, skb, dst, req); + if (child) { foc->len = -1; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); - return true; + return child; } NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else if (foc->len > 0) /* Client presents an invalid cookie */ @@ -308,6 +312,5 @@ fastopen: valid_foc.exp = foc->exp; *foc = valid_foc; - return false; + return NULL; } -EXPORT_SYMBOL(tcp_try_fastopen); -- cgit From 0536fcc039a8926ec12ec587f41a83f7acafeb82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Sep 2015 07:42:52 -0700 Subject: tcp: prepare fastopen code for upcoming listener changes While auditing TCP stack for upcoming 'lockless' listener changes, I found I had to change fastopen_init_queue() to properly init the object before publishing it. Otherwise an other cpu could try to lock the spinlock before it gets properly initialized. Instead of adding appropriate barriers, just remove dynamic memory allocations : - Structure is 28 bytes on 64bit arches. Using additional 8 bytes for holding a pointer seems overkill. - Two listeners can share same cache line and performance would suffer. If we really want to save few bytes, we would instead dynamically allocate whole struct request_sock_queue in the future. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_fastopen.c') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index db43c6286cf7..f69f436fcbcc 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -142,9 +142,9 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, if (!child) return NULL; - spin_lock(&queue->fastopenq->lock); - queue->fastopenq->qlen++; - spin_unlock(&queue->fastopenq->lock); + spin_lock(&queue->fastopenq.lock); + queue->fastopenq.qlen++; + spin_unlock(&queue->fastopenq.lock); /* Initialize the child socket. Have to fix some values to take * into account the child is a Fast Open socket and is created @@ -237,8 +237,8 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * between qlen overflow causing Fast Open to be disabled * temporarily vs a server not supporting Fast Open at all. */ - fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; - if (!fastopenq || fastopenq->max_qlen == 0) + fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; + if (fastopenq->max_qlen == 0) return false; if (fastopenq->qlen >= fastopenq->max_qlen) { -- cgit From ca6fb06518836ef9b65dc0aac02ff97704d52a05 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2015 11:43:35 -0700 Subject: tcp: attach SYNACK messages to request sockets instead of listener If a listen backlog is very big (to avoid syncookies), then the listener sk->sk_wmem_alloc is the main source of false sharing, as we need to touch it twice per SYNACK re-transmit and TX completion. (One SYN packet takes listener lock once, but up to 6 SYNACK are generated) By attaching the skb to the request socket, we remove this source of contention. Tested: listen(fd, 10485760); // single listener (no SO_REUSEPORT) 16 RX/TX queue NIC Sustain a SYNFLOOD attack of ~320,000 SYN per second, Sending ~1,400,000 SYNACK per second. Perf profiles now show listener spinlock being next bottleneck. 20.29% [kernel] [k] queued_spin_lock_slowpath 10.06% [kernel] [k] __inet_lookup_established 5.12% [kernel] [k] reqsk_timer_handler 3.22% [kernel] [k] get_next_timer_interrupt 3.00% [kernel] [k] tcp_make_synack 2.77% [kernel] [k] ipt_do_table 2.70% [kernel] [k] run_timer_softirq 2.50% [kernel] [k] ip_finish_output 2.04% [kernel] [k] cascade Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_fastopen.c') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f69f436fcbcc..410ac481fda0 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -161,13 +161,13 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tp->snd_wnd = ntohs(tcp_hdr(skb)->window); /* Activate the retrans timer so that SYNACK can be retransmitted. - * The request socket is not added to the SYN table of the parent + * The request socket is not added to the ehash * because it's been added to the accept queue directly. */ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); - atomic_set(&req->rsk_refcnt, 1); + atomic_set(&req->rsk_refcnt, 2); /* Add the child socket directly into the accept queue */ inet_csk_reqsk_queue_add(sk, req, child); -- cgit From 7656d842de93fd2d2de7b403062cad757cadf1df Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 4 Oct 2015 21:08:07 -0700 Subject: tcp: fix fastopen races vs lockless listener There are multiple races that need fixes : 1) skb_get() + queue skb + kfree_skb() is racy An accept() can be done on another cpu, data consumed immediately. tcp_recvmsg() uses __kfree_skb() as it is assumed all skb found in socket receive queue are private. Then the kfree_skb() in tcp_rcv_state_process() uses an already freed skb 2) tcp_reqsk_record_syn() needs to be done before tcp_try_fastopen() for the same reasons. 3) We want to send the SYNACK before queueing child into accept queue, otherwise we might reintroduce the ooo issue fixed in commit 7c85af881044 ("tcp: avoid reorders for TFO passive connections") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) (limited to 'net/ipv4/tcp_fastopen.c') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 410ac481fda0..93396bf7b475 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -168,8 +168,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, TCP_TIMEOUT_INIT, TCP_RTO_MAX); atomic_set(&req->rsk_refcnt, 2); - /* Add the child socket directly into the accept queue */ - inet_csk_reqsk_queue_add(sk, req, child); /* Now finish processing the fastopen child socket. */ inet_csk(child)->icsk_af_ops->rebuild_header(child); @@ -178,12 +176,10 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_init_metrics(child); tcp_init_buffer_space(child); - /* Queue the data carried in the SYN packet. We need to first - * bump skb's refcnt because the caller will attempt to free it. - * Note that IPv6 might also have used skb_get() trick - * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts) - * So we need to eventually get a clone of the packet, - * before inserting it in sk_receive_queue. + /* Queue the data carried in the SYN packet. + * We used to play tricky games with skb_get(). + * With lockless listener, it is a dead end. + * Do not think about it. * * XXX (TFO) - we honor a zero-payload TFO request for now, * (any reason not to?) but no need to queue the skb since @@ -191,12 +187,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, */ end_seq = TCP_SKB_CB(skb)->end_seq; if (end_seq != TCP_SKB_CB(skb)->seq + 1) { - struct sk_buff *skb2; - - if (unlikely(skb_shared(skb))) - skb2 = skb_clone(skb, GFP_ATOMIC); - else - skb2 = skb_get(skb); + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (likely(skb2)) { skb_dst_drop(skb2); @@ -214,12 +205,9 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, } } tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; - sk->sk_data_ready(sk); - bh_unlock_sock(child); - /* Note: sock_put(child) will be done by tcp_conn_request() - * after SYNACK packet is sent. + /* tcp_conn_request() is sending the SYNACK, + * and queues the child into listener accept queue. */ - WARN_ON(!req->sk); return child; } -- cgit