From 39771b127b412377d6354893c7d43ee8f2edecfd Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Sat, 2 Apr 2016 23:08:06 -0400
Subject: sock: break up sock_cmsg_snd into __sock_cmsg_snd and loop

To process cmsg's of the SOL_SOCKET level in addition to cmsgs of
another level, protocols can call sock_cmsg_send(). This causes a
double walk on the cmsghdr list, one for SOL_SOCKET and one for the
other level.

Extract the inner demultiplex logic from the loop that walks the list,
to allow having this called directly from a walker in the protocol
specific code.

Signed-off-by: Willem de Bruijn
Signed-off-by: Soheil Hassas Yeganeh
Signed-off-by: David S. Miller
---
 net/core/sock.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index b67b9aedb230..66976f88566b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1866,27 +1866,38 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 }
 EXPORT_SYMBOL(sock_alloc_send_skb);
 
+int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
+		     struct sockcm_cookie *sockc)
+{
+	switch (cmsg->cmsg_type) {
+	case SO_MARK:
+		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+			return -EPERM;
+		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+			return -EINVAL;
+		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(__sock_cmsg_send);
+
 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
 		   struct sockcm_cookie *sockc)
 {
 	struct cmsghdr *cmsg;
+	int ret;
 
 	for_each_cmsghdr(cmsg, msg) {
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
 		if (cmsg->cmsg_level != SOL_SOCKET)
 			continue;
-		switch (cmsg->cmsg_type) {
-		case SO_MARK:
-			if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
-				return -EPERM;
-			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
-				return -EINVAL;
-			sockc->mark = *(u32 *)CMSG_DATA(cmsg);
-			break;
-		default:
-			return -EINVAL;
-		}
+		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
+		if (ret)
+			return ret;
 	}
 	return 0;
 }
-- cgit
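For reference, the switch above is reached when a sender attaches a SOL_SOCKET control message to sendmsg(). The sketch below is illustrative only: it is not part of the patch, it assumes a protocol whose sendmsg path feeds its cmsg list to sock_cmsg_send(), and send_with_mark() is a hypothetical helper name.

/* Hypothetical userspace helper: attach SO_MARK to one sendmsg() call.
 * Mirrors the kernel checks above: exactly a u32 payload, and the
 * caller needs CAP_NET_ADMIN in the socket's network namespace.
 */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t send_with_mark(int fd, void *buf, size_t len,
			      struct sockaddr *dst, socklen_t dlen,
			      uint32_t mark)
{
	union {				/* force cmsghdr alignment */
		char buf[CMSG_SPACE(sizeof(uint32_t))];
		struct cmsghdr align;
	} u = { 0 };
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_name = dst, .msg_namelen = dlen,
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type  = SO_MARK;
	cm->cmsg_len   = CMSG_LEN(sizeof(mark));
	memcpy(CMSG_DATA(cm), &mark, sizeof(mark));

	return sendmsg(fd, &msg, 0);
}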
From 6db8b963a7a31047573f229492ff6fc0f51cc377 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh
Date: Sat, 2 Apr 2016 23:08:07 -0400
Subject: tcp: accept SOF_TIMESTAMPING_OPT_ID for passive TFO

SOF_TIMESTAMPING_OPT_ID is set to get data-independent IDs to
associate timestamps with send calls. For TCP connections,
tp->snd_una is used as the starting point to calculate relative IDs.

This socket option will fail if set before the handshake on a passive
TCP fast open connection with data in SYN or SYN/ACK, since setsockopt
requires the connection to be in the ESTABLISHED state.

To address this, instead of limiting the option to the ESTABLISHED
state, accept the SOF_TIMESTAMPING_OPT_ID option as long as the
connection is not in LISTEN or CLOSE states.

Signed-off-by: Soheil Hassas Yeganeh
Acked-by: Willem de Bruijn
Acked-by: Yuchung Cheng
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/sock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 66976f88566b..0a64fe20ce5a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -832,7 +832,8 @@ set_rcvbuf:
 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 			if (sk->sk_protocol == IPPROTO_TCP &&
 			    sk->sk_type == SOCK_STREAM) {
-				if (sk->sk_state != TCP_ESTABLISHED) {
+				if ((1 << sk->sk_state) &
+				    (TCPF_CLOSE | TCPF_LISTEN)) {
 					ret = -EINVAL;
 					break;
 				}
-- cgit

From 3dd17e63f5131bf2528f34aa5e3e57758175af92 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh
Date: Sat, 2 Apr 2016 23:08:09 -0400
Subject: sock: accept SO_TIMESTAMPING flags in socket cmsg

Accept SO_TIMESTAMPING in control messages of the SOL_SOCKET level
as a basis to accept timestamping requests per write.

This implementation only accepts TX recording flags (i.e.,
SOF_TIMESTAMPING_TX_HARDWARE, SOF_TIMESTAMPING_TX_SOFTWARE,
SOF_TIMESTAMPING_TX_SCHED, and SOF_TIMESTAMPING_TX_ACK) in control
messages. Users need to set reporting flags (e.g.,
SOF_TIMESTAMPING_OPT_ID) per socket via socket options.

This commit adds a tsflags field in sockcm_cookie which is set in
__sock_cmsg_send. It only overrides the SOF_TIMESTAMPING_TX_* bits in
sockcm_cookie.tsflags, allowing the control message to override the
recording behavior per write, yet maintaining the value of other
flags.

This patch implements validating the control message and setting
tsflags in struct sockcm_cookie. Later commits in this series will
actually implement timestamping per write for different protocols.

Signed-off-by: Soheil Hassas Yeganeh
Acked-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/core/sock.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 0a64fe20ce5a..315f5e57fffe 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1870,6 +1870,8 @@ EXPORT_SYMBOL(sock_alloc_send_skb);
 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
 		     struct sockcm_cookie *sockc)
 {
+	u32 tsflags;
+
 	switch (cmsg->cmsg_type) {
 	case SO_MARK:
 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
@@ -1878,6 +1880,17 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
 			return -EINVAL;
 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
 		break;
+	case SO_TIMESTAMPING:
+		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+			return -EINVAL;
+
+		tsflags = *(u32 *)CMSG_DATA(cmsg);
+		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
+			return -EINVAL;
+
+		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
+		sockc->tsflags |= tsflags;
+		break;
 	default:
 		return -EINVAL;
 	}
-- cgit
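To illustrate the intended use of the new SO_TIMESTAMPING case, a userspace write could request TX timestamps for a single call roughly as follows. This is a sketch, not part of the patch; send_timestamped() is a hypothetical helper, and the per-socket reporting flags (e.g. SOF_TIMESTAMPING_SOFTWARE and SOF_TIMESTAMPING_OPT_ID) are assumed to have been enabled with setsockopt() beforehand, as the commit message requires.

/* Hypothetical userspace helper: request TX timestamps for this write
 * only. Only the SOF_TIMESTAMPING_TX_* recording flags are accepted in
 * the cmsg; reporting flags remain per-socket.
 */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/net_tstamp.h>

static ssize_t send_timestamped(int fd, const void *buf, size_t len)
{
	uint32_t tsflags = SOF_TIMESTAMPING_TX_SCHED |
			   SOF_TIMESTAMPING_TX_SOFTWARE |
			   SOF_TIMESTAMPING_TX_ACK;
	union {				/* force cmsghdr alignment */
		char buf[CMSG_SPACE(sizeof(uint32_t))];
		struct cmsghdr align;
	} u = { 0 };
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {		/* connected socket assumed */
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type  = SO_TIMESTAMPING;
	cm->cmsg_len   = CMSG_LEN(sizeof(tsflags));
	memcpy(CMSG_DATA(cm), &tsflags, sizeof(tsflags));

	return sendmsg(fd, &msg, 0);
}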
From a4298e4522d687a79af8f8fbb7eca68399ab2d81 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 1 Apr 2016 08:52:12 -0700
Subject: net: add SOCK_RCU_FREE socket flag

We want a generic way to insert an RCU grace period before socket
freeing for cases where SLAB_DESTROY_BY_RCU is adding too much
overhead.

SLAB_DESTROY_BY_RCU strict rules force us to take a reference on the
socket sk_refcnt, and it is a performance problem for UDP
encapsulation, or TCP synflood behavior, as many CPUs might attempt
the atomic operations on a shared sk_refcnt.

UDP sockets and TCP listeners can set SOCK_RCU_FREE so that their
lookup can use traditional RCU rules, without refcount changes.
They can set the flag only once hashed and visible to other CPUs.

Signed-off-by: Eric Dumazet
Cc: Tom Herbert
Tested-by: Tom Herbert
Signed-off-by: David S. Miller
---
 net/core/sock.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 315f5e57fffe..7a6a063b28b3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1419,8 +1419,12 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 }
 EXPORT_SYMBOL(sk_alloc);
 
-void sk_destruct(struct sock *sk)
+/* Sockets having SOCK_RCU_FREE will call this function after one RCU
+ * grace period. This is the case for UDP sockets and TCP listeners.
+ */
+static void __sk_destruct(struct rcu_head *head)
 {
+	struct sock *sk = container_of(head, struct sock, sk_rcu);
 	struct sk_filter *filter;
 
 	if (sk->sk_destruct)
@@ -1449,6 +1453,14 @@ void sk_destruct(struct sock *sk)
 	sk_prot_free(sk->sk_prot_creator, sk);
 }
 
+void sk_destruct(struct sock *sk)
+{
+	if (sock_flag(sk, SOCK_RCU_FREE))
+		call_rcu(&sk->sk_rcu, __sk_destruct);
+	else
+		__sk_destruct(&sk->sk_rcu);
+}
+
 static void __sk_free(struct sock *sk)
 {
 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
-- cgit

From 9caad864151e525929d323de96cad382da49c3b2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 1 Apr 2016 08:52:20 -0700
Subject: tcp: increment sk_drops for listeners

Goal: packets dropped by a listener are accounted for.

This adds a tcp_listendrop() helper, and clears sk_drops in
sk_clone_lock() so that children do not inherit their parent drop
count.

Note that we no longer increment the LINUX_MIB_LISTENDROPS counter
when sending a SYNCOOKIE, since the SYN packet generated a SYNACK.
We already have a separate LINUX_MIB_SYNCOOKIESSENT counter.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/sock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 7a6a063b28b3..2f517ea56786 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1525,6 +1525,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_dst_cache = NULL;
 		newsk->sk_wmem_queued = 0;
 		newsk->sk_forward_alloc = 0;
+		atomic_set(&newsk->sk_drops, 0);
 		newsk->sk_send_head = NULL;
 		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
 
-- cgit
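For context on the SOCK_RCU_FREE flag introduced above, the intended usage pattern is that a protocol sets the flag before publishing the socket into its hash table, after which lookups can stay refcount-free under rcu_read_lock(). The following is a kernel-side sketch only, with hypothetical function names; it is not code from these patches.

/* Sketch only: my_proto_publish()/my_proto_port_in_use() and "slot"
 * are hypothetical stand-ins for protocol hash-table code; only
 * sock_set_flag()/SOCK_RCU_FREE come from this series.
 */
static void my_proto_publish(struct sock *sk, struct hlist_head *slot)
{
	sock_set_flag(sk, SOCK_RCU_FREE);	/* freed via call_rcu() */
	sk_add_node_rcu(sk, slot);		/* now visible to readers */
}

static bool my_proto_port_in_use(struct hlist_head *slot, __be16 port)
{
	struct sock *sk;
	bool ret = false;

	rcu_read_lock();
	sk_for_each_rcu(sk, slot) {
		if (inet_sk(sk)->inet_num == ntohs(port)) {
			ret = true;	/* sk used without sock_hold() */
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}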
From e6afc8ace6dd5cef5e812f26c72579da8806f5ac Mon Sep 17 00:00:00 2001
From: samanthakumar
Date: Tue, 5 Apr 2016 12:41:15 -0400
Subject: udp: remove headers from UDP packets before queueing

Remove UDP transport headers before queueing packets for reception.
This change simplifies a follow-up patch to add MSG_PEEK support.

Signed-off-by: Sam Kumar
Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/core/sock.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 2f517ea56786..e12197b359fd 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -402,9 +402,8 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 	}
 }
 
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
-	int err;
 	unsigned long flags;
 	struct sk_buff_head *list = &sk->sk_receive_queue;
 
@@ -414,10 +413,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		return -ENOMEM;
 	}
 
-	err = sk_filter(sk, skb);
-	if (err)
-		return err;
-
 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 		atomic_inc(&sk->sk_drops);
 		return -ENOBUFS;
@@ -440,6 +435,18 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		sk->sk_data_ready(sk);
 	return 0;
 }
+EXPORT_SYMBOL(__sock_queue_rcv_skb);
+
+int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int err;
+
+	err = sk_filter(sk, skb);
+	if (err)
+		return err;
+
+	return __sock_queue_rcv_skb(sk, skb);
+}
 EXPORT_SYMBOL(sock_queue_rcv_skb);
 
 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
-- cgit

From 627d2d6b550094d88f9e518e15967e7bf906ebbf Mon Sep 17 00:00:00 2001
From: samanthakumar
Date: Tue, 5 Apr 2016 12:41:16 -0400
Subject: udp: enable MSG_PEEK at non-zero offset

Enable peeking at UDP datagrams at the offset specified with socket
option SOL_SOCKET/SO_PEEK_OFF. Peek at any datagram in the queue, up
to the end of the given datagram.

Implement the SO_PEEK_OFF semantics introduced in commit ef64a54f6e55
("sock: Introduce the SO_PEEK_OFF sock option"). Increase the offset
on peek, decrease it on regular reads.

When peeking, always checksum the packet immediately, to avoid
recomputation on subsequent peeks and final read.

The socket lock is not held for the duration of udp_recvmsg, so peek
and read operations can run concurrently. Only the last store to
sk_peek_off is preserved.

Signed-off-by: Sam Kumar
Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/core/sock.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index e12197b359fd..2ce76e82857f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2187,6 +2187,15 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
 
+int sk_set_peek_off(struct sock *sk, int val)
+{
+	if (val < 0)
+		return -EINVAL;
+
+	sk->sk_peek_off = val;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sk_set_peek_off);
 
 /*
  * Set of default routines for initialising struct proto_ops when
-- cgit
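A userspace view of the new semantics (a sketch, not from the patch): enabling SO_PEEK_OFF on a UDP socket makes successive MSG_PEEK reads walk forward through the receive queue instead of re-reading the same data, while a normal read consumes data and decreases the offset.

/* Userspace sketch: peek a UDP socket's queue at a kernel-maintained
 * offset, then consume. Assumes a kernel with this series applied
 * (previously SO_PEEK_OFF only took effect for AF_UNIX sockets).
 */
#include <stdio.h>
#include <sys/socket.h>

static void peek_then_read(int udp_fd)
{
	char buf[2048];
	int off = 0;		/* enable peeking with offset, start at 0 */
	ssize_t n;

	if (setsockopt(udp_fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off)))
		perror("SO_PEEK_OFF");

	/* Each peek advances the offset, so repeated peeks observe
	 * later data in the queue rather than the same bytes again.
	 */
	n = recv(udp_fd, buf, 512, MSG_PEEK);
	printf("peeked %zd bytes\n", n);
	n = recv(udp_fd, buf, 512, MSG_PEEK);
	printf("peeked %zd more bytes\n", n);

	/* A regular read consumes data and decreases the offset. */
	n = recv(udp_fd, buf, sizeof(buf), 0);
	printf("read %zd bytes\n", n);
}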
From 61881cfb5ad80c1d0a46ca6d08b7e271892b2ff6 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Tue, 5 Apr 2016 17:10:14 +0200
Subject: sock: fix lockdep annotation in release_sock

During release_sock we use callbacks to finish the processing of
outstanding skbs on the socket. We actually are still locked,
sk->sk_lock.owned == 1, but we already told lockdep that the mutex
is released.

This could lead to false positives in lockdep for
lockdep_sock_is_held (we don't hold the slock spinlock during
processing the outstanding skbs).

I took over this patch from Eric Dumazet and tested it.

Signed-off-by: Eric Dumazet
Signed-off-by: Hannes Frederic Sowa
Signed-off-by: David S. Miller
---
 net/core/sock.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 2ce76e82857f..152274d188ef 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2483,11 +2483,6 @@ EXPORT_SYMBOL(lock_sock_nested);
 
 void release_sock(struct sock *sk)
 {
-	/*
-	 * The sk_lock has mutex_unlock() semantics:
-	 */
-	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
-
 	spin_lock_bh(&sk->sk_lock.slock);
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
-- cgit

From 5413d1babe8f10de13d72496c12b862eef8ba613 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 29 Apr 2016 14:16:52 -0700
Subject: net: do not block BH while processing socket backlog

Socket backlog processing is a major latency source.

With current TCP socket sk_rcvbuf limits, I have sampled
__release_sock() holding the cpu for more than 5 ms, and packets
being dropped by the NIC once the ring buffer is filled.

All users are now ready to be called from process context; we can
unblock BH and let interrupts be serviced faster.

cond_resched_softirq() could be removed, as it has no more users.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 net/core/sock.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index e16a5db853c6..70744dbb6c3f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2019,33 +2019,27 @@ static void __release_sock(struct sock *sk)
 	__releases(&sk->sk_lock.slock)
 	__acquires(&sk->sk_lock.slock)
 {
-	struct sk_buff *skb = sk->sk_backlog.head;
+	struct sk_buff *skb, *next;
 
-	do {
+	while ((skb = sk->sk_backlog.head) != NULL) {
 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
-		bh_unlock_sock(sk);
 
-		do {
-			struct sk_buff *next = skb->next;
+		spin_unlock_bh(&sk->sk_lock.slock);
 
+		do {
+			next = skb->next;
 			prefetch(next);
 			WARN_ON_ONCE(skb_dst_is_noref(skb));
 			skb->next = NULL;
 			sk_backlog_rcv(sk, skb);
 
-			/*
-			 * We are in process context here with softirqs
-			 * disabled, use cond_resched_softirq() to preempt.
-			 * This is safe to do because we've taken the backlog
-			 * queue private:
-			 */
-			cond_resched_softirq();
+			cond_resched();
 
 			skb = next;
 		} while (skb != NULL);
 
-		bh_lock_sock(sk);
-	} while ((skb = sk->sk_backlog.head) != NULL);
+		spin_lock_bh(&sk->sk_lock.slock);
+	}
 
 	/*
 	 * Doing the zeroing here guarantee we can not loop forever
-- cgit
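For context, the backlog drained above is filled from the softirq receive path while a process owns the socket. The pattern is roughly the following kernel-side sketch, modeled loosely on tcp_v4_rcv(); it is simplified, not part of these patches, and my_proto_rcv_one() is a hypothetical name.

/* Sketch: how a protocol's softirq path feeds the backlog that
 * __release_sock() later drains in process context.
 */
static int my_proto_rcv_one(struct sock *sk, struct sk_buff *skb)
{
	int ret = 0;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		ret = sk_backlog_rcv(sk, skb);		/* process now */
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);		/* backlog full */
		kfree_skb(skb);
		return -ENOBUFS;
	}
	bh_unlock_sock(sk);
	return ret;
}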
From d41a69f1d390fa3f2546498103cdcd78b30676ff Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 29 Apr 2016 14:16:53 -0700
Subject: tcp: make tcp_sendmsg() aware of socket backlog

Large sendmsg()/write() calls hold the socket lock for the duration
of the call, unless the sk->sk_sndbuf limit is hit. This is bad
because incoming packets are parked into the socket backlog for a
long time. Critical decisions like fast retransmit might be delayed.
Receivers have to maintain a big out-of-order queue, with additional
cpu overhead, and also possible stalls in TX once windows are full.

Bidirectional flows are particularly hurt since the backlog can
become quite big if the copy from user space triggers IO (page
faults).

Some applications learnt to use sendmsg() (or sendmmsg()) with small
chunks to avoid this issue.

The kernel should know better, right?

Add a generic sk_flush_backlog() helper and use it right before a new
skb is allocated. Typically we put 64KB of payload per skb (unless
MSG_EOR is requested) and checking the socket backlog every 64KB
gives good results.

As a matter of fact, tests with TSO/GSO disabled give very nice
results, as we manage to keep a small write queue and smaller
perceived rtt.

Note that sk_flush_backlog() maintains socket ownership, so it is not
equivalent to a {release_sock(sk); lock_sock(sk);} pair, in order to
preserve the implicit atomicity rules that sendmsg() was giving to
(possibly buggy) applications.

In this simple implementation, I chose to not call tcp_release_cb(),
but we might consider this later.

Signed-off-by: Eric Dumazet
Cc: Alexei Starovoitov
Cc: Marcelo Ricardo Leitner
Acked-by: Soheil Hassas Yeganeh
Signed-off-by: David S. Miller
---
 net/core/sock.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index 70744dbb6c3f..f615e9391170 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2048,6 +2048,13 @@ static void __release_sock(struct sock *sk)
 	sk->sk_backlog.len = 0;
 }
 
+void __sk_flush_backlog(struct sock *sk)
+{
+	spin_lock_bh(&sk->sk_lock.slock);
+	__release_sock(sk);
+	spin_unlock_bh(&sk->sk_lock.slock);
+}
+
 /**
  * sk_wait_data - wait for data to arrive at sk_receive_queue
  * @sk: sock to wait on
-- cgit

From 1d2077ac0165c0d173a2255e37cf4dc5033d92c7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 2 May 2016 10:56:27 -0700
Subject: net: add __sock_wfree() helper

Hosts sending a lot of ACK packets exhibit high sock_wfree() cost
because of a cache line miss to test SOCK_USE_WRITE_QUEUE.

We could move this flag close to sk_wmem_alloc, but it is better to
perform the atomic_sub_and_test() on a clean cache line, as it avoids
one extra bus transaction.

skb_orphan_partial() can also have a fast track for packets that
either are TCP acks, or already went through another
skb_orphan_partial().

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/sock.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'net/core/sock.c')

diff --git a/net/core/sock.c b/net/core/sock.c
index f615e9391170..08bf97eceeb3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1655,6 +1655,17 @@ void sock_wfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_wfree);
 
+/* This variant of sock_wfree() is used by TCP,
+ * since it sets SOCK_USE_WRITE_QUEUE.
+ */
+void __sock_wfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
+		__sk_free(sk);
+}
+
 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 {
 	skb_orphan(skb);
@@ -1677,8 +1688,21 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 }
 EXPORT_SYMBOL(skb_set_owner_w);
 
+/* This helper is used by netem, as it can hold packets in its
+ * delay queue. We want to allow the owner socket to send more
+ * packets, as if they were already TX completed by a typical driver.
+ * But we also want to keep skb->sk set because some packet schedulers
+ * rely on it (sch_fq for example). So we set skb->truesize to a small
+ * amount (1) and decrease sk_wmem_alloc accordingly.
+ */
 void skb_orphan_partial(struct sk_buff *skb)
 {
+	/* If this skb is a TCP pure ACK or already went here,
+	 * we have nothing to do. 2 is already a very small truesize.
+	 */
+	if (skb->truesize <= 2)
+		return;
+
 	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
 	 * so we do not completely orphan skb, but transfert all
 	 * accounted bytes but one, to avoid unexpected reorders.
-- cgit
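The tcp_sendmsg() change described above relies on the __sk_flush_backlog() helper added in this log. A hedged sketch of the intended caller pattern follows; the real hunk lives in tcp_sendmsg(), outside this net/core/sock.c-limited log, its exact control flow may differ, and queue_one_chunk() is a hypothetical placeholder.

/* Sketch of a sendmsg-style loop that drains the backlog before each
 * new skb allocation, so peer ACK processing is not postponed for the
 * whole (possibly huge) call.
 */
static int copy_in_chunks(struct sock *sk, struct msghdr *msg)
{
	int copied = 0;

	while (msg_data_left(msg)) {
		struct sk_buff *skb;

		/* Drain whatever the softirq side parked on the backlog
		 * while we owned the socket; checking roughly every 64KB
		 * skb keeps the cost low.
		 */
		if (unlikely(sk->sk_backlog.tail))
			__sk_flush_backlog(sk);

		skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
		if (!skb)
			return copied ? copied : -ENOMEM;

		copied += queue_one_chunk(sk, skb, msg);	/* placeholder */
	}
	return copied;
}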