From 306b13eb3cf9515a8214bbf5d69d811371d05792 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 28 Jul 2017 16:22:41 -0700 Subject: proto_ops: Add locked held versions of sendmsg and sendpage Add new proto_ops sendmsg_locked and sendpage_locked that can be called when the socket lock is already held. Correspondingly, add kernel_sendmsg_locked and kernel_sendpage_locked as front end functions. These functions will be used in zero proxy so that we can take the socket lock in a ULP sendmsg/sendpage and then directly call the backend transport proto_ops functions. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/sock.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index ac2a404c73eb..742f68c9c84a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2500,6 +2500,12 @@ int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) } EXPORT_SYMBOL(sock_no_sendmsg); +int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_sendmsg_locked); + int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { @@ -2528,6 +2534,22 @@ ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, siz } EXPORT_SYMBOL(sock_no_sendpage); +ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + ssize_t res; + struct msghdr msg = {.msg_flags = flags}; + struct kvec iov; + char *kaddr = kmap(page); + + iov.iov_base = kaddr + offset; + iov.iov_len = size; + res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); + kunmap(page); + return res; +} +EXPORT_SYMBOL(sock_no_sendpage_locked); + /* * Default Socket Callbacks */ -- cgit From f70f250a77313b542531e1ff7a449cd0ccd83ec0 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 1 Aug 2017 12:49:10 +0300 Subject: net: Allow IPsec GSO for local sockets This patch allows local sockets to make use of XFRM GSO code path. Signed-off-by: Steffen Klassert Signed-off-by: Ilan Tayari --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 742f68c9c84a..564f835f408a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1757,7 +1757,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; sk->sk_route_caps &= ~sk->sk_route_nocaps; if (sk_can_gso(sk)) { - if (dst->header_len) { + if (dst->header_len && !xfrm_dst_offload_ok(dst)) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; -- cgit From 98ba0bd5505dcbb90322a4be07bcfe6b8a18c73f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:37 -0400 Subject: sock: allocate skbs from optmem Add sock_omalloc and sock_ofree to be able to allocate control skbs, for instance for looping errors onto sk_error_queue. The transmit budget (sk_wmem_alloc) is involved in transmit skb shaping, most notably in TCP Small Queues. Using this budget for control packets would impact transmission. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/sock.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 742f68c9c84a..1261880bdcc8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1923,6 +1923,33 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, } EXPORT_SYMBOL(sock_wmalloc); +static void sock_ofree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_omem_alloc); +} + +struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, + gfp_t priority) +{ + struct sk_buff *skb; + + /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ + if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > + sysctl_optmem_max) + return NULL; + + skb = alloc_skb(size, priority); + if (!skb) + return NULL; + + atomic_add(skb->truesize, &sk->sk_omem_alloc); + skb->sk = sk; + skb->destructor = sock_ofree; + return skb; +} + /* * Allocate a memory block from the socket's option memory buffer. */ -- cgit From 52267790ef52d7513879238ca9fac22c1733e0e3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:39 -0400 Subject: sock: add MSG_ZEROCOPY The kernel supports zerocopy sendmsg in virtio and tap. Expand the infrastructure to support other socket types. Introduce a completion notification channel over the socket error queue. Notifications are returned with ee_origin SO_EE_ORIGIN_ZEROCOPY. ee_errno is 0 to avoid blocking the send/recv path on receiving notifications. Add reference counting, to support the skb split, merge, resize and clone operations possible with SOCK_STREAM and other socket types. The patch does not yet modify any datapaths. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/sock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 1261880bdcc8..e8b696858cad 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1670,6 +1670,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); @@ -2722,6 +2723,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; + atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; -- cgit From 76851d1212c11365362525e1e2c0a18c97478e6b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:40 -0400 Subject: sock: add SOCK_ZEROCOPY sockopt The send call ignores unknown flags. Legacy applications may already unwittingly pass MSG_ZEROCOPY. Continue to ignore this flag unless a socket opts in to zerocopy. Introduce socket option SO_ZEROCOPY to enable MSG_ZEROCOPY processing. Processes can also query this socket option to detect kernel support for the feature. Older kernels will return ENOPROTOOPT. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/sock.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index e8b696858cad..9ea988d25b0a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1055,6 +1055,20 @@ set_rcvbuf: if (val == 1) dst_negative_advice(sk); break; + + case SO_ZEROCOPY: + if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + ret = -ENOTSUPP; + else if (sk->sk_protocol != IPPROTO_TCP) + ret = -ENOTSUPP; + else if (sk->sk_state != TCP_CLOSE) + ret = -EBUSY; + else if (val < 0 || val > 1) + ret = -EINVAL; + else + sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); + break; + default: ret = -ENOPROTOOPT; break; @@ -1383,6 +1397,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val64 = sock_gen_cookie(sk); break; + case SO_ZEROCOPY: + v.val = sock_flag(sk, SOCK_ZEROCOPY); + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). -- cgit From 257a73031d29447ee82fe06d2b97d8564f63276d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 23 Aug 2017 11:57:51 +0200 Subject: net/sock: allow the user to set negative peek offset This is necessary to allow the user to disable peeking with offset once it's enabled. Unix sockets already allow the above, with this patch we permit it for udp[6] sockets, too. Fixes: 627d2d6b5500 ("udp: enable MSG_PEEK at non-zero offset") Signed-off-by: Paolo Abeni Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/sock.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 0f04d8bff607..dfdd14cac775 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2454,9 +2454,6 @@ EXPORT_SYMBOL(__sk_mem_reclaim); int sk_set_peek_off(struct sock *sk, int val) { - if (val < 0) - return -EINVAL; - sk->sk_peek_off = val; return 0; } -- cgit From eaa72dc47488d599439cd0fd0f8c4f1bcb3906bb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Aug 2017 15:16:01 -0700 Subject: neigh: increase queue_len_bytes to match wmem_default Florian reported UDP xmit drops that could be root caused to the too small neigh limit. Current limit is 64 KB, meaning that even a single UDP socket would hit it, since its default sk_sndbuf comes from net.core.wmem_default (~212992 bytes on 64bit arches). Once ARP/ND resolution is in progress, we should allow a little more packets to be queued, at least for one producer. Once neigh arp_queue is filled, a rogue socket should hit its sk_sndbuf limit and either block in sendmsg() or return -EAGAIN. Signed-off-by: Eric Dumazet Reported-by: Florian Fainelli Signed-off-by: David S. Miller --- net/core/sock.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index dfdd14cac775..9b7b6bbb2a23 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -307,16 +307,6 @@ static struct lock_class_key af_wlock_keys[AF_MAX]; static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; -/* Take into consideration the size of the struct sk_buff overhead in the - * determination of these values, since that is non-constant across - * platforms. This makes socket queueing behavior and performance - * not depend upon such differences. - */ -#define _SK_MEM_PACKETS 256 -#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) - /* Run time adjustable parameters. */ __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; EXPORT_SYMBOL(sysctl_wmem_max); -- cgit