From 4546e44ca2ec6fa3ca971ab6de3ef382f17ed544 Mon Sep 17 00:00:00 2001
From: Enrico Weigelt <info@metux.net>
Date: Wed, 5 Jun 2019 22:58:50 +0200
Subject: net: socket: drop unneeded likely() call around IS_ERR()

IS_ERR() already calls unlikely(), so this extra likely() call
around the !IS_ERR() is not needed.

Signed-off-by: Enrico Weigelt <info@metux.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/socket.c')

diff --git a/net/socket.c b/net/socket.c
index 72372dc5dd70..0aa5a06f3698 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -435,7 +435,7 @@ static int sock_map_fd(struct socket *sock, int flags)
 	}
 
 	newfile = sock_alloc_file(sock, flags, NULL);
-	if (likely(!IS_ERR(newfile))) {
+	if (!IS_ERR(newfile)) {
 		fd_install(fd, newfile);
 		return fd;
 	}
-- 
cgit 


From 0d01da6afc5402f60325c5da31b22f7d56689b49 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 27 Jun 2019 13:38:47 -0700
Subject: bpf: implement getsockopt and setsockopt hooks

Implement new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and
BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks.

BPF_CGROUP_SETSOCKOPT can modify user setsockopt arguments before
passing them down to the kernel or bypass kernel completely.
BPF_CGROUP_GETSOCKOPT can can inspect/modify getsockopt arguments that
kernel returns.
Both hooks reuse existing PTR_TO_PACKET{,_END} infrastructure.

The buffer memory is pre-allocated (because I don't think there is
a precedent for working with __user memory from bpf). This might be
slow to do for each {s,g}etsockopt call, that's why I've added
__cgroup_bpf_prog_array_is_empty that exits early if there is nothing
attached to a cgroup. Note, however, that there is a race between
__cgroup_bpf_prog_array_is_empty and BPF_PROG_RUN_ARRAY where cgroup
program layout might have changed; this should not be a problem
because in general there is a race between multiple calls to
{s,g}etsocktop and user adding/removing bpf progs from a cgroup.

The return code of the BPF program is handled as follows:
* 0: EPERM
* 1: success, continue with next BPF program in the cgroup chain

v9:
* allow overwriting setsockopt arguments (Alexei Starovoitov):
  * use set_fs (same as kernel_setsockopt)
  * buffer is always kzalloc'd (no small on-stack buffer)

v8:
* use s32 for optlen (Andrii Nakryiko)

v7:
* return only 0 or 1 (Alexei Starovoitov)
* always run all progs (Alexei Starovoitov)
* use optval=0 as kernel bypass in setsockopt (Alexei Starovoitov)
  (decided to use optval=-1 instead, optval=0 might be a valid input)
* call getsockopt hook after kernel handlers (Alexei Starovoitov)

v6:
* rework cgroup chaining; stop as soon as bpf program returns
  0 or 2; see patch with the documentation for the details
* drop Andrii's and Martin's Acked-by (not sure they are comfortable
  with the new state of things)

v5:
* skip copy_to_user() and put_user() when ret == 0 (Martin Lau)

v4:
* don't export bpf_sk_fullsock helper (Martin Lau)
* size != sizeof(__u64) for uapi pointers (Martin Lau)
* offsetof instead of bpf_ctx_range when checking ctx access (Martin Lau)

v3:
* typos in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY comments (Andrii Nakryiko)
* reverse christmas tree in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY (Andrii
  Nakryiko)
* use __bpf_md_ptr instead of __u32 for optval{,_end} (Martin Lau)
* use BPF_FIELD_SIZEOF() for consistency (Martin Lau)
* new CG_SOCKOPT_ACCESS macro to wrap repeated parts

v2:
* moved bpf_sockopt_kern fields around to remove a hole (Martin Lau)
* aligned bpf_sockopt_kern->buf to 8 bytes (Martin Lau)
* bpf_prog_array_is_empty instead of bpf_prog_array_length (Martin Lau)
* added [0,2] return code check to verifier (Martin Lau)
* dropped unused buf[64] from the stack (Martin Lau)
* use PTR_TO_SOCKET for bpf_sockopt->sk (Martin Lau)
* dropped bpf_target_off from ctx rewrites (Martin Lau)
* use return code for kernel bypass (Martin Lau & Andrii Nakryiko)

Cc: Andrii Nakryiko <andriin@fb.com>
Cc: Martin Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/socket.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'net/socket.c')

diff --git a/net/socket.c b/net/socket.c
index 963df5dbdd54..0ddfbfb761d9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2051,6 +2051,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
 static int __sys_setsockopt(int fd, int level, int optname,
 			    char __user *optval, int optlen)
 {
+	mm_segment_t oldfs = get_fs();
+	char *kernel_optval = NULL;
 	int err, fput_needed;
 	struct socket *sock;
 
@@ -2063,6 +2065,22 @@ static int __sys_setsockopt(int fd, int level, int optname,
 		if (err)
 			goto out_put;
 
+		err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level,
+						     &optname, optval, &optlen,
+						     &kernel_optval);
+
+		if (err < 0) {
+			goto out_put;
+		} else if (err > 0) {
+			err = 0;
+			goto out_put;
+		}
+
+		if (kernel_optval) {
+			set_fs(KERNEL_DS);
+			optval = (char __user __force *)kernel_optval;
+		}
+
 		if (level == SOL_SOCKET)
 			err =
 			    sock_setsockopt(sock, level, optname, optval,
@@ -2071,6 +2089,11 @@ static int __sys_setsockopt(int fd, int level, int optname,
 			err =
 			    sock->ops->setsockopt(sock, level, optname, optval,
 						  optlen);
+
+		if (kernel_optval) {
+			set_fs(oldfs);
+			kfree(kernel_optval);
+		}
 out_put:
 		fput_light(sock->file, fput_needed);
 	}
@@ -2093,6 +2116,7 @@ static int __sys_getsockopt(int fd, int level, int optname,
 {
 	int err, fput_needed;
 	struct socket *sock;
+	int max_optlen;
 
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (sock != NULL) {
@@ -2100,6 +2124,8 @@ static int __sys_getsockopt(int fd, int level, int optname,
 		if (err)
 			goto out_put;
 
+		max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+
 		if (level == SOL_SOCKET)
 			err =
 			    sock_getsockopt(sock, level, optname, optval,
@@ -2108,6 +2134,10 @@ static int __sys_getsockopt(int fd, int level, int optname,
 			err =
 			    sock->ops->getsockopt(sock, level, optname, optval,
 						  optlen);
+
+		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
+						     optval, optlen,
+						     max_optlen, err);
 out_put:
 		fput_light(sock->file, fput_needed);
 	}
-- 
cgit 


From a648a592dc7c20873eb0aee78fa93e869714f42a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 3 Jul 2019 16:06:54 +0200
Subject: net: adjust socket level ICW to cope with ipv6 variant of {recv,
 send}msg

After the previous patch we have ipv{6,4} variants for {recv,send}msg,
we should use the generic _INET ICW variant to call into the proper
build-in.

This also allows dropping the now unused and rather ugly _INET4 ICW macro

v1 -> v2:
 - use ICW macro to declare inet6_{recv,send}msg
 - fix a couple of checkpatch offender in the code context

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'net/socket.c')

diff --git a/net/socket.c b/net/socket.c
index 963df5dbdd54..a865708940f9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -103,13 +103,6 @@
 #include <net/busy_poll.h>
 #include <linux/errqueue.h>
 
-/* proto_ops for ipv4 and ipv6 use the same {recv,send}msg function */
-#if IS_ENABLED(CONFIG_INET)
-#define INDIRECT_CALL_INET4(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
-#else
-#define INDIRECT_CALL_INET4(f, f1, ...) f(__VA_ARGS__)
-#endif
-
 #ifdef CONFIG_NET_RX_BUSY_POLL
 unsigned int sysctl_net_busy_read __read_mostly;
 unsigned int sysctl_net_busy_poll __read_mostly;
@@ -641,10 +634,13 @@ EXPORT_SYMBOL(__sock_tx_timestamp);
 
 INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *,
 					   size_t));
+INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *,
+					    size_t));
 static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
 {
-	int ret = INDIRECT_CALL_INET4(sock->ops->sendmsg, inet_sendmsg, sock,
-				      msg, msg_data_left(msg));
+	int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg,
+				     inet_sendmsg, sock, msg,
+				     msg_data_left(msg));
 	BUG_ON(ret == -EIOCBQUEUED);
 	return ret;
 }
@@ -870,12 +866,15 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
 EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
 
 INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *,
-					   size_t , int ));
+					   size_t, int));
+INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *,
+					    size_t, int));
 static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
 				     int flags)
 {
-	return INDIRECT_CALL_INET4(sock->ops->recvmsg, inet_recvmsg, sock, msg,
-				   msg_data_left(msg), flags);
+	return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg,
+				  inet_recvmsg, sock, msg, msg_data_left(msg),
+				  flags);
 }
 
 /**
-- 
cgit 


From 6d7855c54e1e269275d7c504f8f62a0b7a5b3f18 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 5 Jul 2019 20:13:22 +0100
Subject: sockfs: switch to ->free_inode()

we do have an RCU-delayed part there already (freeing the wq),
so it's not like the pipe situation; moreover, it might be
worth considering coallocating wq with the rest of struct sock_alloc.
->sk_wq in struct sock would remain a pointer as it is, but
the object it normally points to would be coallocated with
struct socket...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/socket.c')

diff --git a/net/socket.c b/net/socket.c
index d97b74f762e8..541719a2443d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -258,12 +258,12 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void sock_destroy_inode(struct inode *inode)
+static void sock_free_inode(struct inode *inode)
 {
 	struct socket_alloc *ei;
 
 	ei = container_of(inode, struct socket_alloc, vfs_inode);
-	kfree_rcu(ei->socket.wq, rcu);
+	kfree(ei->socket.wq);
 	kmem_cache_free(sock_inode_cachep, ei);
 }
 
@@ -288,7 +288,7 @@ static void init_inodecache(void)
 
 static const struct super_operations sockfs_ops = {
 	.alloc_inode	= sock_alloc_inode,
-	.destroy_inode	= sock_destroy_inode,
+	.free_inode	= sock_free_inode,
 	.statfs		= simple_statfs,
 };
 
-- 
cgit 


From 333f7909a8573145811c4ab7d8c9092301707721 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 5 Jul 2019 20:14:16 +0100
Subject: coallocate socket_wq with socket itself

socket->wq is assign-once, set when we are initializing both
struct socket it's in and struct socket_wq it points to.  As the
matter of fact, the only reason for separate allocation was the
ability to RCU-delay freeing of socket_wq.  RCU-delaying the
freeing of socket itself gets rid of that need, so we can just
fold struct socket_wq into the end of struct socket and simplify
the life both for sock_alloc_inode() (one allocation instead of
two) and for tun/tap oddballs, where we used to embed struct socket
and struct socket_wq into the same structure (now - embedding just
the struct socket).

Note that reference to struct socket_wq in struct sock does remain
a reference - that's unchanged.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

(limited to 'net/socket.c')

diff --git a/net/socket.c b/net/socket.c
index 541719a2443d..16449d6daeca 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -234,20 +234,13 @@ static struct kmem_cache *sock_inode_cachep __ro_after_init;
 static struct inode *sock_alloc_inode(struct super_block *sb)
 {
 	struct socket_alloc *ei;
-	struct socket_wq *wq;
 
 	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
-	if (!wq) {
-		kmem_cache_free(sock_inode_cachep, ei);
-		return NULL;
-	}
-	init_waitqueue_head(&wq->wait);
-	wq->fasync_list = NULL;
-	wq->flags = 0;
-	ei->socket.wq = wq;
+	init_waitqueue_head(&ei->socket.wq.wait);
+	ei->socket.wq.fasync_list = NULL;
+	ei->socket.wq.flags = 0;
 
 	ei->socket.state = SS_UNCONNECTED;
 	ei->socket.flags = 0;
@@ -263,7 +256,6 @@ static void sock_free_inode(struct inode *inode)
 	struct socket_alloc *ei;
 
 	ei = container_of(inode, struct socket_alloc, vfs_inode);
-	kfree(ei->socket.wq);
 	kmem_cache_free(sock_inode_cachep, ei);
 }
 
@@ -599,7 +591,7 @@ static void __sock_release(struct socket *sock, struct inode *inode)
 		module_put(owner);
 	}
 
-	if (sock->wq->fasync_list)
+	if (sock->wq.fasync_list)
 		pr_err("%s: fasync list not empty!\n", __func__);
 
 	if (!sock->file) {
@@ -1288,13 +1280,12 @@ static int sock_fasync(int fd, struct file *filp, int on)
 {
 	struct socket *sock = filp->private_data;
 	struct sock *sk = sock->sk;
-	struct socket_wq *wq;
+	struct socket_wq *wq = &sock->wq;
 
 	if (sk == NULL)
 		return -EINVAL;
 
 	lock_sock(sk);
-	wq = sock->wq;
 	fasync_helper(fd, filp, on, &wq->fasync_list);
 
 	if (!wq->fasync_list)
-- 
cgit