From 4546e44ca2ec6fa3ca971ab6de3ef382f17ed544 Mon Sep 17 00:00:00 2001 From: Enrico Weigelt Date: Wed, 5 Jun 2019 22:58:50 +0200 Subject: net: socket: drop unneeded likely() call around IS_ERR() IS_ERR() already calls unlikely(), so this extra likely() call around the !IS_ERR() is not needed. Signed-off-by: Enrico Weigelt Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/socket.c') diff --git a/net/socket.c b/net/socket.c index 72372dc5dd70..0aa5a06f3698 100644 --- a/net/socket.c +++ b/net/socket.c @@ -435,7 +435,7 @@ static int sock_map_fd(struct socket *sock, int flags) } newfile = sock_alloc_file(sock, flags, NULL); - if (likely(!IS_ERR(newfile))) { + if (!IS_ERR(newfile)) { fd_install(fd, newfile); return fd; } -- cgit From 0d01da6afc5402f60325c5da31b22f7d56689b49 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 27 Jun 2019 13:38:47 -0700 Subject: bpf: implement getsockopt and setsockopt hooks Implement new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks. BPF_CGROUP_SETSOCKOPT can modify user setsockopt arguments before passing them down to the kernel or bypass kernel completely. BPF_CGROUP_GETSOCKOPT can can inspect/modify getsockopt arguments that kernel returns. Both hooks reuse existing PTR_TO_PACKET{,_END} infrastructure. The buffer memory is pre-allocated (because I don't think there is a precedent for working with __user memory from bpf). This might be slow to do for each {s,g}etsockopt call, that's why I've added __cgroup_bpf_prog_array_is_empty that exits early if there is nothing attached to a cgroup. Note, however, that there is a race between __cgroup_bpf_prog_array_is_empty and BPF_PROG_RUN_ARRAY where cgroup program layout might have changed; this should not be a problem because in general there is a race between multiple calls to {s,g}etsocktop and user adding/removing bpf progs from a cgroup. The return code of the BPF program is handled as follows: * 0: EPERM * 1: success, continue with next BPF program in the cgroup chain v9: * allow overwriting setsockopt arguments (Alexei Starovoitov): * use set_fs (same as kernel_setsockopt) * buffer is always kzalloc'd (no small on-stack buffer) v8: * use s32 for optlen (Andrii Nakryiko) v7: * return only 0 or 1 (Alexei Starovoitov) * always run all progs (Alexei Starovoitov) * use optval=0 as kernel bypass in setsockopt (Alexei Starovoitov) (decided to use optval=-1 instead, optval=0 might be a valid input) * call getsockopt hook after kernel handlers (Alexei Starovoitov) v6: * rework cgroup chaining; stop as soon as bpf program returns 0 or 2; see patch with the documentation for the details * drop Andrii's and Martin's Acked-by (not sure they are comfortable with the new state of things) v5: * skip copy_to_user() and put_user() when ret == 0 (Martin Lau) v4: * don't export bpf_sk_fullsock helper (Martin Lau) * size != sizeof(__u64) for uapi pointers (Martin Lau) * offsetof instead of bpf_ctx_range when checking ctx access (Martin Lau) v3: * typos in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY comments (Andrii Nakryiko) * reverse christmas tree in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY (Andrii Nakryiko) * use __bpf_md_ptr instead of __u32 for optval{,_end} (Martin Lau) * use BPF_FIELD_SIZEOF() for consistency (Martin Lau) * new CG_SOCKOPT_ACCESS macro to wrap repeated parts v2: * moved bpf_sockopt_kern fields around to remove a hole (Martin Lau) * aligned bpf_sockopt_kern->buf to 8 bytes (Martin Lau) * bpf_prog_array_is_empty instead of bpf_prog_array_length (Martin Lau) * added [0,2] return code check to verifier (Martin Lau) * dropped unused buf[64] from the stack (Martin Lau) * use PTR_TO_SOCKET for bpf_sockopt->sk (Martin Lau) * dropped bpf_target_off from ctx rewrites (Martin Lau) * use return code for kernel bypass (Martin Lau & Andrii Nakryiko) Cc: Andrii Nakryiko Cc: Martin Lau Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- net/socket.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'net/socket.c') diff --git a/net/socket.c b/net/socket.c index 963df5dbdd54..0ddfbfb761d9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2051,6 +2051,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, static int __sys_setsockopt(int fd, int level, int optname, char __user *optval, int optlen) { + mm_segment_t oldfs = get_fs(); + char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2063,6 +2065,22 @@ static int __sys_setsockopt(int fd, int level, int optname, if (err) goto out_put; + err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, + &optname, optval, &optlen, + &kernel_optval); + + if (err < 0) { + goto out_put; + } else if (err > 0) { + err = 0; + goto out_put; + } + + if (kernel_optval) { + set_fs(KERNEL_DS); + optval = (char __user __force *)kernel_optval; + } + if (level == SOL_SOCKET) err = sock_setsockopt(sock, level, optname, optval, @@ -2071,6 +2089,11 @@ static int __sys_setsockopt(int fd, int level, int optname, err = sock->ops->setsockopt(sock, level, optname, optval, optlen); + + if (kernel_optval) { + set_fs(oldfs); + kfree(kernel_optval); + } out_put: fput_light(sock->file, fput_needed); } @@ -2093,6 +2116,7 @@ static int __sys_getsockopt(int fd, int level, int optname, { int err, fput_needed; struct socket *sock; + int max_optlen; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { @@ -2100,6 +2124,8 @@ static int __sys_getsockopt(int fd, int level, int optname, if (err) goto out_put; + max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, @@ -2108,6 +2134,10 @@ static int __sys_getsockopt(int fd, int level, int optname, err = sock->ops->getsockopt(sock, level, optname, optval, optlen); + + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, + optval, optlen, + max_optlen, err); out_put: fput_light(sock->file, fput_needed); } -- cgit From a648a592dc7c20873eb0aee78fa93e869714f42a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 3 Jul 2019 16:06:54 +0200 Subject: net: adjust socket level ICW to cope with ipv6 variant of {recv, send}msg After the previous patch we have ipv{6,4} variants for {recv,send}msg, we should use the generic _INET ICW variant to call into the proper build-in. This also allows dropping the now unused and rather ugly _INET4 ICW macro v1 -> v2: - use ICW macro to declare inet6_{recv,send}msg - fix a couple of checkpatch offender in the code context Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/socket.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'net/socket.c') diff --git a/net/socket.c b/net/socket.c index 963df5dbdd54..a865708940f9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -103,13 +103,6 @@ #include #include -/* proto_ops for ipv4 and ipv6 use the same {recv,send}msg function */ -#if IS_ENABLED(CONFIG_INET) -#define INDIRECT_CALL_INET4(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) -#else -#define INDIRECT_CALL_INET4(f, f1, ...) f(__VA_ARGS__) -#endif - #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; unsigned int sysctl_net_busy_poll __read_mostly; @@ -641,10 +634,13 @@ EXPORT_SYMBOL(__sock_tx_timestamp); INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, size_t)); +INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *, + size_t)); static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { - int ret = INDIRECT_CALL_INET4(sock->ops->sendmsg, inet_sendmsg, sock, - msg, msg_data_left(msg)); + int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, + inet_sendmsg, sock, msg, + msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); return ret; } @@ -870,12 +866,15 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops); INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *, - size_t , int )); + size_t, int)); +INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *, + size_t, int)); static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { - return INDIRECT_CALL_INET4(sock->ops->recvmsg, inet_recvmsg, sock, msg, - msg_data_left(msg), flags); + return INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, + inet_recvmsg, sock, msg, msg_data_left(msg), + flags); } /** -- cgit From 6d7855c54e1e269275d7c504f8f62a0b7a5b3f18 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 5 Jul 2019 20:13:22 +0100 Subject: sockfs: switch to ->free_inode() we do have an RCU-delayed part there already (freeing the wq), so it's not like the pipe situation; moreover, it might be worth considering coallocating wq with the rest of struct sock_alloc. ->sk_wq in struct sock would remain a pointer as it is, but the object it normally points to would be coallocated with struct socket... Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/socket.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/socket.c') diff --git a/net/socket.c b/net/socket.c index d97b74f762e8..541719a2443d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -258,12 +258,12 @@ static struct inode *sock_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void sock_destroy_inode(struct inode *inode) +static void sock_free_inode(struct inode *inode) { struct socket_alloc *ei; ei = container_of(inode, struct socket_alloc, vfs_inode); - kfree_rcu(ei->socket.wq, rcu); + kfree(ei->socket.wq); kmem_cache_free(sock_inode_cachep, ei); } @@ -288,7 +288,7 @@ static void init_inodecache(void) static const struct super_operations sockfs_ops = { .alloc_inode = sock_alloc_inode, - .destroy_inode = sock_destroy_inode, + .free_inode = sock_free_inode, .statfs = simple_statfs, }; -- cgit From 333f7909a8573145811c4ab7d8c9092301707721 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 5 Jul 2019 20:14:16 +0100 Subject: coallocate socket_wq with socket itself socket->wq is assign-once, set when we are initializing both struct socket it's in and struct socket_wq it points to. As the matter of fact, the only reason for separate allocation was the ability to RCU-delay freeing of socket_wq. RCU-delaying the freeing of socket itself gets rid of that need, so we can just fold struct socket_wq into the end of struct socket and simplify the life both for sock_alloc_inode() (one allocation instead of two) and for tun/tap oddballs, where we used to embed struct socket and struct socket_wq into the same structure (now - embedding just the struct socket). Note that reference to struct socket_wq in struct sock does remain a reference - that's unchanged. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/socket.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'net/socket.c') diff --git a/net/socket.c b/net/socket.c index 541719a2443d..16449d6daeca 100644 --- a/net/socket.c +++ b/net/socket.c @@ -234,20 +234,13 @@ static struct kmem_cache *sock_inode_cachep __ro_after_init; static struct inode *sock_alloc_inode(struct super_block *sb) { struct socket_alloc *ei; - struct socket_wq *wq; ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); if (!ei) return NULL; - wq = kmalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) { - kmem_cache_free(sock_inode_cachep, ei); - return NULL; - } - init_waitqueue_head(&wq->wait); - wq->fasync_list = NULL; - wq->flags = 0; - ei->socket.wq = wq; + init_waitqueue_head(&ei->socket.wq.wait); + ei->socket.wq.fasync_list = NULL; + ei->socket.wq.flags = 0; ei->socket.state = SS_UNCONNECTED; ei->socket.flags = 0; @@ -263,7 +256,6 @@ static void sock_free_inode(struct inode *inode) struct socket_alloc *ei; ei = container_of(inode, struct socket_alloc, vfs_inode); - kfree(ei->socket.wq); kmem_cache_free(sock_inode_cachep, ei); } @@ -599,7 +591,7 @@ static void __sock_release(struct socket *sock, struct inode *inode) module_put(owner); } - if (sock->wq->fasync_list) + if (sock->wq.fasync_list) pr_err("%s: fasync list not empty!\n", __func__); if (!sock->file) { @@ -1288,13 +1280,12 @@ static int sock_fasync(int fd, struct file *filp, int on) { struct socket *sock = filp->private_data; struct sock *sk = sock->sk; - struct socket_wq *wq; + struct socket_wq *wq = &sock->wq; if (sk == NULL) return -EINVAL; lock_sock(sk); - wq = sock->wq; fasync_helper(fd, filp, on, &wq->fasync_list); if (!wq->fasync_list) -- cgit