summaryrefslogtreecommitdiff
path: root/net/unix
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@kernel.org>2021-07-15 18:17:51 -0700
committerAlexei Starovoitov <ast@kernel.org>2021-07-15 18:17:51 -0700
commitc50524ec4e3ad97d7d963268abd859c6413fbeb4 (patch)
tree0fce2ae719bf9c848f1f17ae533ce536d27abc21 /net/unix
parent1554a080e76554fa71004bba5b93c4695932a4d7 (diff)
parenta2ffda38dc01cb3963c78bd34e1ec7226af55028 (diff)
Merge branch 'sockmap: add sockmap support for unix datagram socket'
Cong Wang says: ==================== From: Cong Wang <cong.wang@bytedance.com> This is the last patchset of the original large patchset. In the previous patchset, a new BPF sockmap program BPF_SK_SKB_VERDICT was introduced and UDP began to support it too. In this patchset, we add BPF_SK_SKB_VERDICT support to Unix datagram socket, so that we can finally splice Unix datagram socket and UDP socket. Please check each patch description for more details. To see the big picture, the previous patchsets are available here: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=1e0ab70778bd86a90de438cc5e1535c115a7c396 https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=89d69c5d0fbcabd8656459bc8b1a476d6f1efee4 and this patchset is available here: https://github.com/congwang/linux/tree/sockmap3 Acked-by: John Fastabend <john.fastabend@gmail.com> --- v5: lift socket state check for dgram remove ->unhash() case add retries for EAGAIN in all test cases remove an unused parameter of __unix_dgram_recvmsg() rebase on the latest bpf-next v4: fix af_unix disconnect case add unix_unhash() split out two small patches reduce u->iolock critical section remove an unused parameter of __unix_dgram_recvmsg() v3: fix Kconfig dependency make unix_read_sock() static fix a UAF in unix_release() add a missing header unix_bpf.c v2: separate out from the original large patchset rebase to the latest bpf-next clean up unix_read_sock() export sock_map_close() factor out some helpers in selftests for code reuse ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'net/unix')
-rw-r--r--net/unix/Makefile1
-rw-r--r--net/unix/af_unix.c85
-rw-r--r--net/unix/unix_bpf.c122
3 files changed, 198 insertions, 10 deletions
diff --git a/net/unix/Makefile b/net/unix/Makefile
index 54e58cc4f945..20491825b4d0 100644
--- a/net/unix/Makefile
+++ b/net/unix/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_UNIX) += unix.o
unix-y := af_unix.o garbage.o
unix-$(CONFIG_SYSCTL) += sysctl_net_unix.o
+unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o
obj-$(CONFIG_UNIX_DIAG) += unix_diag.o
unix_diag-y := diag.o
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 23c92ad15c61..89927678c0dc 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -494,6 +494,7 @@ static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
sk_error_report(other);
}
}
+ sk->sk_state = other->sk_state = TCP_CLOSE;
}
static void unix_sock_destructor(struct sock *sk)
@@ -669,6 +670,8 @@ static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
+static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
@@ -746,6 +749,7 @@ static const struct proto_ops unix_dgram_ops = {
.listen = sock_no_listen,
.shutdown = unix_shutdown,
.sendmsg = unix_dgram_sendmsg,
+ .read_sock = unix_read_sock,
.recvmsg = unix_dgram_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
@@ -777,10 +781,21 @@ static const struct proto_ops unix_seqpacket_ops = {
.show_fdinfo = unix_show_fdinfo,
};
-static struct proto unix_proto = {
+static void unix_close(struct sock *sk, long timeout)
+{
+ /* Nothing to do here, unix socket does not need a ->close().
+ * This is merely for sockmap.
+ */
+}
+
+struct proto unix_proto = {
.name = "UNIX",
.owner = THIS_MODULE,
.obj_size = sizeof(struct unix_sock),
+ .close = unix_close,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = unix_bpf_update_proto,
+#endif
};
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
@@ -864,6 +879,7 @@ static int unix_release(struct socket *sock)
if (!sk)
return 0;
+ sk->sk_prot->close(sk, 0);
unix_release_sock(sk, 0);
sock->sk = NULL;
@@ -1199,6 +1215,9 @@ restart:
unix_peer(sk) = other;
unix_state_double_unlock(sk, other);
}
+
+ if (unix_peer(sk))
+ sk->sk_state = other->sk_state = TCP_ESTABLISHED;
return 0;
out_unlock:
@@ -1431,12 +1450,10 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb)
init_peercred(ska);
init_peercred(skb);
- if (ska->sk_type != SOCK_DGRAM) {
- ska->sk_state = TCP_ESTABLISHED;
- skb->sk_state = TCP_ESTABLISHED;
- socka->state = SS_CONNECTED;
- sockb->state = SS_CONNECTED;
- }
+ ska->sk_state = TCP_ESTABLISHED;
+ skb->sk_state = TCP_ESTABLISHED;
+ socka->state = SS_CONNECTED;
+ sockb->state = SS_CONNECTED;
return 0;
}
@@ -2081,11 +2098,11 @@ static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
}
}
-static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
- size_t size, int flags)
+int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
+ int flags)
{
struct scm_cookie scm;
- struct sock *sk = sock->sk;
+ struct socket *sock = sk->sk_socket;
struct unix_sock *u = unix_sk(sk);
struct sk_buff *skb, *last;
long timeo;
@@ -2188,6 +2205,53 @@ out:
return err;
}
+static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+
+#ifdef CONFIG_BPF_SYSCALL
+ if (sk->sk_prot != &unix_proto)
+ return sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, NULL);
+#endif
+ return __unix_dgram_recvmsg(sk, msg, size, flags);
+}
+
+static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ int copied = 0;
+
+ while (1) {
+ struct unix_sock *u = unix_sk(sk);
+ struct sk_buff *skb;
+ int used, err;
+
+ mutex_lock(&u->iolock);
+ skb = skb_recv_datagram(sk, 0, 1, &err);
+ mutex_unlock(&u->iolock);
+ if (!skb)
+ return err;
+
+ used = recv_actor(desc, skb, 0, skb->len);
+ if (used <= 0) {
+ if (!copied)
+ copied = used;
+ kfree_skb(skb);
+ break;
+ } else if (used <= skb->len) {
+ copied += used;
+ }
+
+ kfree_skb(skb);
+ if (!desc->count)
+ break;
+ }
+
+ return copied;
+}
+
/*
* Sleep until more data has arrived. But check for races..
*/
@@ -2925,6 +2989,7 @@ static int __init af_unix_init(void)
sock_register(&unix_family_ops);
register_pernet_subsys(&unix_net_ops);
+ unix_bpf_build_proto();
out:
return rc;
}
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
new file mode 100644
index 000000000000..db0cda29fb2f
--- /dev/null
+++ b/net/unix/unix_bpf.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Cong Wang <cong.wang@bytedance.com> */
+
+#include <linux/skmsg.h>
+#include <linux/bpf.h>
+#include <net/sock.h>
+#include <net/af_unix.h>
+
+#define unix_sk_has_data(__sk, __psock) \
+ ({ !skb_queue_empty(&__sk->sk_receive_queue) || \
+ !skb_queue_empty(&__psock->ingress_skb) || \
+ !list_empty(&__psock->ingress_msg); \
+ })
+
+static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock,
+ long timeo)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct unix_sock *u = unix_sk(sk);
+ int ret = 0;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return 1;
+
+ if (!timeo)
+ return ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ if (!unix_sk_has_data(sk, psock)) {
+ mutex_unlock(&u->iolock);
+ wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+ mutex_lock(&u->iolock);
+ ret = unix_sk_has_data(sk, psock);
+ }
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+
+static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
+ size_t len, int nonblock, int flags,
+ int *addr_len)
+{
+ struct unix_sock *u = unix_sk(sk);
+ struct sk_psock *psock;
+ int copied, ret;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return __unix_dgram_recvmsg(sk, msg, len, flags);
+
+ mutex_lock(&u->iolock);
+ if (!skb_queue_empty(&sk->sk_receive_queue) &&
+ sk_psock_queue_empty(psock)) {
+ ret = __unix_dgram_recvmsg(sk, msg, len, flags);
+ goto out;
+ }
+
+msg_bytes_ready:
+ copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+ if (!copied) {
+ long timeo;
+ int data;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+ data = unix_msg_wait_data(sk, psock, timeo);
+ if (data) {
+ if (!sk_psock_queue_empty(psock))
+ goto msg_bytes_ready;
+ ret = __unix_dgram_recvmsg(sk, msg, len, flags);
+ goto out;
+ }
+ copied = -EAGAIN;
+ }
+ ret = copied;
+out:
+ mutex_unlock(&u->iolock);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+
+static struct proto *unix_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(unix_prot_lock);
+static struct proto unix_bpf_prot;
+
+static void unix_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+{
+ *prot = *base;
+ prot->close = sock_map_close;
+ prot->recvmsg = unix_dgram_bpf_recvmsg;
+}
+
+static void unix_bpf_check_needs_rebuild(struct proto *ops)
+{
+ if (unlikely(ops != smp_load_acquire(&unix_prot_saved))) {
+ spin_lock_bh(&unix_prot_lock);
+ if (likely(ops != unix_prot_saved)) {
+ unix_bpf_rebuild_protos(&unix_bpf_prot, ops);
+ smp_store_release(&unix_prot_saved, ops);
+ }
+ spin_unlock_bh(&unix_prot_lock);
+ }
+}
+
+int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+ if (restore) {
+ sk->sk_write_space = psock->saved_write_space;
+ WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+ return 0;
+ }
+
+ unix_bpf_check_needs_rebuild(psock->sk_proto);
+ WRITE_ONCE(sk->sk_prot, &unix_bpf_prot);
+ return 0;
+}
+
+void __init unix_bpf_build_proto(void)
+{
+ unix_bpf_rebuild_protos(&unix_bpf_prot, &unix_proto);
+}