From 4408d55a64677febdcb50d1b44d0dc714ce4187e Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Thu, 13 Jan 2022 09:28:45 +0900
Subject: af_unix: Refactor unix_next_socket().

Currently, unix_next_socket() is overloaded depending on the 2nd argument.
If it is NULL, unix_next_socket() returns the first socket in the hash.  If
not NULL, it returns the next socket in the same hash list or the first
socket in the next non-empty hash list.

This patch refactors unix_next_socket() into two functions unix_get_first()
and unix_get_next().  unix_get_first() newly acquires a lock and returns
the first socket in the list.  unix_get_next() returns the next socket in a
list or releases a lock and falls back to unix_get_first().

In the following patch, the bpf iter holds all the sockets of a hash list
in a batch and always releases the lock before .show().  It always calls
unix_get_first() to acquire a lock in each iteration.  So, this patch makes
that change easier to follow.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Link: https://lore.kernel.org/r/20220113002849.4384-2-kuniyu@amazon.co.jp
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/unix/af_unix.c | 51 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 30 insertions(+), 21 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c19569819866..e1c4082accdb 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -3240,49 +3240,58 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
 	return sk;
 }
 
-static struct sock *unix_next_socket(struct seq_file *seq,
-				     struct sock *sk,
-				     loff_t *pos)
+static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
 {
 	unsigned long bucket = get_bucket(*pos);
+	struct sock *sk;
 
-	while (sk > (struct sock *)SEQ_START_TOKEN) {
-		sk = sk_next(sk);
-		if (!sk)
-			goto next_bucket;
-		if (sock_net(sk) == seq_file_net(seq))
-			return sk;
-	}
-
-	do {
+	while (bucket < ARRAY_SIZE(unix_socket_table)) {
 		spin_lock(&unix_table_locks[bucket]);
+
 		sk = unix_from_bucket(seq, pos);
 		if (sk)
 			return sk;
 
-next_bucket:
-		spin_unlock(&unix_table_locks[bucket++]);
-		*pos = set_bucket_offset(bucket, 1);
-	} while (bucket < ARRAY_SIZE(unix_socket_table));
+		spin_unlock(&unix_table_locks[bucket]);
+
+		*pos = set_bucket_offset(++bucket, 1);
+	}
 
 	return NULL;
 }
 
+static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
+				  loff_t *pos)
+{
+	unsigned long bucket = get_bucket(*pos);
+
+	for (sk = sk_next(sk); sk; sk = sk_next(sk))
+		if (sock_net(sk) == seq_file_net(seq))
+			return sk;
+
+	spin_unlock(&unix_table_locks[bucket]);
+
+	*pos = set_bucket_offset(++bucket, 1);
+
+	return unix_get_first(seq, pos);
+}
+
 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	if (!*pos)
 		return SEQ_START_TOKEN;
 
-	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
-		return NULL;
-
-	return unix_next_socket(seq, NULL, pos);
+	return unix_get_first(seq, pos);
 }
 
 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	++*pos;
-	return unix_next_socket(seq, v, pos);
+
+	if (v == SEQ_START_TOKEN)
+		return unix_get_first(seq, pos);
+
+	return unix_get_next(seq, v, pos);
 }
 
 static void unix_seq_stop(struct seq_file *seq, void *v)
-- 
cgit 


From 855d8e77ffb05be6e54c34dababccb20318aec00 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Thu, 13 Jan 2022 09:28:46 +0900
Subject: bpf: af_unix: Use batching algorithm in bpf unix iter.

The commit 04c7820b776f ("bpf: tcp: Bpf iter batching and lock_sock")
introduces the batching algorithm to iterate TCP sockets with more
consistency.

This patch uses the same algorithm to iterate AF_UNIX sockets.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Link: https://lore.kernel.org/r/20220113002849.4384-3-kuniyu@amazon.co.jp
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/unix/af_unix.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 177 insertions(+), 7 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e1c4082accdb..d383d5f63b6b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -3356,6 +3356,15 @@ static const struct seq_operations unix_seq_ops = {
 };
 
 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
+struct bpf_unix_iter_state {
+	struct seq_net_private p;
+	unsigned int cur_sk;
+	unsigned int end_sk;
+	unsigned int max_sk;
+	struct sock **batch;
+	bool st_bucket_done;
+};
+
 struct bpf_iter__unix {
 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
 	__bpf_md_ptr(struct unix_sock *, unix_sk);
@@ -3374,24 +3383,156 @@ static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
 	return bpf_iter_run_prog(prog, &ctx);
 }
 
+static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
+
+{
+	struct bpf_unix_iter_state *iter = seq->private;
+	unsigned int expected = 1;
+	struct sock *sk;
+
+	sock_hold(start_sk);
+	iter->batch[iter->end_sk++] = start_sk;
+
+	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
+		if (sock_net(sk) != seq_file_net(seq))
+			continue;
+
+		if (iter->end_sk < iter->max_sk) {
+			sock_hold(sk);
+			iter->batch[iter->end_sk++] = sk;
+		}
+
+		expected++;
+	}
+
+	spin_unlock(&unix_table_locks[start_sk->sk_hash]);
+
+	return expected;
+}
+
+static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
+{
+	while (iter->cur_sk < iter->end_sk)
+		sock_put(iter->batch[iter->cur_sk++]);
+}
+
+static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
+				       unsigned int new_batch_sz)
+{
+	struct sock **new_batch;
+
+	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
+			     GFP_USER | __GFP_NOWARN);
+	if (!new_batch)
+		return -ENOMEM;
+
+	bpf_iter_unix_put_batch(iter);
+	kvfree(iter->batch);
+	iter->batch = new_batch;
+	iter->max_sk = new_batch_sz;
+
+	return 0;
+}
+
+static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
+					loff_t *pos)
+{
+	struct bpf_unix_iter_state *iter = seq->private;
+	unsigned int expected;
+	bool resized = false;
+	struct sock *sk;
+
+	if (iter->st_bucket_done)
+		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
+
+again:
+	/* Get a new batch */
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+
+	sk = unix_get_first(seq, pos);
+	if (!sk)
+		return NULL; /* Done */
+
+	expected = bpf_iter_unix_hold_batch(seq, sk);
+
+	if (iter->end_sk == expected) {
+		iter->st_bucket_done = true;
+		return sk;
+	}
+
+	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
+		resized = true;
+		goto again;
+	}
+
+	return sk;
+}
+
+static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	/* bpf iter does not support lseek, so it always
+	 * continue from where it was stop()-ped.
+	 */
+	return bpf_iter_unix_batch(seq, pos);
+}
+
+static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_unix_iter_state *iter = seq->private;
+	struct sock *sk;
+
+	/* Whenever seq_next() is called, the iter->cur_sk is
+	 * done with seq_show(), so advance to the next sk in
+	 * the batch.
+	 */
+	if (iter->cur_sk < iter->end_sk)
+		sock_put(iter->batch[iter->cur_sk++]);
+
+	++*pos;
+
+	if (iter->cur_sk < iter->end_sk)
+		sk = iter->batch[iter->cur_sk];
+	else
+		sk = bpf_iter_unix_batch(seq, pos);
+
+	return sk;
+}
+
 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
 {
 	struct bpf_iter_meta meta;
 	struct bpf_prog *prog;
 	struct sock *sk = v;
 	uid_t uid;
+	bool slow;
+	int ret;
 
 	if (v == SEQ_START_TOKEN)
 		return 0;
 
+	slow = lock_sock_fast(sk);
+
+	if (unlikely(sk_unhashed(sk))) {
+		ret = SEQ_SKIP;
+		goto unlock;
+	}
+
 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
 	meta.seq = seq;
 	prog = bpf_iter_get_info(&meta, false);
-	return unix_prog_seq_show(prog, &meta, v, uid);
+	ret = unix_prog_seq_show(prog, &meta, v, uid);
+unlock:
+	unlock_sock_fast(sk, slow);
+	return ret;
 }
 
 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
 {
+	struct bpf_unix_iter_state *iter = seq->private;
 	struct bpf_iter_meta meta;
 	struct bpf_prog *prog;
 
@@ -3402,12 +3543,13 @@ static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
 			(void)unix_prog_seq_show(prog, &meta, v, 0);
 	}
 
-	unix_seq_stop(seq, v);
+	if (iter->cur_sk < iter->end_sk)
+		bpf_iter_unix_put_batch(iter);
 }
 
 static const struct seq_operations bpf_iter_unix_seq_ops = {
-	.start	= unix_seq_start,
-	.next	= unix_seq_next,
+	.start	= bpf_iter_unix_seq_start,
+	.next	= bpf_iter_unix_seq_next,
 	.stop	= bpf_iter_unix_seq_stop,
 	.show	= bpf_iter_unix_seq_show,
 };
@@ -3456,11 +3598,39 @@ static struct pernet_operations unix_net_ops = {
 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
 		     struct unix_sock *unix_sk, uid_t uid)
 
+#define INIT_BATCH_SZ 16
+
+static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+	struct bpf_unix_iter_state *iter = priv_data;
+	int err;
+
+	err = bpf_iter_init_seq_net(priv_data, aux);
+	if (err)
+		return err;
+
+	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
+	if (err) {
+		bpf_iter_fini_seq_net(priv_data);
+		return err;
+	}
+
+	return 0;
+}
+
+static void bpf_iter_fini_unix(void *priv_data)
+{
+	struct bpf_unix_iter_state *iter = priv_data;
+
+	bpf_iter_fini_seq_net(priv_data);
+	kvfree(iter->batch);
+}
+
 static const struct bpf_iter_seq_info unix_seq_info = {
 	.seq_ops		= &bpf_iter_unix_seq_ops,
-	.init_seq_private	= bpf_iter_init_seq_net,
-	.fini_seq_private	= bpf_iter_fini_seq_net,
-	.seq_priv_size		= sizeof(struct seq_net_private),
+	.init_seq_private	= bpf_iter_init_unix,
+	.fini_seq_private	= bpf_iter_fini_unix,
+	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
 };
 
 static struct bpf_iter_reg unix_reg_info = {
-- 
cgit 


From eb7d8f1d9ebc7379f09a51bf4faa35e0bfa7437d Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Thu, 13 Jan 2022 09:28:47 +0900
Subject: bpf: Support bpf_(get|set)sockopt() in bpf unix iter.

This patch makes bpf_(get|set)sockopt() available when iterating AF_UNIX
sockets.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Link: https://lore.kernel.org/r/20220113002849.4384-4-kuniyu@amazon.co.jp
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/unix/af_unix.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d383d5f63b6b..3e0d6281fd1e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -3633,6 +3633,20 @@ static const struct bpf_iter_seq_info unix_seq_info = {
 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
 };
 
+static const struct bpf_func_proto *
+bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
+			     const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_setsockopt:
+		return &bpf_sk_setsockopt_proto;
+	case BPF_FUNC_getsockopt:
+		return &bpf_sk_getsockopt_proto;
+	default:
+		return NULL;
+	}
+}
+
 static struct bpf_iter_reg unix_reg_info = {
 	.target			= "unix",
 	.ctx_arg_info_size	= 1,
@@ -3640,6 +3654,7 @@ static struct bpf_iter_reg unix_reg_info = {
 		{ offsetof(struct bpf_iter__unix, unix_sk),
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
+	.get_func_proto         = bpf_iter_unix_get_func_proto,
 	.seq_info		= &unix_seq_info,
 };
 
-- 
cgit 


From e82025c623e2bf04d162bafceb66a59115814479 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Thu, 17 Mar 2022 12:08:08 +0900
Subject: af_unix: Fix some data-races around unix_sk(sk)->oob_skb.

Out-of-band data automatically places a "mark" showing where in the
sequence the out-of-band data would have been.  If the out-of-band data
implies cancelling everything sent so far, the "mark" is helpful to flush
that data.  When the socket's read pointer reaches the "mark", the
SIOCATMARK ioctl() stores a non-zero value in its `atmark` argument.

The out-of-band data is queued in sk->sk_receive_queue as well as ordinary
data and also saved in unix_sk(sk)->oob_skb.  It can be used to test if the
head of the receive queue is the out-of-band data meaning the socket is at
the "mark".

While testing that, unix_ioctl() reads unix_sk(sk)->oob_skb locklessly.
Thus, all accesses to oob_skb need some basic protection to avoid
load/store tearing which KCSAN detects when these are called concurrently:

  - ioctl(fd_a, SIOCATMARK, &atmark, sizeof(atmark))
  - send(fd_b_connected_to_a, buf, sizeof(buf), MSG_OOB)

BUG: KCSAN: data-race in unix_ioctl / unix_stream_sendmsg

write to 0xffff888003d9cff0 of 8 bytes by task 175 on cpu 1:
 unix_stream_sendmsg (net/unix/af_unix.c:2087 net/unix/af_unix.c:2191)
 sock_sendmsg (net/socket.c:705 net/socket.c:725)
 __sys_sendto (net/socket.c:2040)
 __x64_sys_sendto (net/socket.c:2048)
 do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
 entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)

read to 0xffff888003d9cff0 of 8 bytes by task 176 on cpu 0:
 unix_ioctl (net/unix/af_unix.c:3101 (discriminator 1))
 sock_do_ioctl (net/socket.c:1128)
 sock_ioctl (net/socket.c:1242)
 __x64_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:874 fs/ioctl.c:860 fs/ioctl.c:860)
 do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
 entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)

value changed: 0xffff888003da0c00 -> 0xffff888003da0d00

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 176 Comm: unix_race_oob_i Not tainted 5.17.0-rc5-59529-g83dc4c2af682 #12
Hardware name: Red Hat KVM, BIOS 1.11.0-2.amzn2 04/01/2014

Fixes: 314001f0bf92 ("af_unix: Add OOB support")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c19569819866..0c37e5595aae 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2084,7 +2084,7 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other
 	if (ousk->oob_skb)
 		consume_skb(ousk->oob_skb);
 
-	ousk->oob_skb = skb;
+	WRITE_ONCE(ousk->oob_skb, skb);
 
 	scm_stat_add(other, skb);
 	skb_queue_tail(&other->sk_receive_queue, skb);
@@ -2602,9 +2602,8 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state)
 
 	oob_skb = u->oob_skb;
 
-	if (!(state->flags & MSG_PEEK)) {
-		u->oob_skb = NULL;
-	}
+	if (!(state->flags & MSG_PEEK))
+		WRITE_ONCE(u->oob_skb, NULL);
 
 	unix_state_unlock(sk);
 
@@ -2639,7 +2638,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
 				skb = NULL;
 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
 				if (!(flags & MSG_PEEK)) {
-					u->oob_skb = NULL;
+					WRITE_ONCE(u->oob_skb, NULL);
 					consume_skb(skb);
 				}
 			} else if (!(flags & MSG_PEEK)) {
@@ -3094,11 +3093,10 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	case SIOCATMARK:
 		{
 			struct sk_buff *skb;
-			struct unix_sock *u = unix_sk(sk);
 			int answ = 0;
 
 			skb = skb_peek(&sk->sk_receive_queue);
-			if (skb && skb == u->oob_skb)
+			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
 				answ = 1;
 			err = put_user(answ, (int __user *)arg);
 		}
-- 
cgit 


From d9a232d435dcc966738b0f414a86f7edf4f4c8c4 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Thu, 17 Mar 2022 12:08:09 +0900
Subject: af_unix: Support POLLPRI for OOB.

The commit 314001f0bf92 ("af_unix: Add OOB support") introduced OOB for
AF_UNIX, but it lacks some changes for POLLPRI.  Let's add the missing
piece.

In the selftest, normal datagrams are sent followed by OOB data, so this
commit replaces `POLLIN | POLLPRI` with just `POLLPRI` in the first test
case.

Fixes: 314001f0bf92 ("af_unix: Add OOB support")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0c37e5595aae..1e7ed5829ed5 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -3137,6 +3137,10 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
 		mask |= EPOLLIN | EPOLLRDNORM;
 	if (sk_is_readable(sk))
 		mask |= EPOLLIN | EPOLLRDNORM;
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+	if (READ_ONCE(unix_sk(sk)->oob_skb))
+		mask |= EPOLLPRI;
+#endif
 
 	/* Connection-based need to check for termination and startup */
 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
-- 
cgit 


From 4edf21aa94ee33c75f819f2b6eb6dd52ef8a1628 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Thu, 17 Mar 2022 12:23:08 +0900
Subject: af_unix: Remove unnecessary brackets around CONFIG_AF_UNIX_OOB.

Let's remove unnecessary brackets around CONFIG_AF_UNIX_OOB.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Link: https://lore.kernel.org/r/20220317032308.65372-1-kuniyu@amazon.co.jp
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/unix/af_unix.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3e0d6281fd1e..4247c4134f31 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2049,7 +2049,7 @@ out:
  */
 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
 
-#if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
 {
 	struct unix_sock *ousk = unix_sk(other);
@@ -2115,7 +2115,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 
 	err = -EOPNOTSUPP;
 	if (msg->msg_flags & MSG_OOB) {
-#if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 		if (len)
 			len--;
 		else
@@ -2186,7 +2186,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 		sent += size;
 	}
 
-#if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 	if (msg->msg_flags & MSG_OOB) {
 		err = queue_oob(sock, msg, other);
 		if (err)
-- 
cgit 


From f4b41f062c424209e3939a81e6da022e049a45f2 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Mon, 4 Apr 2022 18:30:22 +0200
Subject: net: remove noblock parameter from skb_recv_datagram()

skb_recv_datagram() has two parameters 'flags' and 'noblock' that are
merged inside skb_recv_datagram() by 'flags | (noblock ? MSG_DONTWAIT : 0)'

As 'flags' may already contain MSG_DONTWAIT, most callers split 'flags'
into 'flags' and 'noblock' using ultimately redundant bit operations like this:

skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &rc);

And this is not even done consistently with the 'flags' parameter.

This patch removes the obsolete and costly splitting into two parameters
and only performs bit operations when really needed on the caller side.

One missing conversion was thankfully reported by the kernel test robot: I
had not enabled the kunit tests that are needed to build the mctp code.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e71a312faa1e..fecbd95da918 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1643,7 +1643,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
 	 * so that no locks are necessary.
 	 */
 
-	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
+	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
+				&err);
 	if (!skb) {
 		/* This means receive shutdown. */
 		if (err == 0)
@@ -2500,7 +2501,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
 		int used, err;
 
 		mutex_lock(&u->iolock);
-		skb = skb_recv_datagram(sk, 0, 1, &err);
+		skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
 		mutex_unlock(&u->iolock);
 		if (!skb)
 			return err;
-- 
cgit 


From ec095263a965720e1ca39db1d9c5cd47846c789b Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Mon, 11 Apr 2022 14:49:55 +0200
Subject: net: remove noblock parameter from recvmsg() entities

The internal recvmsg() functions have two parameters 'flags' and 'noblock'
that were merged inside skb_recv_datagram(). As a follow up patch to commit
f4b41f062c42 ("net: remove noblock parameter from skb_recv_datagram()")
this patch removes the separate 'noblock' parameter for recvmsg().

Analogous to the referenced patch for skb_recv_datagram(), the 'flags' and
'noblock' parameters are unnecessarily split up, e.g. in

err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
                           flags & ~MSG_DONTWAIT, &addr_len);

or in

err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
                      sk, msg, size, flags & MSG_DONTWAIT,
                      flags & ~MSG_DONTWAIT, &addr_len);

instead of simply using only 'flags' all the time and checking for
MSG_DONTWAIT where needed (to preserve the formerly separate no(n)block
condition).

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://lore.kernel.org/r/20220411124955.154876-1-socketcan@hartkopp.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/unix/af_unix.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index fecbd95da918..e1dd9e9c8452 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2484,8 +2484,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si
 	const struct proto *prot = READ_ONCE(sk->sk_prot);
 
 	if (prot != &unix_dgram_proto)
-		return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
-					    flags & ~MSG_DONTWAIT, NULL);
+		return prot->recvmsg(sk, msg, size, flags, NULL);
 #endif
 	return __unix_dgram_recvmsg(sk, msg, size, flags);
 }
@@ -2917,8 +2916,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
 	const struct proto *prot = READ_ONCE(sk->sk_prot);
 
 	if (prot != &unix_stream_proto)
-		return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
-					    flags & ~MSG_DONTWAIT, NULL);
+		return prot->recvmsg(sk, msg, size, flags, NULL);
 #endif
 	return unix_stream_read_generic(&state, true);
 }
-- 
cgit 


From b146cbf2e32f01f56244d670aef2f43d44fcf120 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 10 May 2022 15:46:26 -0700
Subject: af_unix: Silence randstruct GCC plugin warning

While preparing for Clang randstruct support (which duplicated many of
the warnings the randstruct GCC plugin warned about), one strange one
remained only for the randstruct GCC plugin. Eliminating this rids
the plugin of the last exception.

It seems the plugin is happy to dereference individual members of
a cross-struct cast, but it is upset about casting to a whole object
pointer. This only manifests in one place in the kernel, so just replace
the variable with individual member accesses. There is no change in
executable instruction output.

Drop the last exception from the randstruct GCC plugin.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Cong Wang <cong.wang@bytedance.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: netdev@vger.kernel.org
Cc: linux-hardening@vger.kernel.org
Acked-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Link: https://lore.kernel.org/lkml/20220511022217.58586-1-kuniyu@amazon.co.jp
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/lkml/20220511151542.4cb3ff17@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 net/unix/af_unix.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e71a312faa1e..36367e7e3e0a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1808,11 +1808,9 @@ static int maybe_init_creds(struct scm_cookie *scm,
 static bool unix_skb_scm_eq(struct sk_buff *skb,
 			    struct scm_cookie *scm)
 {
-	const struct unix_skb_parms *u = &UNIXCB(skb);
-
-	return u->pid == scm->pid &&
-	       uid_eq(u->uid, scm->creds.uid) &&
-	       gid_eq(u->gid, scm->creds.gid) &&
+	return UNIXCB(skb).pid == scm->pid &&
+	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
+	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
 	       unix_secdata_eq(scm, skb);
 }
 
-- 
cgit 


From 662a80946ce13633ae90a55379f1346c10f0c432 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Sun, 5 Jun 2022 16:23:25 -0700
Subject: af_unix: Fix a data-race in unix_dgram_peer_wake_me().

unix_dgram_poll() calls unix_dgram_peer_wake_me() without `other`'s
lock held, and unix_dgram_peer_wake_me() checks if `other`'s receive
queue is full.  Here we need to use unix_recvq_full_lockless() instead
of unix_recvq_full(), otherwise KCSAN will report a data-race.

Fixes: 7d267278a9ec ("unix: avoid use-after-free in ep_remove_wait_queue")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20220605232325.11804-1-kuniyu@amazon.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 654dcef7cfb3..2206e6f8902d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -490,7 +490,7 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 	 * to other and its full, we will hang waiting for POLLOUT.
 	 */
-	if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
+	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
 		return 1;
 
 	if (connected)
-- 
cgit