From c51da3f7a161c6822232be832abdffe47eb55b4c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Jun 2025 13:30:01 +0000 Subject: net: remove sock_i_uid() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Difference between sock_i_uid() and sk_uid() is that after sock_orphan(), sock_i_uid() returns GLOBAL_ROOT_UID while sk_uid() returns the last cached sk->sk_uid value. None of sock_i_uid() callers care about this. Use sk_uid() which is much faster and inlined. Note that diag/dump users are calling sock_i_ino() and can not see the full benefit yet. Signed-off-by: Eric Dumazet Cc: Lorenzo Colitti Reviewed-by: Maciej Żenczykowski Link: https://patch.msgid.link/20250620133001.4090592-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 22e170fb5dda..1e320f89168d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3682,7 +3682,7 @@ static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) goto unlock; } - uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = unix_prog_seq_show(prog, &meta, v, uid); -- cgit From 25489a4f556414445d342951615178368ee45cde Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 2 Jul 2025 15:38:08 +0200 Subject: net: splice: Drop unused @gfp Since its introduction in commit 2e910b95329c ("net: Add a function to splice pages into an skbuff for MSG_SPLICE_PAGES"), skb_splice_from_iter() never used the @gfp argument. Remove it and adapt callers. No functional change intended. Reviewed-by: Simon Horman Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250702-splice-drop-unused-v3-2-55f68b60d2b7@rbox.co Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 564c970d97ff..cd0d582bc7d4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2388,8 +2388,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb->ip_summed = CHECKSUM_UNNECESSARY; - err = skb_splice_from_iter(skb, &msg->msg_iter, size, - sk->sk_allocation); + err = skb_splice_from_iter(skb, &msg->msg_iter, size); if (err < 0) goto out_free; -- cgit From b429a5ad19cb4efe63d18388a2a4deebcba742c6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:13 +0000 Subject: af_unix: Don't hold unix_state_lock() in __unix_dgram_recvmsg(). When __skb_try_recv_datagram() returns NULL in __unix_dgram_recvmsg(), we hold unix_state_lock() unconditionally. This is because SOCK_SEQPACKET sk needs to return EOF in case its peer has been close()d concurrently. This behaviour totally depends on the timing of the peer's close() and reading sk->sk_shutdown, and taking the lock does not play a role. Let's drop the lock from __unix_dgram_recvmsg() and use READ_ONCE(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250702223606.1054680-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cd0d582bc7d4..becd84737635 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2527,12 +2527,10 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, &err, &timeo, last)); if (!skb) { /* implies iolock unlocked */ - unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && - (sk->sk_shutdown & RCV_SHUTDOWN)) + (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN)) err = 0; - unix_state_unlock(sk); goto out; } -- cgit From 772f01049c4b722b28b3f7025b4996379f127ebf Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:14 +0000 Subject: af_unix: Don't check SOCK_DEAD in unix_stream_read_skb(). unix_stream_read_skb() checks SOCK_DEAD only when the dequeued skb is OOB skb. unix_stream_read_skb() is called for a SOCK_STREAM socket in SOCKMAP when data is sent to it. The function is invoked via sk_psock_verdict_data_ready(), which is set to sk->sk_data_ready(). During sendmsg(), we check if the receiver has SOCK_DEAD, so there is no point in checking it again later in ->read_skb(). Also, unix_read_skb() for SOCK_DGRAM does not have the test either. Let's remove the SOCK_DEAD test in unix_stream_read_skb(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250702223606.1054680-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index becd84737635..34ddea34e87e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2803,14 +2803,6 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (unlikely(skb == READ_ONCE(u->oob_skb))) { bool drop = false; - unix_state_lock(sk); - - if (sock_flag(sk, SOCK_DEAD)) { - unix_state_unlock(sk); - kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); - return -ECONNRESET; - } - spin_lock(&sk->sk_receive_queue.lock); if (likely(skb == u->oob_skb)) { WRITE_ONCE(u->oob_skb, NULL); @@ -2818,8 +2810,6 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) } spin_unlock(&sk->sk_receive_queue.lock); - unix_state_unlock(sk); - if (drop) { kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); return -EAGAIN; -- cgit From d0aac85449dec992bb8dc2503f2cb9e94ef436db Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:15 +0000 Subject: af_unix: Don't use skb_recv_datagram() in unix_stream_read_skb(). unix_stream_read_skb() calls skb_recv_datagram() with MSG_DONTWAIT, which is mostly equivalent to sock_error(sk) + skb_dequeue(). In the following patch, we will add a new field to cache the number of bytes in the receive queue. Then, we want to avoid introducing atomic ops in the fast path, so we will reuse the receive queue lock. As a preparation for the change, let's not use skb_recv_datagram() in unix_stream_read_skb(). Note that sock_error() is now moved out of the u->iolock mutex as the mutex does not synchronise the peer's close() at all. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250702223606.1054680-4-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 34ddea34e87e..94596d6c37e9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2786,6 +2786,7 @@ unlock: static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { + struct sk_buff_head *queue = &sk->sk_receive_queue; struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int err; @@ -2793,30 +2794,34 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) return -ENOTCONN; - mutex_lock(&u->iolock); - skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); - mutex_unlock(&u->iolock); - if (!skb) + err = sock_error(sk); + if (err) return err; -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (unlikely(skb == READ_ONCE(u->oob_skb))) { - bool drop = false; + mutex_lock(&u->iolock); + spin_lock(&queue->lock); - spin_lock(&sk->sk_receive_queue.lock); - if (likely(skb == u->oob_skb)) { - WRITE_ONCE(u->oob_skb, NULL); - drop = true; - } - spin_unlock(&sk->sk_receive_queue.lock); + skb = __skb_dequeue(queue); + if (!skb) { + spin_unlock(&queue->lock); + mutex_unlock(&u->iolock); + return -EAGAIN; + } - if (drop) { - kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); - return -EAGAIN; - } +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (skb == u->oob_skb) { + WRITE_ONCE(u->oob_skb, NULL); + spin_unlock(&queue->lock); + mutex_unlock(&u->iolock); + + kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); + return -EAGAIN; } #endif + spin_unlock(&queue->lock); + mutex_unlock(&u->iolock); + return recv_actor(sk, skb); } -- cgit From f4e1fb04c12384fb1b69a95c33527b515a652a74 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:16 +0000 Subject: af_unix: Use cached value for SOCK_STREAM in unix_inq_len(). Compared to TCP, ioctl(SIOCINQ) for AF_UNIX SOCK_STREAM socket is more expensive, as unix_inq_len() requires iterating through the receive queue and accumulating skb->len. Let's cache the value for SOCK_STREAM to a new field during sendmsg() and recvmsg(). The field is protected by the receive queue lock. Note that ioctl(SIOCINQ) for SOCK_DGRAM returns the length of the first skb in the queue. SOCK_SEQPACKET still requires iterating through the queue because we do not touch functions shared with unix_dgram_ops. But, if really needed, we can support it by switching __skb_try_recv_datagram() to a custom version. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250702223606.1054680-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 94596d6c37e9..d9e604295a71 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2297,6 +2297,7 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, spin_lock(&other->sk_receive_queue.lock); WRITE_ONCE(ousk->oob_skb, skb); + WRITE_ONCE(ousk->inq_len, ousk->inq_len + 1); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); @@ -2319,6 +2320,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, struct sock *sk = sock->sk; struct sk_buff *skb = NULL; struct sock *other = NULL; + struct unix_sock *otheru; struct scm_cookie scm; bool fds_sent = false; int err, sent = 0; @@ -2342,14 +2344,16 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_namelen) { err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out_err; - } else { - other = unix_peer(sk); - if (!other) { - err = -ENOTCONN; - goto out_err; - } } + other = unix_peer(sk); + if (!other) { + err = -ENOTCONN; + goto out_err; + } + + otheru = unix_sk(other); + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto out_pipe; @@ -2417,7 +2421,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); - skb_queue_tail(&other->sk_receive_queue, skb); + + spin_lock(&other->sk_receive_queue.lock); + WRITE_ONCE(otheru->inq_len, otheru->inq_len + skb->len); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); + unix_state_unlock(other); other->sk_data_ready(other); sent += size; @@ -2704,6 +2713,7 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) if (!(state->flags & MSG_PEEK)) { WRITE_ONCE(u->oob_skb, NULL); + WRITE_ONCE(u->inq_len, u->inq_len - 1); if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue && !unix_skb_len(oob_skb->prev)) { @@ -2808,6 +2818,8 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) return -EAGAIN; } + WRITE_ONCE(u->inq_len, u->inq_len - skb->len); + #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb == u->oob_skb) { WRITE_ONCE(u->oob_skb, NULL); @@ -2988,7 +3000,11 @@ unlock: if (unix_skb_len(skb)) break; - skb_unlink(skb, &sk->sk_receive_queue); + spin_lock(&sk->sk_receive_queue.lock); + WRITE_ONCE(u->inq_len, u->inq_len - skb->len); + __skb_unlink(skb, &sk->sk_receive_queue); + spin_unlock(&sk->sk_receive_queue.lock); + consume_skb(skb); if (scm.fp) @@ -3159,9 +3175,11 @@ long unix_inq_len(struct sock *sk) if (READ_ONCE(sk->sk_state) == TCP_LISTEN) return -EINVAL; + if (sk->sk_type == SOCK_STREAM) + return READ_ONCE(unix_sk(sk)->inq_len); + spin_lock(&sk->sk_receive_queue.lock); - if (sk->sk_type == SOCK_STREAM || - sk->sk_type == SOCK_SEQPACKET) { + if (sk->sk_type == SOCK_SEQPACKET) { skb_queue_walk(&sk->sk_receive_queue, skb) amount += unix_skb_len(skb); } else { -- cgit From 8b77338eb2af74bb93986e4a8cfd86724168fe39 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:17 +0000 Subject: af_unix: Cache state->msg in unix_stream_read_generic(). In unix_stream_read_generic(), state->msg is fetched multiple times. Let's cache it in a local variable. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250702223606.1054680-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d9e604295a71..c3dd41596d89 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2840,20 +2840,21 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable) { - struct scm_cookie scm; + int noblock = state->flags & MSG_DONTWAIT; struct socket *sock = state->socket; + struct msghdr *msg = state->msg; struct sock *sk = sock->sk; - struct unix_sock *u = unix_sk(sk); - int copied = 0; + size_t size = state->size; int flags = state->flags; - int noblock = flags & MSG_DONTWAIT; bool check_creds = false; - int target; + struct scm_cookie scm; + unsigned int last_len; + struct unix_sock *u; + int copied = 0; int err = 0; long timeo; + int target; int skip; - size_t size = state->size; - unsigned int last_len; if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { err = -EINVAL; @@ -2873,6 +2874,8 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, memset(&scm, 0, sizeof(scm)); + u = unix_sk(sk); + /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ @@ -2964,14 +2967,12 @@ unlock: } /* Copy address just once */ - if (state->msg && state->msg->msg_name) { - DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, - state->msg->msg_name); - unix_copy_addr(state->msg, skb->sk); + if (msg && msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); - BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, - state->msg->msg_name, - &state->msg->msg_namelen); + unix_copy_addr(msg, skb->sk); + BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, msg->msg_name, + &msg->msg_namelen); sunaddr = NULL; } @@ -3033,8 +3034,8 @@ unlock: } while (size); mutex_unlock(&u->iolock); - if (state->msg) - scm_recv_unix(sock, state->msg, &scm, flags); + if (msg) + scm_recv_unix(sock, msg, &scm, flags); else scm_destroy(&scm); out: -- cgit From df30285b3670bf52e1e5512e4d4482bec5e93c16 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:18 +0000 Subject: af_unix: Introduce SO_INQ. We have an application that uses almost the same code for TCP and AF_UNIX (SOCK_STREAM). TCP can use TCP_INQ, but AF_UNIX doesn't have it and requires an extra syscall, ioctl(SIOCINQ) or getsockopt(SO_MEMINFO) as an alternative. Let's introduce the generic version of TCP_INQ. If SO_INQ is enabled, recvmsg() will put a cmsg of SCM_INQ that contains the exact value of ioctl(SIOCINQ). The cmsg is also included when msg->msg_get_inq is non-zero to make sockets io_uring-friendly. Note that SOCK_CUSTOM_SOCKOPT is flagged only for SOCK_STREAM to override setsockopt() for SOL_SOCKET. By having the flag in struct unix_sock, instead of struct sock, we can later add SO_INQ support for TCP and reuse tcp_sk(sk)->recvmsg_inq. Note also that supporting custom getsockopt() for SOL_SOCKET will need preparation for other SOCK_CUSTOM_SOCKOPT users (UDP, vsock, MPTCP). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250702223606.1054680-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c3dd41596d89..7a92733706fe 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -934,6 +934,52 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) #define unix_show_fdinfo NULL #endif +static bool unix_custom_sockopt(int optname) +{ + switch (optname) { + case SO_INQ: + return true; + default: + return false; + } +} + +static int unix_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct unix_sock *u = unix_sk(sock->sk); + struct sock *sk = sock->sk; + int val; + + if (level != SOL_SOCKET) + return -EOPNOTSUPP; + + if (!unix_custom_sockopt(optname)) + return sock_setsockopt(sock, level, optname, optval, optlen); + + if (optlen != sizeof(int)) + return -EINVAL; + + if (copy_from_sockptr(&val, optval, sizeof(val))) + return -EFAULT; + + switch (optname) { + case SO_INQ: + if (sk->sk_type != SOCK_STREAM) + return -EINVAL; + + if (val > 1 || val < 0) + return -EINVAL; + + WRITE_ONCE(u->recvmsg_inq, val); + break; + default: + return -ENOPROTOOPT; + } + + return 0; +} + static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, @@ -950,6 +996,7 @@ static const struct proto_ops unix_stream_ops = { #endif .listen = unix_listen, .shutdown = unix_shutdown, + .setsockopt = unix_setsockopt, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .read_skb = unix_stream_read_skb, @@ -1116,6 +1163,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, switch (sock->type) { case SOCK_STREAM: + set_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags); sock->ops = &unix_stream_ops; break; /* @@ -1847,6 +1895,9 @@ static int unix_accept(struct socket *sock, struct socket *newsock, skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); + if (tsk->sk_type == SOCK_STREAM) + set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); + /* attach accepted sock to socket */ unix_state_lock(tsk); unix_update_edges(unix_sk(tsk)); @@ -3034,10 +3085,17 @@ unlock: } while (size); mutex_unlock(&u->iolock); - if (msg) + if (msg) { scm_recv_unix(sock, msg, &scm, flags); - else + + if (READ_ONCE(u->recvmsg_inq) || msg->msg_get_inq) { + msg->msg_inq = READ_ONCE(u->inq_len); + put_cmsg(msg, SOL_SOCKET, SCM_INQ, + sizeof(msg->msg_inq), &msg->msg_inq); + } + } else { scm_destroy(&scm); + } out: return copied ? : err; } -- cgit