author     Linus Torvalds <torvalds@linux-foundation.org>  2025-07-30 08:58:55 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2025-07-30 08:58:55 -0700
commit     8be4d31cb8aaeea27bde4b7ddb26e28a89062ebf (patch)
tree       fec3039a08284cd87f4ec9c3bea5b5a439f1859f  /net/unix/af_unix.c
parent     4b290aae788e06561754b28c6842e4080957d3f7 (diff)
parent     fa582ca7e187a15e772e6a72fe035f649b387a60 (diff)
Merge tag 'net-next-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski:
 "Core & protocols:

   - Wrap datapath globals into net_aligned_data, to avoid false sharing
   - Preserve MSG_ZEROCOPY in forwarding (e.g. out of a container)
   - Add SO_INQ and SCM_INQ support to AF_UNIX
   - Add SIOCINQ support to AF_VSOCK
   - Add TCP_MAXSEG sockopt to MPTCP
   - Add IPv6 force_forwarding sysctl to enable forwarding per interface
   - Make TCP validation of whether packet fully fits in the receive window and the rcv_buf more strict. With increased use of HW aggregation a single "packet" can be multiple 100s of kB
   - Add MSG_MORE flag to optimize large TCP transmissions via sockmap, improves latency up to 33% for sockmap users
   - Convert TCP send queue handling from tasklet to BH workqueue
   - Improve BPF iteration over TCP sockets to see each socket exactly once
   - Remove obsolete and unused TCP RFC3517/RFC6675 loss recovery code
   - Support enabling kernel threads for NAPI processing on per-NAPI instance basis rather than a whole device. Fully stop the kernel NAPI thread when threaded NAPI gets disabled. Previously thread would stick around until ifdown due to tricky synchronization
   - Allow multicast routing to take effect on locally-generated packets
   - Add output interface argument for End.X in segment routing
   - MCTP: add support for gateway routing, improve bind() handling
   - Don't require rtnl_lock when fetching an IPv6 neighbor over Netlink
   - Add a new neighbor flag ("extern_valid"), which cedes refresh responsibilities to userspace. This is needed for EVPN multi-homing where a neighbor entry for a multi-homed host needs to be synced across all the VTEPs among which the host is multi-homed
   - Support NUD_PERMANENT for proxy neighbor entries
   - Add a new queuing discipline for IETF RFC9332 DualQ Coupled AQM
   - Add sequence numbers to netconsole messages. Unregister netconsole's console when all net targets are removed. Code refactoring. Add a number of selftests
   - Align IPSec inbound SA lookup to RFC 4301. Only SPI and protocol should be used for an inbound SA lookup
   - Support inspecting ref_tracker state via DebugFS
   - Don't force bonding advertisement frames tx to ~333 ms boundaries. Add broadcast_neighbor option to send ARP/ND on all bonded links
   - Allow providing upcall pid for the 'execute' command in openvswitch
   - Remove DCCP support from Netfilter's conntrack
   - Disallow multiple packet duplications in the queuing layer
   - Prevent use of deprecated iptables code on PREEMPT_RT

  Driver API:

   - Support RSS and hashing configuration over ethtool Netlink
   - Add dedicated ethtool callbacks for getting and setting hashing fields
   - Add support for power budget evaluation strategy in PSE / Power-over-Ethernet. Generate Netlink events for overcurrent etc
   - Support DPLL phase offset monitoring across all device inputs. Support providing clock reference and SYNC over separate DPLL inputs
   - Support traffic classes in devlink rate API for bandwidth management
   - Remove rtnl_lock dependency from UDP tunnel port configuration

  Device drivers:

   - Add a new Broadcom driver for 800G Ethernet (bnge)
   - Add a standalone driver for Microchip ZL3073x DPLL
   - Remove IBM's NETIUCV device driver
   - Ethernet high-speed NICs:
      - Broadcom (bnxt):
         - support zero-copy Tx of DMABUF memory
         - take page size into account for page pool recycling rings
      - Intel (100G, ice, idpf):
         - idpf: XDP and AF_XDP support preparations
         - idpf: add flow steering
         - add link_down_events statistic
         - clean up the TSPLL code
         - preparations for live VM migration
      - nVidia/Mellanox:
         - support zero-copy Rx/Tx interfaces (DMABUF and io_uring)
         - optimize context memory usage for matchers
         - expose serial numbers in devlink info
         - support PCIe congestion metrics
      - Meta (fbnic):
         - add 25G, 50G, and 100G link modes to phylink
         - support dumping FW logs
      - Marvell/Cavium:
         - support for CN20K generation of the Octeon chips
      - Amazon:
         - add HW clock (without timestamping, just hypervisor time access)
   - Ethernet virtual:
      - VirtIO net:
         - support segmentation of UDP-tunnel-encapsulated packets
      - Google (gve):
         - support packet timestamping and clock synchronization
      - Microsoft vNIC:
         - add handler for device-originated servicing events
         - allow dynamic MSI-X vector allocation
         - support Tx bandwidth clamping
   - Ethernet NICs consumer, and embedded:
      - AMD:
         - amd-xgbe: hardware timestamping and PTP clock support
      - Broadcom integrated MACs (bcmgenet, bcmasp):
         - use napi_complete_done() return value to support NAPI polling
         - add support for re-starting auto-negotiation
      - Broadcom switches (b53):
         - support BCM5325 switches
         - add bcm63xx EPHY power control
      - Synopsys (stmmac):
         - lots of code refactoring and cleanups
      - TI:
         - icssg-prueth: read firmware-names from device tree
         - icssg: PRP offload support
      - Microchip:
         - lan78xx: convert to PHYLINK for improved PHY and MAC management
         - ksz: add KSZ8463 switch support
      - Intel:
         - support similar queue priority scheme in multi-queue and time-sensitive networking (taprio)
         - support packet pre-emption in both
      - RealTek (r8169):
         - enable EEE at 5Gbps on RTL8126
      - Airoha:
         - add PPPoE offload support
         - MDIO bus controller for Airoha AN7583
   - Ethernet PHYs:
      - support for the IPQ5018 internal GE PHY
      - micrel KSZ9477 switch-integrated PHYs:
         - add MDI/MDI-X control support
         - add RX error counters
         - add cable test support
         - add Signal Quality Indicator (SQI) reporting
      - dp83tg720: improve reset handling and reduce link recovery time
      - support bcm54811 (and its MII-Lite interface type)
      - air_en8811h: support resume/suspend
      - support PHY counters for QCA807x and QCA808x
      - support WoL for QCA807x
   - CAN drivers:
      - rcar_canfd: support for Transceiver Delay Compensation
      - kvaser: report FW versions via devlink dev info
   - WiFi:
      - extended regulatory info support (6 GHz)
      - add statistics and beacon monitor for Multi-Link Operation (MLO)
      - support S1G aggregation, improve S1G support
      - add Radio Measurement action fields
      - support per-radio RTS threshold
      - some work around how FIPS affects wifi, which was wrong (RC4 is used by TKIP, not only WEP)
      - improvements for unsolicited probe response handling
   - WiFi drivers:
      - RealTek (rtw88):
         - IBSS mode for SDIO devices
      - RealTek (rtw89):
         - BT coexistence for MLO/WiFi7
         - concurrent station + P2P support
         - support for USB devices RTL8851BU/RTL8852BU
      - Intel (iwlwifi):
         - use embedded PNVM in (to be released) FW images to fix compatibility issues
         - many cleanups (unused FW APIs, PCIe code, WoWLAN)
         - some FIPS interoperability
      - MediaTek (mt76):
         - firmware recovery improvements
         - more MLO work
      - Qualcomm/Atheros (ath12k):
         - fix scan on multi-radio devices
         - more EHT/Wi-Fi 7 features
         - encapsulation/decapsulation offload
      - Broadcom (brcm80211):
         - support SDIO 43751 device
   - Bluetooth:
      - hci_event: add support for handling LE BIG Sync Lost event
      - ISO: add socket option to report packet seqnum via CMSG
      - ISO: support SCM_TIMESTAMPING for ISO TS
   - Bluetooth drivers:
      - intel_pcie: support Function Level Reset
      - nxpuart: add support for 4M baudrate
      - nxpuart: implement powerup sequence, reset, FW dump, and FW loading"

* tag 'net-next-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1742 commits)
  dpll: zl3073x: Fix build failure
  selftests: bpf: fix legacy netfilter options
  ipv6: annotate data-races around rt->fib6_nsiblings
  ipv6: fix possible infinite loop in fib6_info_uses_dev()
  ipv6: prevent infinite loop in rt6_nlmsg_size()
  ipv6: add a retry logic in net6_rt_notify()
  vrf: Drop existing dst reference in vrf_ip6_input_dst
  net/sched: taprio: align entry index attr validation with mqprio
  net: fsl_pq_mdio: use dev_err_probe
  selftests: rtnetlink.sh: remove esp4_offload after test
  vsock: remove unnecessary null check in vsock_getname()
  igb: xsk: solve negative overflow of nb_pkts in zerocopy mode
  stmmac: xsk: fix negative overflow of budget in zerocopy mode
  dt-bindings: ieee802154: Convert at86rf230.txt yaml format
  net: dsa: microchip: Disable PTP function of KSZ8463
  net: dsa: microchip: Setup fiber ports for KSZ8463
  net: dsa: microchip: Write switch MAC address differently for KSZ8463
  net: dsa: microchip: Use different registers for KSZ8463
  net: dsa: microchip: Add KSZ8463 switch support to KSZ DSA driver
  dt-bindings: net: dsa: microchip: Add KSZ8463 switch support
  ...
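For orientation, the AF_UNIX portion of this pull (SO_INQ / SCM_INQ, implemented in the diff below) is consumed from userspace roughly as in the following sketch. This is illustrative only, not taken from the commit: the helper name read_with_inq() is made up, and the SO_INQ / SCM_INQ constants must come from kernel/libc socket headers new enough to include this merge.

/*
 * Minimal userspace sketch: enable SO_INQ on a connected AF_UNIX
 * SOCK_STREAM socket, then read the SCM_INQ control message that
 * recvmsg() attaches, reporting the bytes still queued after the read.
 * Assumes SO_INQ and SCM_INQ are defined by the installed headers.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static int read_with_inq(int fd)
{
	char data[256];
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctrl;
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = ctrl.buf,
		.msg_controllen = sizeof(ctrl.buf),
	};
	int one = 1, inq = -1;
	struct cmsghdr *cm;
	ssize_t n;

	/* Ask the kernel to report queued bytes on every recvmsg(). */
	if (setsockopt(fd, SOL_SOCKET, SO_INQ, &one, sizeof(one)) < 0)
		return -1;

	n = recvmsg(fd, &msg, 0);
	if (n < 0)
		return -1;

	/* Walk the control messages and pick out SCM_INQ, if present. */
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
		if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_INQ)
			memcpy(&inq, CMSG_DATA(cm), sizeof(inq));

	printf("read %zd bytes, %d still queued\n", n, inq);
	return 0;
}

Per the setsockopt path in the diff, SO_INQ is only accepted on SOCK_STREAM AF_UNIX sockets and only with a 0/1 value; other socket types get -EINVAL.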
Diffstat (limited to 'net/unix/af_unix.c')
-rw-r--r--  net/unix/af_unix.c | 185
1 file changed, 127 insertions(+), 58 deletions(-)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index a8895786e016..6d7c110814ff 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -929,6 +929,52 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
#define unix_show_fdinfo NULL
#endif
+static bool unix_custom_sockopt(int optname)
+{
+ switch (optname) {
+ case SO_INQ:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int unix_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct unix_sock *u = unix_sk(sock->sk);
+ struct sock *sk = sock->sk;
+ int val;
+
+ if (level != SOL_SOCKET)
+ return -EOPNOTSUPP;
+
+ if (!unix_custom_sockopt(optname))
+ return sock_setsockopt(sock, level, optname, optval, optlen);
+
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ switch (optname) {
+ case SO_INQ:
+ if (sk->sk_type != SOCK_STREAM)
+ return -EINVAL;
+
+ if (val > 1 || val < 0)
+ return -EINVAL;
+
+ WRITE_ONCE(u->recvmsg_inq, val);
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ return 0;
+}
+
static const struct proto_ops unix_stream_ops = {
.family = PF_UNIX,
.owner = THIS_MODULE,
@@ -945,6 +991,7 @@ static const struct proto_ops unix_stream_ops = {
#endif
.listen = unix_listen,
.shutdown = unix_shutdown,
+ .setsockopt = unix_setsockopt,
.sendmsg = unix_stream_sendmsg,
.recvmsg = unix_stream_recvmsg,
.read_skb = unix_stream_read_skb,
@@ -1111,6 +1158,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol,
switch (sock->type) {
case SOCK_STREAM:
+ set_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags);
sock->ops = &unix_stream_ops;
break;
/*
@@ -1842,6 +1890,9 @@ static int unix_accept(struct socket *sock, struct socket *newsock,
skb_free_datagram(sk, skb);
wake_up_interruptible(&unix_sk(sk)->peer_wait);
+ if (tsk->sk_type == SOCK_STREAM)
+ set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
+
/* attach accepted sock to socket */
unix_state_lock(tsk);
unix_update_edges(unix_sk(tsk));
@@ -2322,6 +2373,7 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other,
spin_lock(&other->sk_receive_queue.lock);
WRITE_ONCE(ousk->oob_skb, skb);
+ WRITE_ONCE(ousk->inq_len, ousk->inq_len + 1);
__skb_queue_tail(&other->sk_receive_queue, skb);
spin_unlock(&other->sk_receive_queue.lock);
@@ -2344,6 +2396,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
struct sock *sk = sock->sk;
struct sk_buff *skb = NULL;
struct sock *other = NULL;
+ struct unix_sock *otheru;
struct scm_cookie scm;
bool fds_sent = false;
int err, sent = 0;
@@ -2367,14 +2420,16 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
if (msg->msg_namelen) {
err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
goto out_err;
- } else {
- other = unix_peer(sk);
- if (!other) {
- err = -ENOTCONN;
- goto out_err;
- }
}
+ other = unix_peer(sk);
+ if (!other) {
+ err = -ENOTCONN;
+ goto out_err;
+ }
+
+ otheru = unix_sk(other);
+
if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
goto out_pipe;
@@ -2417,8 +2472,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
- err = skb_splice_from_iter(skb, &msg->msg_iter, size,
- sk->sk_allocation);
+ err = skb_splice_from_iter(skb, &msg->msg_iter, size);
if (err < 0)
goto out_free;
@@ -2446,7 +2500,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
}
scm_stat_add(other, skb);
- skb_queue_tail(&other->sk_receive_queue, skb);
+
+ spin_lock(&other->sk_receive_queue.lock);
+ WRITE_ONCE(otheru->inq_len, otheru->inq_len + skb->len);
+ __skb_queue_tail(&other->sk_receive_queue, skb);
+ spin_unlock(&other->sk_receive_queue.lock);
+
unix_state_unlock(other);
other->sk_data_ready(other);
sent += size;
@@ -2556,12 +2615,10 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
&err, &timeo, last));
if (!skb) { /* implies iolock unlocked */
- unix_state_lock(sk);
/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
- (sk->sk_shutdown & RCV_SHUTDOWN))
+ (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN))
err = 0;
- unix_state_unlock(sk);
goto out;
}
@@ -2734,6 +2791,7 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state)
if (!(state->flags & MSG_PEEK)) {
WRITE_ONCE(u->oob_skb, NULL);
+ WRITE_ONCE(u->inq_len, u->inq_len - 1);
if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue &&
!unix_skb_len(oob_skb->prev)) {
@@ -2816,6 +2874,7 @@ unlock:
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
+ struct sk_buff_head *queue = &sk->sk_receive_queue;
struct unix_sock *u = unix_sk(sk);
struct sk_buff *skb;
int err;
@@ -2823,60 +2882,57 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
return -ENOTCONN;
- mutex_lock(&u->iolock);
- skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
- mutex_unlock(&u->iolock);
- if (!skb)
+ err = sock_error(sk);
+ if (err)
return err;
-#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
- if (unlikely(skb == READ_ONCE(u->oob_skb))) {
- bool drop = false;
-
- unix_state_lock(sk);
+ mutex_lock(&u->iolock);
+ spin_lock(&queue->lock);
- if (sock_flag(sk, SOCK_DEAD)) {
- unix_state_unlock(sk);
- kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
- return -ECONNRESET;
- }
+ skb = __skb_dequeue(queue);
+ if (!skb) {
+ spin_unlock(&queue->lock);
+ mutex_unlock(&u->iolock);
+ return -EAGAIN;
+ }
- spin_lock(&sk->sk_receive_queue.lock);
- if (likely(skb == u->oob_skb)) {
- WRITE_ONCE(u->oob_skb, NULL);
- drop = true;
- }
- spin_unlock(&sk->sk_receive_queue.lock);
+ WRITE_ONCE(u->inq_len, u->inq_len - skb->len);
- unix_state_unlock(sk);
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+ if (skb == u->oob_skb) {
+ WRITE_ONCE(u->oob_skb, NULL);
+ spin_unlock(&queue->lock);
+ mutex_unlock(&u->iolock);
- if (drop) {
- kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
- return -EAGAIN;
- }
+ kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
+ return -EAGAIN;
}
#endif
+ spin_unlock(&queue->lock);
+ mutex_unlock(&u->iolock);
+
return recv_actor(sk, skb);
}
static int unix_stream_read_generic(struct unix_stream_read_state *state,
bool freezable)
{
- struct scm_cookie scm;
+ int noblock = state->flags & MSG_DONTWAIT;
struct socket *sock = state->socket;
+ struct msghdr *msg = state->msg;
struct sock *sk = sock->sk;
- struct unix_sock *u = unix_sk(sk);
- int copied = 0;
+ size_t size = state->size;
int flags = state->flags;
- int noblock = flags & MSG_DONTWAIT;
bool check_creds = false;
- int target;
+ struct scm_cookie scm;
+ unsigned int last_len;
+ struct unix_sock *u;
+ int copied = 0;
int err = 0;
long timeo;
+ int target;
int skip;
- size_t size = state->size;
- unsigned int last_len;
if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
err = -EINVAL;
@@ -2896,6 +2952,8 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
memset(&scm, 0, sizeof(scm));
+ u = unix_sk(sk);
+
/* Lock the socket to prevent queue disordering
* while sleeps in memcpy_tomsg
*/
@@ -2986,14 +3044,12 @@ unlock:
}
/* Copy address just once */
- if (state->msg && state->msg->msg_name) {
- DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
- state->msg->msg_name);
- unix_copy_addr(state->msg, skb->sk);
+ if (msg && msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
- BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
- state->msg->msg_name,
- &state->msg->msg_namelen);
+ unix_copy_addr(msg, skb->sk);
+ BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, msg->msg_name,
+ &msg->msg_namelen);
sunaddr = NULL;
}
@@ -3022,7 +3078,11 @@ unlock:
if (unix_skb_len(skb))
break;
- skb_unlink(skb, &sk->sk_receive_queue);
+ spin_lock(&sk->sk_receive_queue.lock);
+ WRITE_ONCE(u->inq_len, u->inq_len - skb->len);
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ spin_unlock(&sk->sk_receive_queue.lock);
+
consume_skb(skb);
if (scm.fp)
@@ -3051,10 +3111,17 @@ unlock:
} while (size);
mutex_unlock(&u->iolock);
- if (state->msg)
- scm_recv_unix(sock, state->msg, &scm, flags);
- else
+ if (msg) {
+ scm_recv_unix(sock, msg, &scm, flags);
+
+ if (READ_ONCE(u->recvmsg_inq) || msg->msg_get_inq) {
+ msg->msg_inq = READ_ONCE(u->inq_len);
+ put_cmsg(msg, SOL_SOCKET, SCM_INQ,
+ sizeof(msg->msg_inq), &msg->msg_inq);
+ }
+ } else {
scm_destroy(&scm);
+ }
out:
return copied ? : err;
}
@@ -3193,9 +3260,11 @@ long unix_inq_len(struct sock *sk)
if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
return -EINVAL;
+ if (sk->sk_type == SOCK_STREAM)
+ return READ_ONCE(unix_sk(sk)->inq_len);
+
spin_lock(&sk->sk_receive_queue.lock);
- if (sk->sk_type == SOCK_STREAM ||
- sk->sk_type == SOCK_SEQPACKET) {
+ if (sk->sk_type == SOCK_SEQPACKET) {
skb_queue_walk(&sk->sk_receive_queue, skb)
amount += unix_skb_len(skb);
} else {
@@ -3715,7 +3784,7 @@ static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
goto unlock;
}
- uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+ uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
ret = unix_prog_seq_show(prog, &meta, v, uid);
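
Usage note on the unix_inq_len() hunk above: for SOCK_STREAM sockets SIOCINQ is now answered from the cached inq_len counter instead of walking the receive queue. A minimal, hypothetical userspace sketch of that query (the helper name is made up; SIOCINQ has long been available for AF_UNIX via unix_ioctl()):

/*
 * Sketch only: ask how many bytes are queued for reading on an
 * AF_UNIX stream socket.  SIOCINQ is an alias for FIONREAD and the
 * kernel writes the answer into an int.
 */
#include <sys/ioctl.h>
#include <linux/sockios.h>	/* SIOCINQ */

static long pending_bytes(int fd)
{
	int queued = 0;

	if (ioctl(fd, SIOCINQ, &queued) < 0)
		return -1;

	return queued;
}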