From 10bbf1652c1cca9819e98d56f3432c56d7a2d229 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2023 20:28:11 +0000 Subject: net: implement lockless SO_PRIORITY This is a followup of 8bf43be799d4 ("net: annotate data-races around sk->sk_priority"). sk->sk_priority can be read and written without holding the socket lock. Signed-off-by: Eric Dumazet Reviewed-by: Wenjia Zhang Signed-off-by: David S. Miller --- net/core/sock.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index a5995750c5c5..1fdc0a0d8ff2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -806,9 +806,7 @@ EXPORT_SYMBOL(sock_no_linger); void sock_set_priority(struct sock *sk, u32 priority) { - lock_sock(sk); WRITE_ONCE(sk->sk_priority, priority); - release_sock(sk); } EXPORT_SYMBOL(sock_set_priority); @@ -1118,6 +1116,18 @@ int sk_setsockopt(struct sock *sk, int level, int optname, valbool = val ? 1 : 0; + /* handle options which do not require locking the socket. */ + switch (optname) { + case SO_PRIORITY: + if ((val >= 0 && val <= 6) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + sock_set_priority(sk, val); + return 0; + } + return -EPERM; + } + sockopt_lock_sock(sk); switch (optname) { @@ -1213,15 +1223,6 @@ set_sndbuf: sk->sk_no_check_tx = valbool; break; - case SO_PRIORITY: - if ((val >= 0 && val <= 6) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - WRITE_ONCE(sk->sk_priority, val); - else - ret = -EPERM; - break; - case SO_LINGER: if (optlen < sizeof(ling)) { ret = -EINVAL; /* 1003.1g */ -- cgit From 8ebfb6db5a01f16cd37254bfad7145204e4bf6f2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2023 20:28:12 +0000 Subject: net: lockless SO_PASSCRED, SO_PASSPIDFD and SO_PASSSEC sock->flags are atomic, no need to hold the socket lock in sk_setsockopt() for SO_PASSCRED, SO_PASSPIDFD and SO_PASSSEC. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 1fdc0a0d8ff2..f01c75724568 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1126,6 +1126,15 @@ int sk_setsockopt(struct sock *sk, int level, int optname, return 0; } return -EPERM; + case SO_PASSSEC: + assign_bit(SOCK_PASSSEC, &sock->flags, valbool); + return 0; + case SO_PASSCRED: + assign_bit(SOCK_PASSCRED, &sock->flags, valbool); + return 0; + case SO_PASSPIDFD: + assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); + return 0; } sockopt_lock_sock(sk); @@ -1248,14 +1257,6 @@ set_sndbuf: case SO_BSDCOMPAT: break; - case SO_PASSCRED: - assign_bit(SOCK_PASSCRED, &sock->flags, valbool); - break; - - case SO_PASSPIDFD: - assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); - break; - case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: @@ -1361,9 +1362,6 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); break; - case SO_PASSSEC: - assign_bit(SOCK_PASSSEC, &sock->flags, valbool); - break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { -- cgit From b120251590a9c771bae353e444503fa49793c75e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2023 20:28:13 +0000 Subject: net: lockless SO_{TYPE|PROTOCOL|DOMAIN|ERROR } setsockopt() This options can not be set and return -ENOPROTOOPT, no need to acqure socket lock. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index f01c75724568..4d20b74a93cb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1135,6 +1135,11 @@ int sk_setsockopt(struct sock *sk, int level, int optname, case SO_PASSPIDFD: assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); return 0; + case SO_TYPE: + case SO_PROTOCOL: + case SO_DOMAIN: + case SO_ERROR: + return -ENOPROTOOPT; } sockopt_lock_sock(sk); @@ -1152,12 +1157,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname, case SO_REUSEPORT: sk->sk_reuseport = valbool; break; - case SO_TYPE: - case SO_PROTOCOL: - case SO_DOMAIN: - case SO_ERROR: - ret = -ENOPROTOOPT; - break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); sk_dst_reset(sk); -- cgit From 2a4319cf3c83fc5d1997466196b99b3e14584e76 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2023 20:28:14 +0000 Subject: net: lockless implementation of SO_BUSY_POLL, SO_PREFER_BUSY_POLL, SO_BUSY_POLL_BUDGET Setting sk->sk_ll_usec, sk_prefer_busy_poll and sk_busy_poll_budget do not require the socket lock, readers are lockless anyway. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 4d20b74a93cb..408081549bd7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1140,6 +1140,26 @@ int sk_setsockopt(struct sock *sk, int level, int optname, case SO_DOMAIN: case SO_ERROR: return -ENOPROTOOPT; +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: + if (val < 0) + return -EINVAL; + WRITE_ONCE(sk->sk_ll_usec, val); + return 0; + case SO_PREFER_BUSY_POLL: + if (valbool && !sockopt_capable(CAP_NET_ADMIN)) + return -EPERM; + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + return 0; + case SO_BUSY_POLL_BUDGET: + if (val > READ_ONCE(sk->sk_busy_poll_budget) && + !sockopt_capable(CAP_NET_ADMIN)) + return -EPERM; + if (val < 0 || val > U16_MAX) + return -EINVAL; + WRITE_ONCE(sk->sk_busy_poll_budget, val); + return 0; +#endif } sockopt_lock_sock(sk); @@ -1402,30 +1422,6 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; -#ifdef CONFIG_NET_RX_BUSY_POLL - case SO_BUSY_POLL: - if (val < 0) - ret = -EINVAL; - else - WRITE_ONCE(sk->sk_ll_usec, val); - break; - case SO_PREFER_BUSY_POLL: - if (valbool && !sockopt_capable(CAP_NET_ADMIN)) - ret = -EPERM; - else - WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); - break; - case SO_BUSY_POLL_BUDGET: - if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) { - ret = -EPERM; - } else { - if (val < 0 || val > U16_MAX) - ret = -EINVAL; - else - WRITE_ONCE(sk->sk_busy_poll_budget, val); - } - break; -#endif case SO_MAX_PACING_RATE: { -- cgit From 28b24f90020fed8e8e3e8e20575f08c1cd06e54f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2023 20:28:15 +0000 Subject: net: implement lockless SO_MAX_PACING_RATE SO_MAX_PACING_RATE setsockopt() does not need to hold the socket lock, because sk->sk_pacing_rate readers can run fine if the value is changed by other threads, after adding READ_ONCE() accessors. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 408081549bd7..4254ed0e4817 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1160,6 +1160,27 @@ int sk_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(sk->sk_busy_poll_budget, val); return 0; #endif + case SO_MAX_PACING_RATE: + { + unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; + unsigned long pacing_rate; + + if (sizeof(ulval) != sizeof(val) && + optlen >= sizeof(ulval) && + copy_from_sockptr(&ulval, optval, sizeof(ulval))) { + return -EFAULT; + } + if (ulval != ~0UL) + cmpxchg(&sk->sk_pacing_status, + SK_PACING_NONE, + SK_PACING_NEEDED); + /* Pairs with READ_ONCE() from sk_getsockopt() */ + WRITE_ONCE(sk->sk_max_pacing_rate, ulval); + pacing_rate = READ_ONCE(sk->sk_pacing_rate); + if (ulval < pacing_rate) + WRITE_ONCE(sk->sk_pacing_rate, ulval); + return 0; + } } sockopt_lock_sock(sk); @@ -1423,25 +1444,6 @@ set_sndbuf: break; - case SO_MAX_PACING_RATE: - { - unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; - - if (sizeof(ulval) != sizeof(val) && - optlen >= sizeof(ulval) && - copy_from_sockptr(&ulval, optval, sizeof(ulval))) { - ret = -EFAULT; - break; - } - if (ulval != ~0UL) - cmpxchg(&sk->sk_pacing_status, - SK_PACING_NONE, - SK_PACING_NEEDED); - /* Pairs with READ_ONCE() from sk_getsockopt() */ - WRITE_ONCE(sk->sk_max_pacing_rate, ulval); - sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); - break; - } case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); break; -- cgit From 5eef0b8de1be40c5d05873b7e3d63824300c9f39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2023 20:28:16 +0000 Subject: net: lockless implementation of SO_TXREHASH sk->sk_txrehash readers are already safe against concurrent change of this field. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 4254ed0e4817..f0930f858714 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1181,6 +1181,16 @@ int sk_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(sk->sk_pacing_rate, ulval); return 0; } + case SO_TXREHASH: + if (val < -1 || val > 1) + return -EINVAL; + if ((u8)val == SOCK_TXREHASH_DEFAULT) + val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); + /* Paired with READ_ONCE() in tcp_rtx_synack() + * and sk_getsockopt(). + */ + WRITE_ONCE(sk->sk_txrehash, (u8)val); + return 0; } sockopt_lock_sock(sk); @@ -1528,19 +1538,6 @@ set_sndbuf: break; } - case SO_TXREHASH: - if (val < -1 || val > 1) { - ret = -EINVAL; - break; - } - if ((u8)val == SOCK_TXREHASH_DEFAULT) - val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); - /* Paired with READ_ONCE() in tcp_rtx_synack() - * and sk_getsockopt(). - */ - WRITE_ONCE(sk->sk_txrehash, (u8)val); - break; - default: ret = -ENOPROTOOPT; break; -- cgit From eb44ad4e635132754bfbcb18103f1dcb7058aedd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2023 20:28:18 +0000 Subject: net: annotate data-races around sk->sk_dst_pending_confirm This field can be read or written without socket lock being held. Add annotations to avoid load-store tearing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index f0930f858714..290165954379 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -600,7 +600,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_tx_queue_clear(sk); - sk->sk_dst_pending_confirm = 0; + WRITE_ONCE(sk->sk_dst_pending_confirm, 0); RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; -- cgit