From 4d748f9916076399f01c259d30fe1b88abe8f622 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:17:11 -0700 Subject: net: Add sk_setsockopt() to take the sk ptr instead of the sock ptr A latter patch refactors bpf_setsockopt(SOL_SOCKET) with the sock_setsockopt() to avoid code duplication and code drift between the two duplicates. The current sock_setsockopt() takes sock ptr as the argument. The very first thing of this function is to get back the sk ptr by 'sk = sock->sk'. bpf_setsockopt() could be called when the sk does not have the sock ptr created. Meaning sk->sk_socket is NULL. For example, when a passive tcp connection has just been established but has yet been accept()-ed. Thus, it cannot use the sock_setsockopt(sk->sk_socket) or else it will pass a NULL ptr. This patch moves all sock_setsockopt implementation to the newly added sk_setsockopt(). The new sk_setsockopt() takes a sk ptr and immediately gets the sock ptr by 'sock = sk->sk_socket' The existing sock_setsockopt(sock) is changed to call sk_setsockopt(sock->sk). All existing callers have both sock->sk and sk->sk_socket pointer. The latter patch will make bpf_setsockopt(SOL_SOCKET) call sk_setsockopt(sk) directly. The bpf_setsockopt(SOL_SOCKET) does not use the optnames that require sk->sk_socket, so it will be safe. Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061711.4175048-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/sock.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 4cb957d934a2..20269c37ab3b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1041,12 +1041,12 @@ static int sock_reserve_memory(struct sock *sk, int bytes) * at the socket level. Everything here is generic. */ -int sock_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen) +static int sk_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct so_timestamping timestamping; + struct socket *sock = sk->sk_socket; struct sock_txtime sk_txtime; - struct sock *sk = sock->sk; int val; int valbool; struct linger ling; @@ -1499,6 +1499,13 @@ set_sndbuf: release_sock(sk); return ret; } + +int sock_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + return sk_setsockopt(sock->sk, level, optname, + optval, optlen); +} EXPORT_SYMBOL(sock_setsockopt); static const struct cred *sk_get_peer_cred(struct sock *sk) -- cgit From 24426654ed3ae83d1127511891fb782c54f49203 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:17:17 -0700 Subject: bpf: net: Avoid sk_setsockopt() taking sk lock when called from bpf Most of the code in bpf_setsockopt(SOL_SOCKET) are duplicated from the sk_setsockopt(). The number of supported optnames are increasing ever and so as the duplicated code. One issue in reusing sk_setsockopt() is that the bpf prog has already acquired the sk lock. This patch adds a has_current_bpf_ctx() to tell if the sk_setsockopt() is called from a bpf prog. The bpf prog calling bpf_setsockopt() is either running in_task() or in_serving_softirq(). Both cases have the current->bpf_ctx initialized. Thus, the has_current_bpf_ctx() only needs to test !!current->bpf_ctx. This patch also adds sockopt_{lock,release}_sock() helpers for sk_setsockopt() to use. These helpers will test has_current_bpf_ctx() before acquiring/releasing the lock. They are in EXPORT_SYMBOL for the ipv6 module to use in a latter patch. Note on the change in sock_setbindtodevice(). sockopt_lock_sock() is done in sock_setbindtodevice() instead of doing the lock_sock in sock_bindtoindex(..., lock_sk = true). Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061717.4175589-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/sock.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 20269c37ab3b..d3683228376f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -703,7 +703,9 @@ static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) goto out; } - return sock_bindtoindex(sk, index, true); + sockopt_lock_sock(sk); + ret = sock_bindtoindex_locked(sk, index); + sockopt_release_sock(sk); out: #endif @@ -1036,6 +1038,28 @@ static int sock_reserve_memory(struct sock *sk, int bytes) return 0; } +void sockopt_lock_sock(struct sock *sk) +{ + /* When current->bpf_ctx is set, the setsockopt is called from + * a bpf prog. bpf has ensured the sk lock has been + * acquired before calling setsockopt(). + */ + if (has_current_bpf_ctx()) + return; + + lock_sock(sk); +} +EXPORT_SYMBOL(sockopt_lock_sock); + +void sockopt_release_sock(struct sock *sk) +{ + if (has_current_bpf_ctx()) + return; + + release_sock(sk); +} +EXPORT_SYMBOL(sockopt_release_sock); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -1067,7 +1091,7 @@ static int sk_setsockopt(struct sock *sk, int level, int optname, valbool = val ? 1 : 0; - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { case SO_DEBUG: @@ -1496,7 +1520,7 @@ set_sndbuf: ret = -ENOPROTOOPT; break; } - release_sock(sk); + sockopt_release_sock(sk); return ret; } -- cgit From e42c7beee71d0d84a6193357e3525d0cf2a3e168 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:17:23 -0700 Subject: bpf: net: Consider has_current_bpf_ctx() when testing capable() in sk_setsockopt() When bpf program calling bpf_setsockopt(SOL_SOCKET), it could be run in softirq and doesn't make sense to do the capable check. There was a similar situation in bpf_setsockopt(TCP_CONGESTION). In commit 8d650cdedaab ("tcp: fix tcp_set_congestion_control() use from bpf hook"), tcp_set_congestion_control(..., cap_net_admin) was added to skip the cap check for bpf prog. This patch adds sockopt_ns_capable() and sockopt_capable() for the sk_setsockopt() to use. They will consider the has_current_bpf_ctx() before doing the ns_capable() and capable() test. They are in EXPORT_SYMBOL for the ipv6 module to use in a latter patch. Suggested-by: Stanislav Fomichev Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061723.4175820-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/sock.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index d3683228376f..7ea46e4700fd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1060,6 +1060,18 @@ void sockopt_release_sock(struct sock *sk) } EXPORT_SYMBOL(sockopt_release_sock); +bool sockopt_ns_capable(struct user_namespace *ns, int cap) +{ + return has_current_bpf_ctx() || ns_capable(ns, cap); +} +EXPORT_SYMBOL(sockopt_ns_capable); + +bool sockopt_capable(int cap) +{ + return has_current_bpf_ctx() || capable(cap); +} +EXPORT_SYMBOL(sockopt_capable); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -1095,7 +1107,7 @@ static int sk_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case SO_DEBUG: - if (val && !capable(CAP_NET_ADMIN)) + if (val && !sockopt_capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); @@ -1139,7 +1151,7 @@ set_sndbuf: break; case SO_SNDBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1161,7 +1173,7 @@ set_sndbuf: break; case SO_RCVBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1188,8 +1200,8 @@ set_sndbuf: case SO_PRIORITY: if ((val >= 0 && val <= 6) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) sk->sk_priority = val; else ret = -EPERM; @@ -1334,8 +1346,8 @@ set_sndbuf: clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1343,8 +1355,8 @@ set_sndbuf: __sock_set_mark(sk, val); break; case SO_RCVMARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1378,7 +1390,7 @@ set_sndbuf: #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: /* allow unprivileged users to decrease the value */ - if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) + if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN)) ret = -EPERM; else { if (val < 0) @@ -1388,13 +1400,13 @@ set_sndbuf: } break; case SO_PREFER_BUSY_POLL: - if (valbool && !capable(CAP_NET_ADMIN)) + if (valbool && !sockopt_capable(CAP_NET_ADMIN)) ret = -EPERM; else WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); break; case SO_BUSY_POLL_BUDGET: - if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; } else { if (val < 0 || val > U16_MAX) @@ -1465,7 +1477,7 @@ set_sndbuf: * scheduler has enough safe guards. */ if (sk_txtime.clockid != CLOCK_MONOTONIC && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } -- cgit From ebf9e8e653667e834372e9435217a7bf33bec7a0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:17:58 -0700 Subject: bpf: Embed kernel CONFIG check into the if statement in bpf_setsockopt This patch moves the "#ifdef CONFIG_XXX" check into the "if/else" statement itself. The change is done for the bpf_setsockopt() function only. It will make the latter patches easier to follow without the surrounding ifdef macro. Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061758.4178374-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index e8508aaafd27..a663d7b96bad 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5116,8 +5116,7 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, default: ret = -EINVAL; } -#ifdef CONFIG_INET - } else if (level == SOL_IP) { + } else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) { if (optlen != sizeof(int) || sk->sk_family != AF_INET) return -EINVAL; @@ -5138,8 +5137,7 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, default: ret = -EINVAL; } -#if IS_ENABLED(CONFIG_IPV6) - } else if (level == SOL_IPV6) { + } else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) { if (optlen != sizeof(int) || sk->sk_family != AF_INET6) return -EINVAL; @@ -5160,8 +5158,7 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, default: ret = -EINVAL; } -#endif - } else if (level == SOL_TCP && + } else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; @@ -5253,7 +5250,6 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, ret = -EINVAL; } } -#endif } else { ret = -EINVAL; } -- cgit From 29003875bd5bab262a29d1c6e76a2124bd07e4c2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:18:04 -0700 Subject: bpf: Change bpf_setsockopt(SOL_SOCKET) to reuse sk_setsockopt() After the prep work in the previous patches, this patch removes most of the dup code from bpf_setsockopt(SOL_SOCKET) and reuses them from sk_setsockopt(). The sock ptr test is added to the SO_RCVLOWAT because the sk->sk_socket could be NULL in some of the bpf hooks. The existing optname white-list is refactored into a new function sol_socket_setsockopt(). Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061804.4178920-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 124 +++++++++++++----------------------------------------- net/core/sock.c | 6 +-- 2 files changed, 32 insertions(+), 98 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index a663d7b96bad..6f5bcc8df487 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5013,109 +5013,43 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static int sol_socket_setsockopt(struct sock *sk, int optname, + char *optval, int optlen) +{ + switch (optname) { + case SO_SNDBUF: + case SO_RCVBUF: + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_REUSEPORT: + case SO_RCVLOWAT: + case SO_MARK: + case SO_MAX_PACING_RATE: + case SO_BINDTOIFINDEX: + case SO_TXREHASH: + if (optlen != sizeof(int)) + return -EINVAL; + break; + case SO_BINDTODEVICE: + break; + default: + return -EINVAL; + } + + return sk_setsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), optlen); +} + static int __bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { - char devname[IFNAMSIZ]; - int val, valbool; - struct net *net; - int ifindex; - int ret = 0; + int val, ret = 0; if (!sk_fullsock(sk)) return -EINVAL; if (level == SOL_SOCKET) { - if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) - return -EINVAL; - val = *((int *)optval); - valbool = val ? 1 : 0; - - /* Only some socketops are supported */ - switch (optname) { - case SO_RCVBUF: - val = min_t(u32, val, sysctl_rmem_max); - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - WRITE_ONCE(sk->sk_rcvbuf, - max_t(int, val * 2, SOCK_MIN_RCVBUF)); - break; - case SO_SNDBUF: - val = min_t(u32, val, sysctl_wmem_max); - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - WRITE_ONCE(sk->sk_sndbuf, - max_t(int, val * 2, SOCK_MIN_SNDBUF)); - break; - case SO_MAX_PACING_RATE: /* 32bit version */ - if (val != ~0U) - cmpxchg(&sk->sk_pacing_status, - SK_PACING_NONE, - SK_PACING_NEEDED); - sk->sk_max_pacing_rate = (val == ~0U) ? - ~0UL : (unsigned int)val; - sk->sk_pacing_rate = min(sk->sk_pacing_rate, - sk->sk_max_pacing_rate); - break; - case SO_PRIORITY: - sk->sk_priority = val; - break; - case SO_RCVLOWAT: - if (val < 0) - val = INT_MAX; - if (sk->sk_socket && sk->sk_socket->ops->set_rcvlowat) - ret = sk->sk_socket->ops->set_rcvlowat(sk, val); - else - WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); - break; - case SO_MARK: - if (sk->sk_mark != val) { - sk->sk_mark = val; - sk_dst_reset(sk); - } - break; - case SO_BINDTODEVICE: - optlen = min_t(long, optlen, IFNAMSIZ - 1); - strncpy(devname, optval, optlen); - devname[optlen] = 0; - - ifindex = 0; - if (devname[0] != '\0') { - struct net_device *dev; - - ret = -ENODEV; - - net = sock_net(sk); - dev = dev_get_by_name(net, devname); - if (!dev) - break; - ifindex = dev->ifindex; - dev_put(dev); - } - fallthrough; - case SO_BINDTOIFINDEX: - if (optname == SO_BINDTOIFINDEX) - ifindex = val; - ret = sock_bindtoindex(sk, ifindex, false); - break; - case SO_KEEPALIVE: - if (sk->sk_prot->keepalive) - sk->sk_prot->keepalive(sk, valbool); - sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); - break; - case SO_REUSEPORT: - sk->sk_reuseport = valbool; - break; - case SO_TXREHASH: - if (val < -1 || val > 1) { - ret = -EINVAL; - break; - } - sk->sk_txrehash = (u8)val; - break; - default: - ret = -EINVAL; - } + return sol_socket_setsockopt(sk, optname, optval, optlen); } else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) { if (optlen != sizeof(int) || sk->sk_family != AF_INET) return -EINVAL; diff --git a/net/core/sock.c b/net/core/sock.c index 7ea46e4700fd..2a6f84702eb9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1077,8 +1077,8 @@ EXPORT_SYMBOL(sockopt_capable); * at the socket level. Everything here is generic. */ -static int sk_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) +int sk_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct so_timestamping timestamping; struct socket *sock = sk->sk_socket; @@ -1264,7 +1264,7 @@ set_sndbuf: case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - if (sock->ops->set_rcvlowat) + if (sock && sock->ops->set_rcvlowat) ret = sock->ops->set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); -- cgit From 57db31a1a3adf2a4bd42528c4ac41fb50d6be59a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:18:12 -0700 Subject: bpf: Refactor bpf specific tcp optnames to a new function The patch moves all bpf specific tcp optnames (TCP_BPF_XXX) to a new function bpf_sol_tcp_setsockopt(). This will make the next patch easier to follow. Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061812.4179645-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 79 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 29 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 6f5bcc8df487..bb135d456a53 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5040,6 +5040,52 @@ static int sol_socket_setsockopt(struct sock *sk, int optname, KERNEL_SOCKPTR(optval), optlen); } +static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, + char *optval, int optlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; + int val; + + if (optlen != sizeof(int)) + return -EINVAL; + + val = *(int *)optval; + + /* Only some options are supported */ + switch (optname) { + case TCP_BPF_IW: + if (val <= 0 || tp->data_segs_out > tp->syn_data) + return -EINVAL; + tcp_snd_cwnd_set(tp, val); + break; + case TCP_BPF_SNDCWND_CLAMP: + if (val <= 0) + return -EINVAL; + tp->snd_cwnd_clamp = val; + tp->snd_ssthresh = val; + break; + case TCP_BPF_DELACK_MAX: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_DELACK_MAX || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_delack_max = timeout; + break; + case TCP_BPF_RTO_MIN: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_RTO_MIN || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_rto_min = timeout; + break; + default: + return -EINVAL; + } + + return 0; +} + static int __bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { @@ -5094,6 +5140,10 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, } } else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && sk->sk_prot->setsockopt == tcp_setsockopt) { + if (optname >= TCP_BPF_IW) + return bpf_sol_tcp_setsockopt(sk, optname, + optval, optlen); + if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; @@ -5104,7 +5154,6 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - unsigned long timeout; if (optlen != sizeof(int)) return -EINVAL; @@ -5112,34 +5161,6 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, val = *((int *)optval); /* Only some options are supported */ switch (optname) { - case TCP_BPF_IW: - if (val <= 0 || tp->data_segs_out > tp->syn_data) - ret = -EINVAL; - else - tcp_snd_cwnd_set(tp, val); - break; - case TCP_BPF_SNDCWND_CLAMP: - if (val <= 0) { - ret = -EINVAL; - } else { - tp->snd_cwnd_clamp = val; - tp->snd_ssthresh = val; - } - break; - case TCP_BPF_DELACK_MAX: - timeout = usecs_to_jiffies(val); - if (timeout > TCP_DELACK_MAX || - timeout < TCP_TIMEOUT_MIN) - return -EINVAL; - inet_csk(sk)->icsk_delack_max = timeout; - break; - case TCP_BPF_RTO_MIN: - timeout = usecs_to_jiffies(val); - if (timeout > TCP_RTO_MIN || - timeout < TCP_TIMEOUT_MIN) - return -EINVAL; - inet_csk(sk)->icsk_rto_min = timeout; - break; case TCP_SAVE_SYN: if (val < 0 || val > 1) ret = -EINVAL; -- cgit From 0c751f7071ef98d334ed06ca3f8f4cc1f7458cf5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:18:19 -0700 Subject: bpf: Change bpf_setsockopt(SOL_TCP) to reuse do_tcp_setsockopt() After the prep work in the previous patches, this patch removes all the dup code from bpf_setsockopt(SOL_TCP) and reuses the do_tcp_setsockopt(). The existing optname white-list is refactored into a new function sol_tcp_setsockopt(). The sol_tcp_setsockopt() also calls the bpf_sol_tcp_setsockopt() to handle the TCP_BPF_XXX specific optnames. bpf_setsockopt(TCP_SAVE_SYN) now also allows a value 2 to save the eth header also and it comes for free from do_tcp_setsockopt(). Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061819.4180146-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 97 +++++++++++++++++-------------------------------------- 1 file changed, 30 insertions(+), 67 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index bb135d456a53..66877605bb78 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5086,6 +5086,34 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, return 0; } +static int sol_tcp_setsockopt(struct sock *sk, int optname, + char *optval, int optlen) +{ + if (sk->sk_prot->setsockopt != tcp_setsockopt) + return -EINVAL; + + switch (optname) { + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + case TCP_USER_TIMEOUT: + case TCP_NOTSENT_LOWAT: + case TCP_SAVE_SYN: + if (optlen != sizeof(int)) + return -EINVAL; + break; + case TCP_CONGESTION: + break; + default: + return bpf_sol_tcp_setsockopt(sk, optname, optval, optlen); + } + + return do_tcp_setsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), optlen); +} + static int __bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { @@ -5138,73 +5166,8 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, default: ret = -EINVAL; } - } else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && - sk->sk_prot->setsockopt == tcp_setsockopt) { - if (optname >= TCP_BPF_IW) - return bpf_sol_tcp_setsockopt(sk, optname, - optval, optlen); - - if (optname == TCP_CONGESTION) { - char name[TCP_CA_NAME_MAX]; - - strncpy(name, optval, min_t(long, optlen, - TCP_CA_NAME_MAX-1)); - name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, true); - } else { - struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - - if (optlen != sizeof(int)) - return -EINVAL; - - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case TCP_SAVE_SYN: - if (val < 0 || val > 1) - ret = -EINVAL; - else - tp->save_syn = val; - break; - case TCP_KEEPIDLE: - ret = tcp_sock_set_keepidle_locked(sk, val); - break; - case TCP_KEEPINTVL: - if (val < 1 || val > MAX_TCP_KEEPINTVL) - ret = -EINVAL; - else - tp->keepalive_intvl = val * HZ; - break; - case TCP_KEEPCNT: - if (val < 1 || val > MAX_TCP_KEEPCNT) - ret = -EINVAL; - else - tp->keepalive_probes = val; - break; - case TCP_SYNCNT: - if (val < 1 || val > MAX_TCP_SYNCNT) - ret = -EINVAL; - else - icsk->icsk_syn_retries = val; - break; - case TCP_USER_TIMEOUT: - if (val < 0) - ret = -EINVAL; - else - icsk->icsk_user_timeout = val; - break; - case TCP_NOTSENT_LOWAT: - tp->notsent_lowat = val; - sk->sk_write_space(sk); - break; - case TCP_WINDOW_CLAMP: - ret = tcp_set_window_clamp(sk, val); - break; - default: - ret = -EINVAL; - } - } + } else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) { + return sol_tcp_setsockopt(sk, optname, optval, optlen); } else { ret = -EINVAL; } -- cgit From ee7f1e1302f5cb29168f70827c12855f1d8c9845 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:18:26 -0700 Subject: bpf: Change bpf_setsockopt(SOL_IP) to reuse do_ip_setsockopt() After the prep work in the previous patches, this patch removes the dup code from bpf_setsockopt(SOL_IP) and reuses the implementation in do_ip_setsockopt(). The existing optname white-list is refactored into a new function sol_ip_setsockopt(). NOTE, the current bpf_setsockopt(IP_TOS) is quite different from the the do_ip_setsockopt(IP_TOS). For example, it does not take the INET_ECN_MASK into the account for tcp and also does not adjust sk->sk_priority. It looks like the current bpf_setsockopt(IP_TOS) was referencing the IPV6_TCLASS implementation instead of IP_TOS. This patch tries to rectify that by using the do_ip_setsockopt(IP_TOS). While this is a behavior change, the do_ip_setsockopt(IP_TOS) behavior is arguably what the user is expecting. At least, the INET_ECN_MASK bits should be masked out for tcp. Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061826.4180990-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 66877605bb78..4d1b42b8f4a8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5114,6 +5114,25 @@ static int sol_tcp_setsockopt(struct sock *sk, int optname, KERNEL_SOCKPTR(optval), optlen); } +static int sol_ip_setsockopt(struct sock *sk, int optname, + char *optval, int optlen) +{ + if (sk->sk_family != AF_INET) + return -EINVAL; + + switch (optname) { + case IP_TOS: + if (optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return do_ip_setsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), optlen); +} + static int __bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { @@ -5125,26 +5144,7 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, if (level == SOL_SOCKET) { return sol_socket_setsockopt(sk, optname, optval, optlen); } else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) { - if (optlen != sizeof(int) || sk->sk_family != AF_INET) - return -EINVAL; - - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case IP_TOS: - if (val < -1 || val > 0xff) { - ret = -EINVAL; - } else { - struct inet_sock *inet = inet_sk(sk); - - if (val == -1) - val = 0; - inet->tos = val; - } - break; - default: - ret = -EINVAL; - } + return sol_ip_setsockopt(sk, optname, optval, optlen); } else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) { if (optlen != sizeof(int) || sk->sk_family != AF_INET6) return -EINVAL; -- cgit From 75b64b68ee3f9fe90ad8f21e5c5c92de58abf725 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:18:34 -0700 Subject: bpf: Change bpf_setsockopt(SOL_IPV6) to reuse do_ipv6_setsockopt() After the prep work in the previous patches, this patch removes the dup code from bpf_setsockopt(SOL_IPV6) and reuses the implementation in do_ipv6_setsockopt(). ipv6 could be compiled as a module. Like how other code solved it with stubs in ipv6_stubs.h, this patch adds the do_ipv6_setsockopt to the ipv6_bpf_stub. The current bpf_setsockopt(IPV6_TCLASS) does not take the INET_ECN_MASK into the account for tcp. The do_ipv6_setsockopt(IPV6_TCLASS) will handle it correctly. The existing optname white-list is refactored into a new function sol_ipv6_setsockopt(). After this last SOL_IPV6 dup code removal, the __bpf_setsockopt() is simplified enough that the extra "{ }" around the if statement can be removed. Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061834.4181198-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 56 ++++++++++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 4d1b42b8f4a8..23282b8cf61e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5133,45 +5133,41 @@ static int sol_ip_setsockopt(struct sock *sk, int optname, KERNEL_SOCKPTR(optval), optlen); } +static int sol_ipv6_setsockopt(struct sock *sk, int optname, + char *optval, int optlen) +{ + if (sk->sk_family != AF_INET6) + return -EINVAL; + + switch (optname) { + case IPV6_TCLASS: + if (optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), optlen); +} + static int __bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { - int val, ret = 0; - if (!sk_fullsock(sk)) return -EINVAL; - if (level == SOL_SOCKET) { + if (level == SOL_SOCKET) return sol_socket_setsockopt(sk, optname, optval, optlen); - } else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) { + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) return sol_ip_setsockopt(sk, optname, optval, optlen); - } else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) { - if (optlen != sizeof(int) || sk->sk_family != AF_INET6) - return -EINVAL; - - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case IPV6_TCLASS: - if (val < -1 || val > 0xff) { - ret = -EINVAL; - } else { - struct ipv6_pinfo *np = inet6_sk(sk); - - if (val == -1) - val = 0; - np->tclass = val; - } - break; - default: - ret = -EINVAL; - } - } else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) { + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + return sol_ipv6_setsockopt(sk, optname, optval, optlen); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) return sol_tcp_setsockopt(sk, optname, optval, optlen); - } else { - ret = -EINVAL; - } - return ret; + + return -EINVAL; } static int _bpf_setsockopt(struct sock *sk, int level, int optname, -- cgit From 7e41df5dbba23a78c3fd30033a6123a06e1168e7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:18:41 -0700 Subject: bpf: Add a few optnames to bpf_setsockopt This patch adds a few optnames for bpf_setsockopt: SO_REUSEADDR, IPV6_AUTOFLOWLABEL, TCP_MAXSEG, TCP_NODELAY, and TCP_THIN_LINEAR_TIMEOUTS. Thanks to the previous patches of this set, all additions can reuse the sk_setsockopt(), do_ipv6_setsockopt(), and do_tcp_setsockopt(). The only change here is to allow them in bpf_setsockopt. The bpf prog has been able to read all members of a sk by using PTR_TO_BTF_ID of a sk. The optname additions here can also be read by the same approach. Meaning there is a way to read the values back. These optnames can also be added to bpf_getsockopt() later with another patch set that makes the bpf_getsockopt() to reuse the sock_getsockopt(), tcp_getsockopt(), and ip[v6]_getsockopt(). Thus, this patch does not add more duplicated code to bpf_getsockopt() now. Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061841.4181642-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 23282b8cf61e..1acfaffeaf32 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5017,6 +5017,7 @@ static int sol_socket_setsockopt(struct sock *sk, int optname, char *optval, int optlen) { switch (optname) { + case SO_REUSEADDR: case SO_SNDBUF: case SO_RCVBUF: case SO_KEEPALIVE: @@ -5093,11 +5094,14 @@ static int sol_tcp_setsockopt(struct sock *sk, int optname, return -EINVAL; switch (optname) { + case TCP_NODELAY: + case TCP_MAXSEG: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPCNT: case TCP_SYNCNT: case TCP_WINDOW_CLAMP: + case TCP_THIN_LINEAR_TIMEOUTS: case TCP_USER_TIMEOUT: case TCP_NOTSENT_LOWAT: case TCP_SAVE_SYN: @@ -5141,6 +5145,7 @@ static int sol_ipv6_setsockopt(struct sock *sk, int optname, switch (optname) { case IPV6_TCLASS: + case IPV6_AUTOFLOWLABEL: if (optlen != sizeof(int)) return -EINVAL; break; -- cgit From 0ba985024ae7db226776725d9aa436b5c1c9fca2 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sun, 21 Aug 2022 14:35:16 +0300 Subject: flow_dissector: Make 'bpf_flow_dissect' return the bpf program retcode Let 'bpf_flow_dissect' callers know the BPF program's retcode and act accordingly. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220821113519.116765-2-shmulik.ladkani@gmail.com --- net/core/flow_dissector.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 764c4cb3fe8f..a01817fb4ef4 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -866,8 +866,8 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } } -bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen, unsigned int flags) +u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; @@ -892,7 +892,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, hlen); - return result == BPF_OK; + return result; } static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) @@ -1008,6 +1008,7 @@ bool __skb_flow_dissect(const struct net *net, }; __be16 n_proto = proto; struct bpf_prog *prog; + u32 result; if (skb) { ctx.skb = skb; @@ -1019,12 +1020,12 @@ bool __skb_flow_dissect(const struct net *net, } prog = READ_ONCE(run_array->items[0].prog); - ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, - hlen, flags); + result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, + hlen, flags); __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); - return ret; + return result == BPF_OK; } rcu_read_unlock(); } -- cgit From 91350fe152930c0d61a362af68272526490efea5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sun, 21 Aug 2022 14:35:17 +0300 Subject: bpf, flow_dissector: Introduce BPF_FLOW_DISSECTOR_CONTINUE retcode for bpf progs Currently, attaching BPF_PROG_TYPE_FLOW_DISSECTOR programs completely replaces the flow-dissector logic with custom dissection logic. This forces implementors to write programs that handle dissection for any flows expected in the namespace. It makes sense for flow-dissector BPF programs to just augment the dissector with custom logic (e.g. dissecting certain flows or custom protocols), while enjoying the broad capabilities of the standard dissector for any other traffic. Introduce BPF_FLOW_DISSECTOR_CONTINUE retcode. Flow-dissector BPF programs may return this to indicate no dissection was made, and fallback to the standard dissector is requested. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220821113519.116765-3-shmulik.ladkani@gmail.com --- net/core/flow_dissector.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index a01817fb4ef4..990429c69ccd 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1022,11 +1022,14 @@ bool __skb_flow_dissect(const struct net *net, prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); + if (result == BPF_FLOW_DISSECTOR_CONTINUE) + goto dissect_continue; __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); return result == BPF_OK; } +dissect_continue: rcu_read_unlock(); } -- cgit From bed89185af0de0d417e29ca1798df50f161b0231 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:52 -0700 Subject: bpf: Use cgroup_{common,current}_func_proto in more hooks The following hooks are per-cgroup hooks but they are not using cgroup_{common,current}_func_proto, fix it: * BPF_PROG_TYPE_CGROUP_SKB (cg_skb) * BPF_PROG_TYPE_CGROUP_SOCK_ADDR (cg_sock_addr) * BPF_PROG_TYPE_CGROUP_SOCK (cg_sock) * BPF_PROG_TYPE_LSM+BPF_LSM_CGROUP Also: * move common func_proto's into cgroup func_proto handlers * make sure bpf_{g,s}et_retval are not accessible from recvmsg, getpeername and getsockname (return/errno is ignored in these places) * as a side effect, expose get_current_pid_tgid, get_current_comm_proto, get_current_ancestor_cgroup_id, get_cgroup_classid to more cgroup hooks Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220823222555.523590-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 80 +++++++++++++++++++++++-------------------------------- 1 file changed, 33 insertions(+), 47 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 1acfaffeaf32..63e25d8ce501 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3009,7 +3009,7 @@ BPF_CALL_0(bpf_get_cgroup_classid_curr) return __task_get_classid(current); } -static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { +const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .func = bpf_get_cgroup_classid_curr, .gpl_only = false, .ret_type = RET_INTEGER, @@ -7581,34 +7581,23 @@ const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: @@ -7621,12 +7610,17 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) static const struct bpf_func_proto * sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; case BPF_FUNC_bind: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_CONNECT: @@ -7639,24 +7633,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_addr_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; @@ -7737,9 +7715,13 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: @@ -7979,6 +7961,12 @@ const struct bpf_func_proto bpf_sock_hash_update_proto __weak; static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sock_ops_setsockopt_proto; @@ -7992,8 +7980,6 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: -- cgit From 44c51472bef83bb70b43e2f4b7a592096f32a855 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Wed, 31 Aug 2022 17:40:09 +0300 Subject: bpf: Support getting tunnel flags Existing 'bpf_skb_get_tunnel_key' extracts various tunnel parameters (id, ttl, tos, local and remote) but does not expose ip_tunnel_info's tun_flags to the BPF program. It makes sense to expose tun_flags to the BPF program. Assume for example multiple GRE tunnels maintained on a single GRE interface in collect_md mode. The program expects origins to initiate over GRE, however different origins use different GRE characteristics (e.g. some prefer to use GRE checksum, some do not; some pass a GRE key, some do not, etc..). A BPF program getting tun_flags can therefore remember the relevant flags (e.g. TUNNEL_CSUM, TUNNEL_SEQ...) for each initiating remote. In the reply path, the program can use 'bpf_skb_set_tunnel_key' in order to correctly reply to the remote, using similar characteristics, based on the stored tunnel flags. Introduce BPF_F_TUNINFO_FLAGS flag for bpf_skb_get_tunnel_key. If specified, 'bpf_tunnel_key->tunnel_flags' is set with the tun_flags. Decided to use the existing unused 'tunnel_ext' as the storage for the 'tunnel_flags' in order to avoid changing bpf_tunnel_key's layout. Also, the following has been considered during the design: 1. Convert the "interesting" internal TUNNEL_xxx flags back to BPF_F_yyy and place into the new 'tunnel_flags' field. This has 2 drawbacks: - The BPF_F_yyy flags are from *set_tunnel_key* enumeration space, e.g. BPF_F_ZERO_CSUM_TX. It is awkward that it is "returned" into tunnel_flags from a *get_tunnel_key* call. - Not all "interesting" TUNNEL_xxx flags can be mapped to existing BPF_F_yyy flags, and it doesn't make sense to create new BPF_F_yyy flags just for purposes of the returned tunnel_flags. 2. Place key.tun_flags into 'tunnel_flags' but mask them, keeping only "interesting" flags. That's ok, but the drawback is that what's "interesting" for my usecase might be limiting for other usecases. Therefore I decided to expose what's in key.tun_flags *as is*, which seems most flexible. The BPF user can just choose to ignore bits he's not interested in. The TUNNEL_xxx are also UAPI, so no harm exposing them back in the get_tunnel_key call. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220831144010.174110-1-shmulik.ladkani@gmail.com --- net/core/filter.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 63e25d8ce501..74e2a4a0d747 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4488,7 +4488,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key void *to_orig = to; int err; - if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | + BPF_F_TUNINFO_FLAGS)))) { err = -EINVAL; goto err_clear; } @@ -4520,7 +4521,10 @@ set_compat: to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; - to->tunnel_ext = 0; + if (flags & BPF_F_TUNINFO_FLAGS) + to->tunnel_flags = info->key.tun_flags; + else + to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, -- cgit From ba74a7608dc12fbbd8ea36e660087f08a81ef26a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:27:56 -0700 Subject: net: Change sock_getsockopt() to take the sk ptr instead of the sock ptr A latter patch refactors bpf_getsockopt(SOL_SOCKET) with the sock_getsockopt() to avoid code duplication and code drift between the two duplicates. The current sock_getsockopt() takes sock ptr as the argument. The very first thing of this function is to get back the sk ptr by 'sk = sock->sk'. bpf_getsockopt() could be called when the sk does not have the sock ptr created. Meaning sk->sk_socket is NULL. For example, when a passive tcp connection has just been established but has yet been accept()-ed. Thus, it cannot use the sock_getsockopt(sk->sk_socket) or else it will pass a NULL ptr. This patch moves all sock_getsockopt implementation to the newly added sk_getsockopt(). The new sk_getsockopt() takes a sk ptr and immediately gets the sock ptr by 'sock = sk->sk_socket' The existing sock_getsockopt(sock) is changed to call sk_getsockopt(sock->sk). All existing callers have both sock->sk and sk->sk_socket pointer. The latter patch will make bpf_getsockopt(SOL_SOCKET) call sk_getsockopt(sk) directly. The bpf_getsockopt(SOL_SOCKET) does not use the optnames that require sk->sk_socket, so it will be safe. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002756.2887884-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/sock.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 2a6f84702eb9..21bc4bf6b485 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1580,10 +1580,10 @@ static int groups_to_user(gid_t __user *dst, const struct group_info *src) return 0; } -int sock_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) +static int sk_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) { - struct sock *sk = sock->sk; + struct socket *sock = sk->sk_socket; union { int val; @@ -1947,6 +1947,12 @@ lenout: return 0; } +int sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + return sk_getsockopt(sock->sk, level, optname, optval, optlen); +} + /* * Initialize an sk_lock. * -- cgit From 4ff09db1b79b98b4a2a7511571c640b76cab3beb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:02 -0700 Subject: bpf: net: Change sk_getsockopt() to take the sockptr_t argument This patch changes sk_getsockopt() to take the sockptr_t argument such that it can be used by bpf_getsockopt(SOL_SOCKET) in a latter patch. security_socket_getpeersec_stream() is not changed. It stays with the __user ptr (optval.user and optlen.user) to avoid changes to other security hooks. bpf_getsockopt(SOL_SOCKET) also does not support SO_PEERSEC. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002802.2888419-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 ++--- net/core/sock.c | 43 ++++++++++++++++++++++++------------------- 2 files changed, 26 insertions(+), 22 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 74e2a4a0d747..962014f7f64b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10716,8 +10716,7 @@ int sk_detach_filter(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_detach_filter); -int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, - unsigned int len) +int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) { struct sock_fprog_kern *fprog; struct sk_filter *filter; @@ -10748,7 +10747,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, goto out; ret = -EFAULT; - if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog))) + if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number diff --git a/net/core/sock.c b/net/core/sock.c index 21bc4bf6b485..7fa30fd4b37f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -712,8 +712,8 @@ out: return ret; } -static int sock_getbindtodevice(struct sock *sk, char __user *optval, - int __user *optlen, int len) +static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -737,12 +737,12 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, len = strlen(devname) + 1; ret = -EFAULT; - if (copy_to_user(optval, devname, len)) + if (copy_to_sockptr(optval, devname, len)) goto out; zero: ret = -EFAULT; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) goto out; ret = 0; @@ -1568,20 +1568,23 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred, } } -static int groups_to_user(gid_t __user *dst, const struct group_info *src) +static int groups_to_user(sockptr_t dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; - for (i = 0; i < src->ngroups; i++) - if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) + for (i = 0; i < src->ngroups; i++) { + gid_t gid = from_kgid_munged(user_ns, src->gid[i]); + + if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) return -EFAULT; + } return 0; } static int sk_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) + sockptr_t optval, sockptr_t optlen) { struct socket *sock = sk->sk_socket; @@ -1600,7 +1603,7 @@ static int sk_getsockopt(struct sock *sk, int level, int optname, int lv = sizeof(int); int len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; @@ -1735,7 +1738,7 @@ static int sk_getsockopt(struct sock *sk, int level, int optname, cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); spin_unlock(&sk->sk_peer_lock); - if (copy_to_user(optval, &peercred, len)) + if (copy_to_sockptr(optval, &peercred, len)) return -EFAULT; goto lenout; } @@ -1753,11 +1756,11 @@ static int sk_getsockopt(struct sock *sk, int level, int optname, if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); put_cred(cred); - return put_user(len, optlen) ? -EFAULT : -ERANGE; + return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); - ret = groups_to_user((gid_t __user *)optval, cred->group_info); + ret = groups_to_user(optval, cred->group_info); put_cred(cred); if (ret) return ret; @@ -1773,7 +1776,7 @@ static int sk_getsockopt(struct sock *sk, int level, int optname, return -ENOTCONN; if (lv < len) return -EINVAL; - if (copy_to_user(optval, address, len)) + if (copy_to_sockptr(optval, address, len)) return -EFAULT; goto lenout; } @@ -1790,7 +1793,7 @@ static int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PEERSEC: - return security_socket_getpeersec_stream(sock, optval, optlen, len); + return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len); case SO_MARK: v.val = sk->sk_mark; @@ -1822,7 +1825,7 @@ static int sk_getsockopt(struct sock *sk, int level, int optname, return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: - len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); + len = sk_get_filter(sk, optval, len); if (len < 0) return len; @@ -1870,7 +1873,7 @@ static int sk_getsockopt(struct sock *sk, int level, int optname, sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); - if (copy_to_user(optval, &meminfo, len)) + if (copy_to_sockptr(optval, &meminfo, len)) return -EFAULT; goto lenout; @@ -1939,10 +1942,10 @@ static int sk_getsockopt(struct sock *sk, int level, int optname, if (len > lv) len = lv; - if (copy_to_user(optval, &v, len)) + if (copy_to_sockptr(optval, &v, len)) return -EFAULT; lenout: - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } @@ -1950,7 +1953,9 @@ lenout: int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { - return sk_getsockopt(sock->sk, level, optname, optval, optlen); + return sk_getsockopt(sock->sk, level, optname, + USER_SOCKPTR(optval), + USER_SOCKPTR(optlen)); } /* -- cgit From 2c5b6bf5cda048af896bb0e12a956783f7d6c835 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:09 -0700 Subject: bpf: net: Avoid sk_getsockopt() taking sk lock when called from bpf Similar to the earlier commit that changed sk_setsockopt() to use sockopt_{lock,release}_sock() such that it can avoid taking lock when called from bpf. This patch also changes sk_getsockopt() to use sockopt_{lock,release}_sock() such that a latter patch can make bpf_getsockopt(SOL_SOCKET) to reuse sk_getsockopt(). Only sk_get_filter() requires this change and it is used by the optname SO_GET_FILTER. The '.getname' implementations in sock->ops->getname() is not changed also since bpf does not always have the sk->sk_socket pointer and cannot support SO_PEERNAME. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002809.2888981-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 962014f7f64b..f57f78feb380 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10722,7 +10722,7 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) struct sk_filter *filter; int ret = 0; - lock_sock(sk); + sockopt_lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (!filter) @@ -10755,7 +10755,7 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) */ ret = fprog->len; out: - release_sock(sk); + sockopt_release_sock(sk); return ret; } -- cgit From c2b063ca34586758dcfd76e12696a71a1df849a0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:29:06 -0700 Subject: bpf: Embed kernel CONFIG check into the if statement in bpf_getsockopt This patch moves the "#ifdef CONFIG_XXX" check into the "if/else" statement itself. The change is done for the bpf_getsockopt() function only. It will make the latter patches easier to follow without the surrounding ifdef macro. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002906.2893572-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index f57f78feb380..55795815cc44 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5222,8 +5222,8 @@ static int __bpf_getsockopt(struct sock *sk, int level, int optname, default: goto err_clear; } -#ifdef CONFIG_INET - } else if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { + } else if (IS_ENABLED(CONFIG_INET) && + level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { struct inet_connection_sock *icsk; struct tcp_sock *tp; @@ -5247,7 +5247,7 @@ static int __bpf_getsockopt(struct sock *sk, int level, int optname, default: goto err_clear; } - } else if (level == SOL_IP) { + } else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) { struct inet_sock *inet = inet_sk(sk); if (optlen != sizeof(int) || sk->sk_family != AF_INET) @@ -5261,8 +5261,7 @@ static int __bpf_getsockopt(struct sock *sk, int level, int optname, default: goto err_clear; } -#if IS_ENABLED(CONFIG_IPV6) - } else if (level == SOL_IPV6) { + } else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) { struct ipv6_pinfo *np = inet6_sk(sk); if (optlen != sizeof(int) || sk->sk_family != AF_INET6) @@ -5276,8 +5275,6 @@ static int __bpf_getsockopt(struct sock *sk, int level, int optname, default: goto err_clear; } -#endif -#endif } else { goto err_clear; } -- cgit From 65ddc82d3b96be5555a36de4e2b4547433a00532 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:29:12 -0700 Subject: bpf: Change bpf_getsockopt(SOL_SOCKET) to reuse sk_getsockopt() This patch changes bpf_getsockopt(SOL_SOCKET) to reuse sk_getsockopt(). It removes all duplicated code from bpf_getsockopt(SOL_SOCKET). Before this patch, there were some optnames available to bpf_setsockopt(SOL_SOCKET) but missing in bpf_getsockopt(SOL_SOCKET). It surprises users from time to time. For example, SO_REUSEADDR, SO_KEEPALIVE, SO_RCVLOWAT, and SO_MAX_PACING_RATE. This patch automatically closes this gap without duplicating more code. The only exception is SO_BINDTODEVICE because it needs to acquire a blocking lock. Thus, SO_BINDTODEVICE is not supported. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002912.2894040-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 57 ++++++++++++++++++++++--------------------------------- net/core/sock.c | 4 ++-- 2 files changed, 25 insertions(+), 36 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 55795815cc44..9b26653a7e1f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5017,8 +5017,9 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static int sol_socket_setsockopt(struct sock *sk, int optname, - char *optval, int optlen) +static int sol_socket_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) { switch (optname) { case SO_REUSEADDR: @@ -5032,7 +5033,7 @@ static int sol_socket_setsockopt(struct sock *sk, int optname, case SO_MAX_PACING_RATE: case SO_BINDTOIFINDEX: case SO_TXREHASH: - if (optlen != sizeof(int)) + if (*optlen != sizeof(int)) return -EINVAL; break; case SO_BINDTODEVICE: @@ -5041,8 +5042,16 @@ static int sol_socket_setsockopt(struct sock *sk, int optname, return -EINVAL; } + if (getopt) { + if (optname == SO_BINDTODEVICE) + return -EINVAL; + return sk_getsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } + return sk_setsockopt(sk, SOL_SOCKET, optname, - KERNEL_SOCKPTR(optval), optlen); + KERNEL_SOCKPTR(optval), *optlen); } static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, @@ -5168,7 +5177,7 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; if (level == SOL_SOCKET) - return sol_socket_setsockopt(sk, optname, optval, optlen); + return sol_socket_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) return sol_ip_setsockopt(sk, optname, optval, optlen); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) @@ -5190,38 +5199,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, static int __bpf_getsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { + int err = 0, saved_optlen = optlen; + if (!sk_fullsock(sk)) goto err_clear; if (level == SOL_SOCKET) { - if (optlen != sizeof(int)) - goto err_clear; - - switch (optname) { - case SO_RCVBUF: - *((int *)optval) = sk->sk_rcvbuf; - break; - case SO_SNDBUF: - *((int *)optval) = sk->sk_sndbuf; - break; - case SO_MARK: - *((int *)optval) = sk->sk_mark; - break; - case SO_PRIORITY: - *((int *)optval) = sk->sk_priority; - break; - case SO_BINDTOIFINDEX: - *((int *)optval) = sk->sk_bound_dev_if; - break; - case SO_REUSEPORT: - *((int *)optval) = sk->sk_reuseport; - break; - case SO_TXREHASH: - *((int *)optval) = sk->sk_txrehash; - break; - default: - goto err_clear; - } + err = sol_socket_sockopt(sk, optname, optval, &optlen, true); } else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { struct inet_connection_sock *icsk; @@ -5278,7 +5262,12 @@ static int __bpf_getsockopt(struct sock *sk, int level, int optname, } else { goto err_clear; } - return 0; + + if (err) + optlen = 0; + if (optlen < saved_optlen) + memset(optval + optlen, 0, saved_optlen - optlen); + return err; err_clear: memset(optval, 0, optlen); return -EINVAL; diff --git a/net/core/sock.c b/net/core/sock.c index 7fa30fd4b37f..68e4662eb2eb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1583,8 +1583,8 @@ static int groups_to_user(sockptr_t dst, const struct group_info *src) return 0; } -static int sk_getsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, sockptr_t optlen) +int sk_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen) { struct socket *sock = sk->sk_socket; -- cgit From 273b7f0fb44847c41814a59901977be284daa447 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:29:18 -0700 Subject: bpf: Change bpf_getsockopt(SOL_TCP) to reuse do_tcp_getsockopt() This patch changes bpf_getsockopt(SOL_TCP) to reuse do_tcp_getsockopt(). It removes the duplicated code from bpf_getsockopt(SOL_TCP). Before this patch, there were some optnames available to bpf_setsockopt(SOL_TCP) but missing in bpf_getsockopt(SOL_TCP). For example, TCP_NODELAY, TCP_MAXSEG, TCP_KEEPIDLE, TCP_KEEPINTVL, and a few more. It surprises users from time to time. This patch automatically closes this gap without duplicating more code. bpf_getsockopt(TCP_SAVED_SYN) does not free the saved_syn, so it stays in sol_tcp_sockopt(). For string name value like TCP_CONGESTION, bpf expects it is always null terminated, so sol_tcp_sockopt() decrements optlen by one before calling do_tcp_getsockopt() and the 'if (optlen < saved_optlen) memset(..,0,..);' in __bpf_getsockopt() will always do a null termination. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002918.2894511-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 74 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 31 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 9b26653a7e1f..beadd5b83e6c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5100,8 +5100,9 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, return 0; } -static int sol_tcp_setsockopt(struct sock *sk, int optname, - char *optval, int optlen) +static int sol_tcp_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) { if (sk->sk_prot->setsockopt != tcp_setsockopt) return -EINVAL; @@ -5118,17 +5119,51 @@ static int sol_tcp_setsockopt(struct sock *sk, int optname, case TCP_USER_TIMEOUT: case TCP_NOTSENT_LOWAT: case TCP_SAVE_SYN: - if (optlen != sizeof(int)) + if (*optlen != sizeof(int)) return -EINVAL; break; case TCP_CONGESTION: + if (*optlen < 2) + return -EINVAL; + break; + case TCP_SAVED_SYN: + if (*optlen < 1) + return -EINVAL; break; default: - return bpf_sol_tcp_setsockopt(sk, optname, optval, optlen); + if (getopt) + return -EINVAL; + return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); + } + + if (getopt) { + if (optname == TCP_SAVED_SYN) { + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->saved_syn || + *optlen > tcp_saved_syn_len(tp->saved_syn)) + return -EINVAL; + memcpy(optval, tp->saved_syn->data, *optlen); + /* It cannot free tp->saved_syn here because it + * does not know if the user space still needs it. + */ + return 0; + } + + if (optname == TCP_CONGESTION) { + if (!inet_csk(sk)->icsk_ca_ops) + return -EINVAL; + /* BPF expects NULL-terminated tcp-cc string */ + optval[--(*optlen)] = '\0'; + } + + return do_tcp_getsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); } return do_tcp_setsockopt(sk, SOL_TCP, optname, - KERNEL_SOCKPTR(optval), optlen); + KERNEL_SOCKPTR(optval), *optlen); } static int sol_ip_setsockopt(struct sock *sk, int optname, @@ -5183,7 +5218,7 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) return sol_ipv6_setsockopt(sk, optname, optval, optlen); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) - return sol_tcp_setsockopt(sk, optname, optval, optlen); + return sol_tcp_sockopt(sk, optname, optval, &optlen, false); return -EINVAL; } @@ -5206,31 +5241,8 @@ static int __bpf_getsockopt(struct sock *sk, int level, int optname, if (level == SOL_SOCKET) { err = sol_socket_sockopt(sk, optname, optval, &optlen, true); - } else if (IS_ENABLED(CONFIG_INET) && - level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { - struct inet_connection_sock *icsk; - struct tcp_sock *tp; - - switch (optname) { - case TCP_CONGESTION: - icsk = inet_csk(sk); - - if (!icsk->icsk_ca_ops || optlen <= 1) - goto err_clear; - strncpy(optval, icsk->icsk_ca_ops->name, optlen); - optval[optlen - 1] = 0; - break; - case TCP_SAVED_SYN: - tp = tcp_sk(sk); - - if (optlen <= 0 || !tp->saved_syn || - optlen > tcp_saved_syn_len(tp->saved_syn)) - goto err_clear; - memcpy(optval, tp->saved_syn->data, optlen); - break; - default: - goto err_clear; - } + } else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) { + err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); } else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) { struct inet_sock *inet = inet_sk(sk); -- cgit From fd969f25fe24be515278d28cbf86dde39be8a495 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:29:25 -0700 Subject: bpf: Change bpf_getsockopt(SOL_IP) to reuse do_ip_getsockopt() This patch changes bpf_getsockopt(SOL_IP) to reuse do_ip_getsockopt() and remove the duplicated code. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002925.2895416-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index beadd5b83e6c..33275460fe66 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5166,23 +5166,29 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, KERNEL_SOCKPTR(optval), *optlen); } -static int sol_ip_setsockopt(struct sock *sk, int optname, - char *optval, int optlen) +static int sol_ip_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) { if (sk->sk_family != AF_INET) return -EINVAL; switch (optname) { case IP_TOS: - if (optlen != sizeof(int)) + if (*optlen != sizeof(int)) return -EINVAL; break; default: return -EINVAL; } + if (getopt) + return do_ip_getsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + return do_ip_setsockopt(sk, SOL_IP, optname, - KERNEL_SOCKPTR(optval), optlen); + KERNEL_SOCKPTR(optval), *optlen); } static int sol_ipv6_setsockopt(struct sock *sk, int optname, @@ -5214,7 +5220,7 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, if (level == SOL_SOCKET) return sol_socket_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) - return sol_ip_setsockopt(sk, optname, optval, optlen); + return sol_ip_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) return sol_ipv6_setsockopt(sk, optname, optval, optlen); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) @@ -5244,19 +5250,7 @@ static int __bpf_getsockopt(struct sock *sk, int level, int optname, } else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) { err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); } else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) { - struct inet_sock *inet = inet_sk(sk); - - if (optlen != sizeof(int) || sk->sk_family != AF_INET) - goto err_clear; - - /* Only some options are supported */ - switch (optname) { - case IP_TOS: - *((int *)optval) = (int)inet->tos; - break; - default: - goto err_clear; - } + err = sol_ip_sockopt(sk, optname, optval, &optlen, true); } else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) { struct ipv6_pinfo *np = inet6_sk(sk); -- cgit From 38566ec06f52250c4abaa7447aae676e0b881c46 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:29:31 -0700 Subject: bpf: Change bpf_getsockopt(SOL_IPV6) to reuse do_ipv6_getsockopt() This patch changes bpf_getsockopt(SOL_IPV6) to reuse do_ipv6_getsockopt(). It removes the duplicated code from bpf_getsockopt(SOL_IPV6). This also makes bpf_getsockopt(SOL_IPV6) supporting the same set of optnames as in bpf_setsockopt(SOL_IPV6). In particular, this adds IPV6_AUTOFLOWLABEL support to bpf_getsockopt(SOL_IPV6). ipv6 could be compiled as a module. Like how other code solved it with stubs in ipv6_stubs.h, this patch adds the do_ipv6_getsockopt to the ipv6_bpf_stub. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002931.2896218-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 55 ++++++++++++++++++++++++------------------------------- 1 file changed, 24 insertions(+), 31 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 33275460fe66..ee768bb5b5ab 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5191,8 +5191,9 @@ static int sol_ip_sockopt(struct sock *sk, int optname, KERNEL_SOCKPTR(optval), *optlen); } -static int sol_ipv6_setsockopt(struct sock *sk, int optname, - char *optval, int optlen) +static int sol_ipv6_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) { if (sk->sk_family != AF_INET6) return -EINVAL; @@ -5200,15 +5201,20 @@ static int sol_ipv6_setsockopt(struct sock *sk, int optname, switch (optname) { case IPV6_TCLASS: case IPV6_AUTOFLOWLABEL: - if (optlen != sizeof(int)) + if (*optlen != sizeof(int)) return -EINVAL; break; default: return -EINVAL; } + if (getopt) + return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, - KERNEL_SOCKPTR(optval), optlen); + KERNEL_SOCKPTR(optval), *optlen); } static int __bpf_setsockopt(struct sock *sk, int level, int optname, @@ -5222,7 +5228,7 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) return sol_ip_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) - return sol_ipv6_setsockopt(sk, optname, optval, optlen); + return sol_ipv6_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) return sol_tcp_sockopt(sk, optname, optval, &optlen, false); @@ -5240,43 +5246,30 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, static int __bpf_getsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { - int err = 0, saved_optlen = optlen; + int err, saved_optlen = optlen; - if (!sk_fullsock(sk)) - goto err_clear; + if (!sk_fullsock(sk)) { + err = -EINVAL; + goto done; + } - if (level == SOL_SOCKET) { + if (level == SOL_SOCKET) err = sol_socket_sockopt(sk, optname, optval, &optlen, true); - } else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) { + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); - } else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) { + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) err = sol_ip_sockopt(sk, optname, optval, &optlen, true); - } else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) { - struct ipv6_pinfo *np = inet6_sk(sk); - - if (optlen != sizeof(int) || sk->sk_family != AF_INET6) - goto err_clear; - - /* Only some options are supported */ - switch (optname) { - case IPV6_TCLASS: - *((int *)optval) = (int)np->tclass; - break; - default: - goto err_clear; - } - } else { - goto err_clear; - } + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); + else + err = -EINVAL; +done: if (err) optlen = 0; if (optlen < saved_optlen) memset(optval + optlen, 0, saved_optlen - optlen); return err; -err_clear: - memset(optval, 0, optlen); - return -EINVAL; } static int _bpf_getsockopt(struct sock *sk, int level, int optname, -- cgit