summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2023-05-24 21:57:57 -0700
committerJakub Kicinski <kuba@kernel.org>2023-05-24 21:57:57 -0700
commit0c615f1cc3b333775b9c0b56e369f8dbca1e0226 (patch)
tree8799101c67d85957c7354eb95f6cd020706b33dc /net
parent878ecb0897f4737a4c9401f3523fd49589025671 (diff)
parentf726e03564ef4e754dd93beb54303e2e1671049e (diff)
Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf
Daniel Borkmann says: ==================== pull-request: bpf 2023-05-24 We've added 19 non-merge commits during the last 10 day(s) which contain a total of 20 files changed, 738 insertions(+), 448 deletions(-). The main changes are: 1) Batch of BPF sockmap fixes found when running against NGINX TCP tests, from John Fastabend. 2) Fix a memleak in the LRU{,_PERCPU} hash map when bucket locking fails, from Anton Protopopov. 3) Init the BPF offload table earlier than just late_initcall, from Jakub Kicinski. 4) Fix ctx access mask generation for 32-bit narrow loads of 64-bit fields, from Will Deacon. 5) Remove a now unsupported __fallthrough in BPF samples, from Andrii Nakryiko. 6) Fix a typo in pkg-config call for building sign-file, from Jeremy Sowden. * tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf: bpf, sockmap: Test progs verifier error with latest clang bpf, sockmap: Test FIONREAD returns correct bytes in rx buffer with drops bpf, sockmap: Test FIONREAD returns correct bytes in rx buffer bpf, sockmap: Test shutdown() correctly exits epoll and recv()=0 bpf, sockmap: Build helper to create connected socket pair bpf, sockmap: Pull socket helpers out of listen test for general use bpf, sockmap: Incorrectly handling copied_seq bpf, sockmap: Wake up polling after data copy bpf, sockmap: TCP data stall on recv before accept bpf, sockmap: Handle fin correctly bpf, sockmap: Improved check for empty queue bpf, sockmap: Reschedule is now done through backlog bpf, sockmap: Convert schedule_work into delayed_work bpf, sockmap: Pass skb ownership through read_skb bpf: fix a memory leak in the LRU and LRU_PERCPU hash maps bpf: Fix mask generation for 32-bit narrow loads of 64-bit fields samples/bpf: Drop unnecessary fallthrough bpf: netdev: init the offload table earlier selftests/bpf: Fix pkg-config call building sign-file ==================== Link: https://lore.kernel.org/r/20230524170839.13905-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net')
-rw-r--r--net/core/skmsg.c81
-rw-r--r--net/core/sock_map.c3
-rw-r--r--net/ipv4/tcp.c11
-rw-r--r--net/ipv4/tcp_bpf.c79
-rw-r--r--net/ipv4/udp.c7
-rw-r--r--net/unix/af_unix.c7
-rw-r--r--net/vmw_vsock/virtio_transport_common.c5
7 files changed, 124 insertions, 69 deletions
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index f81883759d38..a9060e1f0e43 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -481,8 +481,6 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
msg_rx = sk_psock_peek_msg(psock);
}
out:
- if (psock->work_state.skb && copied > 0)
- schedule_work(&psock->work);
return copied;
}
EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
@@ -624,42 +622,33 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
static void sk_psock_skb_state(struct sk_psock *psock,
struct sk_psock_work_state *state,
- struct sk_buff *skb,
int len, int off)
{
spin_lock_bh(&psock->ingress_lock);
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
- state->skb = skb;
state->len = len;
state->off = off;
- } else {
- sock_drop(psock->sk, skb);
}
spin_unlock_bh(&psock->ingress_lock);
}
static void sk_psock_backlog(struct work_struct *work)
{
- struct sk_psock *psock = container_of(work, struct sk_psock, work);
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
struct sk_psock_work_state *state = &psock->work_state;
struct sk_buff *skb = NULL;
+ u32 len = 0, off = 0;
bool ingress;
- u32 len, off;
int ret;
mutex_lock(&psock->work_mutex);
- if (unlikely(state->skb)) {
- spin_lock_bh(&psock->ingress_lock);
- skb = state->skb;
+ if (unlikely(state->len)) {
len = state->len;
off = state->off;
- state->skb = NULL;
- spin_unlock_bh(&psock->ingress_lock);
}
- if (skb)
- goto start;
- while ((skb = skb_dequeue(&psock->ingress_skb))) {
+ while ((skb = skb_peek(&psock->ingress_skb))) {
len = skb->len;
off = 0;
if (skb_bpf_strparser(skb)) {
@@ -668,7 +657,6 @@ static void sk_psock_backlog(struct work_struct *work)
off = stm->offset;
len = stm->full_len;
}
-start:
ingress = skb_bpf_ingress(skb);
skb_bpf_redirect_clear(skb);
do {
@@ -678,22 +666,28 @@ start:
len, ingress);
if (ret <= 0) {
if (ret == -EAGAIN) {
- sk_psock_skb_state(psock, state, skb,
- len, off);
+ sk_psock_skb_state(psock, state, len, off);
+
+ /* Delay slightly to prioritize any
+ * other work that might be here.
+ */
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ schedule_delayed_work(&psock->work, 1);
goto end;
}
/* Hard errors break pipe and stop xmit. */
sk_psock_report_error(psock, ret ? -ret : EPIPE);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
- sock_drop(psock->sk, skb);
goto end;
}
off += ret;
len -= ret;
} while (len);
- if (!ingress)
+ skb = skb_dequeue(&psock->ingress_skb);
+ if (!ingress) {
kfree_skb(skb);
+ }
}
end:
mutex_unlock(&psock->work_mutex);
@@ -734,7 +728,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
INIT_LIST_HEAD(&psock->link);
spin_lock_init(&psock->link_lock);
- INIT_WORK(&psock->work, sk_psock_backlog);
+ INIT_DELAYED_WORK(&psock->work, sk_psock_backlog);
mutex_init(&psock->work_mutex);
INIT_LIST_HEAD(&psock->ingress_msg);
spin_lock_init(&psock->ingress_lock);
@@ -786,11 +780,6 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock)
skb_bpf_redirect_clear(skb);
sock_drop(psock->sk, skb);
}
- kfree_skb(psock->work_state.skb);
- /* We null the skb here to ensure that calls to sk_psock_backlog
- * do not pick up the free'd skb.
- */
- psock->work_state.skb = NULL;
__sk_psock_purge_ingress_msg(psock);
}
@@ -809,7 +798,6 @@ void sk_psock_stop(struct sk_psock *psock)
spin_lock_bh(&psock->ingress_lock);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
sk_psock_cork_free(psock);
- __sk_psock_zap_ingress(psock);
spin_unlock_bh(&psock->ingress_lock);
}
@@ -823,7 +811,8 @@ static void sk_psock_destroy(struct work_struct *work)
sk_psock_done_strp(psock);
- cancel_work_sync(&psock->work);
+ cancel_delayed_work_sync(&psock->work);
+ __sk_psock_zap_ingress(psock);
mutex_destroy(&psock->work_mutex);
psock_progs_drop(&psock->progs);
@@ -938,7 +927,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
}
skb_queue_tail(&psock_other->ingress_skb, skb);
- schedule_work(&psock_other->work);
+ schedule_delayed_work(&psock_other->work, 0);
spin_unlock_bh(&psock_other->ingress_lock);
return 0;
}
@@ -990,10 +979,8 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
err = -EIO;
sk_other = psock->sk;
if (sock_flag(sk_other, SOCK_DEAD) ||
- !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
- skb_bpf_redirect_clear(skb);
+ !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
goto out_free;
- }
skb_bpf_set_ingress(skb);
@@ -1018,22 +1005,23 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
spin_lock_bh(&psock->ingress_lock);
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
skb_queue_tail(&psock->ingress_skb, skb);
- schedule_work(&psock->work);
+ schedule_delayed_work(&psock->work, 0);
err = 0;
}
spin_unlock_bh(&psock->ingress_lock);
- if (err < 0) {
- skb_bpf_redirect_clear(skb);
+ if (err < 0)
goto out_free;
- }
}
break;
case __SK_REDIRECT:
+ tcp_eat_skb(psock->sk, skb);
err = sk_psock_skb_redirect(psock, skb);
break;
case __SK_DROP:
default:
out_free:
+ skb_bpf_redirect_clear(skb);
+ tcp_eat_skb(psock->sk, skb);
sock_drop(psock->sk, skb);
}
@@ -1049,7 +1037,7 @@ static void sk_psock_write_space(struct sock *sk)
psock = sk_psock(sk);
if (likely(psock)) {
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
- schedule_work(&psock->work);
+ schedule_delayed_work(&psock->work, 0);
write_space = psock->saved_write_space;
}
rcu_read_unlock();
@@ -1078,8 +1066,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
skb_dst_drop(skb);
skb_bpf_redirect_clear(skb);
ret = bpf_prog_run_pin_on_cpu(prog, skb);
- if (ret == SK_PASS)
- skb_bpf_set_strparser(skb);
+ skb_bpf_set_strparser(skb);
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
@@ -1183,12 +1170,11 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
int ret = __SK_DROP;
int len = skb->len;
- skb_get(skb);
-
rcu_read_lock();
psock = sk_psock(sk);
if (unlikely(!psock)) {
len = 0;
+ tcp_eat_skb(sk, skb);
sock_drop(sk, skb);
goto out;
}
@@ -1212,12 +1198,21 @@ out:
static void sk_psock_verdict_data_ready(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
+ int copied;
trace_sk_data_ready(sk);
if (unlikely(!sock || !sock->ops || !sock->ops->read_skb))
return;
- sock->ops->read_skb(sk, sk_psock_verdict_recv);
+ copied = sock->ops->read_skb(sk, sk_psock_verdict_recv);
+ if (copied >= 0) {
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ psock->saved_data_ready(sk);
+ rcu_read_unlock();
+ }
}
void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 7c189c2e2fbf..00afb66cd095 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1644,9 +1644,10 @@ void sock_map_close(struct sock *sk, long timeout)
rcu_read_unlock();
sk_psock_stop(psock);
release_sock(sk);
- cancel_work_sync(&psock->work);
+ cancel_delayed_work_sync(&psock->work);
sk_psock_put(sk, psock);
}
+
/* Make sure we do not recurse. This is a bug.
* Leak the socket instead of crashing on a stack overflow.
*/
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4d6392c16b7a..a60f6f4e7cd9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1571,7 +1571,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
-static void __tcp_cleanup_rbuf(struct sock *sk, int copied)
+void __tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
bool time_to_ack = false;
@@ -1773,7 +1773,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
used = recv_actor(sk, skb);
- consume_skb(skb);
if (used < 0) {
if (!copied)
copied = used;
@@ -1787,14 +1786,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
break;
}
}
- WRITE_ONCE(tp->copied_seq, seq);
-
- tcp_rcv_space_adjust(sk);
-
- /* Clean up data we have read: This will do ACK frames. */
- if (copied > 0)
- __tcp_cleanup_rbuf(sk, copied);
-
return copied;
}
EXPORT_SYMBOL(tcp_read_skb);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 2e9547467edb..5f93918c063c 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -11,6 +11,24 @@
#include <net/inet_common.h>
#include <net/tls.h>
+void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tcp;
+ int copied;
+
+ if (!skb || !skb->len || !sk_is_tcp(sk))
+ return;
+
+ if (skb_bpf_strparser(skb))
+ return;
+
+ tcp = tcp_sk(sk);
+ copied = tcp->copied_seq + skb->len;
+ WRITE_ONCE(tcp->copied_seq, copied);
+ tcp_rcv_space_adjust(sk);
+ __tcp_cleanup_rbuf(sk, skb->len);
+}
+
static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, u32 apply_bytes, int flags)
{
@@ -174,14 +192,34 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
return ret;
}
+static bool is_next_msg_fin(struct sk_psock *psock)
+{
+ struct scatterlist *sge;
+ struct sk_msg *msg_rx;
+ int i;
+
+ msg_rx = sk_psock_peek_msg(psock);
+ i = msg_rx->sg.start;
+ sge = sk_msg_elem(msg_rx, i);
+ if (!sge->length) {
+ struct sk_buff *skb = msg_rx->skb;
+
+ if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ return true;
+ }
+ return false;
+}
+
static int tcp_bpf_recvmsg_parser(struct sock *sk,
struct msghdr *msg,
size_t len,
int flags,
int *addr_len)
{
+ struct tcp_sock *tcp = tcp_sk(sk);
+ u32 seq = tcp->copied_seq;
struct sk_psock *psock;
- int copied;
+ int copied = 0;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
@@ -194,8 +232,43 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
return tcp_recvmsg(sk, msg, len, flags, addr_len);
lock_sock(sk);
+
+ /* We may have received data on the sk_receive_queue pre-accept and
+ * then we can not use read_skb in this context because we haven't
+ * assigned a sk_socket yet so have no link to the ops. The work-around
+ * is to check the sk_receive_queue and in these cases read skbs off
+ * queue again. The read_skb hook is not running at this point because
+ * of lock_sock so we avoid having multiple runners in read_skb.
+ */
+ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) {
+ tcp_data_ready(sk);
+ /* This handles the ENOMEM errors if we both receive data
+ * pre accept and are already under memory pressure. At least
+ * let user know to retry.
+ */
+ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) {
+ copied = -EAGAIN;
+ goto out;
+ }
+ }
+
msg_bytes_ready:
copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+ /* The typical case for EFAULT is the socket was gracefully
+ * shutdown with a FIN pkt. So check here the other case is
+ * some error on copy_page_to_iter which would be unexpected.
+ * On fin return correct return code to zero.
+ */
+ if (copied == -EFAULT) {
+ bool is_fin = is_next_msg_fin(psock);
+
+ if (is_fin) {
+ copied = 0;
+ seq++;
+ goto out;
+ }
+ }
+ seq += copied;
if (!copied) {
long timeo;
int data;
@@ -233,6 +306,10 @@ msg_bytes_ready:
copied = -EAGAIN;
}
out:
+ WRITE_ONCE(tcp->copied_seq, seq);
+ tcp_rcv_space_adjust(sk);
+ if (copied > 0)
+ __tcp_cleanup_rbuf(sk, copied);
release_sock(sk);
sk_psock_put(sk, psock);
return copied;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index aa32afd871ee..9482def1f310 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1818,7 +1818,7 @@ EXPORT_SYMBOL(__skb_recv_udp);
int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
struct sk_buff *skb;
- int err, copied;
+ int err;
try_again:
skb = skb_recv_udp(sk, MSG_DONTWAIT, &err);
@@ -1837,10 +1837,7 @@ try_again:
}
WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
- copied = recv_actor(sk, skb);
- kfree_skb(skb);
-
- return copied;
+ return recv_actor(sk, skb);
}
EXPORT_SYMBOL(udp_read_skb);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index cc695c9f09ec..e7728b57a8c7 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2553,7 +2553,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
struct unix_sock *u = unix_sk(sk);
struct sk_buff *skb;
- int err, copied;
+ int err;
mutex_lock(&u->iolock);
skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
@@ -2561,10 +2561,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
if (!skb)
return err;
- copied = recv_actor(sk, skb);
- kfree_skb(skb);
-
- return copied;
+ return recv_actor(sk, skb);
}
/*
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index e4878551f140..b769fc258931 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1441,7 +1441,6 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto
struct sock *sk = sk_vsock(vsk);
struct sk_buff *skb;
int off = 0;
- int copied;
int err;
spin_lock_bh(&vvs->rx_lock);
@@ -1454,9 +1453,7 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto
if (!skb)
return err;
- copied = recv_actor(sk, skb);
- kfree_skb(skb);
- return copied;
+ return recv_actor(sk, skb);
}
EXPORT_SYMBOL_GPL(virtio_transport_read_skb);