diff options
Diffstat (limited to 'net/core/skmsg.c')
| -rw-r--r-- | net/core/skmsg.c | 308 |
1 files changed, 186 insertions, 122 deletions
diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8eb671c827f9..2ac7731e1e0a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -8,6 +8,7 @@ #include <net/sock.h> #include <net/tcp.h> #include <net/tls.h> +#include <trace/events/sock.h> static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { @@ -27,6 +28,7 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce) { struct page_frag *pfrag = sk_page_frag(sk); + u32 osize = msg->sg.size; int ret = 0; len -= msg->sg.size; @@ -35,13 +37,17 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, u32 orig_offset; int use, i; - if (!sk_page_frag_refill(sk, pfrag)) - return -ENOMEM; + if (!sk_page_frag_refill(sk, pfrag)) { + ret = -ENOMEM; + goto msg_trim; + } orig_offset = pfrag->offset; use = min_t(int, len, pfrag->size - orig_offset); - if (!sk_wmem_schedule(sk, use)) - return -ENOMEM; + if (!sk_wmem_schedule(sk, use)) { + ret = -ENOMEM; + goto msg_trim; + } i = msg->sg.end; sk_msg_iter_var_prev(i); @@ -71,6 +77,10 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, } return ret; + +msg_trim: + sk_msg_trim(sk, msg, osize); + return ret; } EXPORT_SYMBOL_GPL(sk_msg_alloc); @@ -283,7 +293,7 @@ out: /* If we trim data a full sg elem before curr pointer update * copybreak and current so that any future copy operations * start at new copy location. - * However trimed data that has not yet been used in a copy op + * However trimmed data that has not yet been used in a copy op * does not require an update. */ if (!msg->sg.size) { @@ -315,14 +325,13 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, goto out; } - copied = iov_iter_get_pages(from, pages, bytes, maxpages, + copied = iov_iter_get_pages2(from, pages, bytes, maxpages, &offset); if (copied <= 0) { ret = -EFAULT; goto out; } - iov_iter_advance(from, copied); bytes -= copied; msg->sg.size += copied; @@ -360,8 +369,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int ret = -ENOSPC, i = msg->sg.curr; + u32 copy, buf_size, copied = 0; struct scatterlist *sge; - u32 copy, buf_size; void *to; do { @@ -388,6 +397,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, goto out; } bytes -= copy; + copied += copy; if (!bytes) break; msg->sg.copybreak = 0; @@ -395,7 +405,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, } while (i != msg->sg.end); out: msg->sg.curr = i; - return ret; + return (ret < 0) ? ret : copied; } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); @@ -425,16 +435,21 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, page = sg_page(sge); if (copied + copy > len) copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? copied : -EFAULT; + if (copy) + copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (!copy) { + copied = copied ? copied : -EFAULT; + goto out; + } copied += copy; if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - if (!msg_rx->skb) + if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); + atomic_sub(copy, &sk->sk_rmem_alloc); + } msg_rx->sg.size -= copy; if (!sge->length) { @@ -447,13 +462,13 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, * didn't copy the entire length lets just break. */ if (copy != sge->length) - return copied; + goto out; sk_msg_iter_var_next(i); } if (copied == len) break; - } while (i != msg_rx->sg.end); + } while ((i != msg_rx->sg.end) && !sg_is_last(sge)); if (unlikely(peek)) { msg_rx = sk_psock_next_msg(psock, msg_rx); @@ -463,13 +478,13 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } msg_rx->sg.start = i; - if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { + if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) { msg_rx = sk_psock_dequeue_msg(psock); kfree_sk_msg(msg_rx); } msg_rx = sk_psock_peek_msg(psock); } - +out: return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); @@ -488,49 +503,66 @@ bool sk_msg_is_readable(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_msg_is_readable); -static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, - struct sk_buff *skb) +static struct sk_msg *alloc_sk_msg(gfp_t gfp) { struct sk_msg *msg; - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) + msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN); + if (unlikely(!msg)) return NULL; + sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); + return msg; +} - if (!sk_rmem_schedule(sk, skb, skb->truesize)) +static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, + struct sk_buff *skb) +{ + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) return NULL; - msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL); - if (unlikely(!msg)) + if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; - sk_msg_init(msg); - return msg; + return alloc_sk_msg(GFP_KERNEL); } static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, u32 off, u32 len, struct sk_psock *psock, struct sock *sk, - struct sk_msg *msg) + struct sk_msg *msg, + bool take_ref) { int num_sge, copied; - /* skb linearize may fail with ENOMEM, but lets simply try again - * later if this happens. Under memory pressure we don't want to - * drop the skb. We need to linearize the skb so that the mapping - * in skb_to_sgvec can not error. + /* skb_to_sgvec will fail when the total number of fragments in + * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the + * caller may aggregate multiple skbs. */ - if (skb_linearize(skb)) - return -EAGAIN; num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); - if (unlikely(num_sge < 0)) - return num_sge; + if (num_sge < 0) { + /* skb linearize may fail with ENOMEM, but lets simply try again + * later if this happens. Under memory pressure we don't want to + * drop the skb. We need to linearize the skb so that the mapping + * in skb_to_sgvec can not error. + * Note that skb_linearize requires the skb not to be shared. + */ + if (skb_linearize(skb)) + return -EAGAIN; + + num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); + if (unlikely(num_sge < 0)) + return num_sge; + } +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + psock->ingress_bytes += len; +#endif copied = len; msg->sg.start = 0; msg->sg.size = copied; msg->sg.end = num_sge; - msg->skb = skb; + msg->skb = take_ref ? skb_get(skb) : skb; sk_psock_queue_msg(psock, msg); sk_psock_data_ready(sk, psock); @@ -538,7 +570,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, } static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len); + u32 off, u32 len, bool take_ref); static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) @@ -552,7 +584,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * correctly. */ if (unlikely(skb->sk == sk)) - return sk_psock_skb_ingress_self(psock, skb, off, len); + return sk_psock_skb_ingress_self(psock, skb, off, len, true); msg = sk_psock_create_ingress_msg(sk, skb); if (!msg) return -EAGAIN; @@ -564,7 +596,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * into user buffers. */ skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true); if (err < 0) kfree(msg); return err; @@ -575,17 +607,16 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * because the skb is already accounted for here. */ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len) + u32 off, u32 len, bool take_ref) { - struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; int err; if (unlikely(!msg)) return -EAGAIN; - sk_msg_init(msg); skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); if (err < 0) kfree(msg); return err; @@ -599,47 +630,48 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } + return sk_psock_skb_ingress(psock, skb, off, len); } static void sk_psock_skb_state(struct sk_psock *psock, struct sk_psock_work_state *state, - struct sk_buff *skb, int len, int off) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { - state->skb = skb; state->len = len; state->off = off; - } else { - sock_drop(psock->sk, skb); } spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_backlog(struct work_struct *work) { - struct sk_psock *psock = container_of(work, struct sk_psock, work); + struct delayed_work *dwork = to_delayed_work(work); + struct sk_psock *psock = container_of(dwork, struct sk_psock, work); struct sk_psock_work_state *state = &psock->work_state; struct sk_buff *skb = NULL; + u32 len = 0, off = 0; bool ingress; - u32 len, off; int ret; - mutex_lock(&psock->work_mutex); - if (unlikely(state->skb)) { - spin_lock_bh(&psock->ingress_lock); - skb = state->skb; - len = state->len; - off = state->off; - state->skb = NULL; - spin_unlock_bh(&psock->ingress_lock); - } - if (skb) - goto start; + /* If sk is quickly removed from the map and then added back, the old + * psock should not be scheduled, because there are now two psocks + * pointing to the same sk. + */ + if (!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + return; - while ((skb = skb_dequeue(&psock->ingress_skb))) { + /* Increment the psock refcnt to synchronize with close(fd) path in + * sock_map_close(), ensuring we wait for backlog thread completion + * before sk_socket freed. If refcnt increment fails, it indicates + * sock_map_close() completed with sk_socket potentially already freed. + */ + if (!sk_psock_get(psock->sk)) + return; + mutex_lock(&psock->work_mutex); + while ((skb = skb_peek(&psock->ingress_skb))) { len = skb->len; off = 0; if (skb_bpf_strparser(skb)) { @@ -648,7 +680,13 @@ static void sk_psock_backlog(struct work_struct *work) off = stm->offset; len = stm->full_len; } -start: + + /* Resume processing from previous partial state */ + if (unlikely(state->len)) { + len = state->len; + off = state->off; + } + ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); do { @@ -658,25 +696,33 @@ start: len, ingress); if (ret <= 0) { if (ret == -EAGAIN) { - sk_psock_skb_state(psock, state, skb, - len, off); + sk_psock_skb_state(psock, state, len, off); + /* Restore redir info we cleared before */ + skb_bpf_set_redir(skb, psock->sk, ingress); + /* Delay slightly to prioritize any + * other work that might be here. + */ + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_delayed_work(&psock->work, 1); goto end; } /* Hard errors break pipe and stop xmit. */ sk_psock_report_error(psock, ret ? -ret : EPIPE); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - sock_drop(psock->sk, skb); goto end; } off += ret; len -= ret; } while (len); - if (!ingress) - kfree_skb(skb); + /* The entire skb sent, clear state */ + sk_psock_skb_state(psock, state, 0, 0); + skb = skb_dequeue(&psock->ingress_skb); + kfree_skb(skb); } end: mutex_unlock(&psock->work_mutex); + sk_psock_put(psock->sk, psock); } struct sk_psock *sk_psock_init(struct sock *sk, int node) @@ -686,6 +732,11 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) write_lock_bh(&sk->sk_callback_lock); + if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) { + psock = ERR_PTR(-EINVAL); + goto out; + } + if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; @@ -702,13 +753,14 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) psock->eval = __SK_NONE; psock->sk_proto = prot; psock->saved_unhash = prot->unhash; + psock->saved_destroy = prot->destroy; psock->saved_close = prot->close; psock->saved_write_space = sk->sk_write_space; INIT_LIST_HEAD(&psock->link); spin_lock_init(&psock->link_lock); - INIT_WORK(&psock->work, sk_psock_backlog); + INIT_DELAYED_WORK(&psock->work, sk_psock_backlog); mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); @@ -717,7 +769,9 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); refcount_set(&psock->refcnt, 1); - rcu_assign_sk_user_data_nocopy(sk, psock); + __rcu_assign_sk_user_data_with_flags(sk, psock, + SK_USER_DATA_NOCOPY | + SK_USER_DATA_PSOCK); sock_hold(sk); out: @@ -745,6 +799,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } @@ -758,11 +814,6 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); sock_drop(psock->sk, skb); } - kfree_skb(psock->work_state.skb); - /* We null the skb here to ensure that calls to sk_psock_backlog - * do not pick up the free'd skb. - */ - psock->work_state.skb = NULL; __sk_psock_purge_ingress_msg(psock); } @@ -776,16 +827,12 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } -void sk_psock_stop(struct sk_psock *psock, bool wait) +void sk_psock_stop(struct sk_psock *psock) { spin_lock_bh(&psock->ingress_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); sk_psock_cork_free(psock); - __sk_psock_zap_ingress(psock); spin_unlock_bh(&psock->ingress_lock); - - if (wait) - cancel_work_sync(&psock->work); } static void sk_psock_done_strp(struct sk_psock *psock); @@ -798,7 +845,8 @@ static void sk_psock_destroy(struct work_struct *work) sk_psock_done_strp(psock); - cancel_work_sync(&psock->work); + cancel_delayed_work_sync(&psock->work); + __sk_psock_zap_ingress(psock); mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); @@ -808,6 +856,8 @@ static void sk_psock_destroy(struct work_struct *work) if (psock->sk_redir) sock_put(psock->sk_redir); + if (psock->sk_pair) + sock_put(psock->sk_pair); sock_put(psock->sk); kfree(psock); } @@ -823,10 +873,10 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - sk_psock_stop(psock, false); + sk_psock_stop(psock); INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); - queue_rcu_work(system_wq, &psock->rwork); + queue_rcu_work(system_percpu_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); @@ -862,13 +912,16 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { - if (psock->sk_redir) + if (psock->sk_redir) { sock_put(psock->sk_redir); - psock->sk_redir = msg->sk_redir; - if (!psock->sk_redir) { + psock->sk_redir = NULL; + } + if (!msg->sk_redir) { ret = __SK_DROP; goto out; } + psock->redir_ingress = sk_msg_to_ingress(msg); + psock->sk_redir = msg->sk_redir; sock_hold(psock->sk_redir); } out: @@ -910,7 +963,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) } skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); + schedule_delayed_work(&psock_other->work, 0); spin_unlock_bh(&psock_other->ingress_lock); return 0; } @@ -962,10 +1015,8 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, err = -EIO; sk_other = psock->sk; if (sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { - skb_bpf_redirect_clear(skb); + !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) goto out_free; - } skb_bpf_set_ingress(skb); @@ -984,28 +1035,29 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, off = stm->offset; len = stm->full_len; } - err = sk_psock_skb_ingress_self(psock, skb, off, len); + err = sk_psock_skb_ingress_self(psock, skb, off, len, false); } if (err < 0) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); err = 0; } spin_unlock_bh(&psock->ingress_lock); - if (err < 0) { - skb_bpf_redirect_clear(skb); + if (err < 0) goto out_free; - } } break; case __SK_REDIRECT: + tcp_eat_skb(psock->sk, skb); err = sk_psock_skb_redirect(psock, skb); break; case __SK_DROP: default: out_free: + skb_bpf_redirect_clear(skb); + tcp_eat_skb(psock->sk, skb); sock_drop(psock->sk, skb); } @@ -1021,7 +1073,7 @@ static void sk_psock_write_space(struct sock *sk) psock = sk_psock(sk); if (likely(psock)) { if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); write_space = psock->saved_write_space; } rcu_read_unlock(); @@ -1050,8 +1102,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); - if (ret == SK_PASS) - skb_bpf_set_strparser(skb); + skb_bpf_set_strparser(skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } @@ -1087,15 +1138,17 @@ static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; + trace_sk_data_ready(sk); + rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { psock->saved_data_ready(sk); } else { - write_lock_bh(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); strp_data_ready(&psock->strp); - write_unlock_bh(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); @@ -1103,13 +1156,23 @@ static void sk_psock_strp_data_ready(struct sock *sk) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { + int ret; + static const struct strp_callbacks cb = { .rcv_msg = sk_psock_strp_read, .read_sock_done = sk_psock_strp_read_done, .parse_msg = sk_psock_strp_parse, }; - return strp_init(&psock->strp, sk, &cb); + ret = strp_init(&psock->strp, sk, &cb); + if (!ret) + sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + + if (sk_is_tcp(sk)) { + psock->strp.cb.read_sock = tcp_bpf_strp_read_sock; + psock->copied_seq = tcp_sk(sk)->copied_seq; + } + return ret; } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) @@ -1137,7 +1200,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ - if (psock->progs.stream_parser) + if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED)) strp_done(&psock->strp); } #else @@ -1146,26 +1209,18 @@ static void sk_psock_done_strp(struct sk_psock *psock) } #endif /* CONFIG_BPF_STREAM_PARSER */ -static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, - unsigned int offset, size_t orig_len) +static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = (struct sock *)desc->arg.data; struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; int len = skb->len; - /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ - skb = skb_clone(skb, GFP_ATOMIC); - if (!skb) { - desc->error = -ENOMEM; - return 0; - } - rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { len = 0; + tcp_eat_skb(sk, skb); sock_drop(sk, skb); goto out; } @@ -1173,15 +1228,14 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, if (!prog) prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - skb->sk = sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); - skb->sk = NULL; } - if (sk_psock_verdict_apply(psock, skb, ret) < 0) - len = 0; + ret = sk_psock_verdict_apply(psock, skb, ret); + if (ret < 0) + len = ret; out: rcu_read_unlock(); return len; @@ -1190,16 +1244,26 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; - read_descriptor_t desc; + const struct proto_ops *ops; + int copied; - if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) - return; + trace_sk_data_ready(sk); - desc.arg.data = sk; - desc.error = 0; - desc.count = 1; - - sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); + if (unlikely(!sock)) + return; + ops = READ_ONCE(sock->ops); + if (!ops || !ops->read_skb) + return; + copied = ops->read_skb(sk, sk_psock_verdict_recv); + if (copied >= 0) { + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) + sk_psock_data_ready(sk, psock); + rcu_read_unlock(); + } } void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) |
