diff options
Diffstat (limited to 'net/ipv4/tcp_bpf.c')
| -rw-r--r-- | net/ipv4/tcp_bpf.c | 696 |
1 files changed, 372 insertions, 324 deletions
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 1bb7321a256d..a268e1595b22 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -6,157 +6,31 @@ #include <linux/bpf.h> #include <linux/init.h> #include <linux/wait.h> +#include <linux/util_macros.h> #include <net/inet_common.h> #include <net/tls.h> -static bool tcp_bpf_stream_read(const struct sock *sk) +void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) { - struct sk_psock *psock; - bool empty = true; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) - empty = list_empty(&psock->ingress_msg); - rcu_read_unlock(); - return !empty; -} - -static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, - int flags, long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} - -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags) -{ - struct iov_iter *iter = &msg->msg_iter; - int peek = flags & MSG_PEEK; - int i, ret, copied = 0; - struct sk_msg *msg_rx; - - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - - while (copied != len) { - struct scatterlist *sge; - - if (unlikely(!msg_rx)) - break; + struct tcp_sock *tcp; + int copied; - i = msg_rx->sg.start; - do { - struct page *page; - int copy; - - sge = sk_msg_elem(msg_rx, i); - copy = sge->length; - page = sg_page(sge); - if (copied + copy > len) - copy = len - copied; - ret = copy_page_to_iter(page, sge->offset, copy, iter); - if (ret != copy) { - msg_rx->sg.start = i; - return -EFAULT; - } - - copied += copy; - if (likely(!peek)) { - sge->offset += copy; - sge->length -= copy; - sk_mem_uncharge(sk, copy); - msg_rx->sg.size -= copy; - - if (!sge->length) { - sk_msg_iter_var_next(i); - if (!msg_rx->skb) - put_page(page); - } - } else { - sk_msg_iter_var_next(i); - } - - if (copied == len) - break; - } while (i != msg_rx->sg.end); - - if (unlikely(peek)) { - msg_rx = list_next_entry(msg_rx, list); - continue; - } - - msg_rx->sg.start = i; - if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - list_del(&msg_rx->list); - if (msg_rx->skb) - consume_skb(msg_rx->skb); - kfree(msg_rx); - } - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - } - - return copied; -} -EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); - -int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len) -{ - struct sk_psock *psock; - int copied, ret; - - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + if (!skb || !skb->len || !sk_is_tcp(sk)) + return; - psock = sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); -msg_bytes_ready: - copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); - if (!copied) { - int data, err = 0; - long timeo; + if (skb_bpf_strparser(skb)) + return; - timeo = sock_rcvtimeo(sk, nonblock); - data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); - if (data) { - if (skb_queue_empty(&sk->sk_receive_queue)) - goto msg_bytes_ready; - release_sock(sk); - sk_psock_put(sk, psock); - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - } - if (err) { - ret = err; - goto out; - } - copied = -EAGAIN; - } - ret = copied; -out: - release_sock(sk); - sk_psock_put(sk, psock); - return ret; + tcp = tcp_sk(sk); + copied = tcp->copied_seq + skb->len; + WRITE_ONCE(tcp->copied_seq, copied); + tcp_rcv_space_adjust(sk); + __tcp_cleanup_rbuf(sk, skb->len); } static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, - struct sk_msg *msg, u32 apply_bytes, int flags) + struct sk_msg *msg, u32 apply_bytes) { bool apply = apply_bytes; struct scatterlist *sge; @@ -175,13 +49,14 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, sge = sk_msg_elem(msg, i); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; - if (!sk_wmem_schedule(sk, size)) { + if (!__sk_rmem_schedule(sk, size, false)) { if (!copied) ret = -ENOMEM; break; } sk_mem_charge(sk, size); + atomic_add(size, &sk->sk_rmem_alloc); sk_msg_xfer(tmp, msg, i, size); copied += size; if (sge->length) @@ -190,15 +65,18 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, tmp->sg.end = i; if (apply) { apply_bytes -= size; - if (!apply_bytes) + if (!apply_bytes) { + if (sge->length) + sk_msg_iter_var_prev(i); break; + } } } while (i != msg->sg.end); if (!ret) { msg->sg.start = i; - msg->sg.size -= apply_bytes; - sk_psock_queue_msg(psock, tmp); + if (!sk_psock_queue_msg(psock, tmp)) + atomic_sub(copied, &sk->sk_rmem_alloc); sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); @@ -212,6 +90,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, int flags, bool uncharge) { + struct msghdr msghdr = {}; bool apply = apply_bytes; struct scatterlist *sge; struct page *page; @@ -219,6 +98,7 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, u32 off; while (1) { + struct bio_vec bvec; bool has_tx_ulp; sge = sk_msg_elem(msg, msg->sg.start); @@ -229,17 +109,20 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, tcp_rate_check_app_limited(sk); retry: + msghdr.msg_flags = flags | MSG_SPLICE_PAGES; has_tx_ulp = tls_sw_has_ctx_tx(sk); - if (has_tx_ulp) { - flags |= MSG_SENDPAGE_NOPOLICY; - ret = kernel_sendpage_locked(sk, - page, off, size, flags); - } else { - ret = do_tcp_sendpages(sk, page, off, size, flags); - } + if (has_tx_ulp) + msghdr.msg_flags |= MSG_SENDPAGE_NOPOLICY; + + if (size < sge->length && msg->sg.start != msg->sg.end) + msghdr.msg_flags |= MSG_MORE; + bvec_set_page(&bvec, page, size, off); + iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, size); + ret = tcp_sendmsg_locked(sk, &msghdr, size); if (ret <= 0) return ret; + if (apply) apply_bytes -= ret; msg->sg.size -= ret; @@ -277,30 +160,234 @@ static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, return ret; } -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, - u32 bytes, int flags) +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, + struct sk_msg *msg, u32 bytes, int flags) { - bool ingress = sk_msg_to_ingress(msg); struct sk_psock *psock = sk_psock_get(sk); int ret; - if (unlikely(!psock)) { - sk_msg_free(sk, msg); - return 0; - } - ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) : + if (unlikely(!psock)) + return -EPIPE; + + ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes) : tcp_bpf_push_locked(sk, msg, bytes, flags, false); sk_psock_put(sk, psock); return ret; } EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); +#ifdef CONFIG_BPF_SYSCALL +static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, + long timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty_lockless(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + +static bool is_next_msg_fin(struct sk_psock *psock) +{ + struct scatterlist *sge; + struct sk_msg *msg_rx; + int i; + + msg_rx = sk_psock_peek_msg(psock); + i = msg_rx->sg.start; + sge = sk_msg_elem(msg_rx, i); + if (!sge->length) { + struct sk_buff *skb = msg_rx->skb; + + if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + return true; + } + return false; +} + +static int tcp_bpf_recvmsg_parser(struct sock *sk, + struct msghdr *msg, + size_t len, + int flags, + int *addr_len) +{ + int peek = flags & MSG_PEEK; + struct sk_psock *psock; + struct tcp_sock *tcp; + int copied = 0; + u32 seq; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + if (!len) + return 0; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, flags, addr_len); + + lock_sock(sk); + tcp = tcp_sk(sk); + seq = tcp->copied_seq; + /* We may have received data on the sk_receive_queue pre-accept and + * then we can not use read_skb in this context because we haven't + * assigned a sk_socket yet so have no link to the ops. The work-around + * is to check the sk_receive_queue and in these cases read skbs off + * queue again. The read_skb hook is not running at this point because + * of lock_sock so we avoid having multiple runners in read_skb. + */ + if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { + tcp_data_ready(sk); + /* This handles the ENOMEM errors if we both receive data + * pre accept and are already under memory pressure. At least + * let user know to retry. + */ + if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { + copied = -EAGAIN; + goto out; + } + } + +msg_bytes_ready: + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + /* The typical case for EFAULT is the socket was gracefully + * shutdown with a FIN pkt. So check here the other case is + * some error on copy_page_to_iter which would be unexpected. + * On fin return correct return code to zero. + */ + if (copied == -EFAULT) { + bool is_fin = is_next_msg_fin(psock); + + if (is_fin) { + copied = 0; + seq++; + goto out; + } + } + seq += copied; + if (!copied) { + long timeo; + int data; + + if (sock_flag(sk, SOCK_DONE)) + goto out; + + if (sk->sk_err) { + copied = sock_error(sk); + goto out; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out; + + if (sk->sk_state == TCP_CLOSE) { + copied = -ENOTCONN; + goto out; + } + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + if (!timeo) { + copied = -EAGAIN; + goto out; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + goto out; + } + + data = tcp_msg_wait_data(sk, psock, timeo); + if (data < 0) { + copied = data; + goto unlock; + } + if (data && !sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + copied = -EAGAIN; + } +out: + if (!peek) + WRITE_ONCE(tcp->copied_seq, seq); + tcp_rcv_space_adjust(sk); + if (copied > 0) + __tcp_cleanup_rbuf(sk, copied); + +unlock: + release_sock(sk); + sk_psock_put(sk, psock); + return copied; +} + +static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int flags, int *addr_len) +{ + struct sk_psock *psock; + int copied, ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + if (!len) + return 0; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, flags, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue) && + sk_psock_queue_empty(psock)) { + sk_psock_put(sk, psock); + return tcp_recvmsg(sk, msg, len, flags, addr_len); + } + lock_sock(sk); +msg_bytes_ready: + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + long timeo; + int data; + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + data = tcp_msg_wait_data(sk, psock, timeo); + if (data < 0) { + ret = data; + goto unlock; + } + if (data) { + if (!sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + release_sock(sk); + sk_psock_put(sk, psock); + return tcp_recvmsg(sk, msg, len, flags, addr_len); + } + copied = -EAGAIN; + } + ret = copied; + +unlock: + release_sock(sk); + sk_psock_put(sk, psock); + return ret; +} + static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { - bool cork = false, enospc = msg->sg.start == msg->sg.end; + bool cork = false, enospc = sk_msg_full(msg), redir_ingress; struct sock *sk_redir; - u32 tosend, delta = 0; + u32 tosend, origsize, sent, delta = 0; + u32 eval; int ret; more_data: @@ -312,10 +399,7 @@ more_data: */ delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); - if (msg->sg.size < delta) - delta -= msg->sg.size; - else - delta = 0; + delta -= msg->sg.size; } if (msg->cork_bytes && @@ -324,8 +408,11 @@ more_data: if (!psock->cork) { psock->cork = kzalloc(sizeof(*psock->cork), GFP_ATOMIC | __GFP_NOWARN); - if (!psock->cork) + if (!psock->cork) { + sk_msg_free(sk, msg); + *copied = 0; return -ENOMEM; + } } memcpy(psock->cork, msg, sizeof(*msg)); return 0; @@ -334,6 +421,7 @@ more_data: tosend = msg->sg.size; if (psock->apply_bytes && psock->apply_bytes < tosend) tosend = psock->apply_bytes; + eval = __SK_NONE; switch (psock->eval) { case __SK_PASS: @@ -345,18 +433,33 @@ more_data: sk_msg_apply_bytes(psock, tosend); break; case __SK_REDIRECT: + redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); + if (!psock->apply_bytes) { + /* Clean up before releasing the sock lock. */ + eval = psock->eval; + psock->eval = __SK_NONE; + psock->sk_redir = NULL; + } if (psock->cork) { cork = true; psock->cork = NULL; } - sk_msg_return(sk, msg, tosend); release_sock(sk); - ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + + origsize = msg->sg.size; + ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, + msg, tosend, flags); + sent = origsize - msg->sg.size; + + if (eval == __SK_REDIRECT) + sock_put(sk_redir); + lock_sock(sk); + sk_mem_uncharge(sk, sent); if (unlikely(ret < 0)) { - int free = sk_msg_free_nocharge(sk, msg); + int free = sk_msg_free(sk, msg); if (!cork) *copied -= free; @@ -370,7 +473,7 @@ more_data: break; case __SK_DROP: default: - sk_msg_free_partial(sk, msg, tosend); + sk_msg_free(sk, msg); sk_msg_apply_bytes(psock, tosend); *copied -= (tosend + delta); return -EACCES; @@ -395,10 +498,14 @@ more_data: static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; - int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; - int copied = 0, err = 0; + int copied = 0, err = 0, ret = 0; struct sk_psock *psock; long timeo; + int flags; + + /* Don't let internal flags through */ + flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); + flags |= MSG_NO_SHARED_FRAGS; psock = sk_psock_get(sk); if (unlikely(!psock)) @@ -434,14 +541,14 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) copy = msg_tx->sg.size - osize; } - err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, + ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, copy); - if (err < 0) { + if (ret < 0) { sk_msg_trim(sk, msg_tx, osize); goto out_err; } - copied += copy; + copied += ret; if (psock->cork_bytes) { if (size > psock->cork_bytes) psock->cork_bytes = 0; @@ -473,108 +580,7 @@ out_err: err = sk_stream_error(sk, msg->msg_flags, err); release_sock(sk); sk_psock_put(sk, psock); - return copied ? copied : err; -} - -static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - struct sk_msg tmp, *msg = NULL; - int err = 0, copied = 0; - struct sk_psock *psock; - bool enospc = false; - - psock = sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_sendpage(sk, page, offset, size, flags); - - lock_sock(sk); - if (psock->cork) { - msg = psock->cork; - } else { - msg = &tmp; - sk_msg_init(msg); - } - - /* Catch case where ring is full and sendpage is stalled. */ - if (unlikely(sk_msg_full(msg))) - goto out_err; - - sk_msg_page_add(msg, page, size, offset); - sk_mem_charge(sk, size); - copied = size; - if (sk_msg_full(msg)) - enospc = true; - if (psock->cork_bytes) { - if (size > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= size; - if (psock->cork_bytes && !enospc) - goto out_err; - /* All cork bytes are accounted, rerun the prog. */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags); -out_err: - release_sock(sk); - sk_psock_put(sk, psock); - return copied ? copied : err; -} - -static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_link *link; - - sk_psock_cork_free(psock); - __sk_psock_purge_ingress_msg(psock); - while ((link = sk_psock_link_pop(psock))) { - sk_psock_unlink(sk, link); - sk_psock_free_link(link); - } -} - -static void tcp_bpf_unhash(struct sock *sk) -{ - void (*saved_unhash)(struct sock *sk); - struct sk_psock *psock; - - rcu_read_lock(); - psock = sk_psock(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; - } - - saved_unhash = psock->saved_unhash; - tcp_bpf_remove(sk, psock); - rcu_read_unlock(); - saved_unhash(sk); -} - -static void tcp_bpf_close(struct sock *sk, long timeout) -{ - void (*saved_close)(struct sock *sk, long timeout); - struct sk_psock *psock; - - lock_sock(sk); - rcu_read_lock(); - psock = sk_psock(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - - saved_close = psock->saved_close; - tcp_bpf_remove(sk, psock); - rcu_read_unlock(); - release_sock(sk); - saved_close(sk, timeout); + return copied > 0 ? copied : err; } enum { @@ -586,6 +592,8 @@ enum { enum { TCP_BPF_BASE, TCP_BPF_TX, + TCP_BPF_RX, + TCP_BPF_TXRX, TCP_BPF_NUM_CFGS, }; @@ -597,20 +605,24 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; - prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash; - prot[TCP_BPF_BASE].close = tcp_bpf_close; + prot[TCP_BPF_BASE].destroy = sock_map_destroy; + prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; - prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; + prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; - prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; + + prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; + prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; + + prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX]; + prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser; } -static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops) +static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) { - if (sk->sk_family == AF_INET6 && - unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { + if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { spin_lock_bh(&tcpv6_prot_lock); if (likely(ops != tcpv6_prot_saved)) { tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops); @@ -625,27 +637,7 @@ static int __init tcp_bpf_v4_build_proto(void) tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot); return 0; } -core_initcall(tcp_bpf_v4_build_proto); - -static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; - int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; - - sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]); -} - -static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; - int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; - - /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed - * or added requiring sk_prot hook updates. We keep original saved - * hooks in this case. - */ - sk->sk_prot = &tcp_bpf_prots[family][config]; -} +late_initcall(tcp_bpf_v4_build_proto); static int tcp_bpf_assert_proto_ops(struct proto *ops) { @@ -654,38 +646,94 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) * indeed valid assumptions. */ return ops->recvmsg == tcp_recvmsg && - ops->sendmsg == tcp_sendmsg && - ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; + ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP; } -void tcp_bpf_reinit(struct sock *sk) +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) +int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor) { + struct sock *sk = strp->sk; struct sk_psock *psock; + struct tcp_sock *tp; + int copied = 0; - sock_owned_by_me(sk); - + tp = tcp_sk(sk); rcu_read_lock(); psock = sk_psock(sk); - tcp_bpf_reinit_sk_prot(sk, psock); + if (WARN_ON_ONCE(!psock)) { + desc->error = -EINVAL; + goto out; + } + + psock->ingress_bytes = 0; + copied = tcp_read_sock_noack(sk, desc, recv_actor, true, + &psock->copied_seq); + if (copied < 0) + goto out; + /* recv_actor may redirect skb to another socket (SK_REDIRECT) or + * just put skb into ingress queue of current socket (SK_PASS). + * For SK_REDIRECT, we need to ack the frame immediately but for + * SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser(). + */ + tp->copied_seq = psock->copied_seq - psock->ingress_bytes; + tcp_rcv_space_adjust(sk); + __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes); +out: rcu_read_unlock(); + return copied; } +#endif /* CONFIG_BPF_STREAM_PARSER */ -int tcp_bpf_init(struct sock *sk) +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { - struct proto *ops = READ_ONCE(sk->sk_prot); - struct sk_psock *psock; + int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; + int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; - sock_owned_by_me(sk); + if (psock->progs.stream_verdict || psock->progs.skb_verdict) { + config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX; + } - rcu_read_lock(); - psock = sk_psock(sk); - if (unlikely(!psock || psock->sk_proto || - tcp_bpf_assert_proto_ops(ops))) { - rcu_read_unlock(); - return -EINVAL; + if (restore) { + if (inet_csk_has_ulp(sk)) { + /* TLS does not have an unhash proto in SW cases, + * but we need to ensure we stop using the sock_map + * unhash routine because the associated psock is being + * removed. So use the original unhash handler. + */ + WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash); + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + } else { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + sock_replace_proto(sk, psock->sk_proto); + } + return 0; } - tcp_bpf_check_v6_needs_rebuild(sk, ops); - tcp_bpf_update_sk_prot(sk, psock); - rcu_read_unlock(); + + if (sk->sk_family == AF_INET6) { + if (tcp_bpf_assert_proto_ops(psock->sk_proto)) + return -EINVAL; + + tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); + } + + /* Pairs with lockless read in sk_clone_lock() */ + sock_replace_proto(sk, &tcp_bpf_prots[family][config]); return 0; } +EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); + +/* If a child got cloned from a listening socket that had tcp_bpf + * protocol callbacks installed, we need to restore the callbacks to + * the default ones because the child does not inherit the psock state + * that tcp_bpf callbacks expect. + */ +void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) +{ + struct proto *prot = newsk->sk_prot; + + if (is_insidevar(prot, tcp_bpf_prots)) + newsk->sk_prot = sk->sk_prot_creator; +} +#endif /* CONFIG_BPF_SYSCALL */ |
