From e9d9da91548b21e189fcd0259a0f2d26d1afc509 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Mar 2023 15:55:39 +0000 Subject: tcp: preserve const qualifier in tcp_sk() We can change tcp_sk() to propagate its argument const qualifier, thanks to container_of_const(). We have two places where a const sock pointer has to be upgraded to a write one. We have been using const qualifier for lockless listeners to clearly identify points where writes could happen. Add tcp_sk_rw() helper to better document these. tcp_inbound_md5_hash(), __tcp_grow_window(), tcp_reset_check() and tcp_rack_reo_wnd() get an additional const qualififer for their @tp local variables. smc_check_reset_syn_req() also needs a similar change. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index db9f828e9d1e..a0a91a988272 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -529,7 +529,7 @@ static inline void tcp_synq_overflow(const struct sock *sk) last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); if (!time_between32(now, last_overflow, last_overflow + HZ)) - WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now); + WRITE_ONCE(tcp_sk_rw(sk)->rx_opt.ts_recent_stamp, now); } /* syncookies: no recent synqueue overflow on this listening socket? */ -- cgit From 8fb1a76a0f35c45a424c9eb84b0f97ffd51e6052 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:23:59 -0700 Subject: net: Update an existing TCP congestion control algorithm. This feature lets you immediately transition to another congestion control algorithm or implementation with the same name. Once a name is updated, new connections will apply this new algorithm. The purpose is to update a customized algorithm implemented in BPF struct_ops with a new version on the flight. The following is an example of using the userspace API implemented in later BPF patches. link = bpf_map__attach_struct_ops(skel->maps.ca_update_1); ....... err = bpf_link__update_map(link, skel->maps.ca_update_2); We first load and register an algorithm implemented in BPF struct_ops, then swap it out with a new one using the same name. After that, newly created connections will apply the updated algorithm, while older ones retain the previous version already applied. This patch also takes this chance to refactor the ca validation into the new tcp_validate_congestion_control() function. Cc: netdev@vger.kernel.org, Eric Dumazet Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20230323032405.3735486-3-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/net/tcp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index db9f828e9d1e..2abb755e6a3a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1117,6 +1117,9 @@ struct tcp_congestion_ops { int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); +int tcp_update_congestion_control(struct tcp_congestion_ops *type, + struct tcp_congestion_ops *old_type); +int tcp_validate_congestion_control(struct tcp_congestion_ops *ca); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); -- cgit From e5c6de5fa025882babf89cecbed80acf49b987fa Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:12 -0700 Subject: bpf, sockmap: Incorrectly handling copied_seq The read_skb() logic is incrementing the tcp->copied_seq which is used for among other things calculating how many outstanding bytes can be read by the application. This results in application errors, if the application does an ioctl(FIONREAD) we return zero because this is calculated from the copied_seq value. To fix this we move tcp->copied_seq accounting into the recv handler so that we update these when the recvmsg() hook is called and data is in fact copied into user buffers. This gives an accurate FIONREAD value as expected and improves ACK handling. Before we were calling the tcp_rcv_space_adjust() which would update 'number of bytes copied to user in last RTT' which is wrong for programs returning SK_PASS. The bytes are only copied to the user when recvmsg is handled. Doing the fix for recvmsg is straightforward, but fixing redirect and SK_DROP pkts is a bit tricker. Build a tcp_psock_eat() helper and then call this from skmsg handlers. This fixes another issue where a broken socket with a BPF program doing a resubmit could hang the receiver. This happened because although read_skb() consumed the skb through sock_drop() it did not update the copied_seq. Now if a single reccv socket is redirecting to many sockets (for example for lb) the receiver sk will be hung even though we might expect it to continue. The hang comes from not updating the copied_seq numbers and memory pressure resulting from that. We have a slight layer problem of calling tcp_eat_skb even if its not a TCP socket. To fix we could refactor and create per type receiver handlers. I decided this is more work than we want in the fix and we already have some small tweaks depending on caller that use the helper skb_bpf_strparser(). So we extend that a bit and always set the strparser bit when it is in use and then we can gate the seq_copied updates on this. Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-9-john.fastabend@gmail.com --- include/net/tcp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 04a31643cda3..18a038d16434 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1470,6 +1470,8 @@ static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) } void tcp_cleanup_rbuf(struct sock *sk, int copied); +void __tcp_cleanup_rbuf(struct sock *sk, int copied); + /* We provision sk_rcvbuf around 200% of sk_rcvlowat. * If 87.5 % (7/8) of the space has been consumed, we want to override @@ -2326,6 +2328,14 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ +#ifdef CONFIG_INET +void tcp_eat_skb(struct sock *sk, struct sk_buff *skb); +#else +static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) +{ +} +#endif + int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, struct sk_msg *msg, u32 bytes, int flags); #endif /* CONFIG_NET_SOCK_MSG */ -- cgit From 30c6f0bf9579debce27e45fac34fdc97e46acacc Mon Sep 17 00:00:00 2001 From: fuyuanli Date: Wed, 31 May 2023 16:01:50 +0800 Subject: tcp: fix mishandling when the sack compression is deferred. In this patch, we mainly try to handle sending a compressed ack correctly if it's deferred. Here are more details in the old logic: When sack compression is triggered in the tcp_compressed_ack_kick(), if the sock is owned by user, it will set TCP_DELACK_TIMER_DEFERRED and then defer to the release cb phrase. Later once user releases the sock, tcp_delack_timer_handler() should send a ack as expected, which, however, cannot happen due to lack of ICSK_ACK_TIMER flag. Therefore, the receiver would not sent an ack until the sender's retransmission timeout. It definitely increases unnecessary latency. Fixes: 5d9f4262b7ea ("tcp: add SACK compression") Suggested-by: Eric Dumazet Signed-off-by: fuyuanli Signed-off-by: Jason Xing Link: https://lore.kernel.org/netdev/20230529113804.GA20300@didi-ThinkCentre-M920t-N000/ Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230531080150.GA20424@didi-ThinkCentre-M920t-N000 Signed-off-by: Paolo Abeni --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 18a038d16434..5066e4586cf0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -632,6 +632,7 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); +void tcp_sack_compress_send_ack(struct sock *sk); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); -- cgit