From e993ffe3da4bcddea0536b03be1031bf35cd8d85 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 21 Oct 2022 11:16:39 +0100 Subject: net: flag sockets supporting msghdr originated zerocopy We need an efficient way in io_uring to check whether a socket supports zerocopy with msghdr provided ubuf_info. Add a new flag into the struct socket flags fields. Cc: # 6.0 Signed-off-by: Pavel Begunkov Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/3dafafab822b1c66308bb58a0ac738b1e3f53f74.1666346426.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- net/ipv4/tcp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f8232811a5be..ef14efa1fb70 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk) WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); } EXPORT_SYMBOL(tcp_init_sock); -- cgit From 29c1c44646aec5d5134f2365259a84becc1ee7d3 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:14 +0000 Subject: tcp: add u32 counter in tcp_sock and an SNMP counter for PLB A u32 counter is added to tcp_sock for counting the number of PLB triggered rehashes for a TCP connection. An SNMP counter is also added to count overall PLB triggered rehash events for a host. These counters are hooked up to PLB implementation for DCTCP. TCP_NLA_REHASH is added to SCM_TIMESTAMPING_OPT_STATS that reports the rehash attempts triggered due to PLB or timeouts. This gives a historical view of sustained congestion or timeouts experienced by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef14efa1fb70..1da7c53b6cb5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3176,6 +3176,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; + tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rack.mstamp = 0; @@ -3973,6 +3974,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ 0; } @@ -4049,6 +4051,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, nla_put_u8(stats, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb)); + nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); return stats; } -- cgit From 71fc704768f601ed3fa36310822a5e03f310f781 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:15 +0000 Subject: tcp: add rcv_wnd and plb_rehash to TCP_INFO rcv_wnd can be useful to diagnose TCP performance where receiver window becomes the bottleneck. rehash reports the PLB and timeout triggered rehash attempts by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1da7c53b6cb5..de8f0cd7cb32 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3940,6 +3940,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; + info->tcpi_rcv_wnd = tp->rcv_wnd; + info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } -- cgit From 0c175da7b0378445f5ef53904247cfbfb87e0b78 Mon Sep 17 00:00:00 2001 From: Lu Wei Date: Fri, 4 Nov 2022 10:27:23 +0800 Subject: tcp: prohibit TCP_REPAIR_OPTIONS if data was already sent If setsockopt with option name of TCP_REPAIR_OPTIONS and opt_code of TCPOPT_SACK_PERM is called to enable sack after data is sent and dupacks are received , it will trigger a warning in function tcp_verify_left_out() as follows: ============================================ WARNING: CPU: 8 PID: 0 at net/ipv4/tcp_input.c:2132 tcp_timeout_mark_lost+0x154/0x160 tcp_enter_loss+0x2b/0x290 tcp_retransmit_timer+0x50b/0x640 tcp_write_timer_handler+0x1c8/0x340 tcp_write_timer+0xe5/0x140 call_timer_fn+0x3a/0x1b0 __run_timers.part.0+0x1bf/0x2d0 run_timer_softirq+0x43/0xb0 __do_softirq+0xfd/0x373 __irq_exit_rcu+0xf6/0x140 The warning is caused in the following steps: 1. a socket named socketA is created 2. socketA enters repair mode without build a connection 3. socketA calls connect() and its state is changed to TCP_ESTABLISHED directly 4. socketA leaves repair mode 5. socketA calls sendmsg() to send data, packets_out and sack_outs(dup ack receives) increase 6. socketA enters repair mode again 7. socketA calls setsockopt with TCPOPT_SACK_PERM to enable sack 8. retransmit timer expires, it calls tcp_timeout_mark_lost(), lost_out increases 9. sack_outs + lost_out > packets_out triggers since lost_out and sack_outs increase repeatly In function tcp_timeout_mark_lost(), tp->sacked_out will be cleared if Step7 not happen and the warning will not be triggered. As suggested by Denis and Eric, TCP_REPAIR_OPTIONS should be prohibited if data was already sent. socket-tcp tests in CRIU has been tested as follows: $ sudo ./test/zdtm.py run -t zdtm/static/socket-tcp* --keep-going \ --ignore-taint socket-tcp* represent all socket-tcp tests in test/zdtm/static/. Fixes: b139ba4e90dc ("tcp: Repair connection-time negotiated parameters") Signed-off-by: Lu Wei Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef14efa1fb70..54836a6b81d6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3647,7 +3647,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_REPAIR_OPTIONS: if (!tp->repair) err = -EINVAL; - else if (sk->sk_state == TCP_ESTABLISHED) + else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; -- cgit From e0833d1fedb02f038b526ae7dde178a076f56545 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:14 -0800 Subject: dccp/tcp: Fixup bhash2 bucket when connect() fails. If a socket bound to a wildcard address fails to connect(), we only reset saddr and keep the port. Then, we have to fix up the bhash2 bucket; otherwise, the bucket has an inconsistent address in the list. Also, listen() for such a socket will fire the WARN_ON() in inet_csk_get_port(). [0] Note that when a system runs out of memory, we give up fixing the bucket and unlink sk from bhash and bhash2 by inet_put_port(). [0]: WARNING: CPU: 0 PID: 207 at net/ipv4/inet_connection_sock.c:548 inet_csk_get_port (net/ipv4/inet_connection_sock.c:548 (discriminator 1)) Modules linked in: CPU: 0 PID: 207 Comm: bhash2_prev_rep Not tainted 6.1.0-rc3-00799-gc8421681c845 #63 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.amzn2022.0.1 04/01/2014 RIP: 0010:inet_csk_get_port (net/ipv4/inet_connection_sock.c:548 (discriminator 1)) Code: 74 a7 eb 93 48 8b 54 24 18 0f b7 cb 4c 89 e6 4c 89 ff e8 48 b2 ff ff 49 8b 87 18 04 00 00 e9 32 ff ff ff 0f 0b e9 34 ff ff ff <0f> 0b e9 42 ff ff ff 41 8b 7f 50 41 8b 4f 54 89 fe 81 f6 00 00 ff RSP: 0018:ffffc900003d7e50 EFLAGS: 00010202 RAX: ffff8881047fb500 RBX: 0000000000004e20 RCX: 0000000000000000 RDX: 000000000000000a RSI: 00000000fffffe00 RDI: 00000000ffffffff RBP: ffffffff8324dc00 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000001 R14: 0000000000004e20 R15: ffff8881054e1280 FS: 00007f8ac04dc740(0000) GS:ffff88842fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020001540 CR3: 00000001055fa003 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: inet_csk_listen_start (net/ipv4/inet_connection_sock.c:1205) inet_listen (net/ipv4/af_inet.c:228) __sys_listen (net/socket.c:1810) __x64_sys_listen (net/socket.c:1819 net/socket.c:1817 net/socket.c:1817) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) RIP: 0033:0x7f8ac051de5d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48 RSP: 002b:00007ffc1c177248 EFLAGS: 00000206 ORIG_RAX: 0000000000000032 RAX: ffffffffffffffda RBX: 0000000020001550 RCX: 00007f8ac051de5d RDX: ffffffffffffff80 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 00007ffc1c177270 R08: 0000000000000018 R09: 0000000000000007 R10: 0000000020001540 R11: 0000000000000206 R12: 00007ffc1c177388 R13: 0000000000401169 R14: 0000000000403e18 R15: 00007f8ac0723000 Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Reported-by: syzbot Reported-by: Mat Martineau Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 54836a6b81d6..4f2205756cfe 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3114,8 +3114,7 @@ int tcp_disconnect(struct sock *sk, int flags) inet->inet_dport = 0; - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); -- cgit From de4eda9de2d957ef2d6a8365a01e26a435e958cb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Sep 2022 20:25:47 -0400 Subject: use less confusing names for iov_iter direction initializers READ/WRITE proved to be actively confusing - the meanings are "data destination, as used with read(2)" and "data source, as used with write(2)", but people keep interpreting those as "we read data from it" and "we write data to it", i.e. exactly the wrong way. Call them ITER_DEST and ITER_SOURCE - at least that is harder to misinterpret... Signed-off-by: Al Viro --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 54836a6b81d6..01cc5705b146 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2000,7 +2000,7 @@ static int receive_fallback_to_copy(struct sock *sk, if (copy_address != zc->copybuf_address) return -EINVAL; - err = import_single_range(READ, (void __user *)copy_address, + err = import_single_range(ITER_DEST, (void __user *)copy_address, inq, &iov, &msg.msg_iter); if (err) return err; @@ -2034,7 +2034,7 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, if (copy_address != zc->copybuf_address) return -EINVAL; - err = import_single_range(READ, (void __user *)copy_address, + err = import_single_range(ITER_DEST, (void __user *)copy_address, copylen, &iov, &msg.msg_iter); if (err) return err; -- cgit From 459837b522f7dff3b6681f534d8fff4eca19b7d1 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Nov 2022 17:38:57 +0000 Subject: net/tcp: Disable TCP-MD5 static key on tcp_md5sig_info destruction To do that, separate two scenarios: - where it's the first MD5 key on the system, which means that enabling of the static key may need to sleep; - copying of an existing key from a listening socket to the request socket upon receiving a signed TCP segment, where static key was already enabled (when the key was added to the listening socket). Now the life-time of the static branch for TCP-MD5 is until: - last tcp_md5sig_info is destroyed - last socket in time-wait state with MD5 key is closed. Which means that after all sockets with TCP-MD5 keys are gone, the system gets back the performance of disabled md5-key static branch. While at here, provide static_key_fast_inc() helper that does ref counter increment in atomic fashion (without grabbing cpus_read_lock() on CONFIG_JUMP_LABEL=y). This is needed to add a new user for a static_key when the caller controls the lifetime of another user. Signed-off-by: Dmitry Safonov Acked-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 24602a5184b0..001947136b0a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4464,11 +4464,8 @@ bool tcp_alloc_md5sig_pool(void) if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) { mutex_lock(&tcp_md5sig_mutex); - if (!tcp_md5sig_pool_populated) { + if (!tcp_md5sig_pool_populated) __tcp_alloc_md5sig_pool(); - if (tcp_md5sig_pool_populated) - static_branch_inc(&tcp_md5_needed); - } mutex_unlock(&tcp_md5sig_mutex); } -- cgit