diff options
Diffstat (limited to 'net/ipv4/tcp.c')
| -rw-r--r-- | net/ipv4/tcp.c | 4602 |
1 files changed, 3371 insertions, 1231 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5423223e93c2..f035440c475a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -205,11 +206,6 @@ * Hirokazu Takahashi : Use copy_from_user() instead of * csum_and_copy_from_user() if possible. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or(at your option) any later version. - * * Description of States: * * TCP_SYN_SENT sent a connection request, waiting for ack @@ -247,59 +243,80 @@ #define pr_fmt(fmt) "TCP: " fmt +#include <crypto/md5.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> +#include <linux/inet_diag.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/skbuff.h> -#include <linux/scatterlist.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/random.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/highmem.h> -#include <linux/swap.h> #include <linux/cache.h> #include <linux/err.h> -#include <linux/crypto.h> #include <linux/time.h> #include <linux/slab.h> +#include <linux/errqueue.h> +#include <linux/static_key.h> +#include <linux/btf.h> #include <net/icmp.h> #include <net/inet_common.h> +#include <net/inet_ecn.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> +#include <net/mptcp.h> +#include <net/proto_memory.h> #include <net/xfrm.h> #include <net/ip.h> -#include <net/netdma.h> +#include <net/psp.h> #include <net/sock.h> +#include <net/rstreason.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/busy_poll.h> +#include <net/hotdata.h> +#include <trace/events/tcp.h> +#include <net/rps.h> + +#include "../core/devmem.h" + +/* Track pending CMSGs. */ +enum { + TCP_CMSG_INQ = 1, + TCP_CMSG_TS = 2 +}; -int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; +DEFINE_PER_CPU(unsigned int, tcp_orphan_count); +EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); -struct percpu_counter tcp_orphan_count; -EXPORT_SYMBOL_GPL(tcp_orphan_count); +DEFINE_PER_CPU(u32, tcp_tw_isn); +EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); -int sysctl_tcp_wmem[3] __read_mostly; -int sysctl_tcp_rmem[3] __read_mostly; +long sysctl_tcp_mem[3] __read_mostly; +EXPORT_IPV6_MOD(sysctl_tcp_mem); -EXPORT_SYMBOL(sysctl_tcp_rmem); -EXPORT_SYMBOL(sysctl_tcp_wmem); +DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); +EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc); -atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ -EXPORT_SYMBOL(tcp_memory_allocated); +#if IS_ENABLED(CONFIG_SMC) +DEFINE_STATIC_KEY_FALSE(tcp_have_smc); +EXPORT_SYMBOL(tcp_have_smc); +#endif /* * Current number of TCP sockets. */ -struct percpu_counter tcp_sockets_allocated; -EXPORT_SYMBOL(tcp_sockets_allocated); +struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; +EXPORT_IPV6_MOD(tcp_sockets_allocated); /* * TCP splice context @@ -316,17 +333,36 @@ struct tcp_splice_state { * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ -int tcp_memory_pressure __read_mostly; -EXPORT_SYMBOL(tcp_memory_pressure); +unsigned long tcp_memory_pressure __read_mostly; +EXPORT_SYMBOL_GPL(tcp_memory_pressure); void tcp_enter_memory_pressure(struct sock *sk) { - if (!tcp_memory_pressure) { + unsigned long val; + + if (READ_ONCE(tcp_memory_pressure)) + return; + val = jiffies; + + if (!val) + val--; + if (!cmpxchg(&tcp_memory_pressure, 0, val)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); - tcp_memory_pressure = 1; - } } -EXPORT_SYMBOL(tcp_enter_memory_pressure); +EXPORT_IPV6_MOD_GPL(tcp_enter_memory_pressure); + +void tcp_leave_memory_pressure(struct sock *sk) +{ + unsigned long val; + + if (!READ_ONCE(tcp_memory_pressure)) + return; + val = xchg(&tcp_memory_pressure, 0); + if (val) + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO, + jiffies_to_msecs(jiffies - val)); +} +EXPORT_IPV6_MOD_GPL(tcp_leave_memory_pressure); /* Convert seconds to retransmits based on initial and max timeout */ static u8 secs_to_retrans(int seconds, int timeout, int rto_max) @@ -365,6 +401,34 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max) return period; } +static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) +{ + u32 rate = READ_ONCE(tp->rate_delivered); + u32 intv = READ_ONCE(tp->rate_interval_us); + u64 rate64 = 0; + + if (rate && intv) { + rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; + do_div(rate64, intv); + } + return rate64; +} + +#ifdef CONFIG_TCP_MD5SIG +void tcp_md5_destruct_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->md5sig_info) { + + tcp_clear_md5_list(sk); + kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1)); + static_branch_slow_dec_deferred(&tcp_md5_needed); + } +} +EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock); +#endif + /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to @@ -374,21 +438,35 @@ void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + int rto_min_us, rto_max_ms; - skb_queue_head_init(&tp->out_of_order_queue); + tp->out_of_order_queue = RB_ROOT; + sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); - tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); + INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; - tp->mdev = TCP_TIMEOUT_INIT; + + rto_max_ms = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_max_ms); + icsk->icsk_rto_max = msecs_to_jiffies(rto_max_ms); + + rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us); + icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us); + icsk->icsk_delack_max = TCP_DELACK_MAX; + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ - tp->snd_cwnd = TCP_INIT_CWND; + tcp_snd_cwnd_set(tp, TCP_INIT_CWND); + + /* There's a bubble in the pipe until at least the first ACK. */ + tp->app_limited = ~0U; + tp->rate_app_limited = 1; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. @@ -397,32 +475,54 @@ void tcp_init_sock(struct sock *sk) tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; - tp->reordering = sysctl_tcp_reordering; - tcp_enable_early_retrans(tp); - icsk->icsk_ca_ops = &tcp_init_congestion_ops; + tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); + tcp_assign_congestion_control(sk); tp->tsoffset = 0; - - sk->sk_state = TCP_CLOSE; + tp->rack.reo_wnd_steps = 1; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); icsk->icsk_sync_mss = tcp_sync_mss; - /* Presumed zeroed, in order of appearance: - * cookie_in_always, cookie_out_never, - * s_data_constant, s_data_in, s_data_out - */ - sk->sk_sndbuf = sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + tcp_scaling_ratio_init(sk); - local_bh_disable(); - sock_update_memcg(sk); + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); - local_bh_enable(); + xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1); +} +EXPORT_IPV6_MOD(tcp_init_sock); + +static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) +{ + struct sk_buff *skb = tcp_write_queue_tail(sk); + u32 tsflags = sockc->tsflags; + + if (tsflags && skb) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + sock_tx_timestamp(sk, sockc, &shinfo->tx_flags); + if (tsflags & SOF_TIMESTAMPING_TX_ACK) + tcb->txstamp_ack |= TSTAMP_ACK_SK; + if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) + shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + } + + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && + SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb) + bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB); +} + +static bool tcp_stream_is_readable(struct sock *sk, int target) +{ + if (tcp_epollin_ready(sk, target)) + return true; + return sk_is_readable(sk); } -EXPORT_SYMBOL(tcp_init_sock); /* * Wait for a TCP event. @@ -431,16 +531,18 @@ EXPORT_SYMBOL(tcp_init_sock); * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ -unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) +__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { - unsigned int mask; + __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + u8 shutdown; + int state; - sock_rps_record_flow(sk); + sock_poll_wait(file, sock, wait); - sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == TCP_LISTEN) + state = inet_sk_state_load(sk); + if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events @@ -451,84 +553,92 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) mask = 0; /* - * POLLHUP is certainly not done right. But poll() doesn't + * EPOLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a * socket the read side is more interesting. * - * Some poll() documentation says that POLLHUP is incompatible - * with the POLLOUT/POLLWR flags, so somebody should check this + * Some poll() documentation says that EPOLLHUP is incompatible + * with the EPOLLOUT/POLLWR flags, so somebody should check this * all. But careful, it tends to be safer to return too many * bits than too few, and you can easily break real applications * if you don't tell them that something has hung up! * * Check-me. * - * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and + * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and * our fs/select.c). It means that after we received EOF, * poll always returns immediately, making impossible poll() on write() - * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP + * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP * if and only if shutdown has been made in both directions. * Actually, it is interesting to look how Solaris and DUX - * solve this dilemma. I would prefer, if POLLHUP were maskable, + * solve this dilemma. I would prefer, if EPOLLHUP were maskable, * then we could set it on SND_SHUTDOWN. BTW examples given * in Stevens' books assume exactly this behaviour, it explains - * why POLLHUP is incompatible with POLLOUT. --ANK + * why EPOLLHUP is incompatible with EPOLLOUT. --ANK * * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) - mask |= POLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLIN | POLLRDNORM | POLLRDHUP; + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) + mask |= EPOLLHUP; + if (shutdown & RCV_SHUTDOWN) + mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected or passive Fast Open socket? */ - if (sk->sk_state != TCP_SYN_SENT && - (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) { + if (state != TCP_SYN_SENT && + (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); + u16 urg_data = READ_ONCE(tp->urg_data); - if (tp->urg_seq == tp->copied_seq && - !sock_flag(sk, SOCK_URGINLINE) && - tp->urg_data) + if (unlikely(urg_data) && + READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && + !sock_flag(sk, SOCK_URGINLINE)) target++; - /* Potential race condition. If read of tp below will - * escape above sk->sk_state, we can be illegally awaken - * in SYN_* states. */ - if (tp->rcv_nxt - tp->copied_seq >= target) - mask |= POLLIN | POLLRDNORM; + if (tcp_stream_is_readable(sk, target)) + mask |= EPOLLIN | EPOLLRDNORM; - if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { - mask |= POLLOUT | POLLWRNORM; + if (!(shutdown & SEND_SHUTDOWN)) { + if (__sk_stream_is_writeable(sk, 1)) { + mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after * wspace test but before the flags are set, - * IO signal will be lost. + * IO signal will be lost. Memory barrier + * pairs with the input side. */ - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) - mask |= POLLOUT | POLLWRNORM; + smp_mb__after_atomic(); + if (__sk_stream_is_writeable(sk, 1)) + mask |= EPOLLOUT | EPOLLWRNORM; } } else - mask |= POLLOUT | POLLWRNORM; - - if (tp->urg_data & TCP_URG_VALID) - mask |= POLLPRI; + mask |= EPOLLOUT | EPOLLWRNORM; + + if (urg_data & TCP_URG_VALID) + mask |= EPOLLPRI; + } else if (state == TCP_SYN_SENT && + inet_test_bit(DEFER_CONNECT, sk)) { + /* Active TCP fastopen socket with defer_connect + * Return EPOLLOUT so application can call write() + * in order for kernel to generate SYN+data + */ + mask |= EPOLLOUT | EPOLLWRNORM; } - /* This barrier is coupled with smp_wmb() in tcp_reset() */ + /* This barrier is coupled with smp_wmb() in tcp_done_with_error() */ smp_rmb(); - if (sk->sk_err) - mask |= POLLERR; + if (READ_ONCE(sk->sk_err) || + !skb_queue_empty_lockless(&sk->sk_error_queue)) + mask |= EPOLLERR; return mask; } EXPORT_SYMBOL(tcp_poll); -int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +int tcp_ioctl(struct sock *sk, int cmd, int *karg) { struct tcp_sock *tp = tcp_sk(sk); int answ; @@ -540,24 +650,12 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return -EINVAL; slow = lock_sock_fast(sk); - if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) - answ = 0; - else if (sock_flag(sk, SOCK_URGINLINE) || - !tp->urg_data || - before(tp->urg_seq, tp->copied_seq) || - !before(tp->urg_seq, tp->rcv_nxt)) { - - answ = tp->rcv_nxt - tp->copied_seq; - - /* Subtract 1, if FIN was received */ - if (answ && sock_flag(sk, SOCK_DONE)) - answ--; - } else - answ = tp->urg_seq - tp->copied_seq; + answ = tcp_inq(sk); unlock_sock_fast(sk, slow); break; case SIOCATMARK: - answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + answ = READ_ONCE(tp->urg_data) && + READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) @@ -566,7 +664,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = tp->write_seq - tp->snd_una; + answ = READ_ONCE(tp->write_seq) - tp->snd_una; break; case SIOCOUTQNSD: if (sk->sk_state == TCP_LISTEN) @@ -575,17 +673,19 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = tp->write_seq - tp->snd_nxt; + answ = READ_ONCE(tp->write_seq) - + READ_ONCE(tp->snd_nxt); break; default: return -ENOIOCTLCMD; } - return put_user(answ, (int __user *)arg); + *karg = answ; + return 0; } -EXPORT_SYMBOL(tcp_ioctl); +EXPORT_IPV6_MOD(tcp_ioctl); -static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) +void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; tp->pushed_seq = tp->write_seq; @@ -596,21 +696,22 @@ static inline bool forced_push(const struct tcp_sock *tp) return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } -static inline void skb_entail(struct sock *sk, struct sk_buff *skb) +void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - skb->csum = 0; tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; - tcb->sacked = 0; - skb_header_release(skb); + __skb_header_release(skb); + psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; + + tcp_slow_start_after_idle_check(sk); } static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) @@ -619,19 +720,59 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) tp->snd_up = tp->write_seq; } -static inline void tcp_push(struct sock *sk, int flags, int mss_now, - int nonagle) +/* If a not yet filled skb is pushed, do not send it if + * we have data packets in Qdisc or NIC queues : + * Because TX completion will happen shortly, it gives a chance + * to coalesce future sendmsg() payload into this skb, without + * need for a timer, and with no latency trade off. + * As packets containing data payload have a bigger truesize + * than pure acks (dataless) packets, the last checks prevent + * autocorking if we only have an ACK in Qdisc/NIC queues, + * or if TX completion was delayed after we processed ACK packet. + */ +static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, + int size_goal) { - if (tcp_send_head(sk)) { - struct tcp_sock *tp = tcp_sk(sk); + return skb->len < size_goal && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) && + !tcp_rtx_queue_empty(sk) && + refcount_read(&sk->sk_wmem_alloc) > skb->truesize && + tcp_skb_can_collapse_to(skb); +} + +void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle, int size_goal) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + skb = tcp_write_queue_tail(sk); + if (!skb) + return; + if (!(flags & MSG_MORE) || forced_push(tp)) + tcp_mark_push(tp, skb); - if (!(flags & MSG_MORE) || forced_push(tp)) - tcp_mark_push(tp, tcp_write_queue_tail(sk)); + tcp_mark_urg(tp, flags); - tcp_mark_urg(tp, flags); - __tcp_push_pending_frames(sk, mss_now, - (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); + if (tcp_should_autocork(sk, skb, size_goal)) { + + /* avoid atomic op if TSQ_THROTTLED bit is already set */ + if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); + smp_mb__after_atomic(); + } + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED. + */ + if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize) + return; } + + if (flags & MSG_MORE) + nonagle = TCP_NAGLE_CORK; + + __tcp_push_pending_frames(sk, mss_now, nonagle); } static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, @@ -640,8 +781,8 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, struct tcp_splice_state *tss = rd_desc->arg.data; int ret; - ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), - tss->flags); + ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, + min(rd_desc->count, len), tss->flags); if (ret > 0) rd_desc->count -= ret; return ret; @@ -716,15 +857,22 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, * This occurs when user tries to read * from never connected socket. */ - if (!sock_flag(sk, SOCK_DONE)) - ret = -ENOTCONN; + ret = -ENOTCONN; break; } if (!timeo) { ret = -EAGAIN; break; } - sk_wait_data(sk, &timeo); + /* if __tcp_splice_read() got nothing while we have + * an skb in receive queue, we do not want to loop. + * This might happen with URG data. + */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + break; + ret = sk_wait_data(sk, &timeo, NULL); + if (ret < 0) + break; if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; @@ -734,7 +882,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, tss.len -= ret; spliced += ret; - if (!timeo) + if (!tss.len || !timeo) break; release_sock(sk); lock_sock(sk); @@ -752,29 +900,34 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, return ret; } -EXPORT_SYMBOL(tcp_splice_read); +EXPORT_IPV6_MOD(tcp_splice_read); -struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) +struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, + bool force_schedule) { struct sk_buff *skb; - /* The TCP header must be at least 32-bit aligned. */ - size = ALIGN(size, 4); + skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); + if (likely(skb)) { + bool mem_scheduled; - skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); - if (skb) { - if (sk_wmem_schedule(sk, skb->truesize)) { - skb_reserve(skb, sk->sk_prot->max_header); - /* - * Make sure that we have exactly size bytes - * available to the caller, no more, no less. - */ - skb->reserved_tailroom = skb->end - skb->tail - size; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + if (force_schedule) { + mem_scheduled = true; + sk_forced_mem_schedule(sk, skb->truesize); + } else { + mem_scheduled = sk_wmem_schedule(sk, skb->truesize); + } + if (likely(mem_scheduled)) { + skb_reserve(skb, MAX_TCP_HEADER); + skb->ip_summed = CHECKSUM_PARTIAL; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); } else { - sk->sk_prot->enter_memory_pressure(sk); + if (!sk->sk_bypass_prot_mem) + tcp_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; @@ -784,40 +937,27 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); - u32 xmit_size_goal, old_size_goal; - - xmit_size_goal = mss_now; - - if (large_allowed && sk_can_gso(sk)) { - xmit_size_goal = ((sk->sk_gso_max_size - 1) - - inet_csk(sk)->icsk_af_ops->net_header_len - - inet_csk(sk)->icsk_ext_hdr_len - - tp->tcp_header_len); + u32 new_size_goal, size_goal; - /* TSQ : try to have two TSO segments in flight */ - xmit_size_goal = min_t(u32, xmit_size_goal, - sysctl_tcp_limit_output_bytes >> 1); + if (!large_allowed) + return mss_now; - xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); + /* Note : tcp_tso_autosize() will eventually split this later */ + new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size); - /* We try hard to avoid divides here */ - old_size_goal = tp->xmit_size_goal_segs * mss_now; - - if (likely(old_size_goal <= xmit_size_goal && - old_size_goal + mss_now > xmit_size_goal)) { - xmit_size_goal = old_size_goal; - } else { - tp->xmit_size_goal_segs = - min_t(u16, xmit_size_goal / mss_now, - sk->sk_gso_max_segs); - xmit_size_goal = tp->xmit_size_goal_segs * mss_now; - } + /* We try hard to avoid divides here */ + size_goal = tp->gso_segs * mss_now; + if (unlikely(new_size_goal < size_goal || + new_size_goal >= size_goal + mss_now)) { + tp->gso_segs = min_t(u16, new_size_goal / mss_now, + sk->sk_gso_max_segs); + size_goal = tp->gso_segs * mss_now; } - return max(xmit_size_goal, mss_now); + return max(size_goal, mss_now); } -static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; @@ -827,234 +967,204 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) return mss_now; } -static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, - size_t size, int flags) +/* In some cases, sendmsg() could have added an skb to the write queue, + * but failed adding payload on it. We need to remove it to consume less + * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger + * epoll() users. Another reason is that tcp_write_xmit() does not like + * finding an empty skb in the write queue. + */ +void tcp_remove_empty_skb(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - int mss_now, size_goal; - int err; - ssize_t copied; - long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + struct sk_buff *skb = tcp_write_queue_tail(sk); - /* Wait for a connection to finish. One exception is TCP Fast Open - * (passive side) where data is allowed to be sent before a connection - * is fully established. - */ - if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && - !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) - goto out_err; - } - - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - - mss_now = tcp_send_mss(sk, &size_goal, flags); - copied = 0; - - err = -EPIPE; - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto out_err; - - while (size > 0) { - struct sk_buff *skb = tcp_write_queue_tail(sk); - int copy, i; - bool can_coalesce; - - if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { -new_segment: - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; - - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); - if (!skb) - goto wait_for_memory; - - skb_entail(sk, skb); - copy = size_goal; - } - - if (copy > size) - copy = size; - - i = skb_shinfo(skb)->nr_frags; - can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= MAX_SKB_FRAGS) { - tcp_mark_push(tp, skb); - goto new_segment; - } - if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; - - if (can_coalesce) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); - } else { - get_page(page); - skb_fill_page_desc(skb, i, page, offset, copy); - } - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; - - skb->len += copy; - skb->data_len += copy; - skb->truesize += copy; - sk->sk_wmem_queued += copy; - sk_mem_charge(sk, copy); - skb->ip_summed = CHECKSUM_PARTIAL; - tp->write_seq += copy; - TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; - - if (!copied) - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; - - copied += copy; - offset += copy; - if (!(size -= copy)) - goto out; - - if (skb->len < size_goal || (flags & MSG_OOB)) - continue; - - if (forced_push(tp)) { - tcp_mark_push(tp, skb); - __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); - } else if (skb == tcp_send_head(sk)) - tcp_push_one(sk, mss_now); - continue; - -wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); - - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) - goto do_error; - - mss_now = tcp_send_mss(sk, &size_goal, flags); + if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { + tcp_unlink_write_queue(skb, sk); + if (tcp_write_queue_empty(sk)) + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + tcp_wmem_free_skb(sk, skb); } - -out: - if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) - tcp_push(sk, flags, mss_now, tp->nonagle); - return copied; - -do_error: - if (copied) - goto out; -out_err: - return sk_stream_error(sk, flags, err); } -int tcp_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) +/* skb changing from pure zc to mixed, must charge zc */ +static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb) { - ssize_t res; + if (unlikely(skb_zcopy_pure(skb))) { + u32 extra = skb->truesize - + SKB_TRUESIZE(skb_end_offset(skb)); - if (!(sk->sk_route_caps & NETIF_F_SG) || - !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) - return sock_no_sendpage(sk->sk_socket, page, offset, size, - flags); + if (!sk_wmem_schedule(sk, extra)) + return -ENOMEM; - lock_sock(sk); - res = do_tcp_sendpages(sk, page, offset, size, flags); - release_sock(sk); - return res; + sk_mem_charge(sk, extra); + skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; + } + return 0; } -EXPORT_SYMBOL(tcp_sendpage); -static inline int select_size(const struct sock *sk, bool sg) -{ - const struct tcp_sock *tp = tcp_sk(sk); - int tmp = tp->mss_cache; - if (sg) { - if (sk_can_gso(sk)) { - /* Small frames wont use a full page: - * Payload will immediately follow tcp header. - */ - tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); - } else { - int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); +int tcp_wmem_schedule(struct sock *sk, int copy) +{ + int left; - if (tmp >= pgbreak && - tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) - tmp = pgbreak; - } - } + if (likely(sk_wmem_schedule(sk, copy))) + return copy; - return tmp; + /* We could be in trouble if we have nothing queued. + * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0] + * to guarantee some progress. + */ + left = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]) - sk->sk_wmem_queued; + if (left > 0) + sk_forced_mem_schedule(sk, min(left, copy)); + return min(copy, sk->sk_forward_alloc); } void tcp_free_fastopen_req(struct tcp_sock *tp) { - if (tp->fastopen_req != NULL) { + if (tp->fastopen_req) { kfree(tp->fastopen_req); tp->fastopen_req = NULL; } } -static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size) +int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, + size_t size, struct ubuf_info *uarg) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) + if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & + TFO_CLIENT_ENABLE) || + (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && + uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; - if (tp->fastopen_req != NULL) + if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), sk->sk_allocation); - if (unlikely(tp->fastopen_req == NULL)) + if (unlikely(!tp->fastopen_req)) return -ENOBUFS; tp->fastopen_req->data = msg; + tp->fastopen_req->size = size; + tp->fastopen_req->uarg = uarg; + if (inet_test_bit(DEFER_CONNECT, sk)) { + err = tcp_connect(sk); + /* Same failure procedure as in tcp_v4/6_connect */ + if (err) { + tcp_set_state(sk, TCP_CLOSE); + inet->inet_dport = 0; + sk->sk_route_caps = 0; + } + } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; - err = __inet_stream_connect(sk->sk_socket, msg->msg_name, - msg->msg_namelen, flags); - *size = tp->fastopen_req->copied; - tcp_free_fastopen_req(tp); + err = __inet_stream_connect(sk->sk_socket, (struct sockaddr_unsized *)uaddr, + msg->msg_namelen, flags, 1); + /* fastopen_req could already be freed in __inet_stream_connect + * if the connection times out or gets rst + */ + if (tp->fastopen_req) { + *copied = tp->fastopen_req->copied; + tcp_free_fastopen_req(tp); + inet_clear_bit(DEFER_CONNECT, sk); + } return err; } -int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t size) +int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { - struct iovec *iov; + struct net_devmem_dmabuf_binding *binding = NULL; struct tcp_sock *tp = tcp_sk(sk); + struct ubuf_info *uarg = NULL; struct sk_buff *skb; - int iovlen, flags, err, copied = 0; - int mss_now = 0, size_goal, copied_syn = 0, offset = 0; - bool sg; + struct sockcm_cookie sockc; + int flags, err, copied = 0; + int mss_now = 0, size_goal, copied_syn = 0; + int process_backlog = 0; + int sockc_err = 0; + int zc = 0; long timeo; - lock_sock(sk); - flags = msg->msg_flags; - if (flags & MSG_FASTOPEN) { - err = tcp_sendmsg_fastopen(sk, msg, &copied_syn); + + sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) }; + if (msg->msg_controllen) { + sockc_err = sock_cmsg_send(sk, msg, &sockc); + /* Don't return error until MSG_FASTOPEN has been processed; + * that may succeed even if the cmsg is invalid. + */ + } + + if ((flags & MSG_ZEROCOPY) && size) { + if (msg->msg_ubuf) { + uarg = msg->msg_ubuf; + if (sk->sk_route_caps & NETIF_F_SG) + zc = MSG_ZEROCOPY; + } else if (sock_flag(sk, SOCK_ZEROCOPY)) { + skb = tcp_write_queue_tail(sk); + uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb), + !sockc_err && sockc.dmabuf_id); + if (!uarg) { + err = -ENOBUFS; + goto out_err; + } + if (sk->sk_route_caps & NETIF_F_SG) + zc = MSG_ZEROCOPY; + else + uarg_to_msgzc(uarg)->zerocopy = 0; + + if (!sockc_err && sockc.dmabuf_id) { + binding = net_devmem_get_binding(sk, sockc.dmabuf_id); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + binding = NULL; + goto out_err; + } + } + } + } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { + if (sk->sk_route_caps & NETIF_F_SG) + zc = MSG_SPLICE_PAGES; + } + + if (!sockc_err && sockc.dmabuf_id && + (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) { + err = -EINVAL; + goto out_err; + } + + if (unlikely(flags & MSG_FASTOPEN || + inet_test_bit(DEFER_CONNECT, sk)) && + !tp->repair) { + err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); if (err == -EINPROGRESS && copied_syn > 0) goto out; else if (err) goto out_err; - offset = copied_syn; } timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ + /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto do_error; } if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); - goto out; + goto out_nopush; } err = -EINVAL; @@ -1064,185 +1174,264 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* 'common' sending to sendq */ } - /* This should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + if (sockc_err) { + err = sockc_err; + goto out_err; + } - mss_now = tcp_send_mss(sk, &size_goal, flags); + /* This should be in poll */ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Ok commence sending. */ - iovlen = msg->msg_iovlen; - iov = msg->msg_iov; copied = 0; +restart: + mss_now = tcp_send_mss(sk, &size_goal, flags); + err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto out_err; + goto do_error; - sg = !!(sk->sk_route_caps & NETIF_F_SG); + while (msg_data_left(msg)) { + int copy = 0; - while (--iovlen >= 0) { - size_t seglen = iov->iov_len; - unsigned char __user *from = iov->iov_base; + skb = tcp_write_queue_tail(sk); + if (skb) + copy = size_goal - skb->len; - iov++; - if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */ - if (offset >= seglen) { - offset -= seglen; - continue; - } - seglen -= offset; - from += offset; - offset = 0; - } + trace_tcp_sendmsg_locked(sk, msg, skb, size_goal); - while (seglen > 0) { - int copy = 0; - int max = size_goal; + if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { + bool first_skb; - skb = tcp_write_queue_tail(sk); - if (tcp_send_head(sk)) { - if (skb->ip_summed == CHECKSUM_NONE) - max = mss_now; - copy = max - skb->len; +new_segment: + if (!sk_stream_memory_free(sk)) + goto wait_for_space; + + if (unlikely(process_backlog >= 16)) { + process_backlog = 0; + if (sk_flush_backlog(sk)) + goto restart; } + first_skb = tcp_rtx_and_write_queues_empty(sk); + skb = tcp_stream_alloc_skb(sk, sk->sk_allocation, + first_skb); + if (!skb) + goto wait_for_space; - if (copy <= 0) { -new_segment: - /* Allocate new segment. If the interface is SG, - * allocate skb fitting to single page. - */ - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; + process_backlog++; - skb = sk_stream_alloc_skb(sk, - select_size(sk, sg), - sk->sk_allocation); - if (!skb) - goto wait_for_memory; +#ifdef CONFIG_SKB_DECRYPTED + skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); +#endif + tcp_skb_entail(sk, skb); + copy = size_goal; - /* - * Check whether we can use HW checksum. - */ - if (sk->sk_route_caps & NETIF_F_ALL_CSUM) - skb->ip_summed = CHECKSUM_PARTIAL; + /* All packets are restored as if they have + * already been sent. skb_mstamp_ns isn't set to + * avoid wrong rtt estimation. + */ + if (tp->repair) + TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; + } - skb_entail(sk, skb); - copy = size_goal; - max = size_goal; - } + /* Try to append data to the end of skb. */ + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); - /* Try to append data to the end of skb. */ - if (copy > seglen) - copy = seglen; + if (zc == 0) { + bool merge = true; + int i = skb_shinfo(skb)->nr_frags; + struct page_frag *pfrag = sk_page_frag(sk); - /* Where to copy to? */ - if (skb_availroom(skb) > 0) { - /* We have some space in skb head. Superb! */ - copy = min_t(int, copy, skb_availroom(skb)); - err = skb_add_data_nocache(sk, skb, from, copy); - if (err) - goto do_fault; - } else { - bool merge = true; - int i = skb_shinfo(skb)->nr_frags; - struct page_frag *pfrag = sk_page_frag(sk); - - if (!sk_page_frag_refill(sk, pfrag)) - goto wait_for_memory; - - if (!skb_can_coalesce(skb, i, pfrag->page, - pfrag->offset)) { - if (i == MAX_SKB_FRAGS || !sg) { - tcp_mark_push(tp, skb); - goto new_segment; - } - merge = false; + if (!sk_page_frag_refill(sk, pfrag)) + goto wait_for_space; + + if (!skb_can_coalesce(skb, i, pfrag->page, + pfrag->offset)) { + if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { + tcp_mark_push(tp, skb); + goto new_segment; } + merge = false; + } - copy = min_t(int, copy, pfrag->size - pfrag->offset); + copy = min_t(int, copy, pfrag->size - pfrag->offset); - if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; + if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) { + if (tcp_downgrade_zcopy_pure(sk, skb)) + goto wait_for_space; + skb_zcopy_downgrade_managed(skb); + } - err = skb_copy_to_page_nocache(sk, from, skb, - pfrag->page, - pfrag->offset, - copy); - if (err) - goto do_error; + copy = tcp_wmem_schedule(sk, copy); + if (!copy) + goto wait_for_space; - /* Update the skb. */ - if (merge) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); - } else { - skb_fill_page_desc(skb, i, pfrag->page, - pfrag->offset, copy); - get_page(pfrag->page); + err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, + pfrag->page, + pfrag->offset, + copy); + if (err) + goto do_error; + + /* Update the skb. */ + if (merge) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, copy); + page_ref_inc(pfrag->page); + } + pfrag->offset += copy; + } else if (zc == MSG_ZEROCOPY) { + /* First append to a fragless skb builds initial + * pure zerocopy skb + */ + if (!skb->len) + skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; + + if (!skb_zcopy_pure(skb)) { + copy = tcp_wmem_schedule(sk, copy); + if (!copy) + goto wait_for_space; + } + + err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg, + binding); + if (err == -EMSGSIZE || err == -EEXIST) { + tcp_mark_push(tp, skb); + goto new_segment; + } + if (err < 0) + goto do_error; + copy = err; + } else if (zc == MSG_SPLICE_PAGES) { + /* Splice in data if we can; copy if we can't. */ + if (tcp_downgrade_zcopy_pure(sk, skb)) + goto wait_for_space; + copy = tcp_wmem_schedule(sk, copy); + if (!copy) + goto wait_for_space; + + err = skb_splice_from_iter(skb, &msg->msg_iter, copy); + if (err < 0) { + if (err == -EMSGSIZE) { + tcp_mark_push(tp, skb); + goto new_segment; } - pfrag->offset += copy; + goto do_error; } + copy = err; - if (!copied) - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; - tp->write_seq += copy; - TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + sk_wmem_queued_add(sk, copy); + sk_mem_charge(sk, copy); + } - from += copy; - copied += copy; - if ((seglen -= copy) == 0 && iovlen == 0) - goto out; + if (!copied) + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; - if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) - continue; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); + TCP_SKB_CB(skb)->end_seq += copy; + tcp_skb_pcount_set(skb, 0); - if (forced_push(tp)) { - tcp_mark_push(tp, skb); - __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); - } else if (skb == tcp_send_head(sk)) - tcp_push_one(sk, mss_now); + copied += copy; + if (!msg_data_left(msg)) { + if (unlikely(flags & MSG_EOR)) + TCP_SKB_CB(skb)->eor = 1; + goto out; + } + + if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair)) continue; -wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: - if (copied) - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + if (forced_push(tp)) { + tcp_mark_push(tp, skb); + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); + } else if (skb == tcp_send_head(sk)) + tcp_push_one(sk, mss_now); + continue; - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) - goto do_error; +wait_for_space: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + tcp_remove_empty_skb(sk); + if (copied) + tcp_push(sk, flags & ~MSG_MORE, mss_now, + TCP_NAGLE_PUSH, size_goal); - mss_now = tcp_send_mss(sk, &size_goal, flags); - } + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) + goto do_error; + + mss_now = tcp_send_mss(sk, &size_goal, flags); } out: - if (copied) - tcp_push(sk, flags, mss_now, tp->nonagle); - release_sock(sk); - return copied + copied_syn; - -do_fault: - if (!skb->len) { - tcp_unlink_write_queue(skb, sk); - /* It is the one place in all of TCP, except connection - * reset, where we can be unlinking the send_head. - */ - tcp_check_send_head(sk, skb); - sk_wmem_free_skb(sk, skb); + if (copied) { + tcp_tx_timestamp(sk, &sockc); + tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } +out_nopush: + /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ + if (uarg && !msg->msg_ubuf) + net_zcopy_put(uarg); + if (binding) + net_devmem_dmabuf_binding_put(binding); + return copied + copied_syn; do_error: + tcp_remove_empty_skb(sk); + if (copied + copied_syn) goto out; out_err: + /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ + if (uarg && !msg->msg_ubuf) + net_zcopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); - release_sock(sk); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { + sk->sk_write_space(sk); + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); + } + if (binding) + net_devmem_dmabuf_binding_put(binding); + return err; } +EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); + +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + int ret; + + lock_sock(sk); + ret = tcp_sendmsg_locked(sk, msg, size); + release_sock(sk); + + return ret; +} EXPORT_SYMBOL(tcp_sendmsg); +void tcp_splice_eof(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct tcp_sock *tp = tcp_sk(sk); + int mss_now, size_goal; + + if (!tcp_write_queue_tail(sk)) + return; + + lock_sock(sk); + mss_now = tcp_send_mss(sk, &size_goal, 0); + tcp_push(sk, 0, mss_now, tp->nonagle, size_goal); + release_sock(sk); +} +EXPORT_IPV6_MOD_GPL(tcp_splice_eof); + /* * Handle reading urgent data. BSD has very simple semantics for * this, no blocking and very strange errors 8) @@ -1265,14 +1454,14 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) char c = tp->urg_data; if (!(flags & MSG_PEEK)) - tp->urg_data = TCP_URG_READ; + WRITE_ONCE(tp->urg_data, TCP_URG_READ); /* Read urgent data. */ msg->msg_flags |= MSG_OOB; if (len > 0) { if (!(flags & MSG_TRUNC)) - err = memcpy_toiovec(msg->msg_iov, &c, 1); + err = memcpy_to_msg(msg, &c, 1); len = 1; } else msg->msg_flags |= MSG_TRUNC; @@ -1297,10 +1486,15 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) struct sk_buff *skb; int copied = 0, err = 0; - /* XXX -- need to support SO_PEEK_OFF */ + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); + if (err) + return err; + copied += skb->len; + } skb_queue_walk(&sk->sk_write_queue, skb) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; @@ -1316,23 +1510,15 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. */ -void tcp_cleanup_rbuf(struct sock *sk, int copied) +void __tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - - WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), - "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", - tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); - if (inet_csk_ack_scheduled(sk)) { const struct inet_connection_sock *icsk = inet_csk(sk); - /* Delayed ACKs frequently hit locked sockets during bulk - * receive. */ - if (icsk->icsk_ack.blocked || - /* Once-per-two-segments ACK was not sent by tcp_input.c */ + + if (/* Once-per-two-segments ACK was not sent by tcp_input.c */ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || /* * If this read emptied read buffer, we send ACK, if @@ -1343,7 +1529,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) (copied > 0 && ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && - !icsk->icsk_ack.pingpong)) && + !inet_csk_in_pingpong_mode(sk))) && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = true; } @@ -1370,71 +1556,47 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) time_to_ack = true; } } - if (time_to_ack) + if (time_to_ack) { + tcp_mstamp_refresh(tp); tcp_send_ack(sk); + } } -static void tcp_prequeue_process(struct sock *sk) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { - struct sk_buff *skb; + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tcp_sock *tp = tcp_sk(sk); - NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); - - /* RX process wants to run with disabled BHs, though it is not - * necessary */ - local_bh_disable(); - while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb); - local_bh_enable(); - - /* Clear memory counter. */ - tp->ucopy.memory = 0; + WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), + "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", + tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); + __tcp_cleanup_rbuf(sk, copied); } -#ifdef CONFIG_NET_DMA -static void tcp_service_net_dma(struct sock *sk, bool wait) +static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { - dma_cookie_t done, used; - dma_cookie_t last_issued; - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->ucopy.dma_chan) - return; - - last_issued = tp->ucopy.dma_cookie; - dma_async_issue_pending(tp->ucopy.dma_chan); - - do { - if (dma_async_is_tx_complete(tp->ucopy.dma_chan, - last_issued, &done, - &used) == DMA_SUCCESS) { - /* Safe to free early-copied skbs now */ - __skb_queue_purge(&sk->sk_async_wait_queue); - break; - } else { - struct sk_buff *skb; - while ((skb = skb_peek(&sk->sk_async_wait_queue)) && - (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_SUCCESS)) { - __skb_dequeue(&sk->sk_async_wait_queue); - kfree_skb(skb); - } - } - } while (wait); + __skb_unlink(skb, &sk->sk_receive_queue); + if (likely(skb->destructor == sock_rfree)) { + sock_rfree(skb); + skb->destructor = NULL; + skb->sk = NULL; + return skb_attempt_defer_free(skb); + } + __kfree_skb(skb); } -#endif -static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) +struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; u32 offset; while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; - if (tcp_hdr(skb)->syn) + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; - if (offset < skb->len || tcp_hdr(skb)->fin) { + } + if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; } @@ -1442,10 +1604,11 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ - sk_eat_skb(sk, skb, false); + tcp_eat_recv_skb(sk, skb); } return NULL; } +EXPORT_SYMBOL(tcp_recv_skb); /* * This routine provides an alternative to tcp_recvmsg() for routines @@ -1458,12 +1621,13 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) * or for 'peeking' the socket using this routine * (although both would be easy to implement). */ -int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq) { struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - u32 seq = tp->copied_seq; + u32 seq = *copied_seq; u32 offset; int copied = 0; @@ -1476,7 +1640,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, len = skb->len - offset; /* Stop reading if we hit a patch of urgent data */ - if (tp->urg_data) { + if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - seq; if (urg_offset < len) len = urg_offset; @@ -1488,11 +1652,13 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (!copied) copied = used; break; - } else if (used <= len) { - seq += used; - copied += used; - offset += used; } + if (WARN_ON_ONCE(used > len)) + used = len; + seq += used; + copied += used; + offset += used; + /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it @@ -1507,17 +1673,20 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (offset + 1 != skb->len) continue; } - if (tcp_hdr(skb)->fin) { - sk_eat_skb(sk, skb, false); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + tcp_eat_recv_skb(sk, skb); ++seq; break; } - sk_eat_skb(sk, skb, false); + tcp_eat_recv_skb(sk, skb); if (!desc->count) break; - tp->copied_seq = seq; + WRITE_ONCE(*copied_seq, seq); } - tp->copied_seq = seq; + WRITE_ONCE(*copied_seq, seq); + + if (noack) + goto out; tcp_rcv_space_adjust(sk); @@ -1526,10 +1695,934 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); } +out: return copied; } + +int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + return __tcp_read_sock(sk, desc, recv_actor, false, + &tcp_sk(sk)->copied_seq); +} EXPORT_SYMBOL(tcp_read_sock); +int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq) +{ + return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq); +} + +int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) +{ + struct sk_buff *skb; + int copied = 0; + + if (sk->sk_state == TCP_LISTEN) + return -ENOTCONN; + + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { + u8 tcp_flags; + int used; + + __skb_unlink(skb, &sk->sk_receive_queue); + WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); + tcp_flags = TCP_SKB_CB(skb)->tcp_flags; + used = recv_actor(sk, skb); + if (used < 0) { + if (!copied) + copied = used; + break; + } + copied += used; + + if (tcp_flags & TCPHDR_FIN) + break; + } + return copied; +} +EXPORT_IPV6_MOD(tcp_read_skb); + +void tcp_read_done(struct sock *sk, size_t len) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 seq = tp->copied_seq; + struct sk_buff *skb; + size_t left; + u32 offset; + + if (sk->sk_state == TCP_LISTEN) + return; + + left = len; + while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + int used; + + used = min_t(size_t, skb->len - offset, left); + seq += used; + left -= used; + + if (skb->len > offset + used) + break; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + tcp_eat_recv_skb(sk, skb); + ++seq; + break; + } + tcp_eat_recv_skb(sk, skb); + } + WRITE_ONCE(tp->copied_seq, seq); + + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + if (left != len) + tcp_cleanup_rbuf(sk, len - left); +} +EXPORT_SYMBOL(tcp_read_done); + +int tcp_peek_len(struct socket *sock) +{ + return tcp_inq(sock->sk); +} +EXPORT_IPV6_MOD(tcp_peek_len); + +/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ +int tcp_set_rcvlowat(struct sock *sk, int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + int space, cap; + + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + cap = sk->sk_rcvbuf >> 1; + else + cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; + val = min(val, cap); + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); + + /* Check if we need to signal EPOLLIN right now */ + tcp_data_ready(sk); + + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + return 0; + + space = tcp_space_from_win(sk, val); + if (space > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, space); + + if (tp->window_clamp && tp->window_clamp < val) + WRITE_ONCE(tp->window_clamp, val); + } + return 0; +} +EXPORT_IPV6_MOD(tcp_set_rcvlowat); + +void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) +{ + if (skb->tstamp) + tss->ts[0] = ktime_to_timespec64(skb->tstamp); + else + tss->ts[0] = (struct timespec64) {0}; + + if (skb_hwtstamps(skb)->hwtstamp) + tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); + else + tss->ts[2] = (struct timespec64) {0}; +} + +#ifdef CONFIG_MMU +static const struct vm_operations_struct tcp_vm_ops = { +}; + +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC); + + /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ + vm_flags_set(vma, VM_MIXEDMAP); + + vma->vm_ops = &tcp_vm_ops; + return 0; +} +EXPORT_IPV6_MOD(tcp_mmap); + +static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, + u32 *offset_frag) +{ + skb_frag_t *frag; + + if (unlikely(offset_skb >= skb->len)) + return NULL; + + offset_skb -= skb_headlen(skb); + if ((int)offset_skb < 0 || skb_has_frag_list(skb)) + return NULL; + + frag = skb_shinfo(skb)->frags; + while (offset_skb) { + if (skb_frag_size(frag) > offset_skb) { + *offset_frag = offset_skb; + return frag; + } + offset_skb -= skb_frag_size(frag); + ++frag; + } + *offset_frag = 0; + return frag; +} + +static bool can_map_frag(const skb_frag_t *frag) +{ + struct page *page; + + if (skb_frag_size(frag) != PAGE_SIZE || skb_frag_off(frag)) + return false; + + page = skb_frag_page(frag); + + if (PageCompound(page) || page->mapping) + return false; + + return true; +} + +static int find_next_mappable_frag(const skb_frag_t *frag, + int remaining_in_skb) +{ + int offset = 0; + + if (likely(can_map_frag(frag))) + return 0; + + while (offset < remaining_in_skb && !can_map_frag(frag)) { + offset += skb_frag_size(frag); + ++frag; + } + return offset; +} + +static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, + struct tcp_zerocopy_receive *zc, + struct sk_buff *skb, u32 offset) +{ + u32 frag_offset, partial_frag_remainder = 0; + int mappable_offset; + skb_frag_t *frag; + + /* worst case: skip to next skb. try to improve on this case below */ + zc->recv_skip_hint = skb->len - offset; + + /* Find the frag containing this offset (and how far into that frag) */ + frag = skb_advance_to_frag(skb, offset, &frag_offset); + if (!frag) + return; + + if (frag_offset) { + struct skb_shared_info *info = skb_shinfo(skb); + + /* We read part of the last frag, must recvmsg() rest of skb. */ + if (frag == &info->frags[info->nr_frags - 1]) + return; + + /* Else, we must at least read the remainder in this frag. */ + partial_frag_remainder = skb_frag_size(frag) - frag_offset; + zc->recv_skip_hint -= partial_frag_remainder; + ++frag; + } + + /* partial_frag_remainder: If part way through a frag, must read rest. + * mappable_offset: Bytes till next mappable frag, *not* counting bytes + * in partial_frag_remainder. + */ + mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint); + zc->recv_skip_hint = mappable_offset + partial_frag_remainder; +} + +static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, + int flags, struct scm_timestamping_internal *tss, + int *cmsg_flags); +static int receive_fallback_to_copy(struct sock *sk, + struct tcp_zerocopy_receive *zc, int inq, + struct scm_timestamping_internal *tss) +{ + unsigned long copy_address = (unsigned long)zc->copybuf_address; + struct msghdr msg = {}; + int err; + + zc->length = 0; + zc->recv_skip_hint = 0; + + if (copy_address != zc->copybuf_address) + return -EINVAL; + + err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq, + &msg.msg_iter); + if (err) + return err; + + err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT, + tss, &zc->msg_flags); + if (err < 0) + return err; + + zc->copybuf_len = err; + if (likely(zc->copybuf_len)) { + struct sk_buff *skb; + u32 offset; + + skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset); + if (skb) + tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset); + } + return 0; +} + +static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, + struct sk_buff *skb, u32 copylen, + u32 *offset, u32 *seq) +{ + unsigned long copy_address = (unsigned long)zc->copybuf_address; + struct msghdr msg = {}; + int err; + + if (copy_address != zc->copybuf_address) + return -EINVAL; + + err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen, + &msg.msg_iter); + if (err) + return err; + err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); + if (err) + return err; + zc->recv_skip_hint -= copylen; + *offset += copylen; + *seq += copylen; + return (__s32)copylen; +} + +static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, + struct sock *sk, + struct sk_buff *skb, + u32 *seq, + s32 copybuf_len, + struct scm_timestamping_internal *tss) +{ + u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); + + if (!copylen) + return 0; + /* skb is null if inq < PAGE_SIZE. */ + if (skb) { + offset = *seq - TCP_SKB_CB(skb)->seq; + } else { + skb = tcp_recv_skb(sk, *seq, &offset); + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + zc->msg_flags |= TCP_CMSG_TS; + } + } + + zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, + seq); + return zc->copybuf_len < 0 ? 0 : copylen; +} + +static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, + struct page **pending_pages, + unsigned long pages_remaining, + unsigned long *address, + u32 *length, + u32 *seq, + struct tcp_zerocopy_receive *zc, + u32 total_bytes_to_map, + int err) +{ + /* At least one page did not map. Try zapping if we skipped earlier. */ + if (err == -EBUSY && + zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) { + u32 maybe_zap_len; + + maybe_zap_len = total_bytes_to_map - /* All bytes to map */ + *length + /* Mapped or pending */ + (pages_remaining * PAGE_SIZE); /* Failed map. */ + zap_page_range_single(vma, *address, maybe_zap_len, NULL); + err = 0; + } + + if (!err) { + unsigned long leftover_pages = pages_remaining; + int bytes_mapped; + + /* We called zap_page_range_single, try to reinsert. */ + err = vm_insert_pages(vma, *address, + pending_pages, + &pages_remaining); + bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining); + *seq += bytes_mapped; + *address += bytes_mapped; + } + if (err) { + /* Either we were unable to zap, OR we zapped, retried an + * insert, and still had an issue. Either ways, pages_remaining + * is the number of pages we were unable to map, and we unroll + * some state we speculatively touched before. + */ + const int bytes_not_mapped = PAGE_SIZE * pages_remaining; + + *length -= bytes_not_mapped; + zc->recv_skip_hint += bytes_not_mapped; + } + return err; +} + +static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, + struct page **pages, + unsigned int pages_to_map, + unsigned long *address, + u32 *length, + u32 *seq, + struct tcp_zerocopy_receive *zc, + u32 total_bytes_to_map) +{ + unsigned long pages_remaining = pages_to_map; + unsigned int pages_mapped; + unsigned int bytes_mapped; + int err; + + err = vm_insert_pages(vma, *address, pages, &pages_remaining); + pages_mapped = pages_to_map - (unsigned int)pages_remaining; + bytes_mapped = PAGE_SIZE * pages_mapped; + /* Even if vm_insert_pages fails, it may have partially succeeded in + * mapping (some but not all of the pages). + */ + *seq += bytes_mapped; + *address += bytes_mapped; + + if (likely(!err)) + return 0; + + /* Error: maybe zap and retry + rollback state for failed inserts. */ + return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped, + pages_remaining, address, length, seq, zc, total_bytes_to_map, + err); +} + +#define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) +static void tcp_zc_finalize_rx_tstamp(struct sock *sk, + struct tcp_zerocopy_receive *zc, + struct scm_timestamping_internal *tss) +{ + unsigned long msg_control_addr; + struct msghdr cmsg_dummy; + + msg_control_addr = (unsigned long)zc->msg_control; + cmsg_dummy.msg_control_user = (void __user *)msg_control_addr; + cmsg_dummy.msg_controllen = + (__kernel_size_t)zc->msg_controllen; + cmsg_dummy.msg_flags = in_compat_syscall() + ? MSG_CMSG_COMPAT : 0; + cmsg_dummy.msg_control_is_user = true; + zc->msg_flags = 0; + if (zc->msg_control == msg_control_addr && + zc->msg_controllen == cmsg_dummy.msg_controllen) { + tcp_recv_timestamp(&cmsg_dummy, sk, tss); + zc->msg_control = (__u64) + ((uintptr_t)cmsg_dummy.msg_control_user); + zc->msg_controllen = + (__u64)cmsg_dummy.msg_controllen; + zc->msg_flags = (__u32)cmsg_dummy.msg_flags; + } +} + +static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, + unsigned long address, + bool *mmap_locked) +{ + struct vm_area_struct *vma = lock_vma_under_rcu(mm, address); + + if (vma) { + if (vma->vm_ops != &tcp_vm_ops) { + vma_end_read(vma); + return NULL; + } + *mmap_locked = false; + return vma; + } + + mmap_read_lock(mm); + vma = vma_lookup(mm, address); + if (!vma || vma->vm_ops != &tcp_vm_ops) { + mmap_read_unlock(mm); + return NULL; + } + *mmap_locked = true; + return vma; +} + +#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 +static int tcp_zerocopy_receive(struct sock *sk, + struct tcp_zerocopy_receive *zc, + struct scm_timestamping_internal *tss) +{ + u32 length = 0, offset, vma_len, avail_len, copylen = 0; + unsigned long address = (unsigned long)zc->address; + struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; + s32 copybuf_len = zc->copybuf_len; + struct tcp_sock *tp = tcp_sk(sk); + const skb_frag_t *frags = NULL; + unsigned int pages_to_map = 0; + struct vm_area_struct *vma; + struct sk_buff *skb = NULL; + u32 seq = tp->copied_seq; + u32 total_bytes_to_map; + int inq = tcp_inq(sk); + bool mmap_locked; + int ret; + + zc->copybuf_len = 0; + zc->msg_flags = 0; + + if (address & (PAGE_SIZE - 1) || address != zc->address) + return -EINVAL; + + if (sk->sk_state == TCP_LISTEN) + return -ENOTCONN; + + sock_rps_record_flow(sk); + + if (inq && inq <= copybuf_len) + return receive_fallback_to_copy(sk, zc, inq, tss); + + if (inq < PAGE_SIZE) { + zc->length = 0; + zc->recv_skip_hint = inq; + if (!inq && sock_flag(sk, SOCK_DONE)) + return -EIO; + return 0; + } + + vma = find_tcp_vma(current->mm, address, &mmap_locked); + if (!vma) + return -EINVAL; + + vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); + avail_len = min_t(u32, vma_len, inq); + total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); + if (total_bytes_to_map) { + if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) + zap_page_range_single(vma, address, total_bytes_to_map, + NULL); + zc->length = total_bytes_to_map; + zc->recv_skip_hint = 0; + } else { + zc->length = avail_len; + zc->recv_skip_hint = avail_len; + } + ret = 0; + while (length + PAGE_SIZE <= zc->length) { + int mappable_offset; + struct page *page; + + if (zc->recv_skip_hint < PAGE_SIZE) { + u32 offset_frag; + + if (skb) { + if (zc->recv_skip_hint > 0) + break; + skb = skb->next; + offset = seq - TCP_SKB_CB(skb)->seq; + } else { + skb = tcp_recv_skb(sk, seq, &offset); + } + + if (!skb_frags_readable(skb)) + break; + + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + zc->msg_flags |= TCP_CMSG_TS; + } + zc->recv_skip_hint = skb->len - offset; + frags = skb_advance_to_frag(skb, offset, &offset_frag); + if (!frags || offset_frag) + break; + } + + mappable_offset = find_next_mappable_frag(frags, + zc->recv_skip_hint); + if (mappable_offset) { + zc->recv_skip_hint = mappable_offset; + break; + } + page = skb_frag_page(frags); + if (WARN_ON_ONCE(!page)) + break; + + prefetchw(page); + pages[pages_to_map++] = page; + length += PAGE_SIZE; + zc->recv_skip_hint -= PAGE_SIZE; + frags++; + if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE || + zc->recv_skip_hint < PAGE_SIZE) { + /* Either full batch, or we're about to go to next skb + * (and we cannot unroll failed ops across skbs). + */ + ret = tcp_zerocopy_vm_insert_batch(vma, pages, + pages_to_map, + &address, &length, + &seq, zc, + total_bytes_to_map); + if (ret) + goto out; + pages_to_map = 0; + } + } + if (pages_to_map) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, + &address, &length, &seq, + zc, total_bytes_to_map); + } +out: + if (mmap_locked) + mmap_read_unlock(current->mm); + else + vma_end_read(vma); + /* Try to copy straggler data. */ + if (!ret) + copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); + + if (length + copylen) { + WRITE_ONCE(tp->copied_seq, seq); + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + tcp_recv_skb(sk, seq, &offset); + tcp_cleanup_rbuf(sk, length + copylen); + ret = 0; + if (length == zc->length) + zc->recv_skip_hint = 0; + } else { + if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE)) + ret = -EIO; + } + zc->length = length; + return ret; +} +#endif + +/* Similar to __sock_recv_timestamp, but does not require an skb */ +void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping_internal *tss) +{ + int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); + u32 tsflags = READ_ONCE(sk->sk_tsflags); + bool has_timestamping = false; + + if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { + if (sock_flag(sk, SOCK_RCVTSTAMP)) { + if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { + if (new_tstamp) { + struct __kernel_timespec kts = { + .tv_sec = tss->ts[0].tv_sec, + .tv_nsec = tss->ts[0].tv_nsec, + }; + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, + sizeof(kts), &kts); + } else { + struct __kernel_old_timespec ts_old = { + .tv_sec = tss->ts[0].tv_sec, + .tv_nsec = tss->ts[0].tv_nsec, + }; + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, + sizeof(ts_old), &ts_old); + } + } else { + if (new_tstamp) { + struct __kernel_sock_timeval stv = { + .tv_sec = tss->ts[0].tv_sec, + .tv_usec = tss->ts[0].tv_nsec / 1000, + }; + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, + sizeof(stv), &stv); + } else { + struct __kernel_old_timeval tv = { + .tv_sec = tss->ts[0].tv_sec, + .tv_usec = tss->ts[0].tv_nsec / 1000, + }; + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, + sizeof(tv), &tv); + } + } + } + + if (tsflags & SOF_TIMESTAMPING_SOFTWARE && + (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE || + !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) + has_timestamping = true; + else + tss->ts[0] = (struct timespec64) {0}; + } + + if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { + if (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && + (tsflags & SOF_TIMESTAMPING_RX_HARDWARE || + !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) + has_timestamping = true; + else + tss->ts[2] = (struct timespec64) {0}; + } + + if (has_timestamping) { + tss->ts[1] = (struct timespec64) {0}; + if (sock_flag(sk, SOCK_TSTAMP_NEW)) + put_cmsg_scm_timestamping64(msg, tss); + else + put_cmsg_scm_timestamping(msg, tss); + } +} + +static int tcp_inq_hint(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 copied_seq = READ_ONCE(tp->copied_seq); + u32 rcv_nxt = READ_ONCE(tp->rcv_nxt); + int inq; + + inq = rcv_nxt - copied_seq; + if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) { + lock_sock(sk); + inq = tp->rcv_nxt - tp->copied_seq; + release_sock(sk); + } + /* After receiving a FIN, tell the user-space to continue reading + * by returning a non-zero inq. + */ + if (inq == 0 && sock_flag(sk, SOCK_DONE)) + inq = 1; + return inq; +} + +/* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */ +struct tcp_xa_pool { + u8 max; /* max <= MAX_SKB_FRAGS */ + u8 idx; /* idx <= max */ + __u32 tokens[MAX_SKB_FRAGS]; + netmem_ref netmems[MAX_SKB_FRAGS]; +}; + +static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p) +{ + int i; + + /* Commit part that has been copied to user space. */ + for (i = 0; i < p->idx; i++) + __xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY, + (__force void *)p->netmems[i], GFP_KERNEL); + /* Rollback what has been pre-allocated and is no longer needed. */ + for (; i < p->max; i++) + __xa_erase(&sk->sk_user_frags, p->tokens[i]); + + p->max = 0; + p->idx = 0; +} + +static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p) +{ + if (!p->max) + return; + + xa_lock_bh(&sk->sk_user_frags); + + tcp_xa_pool_commit_locked(sk, p); + + xa_unlock_bh(&sk->sk_user_frags); +} + +static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p, + unsigned int max_frags) +{ + int err, k; + + if (p->idx < p->max) + return 0; + + xa_lock_bh(&sk->sk_user_frags); + + tcp_xa_pool_commit_locked(sk, p); + + for (k = 0; k < max_frags; k++) { + err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k], + XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL); + if (err) + break; + } + + xa_unlock_bh(&sk->sk_user_frags); + + p->max = k; + p->idx = 0; + return k ? 0 : err; +} + +/* On error, returns the -errno. On success, returns number of bytes sent to the + * user. May not consume all of @remaining_len. + */ +static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, + unsigned int offset, struct msghdr *msg, + int remaining_len) +{ + struct dmabuf_cmsg dmabuf_cmsg = { 0 }; + struct tcp_xa_pool tcp_xa_pool; + unsigned int start; + int i, copy, n; + int sent = 0; + int err = 0; + + tcp_xa_pool.max = 0; + tcp_xa_pool.idx = 0; + do { + start = skb_headlen(skb); + + if (skb_frags_readable(skb)) { + err = -ENODEV; + goto out; + } + + /* Copy header. */ + copy = start - offset; + if (copy > 0) { + copy = min(copy, remaining_len); + + n = copy_to_iter(skb->data + offset, copy, + &msg->msg_iter); + if (n != copy) { + err = -EFAULT; + goto out; + } + + offset += copy; + remaining_len -= copy; + + /* First a dmabuf_cmsg for # bytes copied to user + * buffer. + */ + memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg)); + dmabuf_cmsg.frag_size = copy; + err = put_cmsg_notrunc(msg, SOL_SOCKET, + SO_DEVMEM_LINEAR, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err) + goto out; + + sent += copy; + + if (remaining_len == 0) + goto out; + } + + /* after that, send information of dmabuf pages through a + * sequence of cmsg + */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct net_iov *niov; + u64 frag_offset; + int end; + + /* !skb_frags_readable() should indicate that ALL the + * frags in this skb are dmabuf net_iovs. We're checking + * for that flag above, but also check individual frags + * here. If the tcp stack is not setting + * skb_frags_readable() correctly, we still don't want + * to crash here. + */ + if (!skb_frag_net_iov(frag)) { + net_err_ratelimited("Found non-dmabuf skb with net_iov"); + err = -ENODEV; + goto out; + } + + niov = skb_frag_net_iov(frag); + if (!net_is_devmem_iov(niov)) { + err = -ENODEV; + goto out; + } + + end = start + skb_frag_size(frag); + copy = end - offset; + + if (copy > 0) { + copy = min(copy, remaining_len); + + frag_offset = net_iov_virtual_addr(niov) + + skb_frag_off(frag) + offset - + start; + dmabuf_cmsg.frag_offset = frag_offset; + dmabuf_cmsg.frag_size = copy; + err = tcp_xa_pool_refill(sk, &tcp_xa_pool, + skb_shinfo(skb)->nr_frags - i); + if (err) + goto out; + + /* Will perform the exchange later */ + dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx]; + dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov); + + offset += copy; + remaining_len -= copy; + + err = put_cmsg_notrunc(msg, SOL_SOCKET, + SO_DEVMEM_DMABUF, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err) + goto out; + + atomic_long_inc(&niov->desc.pp_ref_count); + tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); + + sent += copy; + + if (remaining_len == 0) + goto out; + } + start = end; + } + + tcp_xa_pool_commit(sk, &tcp_xa_pool); + if (!remaining_len) + goto out; + + /* if remaining_len is not satisfied yet, we need to go to the + * next frag in the frag_list to satisfy remaining_len. + */ + skb = skb_shinfo(skb)->frag_list ?: skb->next; + + offset = offset - start; + } while (skb); + + if (remaining_len) { + err = -EFAULT; + goto out; + } + +out: + tcp_xa_pool_commit(sk, &tcp_xa_pool); + if (!sent) + sent = err; + + return sent; +} + /* * This routine copies from a sock struct into the user buffer. * @@ -1538,10 +2631,12 @@ EXPORT_SYMBOL(tcp_read_sock); * Probably, code can be easily improved even more. */ -int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int nonblock, int flags, int *addr_len) +static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, + int flags, struct scm_timestamping_internal *tss, + int *cmsg_flags) { struct tcp_sock *tp = tcp_sk(sk); + int last_copied_dmabuf = -1; /* uninitialized */ int copied = 0; u32 peek_seq; u32 *seq; @@ -1549,22 +2644,19 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int err; int target; /* Read at least this many bytes */ long timeo; - struct task_struct *user_recv = NULL; - bool copied_early = false; - struct sk_buff *skb; + struct sk_buff *skb, *last; + u32 peek_offset = 0; u32 urg_hole = 0; - if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && - (sk->sk_state == TCP_ESTABLISHED)) - sk_busy_loop(sk, nonblock); - - lock_sock(sk); - err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; - timeo = sock_rcvtimeo(sk, nonblock); + if (tp->recvmsg_inq) { + *cmsg_flags = TCP_CMSG_INQ; + msg->msg_get_inq = 1; + } + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); /* Urgent data needs to be handled specially. */ if (flags & MSG_OOB) @@ -1587,39 +2679,18 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, seq = &tp->copied_seq; if (flags & MSG_PEEK) { - peek_seq = tp->copied_seq; + peek_offset = max(sk_peek_offset(sk, flags), 0); + peek_seq = tp->copied_seq + peek_offset; seq = &peek_seq; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - preempt_disable(); - skb = skb_peek_tail(&sk->sk_receive_queue); - { - int available = 0; - - if (skb) - available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); - if ((available < target) && - (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && - !sysctl_tcp_low_latency && - net_dma_find_channel()) { - preempt_enable_no_resched(); - tp->ucopy.pinned_list = - dma_pin_iovec_pages(msg->msg_iov, len); - } else { - preempt_enable_no_resched(); - } - } -#endif - do { u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ - if (tp->urg_data && tp->urg_seq == *seq) { + if (unlikely(tp->urg_data) && tp->urg_seq == *seq) { if (copied) break; if (signal_pending(current)) { @@ -1630,38 +2701,42 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Next get a buffer. */ + last = skb_peek_tail(&sk->sk_receive_queue); skb_queue_walk(&sk->sk_receive_queue, skb) { + last = skb; /* Now that we have two receive queues this * shouldn't happen. */ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), - "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n", + "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags)) break; offset = *seq - TCP_SKB_CB(skb)->seq; - if (tcp_hdr(skb)->syn) + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; + } if (offset < skb->len) goto found_ok_skb; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), - "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", + "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } /* Well, if we have backlog, try to process it now yet. */ - if (copied >= target && !sk->sk_backlog.tail) + if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { - if (sk->sk_err || + if (!timeo || + sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || - !timeo || signal_pending(current)) break; } else { @@ -1677,13 +2752,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, break; if (sk->sk_state == TCP_CLOSE) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - copied = -ENOTCONN; - break; - } + /* This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; break; } @@ -1698,120 +2770,40 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } } - tcp_cleanup_rbuf(sk, copied); - - if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { - /* Install new reader */ - if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { - user_recv = current; - tp->ucopy.task = user_recv; - tp->ucopy.iov = msg->msg_iov; - } - - tp->ucopy.len = len; - - WARN_ON(tp->copied_seq != tp->rcv_nxt && - !(flags & (MSG_PEEK | MSG_TRUNC))); - - /* Ugly... If prequeue is not empty, we have to - * process it before releasing socket, otherwise - * order will be broken at second iteration. - * More elegant solution is required!!! - * - * Look: we have the following (pseudo)queues: - * - * 1. packets in flight - * 2. backlog - * 3. prequeue - * 4. receive_queue - * - * Each queue can be processed only if the next ones - * are empty. At this point we have empty receive_queue. - * But prequeue _can_ be not empty after 2nd iteration, - * when we jumped to start of loop because backlog - * processing added something to receive_queue. - * We cannot release_sock(), because backlog contains - * packets arrived _after_ prequeued ones. - * - * Shortly, algorithm is clear --- to process all - * the queues in order. We could make it more directly, - * requeueing packets from backlog to prequeue, if - * is not empty. It is more elegant, but eats cycles, - * unfortunately. - */ - if (!skb_queue_empty(&tp->ucopy.prequeue)) - goto do_prequeue; - - /* __ Set realtime policy in scheduler __ */ - } - -#ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) { - if (tp->rcv_wnd == 0 && - !skb_queue_empty(&sk->sk_async_wait_queue)) { - tcp_service_net_dma(sk, true); - tcp_cleanup_rbuf(sk, copied); - } else - dma_async_issue_pending(tp->ucopy.dma_chan); - } -#endif if (copied >= target) { /* Do not sleep, just process backlog. */ - release_sock(sk); - lock_sock(sk); - } else - sk_wait_data(sk, &timeo); - -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, false); /* Don't block */ - tp->ucopy.wakeup = 0; -#endif - - if (user_recv) { - int chunk; - - /* __ Restore normal policy in scheduler __ */ - - if ((chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); - len -= chunk; - copied += chunk; - } - - if (tp->rcv_nxt == tp->copied_seq && - !skb_queue_empty(&tp->ucopy.prequeue)) { -do_prequeue: - tcp_prequeue_process(sk); - - if ((chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); - len -= chunk; - copied += chunk; - } + __sk_flush_backlog(sk); + } else { + tcp_cleanup_rbuf(sk, copied); + err = sk_wait_data(sk, &timeo, last); + if (err < 0) { + err = copied ? : err; + goto out; } } + if ((flags & MSG_PEEK) && - (peek_seq - copied - urg_hole != tp->copied_seq)) { + (peek_seq - peek_offset - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); - peek_seq = tp->copied_seq; + peek_seq = tp->copied_seq + peek_offset; } continue; - found_ok_skb: +found_ok_skb: /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) used = len; /* Do we have urgent data here? */ - if (tp->urg_data) { + if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - *seq; if (urg_offset < used) { if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { - ++*seq; + WRITE_ONCE(*seq, *seq + 1); urg_hole++; offset++; used--; @@ -1824,119 +2816,90 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { -#ifdef CONFIG_NET_DMA - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan) { - tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( - tp->ucopy.dma_chan, skb, offset, - msg->msg_iov, used, - tp->ucopy.pinned_list); - - if (tp->ucopy.dma_cookie < 0) { - - pr_alert("%s: dma_cookie < 0\n", - __func__); + if (last_copied_dmabuf != -1 && + last_copied_dmabuf != !skb_frags_readable(skb)) + break; + if (skb_frags_readable(skb)) { + err = skb_copy_datagram_msg(skb, offset, msg, + used); + if (err) { /* Exception. Bailout! */ if (!copied) copied = -EFAULT; break; } + } else { + if (!(flags & MSG_SOCK_DEVMEM)) { + /* dmabuf skbs can only be received + * with the MSG_SOCK_DEVMEM flag. + */ + if (!copied) + copied = -EFAULT; - dma_async_issue_pending(tp->ucopy.dma_chan); - - if ((offset + used) == skb->len) - copied_early = true; + break; + } - } else -#endif - { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ + err = tcp_recvmsg_dmabuf(sk, skb, offset, msg, + used); + if (err < 0) { if (!copied) - copied = -EFAULT; + copied = err; + break; } + used = err; } } - *seq += used; + last_copied_dmabuf = !skb_frags_readable(skb); + + WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; - + if (flags & MSG_PEEK) + sk_peek_offset_fwd(sk, used); + else + sk_peek_offset_bwd(sk, used); tcp_rcv_space_adjust(sk); skip_copy: - if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { - tp->urg_data = 0; + if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) { + WRITE_ONCE(tp->urg_data, 0); tcp_fast_path_check(sk); } + + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + *cmsg_flags |= TCP_CMSG_TS; + } + if (used + offset < skb->len) continue; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + tcp_eat_recv_skb(sk, skb); continue; - found_fin_ok: +found_fin_ok: /* Process the FIN. */ - ++*seq; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + WRITE_ONCE(*seq, *seq + 1); + if (!(flags & MSG_PEEK)) + tcp_eat_recv_skb(sk, skb); break; } while (len > 0); - if (user_recv) { - if (!skb_queue_empty(&tp->ucopy.prequeue)) { - int chunk; - - tp->ucopy.len = copied > 0 ? len : 0; - - tcp_prequeue_process(sk); - - if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); - len -= chunk; - copied += chunk; - } - } - - tp->ucopy.task = NULL; - tp->ucopy.len = 0; - } - -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, true); /* Wait for queue to drain */ - tp->ucopy.dma_chan = NULL; - - if (tp->ucopy.pinned_list) { - dma_unpin_iovec_pages(tp->ucopy.pinned_list); - tp->ucopy.pinned_list = NULL; - } -#endif - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ /* Clean up data we have read: This will do ACK frames. */ tcp_cleanup_rbuf(sk, copied); - - release_sock(sk); return copied; out: - release_sock(sk); return err; recv_urg: @@ -1947,17 +2910,88 @@ recv_sndq: err = tcp_peek_sndq(sk, msg, len); goto out; } -EXPORT_SYMBOL(tcp_recvmsg); + +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, + int *addr_len) +{ + int cmsg_flags = 0, ret; + struct scm_timestamping_internal tss; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + if (sk_can_busy_loop(sk) && + skb_queue_empty_lockless(&sk->sk_receive_queue) && + sk->sk_state == TCP_ESTABLISHED) + sk_busy_loop(sk, flags & MSG_DONTWAIT); + + lock_sock(sk); + ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags); + release_sock(sk); + + if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) { + if (cmsg_flags & TCP_CMSG_TS) + tcp_recv_timestamp(msg, sk, &tss); + if (msg->msg_get_inq) { + msg->msg_inq = tcp_inq_hint(sk); + if (cmsg_flags & TCP_CMSG_INQ) + put_cmsg(msg, SOL_TCP, TCP_CM_INQ, + sizeof(msg->msg_inq), &msg->msg_inq); + } + } + return ret; +} +EXPORT_IPV6_MOD(tcp_recvmsg); void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; + /* We defined a new enum for TCP states that are exported in BPF + * so as not force the internal TCP states to be frozen. The + * following checks will detect if an internal state value ever + * differs from the BPF value. If this ever happens, then we will + * need to remap the internal value to the BPF value before calling + * tcp_call_bpf_2arg. + */ + BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); + BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); + BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); + BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); + BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); + BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); + BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); + BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); + BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); + BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE); + BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); + + /* bpf uapi header bpf.h defines an anonymous enum with values + * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux + * is able to emit this enum in DWARF due to the above BUILD_BUG_ON. + * But clang built vmlinux does not have this enum in DWARF + * since clang removes the above code before generating IR/debuginfo. + * Let us explicitly emit the type debuginfo to ensure the + * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF + * regardless of which compiler is used. + */ + BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED); + + if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); + switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; + case TCP_CLOSE_WAIT: + if (oldstate == TCP_SYN_RECV) + TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); + break; case TCP_CLOSE: if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) @@ -1967,20 +3001,16 @@ void tcp_set_state(struct sock *sk, int state) if (inet_csk(sk)->icsk_bind_hash && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(sk); - /* fall through */ + fallthrough; default: - if (oldstate == TCP_ESTABLISHED) + if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT) TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk->sk_state = state; - -#ifdef STATE_TRACE - SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); -#endif + inet_sk_state_store(sk, state); } EXPORT_SYMBOL_GPL(tcp_set_state); @@ -1993,18 +3023,19 @@ EXPORT_SYMBOL_GPL(tcp_set_state); static const unsigned char new_state[16] = { /* current state: new state: action: */ - /* (Invalid) */ TCP_CLOSE, - /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, - /* TCP_SYN_SENT */ TCP_CLOSE, - /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, - /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, - /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, - /* TCP_TIME_WAIT */ TCP_CLOSE, - /* TCP_CLOSE */ TCP_CLOSE, - /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, - /* TCP_LAST_ACK */ TCP_LAST_ACK, - /* TCP_LISTEN */ TCP_CLOSE, - /* TCP_CLOSING */ TCP_CLOSING, + [0 /* (Invalid) */] = TCP_CLOSE, + [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_SYN_SENT] = TCP_CLOSE, + [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, + [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, + [TCP_TIME_WAIT] = TCP_CLOSE, + [TCP_CLOSE] = TCP_CLOSE, + [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, + [TCP_LAST_ACK] = TCP_LAST_ACK, + [TCP_LISTEN] = TCP_CLOSE, + [TCP_CLOSING] = TCP_CLOSING, + [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int tcp_close_state(struct sock *sk) @@ -2034,19 +3065,53 @@ void tcp_shutdown(struct sock *sk, int how) /* If we've already sent a FIN, or it's a closed state, skip this. */ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | - TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { + TCPF_CLOSE_WAIT)) { /* Clear out any half completed packets. FIN if needed. */ if (tcp_close_state(sk)) tcp_send_fin(sk); } } -EXPORT_SYMBOL(tcp_shutdown); +EXPORT_IPV6_MOD(tcp_shutdown); + +int tcp_orphan_count_sum(void) +{ + int i, total = 0; + + for_each_possible_cpu(i) + total += per_cpu(tcp_orphan_count, i); + + return max(total, 0); +} + +static int tcp_orphan_cache; +static struct timer_list tcp_orphan_timer; +#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100) + +static void tcp_orphan_update(struct timer_list *unused) +{ + WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum()); + mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); +} + +static bool tcp_too_many_orphans(int shift) +{ + return READ_ONCE(tcp_orphan_cache) << shift > + READ_ONCE(sysctl_tcp_max_orphans); +} -bool tcp_check_oom(struct sock *sk, int shift) +static bool tcp_out_of_memory(const struct sock *sk) +{ + if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) + return true; + return false; +} + +bool tcp_check_oom(const struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; - too_many_orphans = tcp_too_many_orphans(sk, shift); + too_many_orphans = tcp_too_many_orphans(shift); out_of_socket_memory = tcp_out_of_memory(sk); if (too_many_orphans) @@ -2056,14 +3121,13 @@ bool tcp_check_oom(struct sock *sk, int shift) return too_many_orphans || out_of_socket_memory; } -void tcp_close(struct sock *sk, long timeout) +void __tcp_close(struct sock *sk, long timeout) { + bool data_was_unread = false; struct sk_buff *skb; - int data_was_unread = 0; int state; - lock_sock(sk); - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); @@ -2078,14 +3142,15 @@ void tcp_close(struct sock *sk, long timeout) * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! */ - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - - tcp_hdr(skb)->fin; - data_was_unread += len; - __kfree_skb(skb); - } + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { + u32 end_seq = TCP_SKB_CB(skb)->end_seq; - sk_mem_reclaim(sk); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + end_seq--; + if (after(end_seq, tcp_sk(sk)->copied_seq)) + data_was_unread = true; + tcp_eat_recv_skb(sk, skb); + } /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ if (sk->sk_state == TCP_CLOSE) @@ -2102,13 +3167,14 @@ void tcp_close(struct sock *sk, long timeout) sk->sk_prot->disconnect(sk, 0); } else if (data_was_unread) { /* Unread data was tossed, zap the connection. */ - NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, sk->sk_allocation); + tcp_send_active_reset(sk, sk->sk_allocation, + SK_RST_REASON_TCP_ABORT_ON_CLOSE); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); - NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. @@ -2118,7 +3184,7 @@ void tcp_close(struct sock *sk, long timeout) * machine. State transitions: * * TCP_ESTABLISHED -> TCP_FIN_WAIT1 - * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) + * TCP_SYN_RECV -> TCP_FIN_WAIT1 (it is difficult) * TCP_CLOSE_WAIT -> TCP_LAST_ACK * * are legal only when FIN has been sent (i.e. in window), @@ -2149,18 +3215,12 @@ adjudge_to_death: sock_hold(sk); sock_orphan(sk); - /* It is the last release_sock in its life. It will remove backlog. */ - release_sock(sk); - - - /* Now socket is owned by kernel and we acquire BH lock - to finish close. No need to check for user refs. - */ local_bh_disable(); bh_lock_sock(sk); - WARN_ON(sock_owned_by_user(sk)); + /* remove backlog if any, without releasing ownership. */ + __release_sock(sk); - percpu_counter_inc(sk->sk_prot->orphan_count); + tcp_orphan_count_inc(); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) @@ -2169,7 +3229,7 @@ adjudge_to_death: /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. - * We use a 3 minute timeout (about the same as BSD) then kill + * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. @@ -2182,16 +3242,17 @@ adjudge_to_death: if (sk->sk_state == TCP_FIN_WAIT2) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->linger2 < 0) { + if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(sock_net(sk), + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_TCP_ABORT_ON_LINGER); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); @@ -2200,22 +3261,28 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { - sk_mem_reclaim(sk); if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(sock_net(sk), + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_TCP_ABORT_ON_MEMORY); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + } else if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_set_state(sk, TCP_CLOSE); } } if (sk->sk_state == TCP_CLOSE) { - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + struct request_sock *req; + + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + lockdep_sock_is_held(sk)); /* We could get here with a non-NULL req if the socket is * aborted (e.g., closed with unread data) before 3WHS * finishes. */ - if (req != NULL) + if (req) reqsk_fastopen_remove(sk, req, false); inet_csk_destroy_sock(sk); } @@ -2224,6 +3291,15 @@ adjudge_to_death: out: bh_unlock_sock(sk); local_bh_enable(); +} + +void tcp_close(struct sock *sk, long timeout) +{ + lock_sock(sk); + __tcp_close(sk, timeout); + release_sock(sk); + if (!sk->sk_net_refcnt) + inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); @@ -2237,13 +3313,47 @@ static inline bool tcp_need_reset(int state) TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } +static void tcp_rtx_queue_purge(struct sock *sk) +{ + struct rb_node *p = rb_first(&sk->tcp_rtx_queue); + + tcp_sk(sk)->highest_sack = NULL; + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + /* Since we are deleting whole queue, no need to + * list_del(&skb->tcp_tsorted_anchor) + */ + tcp_rtx_queue_unlink(skb, sk); + tcp_wmem_free_skb(sk, skb); + } +} + +void tcp_write_queue_purge(struct sock *sk) +{ + struct sk_buff *skb; + + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); + tcp_wmem_free_skb(sk, skb); + } + tcp_rtx_queue_purge(sk); + INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); + tcp_clear_all_retrans_hints(tcp_sk(sk)); + tcp_sk(sk)->packets_out = 0; + inet_csk(sk)->icsk_backoff = 0; +} + int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int err = 0; int old_state = sk->sk_state; + struct request_sock *req; + u32 seq; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); @@ -2252,92 +3362,200 @@ int tcp_disconnect(struct sock *sk, int flags) if (old_state == TCP_LISTEN) { inet_csk_listen_stop(sk); } else if (unlikely(tp->repair)) { - sk->sk_err = ECONNABORTED; - } else if (tcp_need_reset(old_state) || - (tp->snd_nxt != tp->write_seq && - (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { + WRITE_ONCE(sk->sk_err, ECONNABORTED); + } else if (tcp_need_reset(old_state)) { + tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_TCP_STATE); + WRITE_ONCE(sk->sk_err, ECONNRESET); + } else if (tp->snd_nxt != tp->write_seq && + (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ - tcp_send_active_reset(sk, gfp_any()); - sk->sk_err = ECONNRESET; + tcp_send_active_reset(sk, gfp_any(), + SK_RST_REASON_TCP_DISCONNECT_WITH_DATA); + WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (old_state == TCP_SYN_SENT) - sk->sk_err = ECONNRESET; + WRITE_ONCE(sk->sk_err, ECONNRESET); tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); + WRITE_ONCE(tp->urg_data, 0); + sk_set_peek_off(sk, -1); tcp_write_queue_purge(sk); - __skb_queue_purge(&tp->out_of_order_queue); -#ifdef CONFIG_NET_DMA - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif + tcp_fastopen_active_disable_ofo_check(sk); + skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); - sk->sk_shutdown = 0; + WRITE_ONCE(sk->sk_shutdown, 0); sock_reset_flag(sk, SOCK_DONE); - tp->srtt = 0; - if ((tp->write_seq += tp->max_window + 2) == 0) - tp->write_seq = 1; + tp->srtt_us = 0; + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + tp->rcv_rtt_last_tsecr = 0; + + seq = tp->write_seq + tp->max_window + 2; + if (!seq) + seq = 1; + WRITE_ONCE(tp->write_seq, seq); + icsk->icsk_backoff = 0; - tp->snd_cwnd = 2; - icsk->icsk_probes_out = 0; - tp->packets_out = 0; + WRITE_ONCE(icsk->icsk_probes_out, 0); + icsk->icsk_probes_tstamp = 0; + icsk->icsk_rto = TCP_TIMEOUT_INIT; + WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN); + WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX); tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tcp_snd_cwnd_set(tp, TCP_INIT_CWND); tp->snd_cwnd_cnt = 0; + tp->is_cwnd_limited = 0; + tp->max_packets_out = 0; tp->window_clamp = 0; + tp->delivered = 0; + tp->delivered_ce = 0; + tp->accecn_fail_mode = 0; + tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_accecn_init_counters(tp); + tp->prev_ecnfield = 0; + tp->accecn_opt_tstamp = 0; + if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; tcp_clear_retrans(tp); + tp->total_retrans = 0; inet_csk_delack_init(sk); - tcp_init_send_head(sk); + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 + * issue in __tcp_select_window() + */ + icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); + dst_release(unrcu_pointer(xchg(&sk->sk_rx_dst, NULL))); + tcp_saved_syn_free(tp); + tp->compressed_ack = 0; + tp->segs_in = 0; + tp->segs_out = 0; + tp->bytes_sent = 0; + tp->bytes_acked = 0; + tp->bytes_received = 0; + tp->bytes_retrans = 0; + tp->data_segs_in = 0; + tp->data_segs_out = 0; + tp->duplicate_sack[0].start_seq = 0; + tp->duplicate_sack[0].end_seq = 0; + tp->dsack_dups = 0; + tp->reord_seen = 0; + tp->retrans_out = 0; + tp->sacked_out = 0; + tp->tlp_high_seq = 0; + tp->last_oow_ack_time = 0; + tp->plb_rehash = 0; + /* There's a bubble in the pipe until at least the first ACK. */ + tp->app_limited = ~0U; + tp->rate_app_limited = 1; + tp->rack.mstamp = 0; + tp->rack.advanced = 0; + tp->rack.reo_wnd_steps = 1; + tp->rack.last_delivered = 0; + tp->rack.reo_wnd_persist = 0; + tp->rack.dsack_seen = 0; + tp->syn_data_acked = 0; + tp->syn_fastopen_child = 0; + tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; + + + /* Clean up fastopen related fields */ + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); + if (req) + reqsk_fastopen_remove(sk, req, false); + tcp_free_fastopen_req(tp); + inet_clear_bit(DEFER_CONNECT, sk); + tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); - sk->sk_error_report(sk); - return err; + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + sk->sk_frag.offset = 0; + } + sk_error_report(sk); + return 0; } EXPORT_SYMBOL(tcp_disconnect); -void tcp_sock_destruct(struct sock *sk) +static inline bool tcp_can_repair_sock(const struct sock *sk) { - inet_sock_destruct(sk); - - kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); + return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && + (sk->sk_state != TCP_LISTEN); } -static inline bool tcp_can_repair_sock(const struct sock *sk) +static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) { - return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && - ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); + struct tcp_repair_window opt; + + if (!tp->repair) + return -EPERM; + + if (len != sizeof(opt)) + return -EINVAL; + + if (copy_from_sockptr(&opt, optbuf, sizeof(opt))) + return -EFAULT; + + if (opt.max_window < opt.snd_wnd) + return -EINVAL; + + if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd)) + return -EINVAL; + + if (after(opt.rcv_wup, tp->rcv_nxt)) + return -EINVAL; + + tp->snd_wl1 = opt.snd_wl1; + tp->snd_wnd = opt.snd_wnd; + tp->max_window = opt.max_window; + + tp->rcv_wnd = opt.rcv_wnd; + tp->rcv_wup = opt.rcv_wup; + + return 0; } -static int tcp_repair_options_est(struct tcp_sock *tp, - struct tcp_repair_opt __user *optbuf, unsigned int len) +static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, + unsigned int len) { + struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; + size_t offset = 0; while (len >= sizeof(opt)) { - if (copy_from_user(&opt, optbuf, sizeof(opt))) + if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt))) return -EFAULT; - optbuf++; + offset += sizeof(opt); len -= sizeof(opt); switch (opt.opt_code) { case TCPOPT_MSS: tp->rx_opt.mss_clamp = opt.opt_val; + tcp_mtup_init(sk); break; case TCPOPT_WINDOW: { u16 snd_wscale = opt.opt_val & 0xFFFF; u16 rcv_wscale = opt.opt_val >> 16; - if (snd_wscale > 14 || rcv_wscale > 14) + if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE) return -EFBIG; tp->rx_opt.snd_wscale = snd_wscale; @@ -2350,8 +3568,6 @@ static int tcp_repair_options_est(struct tcp_sock *tp, return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; - if (sysctl_tcp_fack) - tcp_enable_fack(tp); break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) @@ -2365,14 +3581,252 @@ static int tcp_repair_options_est(struct tcp_sock *tp, return 0; } +DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); +EXPORT_IPV6_MOD(tcp_tx_delay_enabled); + +static void tcp_enable_tx_delay(struct sock *sk, int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + s32 delta = (val - tp->tcp_tx_delay) << 3; + + if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) { + static int __tcp_tx_delay_enabled = 0; + + if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { + static_branch_enable(&tcp_tx_delay_enabled); + pr_info("TCP_TX_DELAY enabled\n"); + } + } + /* If we change tcp_tx_delay on a live flow, adjust tp->srtt_us, + * tp->rtt_min, icsk_rto and sk->sk_pacing_rate. + * This is best effort. + */ + if (delta && sk->sk_state == TCP_ESTABLISHED) { + s64 srtt = (s64)tp->srtt_us + delta; + + tp->srtt_us = clamp_t(s64, srtt, 1, ~0U); + + /* Note: does not deal with non zero icsk_backoff */ + tcp_set_rto(sk); + + minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); + + tcp_update_pacing_rate(sk); + } +} + +/* When set indicates to always queue non-full frames. Later the user clears + * this option and we transmit any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly filled frames when the + * user (for example) must write out headers with a write() call first and then + * use sendfile to send out the data parts. + * + * TCP_CORK can be set together with TCP_NODELAY and it is stronger than + * TCP_NODELAY. + */ +void __tcp_sock_set_cork(struct sock *sk, bool on) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (on) { + tp->nonagle |= TCP_NAGLE_CORK; + } else { + tp->nonagle &= ~TCP_NAGLE_CORK; + if (tp->nonagle & TCP_NAGLE_OFF) + tp->nonagle |= TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } +} + +void tcp_sock_set_cork(struct sock *sk, bool on) +{ + lock_sock(sk); + __tcp_sock_set_cork(sk, on); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_cork); + +/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is + * remembered, but it is not activated until cork is cleared. + * + * However, when TCP_NODELAY is set we make an explicit push, which overrides + * even TCP_CORK for currently queued segments. + */ +void __tcp_sock_set_nodelay(struct sock *sk, bool on) +{ + if (on) { + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } else { + tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; + } +} + +void tcp_sock_set_nodelay(struct sock *sk) +{ + lock_sock(sk); + __tcp_sock_set_nodelay(sk, true); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_nodelay); + +static void __tcp_sock_set_quickack(struct sock *sk, int val) +{ + if (!val) { + inet_csk_enter_pingpong_mode(sk); + return; + } + + inet_csk_exit_pingpong_mode(sk); + if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + inet_csk_ack_scheduled(sk)) { + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; + tcp_cleanup_rbuf(sk, 1); + if (!(val & 1)) + inet_csk_enter_pingpong_mode(sk); + } +} + +void tcp_sock_set_quickack(struct sock *sk, int val) +{ + lock_sock(sk); + __tcp_sock_set_quickack(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_quickack); + +int tcp_sock_set_syncnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_SYNCNT) + return -EINVAL; + + WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_syncnt); + +int tcp_sock_set_user_timeout(struct sock *sk, int val) +{ + /* Cap the max time in ms TCP will retry or probe the window + * before giving up and aborting (ETIMEDOUT) a connection. + */ + if (val < 0) + return -EINVAL; + + WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_user_timeout); + +int tcp_sock_set_keepidle_locked(struct sock *sk, int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (val < 1 || val > MAX_TCP_KEEPIDLE) + return -EINVAL; + + /* Paired with WRITE_ONCE() in keepalive_time_when() */ + WRITE_ONCE(tp->keepalive_time, val * HZ); + if (sock_flag(sk, SOCK_KEEPOPEN) && + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { + u32 elapsed = keepalive_time_elapsed(tp); + + if (tp->keepalive_time > elapsed) + elapsed = tp->keepalive_time - elapsed; + else + elapsed = 0; + tcp_reset_keepalive_timer(sk, elapsed); + } + + return 0; +} + +int tcp_sock_set_keepidle(struct sock *sk, int val) +{ + int err; + + lock_sock(sk); + err = tcp_sock_set_keepidle_locked(sk, val); + release_sock(sk); + return err; +} +EXPORT_SYMBOL(tcp_sock_set_keepidle); + +int tcp_sock_set_keepintvl(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPINTVL) + return -EINVAL; + + WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepintvl); + +int tcp_sock_set_keepcnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPCNT) + return -EINVAL; + + /* Paired with READ_ONCE() in keepalive_probes() */ + WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepcnt); + +int tcp_set_window_clamp(struct sock *sk, int val) +{ + u32 old_window_clamp, new_window_clamp, new_rcv_ssthresh; + struct tcp_sock *tp = tcp_sk(sk); + + if (!val) { + if (sk->sk_state != TCP_CLOSE) + return -EINVAL; + WRITE_ONCE(tp->window_clamp, 0); + return 0; + } + + old_window_clamp = tp->window_clamp; + new_window_clamp = max_t(int, SOCK_MIN_RCVBUF / 2, val); + + if (new_window_clamp == old_window_clamp) + return 0; + + WRITE_ONCE(tp->window_clamp, new_window_clamp); + + /* Need to apply the reserved mem provisioning only + * when shrinking the window clamp. + */ + if (new_window_clamp < old_window_clamp) { + __tcp_adjust_rcv_ssthresh(sk, new_window_clamp); + } else { + new_rcv_ssthresh = min(tp->rcv_wnd, new_window_clamp); + tp->rcv_ssthresh = max(new_rcv_ssthresh, tp->rcv_ssthresh); + } + return 0; +} + +int tcp_sock_set_maxseg(struct sock *sk, int val) +{ + /* Values greater than interface MTU won't take effect. However + * at the point when this call is done we typically don't yet + * know which interface is going to be used + */ + if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) + return -EINVAL; + + WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val); + return 0; +} + /* * Socket option code for TCP. */ -static int do_tcp_setsockopt(struct sock *sk, int level, - int optname, char __user *optval, unsigned int optlen) +int do_tcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); int val; int err = 0; @@ -2384,17 +3838,56 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (optlen < 1) return -EINVAL; - val = strncpy_from_user(name, optval, + val = strncpy_from_sockptr(name, optval, min_t(long, TCP_CA_NAME_MAX-1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; - lock_sock(sk); - err = tcp_set_congestion_control(sk, name); - release_sock(sk); + sockopt_lock_sock(sk); + err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(), + sockopt_ns_capable(sock_net(sk)->user_ns, + CAP_NET_ADMIN)); + sockopt_release_sock(sk); return err; } + case TCP_ULP: { + char name[TCP_ULP_NAME_MAX]; + + if (optlen < 1) + return -EINVAL; + + val = strncpy_from_sockptr(name, optval, + min_t(long, TCP_ULP_NAME_MAX - 1, + optlen)); + if (val < 0) + return -EFAULT; + name[val] = 0; + + sockopt_lock_sock(sk); + err = tcp_set_ulp(sk, name); + sockopt_release_sock(sk); + return err; + } + case TCP_FASTOPEN_KEY: { + __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; + __u8 *backup_key = NULL; + + /* Allow a backup key as well to facilitate key rotation + * First key is the active one. + */ + if (optlen != TCP_FASTOPEN_KEY_LENGTH && + optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) + return -EINVAL; + + if (copy_from_sockptr(key, optval, optlen)) + return -EFAULT; + + if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) + backup_key = key + TCP_FASTOPEN_KEY_LENGTH; + + return tcp_fastopen_reset_cipher(net, sk, key, backup_key); + } default: /* fallthru */ break; @@ -2403,38 +3896,63 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; - lock_sock(sk); - + /* Handle options that can be set without locking the socket. */ switch (optname) { + case TCP_SYNCNT: + return tcp_sock_set_syncnt(sk, val); + case TCP_USER_TIMEOUT: + return tcp_sock_set_user_timeout(sk, val); + case TCP_KEEPINTVL: + return tcp_sock_set_keepintvl(sk, val); + case TCP_KEEPCNT: + return tcp_sock_set_keepcnt(sk, val); + case TCP_LINGER2: + if (val < 0) + WRITE_ONCE(tp->linger2, -1); + else if (val > TCP_FIN_TIMEOUT_MAX / HZ) + WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); + else + WRITE_ONCE(tp->linger2, val * HZ); + return 0; + case TCP_DEFER_ACCEPT: + /* Translate value in seconds to number of retransmits */ + WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ)); + return 0; + case TCP_RTO_MAX_MS: + if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val)); + return 0; + case TCP_RTO_MIN_US: { + int rto_min = usecs_to_jiffies(val); + + if (rto_min > TCP_RTO_MIN || rto_min < TCP_TIMEOUT_MIN) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min); + return 0; + } + case TCP_DELACK_MAX_US: { + int delack_max = usecs_to_jiffies(val); + + if (delack_max > TCP_DELACK_MAX || delack_max < TCP_TIMEOUT_MIN) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max); + return 0; + } case TCP_MAXSEG: - /* Values greater than interface MTU won't take effect. However - * at the point when this call is done we typically don't yet - * know which interface is going to be used */ - if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) { - err = -EINVAL; - break; - } - tp->rx_opt.user_mss = val; - break; + return tcp_sock_set_maxseg(sk, val); + } + sockopt_lock_sock(sk); + + switch (optname) { case TCP_NODELAY: - if (val) { - /* TCP_NODELAY is weaker than TCP_CORK, so that - * this option on corked socket is remembered, but - * it is not activated until cork is cleared. - * - * However, when TCP_NODELAY is set we make - * an explicit push, which overrides even TCP_CORK - * for currently queued segments. - */ - tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } else { - tp->nonagle &= ~TCP_NAGLE_OFF; - } + __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: @@ -2447,23 +3965,22 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; - else - tp->thin_dupack = val; - if (tp->thin_dupack) - tcp_disable_early_retrans(tp); break; case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; - else if (val == 1) { + else if (val == TCP_REPAIR_ON) { tp->repair = 1; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; - } else if (val == 0) { + } else if (val == TCP_REPAIR_OFF) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; tcp_send_window_probe(sk); + } else if (val == TCP_REPAIR_OFF_NO_WP) { + tp->repair = 0; + sk->sk_reuse = SK_NO_REUSE; } else err = -EINVAL; @@ -2472,208 +3989,248 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_REPAIR_QUEUE: if (!tp->repair) err = -EPERM; - else if (val < TCP_QUEUES_NR) + else if ((unsigned int)val < TCP_QUEUES_NR) tp->repair_queue = val; else err = -EINVAL; break; case TCP_QUEUE_SEQ: - if (sk->sk_state != TCP_CLOSE) + if (sk->sk_state != TCP_CLOSE) { err = -EPERM; - else if (tp->repair_queue == TCP_SEND_QUEUE) - tp->write_seq = val; - else if (tp->repair_queue == TCP_RECV_QUEUE) - tp->rcv_nxt = val; - else + } else if (tp->repair_queue == TCP_SEND_QUEUE) { + if (!tcp_rtx_queue_empty(sk)) + err = -EPERM; + else + WRITE_ONCE(tp->write_seq, val); + } else if (tp->repair_queue == TCP_RECV_QUEUE) { + if (tp->rcv_nxt != tp->copied_seq) { + err = -EPERM; + } else { + WRITE_ONCE(tp->rcv_nxt, val); + WRITE_ONCE(tp->copied_seq, val); + } + } else { err = -EINVAL; + } break; case TCP_REPAIR_OPTIONS: if (!tp->repair) err = -EINVAL; - else if (sk->sk_state == TCP_ESTABLISHED) - err = tcp_repair_options_est(tp, - (struct tcp_repair_opt __user *)optval, - optlen); + else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) + err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; break; case TCP_CORK: - /* When set indicates to always queue non-full frames. - * Later the user clears this option and we transmit - * any pending partial frames in the queue. This is - * meant to be used alongside sendfile() to get properly - * filled frames when the user (for example) must write - * out headers with a write() call first and then use - * sendfile to send out the data parts. - * - * TCP_CORK can be set together with TCP_NODELAY and it is - * stronger than TCP_NODELAY. - */ - if (val) { - tp->nonagle |= TCP_NAGLE_CORK; - } else { - tp->nonagle &= ~TCP_NAGLE_CORK; - if (tp->nonagle&TCP_NAGLE_OFF) - tp->nonagle |= TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } + __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: - if (val < 1 || val > MAX_TCP_KEEPIDLE) - err = -EINVAL; - else { - tp->keepalive_time = val * HZ; - if (sock_flag(sk, SOCK_KEEPOPEN) && - !((1 << sk->sk_state) & - (TCPF_CLOSE | TCPF_LISTEN))) { - u32 elapsed = keepalive_time_elapsed(tp); - if (tp->keepalive_time > elapsed) - elapsed = tp->keepalive_time - elapsed; - else - elapsed = 0; - inet_csk_reset_keepalive_timer(sk, elapsed); - } - } + err = tcp_sock_set_keepidle_locked(sk, val); break; - case TCP_KEEPINTVL: - if (val < 1 || val > MAX_TCP_KEEPINTVL) + case TCP_SAVE_SYN: + /* 0: disable, 1: enable, 2: start from ether_header */ + if (val < 0 || val > 2) err = -EINVAL; else - tp->keepalive_intvl = val * HZ; - break; - case TCP_KEEPCNT: - if (val < 1 || val > MAX_TCP_KEEPCNT) - err = -EINVAL; - else - tp->keepalive_probes = val; - break; - case TCP_SYNCNT: - if (val < 1 || val > MAX_TCP_SYNCNT) - err = -EINVAL; - else - icsk->icsk_syn_retries = val; - break; - - case TCP_LINGER2: - if (val < 0) - tp->linger2 = -1; - else if (val > sysctl_tcp_fin_timeout / HZ) - tp->linger2 = 0; - else - tp->linger2 = val * HZ; - break; - - case TCP_DEFER_ACCEPT: - /* Translate value in seconds to number of retransmits */ - icsk->icsk_accept_queue.rskq_defer_accept = - secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, - TCP_RTO_MAX / HZ); + tp->save_syn = val; break; case TCP_WINDOW_CLAMP: - if (!val) { - if (sk->sk_state != TCP_CLOSE) { - err = -EINVAL; - break; - } - tp->window_clamp = 0; - } else - tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? - SOCK_MIN_RCVBUF / 2 : val; + err = tcp_set_window_clamp(sk, val); break; case TCP_QUICKACK: - if (!val) { - icsk->icsk_ack.pingpong = 1; - } else { - icsk->icsk_ack.pingpong = 0; - if ((1 << sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && - inet_csk_ack_scheduled(sk)) { - icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - tcp_cleanup_rbuf(sk, 1); - if (!(val & 1)) - icsk->icsk_ack.pingpong = 1; - } - } + __tcp_sock_set_quickack(sk, val); break; + case TCP_AO_REPAIR: + if (!tcp_can_repair_sock(sk)) { + err = -EPERM; + break; + } + err = tcp_ao_set_repair(sk, optval, optlen); + break; +#ifdef CONFIG_TCP_AO + case TCP_AO_ADD_KEY: + case TCP_AO_DEL_KEY: + case TCP_AO_INFO: { + /* If this is the first TCP-AO setsockopt() on the socket, + * sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR + * in any state. + */ + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + goto ao_parse; + if (rcu_dereference_protected(tcp_sk(sk)->ao_info, + lockdep_sock_is_held(sk))) + goto ao_parse; + if (tp->repair) + goto ao_parse; + err = -EISCONN; + break; +ao_parse: + err = tp->af_specific->ao_parse(sk, optname, optval, optlen); + break; + } +#endif #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: - /* Read the IP->Key mappings from userspace */ - err = tp->af_specific->md5_parse(sk, optval, optlen); + case TCP_MD5SIG_EXT: + err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif - case TCP_USER_TIMEOUT: - /* Cap the max timeout in ms TCP will retry/retrans - * before giving up and aborting (ETIMEDOUT) a connection. - */ - if (val < 0) - err = -EINVAL; - else - icsk->icsk_user_timeout = msecs_to_jiffies(val); - break; - case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | - TCPF_LISTEN))) - err = fastopen_init_queue(sk, val); - else + TCPF_LISTEN))) { + tcp_fastopen_init_key_once(net); + + fastopen_queue_tune(sk, val); + } else { err = -EINVAL; + } + break; + case TCP_FASTOPEN_CONNECT: + if (val > 1 || val < 0) { + err = -EINVAL; + } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) & + TFO_CLIENT_ENABLE) { + if (sk->sk_state == TCP_CLOSE) + tp->fastopen_connect = val; + else + err = -EINVAL; + } else { + err = -EOPNOTSUPP; + } + break; + case TCP_FASTOPEN_NO_COOKIE: + if (val > 1 || val < 0) + err = -EINVAL; + else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + err = -EINVAL; + else + tp->fastopen_no_cookie = val; break; case TCP_TIMESTAMP: - if (!tp->repair) + if (!tp->repair) { err = -EPERM; + break; + } + /* val is an opaque field, + * and low order bit contains usec_ts enable bit. + * Its a best effort, and we do not care if user makes an error. + */ + tp->tcp_usec_ts = val & 1; + WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts)); + break; + case TCP_REPAIR_WINDOW: + err = tcp_repair_set_window(tp, optval, optlen); + break; + case TCP_NOTSENT_LOWAT: + WRITE_ONCE(tp->notsent_lowat, val); + sk->sk_write_space(sk); + break; + case TCP_INQ: + if (val > 1 || val < 0) + err = -EINVAL; else - tp->tsoffset = val - tcp_time_stamp; + tp->recvmsg_inq = val; + break; + case TCP_TX_DELAY: + /* tp->srtt_us is u32, and is shifted by 3 */ + if (val < 0 || val >= (1U << (31 - 3))) { + err = -EINVAL; + break; + } + tcp_enable_tx_delay(sk, val); + WRITE_ONCE(tp->tcp_tx_delay, val); break; default: err = -ENOPROTOOPT; break; } - release_sock(sk); + sockopt_release_sock(sk); return err; } -int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, +int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) - return icsk->icsk_af_ops->setsockopt(sk, level, optname, - optval, optlen); + /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ + return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname, + optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } -EXPORT_SYMBOL(tcp_setsockopt); +EXPORT_IPV6_MOD(tcp_setsockopt); -#ifdef CONFIG_COMPAT -int compat_tcp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) +static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, + struct tcp_info *info) { - if (level != SOL_TCP) - return inet_csk_compat_setsockopt(sk, level, optname, - optval, optlen); - return do_tcp_setsockopt(sk, level, optname, optval, optlen); + u64 stats[__TCP_CHRONO_MAX], total = 0; + enum tcp_chrono i; + + for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { + stats[i] = tp->chrono_stat[i - 1]; + if (i == tp->chrono_type) + stats[i] += tcp_jiffies32 - tp->chrono_start; + stats[i] *= USEC_PER_SEC / HZ; + total += stats[i]; + } + + info->tcpi_busy_time = total; + info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED]; + info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; } -EXPORT_SYMBOL(compat_tcp_setsockopt); -#endif /* Return information about state of tcp endpoint in API format. */ -void tcp_get_info(const struct sock *sk, struct tcp_info *info) +void tcp_get_info(struct sock *sk, struct tcp_info *info) { - const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); - u32 now = tcp_time_stamp; + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ce_idx = INET_ECN_CE - 1; + unsigned long rate; + u32 now; + u64 rate64; + bool slow; memset(info, 0, sizeof(*info)); + if (sk->sk_type != SOCK_STREAM) + return; + + info->tcpi_state = inet_sk_state_load(sk); + + /* Report meaningful fields for all TCP states, including listeners */ + rate = READ_ONCE(sk->sk_pacing_rate); + rate64 = (rate != ~0UL) ? rate : ~0ULL; + info->tcpi_pacing_rate = rate64; + + rate = READ_ONCE(sk->sk_max_pacing_rate); + rate64 = (rate != ~0UL) ? rate : ~0ULL; + info->tcpi_max_pacing_rate = rate64; + + info->tcpi_reordering = tp->reordering; + info->tcpi_snd_cwnd = tcp_snd_cwnd(tp); + + if (info->tcpi_state == TCP_LISTEN) { + /* listeners aliased fields : + * tcpi_unacked -> Number of children ready for accept() + * tcpi_sacked -> max backlog + */ + info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); + info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); + return; + } + + slow = lock_sock_fast(sk); - info->tcpi_state = sk->sk_state; info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; @@ -2689,69 +4246,231 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; } - if (tp->ecn_flags & TCP_ECN_OK) + if (tcp_ecn_mode_any(tp)) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; if (tp->syn_data_acked) info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) + info->tcpi_options |= TCPI_OPT_USEC_TS; + if (tp->syn_fastopen_child) + info->tcpi_options |= TCPI_OPT_TFO_CHILD; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); - info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); + info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato, + tcp_delack_max(sk))); info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; - if (sk->sk_state == TCP_LISTEN) { - info->tcpi_unacked = sk->sk_ack_backlog; - info->tcpi_sacked = sk->sk_max_ack_backlog; - } else { - info->tcpi_unacked = tp->packets_out; - info->tcpi_sacked = tp->sacked_out; - } + info->tcpi_unacked = tp->packets_out; + info->tcpi_sacked = tp->sacked_out; + info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; - info->tcpi_fackets = tp->fackets_out; + now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; - info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; - info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; + info->tcpi_rtt = tp->srtt_us >> 3; + info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; - info->tcpi_snd_cwnd = tp->snd_cwnd; info->tcpi_advmss = tp->advmss; - info->tcpi_reordering = tp->reordering; - info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; + info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; + + info->tcpi_bytes_acked = tp->bytes_acked; + info->tcpi_bytes_received = tp->bytes_received; + info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); + tcp_get_info_chrono_stats(tp, info); + + info->tcpi_segs_out = tp->segs_out; + + /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */ + info->tcpi_segs_in = READ_ONCE(tp->segs_in); + info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in); + + info->tcpi_min_rtt = tcp_min_rtt(tp); + info->tcpi_data_segs_out = tp->data_segs_out; + + info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0; + rate64 = tcp_compute_delivery_rate(tp); + if (rate64) + info->tcpi_delivery_rate = rate64; + info->tcpi_delivered = tp->delivered; + info->tcpi_delivered_ce = tp->delivered_ce; + info->tcpi_bytes_sent = tp->bytes_sent; + info->tcpi_bytes_retrans = tp->bytes_retrans; + info->tcpi_dsack_dups = tp->dsack_dups; + info->tcpi_reord_seen = tp->reord_seen; + info->tcpi_rcv_ooopack = tp->rcv_ooopack; + info->tcpi_snd_wnd = tp->snd_wnd; + info->tcpi_rcv_wnd = tp->rcv_wnd; + info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; + info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; + + info->tcpi_total_rto = tp->total_rto; + info->tcpi_total_rto_recoveries = tp->total_rto_recoveries; + info->tcpi_total_rto_time = tp->total_rto_time; + if (tp->rto_stamp) + info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + + info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; + info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; + info->tcpi_received_ce = tp->received_ce; + info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; + info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; + info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; + info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; + info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; + info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx]; + + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); -static int do_tcp_getsockopt(struct sock *sk, int level, - int optname, char __user *optval, int __user *optlen) +static size_t tcp_opt_stats_get_size(void) +{ + return + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ + nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ + 0; +} + +/* Returns TTL or hop limit of an incoming packet from skb. */ +static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) + return ip_hdr(skb)->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + return ipv6_hdr(skb)->hop_limit; + else + return 0; +} + +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, + const struct sk_buff *orig_skb, + const struct sk_buff *ack_skb) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *stats; + struct tcp_info info; + unsigned long rate; + u64 rate64; + + stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); + if (!stats) + return NULL; + + tcp_get_info_chrono_stats(tp, &info); + nla_put_u64_64bit(stats, TCP_NLA_BUSY, + info.tcpi_busy_time, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, + info.tcpi_rwnd_limited, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, + info.tcpi_sndbuf_limited, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT, + tp->data_segs_out, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, + tp->total_retrans, TCP_NLA_PAD); + + rate = READ_ONCE(sk->sk_pacing_rate); + rate64 = (rate != ~0UL) ? rate : ~0ULL; + nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); + + rate64 = tcp_compute_delivery_rate(tp); + nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); + + nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp)); + nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); + nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); + + nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, + READ_ONCE(inet_csk(sk)->icsk_retransmits)); + nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); + nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); + nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); + nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); + + nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); + nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); + + nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, + TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, + TCP_NLA_PAD); + nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); + nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); + nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); + nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); + nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, + max_t(int, 0, tp->write_seq - tp->snd_nxt)); + nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, + TCP_NLA_PAD); + if (ack_skb) + nla_put_u8(stats, TCP_NLA_TTL, + tcp_skb_ttl_or_hop_limit(ack_skb)); + + nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); + return stats; +} + +int do_tcp_getsockopt(struct sock *sk, int level, + int optname, sockptr_t optval, sockptr_t optlen) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + int user_mss; int val, len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; - len = min_t(unsigned int, len, sizeof(int)); - if (len < 0) return -EINVAL; + len = min_t(unsigned int, len, sizeof(int)); + switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; - if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) - val = tp->rx_opt.user_mss; + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss && + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + val = user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; break; @@ -2771,54 +4490,109 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = keepalive_probes(tp); break; case TCP_SYNCNT: - val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + val = READ_ONCE(icsk->icsk_syn_retries) ? : + READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); break; case TCP_LINGER2: - val = tp->linger2; + val = READ_ONCE(tp->linger2); if (val >= 0) - val = (val ? : sysctl_tcp_fin_timeout) / HZ; + val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: - val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, - TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); + val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept); + val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: - val = tp->window_clamp; + val = READ_ONCE(tp->window_clamp); break; case TCP_INFO: { struct tcp_info info; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; tcp_get_info(sk, &info); len = min_t(unsigned int, len, sizeof(info)); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) + return -EFAULT; + if (copy_to_sockptr(optval, &info, len)) + return -EFAULT; + return 0; + } + case TCP_CC_INFO: { + const struct tcp_congestion_ops *ca_ops; + union tcp_cc_info info; + size_t sz = 0; + int attr; + + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &info, len)) + + ca_ops = icsk->icsk_ca_ops; + if (ca_ops && ca_ops->get_info) + sz = ca_ops->get_info(sk, ~0U, &attr, &info); + + len = min_t(unsigned int, len, sz); + if (copy_to_sockptr(optlen, &len, sizeof(int))) + return -EFAULT; + if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } case TCP_QUICKACK: - val = !icsk->icsk_ack.pingpong; + val = !inet_csk_in_pingpong_mode(sk); break; case TCP_CONGESTION: - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_CA_NAME_MAX); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) + return -EFAULT; + if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; - if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) + return 0; + + case TCP_ULP: + if (copy_from_sockptr(&len, optlen, sizeof(int))) + return -EFAULT; + len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); + if (!icsk->icsk_ulp_ops) { + len = 0; + if (copy_to_sockptr(optlen, &len, sizeof(int))) + return -EFAULT; + return 0; + } + if (copy_to_sockptr(optlen, &len, sizeof(int))) + return -EFAULT; + if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len)) return -EFAULT; return 0; + case TCP_FASTOPEN_KEY: { + u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; + unsigned int key_len; + + if (copy_from_sockptr(&len, optlen, sizeof(int))) + return -EFAULT; + + key_len = tcp_fastopen_get_cipher(net, icsk, key) * + TCP_FASTOPEN_KEY_LENGTH; + len = min_t(unsigned int, len, key_len); + if (copy_to_sockptr(optlen, &len, sizeof(int))) + return -EFAULT; + if (copy_to_sockptr(optval, key, len)) + return -EFAULT; + return 0; + } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; + case TCP_THIN_DUPACK: - val = tp->thin_dupack; + val = 0; break; case TCP_REPAIR: @@ -2832,6 +4606,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EINVAL; break; + case TCP_REPAIR_WINDOW: { + struct tcp_repair_window opt; + + if (copy_from_sockptr(&len, optlen, sizeof(int))) + return -EFAULT; + + if (len != sizeof(opt)) + return -EINVAL; + + if (!tp->repair) + return -EPERM; + + opt.snd_wl1 = tp->snd_wl1; + opt.snd_wnd = tp->snd_wnd; + opt.max_window = tp->max_window; + opt.rcv_wnd = tp->rcv_wnd; + opt.rcv_wup = tp->rcv_wup; + + if (copy_to_sockptr(optval, &opt, len)) + return -EFAULT; + return 0; + } case TCP_QUEUE_SEQ: if (tp->repair_queue == TCP_SEND_QUEUE) val = tp->write_seq; @@ -2842,206 +4638,384 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_USER_TIMEOUT: - val = jiffies_to_msecs(icsk->icsk_user_timeout); + val = READ_ONCE(icsk->icsk_user_timeout); + break; + + case TCP_FASTOPEN: + val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen); + break; + + case TCP_FASTOPEN_CONNECT: + val = tp->fastopen_connect; + break; + + case TCP_FASTOPEN_NO_COOKIE: + val = tp->fastopen_no_cookie; + break; + + case TCP_TX_DELAY: + val = READ_ONCE(tp->tcp_tx_delay); break; + case TCP_TIMESTAMP: - val = tcp_time_stamp + tp->tsoffset; + val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset); + if (tp->tcp_usec_ts) + val |= 1; + else + val &= ~1; + break; + case TCP_NOTSENT_LOWAT: + val = READ_ONCE(tp->notsent_lowat); + break; + case TCP_INQ: + val = tp->recvmsg_inq; + break; + case TCP_SAVE_SYN: + val = tp->save_syn; + break; + case TCP_SAVED_SYN: { + if (copy_from_sockptr(&len, optlen, sizeof(int))) + return -EFAULT; + + sockopt_lock_sock(sk); + if (tp->saved_syn) { + if (len < tcp_saved_syn_len(tp->saved_syn)) { + len = tcp_saved_syn_len(tp->saved_syn); + if (copy_to_sockptr(optlen, &len, sizeof(int))) { + sockopt_release_sock(sk); + return -EFAULT; + } + sockopt_release_sock(sk); + return -EINVAL; + } + len = tcp_saved_syn_len(tp->saved_syn); + if (copy_to_sockptr(optlen, &len, sizeof(int))) { + sockopt_release_sock(sk); + return -EFAULT; + } + if (copy_to_sockptr(optval, tp->saved_syn->data, len)) { + sockopt_release_sock(sk); + return -EFAULT; + } + tcp_saved_syn_free(tp); + sockopt_release_sock(sk); + } else { + sockopt_release_sock(sk); + len = 0; + if (copy_to_sockptr(optlen, &len, sizeof(int))) + return -EFAULT; + } + return 0; + } +#ifdef CONFIG_MMU + case TCP_ZEROCOPY_RECEIVE: { + struct scm_timestamping_internal tss; + struct tcp_zerocopy_receive zc = {}; + int err; + + if (copy_from_sockptr(&len, optlen, sizeof(int))) + return -EFAULT; + if (len < 0 || + len < offsetofend(struct tcp_zerocopy_receive, length)) + return -EINVAL; + if (unlikely(len > sizeof(zc))) { + err = check_zeroed_sockptr(optval, sizeof(zc), + len - sizeof(zc)); + if (err < 1) + return err == 0 ? -EINVAL : err; + len = sizeof(zc); + if (copy_to_sockptr(optlen, &len, sizeof(int))) + return -EFAULT; + } + if (copy_from_sockptr(&zc, optval, len)) + return -EFAULT; + if (zc.reserved) + return -EINVAL; + if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) + return -EINVAL; + sockopt_lock_sock(sk); + err = tcp_zerocopy_receive(sk, &zc, &tss); + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, + &zc, &len, err); + sockopt_release_sock(sk); + if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) + goto zerocopy_rcv_cmsg; + switch (len) { + case offsetofend(struct tcp_zerocopy_receive, msg_flags): + goto zerocopy_rcv_cmsg; + case offsetofend(struct tcp_zerocopy_receive, msg_controllen): + case offsetofend(struct tcp_zerocopy_receive, msg_control): + case offsetofend(struct tcp_zerocopy_receive, flags): + case offsetofend(struct tcp_zerocopy_receive, copybuf_len): + case offsetofend(struct tcp_zerocopy_receive, copybuf_address): + case offsetofend(struct tcp_zerocopy_receive, err): + goto zerocopy_rcv_sk_err; + case offsetofend(struct tcp_zerocopy_receive, inq): + goto zerocopy_rcv_inq; + case offsetofend(struct tcp_zerocopy_receive, length): + default: + goto zerocopy_rcv_out; + } +zerocopy_rcv_cmsg: + if (zc.msg_flags & TCP_CMSG_TS) + tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); + else + zc.msg_flags = 0; +zerocopy_rcv_sk_err: + if (!err) + zc.err = sock_error(sk); +zerocopy_rcv_inq: + zc.inq = tcp_inq_hint(sk); +zerocopy_rcv_out: + if (!err && copy_to_sockptr(optval, &zc, len)) + err = -EFAULT; + return err; + } +#endif + case TCP_AO_REPAIR: + if (!tcp_can_repair_sock(sk)) + return -EPERM; + return tcp_ao_get_repair(sk, optval, optlen); + case TCP_AO_GET_KEYS: + case TCP_AO_INFO: { + int err; + + sockopt_lock_sock(sk); + if (optname == TCP_AO_GET_KEYS) + err = tcp_ao_get_mkts(sk, optval, optlen); + else + err = tcp_ao_get_sock_info(sk, optval, optlen); + sockopt_release_sock(sk); + + return err; + } + case TCP_IS_MPTCP: + val = 0; + break; + case TCP_RTO_MAX_MS: + val = jiffies_to_msecs(tcp_rto_max(sk)); + break; + case TCP_RTO_MIN_US: + val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min)); + break; + case TCP_DELACK_MAX_US: + val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_delack_max)); break; default: return -ENOPROTOOPT; } - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &val, len)) + if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } -int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, - int __user *optlen) +bool tcp_bpf_bypass_getsockopt(int level, int optname) { - struct inet_connection_sock *icsk = inet_csk(sk); + /* TCP do_tcp_getsockopt has optimized getsockopt implementation + * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. + */ + if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) + return true; - if (level != SOL_TCP) - return icsk->icsk_af_ops->getsockopt(sk, level, optname, - optval, optlen); - return do_tcp_getsockopt(sk, level, optname, optval, optlen); + return false; } -EXPORT_SYMBOL(tcp_getsockopt); +EXPORT_IPV6_MOD(tcp_bpf_bypass_getsockopt); -#ifdef CONFIG_COMPAT -int compat_tcp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) +int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, + int __user *optlen) { + struct inet_connection_sock *icsk = inet_csk(sk); + if (level != SOL_TCP) - return inet_csk_compat_getsockopt(sk, level, optname, - optval, optlen); - return do_tcp_getsockopt(sk, level, optname, optval, optlen); + /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ + return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname, + optval, optlen); + return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), + USER_SOCKPTR(optlen)); } -EXPORT_SYMBOL(compat_tcp_getsockopt); -#endif +EXPORT_IPV6_MOD(tcp_getsockopt); #ifdef CONFIG_TCP_MD5SIG -static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly; -static DEFINE_MUTEX(tcp_md5sig_mutex); - -static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) +void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb, + unsigned int header_len) { - int cpu; + const unsigned int head_data_len = skb_headlen(skb) > header_len ? + skb_headlen(skb) - header_len : 0; + const struct skb_shared_info *shi = skb_shinfo(skb); + struct sk_buff *frag_iter; + unsigned int i; - for_each_possible_cpu(cpu) { - struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu); + md5_update(ctx, (const u8 *)tcp_hdr(skb) + header_len, head_data_len); - if (p->md5_desc.tfm) - crypto_free_hash(p->md5_desc.tfm); + for (i = 0; i < shi->nr_frags; ++i) { + const skb_frag_t *f = &shi->frags[i]; + u32 p_off, p_len, copied; + const void *vaddr; + struct page *p; + + skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), + p, p_off, p_len, copied) { + vaddr = kmap_local_page(p); + md5_update(ctx, vaddr + p_off, p_len); + kunmap_local(vaddr); + } } - free_percpu(pool); + + skb_walk_frags(skb, frag_iter) + tcp_md5_hash_skb_data(ctx, frag_iter, 0); } +EXPORT_IPV6_MOD(tcp_md5_hash_skb_data); -static void __tcp_alloc_md5sig_pool(void) +void tcp_md5_hash_key(struct md5_ctx *ctx, + const struct tcp_md5sig_key *key) { - int cpu; - struct tcp_md5sig_pool __percpu *pool; + u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ - pool = alloc_percpu(struct tcp_md5sig_pool); - if (!pool) - return; - - for_each_possible_cpu(cpu) { - struct crypto_hash *hash; - - hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR_OR_NULL(hash)) - goto out_free; - - per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; - } - /* before setting tcp_md5sig_pool, we must commit all writes - * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool() + /* We use data_race() because tcp_md5_do_add() might change + * key->key under us */ - smp_wmb(); - tcp_md5sig_pool = pool; - return; -out_free: - __tcp_free_md5sig_pool(pool); + data_race(({ md5_update(ctx, key->key, keylen), 0; })); } +EXPORT_IPV6_MOD(tcp_md5_hash_key); -bool tcp_alloc_md5sig_pool(void) +/* Called with rcu_read_lock() */ +static enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int l3index, const __u8 *hash_location) { - if (unlikely(!tcp_md5sig_pool)) { - mutex_lock(&tcp_md5sig_mutex); - - if (!tcp_md5sig_pool) - __tcp_alloc_md5sig_pool(); + /* This gets called for each TCP segment that has TCP-MD5 option. + * We have 2 drop cases: + * o An MD5 signature is present, but we're not expecting one. + * o The MD5 signature is wrong. + */ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_md5sig_key *key; + u8 newhash[16]; + + key = tcp_md5_do_lookup(sk, l3index, saddr, family); + if (!key) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); + trace_tcp_hash_md5_unexpected(sk, skb); + return SKB_DROP_REASON_TCP_MD5UNEXPECTED; + } - mutex_unlock(&tcp_md5sig_mutex); + /* Check the signature. + * To support dual stack listeners, we need to handle + * IPv4-mapped case. + */ + if (family == AF_INET) + tcp_v4_md5_hash_skb(newhash, key, NULL, skb); + else + tp->af_specific->calc_md5_hash(newhash, key, NULL, skb); + if (memcmp(hash_location, newhash, 16) != 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); + trace_tcp_hash_md5_mismatch(sk, skb); + return SKB_DROP_REASON_TCP_MD5FAILURE; } - return tcp_md5sig_pool != NULL; + return SKB_NOT_DROPPED_YET; } -EXPORT_SYMBOL(tcp_alloc_md5sig_pool); - - -/** - * tcp_get_md5sig_pool - get md5sig_pool for this user - * - * We use percpu structure, so if we succeed, we exit with preemption - * and BH disabled, to make sure another thread or softirq handling - * wont try to get same context. - */ -struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) +#else +static inline enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int l3index, const __u8 *hash_location) { - struct tcp_md5sig_pool __percpu *p; - - local_bh_disable(); - p = ACCESS_ONCE(tcp_md5sig_pool); - if (p) - return __this_cpu_ptr(p); - - local_bh_enable(); - return NULL; + return SKB_NOT_DROPPED_YET; } -EXPORT_SYMBOL(tcp_get_md5sig_pool); -int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, - const struct tcphdr *th) -{ - struct scatterlist sg; - struct tcphdr hdr; - int err; - - /* We are not allowed to change tcphdr, make a local copy */ - memcpy(&hdr, th, sizeof(hdr)); - hdr.check = 0; - - /* options aren't included in the hash */ - sg_init_one(&sg, &hdr, sizeof(hdr)); - err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr)); - return err; -} -EXPORT_SYMBOL(tcp_md5_hash_header); +#endif -int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, - const struct sk_buff *skb, unsigned int header_len) +/* Called with rcu_read_lock() */ +enum skb_drop_reason +tcp_inbound_hash(struct sock *sk, const struct request_sock *req, + const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif) { - struct scatterlist sg; - const struct tcphdr *tp = tcp_hdr(skb); - struct hash_desc *desc = &hp->md5_desc; - unsigned int i; - const unsigned int head_data_len = skb_headlen(skb) > header_len ? - skb_headlen(skb) - header_len : 0; - const struct skb_shared_info *shi = skb_shinfo(skb); - struct sk_buff *frag_iter; - - sg_init_table(&sg, 1); + const struct tcphdr *th = tcp_hdr(skb); + const struct tcp_ao_hdr *aoh; + const __u8 *md5_location; + int l3index; + + /* Invalid option or two times meet any of auth options */ + if (tcp_parse_auth_options(th, &md5_location, &aoh)) { + trace_tcp_hash_bad_header(sk, skb); + return SKB_DROP_REASON_TCP_AUTH_HDR; + } - sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); - if (crypto_hash_update(desc, &sg, head_data_len)) - return 1; + if (req) { + if (tcp_rsk_used_ao(req) != !!aoh) { + u8 keyid, rnext, maclen; - for (i = 0; i < shi->nr_frags; ++i) { - const struct skb_frag_struct *f = &shi->frags[i]; - unsigned int offset = f->page_offset; - struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); + if (aoh) { + keyid = aoh->keyid; + rnext = aoh->rnext_keyid; + maclen = tcp_ao_hdr_maclen(aoh); + } else { + keyid = rnext = maclen = 0; + } - sg_set_page(&sg, page, skb_frag_size(f), - offset_in_page(offset)); - if (crypto_hash_update(desc, &sg, skb_frag_size(f))) - return 1; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); + trace_tcp_ao_handshake_failure(sk, skb, keyid, rnext, maclen); + return SKB_DROP_REASON_TCP_AOFAILURE; + } } - skb_walk_frags(skb, frag_iter) - if (tcp_md5_hash_skb_data(hp, frag_iter, 0)) - return 1; + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to the l3mdev + */ + l3index = sdif ? dif : 0; + + /* Fast path: unsigned segments */ + if (likely(!md5_location && !aoh)) { + /* Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid + * for the remote peer. On TCP-AO established connection + * the last key is impossible to remove, so there's + * always at least one current_key. + */ + if (tcp_ao_required(sk, saddr, family, l3index, true)) { + trace_tcp_hash_ao_required(sk, skb); + return SKB_DROP_REASON_TCP_AONOTFOUND; + } + if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); + trace_tcp_hash_md5_required(sk, skb); + return SKB_DROP_REASON_TCP_MD5NOTFOUND; + } + return SKB_NOT_DROPPED_YET; + } - return 0; -} -EXPORT_SYMBOL(tcp_md5_hash_skb_data); + if (aoh) + return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh); -int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) -{ - struct scatterlist sg; - - sg_init_one(&sg, key->key, key->keylen); - return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); + return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family, + l3index, md5_location); } -EXPORT_SYMBOL(tcp_md5_hash_key); - -#endif +EXPORT_IPV6_MOD_GPL(tcp_inbound_hash); void tcp_done(struct sock *sk) { - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + struct request_sock *req; + + /* We might be called with a new socket, after + * inet_csk_prepare_forced_close() has been called + * so we can not use lockdep_sock_is_held(sk) + */ + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); - if (req != NULL) + if (req) reqsk_fastopen_remove(sk, req, false); - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); @@ -3050,6 +5024,62 @@ void tcp_done(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_done); +int tcp_abort(struct sock *sk, int err) +{ + int state = inet_sk_state_load(sk); + + if (state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + + local_bh_disable(); + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + local_bh_enable(); + return 0; + } + if (state == TCP_TIME_WAIT) { + struct inet_timewait_sock *tw = inet_twsk(sk); + + refcount_inc(&tw->tw_refcnt); + local_bh_disable(); + inet_twsk_deschedule_put(tw); + local_bh_enable(); + return 0; + } + + /* BPF context ensures sock locking. */ + if (!has_current_bpf_ctx()) + /* Don't race with userspace socket closes such as tcp_close. */ + lock_sock(sk); + + /* Avoid closing the same socket twice. */ + if (sk->sk_state == TCP_CLOSE) { + if (!has_current_bpf_ctx()) + release_sock(sk); + return -ENOENT; + } + + if (sk->sk_state == TCP_LISTEN) { + tcp_set_state(sk, TCP_CLOSE); + inet_csk_listen_stop(sk); + } + + /* Don't race with BH socket closes such as inet_csk_listen_stop. */ + local_bh_disable(); + bh_lock_sock(sk); + + if (tcp_need_reset(sk->sk_state)) + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_TCP_STATE); + tcp_done_with_error(sk, err); + + bh_unlock_sock(sk); + local_bh_enable(); + if (!has_current_bpf_ctx()) + release_sock(sk); + return 0; +} +EXPORT_SYMBOL_GPL(tcp_abort); + extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; @@ -3068,30 +5098,140 @@ static int __init set_thash_entries(char *str) } __setup("thash_entries=", set_thash_entries); -void tcp_init_mem(struct net *net) +static void __init tcp_init_mem(void) { - unsigned long limit = nr_free_buffer_pages() / 8; + unsigned long limit = nr_free_buffer_pages() / 16; + limit = max(limit, 128UL); - net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; - net->ipv4.sysctl_tcp_mem[1] = limit; - net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; + sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ + sysctl_tcp_mem[1] = limit; /* 6.25 % */ + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ +} + +static void __init tcp_struct_check(void) +{ + /* TX read-mostly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); +#if IS_ENABLED(CONFIG_TLS_DEVICE) + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked); +#endif + + /* TXRX read-mostly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio); + + /* RX read-mostly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); + + /* TX read-write hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); + + /* TXRX read-write hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); + + /* RX read-write hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); } void __init tcp_init(void) { - struct sk_buff *skb = NULL; - unsigned long limit; int max_rshare, max_wshare, cnt; + unsigned long limit; unsigned int i; - BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); + BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); + BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > + sizeof_field(struct sk_buff, cb)); + + tcp_struct_check(); + + percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); + + timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); + mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); - percpu_counter_init(&tcp_sockets_allocated, 0); - percpu_counter_init(&tcp_orphan_count, 0); + inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", + thash_entries, 21, /* one slot per 2 MB*/ + 0, 64 * 1024); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT, + NULL); + tcp_hashinfo.bind2_bucket_cachep = + kmem_cache_create("tcp_bind2_bucket", + sizeof(struct inet_bind2_bucket), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT, + NULL); /* Size and allocate the main established and bind bucket * hash tables. @@ -3108,15 +5248,14 @@ void __init tcp_init(void) &tcp_hashinfo.ehash_mask, 0, thash_entries ? 0 : 512 * 1024); - for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { + for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); - INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); - } + if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", - sizeof(struct inet_bind_hashbucket), + 2 * sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_mask + 1, 17, /* one slot per 128 KB of memory */ 0, @@ -3125,38 +5264,39 @@ void __init tcp_init(void) 0, 64 * 1024); tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; + tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); + spin_lock_init(&tcp_hashinfo.bhash2[i].lock); + INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain); } + tcp_hashinfo.pernet = false; cnt = tcp_hashinfo.ehash_mask + 1; - - tcp_death_row.sysctl_max_tw_buckets = cnt / 2; sysctl_tcp_max_orphans = cnt / 2; - sysctl_max_syn_backlog = max(128, cnt / 256); - tcp_init_mem(&init_net); + tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); max_wshare = min(4UL*1024*1024, limit); - max_rshare = min(6UL*1024*1024, limit); + max_rshare = min(32UL*1024*1024, limit); - sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_wmem[1] = 16*1024; - sysctl_tcp_wmem[2] = max(64*1024, max_wshare); + init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; + init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); - sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_rmem[1] = 87380; - sysctl_tcp_rmem[2] = max(87380, max_rshare); + init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE; + init_net.ipv4.sysctl_tcp_rmem[1] = 131072; + init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); + tcp_v4_init(); tcp_metrics_init(); - - tcp_register_congestion_control(&tcp_reno); - - tcp_tasklet_init(); + BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); + tcp_tsq_work_init(); + mptcp_init(); } |
