diff options
Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 303 |
1 files changed, 185 insertions, 118 deletions
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a22ee5838751..6a14f9e6fef6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -66,10 +66,12 @@ #include <net/transp_v6.h> #include <net/ipv6.h> #include <net/inet_common.h> +#include <net/inet_ecn.h> #include <net/timewait_sock.h> #include <net/xfrm.h> #include <net/secure_seq.h> #include <net/busy_poll.h> +#include <net/rstreason.h> #include <linux/inet.h> #include <linux/ipv6.h> @@ -78,6 +80,7 @@ #include <linux/seq_file.h> #include <linux/inetdevice.h> #include <linux/btf_ids.h> +#include <linux/skbuff_ref.h> #include <crypto/hash.h> #include <linux/scatterlist.h> @@ -90,9 +93,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, #endif struct inet_hashinfo tcp_hashinfo; -EXPORT_SYMBOL(tcp_hashinfo); -static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk); +static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; + +static DEFINE_MUTEX(tcp_exit_batch_mutex); static u32 tcp_v4_init_seq(const struct sk_buff *skb) { @@ -113,6 +119,11 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) const struct inet_timewait_sock *tw = inet_twsk(sktw); const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); + int ts_recent_stamp; + u32 reuse_thresh; + + if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) + reuse = 0; if (reuse == 2) { /* Still does not detect *everything* that goes through @@ -151,9 +162,17 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) If TW bucket has been already destroyed we fall back to VJ's scheme and use initial timestamp retrieved from peer table. */ - if (tcptw->tw_ts_recent_stamp && - (!twp || (reuse && time_after32(ktime_get_seconds(), - tcptw->tw_ts_recent_stamp)))) { + ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); + reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); + if (ts_recent_stamp && + (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { + /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk + * and releasing the bucket lock. + */ + if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) + return 0; + /* In case of repair and re-using TIME-WAIT sockets we still * want to be sure that it is safe as above but honor the * sequence numbers and time stamps set as part of the repair @@ -171,16 +190,16 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) if (!seq) seq = 1; WRITE_ONCE(tp->write_seq, seq); - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); + tp->rx_opt.ts_recent_stamp = ts_recent_stamp; } - sock_hold(sktw); + return 1; } return 0; } -EXPORT_SYMBOL_GPL(tcp_twsk_unique); +EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -340,7 +359,7 @@ failure: inet->inet_dport = 0; return err; } -EXPORT_SYMBOL(tcp_v4_connect); +EXPORT_IPV6_MOD(tcp_v4_connect); /* * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. @@ -381,7 +400,7 @@ void tcp_v4_mtu_reduced(struct sock *sk) tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ } -EXPORT_SYMBOL(tcp_v4_mtu_reduced); +EXPORT_IPV6_MOD(tcp_v4_mtu_reduced); static void do_redirect(struct sk_buff *skb, struct sock *sk) { @@ -415,7 +434,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) } reqsk_put(req); } -EXPORT_SYMBOL(tcp_req_err); +EXPORT_IPV6_MOD(tcp_req_err); /* TCP-LD (RFC 6069) logic */ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) @@ -439,15 +458,14 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) icsk->icsk_backoff--; icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; - icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); if (remaining > 0) { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - remaining, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); } else { /* RTO revert clocked out retransmission. * Will retransmit now. @@ -455,7 +473,7 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) tcp_retransmit_timer(sk); } } -EXPORT_SYMBOL(tcp_ld_RTO_revert); +EXPORT_IPV6_MOD(tcp_ld_RTO_revert); /* * This routine is called by the ICMP module when it gets some @@ -477,14 +495,14 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); - struct tcp_sock *tp; + struct net *net = dev_net_rcu(skb->dev); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - struct sock *sk; struct request_sock *fastopen; + struct tcp_sock *tp; u32 seq, snd_una; + struct sock *sk; int err; - struct net *net = dev_net(skb->dev); sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, iph->daddr, th->dest, iph->saddr, @@ -604,15 +622,10 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); - if (!sock_owned_by_user(sk)) { - WRITE_ONCE(sk->sk_err, err); - - sk_error_report(sk); - - tcp_done(sk); - } else { + if (!sock_owned_by_user(sk)) + tcp_done_with_error(sk, err); + else WRITE_ONCE(sk->sk_err_soft, err); - } goto out; } @@ -662,7 +675,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); } -EXPORT_SYMBOL(tcp_v4_send_check); +EXPORT_IPV6_MOD(tcp_v4_send_check); #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) @@ -723,7 +736,8 @@ out: * Exception: precedence violation. We do not implement it in any case. */ -static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) +static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, + enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -773,7 +787,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); - net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); + net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) @@ -866,23 +880,25 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ - if (sk) { + if (sk) arg.bound_dev_if = sk->sk_bound_dev_if; - if (sk_fullsock(sk)) - trace_tcp_send_reset(sk, skb); - } + + trace_tcp_send_reset(sk, skb, reason); BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); - arg.tos = ip_hdr(skb)->tos; + /* ECN bits of TW reset are cleared */ + arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = this_cpu_read(ipv4_tcp_sk); + local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); + ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); + sock_net_set(ctl_sk, net); if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); @@ -893,7 +909,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ctl_sk->sk_mark = 0; ctl_sk->sk_priority = 0; } - ip_send_unicast_reply(ctl_sk, + ip_send_unicast_reply(ctl_sk, sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, @@ -903,6 +919,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); + local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); local_bh_enable(); #ifdef CONFIG_TCP_MD5SIG @@ -998,14 +1015,15 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = this_cpu_read(ipv4_tcp_sk); + local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); + ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); sock_net_set(ctl_sk, net); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); - ip_send_unicast_reply(ctl_sk, + ip_send_unicast_reply(ctl_sk, sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, @@ -1013,14 +1031,25 @@ static void tcp_v4_send_ack(const struct sock *sk, sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); + local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); local_bh_enable(); } -static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) +static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, + enum tcp_tw_status tw_status) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); struct tcp_key key = {}; + u8 tos = tw->tw_tos; + + /* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject, + * while not cleaning ECN bits of other TW ACKs to avoid these ACKs + * being placed in a different service queues (Classic rather than L4S) + */ + if (tw_status == TCP_TW_ACK_OOW) + tos &= ~INET_ECN_MASK; + #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; @@ -1036,7 +1065,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) } if (aoh) - key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1); + key.ao_key = tcp_ao_established_key(sk, ao_info, + aoh->rnext_keyid, -1); } } if (key.ao_key) { @@ -1050,22 +1080,20 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) #else if (0) { #endif -#ifdef CONFIG_TCP_MD5SIG - } else if (static_branch_unlikely(&tcp_md5_needed.key)) { + } else if (static_branch_tcp_md5()) { key.md5_key = tcp_twsk_md5_key(tcptw); if (key.md5_key) key.type = TCP_KEY_MD5; -#endif } tcp_v4_send_ack(sk, skb, - tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), - tcptw->tw_ts_recent, + READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, &key, tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, - tw->tw_tos, + tos, tw->tw_txhash); inet_twsk_put(tw); @@ -1124,8 +1152,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, #else if (0) { #endif -#ifdef CONFIG_TCP_MD5SIG - } else if (static_branch_unlikely(&tcp_md5_needed.key)) { + } else if (static_branch_tcp_md5()) { const union tcp_md5_addr *addr; int l3index; @@ -1134,22 +1161,17 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key.md5_key) key.type = TCP_KEY_MD5; -#endif } - /* RFC 7323 2.3 - * The window field (SEG.WND) of every outgoing segment, with the - * exception of <SYN> segments, MUST be right-shifted by - * Rcv.Wind.Shift bits: - */ + /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */ tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, - req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), - READ_ONCE(req->ts_recent), + req->ts_recent, 0, &key, inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, - ip_hdr(skb)->tos, + ip_hdr(skb)->tos & ~INET_ECN_MASK, READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) kfree(key.traffic_key); @@ -1220,7 +1242,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) */ DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); -EXPORT_SYMBOL(tcp_md5_needed); +EXPORT_IPV6_MOD(tcp_md5_needed); static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) { @@ -1279,7 +1301,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, } return best_match; } -EXPORT_SYMBOL(__tcp_md5_do_lookup); +EXPORT_IPV6_MOD(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, @@ -1326,7 +1348,7 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } -EXPORT_SYMBOL(tcp_v4_md5_lookup); +EXPORT_IPV6_MOD(tcp_v4_md5_lookup); static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) { @@ -1422,7 +1444,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, newkey, newkeylen, GFP_KERNEL); } -EXPORT_SYMBOL(tcp_md5_do_add); +EXPORT_IPV6_MOD(tcp_md5_do_add); int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, @@ -1454,7 +1476,7 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, key->flags, key->key, key->keylen, sk_gfp_mask(sk, GFP_ATOMIC)); } -EXPORT_SYMBOL(tcp_md5_key_copy); +EXPORT_IPV6_MOD(tcp_md5_key_copy); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags) @@ -1469,7 +1491,7 @@ int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, kfree_rcu(key, rcu); return 0; } -EXPORT_SYMBOL(tcp_md5_do_del); +EXPORT_IPV6_MOD(tcp_md5_do_del); void tcp_clear_md5_list(struct sock *sk) { @@ -1648,7 +1670,7 @@ clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } -EXPORT_SYMBOL(tcp_v4_md5_hash_skb); +EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); #endif @@ -1667,7 +1689,8 @@ static void tcp_v4_init_req(struct request_sock *req, static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { tcp_v4_init_req(req, sk, skb); @@ -1720,7 +1743,7 @@ drop: tcp_listendrop(sk); return 0; } -EXPORT_SYMBOL(tcp_v4_conn_request); +EXPORT_IPV6_MOD(tcp_v4_conn_request); /* @@ -1758,10 +1781,6 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - sk_daddr_set(newsk, ireq->ir_rmt_addr); - sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); - newsk->sk_bound_dev_if = ireq->ir_iif; - newinet->inet_saddr = ireq->ir_loc_addr; inet_opt = rcu_dereference(ireq->ireq_opt); RCU_INIT_POINTER(newinet->inet_opt, inet_opt); newinet->mc_index = inet_iif(skb); @@ -1844,7 +1863,7 @@ put_and_exit: tcp_done(newsk); goto exit; } -EXPORT_SYMBOL(tcp_v4_syn_recv_sock); +EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { @@ -1934,9 +1953,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; reset: - tcp_v4_send_reset(rsk, skb); + tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); discard: - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); /* Be careful here. If this function gets more complicated and * gcc suffers from register pressure on the x86, sk (in %ebx) * might be destroyed here. This current version compiles correctly, @@ -1955,7 +1974,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv); int tcp_v4_early_demux(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; @@ -1995,7 +2014,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { - u32 limit, tail_gso_size, tail_gso_segs; + u32 tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -2004,6 +2023,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, bool fragstolen; u32 gso_segs; u32 gso_size; + u64 limit; int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), @@ -2014,7 +2034,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, */ skb_condense(skb); - skb_dst_drop(skb); + tcp_cleanup_skb(skb); if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); @@ -2044,11 +2064,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, !((TCP_SKB_CB(tail)->tcp_flags & TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ - TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || -#ifdef CONFIG_TLS_DEVICE - tail->decrypted != skb->decrypted || -#endif - !mptcp_skb_can_collapse(tail, skb) || + TCP_SKB_CB(skb)->tcp_flags) & + (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || + !tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) goto no_coalesce; @@ -2101,7 +2119,13 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, __skb_push(skb, hdrlen); no_coalesce: - limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1); + /* sk->sk_backlog.len is reset only at the end of __release_sock(). + * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach + * sk_rcvbuf in normal conditions. + */ + limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; + + limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; /* Only socket owner can try to collapse/prune rx queues * to reduce memory overhead, so add a little headroom here. @@ -2109,6 +2133,8 @@ no_coalesce: */ limit += 64 * 1024; + limit = min_t(u64, limit, UINT_MAX); + if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); *reason = SKB_DROP_REASON_SOCKET_BACKLOG; @@ -2117,7 +2143,7 @@ no_coalesce: } return false; } -EXPORT_SYMBOL(tcp_add_backlog); +EXPORT_IPV6_MOD(tcp_add_backlog); int tcp_filter(struct sock *sk, struct sk_buff *skb) { @@ -2125,7 +2151,7 @@ int tcp_filter(struct sock *sk, struct sk_buff *skb) return sk_filter_trim_cap(sk, skb, th->doff * 4); } -EXPORT_SYMBOL(tcp_filter); +EXPORT_IPV6_MOD(tcp_filter); static void tcp_v4_restore_cb(struct sk_buff *skb) { @@ -2147,8 +2173,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; + TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -2161,15 +2186,17 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_rcv(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; + enum tcp_tw_status tw_status; int sdif = inet_sdif(skb); int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; + struct sock *sk = NULL; bool refcounted; - struct sock *sk; int ret; + u32 isn; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) @@ -2207,7 +2234,6 @@ lookup: if (!sk) goto no_tcp_socket; -process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; @@ -2254,7 +2280,8 @@ process: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); - nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen, + &drop_reason); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } @@ -2279,7 +2306,10 @@ process: } else { drop_reason = tcp_child_process(sk, nsk, skb); if (drop_reason) { - tcp_v4_send_reset(nsk, skb); + enum sk_rst_reason rst_reason; + + rst_reason = sk_rst_convert_drop_reason(drop_reason); + tcp_v4_send_reset(nsk, skb, rst_reason); goto discard_and_relse; } sock_put(sk); @@ -2287,6 +2317,7 @@ process: } } +process: if (static_branch_unlikely(&ip4_min_ttl)) { /* min_ttl can be changed concurrently from do_ip_setsockopt() */ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { @@ -2357,13 +2388,13 @@ csum_error: bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { - tcp_v4_send_reset(NULL, skb); + tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); /* Discard frame. */ - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: @@ -2385,7 +2416,10 @@ do_time_wait: inet_twsk_put(inet_twsk(sk)); goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); + switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, @@ -2399,16 +2433,18 @@ do_time_wait: sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; + __this_cpu_write(tcp_tw_isn, isn); goto process; } } /* to ACK */ fallthrough; case TCP_TW_ACK: - tcp_v4_timewait_ack(sk, skb); + case TCP_TW_ACK_OOW: + tcp_v4_timewait_ack(sk, skb, tw_status); break; case TCP_TW_RST: - tcp_v4_send_reset(sk, skb); + tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS:; @@ -2418,7 +2454,6 @@ do_time_wait: static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), - .twsk_unique = tcp_twsk_unique, .twsk_destructor= tcp_twsk_destructor, }; @@ -2431,7 +2466,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) sk->sk_rx_dst_ifindex = skb->skb_iif; } } -EXPORT_SYMBOL(inet_sk_rx_dst_set); +EXPORT_IPV6_MOD(inet_sk_rx_dst_set); const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, @@ -2443,11 +2478,9 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, - .addr2sockaddr = inet_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in), .mtu_reduced = tcp_v4_mtu_reduced, }; -EXPORT_SYMBOL(ipv4_specific); +EXPORT_IPV6_MOD(ipv4_specific); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { @@ -2495,10 +2528,25 @@ static void tcp_md5sig_info_free_rcu(struct rcu_head *head) } #endif +static void tcp_release_user_frags(struct sock *sk) +{ +#ifdef CONFIG_PAGE_POOL + unsigned long index; + void *netmem; + + xa_for_each(&sk->sk_user_frags, index, netmem) + WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); +#endif +} + void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + tcp_release_user_frags(sk); + + xa_destroy(&sk->sk_user_frags); + trace_tcp_destroy_sock(sk); tcp_clear_xmit_timers(sk); @@ -2542,7 +2590,7 @@ void tcp_v4_destroy_sock(struct sock *sk) sk_sockets_allocated_dec(sk); } -EXPORT_SYMBOL(tcp_v4_destroy_sock); +EXPORT_IPV6_MOD(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ @@ -2778,7 +2826,7 @@ out: st->last_pos = *pos; return rc; } -EXPORT_SYMBOL(tcp_seq_start); +EXPORT_IPV6_MOD(tcp_seq_start); void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { @@ -2809,7 +2857,7 @@ out: st->last_pos = *pos; return rc; } -EXPORT_SYMBOL(tcp_seq_next); +EXPORT_IPV6_MOD(tcp_seq_next); void tcp_seq_stop(struct seq_file *seq, void *v) { @@ -2827,7 +2875,7 @@ void tcp_seq_stop(struct seq_file *seq, void *v) break; } } -EXPORT_SYMBOL(tcp_seq_stop); +EXPORT_IPV6_MOD(tcp_seq_stop); static void get_openreq4(const struct request_sock *req, struct seq_file *f, int i) @@ -2867,17 +2915,19 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); + u8 icsk_pending; int rx_queue; int state; - if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + icsk_pending = smp_load_acquire(&icsk->icsk_pending); + if (icsk_pending == ICSK_TIME_RETRANS || + icsk_pending == ICSK_TIME_REO_TIMEOUT || + icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_timeout; - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + timer_expires = icsk_timeout(icsk); + } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk_timeout(icsk); } else if (timer_pending(&sk->sk_timer)) { timer_active = 2; timer_expires = sk->sk_timer.expires; @@ -2931,7 +2981,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", - i, src, srcp, dest, destp, tw->tw_substate, 0, 0, + i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } @@ -3421,6 +3471,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 2; + net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); @@ -3444,8 +3495,8 @@ static int __net_init tcp_sk_init(struct net *net) * which are too large can cause TCP streams to be bursty. */ net->ipv4.sysctl_tcp_tso_win_divisor = 3; - /* Default TSQ limit of 16 TSO segments */ - net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; + /* Default TSQ limit of 4 MB */ + net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; @@ -3493,6 +3544,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_shrink_window = 0; net->ipv4.sysctl_tcp_pingpong_thresh = 1; + net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); + net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; return 0; } @@ -3501,13 +3554,25 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; - tcp_twsk_purge(net_exit_list, AF_INET); + /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work + * and failed setup_net error unwinding path are serialized. + * + * tcp_twsk_purge() handles twsk in any dead netns, not just those in + * net_exit_list, the thread that dismantles a particular twsk must + * do so without other thread progressing to refcount_dec_and_test() of + * tcp_death_row.tw_refcount. + */ + mutex_lock(&tcp_exit_batch_mutex); + + tcp_twsk_purge(net_exit_list); list_for_each_entry(net, net_exit_list, exit_list) { inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); tcp_fastopen_ctx_destroy(net); } + + mutex_unlock(&tcp_exit_batch_mutex); } static struct pernet_operations __net_initdata tcp_sk_ops = { @@ -3607,7 +3672,9 @@ void __init tcp_v4_init(void) */ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; - per_cpu(ipv4_tcp_sk, cpu) = sk; + sk->sk_clockid = CLOCK_MONOTONIC; + + per_cpu(ipv4_tcp_sk.sock, cpu) = sk; } if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); |