diff options
Diffstat (limited to 'net/ipv4/tcp_minisocks.c')
| -rw-r--r-- | net/ipv4/tcp_minisocks.c | 784 |
1 files changed, 510 insertions, 274 deletions
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ab1c08658528..bd5462154f97 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -18,36 +19,12 @@ * Jorge Cwik, <jorge@laser.satlink.net> */ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/sysctl.h> -#include <linux/workqueue.h> #include <net/tcp.h> -#include <net/inet_common.h> +#include <net/tcp_ecn.h> #include <net/xfrm.h> - -int sysctl_tcp_syncookies __read_mostly = 1; -EXPORT_SYMBOL(sysctl_tcp_syncookies); - -int sysctl_tcp_abort_on_overflow __read_mostly; - -struct inet_timewait_death_row tcp_death_row = { - .sysctl_max_tw_buckets = NR_FILE * 2, - .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, - .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), - .hashinfo = &tcp_hashinfo, - .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, - (unsigned long)&tcp_death_row), - .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, - inet_twdr_twkill_work), -/* Short-time timewait calendar */ - - .twcal_hand = -1, - .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, - (unsigned long)&tcp_death_row), -}; -EXPORT_SYMBOL_GPL(tcp_death_row); +#include <net/busy_poll.h> +#include <net/rstreason.h> +#include <net/psp.h> static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -58,6 +35,38 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) return seq == e_win && seq == end_seq; } +static enum tcp_tw_status +tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, + const struct sk_buff *skb, int mib_idx) +{ + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + + if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx, + &tcptw->tw_last_oow_ack_time)) { + /* Send ACK. Note, we do not put the bucket, + * it will be released by caller. + */ + return TCP_TW_ACK_OOW; + } + + /* We are rate-limiting, so just release the tw sock and drop skb. */ + inet_twsk_put(tw); + return TCP_TW_SUCCESS; +} + +static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq, + u32 rcv_nxt) +{ +#ifdef CONFIG_TCP_AO + struct tcp_ao_info *ao; + + ao = rcu_dereference(tcptw->ao_info); + if (unlikely(ao && seq < rcv_nxt)) + WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1); +#endif + WRITE_ONCE(tcptw->tw_rcv_nxt, seq); +} + /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -90,43 +99,59 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th) + const struct tcphdr *th, u32 *tw_isn, + enum skb_drop_reason *drop_reason) { - struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); + struct tcp_options_received tmp_opt; + enum skb_drop_reason psp_drop; bool paws_reject = false; + int ts_recent_stamp; + + /* Instead of dropping immediately, wait to see what value is + * returned. We will accept a non psp-encapsulated syn in the + * case where TCP_TW_SYN is returned. + */ + psp_drop = psp_twsk_rx_policy_check(tw, skb); tmp_opt.saw_tstamp = 0; - if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tcp_parse_options(skb, &tmp_opt, 0, NULL); + ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); + if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { + tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { - tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; - tmp_opt.ts_recent = tcptw->tw_ts_recent; - tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + if (tmp_opt.rcv_tsecr) + tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; + tmp_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); + tmp_opt.ts_recent_stamp = ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } - if (tw->tw_substate == TCP_FIN_WAIT2) { + if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ + if (psp_drop) + goto out_put; + /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcptw->tw_rcv_nxt, - tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) - return TCP_TW_ACK; + rcv_nxt, + rcv_nxt + tcptw->tw_rcv_wnd)) + return tcp_timewait_check_oow_rate_limit( + tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); if (th->rst) goto kill; - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) - goto kill_with_rst; + if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt)) + return TCP_TW_RST; /* Dup ACK? */ if (!th->ack || - !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || + !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { inet_twsk_put(tw); return TCP_TW_SUCCESS; @@ -136,29 +161,25 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * reset. */ if (!th->fin || - TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { -kill_with_rst: - inet_twsk_deschedule(tw, &tcp_death_row); - inet_twsk_put(tw); + TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1) return TCP_TW_RST; - } /* FIN arrived, enter true time-wait state. */ - tw->tw_substate = TCP_TIME_WAIT; - tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; + WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT); + twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq, + rcv_nxt); + if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent_stamp = get_seconds(); - tcptw->tw_ts_recent = tmp_opt.rcv_tsval; + u64 ts = tcp_clock_ms(); + + WRITE_ONCE(tw->tw_entry_stamp, ts); + WRITE_ONCE(tcptw->tw_ts_recent_stamp, + div_u64(ts, MSEC_PER_SEC)); + WRITE_ONCE(tcptw->tw_ts_recent, + tmp_opt.rcv_tsval); } - if (tcp_death_row.sysctl_tw_recycle && - tcptw->tw_ts_recent_stamp && - tcp_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, - TCP_TIMEWAIT_LEN); - else - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -180,28 +201,32 @@ kill_with_rst: */ if (!paws_reject && - (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && + (TCP_SKB_CB(skb)->seq == rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ + if (psp_drop) + goto out_put; + if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ - if (sysctl_tcp_rfc1337 == 0) { + if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { kill: - inet_twsk_deschedule(tw, &tcp_death_row); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } + } else { + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); } - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent = tmp_opt.rcv_tsval; - tcptw->tw_ts_recent_stamp = get_seconds(); + WRITE_ONCE(tcptw->tw_ts_recent, + tmp_opt.rcv_tsval); + WRITE_ONCE(tcptw->tw_ts_recent_stamp, + ktime_get_seconds()); } inet_twsk_put(tw); @@ -226,18 +251,23 @@ kill: */ if (th->syn && !th->rst && !th->ack && !paws_reject && - (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || + (after(TCP_SKB_CB(skb)->seq, rcv_nxt) || (tmp_opt.saw_tstamp && - (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { + (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) { u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; - TCP_SKB_CB(skb)->when = isn; + *tw_isn = isn; return TCP_TW_SYN; } - if (paws_reject) - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); + if (psp_drop) + goto out_put; + + if (paws_reject) { + *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; + __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); + } if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. @@ -247,127 +277,262 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); - /* Send ACK. Note, we do not put the bucket, - * it will be released by caller. - */ - return TCP_TW_ACK; + return tcp_timewait_check_oow_rate_limit( + tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } + +out_put: inet_twsk_put(tw); return TCP_TW_SUCCESS; } -EXPORT_SYMBOL(tcp_timewait_state_process); +EXPORT_IPV6_MOD(tcp_timewait_state_process); + +static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) +{ +#ifdef CONFIG_TCP_MD5SIG + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_md5sig_key *key; + + /* + * The timewait bucket does not have the key DB from the + * sock structure. We just make a quick copy of the + * md5 key being used (if indeed we are using one) + * so the timewait ack generating code has the key. + */ + tcptw->tw_md5_key = NULL; + if (!static_branch_unlikely(&tcp_md5_needed.key)) + return; + + key = tp->af_specific->md5_lookup(sk, sk); + if (key) { + tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); + if (!tcptw->tw_md5_key) + return; + if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) + goto out_free; + } + return; +out_free: + WARN_ON_ONCE(1); + kfree(tcptw->tw_md5_key); + tcptw->tw_md5_key = NULL; +#endif +} /* * Move a socket to time-wait or dead fin-wait-2 state. */ void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct inet_timewait_sock *tw = NULL; const struct inet_connection_sock *icsk = inet_csk(sk); - const struct tcp_sock *tp = tcp_sk(sk); - bool recycle_ok = false; - - if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) - recycle_ok = tcp_remember_stamp(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + struct inet_timewait_sock *tw; - if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) - tw = inet_twsk_alloc(sk, state); + tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state); - if (tw != NULL) { + if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); - struct inet_sock *inet = inet_sk(sk); - tw->tw_transparent = inet->transparent; + tw->tw_mark = sk->sk_mark; + tw->tw_priority = READ_ONCE(sk->sk_priority); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; + /* refreshed when we enter true TIME-WAIT state */ + tw->tw_entry_stamp = tcp_time_stamp_ms(tp); tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; - + tw->tw_usec_ts = tp->tcp_usec_ts; + tcptw->tw_last_oow_ack_time = 0; + tcptw->tw_tx_delay = tp->tcp_tx_delay; + tw->tw_txhash = sk->sk_txhash; + tw->tw_tx_queue_mapping = sk->sk_tx_queue_mapping; +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING + tw->tw_rx_queue_mapping = sk->sk_rx_queue_mapping; +#endif #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - struct inet6_timewait_sock *tw6; - tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); - tw6 = inet6_twsk((struct sock *)tw); - tw6->tw_v6_daddr = np->daddr; - tw6->tw_v6_rcv_saddr = np->rcv_saddr; + tw->tw_v6_daddr = sk->sk_v6_daddr; + tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; - tw->tw_ipv6only = np->ipv6only; + tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); + tw->tw_ipv6only = sk->sk_ipv6only; } #endif -#ifdef CONFIG_TCP_MD5SIG - /* - * The timewait bucket does not have the key DB from the - * sock structure. We just make a quick copy of the - * md5 key being used (if indeed we are using one) - * so the timewait ack generating code has the key. - */ - do { - struct tcp_md5sig_key *key; - tcptw->tw_md5_key = NULL; - key = tp->af_specific->md5_lookup(sk, sk); - if (key != NULL) { - tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); - if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()) - BUG(); - } - } while (0); -#endif - - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); + tcp_time_wait_init(sk, tcptw); + tcp_ao_time_wait(tcptw, tp); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; - if (recycle_ok) { - tw->tw_timeout = rto; - } else { - tw->tw_timeout = TCP_TIMEWAIT_LEN; - if (state == TCP_TIME_WAIT) - timeo = TCP_TIMEWAIT_LEN; - } + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; - inet_twsk_schedule(tw, &tcp_death_row, timeo, - TCP_TIMEWAIT_LEN); - inet_twsk_put(tw); + /* Linkage updates. + * Note that access to tw after this point is illegal. + */ + inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); + NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); tcp_done(sk); } +EXPORT_SYMBOL(tcp_time_wait); void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG - struct tcp_timewait_sock *twsk = tcp_twsk(sk); + if (static_branch_unlikely(&tcp_md5_needed.key)) { + struct tcp_timewait_sock *twsk = tcp_twsk(sk); - if (twsk->tw_md5_key) - kfree_rcu(twsk->tw_md5_key, rcu); + if (twsk->tw_md5_key) { + kfree(twsk->tw_md5_key); + static_branch_slow_dec_deferred(&tcp_md5_needed); + } + } #endif + tcp_ao_destroy_sock(sk, true); + psp_twsk_assoc_free(inet_twsk(sk)); +} + +void tcp_twsk_purge(struct list_head *net_exit_list) +{ + bool purged_once = false; + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) { + if (net->ipv4.tcp_death_row.hashinfo->pernet) { + /* Even if tw_refcount == 1, we must clean up kernel reqsk */ + inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo); + } else if (!purged_once) { + inet_twsk_purge(&tcp_hashinfo); + purged_once = true; + } + } +} + +/* Warning : This function is called without sk_listener being locked. + * Be sure to read socket fields once, as their value could change under us. + */ +void tcp_openreq_init_rwin(struct request_sock *req, + const struct sock *sk_listener, + const struct dst_entry *dst) +{ + struct inet_request_sock *ireq = inet_rsk(req); + const struct tcp_sock *tp = tcp_sk(sk_listener); + int full_space = tcp_full_space(sk_listener); + u32 window_clamp; + __u8 rcv_wscale; + u32 rcv_wnd; + int mss; + + mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); + window_clamp = READ_ONCE(tp->window_clamp); + /* Set this up on the first call only */ + req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; + + rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); + if (rcv_wnd == 0) + rcv_wnd = dst_metric(dst, RTAX_INITRWND); + else if (full_space < rcv_wnd * mss) + full_space = rcv_wnd * mss; + + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(sk_listener, full_space, + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rsk_rcv_wnd, + &req->rsk_window_clamp, + ireq->wscale_ok, + &rcv_wscale, + rcv_wnd); + ireq->rcv_wscale = rcv_wscale; +} + +static void tcp_ecn_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) +{ + const struct tcp_request_sock *treq = tcp_rsk(req); + struct tcp_sock *tp = tcp_sk(sk); + + if (treq->accecn_ok) { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_snt = treq->syn_ect_snt; + tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; + tcp_ecn_received_counters_payload(sk, skb); + } else { + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? + TCP_ECN_MODE_RFC3168 : + TCP_ECN_DISABLED); + } } -EXPORT_SYMBOL_GPL(tcp_twsk_destructor); -static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, - struct request_sock *req) +void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) { - tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; + struct inet_connection_sock *icsk = inet_csk(sk); + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + + rcu_read_lock(); + ca = tcp_ca_find_key(ca_key); + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { + icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); + icsk->icsk_ca_ops = ca; + ca_got_dst = true; + } + rcu_read_unlock(); + } + + /* If no valid choice made yet, assign current system default ca. */ + if (!ca_got_dst && + (!icsk->icsk_ca_setsockopt || + !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) + tcp_assign_congestion_control(sk); + + tcp_set_ca_state(sk, TCP_CA_Open); +} +EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child); + +static void smc_check_reset_syn_req(const struct tcp_sock *oldtp, + struct request_sock *req, + struct tcp_sock *newtp) +{ +#if IS_ENABLED(CONFIG_SMC) + struct inet_request_sock *ireq; + + if (static_branch_unlikely(&tcp_have_smc)) { + ireq = inet_rsk(req); + if (oldtp->syn_smc && !ireq->smc_ok) + newtp->syn_smc = 0; + } +#endif } /* This is not only more efficient than what we used to do, it eliminates @@ -376,113 +541,129 @@ static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, * Actually, we could lots of memory writes here. tp of listening * socket contains all necessary default parameters. */ -struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) +struct sock *tcp_create_openreq_child(const struct sock *sk, + struct request_sock *req, + struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); + const struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_connection_sock *newicsk; + const struct tcp_sock *oldtp; + struct tcp_sock *newtp; + u32 seq; + + if (!newsk) + return NULL; - if (newsk != NULL) { - const struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_request_sock *treq = tcp_rsk(req); - struct inet_connection_sock *newicsk = inet_csk(newsk); - struct tcp_sock *newtp = tcp_sk(newsk); - - /* Now setup tcp_sock */ - newtp->pred_flags = 0; - - newtp->rcv_wup = newtp->copied_seq = - newtp->rcv_nxt = treq->rcv_isn + 1; - - newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; - - tcp_prequeue_init(newtp); - INIT_LIST_HEAD(&newtp->tsq_node); + newicsk = inet_csk(newsk); + newtp = tcp_sk(newsk); + oldtp = tcp_sk(sk); - tcp_init_wl(newtp, treq->rcv_isn); + smc_check_reset_syn_req(oldtp, req, newtp); - newtp->srtt = 0; - newtp->mdev = TCP_TIMEOUT_INIT; - newicsk->icsk_rto = TCP_TIMEOUT_INIT; + /* Now setup tcp_sock */ + newtp->pred_flags = 0; - newtp->packets_out = 0; - newtp->retrans_out = 0; - newtp->sacked_out = 0; - newtp->fackets_out = 0; - newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - tcp_enable_early_retrans(newtp); - newtp->tlp_high_seq = 0; + seq = treq->rcv_isn + 1; + newtp->rcv_wup = seq; + WRITE_ONCE(newtp->copied_seq, seq); + WRITE_ONCE(newtp->rcv_nxt, seq); + newtp->segs_in = 1; - /* So many TCP implementations out there (incorrectly) count the - * initial SYN frame in their delayed-ACK and congestion control - * algorithms that we must have the following bandaid to talk - * efficiently to them. -DaveM - */ - newtp->snd_cwnd = TCP_INIT_CWND; - newtp->snd_cwnd_cnt = 0; + seq = treq->snt_isn + 1; + newtp->snd_sml = newtp->snd_una = seq; + WRITE_ONCE(newtp->snd_nxt, seq); + newtp->snd_up = seq; - if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && - !try_module_get(newicsk->icsk_ca_ops->owner)) - newicsk->icsk_ca_ops = &tcp_init_congestion_ops; + INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); - tcp_set_ca_state(newsk, TCP_CA_Open); - tcp_init_xmit_timers(newsk); - skb_queue_head_init(&newtp->out_of_order_queue); - newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + tcp_init_wl(newtp, treq->rcv_isn); - newtp->rx_opt.saw_tstamp = 0; + minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); + newicsk->icsk_ack.lrcvtime = tcp_jiffies32; - newtp->rx_opt.dsack = 0; - newtp->rx_opt.num_sacks = 0; + newtp->lsndtime = tcp_jiffies32; + newsk->sk_txhash = READ_ONCE(treq->txhash); + newtp->total_retrans = req->num_retrans; - newtp->urg_data = 0; + tcp_init_xmit_timers(newsk); + WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); - if (sock_flag(newsk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); + if (sock_flag(newsk, SOCK_KEEPOPEN)) + tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); - newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { - if (sysctl_tcp_fack) - tcp_enable_fack(newtp); - } - newtp->window_clamp = req->window_clamp; - newtp->rcv_ssthresh = req->rcv_wnd; - newtp->rcv_wnd = req->rcv_wnd; - newtp->rx_opt.wscale_ok = ireq->wscale_ok; - if (newtp->rx_opt.wscale_ok) { - newtp->rx_opt.snd_wscale = ireq->snd_wscale; - newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; - } else { - newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; - newtp->window_clamp = min(newtp->window_clamp, 65535U); - } - newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << - newtp->rx_opt.snd_wscale); - newtp->max_window = newtp->snd_wnd; - - if (newtp->rx_opt.tstamp_ok) { - newtp->rx_opt.ts_recent = req->ts_recent; - newtp->rx_opt.ts_recent_stamp = get_seconds(); - newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; + newtp->rx_opt.sack_ok = ireq->sack_ok; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; + newtp->rx_opt.wscale_ok = ireq->wscale_ok; + if (newtp->rx_opt.wscale_ok) { + newtp->rx_opt.snd_wscale = ireq->snd_wscale; + newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; + } else { + newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp, 65535U); + } + newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; + newtp->max_window = newtp->snd_wnd; + + if (newtp->rx_opt.tstamp_ok) { + newtp->tcp_usec_ts = treq->req_usec_ts; + newtp->rx_opt.ts_recent = req->ts_recent; + newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->tcp_usec_ts = 0; + newtp->rx_opt.ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } + if (req->num_timeout) { + newtp->total_rto = req->num_timeout; + newtp->undo_marker = treq->snt_isn; + if (newtp->tcp_usec_ts) { + newtp->retrans_stamp = treq->snt_synack; + newtp->total_rto_time = (u32)(tcp_clock_us() - + newtp->retrans_stamp) / USEC_PER_MSEC; } else { - newtp->rx_opt.ts_recent_stamp = 0; - newtp->tcp_header_len = sizeof(struct tcphdr); + newtp->retrans_stamp = div_u64(treq->snt_synack, + USEC_PER_SEC / TCP_TS_HZ); + newtp->total_rto_time = tcp_clock_ms() - + newtp->retrans_stamp; } - newtp->tsoffset = 0; + newtp->total_rto_recoveries = 1; + } + newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG - newtp->md5sig_info = NULL; /*XXX*/ - if (newtp->af_specific->md5_lookup(sk, newsk)) - newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; + newtp->md5sig_info = NULL; /*XXX*/ #endif - if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) - newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; - newtp->rx_opt.mss_clamp = req->mss; - TCP_ECN_openreq_child(newtp, req); - newtp->fastopen_rsk = NULL; - newtp->syn_data_acked = 0; - - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); +#ifdef CONFIG_TCP_AO + newtp->ao_info = NULL; + + if (tcp_rsk_used_ao(req)) { + struct tcp_ao_key *ao_key; + + ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1); + if (ao_key) + newtp->tcp_header_len += tcp_ao_len_aligned(ao_key); } + #endif + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; + newtp->rx_opt.mss_clamp = req->mss; + tcp_ecn_openreq_child(newsk, req, skb); + newtp->fastopen_req = NULL; + RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); + + newtp->bpf_chg_cc_inprogress = 0; + tcp_bpf_clone(sk, newsk); + + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); + + xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1); + return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); @@ -496,32 +677,44 @@ EXPORT_SYMBOL(tcp_create_openreq_child); * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? * * We don't need to initialize tmp_opt.sack_ok as we don't use the results + * + * Note: If @fastopen is true, this can be called from process context. + * Otherwise, this is from BH context. */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct request_sock **prev, - bool fastopen) + bool fastopen, bool *req_stolen, + enum skb_drop_reason *drop_reason) { struct tcp_options_received tmp_opt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); + bool tsecr_reject = false; bool paws_reject = false; - - BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); + bool own_req; tmp_opt.saw_tstamp = 0; + tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(skb, &tmp_opt, 0, NULL); + tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; + if (tmp_opt.rcv_tsecr) { + if (inet_rsk(req)->tstamp_ok && !fastopen) + tsecr_reject = !between(tmp_opt.rcv_tsecr, + tcp_rsk(req)->snt_tsval_first, + READ_ONCE(tcp_rsk(req)->snt_tsval_last)); + tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; + } /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); + tmp_opt.ts_recent_stamp = ktime_get_seconds() - + tcp_reqsk_timeout(req) / HZ; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } @@ -553,9 +746,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery. */ - if (!inet_rtx_syn_ack(sk, req)) - req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, - TCP_RTO_MAX) + jiffies; + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDSYNRECV, + &tcp_rsk(req)->last_oow_ack_time) && + + !tcp_rtx_synack(sk, req)) { + unsigned long expires = jiffies; + + expires += tcp_reqsk_timeout(req); + if (!fastopen) + mod_timer_pending(&req->rsk_timer, expires); + else + req->rsk_timer.expires = expires; + } return NULL; } @@ -621,28 +824,34 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->snt_isn + 1)) return sk; - /* Also, it would be not so bad idea to check rcv_tsecr, which - * is essentially ACK extension and too early or too late values - * should cause reset in unsynchronized states. - */ - /* RFC793: "first check sequence number". */ - if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { + if (paws_reject || tsecr_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + tcp_rsk(req)->rcv_nxt, + tcp_rsk(req)->rcv_nxt + + tcp_synack_window(req))) { /* Out of window: send ACK and drop. */ - if (!(flg & TCP_FLAG_RST)) + if (!(flg & TCP_FLAG_RST) && + !tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDSYNRECV, + &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); - if (paws_reject) - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + if (paws_reject) { + SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + } else if (tsecr_reject) { + SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED); + } else { + SKB_DR_SET(*drop_reason, TCP_OVERWINDOW); + } return NULL; } /* In sequence, PAWS is OK. */ - if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) - req->ts_recent = tmp_opt.rcv_tsval; - if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ @@ -653,7 +862,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } @@ -666,11 +875,17 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; - /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */ - if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) - tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; - else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */ - tcp_rsk(req)->snt_synack = 0; + if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && + tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); + + tcp_rsk(req)->saw_accecn_opt = saw_opt; + if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { + u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; + + tcp_rsk(req)->accecn_fail_mode |= fail_mode; + } + } /* For Fast Open no more processing is needed (sk is the * child socket). @@ -679,10 +894,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return sk; /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ - if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } @@ -692,18 +907,32 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); - if (child == NULL) + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + req, &own_req); + if (!child) goto listen_overflow; - inet_csk_reqsk_queue_unlink(sk, req, prev); - inet_csk_reqsk_queue_removed(sk, req); + if (own_req && tmp_opt.saw_tstamp && + !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) + tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval; + + if (own_req && rsk_drop_req(req)) { + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); + return child; + } - inet_csk_reqsk_queue_add(sk, req, child); - return child; + sock_rps_save_rxhash(child, skb); + tcp_synack_rtt_meas(child, req); + *req_stolen = !own_req; + return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: - if (!sysctl_tcp_abort_on_overflow) { + SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW); + if (sk != req->rsk_listener) + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) { inet_rsk(req)->acked = 1; return NULL; } @@ -715,18 +944,21 @@ embryonic_reset: * avoid becoming vulnerable to outside attack aiming at * resetting legit local connections. */ - req->rsk_ops->send_reset(sk, skb); + req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); - tcp_reset(sk); + tcp_reset(sk, skb); } if (!fastopen) { - inet_csk_reqsk_queue_drop(sk, req, prev); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); + bool unlinked = inet_csk_reqsk_queue_drop(sk, req); + + if (unlinked) + __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); + *req_stolen = !unlinked; } return NULL; } -EXPORT_SYMBOL(tcp_check_req); +EXPORT_IPV6_MOD(tcp_check_req); /* * Queue segment on the new socket if the new socket is active, @@ -740,18 +972,22 @@ EXPORT_SYMBOL(tcp_check_req); * be created. */ -int tcp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb) +enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) + __releases(&((child)->sk_lock.slock)) { - int ret = 0; + enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; int state = child->sk_state; + /* record sk_napi_id and sk_rx_queue_mapping of child. */ + sk_mark_napi_id_set(child, skb); + + tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { - ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), - skb->len); + reason = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening @@ -762,6 +998,6 @@ int tcp_child_process(struct sock *parent, struct sock *child, bh_unlock_sock(child); sock_put(child); - return ret; + return reason; } -EXPORT_SYMBOL(tcp_child_process); +EXPORT_IPV6_MOD(tcp_child_process); |
