diff options
Diffstat (limited to 'include/net/tcp.h')
-rw-r--r-- | include/net/tcp.h | 282 |
1 files changed, 156 insertions, 126 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h index f6eba9652d01..2d08473a6dc0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -41,6 +41,7 @@ #include <net/inet_ecn.h> #include <net/dst.h> #include <net/mptcp.h> +#include <net/xfrm.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> @@ -52,6 +53,8 @@ extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); +DECLARE_PER_CPU(u32, tcp_tw_isn); + void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) @@ -294,14 +297,6 @@ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3) return seq3 - seq2 >= seq1 - seq2; } -static inline bool tcp_out_of_memory(struct sock *sk) -{ - if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) - return true; - return false; -} - static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); @@ -314,7 +309,7 @@ static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) void sk_forced_mem_schedule(struct sock *sk, int size); -bool tcp_check_oom(struct sock *sk, int shift); +bool tcp_check_oom(const struct sock *sk, int shift); extern struct proto tcp_prot; @@ -348,12 +343,12 @@ void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); +enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); -void tcp_twsk_purge(struct list_head *net_exit_list, int family); +void tcp_twsk_purge(struct list_head *net_exit_list); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); @@ -392,12 +387,13 @@ enum tcp_tw_status { enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th); + const struct tcphdr *th, + u32 *tw_isn); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race); -int tcp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb); +enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag); void tcp_clear_retrans(struct tcp_sock *tp); @@ -498,6 +494,22 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct tcp_options_received *tcp_opt, int mss, u32 tsoff); +#if IS_ENABLED(CONFIG_BPF) +struct bpf_tcp_req_attrs { + u32 rcv_tsval; + u32 rcv_tsecr; + u16 mss; + u8 rcv_wscale; + u8 snd_wscale; + u8 ecn_ok; + u8 wscale_ok; + u8 sack_ok; + u8 tstamp_ok; + u8 usec_ts_ok; + u8 reserved[3]; +}; +#endif + #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. @@ -577,6 +589,15 @@ static inline u32 tcp_cookie_time(void) return val; } +/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */ +static inline u64 tcp_ns_to_ts(bool usec_ts, u64 val) +{ + if (usec_ts) + return div_u64(val, NSEC_PER_USEC); + + return div_u64(val, NSEC_PER_MSEC); +} + u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); @@ -590,6 +611,26 @@ static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry * dst_feature(dst, RTAX_FEATURE_ECN); } +#if IS_ENABLED(CONFIG_BPF) +static inline bool cookie_bpf_ok(struct sk_buff *skb) +{ + return skb->sk; +} + +struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb); +#else +static inline bool cookie_bpf_ok(struct sk_buff *skb) +{ + return false; +} + +static inline struct request_sock *cookie_bpf_check(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + return NULL; +} +#endif + /* From net/ipv6/syncookies.c */ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); @@ -622,7 +663,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, void tcp_send_probe0(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); -void tcp_send_active_reset(struct sock *sk, gfp_t priority); +void tcp_send_active_reset(struct sock *sk, gfp_t priority, + enum sk_rst_reason reason); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); @@ -636,11 +678,25 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); +void tcp_done_with_error(struct sock *sk, int err); void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); void tcp_sack_compress_send_ack(struct sock *sk); +static inline void tcp_cleanup_skb(struct sk_buff *skb) +{ + skb_dst_drop(skb); + secpath_reset(skb); +} + +static inline void tcp_add_receive_queue(struct sock *sk, struct sk_buff *skb) +{ + DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); + DEBUG_NET_WARN_ON_ONCE(secpath_exists(skb)); + __skb_queue_tail(&sk->sk_receive_queue, skb); +} + /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) @@ -687,6 +743,9 @@ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off); void tcp_read_done(struct sock *sk, size_t len); @@ -697,7 +756,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); -static inline void tcp_bound_rto(const struct sock *sk) +static inline void tcp_bound_rto(struct sock *sk) { if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) inet_csk(sk)->icsk_rto = TCP_RTO_MAX; @@ -880,6 +939,19 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +/* State flags for sacked in struct tcp_skb_cb */ +enum tcp_skb_cb_sacked_flags { + TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ + TCPCB_SACKED_RETRANS = (1 << 1), /* SKB retransmitted */ + TCPCB_LOST = (1 << 2), /* SKB is lost */ + TCPCB_TAGBITS = (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS | + TCPCB_LOST), /* All tag bits */ + TCPCB_REPAIRED = (1 << 4), /* SKB repaired (no skb_mstamp_ns) */ + TCPCB_EVER_RETRANS = (1 << 7), /* Ever retransmitted frame */ + TCPCB_RETRANS = (TCPCB_SACKED_RETRANS | TCPCB_EVER_RETRANS | + TCPCB_REPAIRED), +}; + /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. @@ -890,13 +962,10 @@ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { - /* Note : tcp_tw_isn is used in input path only - * (isn chosen by tcp_timewait_state_process()) - * + /* Note : * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ - __u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; @@ -905,15 +974,6 @@ struct tcp_skb_cb { __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK. */ -#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ -#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ -#define TCPCB_LOST 0x04 /* SKB is lost */ -#define TCPCB_TAGBITS 0x07 /* All tag bits */ -#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */ -#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ -#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \ - TCPCB_REPAIRED) - __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ @@ -1023,9 +1083,18 @@ static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { + /* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */ return likely(tcp_skb_can_collapse_to(to) && mptcp_skb_can_collapse(to, from) && - skb_pure_zcopy_same(to, from)); + skb_pure_zcopy_same(to, from) && + skb_frags_readable(to) == skb_frags_readable(from)); +} + +static inline bool tcp_skb_can_collapse_rx(const struct sk_buff *to, + const struct sk_buff *from) +{ + return likely(mptcp_skb_can_collapse(to, from) && + !skb_cmp_decrypted(to, from)); } /* Events passed to congestion control interface */ @@ -1122,7 +1191,7 @@ struct tcp_congestion_ops { /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); /* new value of cwnd after loss (required) */ @@ -1173,7 +1242,7 @@ extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else @@ -1494,11 +1563,10 @@ static inline int tcp_space_from_win(const struct sock *sk, int win) return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); } -/* Assume a conservative default of 1200 bytes of payload per 4K page. +/* Assume a 50% default for skb->len/skb->truesize ratio. * This may be adjusted later in tcp_measure_rcv_mss(). */ -#define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \ - SKB_TRUESIZE(4096)) +#define TCP_DEFAULT_SCALING_RATIO (1 << (TCP_RMEM_TO_WIN_SCALE - 1)) static inline void tcp_scaling_ratio_init(struct sock *sk) { @@ -1766,7 +1834,7 @@ int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() * @c: returned tcp_sigpool for usage (uninitialized on failure) * - * Returns 0 on success, error otherwise. + * Returns: 0 on success, error otherwise. */ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c); /** @@ -1813,12 +1881,6 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk, return __tcp_md5_do_lookup(sk, 0, addr, family, true); } -enum skb_drop_reason -tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, - const void *saddr, const void *daddr, - int family, int l3index, const __u8 *hash_location); - - #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else static inline struct tcp_md5sig_key * @@ -1835,13 +1897,6 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk, return NULL; } -static inline enum skb_drop_reason -tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, - const void *saddr, const void *daddr, - int family, int l3index, const __u8 *hash_location) -{ - return SKB_NOT_DROPPED_YET; -} #define tcp_twsk_md5_key(twsk) NULL #endif @@ -2053,6 +2108,14 @@ static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct soc tcp_wmem_free_skb(sk, skb); } +static inline void tcp_write_collapse_fence(struct sock *sk) +{ + struct sk_buff *skb = tcp_write_queue_tail(sk); + + if (skb) + TCP_SKB_CB(skb)->eor = 1; +} + static inline void tcp_push_pending_frames(struct sock *sk) { if (tcp_send_head(sk)) { @@ -2150,7 +2213,10 @@ void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb); +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th); +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff)); @@ -2239,7 +2305,8 @@ struct tcp_request_sock_ops { struct dst_entry *(*route_req)(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req); + struct request_sock *req, + u32 tw_isn); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, @@ -2324,21 +2391,15 @@ static inline void tcp_get_current_key(const struct sock *sk, static inline bool tcp_key_is_md5(const struct tcp_key *key) { -#ifdef CONFIG_TCP_MD5SIG - if (static_branch_unlikely(&tcp_md5_needed.key) && - key->type == TCP_KEY_MD5) - return true; -#endif + if (static_branch_tcp_md5()) + return key->type == TCP_KEY_MD5; return false; } static inline bool tcp_key_is_ao(const struct tcp_key *key) { -#ifdef CONFIG_TCP_AO - if (static_branch_unlikely(&tcp_ao_needed.key) && - key->type == TCP_KEY_AO) - return true; -#endif + if (static_branch_tcp_ao()) + return key->type == TCP_KEY_AO; return false; } @@ -2386,14 +2447,35 @@ void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); +static inline void tcp_warn_once(const struct sock *sk, bool cond, const char *str) +{ + WARN_ONCE(cond, + "%scwn:%u out:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", + str, + tcp_snd_cwnd(tcp_sk(sk)), + tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, + tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, + tcp_sk(sk)->tlp_high_seq, sk->sk_state, + inet_csk(sk)->icsk_ca_state, + tcp_sk(sk)->advmss, tcp_sk(sk)->mss_cache, + inet_csk(sk)->icsk_pmtu_cookie); +} + /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; - u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); - return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; + if (likely(skb)) { + u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); + + return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; + } else { + tcp_warn_once(sk, 1, "rtx queue empty: "); + return jiffies_to_usecs(rto); + } + } /* @@ -2534,6 +2616,11 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); +#ifdef CONFIG_BPF_STREAM_PARSER +struct strparser; +int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor); +#endif /* CONFIG_BPF_STREAM_PARSER */ #endif /* CONFIG_BPF_SYSCALL */ #ifdef CONFIG_INET @@ -2661,10 +2748,10 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } -static inline void tcp_bpf_rtt(struct sock *sk) +static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) - tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt); } #if IS_ENABLED(CONFIG_SMC) @@ -2750,66 +2837,9 @@ static inline bool tcp_ao_required(struct sock *sk, const void *saddr, return false; } -/* Called with rcu_read_lock() */ -static inline enum skb_drop_reason -tcp_inbound_hash(struct sock *sk, const struct request_sock *req, - const struct sk_buff *skb, - const void *saddr, const void *daddr, - int family, int dif, int sdif) -{ - const struct tcphdr *th = tcp_hdr(skb); - const struct tcp_ao_hdr *aoh; - const __u8 *md5_location; - int l3index; - - /* Invalid option or two times meet any of auth options */ - if (tcp_parse_auth_options(th, &md5_location, &aoh)) { - tcp_hash_fail("TCP segment has incorrect auth options set", - family, skb, ""); - return SKB_DROP_REASON_TCP_AUTH_HDR; - } - - if (req) { - if (tcp_rsk_used_ao(req) != !!aoh) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); - tcp_hash_fail("TCP connection can't start/end using TCP-AO", - family, skb, "%s", - !aoh ? "missing AO" : "AO signed"); - return SKB_DROP_REASON_TCP_AOFAILURE; - } - } - - /* sdif set, means packet ingressed via a device - * in an L3 domain and dif is set to the l3mdev - */ - l3index = sdif ? dif : 0; - - /* Fast path: unsigned segments */ - if (likely(!md5_location && !aoh)) { - /* Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid - * for the remote peer. On TCP-AO established connection - * the last key is impossible to remove, so there's - * always at least one current_key. - */ - if (tcp_ao_required(sk, saddr, family, l3index, true)) { - tcp_hash_fail("AO hash is required, but not found", - family, skb, "L3 index %d", l3index); - return SKB_DROP_REASON_TCP_AONOTFOUND; - } - if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); - tcp_hash_fail("MD5 Hash not found", - family, skb, "L3 index %d", l3index); - return SKB_DROP_REASON_TCP_MD5NOTFOUND; - } - return SKB_NOT_DROPPED_YET; - } - - if (aoh) - return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh); - - return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family, - l3index, md5_location); -} +enum skb_drop_reason tcp_inbound_hash(struct sock *sk, + const struct request_sock *req, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif); #endif /* _TCP_H */ |