diff options
Diffstat (limited to 'include/net/sock.h')
| -rw-r--r-- | include/net/sock.h | 427 |
1 files changed, 312 insertions, 115 deletions
diff --git a/include/net/sock.h b/include/net/sock.h index 7464e9f9f47c..aafe8bdb2c0f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -118,6 +118,7 @@ typedef __u64 __bitwise __addrpair; * @skc_reuseport: %SO_REUSEPORT setting * @skc_ipv6only: socket is IPV6 only * @skc_net_refcnt: socket is using net ref counting + * @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol @@ -174,6 +175,7 @@ struct sock_common { unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; unsigned char skc_net_refcnt:1; + unsigned char skc_bypass_prot_mem:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; @@ -249,6 +251,7 @@ struct sk_filter; * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy + * @psp_assoc: PSP association, if socket is PSP-secured * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags @@ -282,9 +285,11 @@ struct sk_filter; * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter + * @sk_drop_counters: optional pointer to numa_drop_counters * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_ino: inode number (zero if orphaned) * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting @@ -300,15 +305,19 @@ struct sk_filter; * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer + * @tcp_retransmit_timer: tcp retransmit timer + * @mptcp_retransmit_timer: mptcp retransmit timer * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags + * @sk_bpf_cb_flags: used in bpf_setsockopt() * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. * Sockets that can be used under memory reclaim should * set this to false. * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests + * @sk_tx_queue_mapping_jiffies: time in jiffies of last @sk_tx_queue_mapping refresh. * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. @@ -336,8 +345,16 @@ struct sk_filter; * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags + * @sk_scm_recv_flags: all flags used by scm_recv() + * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS + * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY + * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD + * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS + * @sk_scm_unused: unused flags for scm_recv() * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. + * @sk_owner: reference to the real owner of the socket that calls + * sock_lock_init_class_and_name(). */ struct sock { /* @@ -368,6 +385,7 @@ struct sock { #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only #define sk_net_refcnt __sk_common.skc_net_refcnt +#define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot @@ -434,10 +452,15 @@ struct sock { __cacheline_group_begin(sock_read_rxtx); int sk_err; struct socket *sk_socket; +#ifdef CONFIG_MEMCG struct mem_cgroup *sk_memcg; +#endif #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; +#endif __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -450,7 +473,7 @@ struct sock { __cacheline_group_begin(sock_write_tx); int sk_write_pending; atomic_t sk_omem_alloc; - int sk_sndbuf; + int sk_err_soft; int sk_wmem_queued; refcount_t sk_wmem_alloc; @@ -460,21 +483,28 @@ struct sock { struct rb_root tcp_rtx_queue; }; struct sk_buff_head sk_write_queue; - u32 sk_dst_pending_confirm; - u32 sk_pacing_status; /* see enum sk_pacing */ struct page_frag sk_frag; - struct timer_list sk_timer; - + union { + struct timer_list sk_timer; + struct timer_list tcp_retransmit_timer; + struct timer_list mptcp_retransmit_timer; + }; unsigned long sk_pacing_rate; /* bytes per second */ atomic_t sk_zckey; atomic_t sk_tskey; + unsigned long sk_tx_queue_mapping_jiffies; __cacheline_group_end(sock_write_tx); __cacheline_group_begin(sock_read_tx); + u32 sk_dst_pending_confirm; + u32 sk_pacing_status; /* see enum sk_pacing */ unsigned long sk_max_pacing_rate; long sk_sndtimeo; u32 sk_priority; u32 sk_mark; + kuid_t sk_uid; + u16 sk_protocol; + u16 sk_type; struct dst_entry __rcu *sk_dst_cache; netdev_features_t sk_route_caps; #ifdef CONFIG_SOCK_VALIDATE_XMIT @@ -487,6 +517,7 @@ struct sock { unsigned int sk_gso_max_size; gfp_t sk_allocation; u32 sk_txhash; + int sk_sndbuf; u8 sk_pacing_shift; bool sk_use_task_frag; __cacheline_group_end(sock_read_tx); @@ -500,15 +531,12 @@ struct sock { sk_no_check_tx : 1, sk_no_check_rx : 1; u8 sk_shutdown; - u16 sk_type; - u16 sk_protocol; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; - int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; - kuid_t sk_uid; + unsigned long sk_ino; spinlock_t sk_peer_lock; int sk_bind_phc; struct pid *sk_peer_pid; @@ -520,11 +548,23 @@ struct sock { #endif int sk_disconnects; - u8 sk_txrehash; + union { + u8 sk_txrehash; + u8 sk_scm_recv_flags; + struct { + u8 sk_scm_credentials : 1, + sk_scm_security : 1, + sk_scm_pidfd : 1, + sk_scm_rights : 1, + sk_scm_unused : 4; + }; + }; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG)) + u8 sk_bpf_cb_flags; void *sk_user_data; #ifdef CONFIG_SECURITY @@ -541,9 +581,14 @@ struct sock { #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif + struct numa_drop_counters *sk_drop_counters; struct rcu_head sk_rcu; netns_tracker ns_tracker; struct xarray sk_user_frags; + +#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) + struct module *sk_owner; +#endif }; struct sock_bh_locked { @@ -793,11 +838,9 @@ static inline bool sk_del_node_init(struct sock *sk) { bool rc = __sk_del_node_init(sk); - if (rc) { - /* paranoid for a while -acme */ - WARN_ON(refcount_read(&sk->sk_refcnt) == 1); + if (rc) __sock_put(sk); - } + return rc; } #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) @@ -815,14 +858,25 @@ static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) { bool rc = __sk_nulls_del_node_init_rcu(sk); - if (rc) { - /* paranoid for a while -acme */ - WARN_ON(refcount_read(&sk->sk_refcnt) == 1); + if (rc) __sock_put(sk); - } + return rc; } +static inline bool sk_nulls_replace_node_init_rcu(struct sock *old, + struct sock *new) +{ + if (sk_hashed(old)) { + hlist_nulls_replace_init_rcu(&old->sk_nulls_node, + &new->sk_nulls_node); + __sock_put(old); + return true; + } + + return false; +} + static inline void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); @@ -953,6 +1007,8 @@ enum sock_flags { SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ + SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */ + SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -1223,10 +1279,10 @@ struct proto { void (*close)(struct sock *sk, long timeout); int (*pre_connect)(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int addr_len); int (*connect)(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int addr_len); int (*disconnect)(struct sock *sk, int flags); @@ -1255,9 +1311,9 @@ struct proto { size_t len, int flags, int *addr_len); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, - struct sockaddr *addr, int addr_len); + struct sockaddr_unsized *addr, int addr_len); int (*bind_add)(struct sock *sk, - struct sockaddr *addr, int addr_len); + struct sockaddr_unsized *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); @@ -1283,10 +1339,6 @@ struct proto { unsigned int inuse_idx; #endif -#if IS_ENABLED(CONFIG_MPTCP) - int (*forward_alloc_get)(const struct sock *sk); -#endif - bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*sock_is_readable)(struct sock *sk); /* Memory pressure */ @@ -1321,8 +1373,6 @@ struct proto { unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ - unsigned int __percpu *orphan_count; - struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; @@ -1347,15 +1397,6 @@ int sock_load_diag_module(int family, int protocol); INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); -static inline int sk_forward_alloc_get(const struct sock *sk) -{ -#if IS_ENABLED(CONFIG_MPTCP) - if (sk->sk_prot->forward_alloc_get) - return sk->sk_prot->forward_alloc_get(sk); -#endif - return READ_ONCE(sk->sk_forward_alloc); -} - static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) @@ -1472,6 +1513,10 @@ static inline int __sk_prot_rehash(struct sock *sk) #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 +/** + * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time + */ +#define SOCK_CONNECT_BIND 16 struct socket_alloc { struct socket socket; @@ -1527,7 +1572,7 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size) } static inline bool -sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) { int delta; @@ -1535,7 +1580,13 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || - skb_pfmemalloc(skb); + pfmemalloc; +} + +static inline bool +sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size) +{ + return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } static inline int sk_unused_reserved_mem(const struct sock *sk) @@ -1585,6 +1636,37 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) sk_mem_reclaim(sk); } +void __sk_charge(struct sock *sk, gfp_t gfp); + +#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) +static inline void sk_owner_set(struct sock *sk, struct module *owner) +{ + __module_get(owner); + sk->sk_owner = owner; +} + +static inline void sk_owner_clear(struct sock *sk) +{ + sk->sk_owner = NULL; +} + +static inline void sk_owner_put(struct sock *sk) +{ + module_put(sk->sk_owner); +} +#else +static inline void sk_owner_set(struct sock *sk, struct module *owner) +{ +} + +static inline void sk_owner_clear(struct sock *sk) +{ +} + +static inline void sk_owner_put(struct sock *sk) +{ +} +#endif /* * Macro so as to not evaluate some arguments when * lockdep is not enabled. @@ -1594,13 +1676,14 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) */ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ + sk_owner_set(sk, THIS_MODULE); \ sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ - sizeof((sk)->sk_lock)); \ + sizeof((sk)->sk_lock)); \ lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ - (skey), (sname)); \ + (skey), (sname)); \ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ } while (0) @@ -1744,9 +1827,14 @@ static inline bool sock_allow_reclassification(const struct sock *csk) struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); +void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); -struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); -void sk_free_unlock_clone(struct sock *sk); +struct sock *sk_clone(const struct sock *sk, const gfp_t priority, bool lock); + +static inline struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) +{ + return sk_clone(sk, priority, true); +} struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); @@ -1798,6 +1886,8 @@ static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk, } void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); @@ -1814,13 +1904,17 @@ struct sockcm_cookie { u32 mark; u32 tsflags; u32 ts_opt_id; + u32 priority; + u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { - .tsflags = READ_ONCE(sk->sk_tsflags) + .mark = READ_ONCE(sk->sk_mark), + .tsflags = READ_ONCE(sk->sk_tsflags), + .priority = READ_ONCE(sk->sk_priority), }; } @@ -1833,8 +1927,8 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. */ -int sock_no_bind(struct socket *, struct sockaddr *, int); -int sock_no_connect(struct socket *, struct sockaddr *, int, int); +int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len); +int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); @@ -1924,7 +2018,15 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. */ - WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); + if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) { + WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); + WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies); + return; + } + + /* Refresh sk_tx_queue_mapping_jiffies if too old. */ + if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ)) + WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies); } #define NO_QUEUE_MAPPING USHRT_MAX @@ -1937,19 +2039,7 @@ static inline void sk_tx_queue_clear(struct sock *sk) WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING); } -static inline int sk_tx_queue_get(const struct sock *sk) -{ - if (sk) { - /* Paired with WRITE_ONCE() in sk_tx_queue_clear() - * and sk_tx_queue_set(). - */ - int val = READ_ONCE(sk->sk_tx_queue_mapping); - - if (val != NO_QUEUE_MAPPING) - return val; - } - return -1; -} +int sk_tx_queue_get(const struct sock *sk); static inline void __sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb, @@ -2000,6 +2090,13 @@ static inline int sk_rx_queue_get(const struct sock *sk) static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk->sk_socket = sock; + if (sock) { + WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid); + WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino); + } else { + /* Note: sk_uid is unchanged. */ + WRITE_ONCE(sk->sk_ino, 0); + } } static inline wait_queue_head_t *sk_sleep(struct sock *sk) @@ -2030,18 +2127,25 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); - sk->sk_uid = SOCK_INODE(parent)->i_uid; security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } -kuid_t sock_i_uid(struct sock *sk); -unsigned long __sock_i_ino(struct sock *sk); -unsigned long sock_i_ino(struct sock *sk); +static inline unsigned long sock_i_ino(const struct sock *sk) +{ + /* Paired with WRITE_ONCE() in sock_graft() and sock_orphan() */ + return READ_ONCE(sk->sk_ino); +} + +static inline kuid_t sk_uid(const struct sock *sk) +{ + /* Paired with WRITE_ONCE() in sockfs_setattr() */ + return READ_ONCE(sk->sk_uid); +} static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { - return sk ? sk->sk_uid : make_kuid(net->user_ns, 0); + return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0); } static inline u32 net_tx_rndhash(void) @@ -2221,6 +2325,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro return 0; } +#define SK_WMEM_ALLOC_BIAS 1 /** * sk_wmem_alloc_get - returns write allocations * @sk: socket @@ -2229,7 +2334,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro */ static inline int sk_wmem_alloc_get(const struct sock *sk) { - return refcount_read(&sk->sk_wmem_alloc) - 1; + return refcount_read(&sk->sk_wmem_alloc) - SK_WMEM_ALLOC_BIAS; } /** @@ -2291,7 +2396,7 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) } /** - * sock_poll_wait - place memory barrier behind the poll_wait call. + * sock_poll_wait - wrapper for the poll_wait call. * @filp: file * @sock: socket to wait on * @p: poll_table @@ -2301,15 +2406,12 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) static inline void sock_poll_wait(struct file *filp, struct socket *sock, poll_table *p) { - if (!poll_does_not_wait(p)) { - poll_wait(filp, &sock->wq.wait, p); - /* We need to be sure we are in sync with the - * socket flags modification. - * - * This memory barrier is paired in the wq_has_sleeper. - */ - smp_mb(); - } + /* Provides a barrier we need to be sure we are in sync + * with the socket flags modification. + * + * This memory barrier is paired in the wq_has_sleeper. + */ + poll_wait(filp, &sock->wq.wait, p); } static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) @@ -2517,12 +2619,16 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); +static inline bool __sock_writeable(const struct sock *sk, int wmem_alloc) +{ + return wmem_alloc < (READ_ONCE(sk->sk_sndbuf) >> 1); +} /* * Default write policy as shown to user space via poll/select/SIGIO */ static inline bool sock_writeable(const struct sock *sk) { - return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1); + return __sock_writeable(sk, refcount_read(&sk->sk_wmem_alloc)); } static inline gfp_t gfp_any(void) @@ -2535,14 +2641,62 @@ static inline gfp_t gfp_memcg_charge(void) return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) +{ + return sk->sk_memcg; +} + +static inline bool mem_cgroup_sk_enabled(const struct sock *sk) +{ + return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk); +} + +static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) +{ + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + +#ifdef CONFIG_MEMCG_V1 + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return !!memcg->tcpmem_pressure; +#endif /* CONFIG_MEMCG_V1 */ + + do { + if (time_before64(get_jiffies_64(), + mem_cgroup_get_socket_pressure(memcg))) { + memcg_memory_event(mem_cgroup_from_sk(sk), + MEMCG_SOCK_THROTTLED); + return true; + } + } while ((memcg = parent_mem_cgroup(memcg))); + + return false; +} +#else +static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) +{ + return NULL; +} + +static inline bool mem_cgroup_sk_enabled(const struct sock *sk) +{ + return false; +} + +static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) +{ + return false; +} +#endif + static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { - return noblock ? 0 : sk->sk_rcvtimeo; + return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo); } static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { - return noblock ? 0 : sk->sk_sndtimeo; + return noblock ? 0 : READ_ONCE(sk->sk_sndtimeo); } static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) @@ -2568,8 +2722,8 @@ struct sock_skb_cb { * using skb->cb[] would keep using it directly and utilize its * alignment guarantee. */ -#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ - sizeof(struct sock_skb_cb))) +#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \ + sizeof(struct sock_skb_cb)) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) @@ -2577,18 +2731,53 @@ struct sock_skb_cb { #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) +static inline void sk_drops_add(struct sock *sk, int segs) +{ + struct numa_drop_counters *ndc = sk->sk_drop_counters; + + if (ndc) + numa_drop_add(ndc, segs); + else + atomic_add(segs, &sk->sk_drops); +} + +static inline void sk_drops_inc(struct sock *sk) +{ + sk_drops_add(sk, 1); +} + +static inline int sk_drops_read(const struct sock *sk) +{ + const struct numa_drop_counters *ndc = sk->sk_drop_counters; + + if (ndc) { + DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops)); + return numa_drop_read(ndc); + } + return atomic_read(&sk->sk_drops); +} + +static inline void sk_drops_reset(struct sock *sk) +{ + struct numa_drop_counters *ndc = sk->sk_drop_counters; + + if (ndc) + numa_drop_reset(ndc); + atomic_set(&sk->sk_drops, 0); +} + static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? - atomic_read(&sk->sk_drops) : 0; + sk_drops_read(sk) : 0; } -static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) +static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); - atomic_add(segs, &sk->sk_drops); + sk_drops_add(sk, segs); } static inline ktime_t sock_read_timestamp(struct sock *sk) @@ -2624,6 +2813,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); +bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk); +int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, + struct timespec64 *ts); + static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { @@ -2658,12 +2851,13 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ - (1UL << SOCK_RCVMARK)) + (1UL << SOCK_RCVMARK) | \ + (1UL << SOCK_RCVPRIORITY) | \ + (1UL << SOCK_TIMESTAMPING_ANY)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) - if (sk->sk_flags & FLAGS_RECV_CMSGS || - READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY) + if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); @@ -2698,8 +2892,6 @@ static inline void _sock_tx_timestamp(struct sock *sk, *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } } - if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) - *tx_flags |= SKBTX_WIFI_STATUS; } static inline void sock_tx_timestamp(struct sock *sk, @@ -2737,9 +2929,14 @@ static inline bool sk_is_udp(const struct sock *sk) sk->sk_protocol == IPPROTO_UDP; } +static inline bool sk_is_unix(const struct sock *sk) +{ + return sk->sk_family == AF_UNIX; +} + static inline bool sk_is_stream_unix(const struct sock *sk) { - return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM; + return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM; } static inline bool sk_is_vsock(const struct sock *sk) @@ -2747,6 +2944,13 @@ static inline bool sk_is_vsock(const struct sock *sk) return sk->sk_family == AF_VSOCK; } +static inline bool sk_may_scm_recv(const struct sock *sk) +{ + return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) || + sk->sk_family == AF_NETLINK || + (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH); +} + /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from @@ -2786,26 +2990,10 @@ sk_is_refcounted(struct sock *sk) return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } -/* Checks if this SKB belongs to an HW offloaded socket - * and whether any SW fallbacks are required based on dev. - * Check decrypted mark in case skb_orphan() cleared socket. - */ -static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, - struct net_device *dev) +static inline bool +sk_requests_wifi_status(struct sock *sk) { -#ifdef CONFIG_SOCK_VALIDATE_XMIT - struct sock *sk = skb->sk; - - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { - skb = sk->sk_validate_xmit_skb(sk, dev, skb); - } else if (unlikely(skb_is_decrypted(skb))) { - pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); - kfree_skb(skb); - skb = NULL; - } -#endif - - return skb; + return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV @@ -2844,8 +3032,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo); */ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_WMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; @@ -2912,7 +3100,13 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool); int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); -void sock_enable_timestamps(struct sock *sk); +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op); +#else +static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ +} +#endif void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); @@ -2922,7 +3116,7 @@ void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); -int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); +int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int sock_get_timeout(long timeo, void *optval, bool old_timeval); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, @@ -2933,8 +3127,11 @@ int sock_ioctl_inout(struct sock *sk, unsigned int cmd, int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); static inline bool sk_is_readable(struct sock *sk) { - if (sk->sk_prot->sock_is_readable) - return sk->sk_prot->sock_is_readable(sk); + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot->sock_is_readable) + return prot->sock_is_readable(sk); + return false; } #endif /* _SOCK_H */ |
