Diffstat (limited to 'include/net/sock.h')
-rw-r--r--	include/net/sock.h	427
1 file changed, 312 insertions(+), 115 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index 7464e9f9f47c..aafe8bdb2c0f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -118,6 +118,7 @@ typedef __u64 __bitwise __addrpair;
* @skc_reuseport: %SO_REUSEPORT setting
* @skc_ipv6only: socket is IPV6 only
* @skc_net_refcnt: socket is using net ref counting
+ * @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb
* @skc_bound_dev_if: bound device index if != 0
* @skc_bind_node: bind hash linkage for various protocol lookup tables
* @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
@@ -174,6 +175,7 @@ struct sock_common {
unsigned char skc_reuseport:1;
unsigned char skc_ipv6only:1;
unsigned char skc_net_refcnt:1;
+ unsigned char skc_bypass_prot_mem:1;
int skc_bound_dev_if;
union {
struct hlist_node skc_bind_node;
@@ -249,6 +251,7 @@ struct sk_filter;
* @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
+ * @psp_assoc: PSP association, if socket is PSP-secured
* @sk_receive_queue: incoming packets
* @sk_wmem_alloc: transmit queue bytes committed
* @sk_tsq_flags: TCP Small Queues flags
@@ -282,9 +285,11 @@ struct sk_filter;
* @sk_err_soft: errors that don't cause failure but are the cause of a
* persistent failure not just 'timed out'
* @sk_drops: raw/udp drops counter
+ * @sk_drop_counters: optional pointer to numa_drop_counters
* @sk_ack_backlog: current listen backlog
* @sk_max_ack_backlog: listen backlog set in listen()
* @sk_uid: user id of owner
+ * @sk_ino: inode number (zero if orphaned)
* @sk_prefer_busy_poll: prefer busypolling over softirq processing
* @sk_busy_poll_budget: napi processing budget when busypolling
* @sk_priority: %SO_PRIORITY setting
@@ -300,15 +305,19 @@ struct sk_filter;
* @sk_txrehash: enable TX hash rethink
* @sk_filter: socket filtering instructions
* @sk_timer: sock cleanup timer
+ * @tcp_retransmit_timer: tcp retransmit timer
+ * @mptcp_retransmit_timer: mptcp retransmit timer
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags
+ * @sk_bpf_cb_flags: used in bpf_setsockopt()
* @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
* Sockets that can be used under memory reclaim should
* set this to false.
* @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
* for timestamping
* @sk_tskey: counter to disambiguate concurrent tstamp requests
+ * @sk_tx_queue_mapping_jiffies: time in jiffies of last @sk_tx_queue_mapping refresh.
* @sk_zckey: counter to order MSG_ZEROCOPY notifications
* @sk_socket: Identd and reporting IO signals
* @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
@@ -336,8 +345,16 @@ struct sk_filter;
* @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
+ * @sk_scm_recv_flags: all flags used by scm_recv()
+ * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS
+ * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY
+ * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD
+ * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS
+ * @sk_scm_unused: unused flags for scm_recv()
* @ns_tracker: tracker for netns reference
* @sk_user_frags: xarray of pages the user is holding a reference on.
+ * @sk_owner: reference to the real owner of the socket that calls
+ * sock_lock_init_class_and_name().
*/
struct sock {
/*
@@ -368,6 +385,7 @@ struct sock {
#define sk_reuseport __sk_common.skc_reuseport
#define sk_ipv6only __sk_common.skc_ipv6only
#define sk_net_refcnt __sk_common.skc_net_refcnt
+#define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
#define sk_bind_node __sk_common.skc_bind_node
#define sk_prot __sk_common.skc_prot
@@ -434,10 +452,15 @@ struct sock {
__cacheline_group_begin(sock_read_rxtx);
int sk_err;
struct socket *sk_socket;
+#ifdef CONFIG_MEMCG
struct mem_cgroup *sk_memcg;
+#endif
#ifdef CONFIG_XFRM
struct xfrm_policy __rcu *sk_policy[2];
#endif
+#if IS_ENABLED(CONFIG_INET_PSP)
+ struct psp_assoc __rcu *psp_assoc;
+#endif
__cacheline_group_end(sock_read_rxtx);
__cacheline_group_begin(sock_write_rxtx);
@@ -450,7 +473,7 @@ struct sock {
__cacheline_group_begin(sock_write_tx);
int sk_write_pending;
atomic_t sk_omem_alloc;
- int sk_sndbuf;
+ int sk_err_soft;
int sk_wmem_queued;
refcount_t sk_wmem_alloc;
@@ -460,21 +483,28 @@ struct sock {
struct rb_root tcp_rtx_queue;
};
struct sk_buff_head sk_write_queue;
- u32 sk_dst_pending_confirm;
- u32 sk_pacing_status; /* see enum sk_pacing */
struct page_frag sk_frag;
- struct timer_list sk_timer;
-
+ union {
+ struct timer_list sk_timer;
+ struct timer_list tcp_retransmit_timer;
+ struct timer_list mptcp_retransmit_timer;
+ };
unsigned long sk_pacing_rate; /* bytes per second */
atomic_t sk_zckey;
atomic_t sk_tskey;
+ unsigned long sk_tx_queue_mapping_jiffies;
__cacheline_group_end(sock_write_tx);
__cacheline_group_begin(sock_read_tx);
+ u32 sk_dst_pending_confirm;
+ u32 sk_pacing_status; /* see enum sk_pacing */
unsigned long sk_max_pacing_rate;
long sk_sndtimeo;
u32 sk_priority;
u32 sk_mark;
+ kuid_t sk_uid;
+ u16 sk_protocol;
+ u16 sk_type;
struct dst_entry __rcu *sk_dst_cache;
netdev_features_t sk_route_caps;
#ifdef CONFIG_SOCK_VALIDATE_XMIT
@@ -487,6 +517,7 @@ struct sock {
unsigned int sk_gso_max_size;
gfp_t sk_allocation;
u32 sk_txhash;
+ int sk_sndbuf;
u8 sk_pacing_shift;
bool sk_use_task_frag;
__cacheline_group_end(sock_read_tx);
@@ -500,15 +531,12 @@ struct sock {
sk_no_check_tx : 1,
sk_no_check_rx : 1;
u8 sk_shutdown;
- u16 sk_type;
- u16 sk_protocol;
unsigned long sk_lingertime;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
- int sk_err_soft;
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
- kuid_t sk_uid;
+ unsigned long sk_ino;
spinlock_t sk_peer_lock;
int sk_bind_phc;
struct pid *sk_peer_pid;
@@ -520,11 +548,23 @@ struct sock {
#endif
int sk_disconnects;
- u8 sk_txrehash;
+ union {
+ u8 sk_txrehash;
+ u8 sk_scm_recv_flags;
+ struct {
+ u8 sk_scm_credentials : 1,
+ sk_scm_security : 1,
+ sk_scm_pidfd : 1,
+ sk_scm_rights : 1,
+ sk_scm_unused : 4;
+ };
+ };
u8 sk_clockid;
u8 sk_txtime_deadline_mode : 1,
sk_txtime_report_errors : 1,
sk_txtime_unused : 6;
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
+ u8 sk_bpf_cb_flags;
void *sk_user_data;
#ifdef CONFIG_SECURITY
@@ -541,9 +581,14 @@ struct sock {
#ifdef CONFIG_BPF_SYSCALL
struct bpf_local_storage __rcu *sk_bpf_storage;
#endif
+ struct numa_drop_counters *sk_drop_counters;
struct rcu_head sk_rcu;
netns_tracker ns_tracker;
struct xarray sk_user_frags;
+
+#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
+ struct module *sk_owner;
+#endif
};
struct sock_bh_locked {
@@ -793,11 +838,9 @@ static inline bool sk_del_node_init(struct sock *sk)
{
bool rc = __sk_del_node_init(sk);
- if (rc) {
- /* paranoid for a while -acme */
- WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
+ if (rc)
__sock_put(sk);
- }
+
return rc;
}
#define sk_del_node_init_rcu(sk) sk_del_node_init(sk)
@@ -815,14 +858,25 @@ static inline bool sk_nulls_del_node_init_rcu(struct sock *sk)
{
bool rc = __sk_nulls_del_node_init_rcu(sk);
- if (rc) {
- /* paranoid for a while -acme */
- WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
+ if (rc)
__sock_put(sk);
- }
+
return rc;
}
+static inline bool sk_nulls_replace_node_init_rcu(struct sock *old,
+ struct sock *new)
+{
+ if (sk_hashed(old)) {
+ hlist_nulls_replace_init_rcu(&old->sk_nulls_node,
+ &new->sk_nulls_node);
+ __sock_put(old);
+ return true;
+ }
+
+ return false;
+}
+
static inline void __sk_add_node(struct sock *sk, struct hlist_head *list)
{
hlist_add_head(&sk->sk_node, list);
@@ -953,6 +1007,8 @@ enum sock_flags {
SOCK_XDP, /* XDP is attached */
SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */
+ SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */
+ SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */
};
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
@@ -1223,10 +1279,10 @@ struct proto {
void (*close)(struct sock *sk,
long timeout);
int (*pre_connect)(struct sock *sk,
- struct sockaddr *uaddr,
+ struct sockaddr_unsized *uaddr,
int addr_len);
int (*connect)(struct sock *sk,
- struct sockaddr *uaddr,
+ struct sockaddr_unsized *uaddr,
int addr_len);
int (*disconnect)(struct sock *sk, int flags);
@@ -1255,9 +1311,9 @@ struct proto {
size_t len, int flags, int *addr_len);
void (*splice_eof)(struct socket *sock);
int (*bind)(struct sock *sk,
- struct sockaddr *addr, int addr_len);
+ struct sockaddr_unsized *addr, int addr_len);
int (*bind_add)(struct sock *sk,
- struct sockaddr *addr, int addr_len);
+ struct sockaddr_unsized *addr, int addr_len);
int (*backlog_rcv) (struct sock *sk,
struct sk_buff *skb);
@@ -1283,10 +1339,6 @@ struct proto {
unsigned int inuse_idx;
#endif
-#if IS_ENABLED(CONFIG_MPTCP)
- int (*forward_alloc_get)(const struct sock *sk);
-#endif
-
bool (*stream_memory_free)(const struct sock *sk, int wake);
bool (*sock_is_readable)(struct sock *sk);
/* Memory pressure */
@@ -1321,8 +1373,6 @@ struct proto {
unsigned int useroffset; /* Usercopy region offset */
unsigned int usersize; /* Usercopy region size */
- unsigned int __percpu *orphan_count;
-
struct request_sock_ops *rsk_prot;
struct timewait_sock_ops *twsk_prot;
@@ -1347,15 +1397,6 @@ int sock_load_diag_module(int family, int protocol);
INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));
-static inline int sk_forward_alloc_get(const struct sock *sk)
-{
-#if IS_ENABLED(CONFIG_MPTCP)
- if (sk->sk_prot->forward_alloc_get)
- return sk->sk_prot->forward_alloc_get(sk);
-#endif
- return READ_ONCE(sk->sk_forward_alloc);
-}
-
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
@@ -1472,6 +1513,10 @@ static inline int __sk_prot_rehash(struct sock *sk)
#define SOCK_BINDADDR_LOCK 4
#define SOCK_BINDPORT_LOCK 8
+/**
+ * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time
+ */
+#define SOCK_CONNECT_BIND 16
struct socket_alloc {
struct socket socket;
@@ -1527,7 +1572,7 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size)
}
static inline bool
-sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
+__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc)
{
int delta;
@@ -1535,7 +1580,13 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
return true;
delta = size - sk->sk_forward_alloc;
return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) ||
- skb_pfmemalloc(skb);
+ pfmemalloc;
+}
+
+static inline bool
+sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size)
+{
+ return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb));
}
static inline int sk_unused_reserved_mem(const struct sock *sk)
@@ -1585,6 +1636,37 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
sk_mem_reclaim(sk);
}
+void __sk_charge(struct sock *sk, gfp_t gfp);
+
+#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
+static inline void sk_owner_set(struct sock *sk, struct module *owner)
+{
+ __module_get(owner);
+ sk->sk_owner = owner;
+}
+
+static inline void sk_owner_clear(struct sock *sk)
+{
+ sk->sk_owner = NULL;
+}
+
+static inline void sk_owner_put(struct sock *sk)
+{
+ module_put(sk->sk_owner);
+}
+#else
+static inline void sk_owner_set(struct sock *sk, struct module *owner)
+{
+}
+
+static inline void sk_owner_clear(struct sock *sk)
+{
+}
+
+static inline void sk_owner_put(struct sock *sk)
+{
+}
+#endif
/*
* Macro so as to not evaluate some arguments when
* lockdep is not enabled.
@@ -1594,13 +1676,14 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
*/
#define sock_lock_init_class_and_name(sk, sname, skey, name, key) \
do { \
+ sk_owner_set(sk, THIS_MODULE); \
sk->sk_lock.owned = 0; \
init_waitqueue_head(&sk->sk_lock.wq); \
spin_lock_init(&(sk)->sk_lock.slock); \
debug_check_no_locks_freed((void *)&(sk)->sk_lock, \
- sizeof((sk)->sk_lock)); \
+ sizeof((sk)->sk_lock)); \
lockdep_set_class_and_name(&(sk)->sk_lock.slock, \
- (skey), (sname)); \
+ (skey), (sname)); \
lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
} while (0)
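The macro now pins the module that initializes the lock class, so a protocol module's lockdep keys cannot vanish while live sockets still reference them; sk_owner_put() presumably drops the reference when the socket is freed. A minimal, hypothetical usage sketch (the protocol and key names are illustrative, not part of this patch):

	static struct lock_class_key myproto_slock_key, myproto_sk_key;

	static void myproto_lock_init(struct sock *sk)
	{
		/* With CONFIG_PROVE_LOCKING && CONFIG_MODULES this also takes
		 * a module reference via sk_owner_set(sk, THIS_MODULE).
		 */
		sock_lock_init_class_and_name(sk,
					      "slock-AF_MYPROTO", &myproto_slock_key,
					      "sk_lock-AF_MYPROTO", &myproto_sk_key);
	}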
@@ -1744,9 +1827,14 @@ static inline bool sock_allow_reclassification(const struct sock *csk)
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot, int kern);
void sk_free(struct sock *sk);
+void sk_net_refcnt_upgrade(struct sock *sk);
void sk_destruct(struct sock *sk);
-struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
-void sk_free_unlock_clone(struct sock *sk);
+struct sock *sk_clone(const struct sock *sk, const gfp_t priority, bool lock);
+
+static inline struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
+{
+ return sk_clone(sk, priority, true);
+}
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority);
@@ -1798,6 +1886,8 @@ static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk,
}
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
+void *sock_kmemdup(struct sock *sk, const void *src,
+ int size, gfp_t priority);
void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
void sk_send_sigurg(struct sock *sk);
@@ -1814,13 +1904,17 @@ struct sockcm_cookie {
u32 mark;
u32 tsflags;
u32 ts_opt_id;
+ u32 priority;
+ u32 dmabuf_id;
};
static inline void sockcm_init(struct sockcm_cookie *sockc,
const struct sock *sk)
{
*sockc = (struct sockcm_cookie) {
- .tsflags = READ_ONCE(sk->sk_tsflags)
+ .mark = READ_ONCE(sk->sk_mark),
+ .tsflags = READ_ONCE(sk->sk_tsflags),
+ .priority = READ_ONCE(sk->sk_priority),
};
}
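Since sockcm_init() now seeds mark and priority from the socket as well, per-message control messages only have to override those defaults. A rough sketch of the expected sendmsg-side pattern (error handling trimmed, using the existing sock_cmsg_send() API):

	struct sockcm_cookie sockc;
	int err;

	sockcm_init(&sockc, sk);	/* defaults: sk_mark, sk_tsflags, sk_priority */
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);	/* cmsgs override the defaults */
		if (unlikely(err))
			return err;
	}
	/* build skbs using sockc.mark / sockc.priority / sockc.tsflags */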
@@ -1833,8 +1927,8 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
* Functions to fill in entries in struct proto_ops when a protocol
* does not implement a particular function.
*/
-int sock_no_bind(struct socket *, struct sockaddr *, int);
-int sock_no_connect(struct socket *, struct sockaddr *, int, int);
+int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len);
+int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags);
int sock_no_socketpair(struct socket *, struct socket *);
int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *);
int sock_no_getname(struct socket *, struct sockaddr *, int);
@@ -1924,7 +2018,15 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
/* Paired with READ_ONCE() in sk_tx_queue_get() and
* other WRITE_ONCE() because socket lock might be not held.
*/
- WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
+ if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) {
+ WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
+ WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
+ return;
+ }
+
+ /* Refresh sk_tx_queue_mapping_jiffies if too old. */
+ if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ))
+ WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
}
#define NO_QUEUE_MAPPING USHRT_MAX
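sk_tx_queue_get() moves out of line below; the new timestamp lets a consumer distrust a cached queue mapping that has not been refreshed recently. A hypothetical consumer-side check (the staleness threshold here is illustrative, not taken from this patch):

	int queue = READ_ONCE(sk->sk_tx_queue_mapping);

	if (queue != NO_QUEUE_MAPPING) {
		unsigned long stamp = READ_ONCE(sk->sk_tx_queue_mapping_jiffies);

		/* sk_tx_queue_set() refreshes the stamp at most once per HZ,
		 * so a recent stamp means the mapping is still in active use.
		 */
		if (time_is_after_jiffies(stamp + 5 * HZ))
			return queue;
	}
	return -1;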
@@ -1937,19 +2039,7 @@ static inline void sk_tx_queue_clear(struct sock *sk)
WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
}
-static inline int sk_tx_queue_get(const struct sock *sk)
-{
- if (sk) {
- /* Paired with WRITE_ONCE() in sk_tx_queue_clear()
- * and sk_tx_queue_set().
- */
- int val = READ_ONCE(sk->sk_tx_queue_mapping);
-
- if (val != NO_QUEUE_MAPPING)
- return val;
- }
- return -1;
-}
+int sk_tx_queue_get(const struct sock *sk);
static inline void __sk_rx_queue_set(struct sock *sk,
const struct sk_buff *skb,
@@ -2000,6 +2090,13 @@ static inline int sk_rx_queue_get(const struct sock *sk)
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
sk->sk_socket = sock;
+ if (sock) {
+ WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid);
+ WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino);
+ } else {
+ /* Note: sk_uid is unchanged. */
+ WRITE_ONCE(sk->sk_ino, 0);
+ }
}
static inline wait_queue_head_t *sk_sleep(struct sock *sk)
@@ -2030,18 +2127,25 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
rcu_assign_pointer(sk->sk_wq, &parent->wq);
parent->sk = sk;
sk_set_socket(sk, parent);
- sk->sk_uid = SOCK_INODE(parent)->i_uid;
security_sock_graft(sk, parent);
write_unlock_bh(&sk->sk_callback_lock);
}
-kuid_t sock_i_uid(struct sock *sk);
-unsigned long __sock_i_ino(struct sock *sk);
-unsigned long sock_i_ino(struct sock *sk);
+static inline unsigned long sock_i_ino(const struct sock *sk)
+{
+ /* Paired with WRITE_ONCE() in sock_graft() and sock_orphan() */
+ return READ_ONCE(sk->sk_ino);
+}
+
+static inline kuid_t sk_uid(const struct sock *sk)
+{
+ /* Paired with WRITE_ONCE() in sockfs_setattr() */
+ return READ_ONCE(sk->sk_uid);
+}
static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
{
- return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+ return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0);
}
static inline u32 net_tx_rndhash(void)
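With sk_uid() and sock_i_ino() reduced to READ_ONCE() loads, diag/procfs code can report both without taking sk_callback_lock. An illustrative seq_file-style read (the surrounding dumper is hypothetical):

	seq_printf(seq, "%5u %8lu\n",
		   from_kuid_munged(seq_user_ns(seq), sk_uid(sk)),
		   sock_i_ino(sk));	/* 0 once the socket is orphaned */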
@@ -2221,6 +2325,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
return 0;
}
+#define SK_WMEM_ALLOC_BIAS 1
/**
* sk_wmem_alloc_get - returns write allocations
* @sk: socket
@@ -2229,7 +2334,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
*/
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
- return refcount_read(&sk->sk_wmem_alloc) - 1;
+ return refcount_read(&sk->sk_wmem_alloc) - SK_WMEM_ALLOC_BIAS;
}
/**
@@ -2291,7 +2396,7 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq)
}
/**
- * sock_poll_wait - place memory barrier behind the poll_wait call.
+ * sock_poll_wait - wrapper for the poll_wait call.
* @filp: file
* @sock: socket to wait on
* @p: poll_table
@@ -2301,15 +2406,12 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq)
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
poll_table *p)
{
- if (!poll_does_not_wait(p)) {
- poll_wait(filp, &sock->wq.wait, p);
- /* We need to be sure we are in sync with the
- * socket flags modification.
- *
- * This memory barrier is paired in the wq_has_sleeper.
- */
- smp_mb();
- }
+ /* Provides a barrier we need to be sure we are in sync
+ * with the socket flags modification.
+ *
+ * This memory barrier is paired in the wq_has_sleeper.
+ */
+ poll_wait(filp, &sock->wq.wait, p);
}
static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
@@ -2517,12 +2619,16 @@ static inline struct page_frag *sk_page_frag(struct sock *sk)
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
+static inline bool __sock_writeable(const struct sock *sk, int wmem_alloc)
+{
+ return wmem_alloc < (READ_ONCE(sk->sk_sndbuf) >> 1);
+}
/*
* Default write policy as shown to user space via poll/select/SIGIO
*/
static inline bool sock_writeable(const struct sock *sk)
{
- return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
+ return __sock_writeable(sk, refcount_read(&sk->sk_wmem_alloc));
}
static inline gfp_t gfp_any(void)
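Factoring out __sock_writeable() lets a caller reuse an sk_wmem_alloc value it already sampled instead of re-reading the refcount. Sketch of the intended pattern (poll-style caller, illustrative):

	int wmem = refcount_read(&sk->sk_wmem_alloc);

	if (__sock_writeable(sk, wmem))
		mask |= EPOLLOUT | EPOLLWRNORM;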
@@ -2535,14 +2641,62 @@ static inline gfp_t gfp_memcg_charge(void)
return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
}
+#ifdef CONFIG_MEMCG
+static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
+{
+ return sk->sk_memcg;
+}
+
+static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
+{
+ return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk);
+}
+
+static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);
+
+#ifdef CONFIG_MEMCG_V1
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return !!memcg->tcpmem_pressure;
+#endif /* CONFIG_MEMCG_V1 */
+
+ do {
+ if (time_before64(get_jiffies_64(),
+ mem_cgroup_get_socket_pressure(memcg))) {
+ memcg_memory_event(mem_cgroup_from_sk(sk),
+ MEMCG_SOCK_THROTTLED);
+ return true;
+ }
+ } while ((memcg = parent_mem_cgroup(memcg)));
+
+ return false;
+}
+#else
+static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
+{
+ return NULL;
+}
+
+static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
+{
+ return false;
+}
+
+static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
+{
+ return false;
+}
+#endif
+
static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
- return noblock ? 0 : sk->sk_rcvtimeo;
+ return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo);
}
static inline long sock_sndtimeo(const struct sock *sk, bool noblock)
{
- return noblock ? 0 : sk->sk_sndtimeo;
+ return noblock ? 0 : READ_ONCE(sk->sk_sndtimeo);
}
static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
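The new helpers keep the memcg plumbing behind CONFIG_MEMCG so callers stay #ifdef-free. Roughly how sk_under_memory_pressure() (defined elsewhere in this header) is expected to combine them with the per-protocol state (reconstructed for illustration, not part of this hunk):

	static inline bool sk_under_memory_pressure(const struct sock *sk)
	{
		if (!sk->sk_prot->memory_pressure)
			return false;

		if (mem_cgroup_sk_enabled(sk) &&
		    mem_cgroup_sk_under_memory_pressure(sk))
			return true;

		return !!READ_ONCE(*sk->sk_prot->memory_pressure);
	}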
@@ -2568,8 +2722,8 @@ struct sock_skb_cb {
* using skb->cb[] would keep using it directly and utilize its
* alignment guarantee.
*/
-#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
- sizeof(struct sock_skb_cb)))
+#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \
+ sizeof(struct sock_skb_cb))
#define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
SOCK_SKB_CB_OFFSET))
@@ -2577,18 +2731,53 @@ struct sock_skb_cb {
#define sock_skb_cb_check_size(size) \
BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET)
+static inline void sk_drops_add(struct sock *sk, int segs)
+{
+ struct numa_drop_counters *ndc = sk->sk_drop_counters;
+
+ if (ndc)
+ numa_drop_add(ndc, segs);
+ else
+ atomic_add(segs, &sk->sk_drops);
+}
+
+static inline void sk_drops_inc(struct sock *sk)
+{
+ sk_drops_add(sk, 1);
+}
+
+static inline int sk_drops_read(const struct sock *sk)
+{
+ const struct numa_drop_counters *ndc = sk->sk_drop_counters;
+
+ if (ndc) {
+ DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops));
+ return numa_drop_read(ndc);
+ }
+ return atomic_read(&sk->sk_drops);
+}
+
+static inline void sk_drops_reset(struct sock *sk)
+{
+ struct numa_drop_counters *ndc = sk->sk_drop_counters;
+
+ if (ndc)
+ numa_drop_reset(ndc);
+ atomic_set(&sk->sk_drops, 0);
+}
+
static inline void
sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
{
SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
- atomic_read(&sk->sk_drops) : 0;
+ sk_drops_read(sk) : 0;
}
-static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
+static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb)
{
int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
- atomic_add(segs, &sk->sk_drops);
+ sk_drops_add(sk, segs);
}
static inline ktime_t sock_read_timestamp(struct sock *sk)
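sk_drops now acts as a fallback: a protocol expecting heavy drop accounting can hang per-NUMA-node counters off the socket while every existing call site keeps using the same helpers. Illustrative setup (the embedding struct is hypothetical):

	struct my_proto_sock {
		struct sock		  sk;
		struct numa_drop_counters drop_counters;	/* per-node counters */
	};

	static void my_proto_init_sock(struct sock *sk)
	{
		struct my_proto_sock *mp = (struct my_proto_sock *)sk;

		/* sk_drops_inc()/sk_drops_read() now hit the per-node counters
		 * instead of contending on the shared atomic sk_drops.
		 */
		sk->sk_drop_counters = &mp->drop_counters;
	}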
@@ -2624,6 +2813,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb);
+bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk);
+int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk,
+ struct timespec64 *ts);
+
static inline void
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
@@ -2658,12 +2851,13 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
{
#define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \
(1UL << SOCK_RCVTSTAMP) | \
- (1UL << SOCK_RCVMARK))
+ (1UL << SOCK_RCVMARK) | \
+ (1UL << SOCK_RCVPRIORITY) | \
+ (1UL << SOCK_TIMESTAMPING_ANY))
#define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \
SOF_TIMESTAMPING_RAW_HARDWARE)
- if (sk->sk_flags & FLAGS_RECV_CMSGS ||
- READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY)
+ if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS)
__sock_recv_cmsgs(msg, sk, skb);
else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
sock_write_timestamp(sk, skb->tstamp);
@@ -2698,8 +2892,6 @@ static inline void _sock_tx_timestamp(struct sock *sk,
*tskey = atomic_inc_return(&sk->sk_tskey) - 1;
}
}
- if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS)))
- *tx_flags |= SKBTX_WIFI_STATUS;
}
static inline void sock_tx_timestamp(struct sock *sk,
@@ -2737,9 +2929,14 @@ static inline bool sk_is_udp(const struct sock *sk)
sk->sk_protocol == IPPROTO_UDP;
}
+static inline bool sk_is_unix(const struct sock *sk)
+{
+ return sk->sk_family == AF_UNIX;
+}
+
static inline bool sk_is_stream_unix(const struct sock *sk)
{
- return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM;
+ return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM;
}
static inline bool sk_is_vsock(const struct sock *sk)
@@ -2747,6 +2944,13 @@ static inline bool sk_is_vsock(const struct sock *sk)
return sk->sk_family == AF_VSOCK;
}
+static inline bool sk_may_scm_recv(const struct sock *sk)
+{
+ return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) ||
+ sk->sk_family == AF_NETLINK ||
+ (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH);
+}
+
/**
* sk_eat_skb - Release a skb if it is no longer needed
* @sk: socket to eat this skb from
@@ -2786,26 +2990,10 @@ sk_is_refcounted(struct sock *sk)
return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
}
-/* Checks if this SKB belongs to an HW offloaded socket
- * and whether any SW fallbacks are required based on dev.
- * Check decrypted mark in case skb_orphan() cleared socket.
- */
-static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
- struct net_device *dev)
+static inline bool
+sk_requests_wifi_status(struct sock *sk)
{
-#ifdef CONFIG_SOCK_VALIDATE_XMIT
- struct sock *sk = skb->sk;
-
- if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) {
- skb = sk->sk_validate_xmit_skb(sk, dev, skb);
- } else if (unlikely(skb_is_decrypted(skb))) {
- pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
- kfree_skb(skb);
- skb = NULL;
- }
-#endif
-
- return skb;
+ return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS);
}
/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV
@@ -2844,8 +3032,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo);
*/
#define _SK_MEM_PACKETS 256
#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
-#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
-#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+#define SK_WMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+#define SK_RMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;
@@ -2912,7 +3100,13 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
int sock_set_timestamping(struct sock *sk, int optname,
struct so_timestamping timestamping);
-void sock_enable_timestamps(struct sock *sk);
+#if defined(CONFIG_CGROUP_BPF)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
+#else
+static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+{
+}
+#endif
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
@@ -2922,7 +3116,7 @@ void sock_set_reuseaddr(struct sock *sk);
void sock_set_reuseport(struct sock *sk);
void sock_set_sndtimeo(struct sock *sk, s64 secs);
-int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);
+int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len);
int sock_get_timeout(long timeo, void *optval, bool old_timeval);
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
@@ -2933,8 +3127,11 @@ int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
static inline bool sk_is_readable(struct sock *sk)
{
- if (sk->sk_prot->sock_is_readable)
- return sk->sk_prot->sock_is_readable(sk);
+ const struct proto *prot = READ_ONCE(sk->sk_prot);
+
+ if (prot->sock_is_readable)
+ return prot->sock_is_readable(sk);
+
return false;
}
#endif /* _SOCK_H */