summaryrefslogtreecommitdiff
path: root/include/net/sock.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/net/sock.h')
-rw-r--r--include/net/sock.h261
1 files changed, 126 insertions, 135 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index b5e00702acc1..8daf1b3b12c6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -303,6 +303,7 @@ struct sk_filter;
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags
+ * @sk_bpf_cb_flags: used in bpf_setsockopt()
* @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
* Sockets that can be used under memory reclaim should
* set this to false.
@@ -337,6 +338,7 @@ struct sk_filter;
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
* @ns_tracker: tracker for netns reference
+ * @sk_user_frags: xarray of pages the user is holding a reference on.
*/
struct sock {
/*
@@ -524,6 +526,8 @@ struct sock {
u8 sk_txtime_deadline_mode : 1,
sk_txtime_report_errors : 1,
sk_txtime_unused : 6;
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
+ u8 sk_bpf_cb_flags;
void *sk_user_data;
#ifdef CONFIG_SECURITY
@@ -542,6 +546,12 @@ struct sock {
#endif
struct rcu_head sk_rcu;
netns_tracker ns_tracker;
+ struct xarray sk_user_frags;
+};
+
+struct sock_bh_locked {
+ struct sock *sock;
+ local_lock_t bh_lock;
};
enum sk_pacing {
@@ -887,6 +897,8 @@ static inline void sk_add_bind_node(struct sock *sk,
hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind_node)
+#define sk_for_each_bound_safe(__sk, tmp, list) \
+ hlist_for_each_entry_safe(__sk, tmp, list, sk_bind_node)
/**
* sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
@@ -944,9 +956,17 @@ enum sock_flags {
SOCK_XDP, /* XDP is attached */
SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */
+ SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */
+ SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */
};
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+/*
+ * The highest bit of sk_tsflags is reserved for kernel-internal
+ * SOCKCM_FLAG_TS_OPT_ID. There is a check in core/sock.c to control that
+ * SOF_TIMESTAMPING* values do not reach this reserved area
+ */
+#define SOCKCM_FLAG_TS_OPT_ID BIT(31)
static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk)
{
@@ -1194,6 +1214,13 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size)
size - offsetof(struct sock, sk_node.pprev));
}
+struct proto_accept_arg {
+ int flags;
+ int err;
+ int is_empty;
+ bool kern;
+};
+
/* Networking protocol blocks we attach to sockets.
* socket layer -> transport layer interface
*/
@@ -1208,8 +1235,8 @@ struct proto {
int addr_len);
int (*disconnect)(struct sock *sk, int flags);
- struct sock * (*accept)(struct sock *sk, int flags, int *err,
- bool kern);
+ struct sock * (*accept)(struct sock *sk,
+ struct proto_accept_arg *arg);
int (*ioctl)(struct sock *sk, int cmd,
int *karg);
@@ -1261,10 +1288,6 @@ struct proto {
unsigned int inuse_idx;
#endif
-#if IS_ENABLED(CONFIG_MPTCP)
- int (*forward_alloc_get)(const struct sock *sk);
-#endif
-
bool (*stream_memory_free)(const struct sock *sk, int wake);
bool (*sock_is_readable)(struct sock *sk);
/* Memory pressure */
@@ -1325,15 +1348,6 @@ int sock_load_diag_module(int family, int protocol);
INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));
-static inline int sk_forward_alloc_get(const struct sock *sk)
-{
-#if IS_ENABLED(CONFIG_MPTCP)
- if (sk->sk_prot->forward_alloc_get)
- return sk->sk_prot->forward_alloc_get(sk);
-#endif
- return READ_ONCE(sk->sk_forward_alloc);
-}
-
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
@@ -1371,73 +1385,6 @@ static inline int sk_under_cgroup_hierarchy(struct sock *sk,
#endif
}
-static inline bool sk_has_memory_pressure(const struct sock *sk)
-{
- return sk->sk_prot->memory_pressure != NULL;
-}
-
-static inline bool sk_under_global_memory_pressure(const struct sock *sk)
-{
- return sk->sk_prot->memory_pressure &&
- !!READ_ONCE(*sk->sk_prot->memory_pressure);
-}
-
-static inline bool sk_under_memory_pressure(const struct sock *sk)
-{
- if (!sk->sk_prot->memory_pressure)
- return false;
-
- if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
- mem_cgroup_under_socket_pressure(sk->sk_memcg))
- return true;
-
- return !!READ_ONCE(*sk->sk_prot->memory_pressure);
-}
-
-static inline long
-proto_memory_allocated(const struct proto *prot)
-{
- return max(0L, atomic_long_read(prot->memory_allocated));
-}
-
-static inline long
-sk_memory_allocated(const struct sock *sk)
-{
- return proto_memory_allocated(sk->sk_prot);
-}
-
-/* 1 MB per cpu, in page units */
-#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))
-extern int sysctl_mem_pcpu_rsv;
-
-static inline void
-sk_memory_allocated_add(struct sock *sk, int amt)
-{
- int local_reserve;
-
- preempt_disable();
- local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
- if (local_reserve >= READ_ONCE(sysctl_mem_pcpu_rsv)) {
- __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
- atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
- }
- preempt_enable();
-}
-
-static inline void
-sk_memory_allocated_sub(struct sock *sk, int amt)
-{
- int local_reserve;
-
- preempt_disable();
- local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
- if (local_reserve <= -READ_ONCE(sysctl_mem_pcpu_rsv)) {
- __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
- atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
- }
- preempt_enable();
-}
-
#define SK_ALLOC_PERCPU_COUNTER_BATCH 16
static inline void sk_sockets_allocated_dec(struct sock *sk)
@@ -1464,15 +1411,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot)
return percpu_counter_sum_positive(prot->sockets_allocated);
}
-static inline bool
-proto_memory_pressure(struct proto *prot)
-{
- if (!prot->memory_pressure)
- return false;
- return !!READ_ONCE(*prot->memory_pressure);
-}
-
-
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR 64 /* should be enough for the first time */
struct prot_inuse {
@@ -1581,7 +1519,7 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size)
}
static inline bool
-sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
+__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc)
{
int delta;
@@ -1589,7 +1527,13 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
return true;
delta = size - sk->sk_forward_alloc;
return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) ||
- skb_pfmemalloc(skb);
+ pfmemalloc;
+}
+
+static inline bool
+sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
+{
+ return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb));
}
static inline int sk_unused_reserved_mem(const struct sock *sk)
@@ -1688,7 +1632,7 @@ bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock);
* lock_sock_fast - fast version of lock_sock
* @sk: socket
*
- * This version should be used for very small section, where process wont block
+ * This version should be used for very small section, where process won't block
* return false if fast path is taken:
*
* sk_lock.slock locked, owned = 0, BH disabled
@@ -1759,6 +1703,13 @@ static inline void sock_owned_by_me(const struct sock *sk)
#endif
}
+static inline void sock_not_owned_by_me(const struct sock *sk)
+{
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks);
+#endif
+}
+
static inline bool sock_owned_by_user(const struct sock *sk)
{
sock_owned_by_me(sk);
@@ -1791,6 +1742,7 @@ static inline bool sock_allow_reclassification(const struct sock *csk)
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot, int kern);
void sk_free(struct sock *sk);
+void sk_net_refcnt_upgrade(struct sock *sk);
void sk_destruct(struct sock *sk);
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
void sk_free_unlock_clone(struct sock *sk);
@@ -1807,6 +1759,15 @@ void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
void sock_pfree(struct sk_buff *skb);
+
+static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk)
+{
+ skb_orphan(skb);
+ if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ }
+}
#else
#define sock_edemux sock_efree
#endif
@@ -1836,6 +1797,8 @@ static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk,
}
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
+void *sock_kmemdup(struct sock *sk, const void *src,
+ int size, gfp_t priority);
void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
void sk_send_sigurg(struct sock *sk);
@@ -1851,13 +1814,17 @@ struct sockcm_cookie {
u64 transmit_time;
u32 mark;
u32 tsflags;
+ u32 ts_opt_id;
+ u32 priority;
};
static inline void sockcm_init(struct sockcm_cookie *sockc,
const struct sock *sk)
{
*sockc = (struct sockcm_cookie) {
- .tsflags = READ_ONCE(sk->sk_tsflags)
+ .mark = READ_ONCE(sk->sk_mark),
+ .tsflags = READ_ONCE(sk->sk_tsflags),
+ .priority = READ_ONCE(sk->sk_priority),
};
}
@@ -1873,7 +1840,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
int sock_no_bind(struct socket *, struct sockaddr *, int);
int sock_no_connect(struct socket *, struct sockaddr *, int, int);
int sock_no_socketpair(struct socket *, struct socket *);
-int sock_no_accept(struct socket *, struct socket *, int, bool);
+int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *);
int sock_no_getname(struct socket *, struct sockaddr *, int);
int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
int sock_no_listen(struct socket *, int);
@@ -2125,17 +2092,10 @@ sk_dst_get(const struct sock *sk)
static inline void __dst_negative_advice(struct sock *sk)
{
- struct dst_entry *ndst, *dst = __sk_dst_get(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
- if (dst && dst->ops->negative_advice) {
- ndst = dst->ops->negative_advice(dst);
-
- if (ndst != dst) {
- rcu_assign_pointer(sk->sk_dst_cache, ndst);
- sk_tx_queue_clear(sk);
- WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
- }
- }
+ if (dst && dst->ops->negative_advice)
+ dst->ops->negative_advice(sk, dst);
}
static inline void dst_negative_advice(struct sock *sk)
@@ -2164,7 +2124,7 @@ sk_dst_set(struct sock *sk, struct dst_entry *dst)
sk_tx_queue_clear(sk);
WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
- old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
+ old_dst = unrcu_pointer(xchg(&sk->sk_dst_cache, RCU_INITIALIZER(dst)));
dst_release(old_dst);
}
@@ -2335,7 +2295,7 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq)
}
/**
- * sock_poll_wait - place memory barrier behind the poll_wait call.
+ * sock_poll_wait - wrapper for the poll_wait call.
* @filp: file
* @sock: socket to wait on
* @p: poll_table
@@ -2345,15 +2305,12 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq)
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
poll_table *p)
{
- if (!poll_does_not_wait(p)) {
- poll_wait(filp, &sock->wq.wait, p);
- /* We need to be sure we are in sync with the
- * socket flags modification.
- *
- * This memory barrier is paired in the wq_has_sleeper.
- */
- smp_mb();
- }
+ /* Provides a barrier we need to be sure we are in sync
+ * with the socket flags modification.
+ *
+ * This memory barrier is paired in the wq_has_sleeper.
+ */
+ poll_wait(filp, &sock->wq.wait, p);
}
static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
@@ -2506,6 +2463,12 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band)
}
}
+static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band)
+{
+ if (unlikely(sock_flag(sk, SOCK_FASYNC)))
+ sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
+}
+
/* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
* need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak.
* Note: for send buffers, TCP works better if we can build two skbs at
@@ -2604,7 +2567,7 @@ struct sock_skb_cb {
/* Store sock_skb_cb at the end of skb->cb[] so protocol families
* using skb->cb[] would keep using it directly and utilize its
- * alignement guarantee.
+ * alignment guarantee.
*/
#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
sizeof(struct sock_skb_cb)))
@@ -2696,12 +2659,13 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
{
#define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \
(1UL << SOCK_RCVTSTAMP) | \
- (1UL << SOCK_RCVMARK))
+ (1UL << SOCK_RCVMARK) | \
+ (1UL << SOCK_RCVPRIORITY) | \
+ (1UL << SOCK_TIMESTAMPING_ANY))
#define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \
SOF_TIMESTAMPING_RAW_HARDWARE)
- if (sk->sk_flags & FLAGS_RECV_CMSGS ||
- READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY)
+ if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS)
__sock_recv_cmsgs(msg, sk, skb);
else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
sock_write_timestamp(sk, skb->tstamp);
@@ -2709,39 +2673,48 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
sock_write_timestamp(sk, 0);
}
-void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags);
+void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags);
/**
* _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
* @sk: socket sending this packet
- * @tsflags: timestamping flags to use
+ * @sockc: pointer to socket cmsg cookie to get timestamping info
* @tx_flags: completed with instructions for time stamping
* @tskey: filled in with next sk_tskey (not for TCP, which uses seqno)
*
* Note: callers should take care of initial ``*tx_flags`` value (usually 0)
*/
-static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags,
+static inline void _sock_tx_timestamp(struct sock *sk,
+ const struct sockcm_cookie *sockc,
__u8 *tx_flags, __u32 *tskey)
{
+ __u32 tsflags = sockc->tsflags;
+
if (unlikely(tsflags)) {
__sock_tx_timestamp(tsflags, tx_flags);
if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey &&
- tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
- *tskey = atomic_inc_return(&sk->sk_tskey) - 1;
+ tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) {
+ if (tsflags & SOCKCM_FLAG_TS_OPT_ID)
+ *tskey = sockc->ts_opt_id;
+ else
+ *tskey = atomic_inc_return(&sk->sk_tskey) - 1;
+ }
}
if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS)))
*tx_flags |= SKBTX_WIFI_STATUS;
}
-static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags,
+static inline void sock_tx_timestamp(struct sock *sk,
+ const struct sockcm_cookie *sockc,
__u8 *tx_flags)
{
- _sock_tx_timestamp(sk, tsflags, tx_flags, NULL);
+ _sock_tx_timestamp(sk, sockc, tx_flags, NULL);
}
-static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
+static inline void skb_setup_tx_timestamp(struct sk_buff *skb,
+ const struct sockcm_cookie *sockc)
{
- _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags,
+ _sock_tx_timestamp(skb->sk, sockc, &skb_shinfo(skb)->tx_flags,
&skb_shinfo(skb)->tskey);
}
@@ -2771,6 +2744,11 @@ static inline bool sk_is_stream_unix(const struct sock *sk)
return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM;
}
+static inline bool sk_is_vsock(const struct sock *sk)
+{
+ return sk->sk_family == AF_VSOCK;
+}
+
/**
* sk_eat_skb - Release a skb if it is no longer needed
* @sk: socket to eat this skb from
@@ -2822,12 +2800,10 @@ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) {
skb = sk->sk_validate_xmit_skb(sk, dev, skb);
-#ifdef CONFIG_TLS_DEVICE
- } else if (unlikely(skb->decrypted)) {
+ } else if (unlikely(skb_is_decrypted(skb))) {
pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
kfree_skb(skb);
skb = NULL;
-#endif
}
#endif
@@ -2842,6 +2818,16 @@ static inline bool sk_listener(const struct sock *sk)
return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
}
+/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT
+ * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE)
+ * TCP RST and ACK can be attached to TIME_WAIT.
+ */
+static inline bool sk_listener_or_tw(const struct sock *sk)
+{
+ return (1 << READ_ONCE(sk->sk_state)) &
+ (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT);
+}
+
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
int type);
@@ -2866,8 +2852,6 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo);
extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;
-extern int sysctl_tstamp_allow_data;
-
extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;
@@ -2931,6 +2915,13 @@ int sock_set_timestamping(struct sock *sk, int optname,
struct so_timestamping timestamping);
void sock_enable_timestamps(struct sock *sk);
+#if defined(CONFIG_CGROUP_BPF)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
+#else
+static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+{
+}
+#endif
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);