Diffstat (limited to 'net/core')
-rw-r--r-- net/core/Makefile 1
-rw-r--r-- net/core/bpf_sk_storage.c 16
-rw-r--r-- net/core/datagram.c 60
-rw-r--r-- net/core/dev.c 601
-rw-r--r-- net/core/dev.h 14
-rw-r--r-- net/core/dev_ioctl.c 36
-rw-r--r-- net/core/devmem.c 41
-rw-r--r-- net/core/devmem.h 3
-rw-r--r-- net/core/dst.c 2
-rw-r--r-- net/core/filter.c 352
-rw-r--r-- net/core/gen_estimator.c 2
-rw-r--r-- net/core/gro.c 12
-rw-r--r-- net/core/gro_cells.c 9
-rw-r--r-- net/core/hotdata.c 2
-rw-r--r-- net/core/link_watch.c 4
-rw-r--r-- net/core/lwt_bpf.c 4
-rw-r--r-- net/core/neighbour.c 131
-rw-r--r-- net/core/net-procfs.c 3
-rw-r--r-- net/core/net-sysfs.c 10
-rw-r--r-- net/core/net_namespace.c 72
-rw-r--r-- net/core/netdev-genl-gen.c 3
-rw-r--r-- net/core/netdev-genl-gen.h 1
-rw-r--r-- net/core/netdev-genl.c 122
-rw-r--r-- net/core/netdev_queues.c 27
-rw-r--r-- net/core/netdev_rx_queue.c 9
-rw-r--r-- net/core/netmem_priv.h 16
-rw-r--r-- net/core/netpoll.c 19
-rw-r--r-- net/core/page_pool.c 127
-rw-r--r-- net/core/pktgen.c 7
-rw-r--r-- net/core/request_sock.c 4
-rw-r--r-- net/core/rtnetlink.c 30
-rw-r--r-- net/core/scm.c 18
-rw-r--r-- net/core/selftests.c 48
-rw-r--r-- net/core/skbuff.c 153
-rw-r--r-- net/core/skmsg.c 2
-rw-r--r-- net/core/sock.c 234
-rw-r--r-- net/core/sock_diag.c 2
-rw-r--r-- net/core/sysctl_net_core.c 16
-rw-r--r-- net/core/xdp.c 21
39 files changed, 1595 insertions, 639 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index b2a76ce33932..9ef2099c5426 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
obj-y += hotdata.o
obj-y += netdev_rx_queue.o
+obj-y += netdev_queues.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 2e538399757f..850dd736ccd1 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -50,16 +50,14 @@ void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_local_storage *sk_storage;
- migrate_disable();
- rcu_read_lock();
+ rcu_read_lock_dont_migrate();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage)
goto out;
bpf_local_storage_destroy(sk_storage);
out:
- rcu_read_unlock();
- migrate_enable();
+ rcu_read_unlock_migrate();
}
static void bpf_sk_storage_map_free(struct bpf_map *map)
@@ -138,7 +136,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
{
struct bpf_local_storage_elem *copy_selem;
- copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC);
+ copy_selem = bpf_selem_alloc(smap, newsk, NULL, false, GFP_ATOMIC);
if (!copy_selem)
return NULL;
@@ -161,8 +159,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
- migrate_disable();
- rcu_read_lock();
+ rcu_read_lock_dont_migrate();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage || hlist_empty(&sk_storage->list))
@@ -199,7 +196,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
} else {
ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
if (ret) {
- bpf_selem_free(copy_selem, smap, true);
+ bpf_selem_free(copy_selem, true);
atomic_sub(smap->elem_size,
&newsk->sk_omem_alloc);
bpf_map_put(map);
@@ -213,8 +210,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
}
out:
- rcu_read_unlock();
- migrate_enable();
+ rcu_read_unlock_migrate();
/* In case of an error, don't free anything explicitly here, the
* caller is responsible to call bpf_sk_storage_free.
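A minimal sketch of the pairing the new helpers are assumed to provide, based only on the open-coded calls they replace above (not the kernel's actual definitions):

static inline void rcu_read_lock_dont_migrate(void)
{
	migrate_disable();	/* as in the removed open-coded pair */
	rcu_read_lock();
}

static inline void rcu_read_unlock_migrate(void)
{
	rcu_read_unlock();
	migrate_enable();
}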
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 94cc4705e91d..c285c6465923 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -345,7 +345,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
spin_unlock_bh(&sk_queue->lock);
}
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
return err;
}
EXPORT_SYMBOL(__sk_queue_drop_skb);
@@ -618,6 +618,20 @@ fault:
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
+int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset,
+ struct iov_iter *from, int len)
+{
+ struct iov_iter_state state;
+ int ret;
+
+ iov_iter_save_state(from, &state);
+ ret = skb_copy_datagram_from_iter(skb, offset, from, len);
+ if (ret)
+ iov_iter_restore(from, &state);
+ return ret;
+}
+EXPORT_SYMBOL(skb_copy_datagram_from_iter_full);
+
int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
struct iov_iter *from, size_t length)
{
@@ -906,21 +920,22 @@ fault:
EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
/**
- * datagram_poll - generic datagram poll
+ * datagram_poll_queue - same as datagram_poll, but on a specific receive
+ * queue
* @file: file struct
* @sock: socket
* @wait: poll table
+ * @rcv_queue: receive queue to poll
*
- * Datagram poll: Again totally generic. This also handles
- * sequenced packet sockets providing the socket receive queue
- * is only ever holding data ready to receive.
+ * Performs polling on the given receive queue, handling shutdown, error,
+ * and connection state. This is useful for protocols that deliver
+ * userspace-bound packets through a custom queue instead of
+ * sk->sk_receive_queue.
*
- * Note: when you *don't* use this routine for this protocol,
- * and you use a different write policy from sock_writeable()
- * then please supply your own write_space callback.
+ * Return: poll bitmask indicating the socket's current state
*/
-__poll_t datagram_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+__poll_t datagram_poll_queue(struct file *file, struct socket *sock,
+ poll_table *wait, struct sk_buff_head *rcv_queue)
{
struct sock *sk = sock->sk;
__poll_t mask;
@@ -942,7 +957,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
mask |= EPOLLHUP;
/* readable? */
- if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(rcv_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* Connection-based need to check for termination and startup */
@@ -964,4 +979,27 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
return mask;
}
+EXPORT_SYMBOL(datagram_poll_queue);
+
+/**
+ * datagram_poll - generic datagram poll
+ * @file: file struct
+ * @sock: socket
+ * @wait: poll table
+ *
+ * Datagram poll: Again totally generic. This also handles
+ * sequenced packet sockets providing the socket receive queue
+ * is only ever holding data ready to receive.
+ *
+ * Note: when you *don't* use this routine for this protocol,
+ * and you use a different write policy from sock_writeable()
+ * then please supply your own write_space callback.
+ *
+ * Return: poll bitmask indicating the socket's current state
+ */
+__poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ return datagram_poll_queue(file, sock, wait,
+ &sock->sk->sk_receive_queue);
+}
EXPORT_SYMBOL(datagram_poll);
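A hypothetical caller-side sketch (not part of the patch) for a protocol that delivers packets through its own queue; my_proto_sock and rx_queue are invented names, and the cast assumes struct sock is embedded first as usual:

static __poll_t my_proto_poll(struct file *file, struct socket *sock,
			      poll_table *wait)
{
	/* Queue the protocol fills instead of sk->sk_receive_queue. */
	struct my_proto_sock *msk = (struct my_proto_sock *)sock->sk;

	return datagram_poll_queue(file, sock, wait, &msk->rx_queue);
}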
diff --git a/net/core/dev.c b/net/core/dev.c
index 68dc47d7e700..9094c0fb8c68 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1163,6 +1163,7 @@ void netdev_copy_name(struct net_device *dev, char *name)
strscpy(name, dev->name, IFNAMSIZ);
} while (read_seqretry(&netdev_rename_lock, seq));
}
+EXPORT_IPV6_MOD_GPL(netdev_copy_name);
/**
* netdev_get_name - get a netdevice name, knowing its ifindex.
@@ -2462,9 +2463,9 @@ int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
}
-static inline int deliver_skb(struct sk_buff *skb,
- struct packet_type *pt_prev,
- struct net_device *orig_dev)
+static int deliver_skb(struct sk_buff *skb,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
{
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
return -ENOMEM;
@@ -2483,7 +2484,7 @@ static inline void deliver_ptype_list_skb(struct sk_buff *skb,
list_for_each_entry_rcu(ptype, ptype_list, list) {
if (ptype->type != type)
continue;
- if (pt_prev)
+ if (unlikely(pt_prev))
deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
@@ -2544,7 +2545,7 @@ again:
if (skb_loop_sk(ptype, skb))
continue;
- if (pt_prev) {
+ if (unlikely(pt_prev)) {
deliver_skb(skb2, pt_prev, skb->dev);
pt_prev = ptype;
continue;
@@ -3373,6 +3374,13 @@ static void __netif_reschedule(struct Qdisc *q)
void __netif_schedule(struct Qdisc *q)
{
+ /* If q->defer_list is not empty, at least one thread is
+ * in __dev_xmit_skb() before llist_del_all(&q->defer_list).
+ * This thread will attempt to run the queue.
+ */
+ if (!llist_empty(&q->defer_list))
+ return;
+
if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
__netif_reschedule(q);
}
@@ -3768,8 +3776,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
features &= ~dev->gso_partial_features;
- /* Make sure to clear the IPv4 ID mangling feature if the
- * IPv4 header has the potential to be fragmented.
+ /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header
+ * has the potential to be fragmented so that TSO does not generate
+ * segments with the same ID. For encapsulated packets, the ID mangling
+ * feature is guaranteed not to use the same ID for the outer IPv4
+ * headers of the generated segments if the headers have the potential
+ * to be fragmented, so there is no need to clear the IPv4 ID mangling
+ * feature (see the section about NETIF_F_TSO_MANGLEID in
+ * segmentation-offloads.rst).
*/
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
struct iphdr *iph = skb->encapsulation ?
@@ -3779,6 +3793,18 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
features &= ~NETIF_F_TSO_MANGLEID;
}
+ /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers,
+ * so neither does TSO that depends on it.
+ */
+ if (features & NETIF_F_IPV6_CSUM &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 ||
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
+ vlan_get_protocol(skb) == htons(ETH_P_IPV6))) &&
+ skb_transport_header_was_set(skb) &&
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
+ !ipv6_has_hopopt_jumbo(skb))
+ features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4);
+
return features;
}
@@ -3895,6 +3921,38 @@ sw_checksum:
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
+/* Checks if this SKB belongs to an HW offloaded socket
+ * and whether any SW fallbacks are required based on dev.
+ * Check decrypted mark in case skb_orphan() cleared socket.
+ */
+static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+ struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev,
+ struct sk_buff *skb);
+ struct sock *sk = skb->sk;
+
+ sk_validate = NULL;
+ if (sk) {
+ if (sk_fullsock(sk))
+ sk_validate = sk->sk_validate_xmit_skb;
+ else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT)
+ sk_validate = inet_twsk(sk)->tw_validate_xmit_skb;
+ }
+
+ if (sk_validate) {
+ skb = sk_validate(sk, dev, skb);
+ } else if (unlikely(skb_is_decrypted(skb))) {
+ pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
+ kfree_skb(skb);
+ skb = NULL;
+ }
+#endif
+
+ return skb;
+}
+
static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
struct net_device *dev)
{
@@ -4011,17 +4069,23 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
-static void qdisc_pkt_len_init(struct sk_buff *skb)
+static void qdisc_pkt_len_segs_init(struct sk_buff *skb)
{
- const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ u16 gso_segs;
qdisc_skb_cb(skb)->pkt_len = skb->len;
+ if (!shinfo->gso_size) {
+ qdisc_skb_cb(skb)->pkt_segs = 1;
+ return;
+ }
+
+ qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs;
/* To get more precise estimation of bytes sent on wire,
* we add to pkt_len the headers size of all segments
*/
- if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
- u16 gso_segs = shinfo->gso_segs;
+ if (skb_transport_header_was_set(skb)) {
unsigned int hdr_len;
/* mac layer + network layer */
@@ -4054,6 +4118,8 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
if (payload <= 0)
return;
gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
+ shinfo->gso_segs = gso_segs;
+ qdisc_skb_cb(skb)->pkt_segs = gso_segs;
}
qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
@@ -4075,9 +4141,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
{
+ struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
spinlock_t *root_lock = qdisc_lock(q);
- struct sk_buff *to_free = NULL;
- bool contended;
+ struct llist_node *ll_list, *first_n;
+ unsigned long defer_count = 0;
int rc;
qdisc_calculate_pkt_len(skb, q);
@@ -4093,9 +4160,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
if (unlikely(!nolock_qdisc_is_empty(q))) {
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
__qdisc_run(q);
- qdisc_run_end(q);
+ to_free2 = qdisc_run_end(q);
- goto no_lock_out;
+ goto free_skbs;
}
qdisc_bstats_cpu_update(q, skb);
@@ -4103,81 +4170,93 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
!nolock_qdisc_is_empty(q))
__qdisc_run(q);
- qdisc_run_end(q);
- return NET_XMIT_SUCCESS;
+ to_free2 = qdisc_run_end(q);
+ rc = NET_XMIT_SUCCESS;
+ goto free_skbs;
}
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
- qdisc_run(q);
-
-no_lock_out:
- if (unlikely(to_free))
- kfree_skb_list_reason(to_free,
- tcf_get_drop_reason(to_free));
- return rc;
+ to_free2 = qdisc_run(q);
+ goto free_skbs;
}
- if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
- return NET_XMIT_DROP;
- }
- /*
- * Heuristic to force contended enqueues to serialize on a
- * separate lock before trying to get qdisc main lock.
- * This permits qdisc->running owner to get the lock more
- * often and dequeue packets faster.
- * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
- * and then other tasks will only enqueue packets. The packets will be
- * sent after the qdisc owner is scheduled again. To prevent this
- * scenario the task always serialize on the lock.
+ /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
+ * In the try_cmpxchg() loop, we want to increment q->defer_count
+ * at most once to limit the number of skbs in defer_list.
+ * We perform the defer_count increment only if the list is not empty,
+ * because some arches have slow atomic_long_inc_return().
+ */
+ first_n = READ_ONCE(q->defer_list.first);
+ do {
+ if (first_n && !defer_count) {
+ defer_count = atomic_long_inc_return(&q->defer_count);
+ if (unlikely(defer_count > READ_ONCE(q->limit))) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+ return NET_XMIT_DROP;
+ }
+ }
+ skb->ll_node.next = first_n;
+ } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node));
+
+ /* If defer_list was not empty, we know the cpu which queued
+ * the first skb will process the whole list for us.
*/
- contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
- if (unlikely(contended))
- spin_lock(&q->busylock);
+ if (first_n)
+ return NET_XMIT_SUCCESS;
spin_lock(root_lock);
+
+ ll_list = llist_del_all(&q->defer_list);
+ /* There is a small race because we clear defer_count not atomically
+ * with the prior llist_del_all(). This means defer_list could grow
+ * over q->limit.
+ */
+ atomic_long_set(&q->defer_count, 0);
+
+ ll_list = llist_reverse_order(ll_list);
+
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
- __qdisc_drop(skb, &to_free);
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node)
+ __qdisc_drop(skb, &to_free);
rc = NET_XMIT_DROP;
- } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
- qdisc_run_begin(q)) {
+ goto unlock;
+ }
+ if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+ !llist_next(ll_list) && qdisc_run_begin(q)) {
/*
* This is a work-conserving queue; there are no old skbs
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
+ DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list,
+ struct sk_buff,
+ ll_node));
qdisc_bstats_update(q, skb);
-
- if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
- if (unlikely(contended)) {
- spin_unlock(&q->busylock);
- contended = false;
- }
+ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
__qdisc_run(q);
- }
-
- qdisc_run_end(q);
+ to_free2 = qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
- WRITE_ONCE(q->owner, smp_processor_id());
- rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
- WRITE_ONCE(q->owner, -1);
- if (qdisc_run_begin(q)) {
- if (unlikely(contended)) {
- spin_unlock(&q->busylock);
- contended = false;
- }
- __qdisc_run(q);
- qdisc_run_end(q);
+ int count = 0;
+
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+ prefetch(next);
+ prefetch(&next->priority);
+ skb_mark_not_on_list(skb);
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ count++;
}
+ to_free2 = qdisc_run(q);
+ if (count != 1)
+ rc = NET_XMIT_SUCCESS;
}
+unlock:
spin_unlock(root_lock);
- if (unlikely(to_free))
- kfree_skb_list_reason(to_free,
- tcf_get_drop_reason(to_free));
- if (unlikely(contended))
- spin_unlock(&q->busylock);
+
+free_skbs:
+ tcf_kfree_skb_list(to_free);
+ tcf_kfree_skb_list(to_free2);
return rc;
}
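A hedged userspace analog (plain C11 atomics, invented names) of the producer path above: push onto an intrusive lock-free list with a compare-exchange loop, bumping the shared depth counter at most once and only when the list was already non-empty:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct node {
	struct node *next;
};

struct deferred_queue {
	_Atomic(struct node *) first;
	atomic_long depth;
	long limit;
};

/* Returns false when the depth limit is hit; the caller drops the element,
 * mirroring the SKB_DROP_REASON_QDISC_DROP path above.
 */
static bool deferred_push(struct deferred_queue *q, struct node *n)
{
	struct node *first = atomic_load_explicit(&q->first, memory_order_relaxed);
	bool counted = false;

	do {
		if (first && !counted) {
			counted = true;
			if (atomic_fetch_add(&q->depth, 1) + 1 > q->limit)
				return false;
		}
		n->next = first;
	} while (!atomic_compare_exchange_weak(&q->first, &first, n));

	/* A NULL-to-non-NULL transition means this caller must drain the list,
	 * just as the thread seeing first_n == NULL takes the qdisc lock above.
	 */
	return true;
}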
@@ -4282,7 +4361,7 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
return ret;
tc_skb_cb(skb)->mru = 0;
- tc_skb_cb(skb)->post_ct = false;
+ qdisc_skb_cb(skb)->post_ct = false;
tcf_set_drop_reason(skb, *drop_reason);
mini_qdisc_bstats_cpu_update(miniq, skb);
@@ -4348,12 +4427,12 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
return skb;
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
- if (*pt_prev) {
+ if (unlikely(*pt_prev)) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
- qdisc_skb_cb(skb)->pkt_len = skb->len;
+ qdisc_pkt_len_segs_init(skb);
tcx_set_ingress(skb, true);
if (static_branch_unlikely(&tcx_needed_key)) {
@@ -4541,6 +4620,32 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
}
EXPORT_SYMBOL(dev_pick_tx_zero);
+int sk_tx_queue_get(const struct sock *sk)
+{
+ int resel, val;
+
+ if (!sk)
+ return -1;
+ /* Paired with WRITE_ONCE() in sk_tx_queue_clear()
+ * and sk_tx_queue_set().
+ */
+ val = READ_ONCE(sk->sk_tx_queue_mapping);
+
+ if (val == NO_QUEUE_MAPPING)
+ return -1;
+
+ if (!sk_fullsock(sk))
+ return val;
+
+ resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection);
+ if (resel && time_is_before_jiffies(
+ READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel))
+ return -1;
+
+ return val;
+}
+EXPORT_SYMBOL(sk_tx_queue_get);
+
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev)
{
@@ -4556,8 +4661,7 @@ u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
if (new_index < 0)
new_index = skb_tx_hash(dev, sb_dev, skb);
- if (queue_index != new_index && sk &&
- sk_fullsock(sk) &&
+ if (sk && sk_fullsock(sk) &&
rcu_access_pointer(sk->sk_dst_cache))
sk_tx_queue_set(sk, new_index);
@@ -4639,7 +4743,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_update_prio(skb);
- qdisc_pkt_len_init(skb);
+ qdisc_pkt_len_segs_init(skb);
tcx_set_ingress(skb, false);
#ifdef CONFIG_NET_EGRESS
if (static_branch_unlikely(&egress_needed_key)) {
@@ -4837,9 +4941,40 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
return hash_32(hash, flow_table->log);
}
+#ifdef CONFIG_RFS_ACCEL
+/**
+ * rps_flow_is_active - check whether the flow was recently active.
+ * @rflow: flow to check for recent activity.
+ * @flow_table: per-queue flowtable that @rflow belongs to.
+ * @cpu: CPU saved in @rflow.
+ *
+ * If the CPU has processed many packets since the flow's last activity
+ * (beyond 10 times the table size), the flow is considered stale.
+ *
+ * Return: true if flow was recently active.
+ */
+static bool rps_flow_is_active(struct rps_dev_flow *rflow,
+ struct rps_dev_flow_table *flow_table,
+ unsigned int cpu)
+{
+ unsigned int flow_last_active;
+ unsigned int sd_input_head;
+
+ if (cpu >= nr_cpu_ids)
+ return false;
+
+ sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head);
+ flow_last_active = READ_ONCE(rflow->last_qtail);
+
+ return (int)(sd_input_head - flow_last_active) <
+ (int)(10 << flow_table->log);
+}
+#endif
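Worked example (not part of the patch): with flow_table->log = 8 (a 256-entry table) the threshold 10 << log is 2560, so a flow counts as stale once the target CPU's input_queue_head has advanced more than 2560 packets past the flow's last_qtail.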
+
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
- struct rps_dev_flow *rflow, u16 next_cpu)
+ struct rps_dev_flow *rflow, u16 next_cpu, u32 hash,
+ u32 flow_id)
{
if (next_cpu < nr_cpu_ids) {
u32 head;
@@ -4847,8 +4982,9 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *old_rflow;
+ struct rps_dev_flow *tmp_rflow;
+ unsigned int tmp_cpu;
u16 rxq_index;
- u32 flow_id;
int rc;
/* Should we steer this flow to a different hardware queue? */
@@ -4863,14 +4999,29 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = rfs_slot(skb_get_hash(skb), flow_table);
+
+ tmp_rflow = &flow_table->flows[flow_id];
+ tmp_cpu = READ_ONCE(tmp_rflow->cpu);
+
+ if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) {
+ if (rps_flow_is_active(tmp_rflow, flow_table,
+ tmp_cpu)) {
+ if (hash != READ_ONCE(tmp_rflow->hash) ||
+ next_cpu == tmp_cpu)
+ goto out;
+ }
+ }
+
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
goto out;
+
old_rflow = rflow;
- rflow = &flow_table->flows[flow_id];
+ rflow = tmp_rflow;
WRITE_ONCE(rflow->filter, rc);
+ WRITE_ONCE(rflow->hash, hash);
+
if (old_rflow->filter == rc)
WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
out:
@@ -4896,6 +5047,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow_table *flow_table;
struct rps_map *map;
int cpu = -1;
+ u32 flow_id;
u32 tcpu;
u32 hash;
@@ -4942,7 +5094,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
- rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
+ flow_id = rfs_slot(hash, flow_table);
+ rflow = &flow_table->flows[flow_id];
tcpu = rflow->cpu;
/*
@@ -4961,7 +5114,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
- rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash,
+ flow_id);
}
if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
@@ -5005,17 +5159,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
- unsigned int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id < (1UL << flow_table->log)) {
+ unsigned int cpu;
+
rflow = &flow_table->flows[flow_id];
cpu = READ_ONCE(rflow->cpu);
- if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
- ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
- READ_ONCE(rflow->last_qtail)) <
- (int)(10 << flow_table->log)))
+ if (READ_ONCE(rflow->filter) == filter_id &&
+ rps_flow_is_active(rflow, flow_table, cpu))
expire = false;
}
rcu_read_unlock();
@@ -5081,8 +5234,9 @@ static void napi_schedule_rps(struct softnet_data *sd)
__napi_schedule_irqoff(&mysd->backlog);
}
-void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
+void kick_defer_list_purge(unsigned int cpu)
{
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
unsigned long flags;
if (use_backlog_threads()) {
@@ -5102,14 +5256,15 @@ void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
-static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen,
+ int max_backlog)
{
#ifdef CONFIG_NET_FLOW_LIMIT
- struct sd_flow_limit *fl;
- struct softnet_data *sd;
unsigned int old_flow, new_flow;
+ const struct softnet_data *sd;
+ struct sd_flow_limit *fl;
- if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
+ if (likely(qlen < (max_backlog >> 1)))
return false;
sd = this_cpu_ptr(&softnet_data);
@@ -5154,19 +5309,19 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
u32 tail;
reason = SKB_DROP_REASON_DEV_READY;
- if (!netif_running(skb->dev))
+ if (unlikely(!netif_running(skb->dev)))
goto bad_dev;
- reason = SKB_DROP_REASON_CPU_BACKLOG;
sd = &per_cpu(softnet_data, cpu);
qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
max_backlog = READ_ONCE(net_hotdata.max_backlog);
- if (unlikely(qlen > max_backlog))
+ if (unlikely(qlen > max_backlog) ||
+ skb_flow_limit(skb, qlen, max_backlog))
goto cpu_backlog_drop;
backlog_lock_irq_save(sd, &flags);
qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
+ if (likely(qlen <= max_backlog)) {
if (!qlen) {
/* Schedule NAPI for backlog device. We can use
* non atomic operation as we own the queue lock.
@@ -5187,7 +5342,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
backlog_unlock_irq_restore(sd, &flags);
cpu_backlog_drop:
- atomic_inc(&sd->dropped);
+ reason = SKB_DROP_REASON_CPU_BACKLOG;
+ numa_drop_add(&sd->drop_counters, 1);
bad_dev:
dev_core_stats_rx_dropped_inc(skb->dev);
kfree_skb_reason(skb, reason);
@@ -5593,8 +5749,9 @@ static __latent_entropy void net_tx_action(void)
rcu_read_lock();
while (head) {
- struct Qdisc *q = head;
spinlock_t *root_lock = NULL;
+ struct sk_buff *to_free;
+ struct Qdisc *q = head;
head = head->next_sched;
@@ -5621,9 +5778,10 @@ static __latent_entropy void net_tx_action(void)
}
clear_bit(__QDISC_STATE_SCHED, &q->state);
- qdisc_run(q);
+ to_free = qdisc_run(q);
if (root_lock)
spin_unlock(root_lock);
+ tcf_kfree_skb_list(to_free);
}
rcu_read_unlock();
@@ -5733,7 +5891,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
if (nf_hook_ingress_active(skb)) {
int ingress_retval;
- if (*pt_prev) {
+ if (unlikely(*pt_prev)) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
@@ -5810,13 +5968,13 @@ another_round:
list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
list) {
- if (pt_prev)
+ if (unlikely(pt_prev))
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
- if (pt_prev)
+ if (unlikely(pt_prev))
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
@@ -5847,7 +6005,7 @@ skip_classify:
}
if (skb_vlan_tag_present(skb)) {
- if (pt_prev) {
+ if (unlikely(pt_prev)) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
@@ -5859,7 +6017,7 @@ skip_classify:
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
- if (pt_prev) {
+ if (unlikely(pt_prev)) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
@@ -6616,24 +6774,25 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
}
EXPORT_SYMBOL(napi_complete_done);
-static void skb_defer_free_flush(struct softnet_data *sd)
+static void skb_defer_free_flush(void)
{
+ struct llist_node *free_list;
struct sk_buff *skb, *next;
+ struct skb_defer_node *sdn;
+ int node;
- /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
- if (!READ_ONCE(sd->defer_list))
- return;
+ for_each_node(node) {
+ sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node;
- spin_lock(&sd->defer_lock);
- skb = sd->defer_list;
- sd->defer_list = NULL;
- sd->defer_count = 0;
- spin_unlock(&sd->defer_lock);
+ if (llist_empty(&sdn->defer_list))
+ continue;
+ atomic_long_set(&sdn->defer_count, 0);
+ free_list = llist_del_all(&sdn->defer_list);
- while (skb != NULL) {
- next = skb->next;
- napi_consume_skb(skb, 1);
- skb = next;
+ llist_for_each_entry_safe(skb, next, free_list, ll_node) {
+ prefetch(next);
+ napi_consume_skb(skb, 1);
+ }
}
}
@@ -6761,7 +6920,7 @@ count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
LINUX_MIB_BUSYPOLLRXPACKETS, work);
- skb_defer_free_flush(this_cpu_ptr(&softnet_data));
+ skb_defer_free_flush();
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
@@ -6939,7 +7098,8 @@ static void napi_stop_kthread(struct napi_struct *napi)
*/
if ((val & NAPIF_STATE_SCHED_THREADED) ||
!(val & NAPIF_STATE_SCHED)) {
- new = val & (~NAPIF_STATE_THREADED);
+ new = val & (~(NAPIF_STATE_THREADED |
+ NAPIF_STATE_THREADED_BUSY_POLL));
} else {
msleep(20);
continue;
@@ -6953,7 +7113,7 @@ static void napi_stop_kthread(struct napi_struct *napi)
* the kthread.
*/
while (true) {
- if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state))
+ if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state))
break;
msleep(20);
@@ -6963,6 +7123,16 @@ static void napi_stop_kthread(struct napi_struct *napi)
napi->thread = NULL;
}
+static void napi_set_threaded_state(struct napi_struct *napi,
+ enum netdev_napi_threaded threaded_mode)
+{
+ bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED;
+ bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL;
+
+ assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+ assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll);
+}
+
int napi_set_threaded(struct napi_struct *napi,
enum netdev_napi_threaded threaded)
{
@@ -6989,7 +7159,7 @@ int napi_set_threaded(struct napi_struct *napi,
} else {
/* Make sure kthread is created before THREADED bit is set. */
smp_mb__before_atomic();
- assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+ napi_set_threaded_state(napi, threaded);
}
return 0;
@@ -6999,7 +7169,7 @@ int netif_set_threaded(struct net_device *dev,
enum netdev_napi_threaded threaded)
{
struct napi_struct *napi;
- int err = 0;
+ int i, err = 0;
netdev_assert_locked_or_invisible(dev);
@@ -7021,6 +7191,10 @@ int netif_set_threaded(struct net_device *dev,
list_for_each_entry(napi, &dev->napi_list, dev_list)
WARN_ON_ONCE(napi_set_threaded(napi, threaded));
+ /* Override the config for all NAPIs even if currently not listed */
+ for (i = 0; i < dev->num_napi_configs; i++)
+ dev->napi_config[i].threaded = threaded;
+
return err;
}
@@ -7353,8 +7527,9 @@ void netif_napi_add_weight_locked(struct net_device *dev,
* Clear dev->threaded if kthread creation failed so that
* threaded mode will not be enabled in napi_enable().
*/
- if (dev->threaded && napi_kthread_create(napi))
- dev->threaded = NETDEV_NAPI_THREADED_DISABLED;
+ if (napi_get_threaded_config(dev, napi))
+ if (napi_kthread_create(napi))
+ dev->threaded = NETDEV_NAPI_THREADED_DISABLED;
netif_napi_set_irq_locked(napi, -1);
}
EXPORT_SYMBOL(netif_napi_add_weight_locked);
@@ -7376,7 +7551,9 @@ void napi_disable_locked(struct napi_struct *n)
}
new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
- new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
+ new &= ~(NAPIF_STATE_THREADED |
+ NAPIF_STATE_THREADED_BUSY_POLL |
+ NAPIF_STATE_PREFER_BUSY_POLL);
} while (!try_cmpxchg(&n->state, &val, new));
hrtimer_cancel(&n->timer);
@@ -7588,7 +7765,7 @@ static int napi_thread_wait(struct napi_struct *napi)
return -1;
}
-static void napi_threaded_poll_loop(struct napi_struct *napi)
+static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
struct softnet_data *sd;
@@ -7615,24 +7792,49 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
- skb_defer_free_flush(sd);
+ skb_defer_free_flush();
bpf_net_ctx_clear(bpf_net_ctx);
+
+ /* When busy poll is enabled, the old packets are not flushed in
+ * napi_complete_done. So flush them here.
+ */
+ if (busy_poll)
+ gro_flush_normal(&napi->gro, HZ >= 1000);
local_bh_enable();
+ /* Call cond_resched here to avoid watchdog warnings. */
+ if (repoll || busy_poll) {
+ rcu_softirq_qs_periodic(last_qs);
+ cond_resched();
+ }
+
if (!repoll)
break;
-
- rcu_softirq_qs_periodic(last_qs);
- cond_resched();
}
}
static int napi_threaded_poll(void *data)
{
struct napi_struct *napi = data;
+ bool want_busy_poll;
+ bool in_busy_poll;
+ unsigned long val;
+
+ while (!napi_thread_wait(napi)) {
+ val = READ_ONCE(napi->state);
+
+ want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL;
+ in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL;
+
+ if (unlikely(val & NAPIF_STATE_DISABLE))
+ want_busy_poll = false;
+
+ if (want_busy_poll != in_busy_poll)
+ assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state,
+ want_busy_poll);
- while (!napi_thread_wait(napi))
- napi_threaded_poll_loop(napi);
+ napi_threaded_poll_loop(napi, want_busy_poll);
+ }
return 0;
}
@@ -7657,7 +7859,7 @@ start:
for (;;) {
struct napi_struct *n;
- skb_defer_free_flush(sd);
+ skb_defer_free_flush();
if (list_empty(&list)) {
if (list_empty(&repoll)) {
@@ -9780,7 +9982,7 @@ DECLARE_RWSEM(dev_addr_sem);
/* "sa" is a true struct sockaddr with limited "sa_data" member. */
int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
{
- size_t size = sizeof(sa->sa_data_min);
+ size_t size = sizeof(sa->sa_data);
struct net_device *dev;
int ret = 0;
@@ -11873,6 +12075,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
goto free_all;
dev->cfg_pending = dev->cfg;
+ dev->num_napi_configs = maxqs;
napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
if (!dev->napi_config)
@@ -12070,6 +12273,35 @@ static void dev_memory_provider_uninstall(struct net_device *dev)
}
}
+/* devices must be UP and netdev_lock()'d */
+static void netif_close_many_and_unlock(struct list_head *close_head)
+{
+ struct net_device *dev, *tmp;
+
+ netif_close_many(close_head, false);
+
+ /* ... now unlock them */
+ list_for_each_entry_safe(dev, tmp, close_head, close_list) {
+ netdev_unlock(dev);
+ list_del_init(&dev->close_list);
+ }
+}
+
+static void netif_close_many_and_unlock_cond(struct list_head *close_head)
+{
+#ifdef CONFIG_LOCKDEP
+ /* We can only track up to MAX_LOCK_DEPTH locks per task.
+ *
+ * Reserve half the available slots for additional locks possibly
+ * taken by notifiers and (soft)irqs.
+ */
+ unsigned int limit = MAX_LOCK_DEPTH / 2;
+
+ if (lockdep_depth(current) > limit)
+ netif_close_many_and_unlock(close_head);
+#endif
+}
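For scale (not part of the patch): lockdep currently tracks at most MAX_LOCK_DEPTH (48) held locks per task, so this flushes and unlocks the accumulated devices once roughly 24 netdev instance locks are held, keeping the other half free for notifiers and (soft)irqs.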
+
void unregister_netdevice_many_notify(struct list_head *head,
u32 portid, const struct nlmsghdr *nlh)
{
@@ -12102,17 +12334,18 @@ void unregister_netdevice_many_notify(struct list_head *head,
/* If device is running, close it first. Start with ops locked... */
list_for_each_entry(dev, head, unreg_list) {
+ if (!(dev->flags & IFF_UP))
+ continue;
if (netdev_need_ops_lock(dev)) {
list_add_tail(&dev->close_list, &close_head);
netdev_lock(dev);
}
+ netif_close_many_and_unlock_cond(&close_head);
}
- netif_close_many(&close_head, true);
- /* ... now unlock them and go over the rest. */
+ netif_close_many_and_unlock(&close_head);
+ /* ... now go over the rest. */
list_for_each_entry(dev, head, unreg_list) {
- if (netdev_need_ops_lock(dev))
- netdev_unlock(dev);
- else
+ if (!netdev_need_ops_lock(dev))
list_add_tail(&dev->close_list, &close_head);
}
netif_close_many(&close_head, true);
@@ -12510,6 +12743,94 @@ netdev_features_t netdev_increment_features(netdev_features_t all,
}
EXPORT_SYMBOL(netdev_increment_features);
+/**
+ * netdev_compute_master_upper_features - compute feature from lowers
+ * @dev: the upper device
+ * @update_header: whether to update upper device's header_len/headroom/tailroom
+ *
+ * Recompute the upper device's feature based on all lower devices.
+ */
+void netdev_compute_master_upper_features(struct net_device *dev, bool update_header)
+{
+ unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
+ netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES;
+ netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES;
+ netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES;
+ netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES;
+ netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES;
+ unsigned short max_header_len = ETH_HLEN;
+ unsigned int tso_max_size = TSO_MAX_SIZE;
+ unsigned short max_headroom = 0;
+ unsigned short max_tailroom = 0;
+ u16 tso_max_segs = TSO_MAX_SEGS;
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ mpls_features = netdev_base_features(mpls_features);
+ vlan_features = netdev_base_features(vlan_features);
+ enc_features = netdev_base_features(enc_features);
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ gso_partial_features = netdev_increment_features(gso_partial_features,
+ lower_dev->gso_partial_features,
+ MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES);
+
+ vlan_features = netdev_increment_features(vlan_features,
+ lower_dev->vlan_features,
+ MASTER_UPPER_DEV_VLAN_FEATURES);
+
+ enc_features = netdev_increment_features(enc_features,
+ lower_dev->hw_enc_features,
+ MASTER_UPPER_DEV_ENC_FEATURES);
+
+ if (IS_ENABLED(CONFIG_XFRM_OFFLOAD))
+ xfrm_features = netdev_increment_features(xfrm_features,
+ lower_dev->hw_enc_features,
+ MASTER_UPPER_DEV_XFRM_FEATURES);
+
+ mpls_features = netdev_increment_features(mpls_features,
+ lower_dev->mpls_features,
+ MASTER_UPPER_DEV_MPLS_FEATURES);
+
+ dst_release_flag &= lower_dev->priv_flags;
+
+ if (update_header) {
+ max_header_len = max(max_header_len, lower_dev->hard_header_len);
+ max_headroom = max(max_headroom, lower_dev->needed_headroom);
+ max_tailroom = max(max_tailroom, lower_dev->needed_tailroom);
+ }
+
+ tso_max_size = min(tso_max_size, lower_dev->tso_max_size);
+ tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs);
+ }
+
+ dev->gso_partial_features = gso_partial_features;
+ dev->vlan_features = vlan_features;
+ dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX;
+ if (IS_ENABLED(CONFIG_XFRM_OFFLOAD))
+ dev->hw_enc_features |= xfrm_features;
+ dev->mpls_features = mpls_features;
+
+ dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
+ dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
+ dev->priv_flags |= IFF_XMIT_DST_RELEASE;
+
+ if (update_header) {
+ dev->hard_header_len = max_header_len;
+ dev->needed_headroom = max_headroom;
+ dev->needed_tailroom = max_tailroom;
+ }
+
+ netif_set_tso_max_segs(dev, tso_max_segs);
+ netif_set_tso_max_size(dev, tso_max_size);
+
+ netdev_change_features(dev);
+}
+EXPORT_SYMBOL(netdev_compute_master_upper_features);
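A hypothetical usage sketch (not from the patch): an upper/master driver could recompute its offloads after linking a lower device. Everything except the two existing netdev helpers is an invented name:

static int my_master_add_lower(struct net_device *master,
			       struct net_device *lower,
			       struct netlink_ext_ack *extack)
{
	int err;

	err = netdev_master_upper_dev_link(lower, master, NULL, NULL, extack);
	if (err)
		return err;

	/* Pull vlan/enc/mpls features, TSO limits and header sizes from all lowers. */
	netdev_compute_master_upper_features(master, true);
	return 0;
}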
+
static struct hlist_head * __net_init netdev_create_hash(void)
{
int i;
@@ -12823,7 +13144,7 @@ static void run_backlog_napi(unsigned int cpu)
{
struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
- napi_threaded_poll_loop(&sd->backlog);
+ napi_threaded_poll_loop(&sd->backlog, false);
}
static void backlog_napi_setup(unsigned int cpu)
@@ -12890,7 +13211,6 @@ static int __init net_dev_init(void)
sd->cpu = i;
#endif
INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
- spin_lock_init(&sd->defer_lock);
gro_init(&sd->backlog.gro);
sd->backlog.poll = process_backlog;
@@ -12900,6 +13220,11 @@ static int __init net_dev_init(void)
if (net_page_pool_create(i))
goto out;
}
+ net_hotdata.skb_defer_nodes =
+ __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids,
+ __alignof__(struct skb_defer_node));
+ if (!net_hotdata.skb_defer_nodes)
+ goto out;
if (use_backlog_threads())
smpboot_register_percpu_thread(&backlog_threads);
diff --git a/net/core/dev.h b/net/core/dev.h
index ab69edc0c3e3..da18536cbd35 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -29,7 +29,6 @@ struct napi_struct *
netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
struct net_device *dev_get_by_napi_id(unsigned int napi_id);
-struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex);
struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
struct net_device *
netdev_xa_find_lock(struct net *net, struct net_device *dev,
@@ -317,12 +316,23 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n)
{
+ if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state))
+ return NETDEV_NAPI_THREADED_BUSY_POLL;
+
if (test_bit(NAPI_STATE_THREADED, &n->state))
return NETDEV_NAPI_THREADED_ENABLED;
return NETDEV_NAPI_THREADED_DISABLED;
}
+static inline enum netdev_napi_threaded
+napi_get_threaded_config(struct net_device *dev, struct napi_struct *n)
+{
+ if (n->config)
+ return n->config->threaded;
+ return dev->threaded;
+}
+
int napi_set_threaded(struct napi_struct *n,
enum netdev_napi_threaded threaded);
@@ -349,7 +359,7 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi)
WARN_ON(READ_ONCE(napi->list_owner) != -1);
}
-void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
+void kick_defer_list_purge(unsigned int cpu);
#define XMIT_RECURSION_LIMIT 8
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 9c0ad7f4b5d8..53a53357cfef 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -249,10 +249,11 @@ int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
*
* Helper for calling the default hardware provider timestamping.
*
- * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and
- * there only exists a phydev->mii_ts->hwtstamp() method. So this will return
- * -EOPNOTSUPP for phylib for now, which is still more accurate than letting
- * the netdev handle the GET request.
+ * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), but
+ * phydev->mii_ts has both hwtstamp_get() and hwtstamp_set() methods. So this
+ * will return -EOPNOTSUPP for phylib only if hwtstamp_get() is not
+ * implemented for now, which is still more accurate than letting the netdev
+ * handle the GET request.
*/
int dev_get_hwtstamp_phylib(struct net_device *dev,
struct kernel_hwtstamp_config *cfg)
@@ -443,6 +444,9 @@ static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd,
struct ifreq ifrr;
int err;
+ if (!kernel_cfg->ifr)
+ return -EINVAL;
+
strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ);
ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru;
@@ -464,8 +468,15 @@ int generic_hwtstamp_get_lower(struct net_device *dev,
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_get)
- return dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ if (ops->ndo_hwtstamp_get) {
+ int err;
+
+ netdev_lock_ops(dev);
+ err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ netdev_unlock_ops(dev);
+
+ return err;
+ }
/* Legacy path: unconverted lower driver */
return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg);
@@ -481,8 +492,15 @@ int generic_hwtstamp_set_lower(struct net_device *dev,
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_set)
- return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ if (ops->ndo_hwtstamp_set) {
+ int err;
+
+ netdev_lock_ops(dev);
+ err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ netdev_unlock_ops(dev);
+
+ return err;
+ }
/* Legacy path: unconverted lower driver */
return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg);
@@ -582,7 +600,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
if (ifr->ifr_hwaddr.sa_family != dev->type)
return -EINVAL;
memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
- min(sizeof(ifr->ifr_hwaddr.sa_data_min),
+ min(sizeof(ifr->ifr_hwaddr.sa_data),
(size_t)dev->addr_len));
netdev_lock_ops(dev);
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 24c591ab38ae..ec4217d6c0b4 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -17,6 +17,7 @@
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/sock.h>
+#include <net/tcp.h>
#include <trace/events/page_pool.h>
#include "devmem.h"
@@ -96,9 +97,9 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
index = offset / PAGE_SIZE;
niov = &owner->area.niovs[index];
- niov->pp_magic = 0;
- niov->pp = NULL;
- atomic_long_set(&niov->pp_ref_count, 0);
+ niov->desc.pp_magic = 0;
+ niov->desc.pp = NULL;
+ atomic_long_set(&niov->desc.pp_ref_count, 0);
return niov;
}
@@ -176,6 +177,7 @@ err_close_rxq:
struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack)
@@ -188,6 +190,11 @@ net_devmem_bind_dmabuf(struct net_device *dev,
unsigned long virtual;
int err;
+ if (!dma_dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't support DMA");
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
dmabuf = dma_buf_get(dmabuf_fd);
if (IS_ERR(dmabuf))
return ERR_CAST(dmabuf);
@@ -209,7 +216,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
binding->dmabuf = dmabuf;
binding->direction = direction;
- binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent);
+ binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev);
if (IS_ERR(binding->attachment)) {
err = PTR_ERR(binding->attachment);
NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
@@ -351,7 +358,8 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
unsigned int dmabuf_id)
{
struct net_devmem_dmabuf_binding *binding;
- struct dst_entry *dst = __sk_dst_get(sk);
+ struct net_device *dst_dev;
+ struct dst_entry *dst;
int err = 0;
binding = net_devmem_lookup_dmabuf(dmabuf_id);
@@ -360,16 +368,35 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
goto out_err;
}
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ /* If dst is NULL (route expired), attempt to rebuild it. */
+ if (unlikely(!dst)) {
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) {
+ err = -EHOSTUNREACH;
+ goto out_unlock;
+ }
+ dst = __sk_dst_get(sk);
+ if (unlikely(!dst)) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
/* The dma-addrs in this binding are only reachable to the corresponding
* net_device.
*/
- if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) {
+ dst_dev = dst_dev_rcu(dst);
+ if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) {
err = -ENODEV;
- goto out_err;
+ goto out_unlock;
}
+ rcu_read_unlock();
return binding;
+out_unlock:
+ rcu_read_unlock();
out_err:
if (binding)
net_devmem_dmabuf_binding_put(binding);
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 41cd6e1c9141..0b43a648cd2e 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -85,6 +85,7 @@ struct dmabuf_genpool_chunk_owner {
void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack);
@@ -93,7 +94,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
struct net_devmem_dmabuf_binding *binding,
struct netlink_ext_ack *extack);
-void net_devmem_bind_tx_release(struct sock *sk);
static inline struct dmabuf_genpool_chunk_owner *
net_devmem_iov_to_chunk_owner(const struct net_iov *niov)
@@ -170,6 +170,7 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov)
static inline struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd,
struct netdev_nl_sock *priv,
diff --git a/net/core/dst.c b/net/core/dst.c
index e2de8b68c41d..e9d35f49c9e7 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -150,7 +150,7 @@ void dst_dev_put(struct dst_entry *dst)
dst->ops->ifdown(dst, dev);
WRITE_ONCE(dst->input, dst_discard);
WRITE_ONCE(dst->output, dst_discard_out);
- WRITE_ONCE(dst->dev, blackhole_netdev);
+ rcu_assign_pointer(dst->dev_rcu, blackhole_netdev);
netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
GFP_ATOMIC);
}
diff --git a/net/core/filter.c b/net/core/filter.c
index da391e2b0788..616e0520a0bb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2281,6 +2281,7 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
if (IS_ERR(dst))
goto out_drop;
+ skb_dst_drop(skb);
skb_dst_set(skb, dst);
} else if (nh->nh_family != AF_INET6) {
goto out_drop;
@@ -2373,7 +2374,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
struct flowi4 fl4 = {
.flowi4_flags = FLOWI_FLAG_ANYSRC,
.flowi4_mark = skb->mark,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)),
+ .flowi4_dscp = ip4h_dscp(ip4h),
.flowi4_oif = dev->ifindex,
.flowi4_proto = ip4h->protocol,
.daddr = ip4h->daddr,
@@ -2389,6 +2390,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
goto out_drop;
}
+ skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
}
@@ -2456,6 +2458,13 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return -EINVAL;
+ /* BPF test infra's convert___skb_to_skb() can create type-less
+ * GSO packets. gso_features_check() will detect this as a bad
+ * offload. However, lets not leak them out in the first place.
+ */
+ if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type))
+ return -EBADMSG;
+
dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
if (unlikely(!dev))
return -EINVAL;
@@ -3251,11 +3260,11 @@ static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
- /* Caller already did skb_cow() with len as headroom,
+ /* Caller already did skb_cow() with meta_len+len as headroom,
* so no need to do it here.
*/
skb_push(skb, len);
- memmove(skb->data, skb->data + len, off);
+ skb_postpush_data_move(skb, len, off);
memset(skb->data + off, 0, len);
/* No skb_postpush_rcsum(skb, skb->data + off, len)
@@ -3279,7 +3288,7 @@ static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
old_data = skb->data;
__skb_pull(skb, len);
skb_postpull_rcsum(skb, old_data + off, len);
- memmove(skb->data, old_data, off);
+ skb_postpull_data_move(skb, len, off);
return 0;
}
@@ -3324,10 +3333,11 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
{
const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ const u8 meta_len = skb_metadata_len(skb);
u32 off = skb_mac_header_len(skb);
int ret;
- ret = skb_cow(skb, len_diff);
+ ret = skb_cow(skb, meta_len + len_diff);
if (unlikely(ret < 0))
return ret;
@@ -3487,6 +3497,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
u16 mac_len = 0, inner_net = 0, inner_trans = 0;
+ const u8 meta_len = skb_metadata_len(skb);
unsigned int gso_type = SKB_GSO_DODGY;
int ret;
@@ -3497,7 +3508,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
return -ENOTSUPP;
}
- ret = skb_cow_head(skb, len_diff);
+ ret = skb_cow_head(skb, meta_len + len_diff);
if (unlikely(ret < 0))
return ret;
@@ -3871,15 +3882,17 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = {
static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
u64 flags)
{
+ const u8 meta_len = skb_metadata_len(skb);
u32 max_len = BPF_SKB_MAX_LEN;
u32 new_len = skb->len + head_room;
int ret;
- if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
+ if (unlikely(flags || (int)head_room < 0 ||
+ (!skb_is_gso(skb) && new_len > max_len) ||
new_len < skb->len))
return -EINVAL;
- ret = skb_cow(skb, head_room);
+ ret = skb_cow(skb, meta_len + head_room);
if (likely(!ret)) {
/* Idea for this helper is that we currently only
* allow to expand on mac header. This means that
@@ -3891,6 +3904,7 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
* for redirection into L2 device.
*/
__skb_push(skb, head_room);
+ skb_postpush_data_move(skb, head_room, 0);
memset(skb->data, 0, head_room);
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);
@@ -4153,34 +4167,45 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
return 0;
}
-static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
- enum xdp_mem_type mem_type, bool release)
+static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
+ bool tail, bool release)
{
- struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp);
+ struct xdp_buff *zc_frag = tail ? xsk_buff_get_tail(xdp) :
+ xsk_buff_get_head(xdp);
if (release) {
- xsk_buff_del_tail(zc_frag);
- __xdp_return(0, mem_type, false, zc_frag);
+ xsk_buff_del_frag(zc_frag);
} else {
- zc_frag->data_end -= shrink;
+ if (tail)
+ zc_frag->data_end -= shrink;
+ else
+ zc_frag->data += shrink;
}
+
+ return zc_frag;
}
static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
- int shrink)
+ int shrink, bool tail)
{
enum xdp_mem_type mem_type = xdp->rxq->mem.type;
bool release = skb_frag_size(frag) == shrink;
+ netmem_ref netmem = skb_frag_netmem(frag);
+ struct xdp_buff *zc_frag = NULL;
if (mem_type == MEM_TYPE_XSK_BUFF_POOL) {
- bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release);
- goto out;
+ netmem = 0;
+ zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release);
}
- if (release)
- __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL);
+ if (release) {
+ __xdp_return(netmem, mem_type, false, zc_frag);
+ } else {
+ if (!tail)
+ skb_frag_off_add(frag, shrink);
+ skb_frag_size_sub(frag, shrink);
+ }
-out:
return release;
}
@@ -4198,18 +4223,15 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
len_free += shrink;
offset -= shrink;
- if (bpf_xdp_shrink_data(xdp, frag, shrink)) {
+ if (bpf_xdp_shrink_data(xdp, frag, shrink, true))
n_frags_free++;
- } else {
- skb_frag_size_sub(frag, shrink);
- break;
- }
}
sinfo->nr_frags -= n_frags_free;
sinfo->xdp_frags_size -= len_free;
if (unlikely(!sinfo->nr_frags)) {
xdp_buff_clear_frags_flag(xdp);
+ xdp_buff_clear_frag_pfmemalloc(xdp);
xdp->data_end -= offset;
}
@@ -5723,6 +5745,77 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
.arg5_type = ARG_CONST_SIZE,
};
+static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk,
+ char *optval, int optlen,
+ bool getopt)
+{
+ int val;
+
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ if (!sk_has_account(sk))
+ return -EOPNOTSUPP;
+
+ if (getopt) {
+ *(int *)optval = sk->sk_bypass_prot_mem;
+ return 0;
+ }
+
+ val = *(int *)optval;
+ if (val < 0 || val > 1)
+ return -EINVAL;
+
+ sk->sk_bypass_prot_mem = val;
+ return 0;
+}
+
+BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM)
+ return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false);
+
+ return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = {
+ .func = bpf_sock_create_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) {
+ int err = sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true);
+
+ if (err)
+ memset(optval, 0, optlen);
+
+ return err;
+ }
+
+ return __bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = {
+ .func = bpf_sock_create_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
@@ -5897,7 +5990,7 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
return err;
if (((struct sockaddr_in *)addr)->sin_port == htons(0))
flags |= BIND_FORCE_ADDRESS_NO_PORT;
- return __inet_bind(sk, addr, addr_len, flags);
+ return __inet_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags);
#if IS_ENABLED(CONFIG_IPV6)
} else if (addr->sa_family == AF_INET6) {
if (addr_len < SIN6_LEN_RFC2133)
@@ -5907,7 +6000,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
/* ipv6_bpf_stub cannot be NULL, since it's called from
* bpf_cgroup_inet6_connect hook and ipv6 is already loaded
*/
- return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
+ return ipv6_bpf_stub->inet6_bind(sk, (struct sockaddr_unsized *)addr,
+ addr_len, flags);
#endif /* CONFIG_IPV6 */
}
#endif /* CONFIG_INET */
@@ -6020,7 +6114,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
fl4.flowi4_iif = params->ifindex;
fl4.flowi4_oif = 0;
}
- fl4.flowi4_tos = params->tos & INET_DSCP_MASK;
+ fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos);
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = 0;
@@ -6411,9 +6505,12 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
*/
if (skb_is_gso(skb)) {
ret = BPF_MTU_CHK_RET_SUCCESS;
- if (flags & BPF_MTU_CHK_SEGS &&
- !skb_gso_validate_network_len(skb, mtu))
- ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
+ if (flags & BPF_MTU_CHK_SEGS) {
+ if (!skb_transport_header_was_set(skb))
+ return -EINVAL;
+ if (!skb_gso_validate_network_len(skb, mtu))
+ ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
+ }
}
out:
*mtu_len = mtu;
@@ -6767,7 +6864,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
int dif, int sdif, u8 family, u8 proto)
{
- struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
bool refcounted = false;
struct sock *sk = NULL;
@@ -6776,7 +6872,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
__be32 dst4 = tuple->ipv4.daddr;
if (proto == IPPROTO_TCP)
- sk = __inet_lookup(net, hinfo, NULL, 0,
+ sk = __inet_lookup(net, NULL, 0,
src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport,
dif, sdif, &refcounted);
@@ -6790,7 +6886,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
if (proto == IPPROTO_TCP)
- sk = __inet6_lookup(net, hinfo, NULL, 0,
+ sk = __inet6_lookup(net, NULL, 0,
src6, tuple->ipv6.sport,
dst6, ntohs(tuple->ipv6.dport),
dif, sdif, &refcounted);
@@ -7431,6 +7527,8 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
offsetof(struct xdp_sock, FIELD)); \
} while (0)
+ BTF_TYPE_EMIT(struct bpf_xdp_sock);
+
switch (si->off) {
case offsetof(struct bpf_xdp_sock, queue_id):
BPF_XDP_SOCK_GET(queue_id);
@@ -8051,6 +8149,20 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_get_cg_sock_proto;
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_setsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ return &bpf_sock_create_setsockopt_proto;
+ default:
+ return NULL;
+ }
+ case BPF_FUNC_getsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ return &bpf_sock_create_getsockopt_proto;
+ default:
+ return NULL;
+ }
default:
return bpf_base_func_proto(func_id, prog);
}
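
The hunks above add a SOL_SOCKET option, SK_BPF_BYPASS_PROT_MEM, that BPF_CGROUP_INET_SOCK_CREATE programs can set or read through bpf_setsockopt()/bpf_getsockopt(). Purely as an illustration (not part of this patch), a minimal cgroup/sock_create program might use it as below; the includes, SEC() name and skeleton follow common libbpf conventions and are assumptions, not content of this diff:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#ifndef SOL_SOCKET
#define SOL_SOCKET 1	/* Linux value; normally from <sys/socket.h> */
#endif

SEC("cgroup/sock_create")
int bypass_prot_mem(struct bpf_sock *ctx)
{
	int one = 1;

	/* SK_BPF_BYPASS_PROT_MEM comes from the UAPI bpf.h paired with this
	 * series. Sockets without memory accounting reject the option with
	 * -EOPNOTSUPP; the return value is ignored here.
	 */
	bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM,
		       &one, sizeof(one));

	return 1;	/* allow the socket to be created */
}

char _license[] SEC("license") = "GPL";
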
@@ -9284,13 +9396,17 @@ static bool sock_addr_is_valid_access(int off, int size,
return false;
info->reg_type = PTR_TO_SOCKET;
break;
- default:
- if (type == BPF_READ) {
- if (size != size_default)
- return false;
- } else {
+ case bpf_ctx_range(struct bpf_sock_addr, user_family):
+ case bpf_ctx_range(struct bpf_sock_addr, family):
+ case bpf_ctx_range(struct bpf_sock_addr, type):
+ case bpf_ctx_range(struct bpf_sock_addr, protocol):
+ if (type != BPF_READ)
return false;
- }
+ if (size != size_default)
+ return false;
+ break;
+ default:
+ return false;
}
return true;
@@ -11990,6 +12106,28 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return func;
}
+/**
+ * bpf_skb_meta_pointer() - Get a mutable pointer within the skb metadata area.
+ * @skb: socket buffer carrying the metadata
+ * @offset: offset into the metadata area, must be <= skb_metadata_len()
+ */
+void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
+{
+ return skb_metadata_end(skb) - skb_metadata_len(skb) + offset;
+}
+
+int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset,
+ const void *from, u32 len, u64 flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ if (unlikely(bpf_try_make_writable(skb, 0)))
+ return -EFAULT;
+
+ memmove(bpf_skb_meta_pointer(skb, offset), from, len);
+ return 0;
+}
+
__bpf_kfunc_start_defs();
__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
struct bpf_dynptr *ptr__uninit)
@@ -12007,6 +12145,36 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
return 0;
}
+/**
+ * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area.
+ * @skb_: socket buffer carrying the metadata
+ * @flags: future use, must be zero
+ * @ptr__uninit: dynptr to initialize
+ *
+ * Set up a dynptr for access to the metadata area allocated earlier from the
+ * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to
+ * &__sk_buff->data_meta.
+ *
+ * Return:
+ * * %0 - dynptr ready to use
+ * * %-EINVAL - invalid flags, dynptr set to null
+ */
+__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct sk_buff *skb = (struct sk_buff *)skb_;
+
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb));
+
+ return 0;
+}
+
__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags,
struct bpf_dynptr *ptr__uninit)
{
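
As a usage illustration (not part of this patch), a tc program could consume metadata through the new kfunc roughly as follows; the extern declaration style, SEC() name and skeleton are the usual libbpf conventions and are assumed here:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
				    struct bpf_dynptr *ptr__uninit) __ksym;

SEC("tc")
int read_xdp_meta(struct __sk_buff *skb)
{
	struct bpf_dynptr meta;
	__u32 mark = 0;

	/* Non-zero flags fail with -EINVAL and leave the dynptr null. */
	if (bpf_dynptr_from_skb_meta(skb, 0, &meta))
		return 0;	/* TC_ACT_OK */

	/* Read the first four bytes of metadata that an XDP program
	 * reserved earlier with bpf_xdp_adjust_meta(). The read fails if
	 * the metadata area is smaller than sizeof(mark).
	 */
	if (!bpf_dynptr_read(&mark, sizeof(mark), &meta, 0, 0))
		skb->mark = mark;

	return 0;	/* TC_ACT_OK */
}

char _license[] SEC("license") = "GPL";
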
@@ -12160,6 +12328,98 @@ __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
return 0;
}
+/**
+ * bpf_xdp_pull_data() - Pull in non-linear xdp data.
+ * @x: &xdp_md associated with the XDP buffer
+ * @len: length of data to be made directly accessible in the linear part
+ *
+ * Pull in data if the XDP buffer associated with @x is non-linear and not
+ * all of the first @len bytes are in the linear data area.
+ *
+ * Direct packet access allows reading and writing linear XDP data through
+ * packet pointers (i.e., &xdp_md->data + offsets). The amount of data which
+ * ends up in the linear part of the xdp_buff depends on the NIC and its
+ * configuration. When a frag-capable XDP program wants to directly access
+ * headers that may be in the non-linear area, call this kfunc to make sure
+ * the data is available in the linear area. Alternatively, use dynptr or
+ * bpf_xdp_{load,store}_bytes() to access data without pulling.
+ *
+ * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate
+ * headers in the non-linear data area.
+ *
+ * A call to this kfunc may reduce headroom. If there is not enough tailroom
+ * in the linear data area, metadata and data will be shifted down.
+ *
+ * A call to this kfunc may change the buffer geometry. Therefore, when the
+ * kfunc is used in combination with direct packet access, all packet pointer
+ * bounds checks done before the call are invalidated by the verifier and
+ * must be performed again.
+ *
+ * Return:
+ * * %0 - success
+ * * %-EINVAL - invalid len
+ */
+__bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len)
+{
+ struct xdp_buff *xdp = (struct xdp_buff *)x;
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ int i, delta, shift, headroom, tailroom, n_frags_free = 0;
+ void *data_hard_end = xdp_data_hard_end(xdp);
+ int data_len = xdp->data_end - xdp->data;
+ void *start;
+
+ if (len <= data_len)
+ return 0;
+
+ if (unlikely(len > xdp_get_buff_len(xdp)))
+ return -EINVAL;
+
+ start = xdp_data_meta_unsupported(xdp) ? xdp->data : xdp->data_meta;
+
+ headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame);
+ tailroom = data_hard_end - xdp->data_end;
+
+ delta = len - data_len;
+ if (unlikely(delta > tailroom + headroom))
+ return -EINVAL;
+
+ shift = delta - tailroom;
+ if (shift > 0) {
+ memmove(start - shift, start, xdp->data_end - start);
+
+ xdp->data_meta -= shift;
+ xdp->data -= shift;
+ xdp->data_end -= shift;
+ }
+
+ for (i = 0; i < sinfo->nr_frags && delta; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+ u32 shrink = min_t(u32, delta, skb_frag_size(frag));
+
+ memcpy(xdp->data_end, skb_frag_address(frag), shrink);
+
+ xdp->data_end += shrink;
+ sinfo->xdp_frags_size -= shrink;
+ delta -= shrink;
+ if (bpf_xdp_shrink_data(xdp, frag, shrink, false))
+ n_frags_free++;
+ }
+
+ if (unlikely(n_frags_free)) {
+ memmove(sinfo->frags, sinfo->frags + n_frags_free,
+ (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t));
+
+ sinfo->nr_frags -= n_frags_free;
+
+ if (!sinfo->nr_frags) {
+ xdp_buff_clear_frags_flag(xdp);
+ xdp_buff_clear_frag_pfmemalloc(xdp);
+ }
+ }
+
+ return 0;
+}
+
__bpf_kfunc_end_defs();
int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
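
Again purely as an illustration (not part of this patch), a frag-aware XDP program could use the kfunc to guarantee that the headers it parses sit in the linear area; the 128-byte pull size and the skeleton below are arbitrary assumptions:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern int bpf_xdp_pull_data(struct xdp_md *xdp, __u32 len) __ksym;

SEC("xdp.frags")
int pull_then_parse(struct xdp_md *ctx)
{
	void *data, *data_end;
	struct ethhdr *eth;

	/* Make at least 128 bytes directly accessible. Fails with -EINVAL
	 * when the frame is shorter or head/tailroom cannot absorb the pull.
	 */
	if (bpf_xdp_pull_data(ctx, 128))
		return XDP_PASS;

	/* The pull may have moved data; packet pointers must be re-derived
	 * and re-checked, as noted in the kfunc documentation above.
	 */
	data = (void *)(long)ctx->data;
	data_end = (void *)(long)ctx->data_end;

	eth = data;
	if ((void *)(eth + 1) > data_end)
		return XDP_PASS;

	/* ... parse further headers here ... */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
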
@@ -12181,8 +12441,13 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_skb)
+BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta)
+
BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
+BTF_ID_FLAGS(func, bpf_xdp_pull_data)
BTF_KFUNCS_END(bpf_kfunc_check_set_xdp)
BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr)
@@ -12202,6 +12467,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
.set = &bpf_kfunc_check_set_skb,
};
+static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_skb_meta,
+};
+
static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
.owner = THIS_MODULE,
.set = &bpf_kfunc_check_set_xdp,
@@ -12237,6 +12507,8 @@ static int __init bpf_kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
&bpf_kfunc_set_sock_addr);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 7d426a8e29f3..f112156db587 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -90,10 +90,12 @@ static void est_timer(struct timer_list *t)
rate = (b_packets - est->last_packets) << (10 - est->intvl_log);
rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log);
+ preempt_disable_nested();
write_seqcount_begin(&est->seq);
est->avbps += brate;
est->avpps += rate;
write_seqcount_end(&est->seq);
+ preempt_enable_nested();
est->last_bytes = b_bytes;
est->last_packets = b_packets;
diff --git a/net/core/gro.c b/net/core/gro.c
index b350e5b69549..76f9c3712422 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include <net/psp.h>
#include <net/gro.h>
#include <net/dst_metadata.h>
#include <net/busy_poll.h>
@@ -376,6 +377,7 @@ static void gro_list_prepare(const struct list_head *head,
diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
diffs |= gro_list_prepare_tc_ext(skb, p, diffs);
+ diffs |= __psp_skb_coalesce_diff(skb, p, diffs);
}
NAPI_GRO_CB(p)->same_flow = !diffs;
@@ -637,6 +639,8 @@ EXPORT_SYMBOL(gro_receive_skb);
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
+ struct skb_shared_info *shinfo;
+
if (unlikely(skb->pfmemalloc)) {
consume_skb(skb);
return;
@@ -653,8 +657,12 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->encapsulation = 0;
skb->ip_summed = CHECKSUM_NONE;
- skb_shinfo(skb)->gso_type = 0;
- skb_shinfo(skb)->gso_size = 0;
+
+ shinfo = skb_shinfo(skb);
+ shinfo->gso_type = 0;
+ shinfo->gso_size = 0;
+ shinfo->hwtstamps.hwtstamp = 0;
+
if (unlikely(skb->slow_gro)) {
skb_orphan(skb);
skb_ext_reset(skb);
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
index ff8e5b64bf6b..a725d21159a6 100644
--- a/net/core/gro_cells.c
+++ b/net/core/gro_cells.c
@@ -8,11 +8,13 @@
struct gro_cell {
struct sk_buff_head napi_skbs;
struct napi_struct napi;
+ local_lock_t bh_lock;
};
int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
+ bool have_bh_lock = false;
struct gro_cell *cell;
int res;
@@ -25,6 +27,8 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
goto unlock;
}
+ local_lock_nested_bh(&gcells->cells->bh_lock);
+ have_bh_lock = true;
cell = this_cpu_ptr(gcells->cells);
if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) {
@@ -42,6 +46,8 @@ drop:
res = NET_RX_SUCCESS;
unlock:
+ if (have_bh_lock)
+ local_unlock_nested_bh(&gcells->cells->bh_lock);
rcu_read_unlock();
return res;
}
@@ -55,7 +61,9 @@ static int gro_cell_poll(struct napi_struct *napi, int budget)
int work_done = 0;
while (work_done < budget) {
+ __local_lock_nested_bh(&cell->bh_lock);
skb = __skb_dequeue(&cell->napi_skbs);
+ __local_unlock_nested_bh(&cell->bh_lock);
if (!skb)
break;
napi_gro_receive(napi, skb);
@@ -79,6 +87,7 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
__skb_queue_head_init(&cell->napi_skbs);
+ local_lock_init(&cell->bh_lock);
set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index 95d0a4df1006..dddd5c287cf0 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -20,7 +20,7 @@ struct net_hotdata net_hotdata __cacheline_aligned = {
.dev_tx_weight = 64,
.dev_rx_weight = 64,
.sysctl_max_skb_frags = MAX_SKB_FRAGS,
- .sysctl_skb_defer_max = 64,
+ .sysctl_skb_defer_max = 128,
.sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE
};
EXPORT_SYMBOL(net_hotdata);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 864f3bbc3a4c..212cde35affa 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -157,9 +157,9 @@ static void linkwatch_schedule_work(int urgent)
* override the existing timer.
*/
if (test_bit(LW_URGENT, &linkwatch_flags))
- mod_delayed_work(system_unbound_wq, &linkwatch_work, 0);
+ mod_delayed_work(system_dfl_wq, &linkwatch_work, 0);
else
- queue_delayed_work(system_unbound_wq, &linkwatch_work, delay);
+ queue_delayed_work(system_dfl_wq, &linkwatch_work, delay);
}
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index ae74634310a3..9f40be0c3e71 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -8,12 +8,12 @@
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
+#include <net/flow.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/ipv6_stubs.h>
-#include <net/inet_dscp.h>
struct bpf_lwt_prog {
struct bpf_prog *prog;
@@ -209,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
fl4.flowi4_oif = oif;
fl4.flowi4_mark = skb->mark;
fl4.flowi4_uid = sock_net_uid(net, sk);
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ fl4.flowi4_dscp = ip4h_dscp(iph);
fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
fl4.flowi4_proto = iph->protocol;
fl4.daddr = iph->daddr;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index bddfa389effa..96a3b1a93252 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -81,7 +81,7 @@ static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family
}
/*
- Neighbour hash table buckets are protected with rwlock tbl->lock.
+ Neighbour hash table buckets are protected with tbl->lock.
- All the scans/updates to hash buckets MUST be made under this lock.
- NOTHING clever should be made under this lock: no callbacks
@@ -149,7 +149,7 @@ static void neigh_update_gc_list(struct neighbour *n)
{
bool on_gc_list, exempt_from_gc;
- write_lock_bh(&n->tbl->lock);
+ spin_lock_bh(&n->tbl->lock);
write_lock(&n->lock);
if (n->dead)
goto out;
@@ -172,14 +172,14 @@ static void neigh_update_gc_list(struct neighbour *n)
}
out:
write_unlock(&n->lock);
- write_unlock_bh(&n->tbl->lock);
+ spin_unlock_bh(&n->tbl->lock);
}
static void neigh_update_managed_list(struct neighbour *n)
{
bool on_managed_list, add_to_managed;
- write_lock_bh(&n->tbl->lock);
+ spin_lock_bh(&n->tbl->lock);
write_lock(&n->lock);
if (n->dead)
goto out;
@@ -193,7 +193,7 @@ static void neigh_update_managed_list(struct neighbour *n)
list_add_tail(&n->managed_list, &n->tbl->managed_list);
out:
write_unlock(&n->lock);
- write_unlock_bh(&n->tbl->lock);
+ spin_unlock_bh(&n->tbl->lock);
}
static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify,
@@ -263,7 +263,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) {
if (refcount_read(&n->refcnt) == 1) {
@@ -292,7 +292,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
WRITE_ONCE(tbl->last_flush, jiffies);
unlock:
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
return shrunk;
}
@@ -454,23 +454,23 @@ static void neigh_flush_table(struct neigh_table *tbl)
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
{
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev, false);
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
}
EXPORT_SYMBOL(neigh_changeaddr);
static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
bool skip_perm)
{
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
if (likely(dev)) {
neigh_flush_dev(tbl, dev, skip_perm);
} else {
DEBUG_NET_WARN_ON_ONCE(skip_perm);
neigh_flush_table(tbl);
}
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
pneigh_ifdown(tbl, dev, skip_perm);
pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL,
@@ -687,7 +687,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
@@ -722,13 +722,13 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
hlist_add_head_rcu(&n->dev_list,
neigh_get_dev_table(dev, tbl->family));
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
out:
return rc;
out_tbl_unlock:
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
out_neigh_release:
if (!exempt_from_gc)
atomic_dec(&tbl->gc_entries);
@@ -982,7 +982,7 @@ static void neigh_periodic_work(struct work_struct *work)
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
@@ -995,8 +995,7 @@ static void neigh_periodic_work(struct work_struct *work)
WRITE_ONCE(tbl->last_rand, jiffies);
list_for_each_entry(p, &tbl->parms_list, list)
- p->reachable_time =
- neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ neigh_set_reach_time(p);
}
if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1))
@@ -1037,9 +1036,9 @@ static void neigh_periodic_work(struct work_struct *work)
* It's fine to release lock here, even if hash table
* grows while we are preempted.
*/
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
cond_resched();
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
}
@@ -1050,7 +1049,7 @@ out:
*/
queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
}
static __inline__ int neigh_max_probes(struct neighbour *n)
@@ -1642,12 +1641,12 @@ static void neigh_managed_work(struct work_struct *work)
managed_work.work);
struct neighbour *neigh;
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
list_for_each_entry(neigh, &tbl->managed_list, managed_list)
neigh_event_send_probe(neigh, NULL, false);
queue_delayed_work(system_power_efficient_wq, &tbl->managed_work,
NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS));
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
}
static void neigh_proxy_process(struct timer_list *t)
@@ -1749,8 +1748,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
if (p) {
p->tbl = tbl;
refcount_set(&p->refcnt, 1);
- p->reachable_time =
- neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ neigh_set_reach_time(p);
p->qlen = 0;
netdev_hold(dev, &p->dev_tracker, GFP_KERNEL);
p->dev = dev;
@@ -1763,9 +1761,9 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
return NULL;
}
- write_lock_bh(&tbl->lock);
- list_add(&p->list, &tbl->parms.list);
- write_unlock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
+ list_add_rcu(&p->list, &tbl->parms.list);
+ spin_unlock_bh(&tbl->lock);
neigh_parms_data_state_cleanall(p);
}
@@ -1785,10 +1783,12 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
{
if (!parms || parms == &tbl->parms)
return;
- write_lock_bh(&tbl->lock);
- list_del(&parms->list);
+
+ spin_lock_bh(&tbl->lock);
+ list_del_rcu(&parms->list);
parms->dead = 1;
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
+
netdev_put(parms->dev, &parms->dev_tracker);
call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
}
@@ -1810,8 +1810,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
list_add(&tbl->parms.list, &tbl->parms_list);
write_pnet(&tbl->parms.net, &init_net);
refcount_set(&tbl->parms.refcnt, 1);
- tbl->parms.reachable_time =
- neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
+ neigh_set_reach_time(&tbl->parms);
tbl->parms.qlen = 0;
tbl->stats = alloc_percpu(struct neigh_statistics);
@@ -1838,7 +1837,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
else
WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
- rwlock_init(&tbl->lock);
+ spin_lock_init(&tbl->lock);
mutex_init(&tbl->phash_lock);
INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
@@ -1981,10 +1980,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
err = __neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN,
NETLINK_CB(skb).portid, extack);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
neigh_release(neigh);
neigh_remove_one(neigh);
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
out:
return err;
@@ -2179,7 +2178,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
return -ENOBUFS;
if ((parms->dev &&
- nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||
+ nla_put_u32(skb, NDTPA_IFINDEX, READ_ONCE(parms->dev->ifindex))) ||
nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) ||
nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,
NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||
@@ -2194,7 +2193,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
NEIGH_VAR(parms, MCAST_PROBES)) ||
nla_put_u32(skb, NDTPA_MCAST_REPROBES,
NEIGH_VAR(parms, MCAST_REPROBES)) ||
- nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time,
+ nla_put_msecs(skb, NDTPA_REACHABLE_TIME, READ_ONCE(parms->reachable_time),
NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) ||
@@ -2231,8 +2230,6 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
return -EMSGSIZE;
ndtmsg = nlmsg_data(nlh);
-
- read_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
@@ -2258,11 +2255,9 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
.ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen),
};
- rcu_read_lock();
nht = rcu_dereference(tbl->nht);
ndc.ndtc_hash_rnd = nht->hash_rnd[0];
ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
- rcu_read_unlock();
if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc))
goto nla_put_failure;
@@ -2300,12 +2295,10 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
if (neightbl_fill_parms(skb, &tbl->parms) < 0)
goto nla_put_failure;
- read_unlock_bh(&tbl->lock);
nlmsg_end(skb, nlh);
return 0;
nla_put_failure:
- read_unlock_bh(&tbl->lock);
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
@@ -2324,8 +2317,6 @@ static int neightbl_fill_param_info(struct sk_buff *skb,
return -EMSGSIZE;
ndtmsg = nlmsg_data(nlh);
-
- read_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
@@ -2334,11 +2325,9 @@ static int neightbl_fill_param_info(struct sk_buff *skb,
neightbl_fill_parms(skb, parms) < 0)
goto errout;
- read_unlock_bh(&tbl->lock);
nlmsg_end(skb, nlh);
return 0;
errout:
- read_unlock_bh(&tbl->lock);
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
@@ -2375,9 +2364,9 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[NDTA_MAX + 1];
struct neigh_table *tbl;
struct ndtmsg *ndtmsg;
- struct nlattr *tb[NDTA_MAX+1];
bool found = false;
int err, tidx;
@@ -2393,26 +2382,33 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
ndtmsg = nlmsg_data(nlh);
+ rcu_read_lock();
+
for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
- tbl = rcu_dereference_rtnl(neigh_tables[tidx]);
+ tbl = rcu_dereference(neigh_tables[tidx]);
if (!tbl)
continue;
+
if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
continue;
+
if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) {
found = true;
break;
}
}
- if (!found)
- return -ENOENT;
+ if (!found) {
+ rcu_read_unlock();
+ err = -ENOENT;
+ goto errout;
+ }
/*
* We acquire tbl->lock to be nice to the periodic timers and
* make sure they always see a consistent set of values.
*/
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
if (tb[NDTA_PARMS]) {
struct nlattr *tbp[NDTPA_MAX+1];
@@ -2475,8 +2471,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
* only be effective after the next time neigh_periodic_work
* decides to recompute it (can be multiple minutes)
*/
- p->reachable_time =
- neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ neigh_set_reach_time(p);
break;
case NDTPA_GC_STALETIME:
NEIGH_VAR_SET(p, GC_STALETIME,
@@ -2532,7 +2527,8 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
err = 0;
errout_tbl_lock:
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
errout:
return err;
}
@@ -2579,10 +2575,12 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
+ rcu_read_lock();
+
for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
struct neigh_parms *p;
- tbl = rcu_dereference_rtnl(neigh_tables[tidx]);
+ tbl = rcu_dereference(neigh_tables[tidx]);
if (!tbl)
continue;
@@ -2596,7 +2594,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
nidx = 0;
p = list_next_entry(&tbl->parms, list);
- list_for_each_entry_from(p, &tbl->parms_list, list) {
+ list_for_each_entry_from_rcu(p, &tbl->parms_list, list) {
if (!net_eq(neigh_parms_net(p), net))
continue;
@@ -2616,6 +2614,8 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
neigh_skip = 0;
}
out:
+ rcu_read_unlock();
+
cb->args[0] = tidx;
cb->args[1] = nidx;
@@ -3127,14 +3127,14 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void
rcu_read_lock();
nht = rcu_dereference(tbl->nht);
- read_lock_bh(&tbl->lock); /* avoid resizes */
+ spin_lock_bh(&tbl->lock); /* avoid resizes */
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
neigh_for_each_in_bucket(n, &nht->hash_heads[chain])
cb(n, cookie);
}
- read_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
rcu_read_unlock();
}
EXPORT_SYMBOL(neigh_for_each);
@@ -3404,7 +3404,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
rcu_read_lock();
state->nht = rcu_dereference(tbl->nht);
- read_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
}
@@ -3444,7 +3444,7 @@ void neigh_seq_stop(struct seq_file *seq, void *v)
struct neigh_seq_state *state = seq->private;
struct neigh_table *tbl = state->tbl;
- read_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
rcu_read_unlock();
}
EXPORT_SYMBOL(neigh_seq_stop);
@@ -3721,8 +3721,7 @@ static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write
* only be effective after the next time neigh_periodic_work
* decides to recompute it
*/
- p->reachable_time =
- neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ neigh_set_reach_time(p);
}
return ret;
}
@@ -3918,8 +3917,10 @@ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = {
{.msgtype = RTM_DELNEIGH, .doit = neigh_delete},
{.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info,
.flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
- {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info},
- {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set},
+ {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+ {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED},
};
static int __init neigh_init(void)
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 4f0f0709a1cb..70e0e9a3b650 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -145,7 +145,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x\n",
- READ_ONCE(sd->processed), atomic_read(&sd->dropped),
+ READ_ONCE(sd->processed),
+ numa_drop_read(&sd->drop_counters),
READ_ONCE(sd->time_squeeze), 0,
0, 0, 0, 0, /* was fastroute */
0, /* was cpu_collision */
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c28cd6665444..ca878525ad7c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1120,8 +1120,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
return -ENOMEM;
table->log = ilog2(mask) + 1;
- for (count = 0; count <= mask; count++)
+ for (count = 0; count <= mask; count++) {
table->flows[count].cpu = RPS_NO_CPU;
+ table->flows[count].filter = RPS_NO_FILTER;
+ }
} else {
table = NULL;
}
@@ -1328,7 +1330,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
struct netdev_rx_queue *queue = &dev->_rx[i];
struct kobject *kobj = &queue->kobj;
- if (!refcount_read(&dev_net(dev)->ns.count))
+ if (!check_net(dev_net(dev)))
kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
@@ -2061,7 +2063,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct netdev_queue *queue = dev->_tx + i;
- if (!refcount_read(&dev_net(dev)->ns.count))
+ if (!check_net(dev_net(dev)))
queue->kobj.uevent_suppress = 1;
if (netdev_uses_bql(dev))
@@ -2315,7 +2317,7 @@ void netdev_unregister_kobject(struct net_device *ndev)
{
struct device *dev = &ndev->dev;
- if (!refcount_read(&dev_net(ndev)->ns.count))
+ if (!check_net(dev_net(ndev)))
dev_set_uevent_suppress(dev, 1);
kobject_get(&dev->kobj);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 1b6f3826dd0e..a6e6a964a287 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -20,6 +20,7 @@
#include <linux/sched/task.h>
#include <linux/uidgid.h>
#include <linux/proc_fs.h>
+#include <linux/nstree.h>
#include <net/aligned_data.h>
#include <net/sock.h>
@@ -314,7 +315,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
{
int id;
- if (refcount_read(&net->ns.count) == 0)
+ if (!check_net(net))
return NETNSA_NSID_NOT_ASSIGNED;
spin_lock(&net->nsid_lock);
@@ -394,13 +395,19 @@ static __net_init void preinit_net_sysctl(struct net *net)
net->core.sysctl_optmem_max = 128 * 1024;
net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
net->core.sysctl_tstamp_allow_data = 1;
+ net->core.sysctl_txq_reselection = msecs_to_jiffies(1000);
}
/* init code that must occur even if setup_net() is not called. */
-static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
+static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns)
{
+ int ret;
+
+ ret = ns_common_init(net);
+ if (ret)
+ return ret;
+
refcount_set(&net->passive, 1);
- refcount_set(&net->ns.count, 1);
ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt");
ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt");
@@ -420,6 +427,7 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_
INIT_LIST_HEAD(&net->ptype_all);
INIT_LIST_HEAD(&net->ptype_specific);
preinit_net_sysctl(net);
+ return 0;
}
/*
@@ -432,7 +440,7 @@ static __net_init int setup_net(struct net *net)
LIST_HEAD(net_exit_list);
int error = 0;
- net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie);
+ net->net_cookie = ns_tree_gen_id(net);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
@@ -442,6 +450,7 @@ static __net_init int setup_net(struct net *net)
down_write(&net_rwsem);
list_add_tail_rcu(&net->list, &net_namespace_list);
up_write(&net_rwsem);
+ ns_tree_add_raw(net);
out:
return error;
@@ -539,7 +548,7 @@ void net_drop_ns(void *p)
net_passive_dec(net);
}
-struct net *copy_net_ns(unsigned long flags,
+struct net *copy_net_ns(u64 flags,
struct user_namespace *user_ns, struct net *old_net)
{
struct ucounts *ucounts;
@@ -559,7 +568,9 @@ struct net *copy_net_ns(unsigned long flags,
goto dec_ucounts;
}
- preinit_net(net, user_ns);
+ rv = preinit_net(net, user_ns);
+ if (rv < 0)
+ goto dec_ucounts;
net->ucounts = ucounts;
get_user_ns(user_ns);
@@ -573,6 +584,7 @@ struct net *copy_net_ns(unsigned long flags,
if (rv < 0) {
put_userns:
+ ns_common_free(net);
#ifdef CONFIG_KEYS
key_remove_domain(net->key_domain);
#endif
@@ -659,8 +671,10 @@ static void cleanup_net(struct work_struct *work)
/* Don't let anyone else find us. */
down_write(&net_rwsem);
- llist_for_each_entry(net, net_kill_list, cleanup_list)
+ llist_for_each_entry(net, net_kill_list, cleanup_list) {
+ ns_tree_remove(net);
list_del_rcu(&net->list);
+ }
/* Cache last net. After we unlock rtnl, no one new net
* added to net_namespace_list can assign nsid pointer
* to a net from net_kill_list (see peernet2id_alloc()).
@@ -693,6 +707,7 @@ static void cleanup_net(struct work_struct *work)
/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
+ ns_common_free(net);
dec_net_namespaces(net->ucounts);
#ifdef CONFIG_KEYS
key_remove_domain(net->key_domain);
@@ -812,31 +827,12 @@ static void net_ns_net_debugfs(struct net *net)
static __net_init int net_ns_net_init(struct net *net)
{
-#ifdef CONFIG_NET_NS
- net->ns.ops = &netns_operations;
-#endif
- net->ns.inum = PROC_NET_INIT_INO;
- if (net != &init_net) {
- int ret = ns_alloc_inum(&net->ns);
- if (ret)
- return ret;
- }
net_ns_net_debugfs(net);
return 0;
}
-static __net_exit void net_ns_net_exit(struct net *net)
-{
- /*
- * Initial network namespace doesn't exit so we don't need any
- * special checks here.
- */
- ns_free_inum(&net->ns);
-}
-
static struct pernet_operations __net_initdata net_ns_ops = {
.init = net_ns_net_init,
- .exit = net_ns_net_exit,
};
static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
@@ -1227,15 +1223,13 @@ static void __init netns_ipv4_struct_check(void)
sysctl_tcp_wmem);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
sysctl_ip_fwd_use_pmtu);
- CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33);
-
- /* TXRX readonly hotpath cache lines */
- CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx,
- sysctl_tcp_moderate_rcvbuf);
- CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1);
/* RX readonly hotpath cache line */
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_moderate_rcvbuf);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_rcvbuf_low_rtt);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_ip_early_demux);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_early_demux);
@@ -1245,7 +1239,6 @@ static void __init netns_ipv4_struct_check(void)
sysctl_tcp_reordering);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_rmem);
- CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22);
}
#endif
@@ -1282,7 +1275,12 @@ void __init net_ns_init(void)
#ifdef CONFIG_KEYS
init_net.key_domain = &init_net_key_domain;
#endif
- preinit_net(&init_net, &init_user_ns);
+ /*
+ * This currently cannot fail as the initial network namespace
+ * has a static inode number.
+ */
+ if (preinit_net(&init_net, &init_user_ns))
+ panic("Could not preinitialize the initial network namespace");
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net))
@@ -1517,11 +1515,6 @@ static struct ns_common *netns_get(struct task_struct *task)
return net ? &net->ns : NULL;
}
-static inline struct net *to_net_ns(struct ns_common *ns)
-{
- return container_of(ns, struct net, ns);
-}
-
static void netns_put(struct ns_common *ns)
{
put_net(to_net_ns(ns));
@@ -1548,7 +1541,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns)
const struct proc_ns_operations netns_operations = {
.name = "net",
- .type = CLONE_NEWNET,
.get = netns_get,
.put = netns_put,
.install = netns_install,
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index e9a2a6f26cb7..ba673e81716f 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -2,6 +2,7 @@
/* Do not edit directly, auto-generated from: */
/* Documentation/netlink/specs/netdev.yaml */
/* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
#include <net/netlink.h>
#include <net/genetlink.h>
@@ -97,7 +98,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED
[NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
- [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1),
+ [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2),
};
/* NETDEV_CMD_BIND_TX - do */
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index cf3fad74511f..cffc08517a41 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -2,6 +2,7 @@
/* Do not edit directly, auto-generated from: */
/* Documentation/netlink/specs/netdev.yaml */
/* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
#ifndef _LINUX_NETDEV_GEN_H
#define _LINUX_NETDEV_GEN_H
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 6314eb7bdf69..470fabbeacd9 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -869,16 +869,79 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
return err;
}
-int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
+static int netdev_nl_read_rxq_bitmap(struct genl_info *info,
+ u32 rxq_bitmap_len,
+ unsigned long *rxq_bitmap)
{
+ const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1;
struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
+ struct nlattr *attr;
+ int rem, err = 0;
+ u32 rxq_idx;
+
+ nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
+ genlmsg_data(info->genlhdr),
+ genlmsg_len(info->genlhdr), rem) {
+ err = nla_parse_nested(tb, maxtype, attr,
+ netdev_queue_id_nl_policy, info->extack);
+ if (err < 0)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
+ NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE))
+ return -EINVAL;
+
+ if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
+ NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
+ return -EINVAL;
+ }
+
+ rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
+ if (rxq_idx >= rxq_bitmap_len) {
+ NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]);
+ return -EINVAL;
+ }
+
+ bitmap_set(rxq_bitmap, rxq_idx, 1);
+ }
+
+ return 0;
+}
+
+static struct device *
+netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap,
+ struct netlink_ext_ack *extack)
+{
+ struct device *dma_dev = NULL;
+ u32 rxq_idx, prev_rxq_idx;
+
+ for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
+ struct device *rxq_dma_dev;
+
+ rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx);
+ if (dma_dev && rxq_dma_dev != dma_dev) {
+ NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)",
+ rxq_idx, prev_rxq_idx);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ dma_dev = rxq_dma_dev;
+ prev_rxq_idx = rxq_idx;
+ }
+
+ return dma_dev;
+}
+
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
+{
struct net_devmem_dmabuf_binding *binding;
u32 ifindex, dmabuf_fd, rxq_idx;
struct netdev_nl_sock *priv;
struct net_device *netdev;
+ unsigned long *rxq_bitmap;
+ struct device *dma_dev;
struct sk_buff *rsp;
- struct nlattr *attr;
- int rem, err = 0;
+ int err = 0;
void *hdr;
if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
@@ -921,36 +984,31 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unlock;
}
- binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd,
- priv, info->extack);
- if (IS_ERR(binding)) {
- err = PTR_ERR(binding);
+ rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL);
+ if (!rxq_bitmap) {
+ err = -ENOMEM;
goto err_unlock;
}
- nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
- genlmsg_data(info->genlhdr),
- genlmsg_len(info->genlhdr), rem) {
- err = nla_parse_nested(
- tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr,
- netdev_queue_id_nl_policy, info->extack);
- if (err < 0)
- goto err_unbind;
-
- if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
- NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) {
- err = -EINVAL;
- goto err_unbind;
- }
+ err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues,
+ rxq_bitmap);
+ if (err)
+ goto err_rxq_bitmap;
- if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
- NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
- err = -EINVAL;
- goto err_unbind;
- }
+ dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack);
+ if (IS_ERR(dma_dev)) {
+ err = PTR_ERR(dma_dev);
+ goto err_rxq_bitmap;
+ }
- rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
+ binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
+ dmabuf_fd, priv, info->extack);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ goto err_rxq_bitmap;
+ }
+ for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding,
info->extack);
if (err)
@@ -964,6 +1022,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
if (err)
goto err_unbind;
+ bitmap_free(rxq_bitmap);
+
netdev_unlock(netdev);
mutex_unlock(&priv->lock);
@@ -972,6 +1032,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
err_unbind:
net_devmem_unbind_dmabuf(binding);
+err_rxq_bitmap:
+ bitmap_free(rxq_bitmap);
err_unlock:
netdev_unlock(netdev);
err_unlock_sock:
@@ -986,6 +1048,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
struct net_devmem_dmabuf_binding *binding;
struct netdev_nl_sock *priv;
struct net_device *netdev;
+ struct device *dma_dev;
u32 ifindex, dmabuf_fd;
struct sk_buff *rsp;
int err = 0;
@@ -1032,8 +1095,9 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unlock_netdev;
}
- binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv,
- info->extack);
+ dma_dev = netdev_queue_get_dma_dev(netdev, 0);
+ binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
+ dmabuf_fd, priv, info->extack);
if (IS_ERR(binding)) {
err = PTR_ERR(binding);
goto err_unlock_netdev;
diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c
new file mode 100644
index 000000000000..251f27a8307f
--- /dev/null
+++ b/net/core/netdev_queues.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/netdev_queues.h>
+
+/**
+ * netdev_queue_get_dma_dev() - get DMA device for zero-copy operations
+ * @dev: net_device
+ * @idx: queue index
+ *
+ * Get the DMA device to be used for zero-copy operations on this queue.
+ * When no such device is available or valid, the function returns NULL.
+ *
+ * Return: Device or NULL on error
+ */
+struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
+{
+ const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops;
+ struct device *dma_dev;
+
+ if (queue_ops && queue_ops->ndo_queue_get_dma_dev)
+ dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx);
+ else
+ dma_dev = dev->dev.parent;
+
+ return dma_dev && dma_dev->dma_mask ? dma_dev : NULL;
+}
+
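
A hypothetical driver-side sketch (not from this patch) of the callback consulted above: the ndo_queue_get_dma_dev hook name is taken from the code, while foo_priv and foo_queue_pf_dev() are made-up placeholders for a multi-PF device that services different queues from different functions.

#include <linux/netdevice.h>
#include <net/netdev_queues.h>

/* Hypothetical per-device state; not part of this patch. */
struct foo_priv {
	struct device *pf_dev[4];
};

static struct device *foo_queue_pf_dev(struct foo_priv *priv, int idx)
{
	/* Pick the PF that actually performs DMA for this queue. */
	return priv->pf_dev[idx % 4];
}

static struct device *foo_queue_get_dma_dev(struct net_device *dev, int idx)
{
	struct foo_priv *priv = netdev_priv(dev);

	return foo_queue_pf_dev(priv, idx);
}

static const struct netdev_queue_mgmt_ops foo_queue_mgmt_ops = {
	/* other queue management callbacks omitted */
	.ndo_queue_get_dma_dev	= foo_queue_get_dma_dev,
};
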
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index 3bf1151d8061..c7d9341b7630 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -9,6 +9,15 @@
#include "page_pool_priv.h"
+/* See also page_pool_is_unreadable() */
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
+{
+ struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx);
+
+ return !!rxq->mp_params.mp_ops;
+}
+EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
+
int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
{
struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx);
diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
index cd95394399b4..23175cb2bd86 100644
--- a/net/core/netmem_priv.h
+++ b/net/core/netmem_priv.h
@@ -5,19 +5,19 @@
static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
{
- return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
+ return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
}
static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
{
- __netmem_clear_lsb(netmem)->pp_magic |= pp_magic;
+ netmem_to_nmdesc(netmem)->pp_magic |= pp_magic;
}
static inline void netmem_clear_pp_magic(netmem_ref netmem)
{
- WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK);
+ WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK);
- __netmem_clear_lsb(netmem)->pp_magic = 0;
+ netmem_to_nmdesc(netmem)->pp_magic = 0;
}
static inline bool netmem_is_pp(netmem_ref netmem)
@@ -27,13 +27,13 @@ static inline bool netmem_is_pp(netmem_ref netmem)
static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
{
- __netmem_clear_lsb(netmem)->pp = pool;
+ netmem_to_nmdesc(netmem)->pp = pool;
}
static inline void netmem_set_dma_addr(netmem_ref netmem,
unsigned long dma_addr)
{
- __netmem_clear_lsb(netmem)->dma_addr = dma_addr;
+ netmem_to_nmdesc(netmem)->dma_addr = dma_addr;
}
static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
@@ -43,7 +43,7 @@ static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
return 0;
- magic = __netmem_clear_lsb(netmem)->pp_magic;
+ magic = netmem_to_nmdesc(netmem)->pp_magic;
return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT;
}
@@ -57,6 +57,6 @@ static inline void netmem_set_dma_index(netmem_ref netmem,
return;
magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT);
- __netmem_clear_lsb(netmem)->pp_magic = magic;
+ netmem_to_nmdesc(netmem)->pp_magic = magic;
}
#endif
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 5f65b62346d4..09f72f10813c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -228,19 +228,16 @@ static void refill_skbs(struct netpoll *np)
{
struct sk_buff_head *skb_pool;
struct sk_buff *skb;
- unsigned long flags;
skb_pool = &np->skb_pool;
- spin_lock_irqsave(&skb_pool->lock, flags);
- while (skb_pool->qlen < MAX_SKBS) {
+ while (READ_ONCE(skb_pool->qlen) < MAX_SKBS) {
skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
if (!skb)
break;
- __skb_queue_tail(skb_pool, skb);
+ skb_queue_tail(skb_pool, skb);
}
- spin_unlock_irqrestore(&skb_pool->lock, flags);
}
static void zap_completion_queue(void)
@@ -557,6 +554,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
int err;
skb_queue_head_init(&np->skb_pool);
+ INIT_WORK(&np->refill_wq, refill_skbs_work_handler);
if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
np_err(np, "%s doesn't support polling, aborting\n",
@@ -591,11 +589,9 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
np->dev = ndev;
strscpy(np->dev_name, ndev->name, IFNAMSIZ);
- npinfo->netpoll = np;
/* fill up the skb queue */
refill_skbs(np);
- INIT_WORK(&np->refill_wq, refill_skbs_work_handler);
/* last thing to do is link it to the net device structure */
rcu_assign_pointer(ndev->npinfo, npinfo);
@@ -815,6 +811,10 @@ static void __netpoll_cleanup(struct netpoll *np)
if (!npinfo)
return;
+ /* At this point, there is a single npinfo instance per netdevice, and
+ * its refcnt tracks how many netpoll structures are linked to it. We
+ * only perform npinfo cleanup when the refcnt decrements to zero.
+ */
if (refcount_dec_and_test(&npinfo->refcnt)) {
const struct net_device_ops *ops;
@@ -824,8 +824,7 @@ static void __netpoll_cleanup(struct netpoll *np)
RCU_INIT_POINTER(np->dev->npinfo, NULL);
call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info);
- } else
- RCU_INIT_POINTER(np->dev->npinfo, NULL);
+ }
skb_pool_flush(np);
}
@@ -835,7 +834,7 @@ void __netpoll_free(struct netpoll *np)
ASSERT_RTNL();
/* Wait for transmitting packets to finish before freeing. */
- synchronize_rcu();
+ synchronize_net();
__netpoll_cleanup(np);
kfree(np);
}
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 05e2e22a8f7c..265a729431bb 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -211,11 +211,7 @@ static int page_pool_init(struct page_pool *pool,
return -EINVAL;
if (pool->p.pool_size)
- ring_qsize = pool->p.pool_size;
-
- /* Sanity limit mem that can be pinned down */
- if (ring_qsize > 32768)
- return -E2BIG;
+ ring_qsize = min(pool->p.pool_size, 16384);
/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
* DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
@@ -287,8 +283,10 @@ static int page_pool_init(struct page_pool *pool,
}
if (pool->mp_ops) {
- if (!pool->dma_map || !pool->dma_sync)
- return -EOPNOTSUPP;
+ if (!pool->dma_map || !pool->dma_sync) {
+ err = -EOPNOTSUPP;
+ goto free_ptr_ring;
+ }
if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) {
err = -EFAULT;
@@ -303,12 +301,16 @@ static int page_pool_init(struct page_pool *pool,
}
static_branch_inc(&page_pool_mem_providers);
+ } else if (pool->p.order > MAX_PAGE_ORDER) {
+ err = -EINVAL;
+ goto free_ptr_ring;
}
return 0;
free_ptr_ring:
ptr_ring_cleanup(&pool->ring, NULL);
+ xa_destroy(&pool->dma_mapped);
#ifdef CONFIG_PAGE_POOL_STATS
if (!pool->system)
free_percpu(pool->recycle_stats);
@@ -470,11 +472,60 @@ page_pool_dma_sync_for_device(const struct page_pool *pool,
}
}
+static int page_pool_register_dma_index(struct page_pool *pool,
+ netmem_ref netmem, gfp_t gfp)
+{
+ int err = 0;
+ u32 id;
+
+ if (unlikely(!PP_DMA_INDEX_BITS))
+ goto out;
+
+ if (in_softirq())
+ err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem),
+ PP_DMA_INDEX_LIMIT, gfp);
+ else
+ err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem),
+ PP_DMA_INDEX_LIMIT, gfp);
+ if (err) {
+ WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@");
+ goto out;
+ }
+
+ netmem_set_dma_index(netmem, id);
+out:
+ return err;
+}
+
+static int page_pool_release_dma_index(struct page_pool *pool,
+ netmem_ref netmem)
+{
+ struct page *old, *page = netmem_to_page(netmem);
+ unsigned long id;
+
+ if (unlikely(!PP_DMA_INDEX_BITS))
+ return 0;
+
+ id = netmem_get_dma_index(netmem);
+ if (!id)
+ return -1;
+
+ if (in_softirq())
+ old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0);
+ else
+ old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0);
+ if (old != page)
+ return -1;
+
+ netmem_set_dma_index(netmem, 0);
+
+ return 0;
+}
+
static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp)
{
dma_addr_t dma;
int err;
- u32 id;
/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
* since dma_addr_t can be either 32 or 64 bits and does not always fit
@@ -493,18 +544,10 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t g
goto unmap_failed;
}
- if (in_softirq())
- err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem),
- PP_DMA_INDEX_LIMIT, gfp);
- else
- err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem),
- PP_DMA_INDEX_LIMIT, gfp);
- if (err) {
- WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@");
+ err = page_pool_register_dma_index(pool, netmem, gfp);
+ if (err)
goto unset_failed;
- }
- netmem_set_dma_index(netmem, id);
page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
return true;
@@ -553,6 +596,12 @@ static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool
netmem_ref netmem;
int i, nr_pages;
+ /* Unconditionally set NOWARN if allocating from NAPI.
+ * Drivers forget to set it, and OOM reports on packet Rx are useless.
+ */
+ if ((gfp & GFP_ATOMIC) == GFP_ATOMIC)
+ gfp |= __GFP_NOWARN;
+
/* Don't support bulk alloc for high-order pages */
if (unlikely(pp_order))
return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
@@ -676,8 +725,6 @@ void page_pool_clear_pp_info(netmem_ref netmem)
static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool,
netmem_ref netmem)
{
- struct page *old, *page = netmem_to_page(netmem);
- unsigned long id;
dma_addr_t dma;
if (!pool->dma_map)
@@ -686,15 +733,7 @@ static __always_inline void __page_pool_release_netmem_dma(struct page_pool *poo
*/
return;
- id = netmem_get_dma_index(netmem);
- if (!id)
- return;
-
- if (in_softirq())
- old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0);
- else
- old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0);
- if (old != page)
+ if (page_pool_release_dma_index(pool, netmem))
return;
dma = page_pool_get_dma_addr_netmem(netmem);
@@ -704,7 +743,6 @@ static __always_inline void __page_pool_release_netmem_dma(struct page_pool *poo
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
page_pool_set_dma_addr_netmem(netmem, 0);
- netmem_set_dma_index(netmem, 0);
}
/* Disconnects a page (from a page_pool). API users can have a need
@@ -1201,6 +1239,35 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
pool->xdp_mem_id = mem->id;
}
+/**
+ * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI
+ * @pool: page pool to modify
+ * @napi: NAPI instance to associate the page pool with
+ *
+ * Associate a page pool with a NAPI instance for lockless page recycling.
+ * This is useful when a new page pool has to be added to a NAPI instance
+ * without disabling that NAPI instance, to mark the point at which control
+ * path "hands over" the page pool to the NAPI instance. In most cases driver
+ * can simply set the @napi field in struct page_pool_params, and does not
+ * have to call this helper.
+ *
+ * The function is idempotent, but does not implement any refcounting.
+ * Single page_pool_disable_direct_recycling() will disable recycling,
+ * no matter how many times enable was called.
+ */
+void page_pool_enable_direct_recycling(struct page_pool *pool,
+ struct napi_struct *napi)
+{
+ if (READ_ONCE(pool->p.napi) == napi)
+ return;
+ WARN_ON(!napi || pool->p.napi);
+
+ mutex_lock(&page_pools_lock);
+ WRITE_ONCE(pool->p.napi, napi);
+ mutex_unlock(&page_pools_lock);
+}
+EXPORT_SYMBOL(page_pool_enable_direct_recycling);
+
void page_pool_disable_direct_recycling(struct page_pool *pool)
{
/* Disable direct recycling based on pool->cpuid.
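
A minimal driver-side sketch (not part of this patch) of the hand-over described in the kernel-doc above; foo_rx_queue and its members are hypothetical, and the page_pool_params values are placeholders:

#include <linux/netdevice.h>
#include <net/page_pool/helpers.h>

/* Hypothetical per-queue driver state. */
struct foo_rx_queue {
	struct device *dev;
	struct napi_struct napi;
	struct page_pool *page_pool;
};

static int foo_create_rx_pool(struct foo_rx_queue *rq)
{
	struct page_pool_params pp = {
		.order		= 0,
		.pool_size	= 256,
		.nid		= NUMA_NO_NODE,
		.dev		= rq->dev,
		.dma_dir	= DMA_FROM_DEVICE,
		/* .napi left NULL: the pool is not NAPI-bound yet */
	};

	rq->page_pool = page_pool_create(&pp);
	return PTR_ERR_OR_ZERO(rq->page_pool);
}

/* Called once the control path hands the queue over to its NAPI instance. */
static void foo_attach_rx_pool(struct foo_rx_queue *rq)
{
	page_pool_enable_direct_recycling(rq->page_pool, &rq->napi);
}
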
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 0ebe5461d4d9..d41b03fd1f63 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -114,6 +114,7 @@
#include <linux/sys.h>
#include <linux/types.h>
+#include <linux/minmax.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
@@ -2841,8 +2842,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
}
i = 0;
- frag_len = (datalen/frags) < PAGE_SIZE ?
- (datalen/frags) : PAGE_SIZE;
+ frag_len = min_t(int, datalen / frags, PAGE_SIZE);
while (datalen > 0) {
if (unlikely(!pkt_dev->page)) {
int node = numa_node_id();
@@ -2859,8 +2859,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
if (i == (frags - 1))
skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
pkt_dev->page, 0,
- (datalen < PAGE_SIZE ?
- datalen : PAGE_SIZE));
+ min(datalen, PAGE_SIZE));
else
skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
pkt_dev->page, 0, frag_len);
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 63de5c635842..897a8f01a67b 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -77,9 +77,7 @@ void reqsk_queue_alloc(struct request_sock_queue *queue)
* a simple spin lock - one must consider sock_owned_by_user() and arrange
* to use sk_add_backlog() stuff. But what really makes it infeasible is the
* locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
- * acquire a child's lock while holding listener's socket lock. A corner
- * case might also exist in tcp_v4_hnd_req() that will trigger this locking
- * order.
+ * acquire a child's lock while holding listener's socket lock.
*
* This function also sets "treq->tfo_listener" to false.
* treq->tfo_listener is used by the listener so it is protected by the
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 094b085cff20..b1ed55141d8a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -478,7 +478,7 @@ static int rtnl_unregister(int protocol, int msgtype)
* rtnl_unregister_all - Unregister all rtnetlink message type of a protocol
* @protocol : Protocol family or PF_UNSPEC
*
- * Identical to calling rtnl_unregster() for all registered message types
+ * Identical to calling rtnl_unregister() for all registered message types
* of a certain protocol family.
*/
void rtnl_unregister_all(int protocol)
@@ -1270,13 +1270,13 @@ static size_t rtnl_dpll_pin_size(const struct net_device *dev)
static noinline size_t if_nlmsg_size(const struct net_device *dev,
u32 ext_filter_mask)
{
- return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ size_t size;
+
+ size = NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
+ nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
+ nla_total_size_64bit(sizeof(struct rtnl_link_ifmap))
- + nla_total_size(sizeof(struct rtnl_link_stats))
- + nla_total_size_64bit(sizeof(struct rtnl_link_stats64))
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
+ nla_total_size(4) /* IFLA_TXQLEN */
@@ -1326,7 +1326,15 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ rtnl_devlink_port_size(dev)
+ rtnl_dpll_pin_size(dev)
+ nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */
+ + nla_total_size(2) /* IFLA_HEADROOM */
+ + nla_total_size(2) /* IFLA_TAILROOM */
+ 0;
+
+ if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS))
+ size += nla_total_size(sizeof(struct rtnl_link_stats)) +
+ nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
+
+ return size;
}
static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -2091,7 +2099,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
nla_put_u32(skb, IFLA_CARRIER_UP_COUNT,
atomic_read(&dev->carrier_up_count)) ||
nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT,
- atomic_read(&dev->carrier_down_count)))
+ atomic_read(&dev->carrier_down_count)) ||
+ nla_put_u16(skb, IFLA_HEADROOM,
+ READ_ONCE(dev->needed_headroom)) ||
+ nla_put_u16(skb, IFLA_TAILROOM,
+ READ_ONCE(dev->needed_tailroom)))
goto nla_put_failure;
if (rtnl_fill_proto_down(skb, dev))
@@ -2117,7 +2129,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
if (rtnl_phys_switch_id_fill(skb, dev))
goto nla_put_failure;
- if (rtnl_fill_stats(skb, dev))
+ if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS) &&
+ rtnl_fill_stats(skb, dev))
goto nla_put_failure;
if (rtnl_fill_vf(skb, dev, ext_filter_mask))
@@ -2243,6 +2256,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
[IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 },
[IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT },
+ [IFLA_HEADROOM] = { .type = NLA_REJECT },
+ [IFLA_TAILROOM] = { .type = NLA_REJECT },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
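Two userspace-visible effects follow from the hunks above: IFLA_HEADROOM and IFLA_TAILROOM are dump-only (NLA_REJECT on input), and RTM_GETLINK now honours RTEXT_FILTER_SKIP_STATS for both message sizing and filling. A raw-netlink sketch of a dump request that suppresses stats (hypothetical helper; @fd is assumed to be an already-open NETLINK_ROUTE socket):

#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>

static int demo_getlink_skip_stats(int fd)
{
	struct {
		struct nlmsghdr  nlh;
		struct ifinfomsg ifm;
		struct rtattr    rta;
		__u32            ext_mask;
	} req;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = sizeof(req);
	req.nlh.nlmsg_type  = RTM_GETLINK;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.ifm.ifi_family  = AF_UNSPEC;
	req.rta.rta_type    = IFLA_EXT_MASK;
	req.rta.rta_len     = RTA_LENGTH(sizeof(__u32));
	req.ext_mask        = RTEXT_FILTER_SKIP_STATS;

	return send(fd, &req, sizeof(req), 0) < 0 ? -1 : 0;
}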
@@ -4707,9 +4722,6 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
int err;
u16 vid;
- if (!netlink_capable(skb, CAP_NET_ADMIN))
- return -EPERM;
-
if (!del_bulk) {
err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX,
NULL, extack);
diff --git a/net/core/scm.c b/net/core/scm.c
index 072d5742440a..cd87f66671aa 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -273,15 +273,13 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
check_object_size(data, cmlen - sizeof(*cm), true);
- if (!user_write_access_begin(cm, cmlen))
- goto efault;
-
- unsafe_put_user(cmlen, &cm->cmsg_len, efault_end);
- unsafe_put_user(level, &cm->cmsg_level, efault_end);
- unsafe_put_user(type, &cm->cmsg_type, efault_end);
- unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
- cmlen - sizeof(*cm), efault_end);
- user_write_access_end();
+ scoped_user_write_access_size(cm, cmlen, efault) {
+ unsafe_put_user(cmlen, &cm->cmsg_len, efault);
+ unsafe_put_user(level, &cm->cmsg_level, efault);
+ unsafe_put_user(type, &cm->cmsg_type, efault);
+ unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
+ cmlen - sizeof(*cm), efault);
+ }
} else {
struct cmsghdr *cm = msg->msg_control;
@@ -299,8 +297,6 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
msg->msg_controllen -= cmlen;
return 0;
-efault_end:
- user_write_access_end();
efault:
return -EFAULT;
}
diff --git a/net/core/selftests.c b/net/core/selftests.c
index 3d79133a91a6..8b81feb82c4a 100644
--- a/net/core/selftests.c
+++ b/net/core/selftests.c
@@ -14,46 +14,10 @@
#include <net/tcp.h>
#include <net/udp.h>
-struct net_packet_attrs {
- const unsigned char *src;
- const unsigned char *dst;
- u32 ip_src;
- u32 ip_dst;
- bool tcp;
- u16 sport;
- u16 dport;
- int timeout;
- int size;
- int max_size;
- u8 id;
- u16 queue_mapping;
- bool bad_csum;
-};
-
-struct net_test_priv {
- struct net_packet_attrs *packet;
- struct packet_type pt;
- struct completion comp;
- int double_vlan;
- int vlan_id;
- int ok;
-};
-
-struct netsfhdr {
- __be32 version;
- __be64 magic;
- u8 id;
-} __packed;
-
static u8 net_test_next_id;
-#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
- sizeof(struct netsfhdr))
-#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL
-#define NET_LB_TIMEOUT msecs_to_jiffies(200)
-
-static struct sk_buff *net_test_get_skb(struct net_device *ndev,
- struct net_packet_attrs *attr)
+struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id,
+ struct net_packet_attrs *attr)
{
struct sk_buff *skb = NULL;
struct udphdr *uhdr = NULL;
@@ -142,8 +106,8 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev,
shdr = skb_put(skb, sizeof(*shdr));
shdr->version = 0;
shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC);
- attr->id = net_test_next_id;
- shdr->id = net_test_next_id++;
+ attr->id = id;
+ shdr->id = id;
if (attr->size) {
void *payload = skb_put(skb, attr->size);
@@ -190,6 +154,7 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev,
return skb;
}
+EXPORT_SYMBOL_GPL(net_test_get_skb);
static int net_test_loopback_validate(struct sk_buff *skb,
struct net_device *ndev,
@@ -286,12 +251,13 @@ static int __net_test_loopback(struct net_device *ndev,
tpriv->packet = attr;
dev_add_pack(&tpriv->pt);
- skb = net_test_get_skb(ndev, attr);
+ skb = net_test_get_skb(ndev, net_test_next_id, attr);
if (!skb) {
ret = -ENOMEM;
goto cleanup;
}
+ net_test_next_id++;
ret = dev_direct_xmit(skb, attr->queue_mapping);
if (ret < 0) {
goto cleanup;
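Since net_test_get_skb() is now exported and takes the id explicitly instead of bumping the file-local counter itself, a caller outside this file (e.g. a driver selftest) could build and inject a frame roughly as below. The attribute fields come from the structure definitions removed above (presumably relocated to a shared header), and unset fields are assumed to keep the zero defaults the helper already handles:

static int demo_send_test_frame(struct net_device *ndev, u8 id)
{
	struct net_packet_attrs attr = {
		.dst		= ndev->dev_addr,	/* loop the frame back to ourselves */
		.tcp		= true,
		.sport		= 9,
		.dport		= 9,
		.queue_mapping	= 0,
	};
	struct sk_buff *skb;

	skb = net_test_get_skb(ndev, id, &attr);
	if (!skb)
		return -ENOMEM;

	/* As in __net_test_loopback() above, the skb is not freed here after
	 * handing it to dev_direct_xmit().
	 */
	return dev_direct_xmit(skb, attr.queue_mapping);
}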
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ee0274417948..a00808f7be6a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -79,7 +79,9 @@
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
+#include <net/psp/types.h>
#include <net/dropreason.h>
+#include <net/xdp_sock.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
@@ -221,9 +223,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
skb_panic(skb, sz, addr, __func__);
}
-#define NAPI_SKB_CACHE_SIZE 64
-#define NAPI_SKB_CACHE_BULK 16
-#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
+#define NAPI_SKB_CACHE_SIZE 128
+#define NAPI_SKB_CACHE_BULK 32
+#define NAPI_SKB_CACHE_FREE 32
struct napi_alloc_cache {
local_lock_t bh_lock;
@@ -273,17 +275,23 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);
-static struct sk_buff *napi_skb_cache_get(void)
+/* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler
+ * remove dead code (and skbuff_cache_size) when CONFIG_KASAN is unset.
+ */
+static u32 skbuff_cache_size __read_mostly;
+
+static struct sk_buff *napi_skb_cache_get(bool alloc)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!nc->skb_count)) {
- nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
- GFP_ATOMIC | __GFP_NOWARN,
- NAPI_SKB_CACHE_BULK,
- nc->skb_cache);
+ if (alloc)
+ nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN,
+ NAPI_SKB_CACHE_BULK,
+ nc->skb_cache);
if (unlikely(!nc->skb_count)) {
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
return NULL;
@@ -291,8 +299,10 @@ static struct sk_buff *napi_skb_cache_get(void)
}
skb = nc->skb_cache[--nc->skb_count];
+ if (nc->skb_count)
+ prefetch(nc->skb_cache[nc->skb_count - 1]);
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
- kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));
+ kasan_mempool_unpoison_object(skb, skbuff_cache_size);
return skb;
}
@@ -344,11 +354,9 @@ u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
get:
for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
- u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache);
-
skbs[i] = nc->skb_cache[base + i];
- kasan_mempool_unpoison_object(skbs[i], cache_size);
+ kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size);
memset(skbs[i], 0, offsetof(struct sk_buff, tail));
}
@@ -525,7 +533,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb;
- skb = napi_skb_cache_get();
+ skb = napi_skb_cache_get(true);
if (unlikely(!skb))
return NULL;
@@ -640,25 +648,38 @@ out:
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
int flags, int node)
{
+ struct sk_buff *skb = NULL;
struct kmem_cache *cache;
- struct sk_buff *skb;
bool pfmemalloc;
u8 *data;
- cache = (flags & SKB_ALLOC_FCLONE)
- ? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache;
-
if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
gfp_mask |= __GFP_MEMALLOC;
- /* Get the HEAD */
- if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
- likely(node == NUMA_NO_NODE || node == numa_mem_id()))
- skb = napi_skb_cache_get();
- else
+ if (flags & SKB_ALLOC_FCLONE) {
+ cache = net_hotdata.skbuff_fclone_cache;
+ goto fallback;
+ }
+ cache = net_hotdata.skbuff_cache;
+ if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id()))
+ goto fallback;
+
+ if (flags & SKB_ALLOC_NAPI) {
+ skb = napi_skb_cache_get(true);
+ if (unlikely(!skb))
+ return NULL;
+ } else if (!in_hardirq() && !irqs_disabled()) {
+ local_bh_disable();
+ skb = napi_skb_cache_get(false);
+ local_bh_enable();
+ }
+
+ if (!skb) {
+fallback:
skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
- if (unlikely(!skb))
- return NULL;
+ if (unlikely(!skb))
+ return NULL;
+ }
prefetchw(skb);
/* We do our best to align skb_shared_info on a separate cache
@@ -1135,12 +1156,22 @@ void skb_release_head_state(struct sk_buff *skb)
skb_dst_drop(skb);
if (skb->destructor) {
DEBUG_NET_WARN_ON_ONCE(in_hardirq());
- skb->destructor(skb);
- }
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- nf_conntrack_put(skb_nfct(skb));
+#ifdef CONFIG_INET
+ INDIRECT_CALL_4(skb->destructor,
+ tcp_wfree, __sock_wfree, sock_wfree,
+ xsk_destruct_skb,
+ skb);
+#else
+ INDIRECT_CALL_2(skb->destructor,
+ sock_wfree, xsk_destruct_skb,
+ skb);
+
#endif
- skb_ext_put(skb);
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ }
+ nf_reset_ct(skb);
+ skb_ext_reset(skb);
}
/* Free everything but the sk_buff shell. */
@@ -1416,7 +1447,6 @@ void __consume_stateless_skb(struct sk_buff *skb)
static void napi_skb_cache_put(struct sk_buff *skb)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- u32 i;
if (!kasan_mempool_poison_object(skb))
return;
@@ -1425,13 +1455,16 @@ static void napi_skb_cache_put(struct sk_buff *skb)
nc->skb_cache[nc->skb_count++] = skb;
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
- for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
+ u32 i, remaining = NAPI_SKB_CACHE_SIZE - NAPI_SKB_CACHE_FREE;
+
+ for (i = remaining; i < NAPI_SKB_CACHE_SIZE; i++)
kasan_mempool_unpoison_object(nc->skb_cache[i],
- kmem_cache_size(net_hotdata.skbuff_cache));
+ skbuff_cache_size);
- kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF,
- nc->skb_cache + NAPI_SKB_CACHE_HALF);
- nc->skb_count = NAPI_SKB_CACHE_HALF;
+ kmem_cache_free_bulk(net_hotdata.skbuff_cache,
+ NAPI_SKB_CACHE_FREE,
+ nc->skb_cache + remaining);
+ nc->skb_count = remaining;
}
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}
@@ -1457,13 +1490,18 @@ void napi_skb_free_stolen_head(struct sk_buff *skb)
void napi_consume_skb(struct sk_buff *skb, int budget)
{
/* Zero budget indicate non-NAPI context called us, like netpoll */
- if (unlikely(!budget)) {
+ if (unlikely(!budget || !skb)) {
dev_consume_skb_any(skb);
return;
}
DEBUG_NET_WARN_ON_ONCE(!in_softirq());
+ if (skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) {
+ skb_release_head_state(skb);
+ return skb_attempt_defer_free(skb);
+ }
+
if (!skb_unref(skb))
return;
@@ -2217,6 +2255,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone);
*
* All the pointers pointing into skb header may change and must be
* reloaded after call to this function.
+ *
+ * Note: If you skb_push() the start of the buffer after reallocating the
+ * header, call skb_postpush_data_move() first to move the metadata out of
+ * the way before writing to &sk_buff->data.
*/
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
@@ -2288,8 +2330,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb->nohdr = 0;
atomic_set(&skb_shinfo(skb)->dataref, 1);
- skb_metadata_clear(skb);
-
/* It is not generally safe to change skb->truesize.
* For the moment, we really care of rx path, or
* when skb is orphaned (not attached to a socket).
@@ -3112,7 +3152,9 @@ static bool __splice_segment(struct page *page, unsigned int poff,
poff += flen;
plen -= flen;
*len -= flen;
- } while (*len && plen);
+ if (!*len)
+ return true;
+ } while (plen);
return false;
}
@@ -5060,6 +5102,9 @@ static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_MCTP_FLOWS)
[SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
#endif
+#if IS_ENABLED(CONFIG_INET_PSP)
+ [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext),
+#endif
};
static __always_inline unsigned int skb_ext_total_length(void)
@@ -5110,6 +5155,8 @@ void __init skb_init(void)
offsetof(struct sk_buff, cb),
sizeof_field(struct sk_buff, cb),
NULL);
+ skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache);
+
net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
sizeof(struct sk_buff_fclones),
0,
@@ -6667,7 +6714,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
return NULL;
while (data_len) {
- if (nr_frags == MAX_SKB_FRAGS - 1)
+ if (nr_frags == MAX_SKB_FRAGS)
goto failure;
while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
order--;
@@ -7042,6 +7089,7 @@ void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
skb->active_extensions = 1 << id;
return skb_ext_get_ptr(ext, id);
}
+EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL");
/**
* skb_ext_add - allocate space for given extension, COW if needed
@@ -7178,8 +7226,9 @@ static void kfree_skb_napi_cache(struct sk_buff *skb)
*/
void skb_attempt_defer_free(struct sk_buff *skb)
{
+ struct skb_defer_node *sdn;
+ unsigned long defer_count;
int cpu = skb->alloc_cpu;
- struct softnet_data *sd;
unsigned int defer_max;
bool kick;
@@ -7192,28 +7241,26 @@ nodefer: kfree_skb_napi_cache(skb);
DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
DEBUG_NET_WARN_ON_ONCE(skb->destructor);
+ DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb));
+
+ sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id();
- sd = &per_cpu(softnet_data, cpu);
defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
- if (READ_ONCE(sd->defer_count) >= defer_max)
+ defer_count = atomic_long_inc_return(&sdn->defer_count);
+
+ if (defer_count >= defer_max)
goto nodefer;
- spin_lock_bh(&sd->defer_lock);
- /* Send an IPI every time queue reaches half capacity. */
- kick = sd->defer_count == (defer_max >> 1);
- /* Paired with the READ_ONCE() few lines above */
- WRITE_ONCE(sd->defer_count, sd->defer_count + 1);
+ llist_add(&skb->ll_node, &sdn->defer_list);
- skb->next = sd->defer_list;
- /* Paired with READ_ONCE() in skb_defer_free_flush() */
- WRITE_ONCE(sd->defer_list, skb);
- spin_unlock_bh(&sd->defer_lock);
+ /* Send an IPI every time queue reaches half capacity. */
+ kick = (defer_count - 1) == (defer_max >> 1);
/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
* if we are unlucky enough (this seems very unlikely).
*/
if (unlikely(kick))
- kick_defer_list_purge(sd, cpu);
+ kick_defer_list_purge(cpu);
}
static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
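skb_attempt_defer_free() now queues onto a per-CPU, per-NUMA-node bucket instead of softnet_data. The structure itself is defined elsewhere in the patch; judging purely from the accessors used above (llist_add() on ->defer_list, atomic_long_inc_return() on ->defer_count), it plausibly reduces to the sketch below, with any additional fields or alignment being assumptions:

struct skb_defer_node {
	struct llist_head	defer_list;	/* lockless list of deferred skbs */
	atomic_long_t		defer_count;	/* bounded by net.core.skb_defer_max */
} ____cacheline_aligned_in_smp;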
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 83c78379932e..2ac7731e1e0a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -876,7 +876,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
sk_psock_stop(psock);
INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
- queue_rcu_work(system_wq, &psock->rwork);
+ queue_rcu_work(system_percpu_wq, &psock->rwork);
}
EXPORT_SYMBOL_GPL(sk_psock_drop);
diff --git a/net/core/sock.c b/net/core/sock.c
index 7c26ec8dce63..45c98bf524b2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -155,7 +155,7 @@
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
-static void sock_def_write_space_wfree(struct sock *sk);
+static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc);
static void sock_def_write_space(struct sock *sk);
/**
@@ -281,12 +281,12 @@ static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Run time adjustable parameters. */
-__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_wmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_wmem_max);
-__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+__u32 sysctl_rmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_rmem_max);
-__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
-__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
+__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
+__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
@@ -491,13 +491,13 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
struct sk_buff_head *list = &sk->sk_receive_queue;
if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
}
if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
return -ENOBUFS;
}
@@ -562,7 +562,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
skb->dev = NULL;
if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
goto discard_and_relse;
}
@@ -585,7 +585,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
reason = SKB_DROP_REASON_PFMEMALLOC;
if (err == -ENOBUFS)
reason = SKB_DROP_REASON_SOCKET_BACKLOG;
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
goto discard_and_relse;
}
@@ -1032,7 +1032,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
bool charged;
int pages;
- if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
+ if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
return -EOPNOTSUPP;
if (!bytes)
@@ -1041,22 +1041,28 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
pages = sk_mem_pages(bytes);
/* pre-charge to memcg */
- charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
- GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ charged = mem_cgroup_sk_charge(sk, pages,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!charged)
return -ENOMEM;
+ if (sk->sk_bypass_prot_mem)
+ goto success;
+
/* pre-charge to forward_alloc */
sk_memory_allocated_add(sk, pages);
allocated = sk_memory_allocated(sk);
+
/* If the system goes into memory pressure with this
* precharge, give up and return error.
*/
if (allocated > sk_prot_mem_limits(sk, 1)) {
sk_memory_allocated_sub(sk, pages);
- mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
+ mem_cgroup_sk_uncharge(sk, pages);
return -ENOMEM;
}
+
+success:
sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
WRITE_ONCE(sk->sk_reserved_mem,
@@ -2300,8 +2306,13 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
+
+ if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
+ sk->sk_bypass_prot_mem = 1;
+
sk->sk_kern_sock = kern;
sock_lock_init(sk);
+
sk->sk_net_refcnt = kern ? 0 : 1;
if (likely(sk->sk_net_refcnt)) {
get_net_track(net, &sk->ns_tracker, priority);
@@ -2313,7 +2324,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
}
sock_net_set(sk, net);
- refcount_set(&sk->sk_wmem_alloc, 1);
+ refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
mem_cgroup_sk_alloc(sk);
cgroup_sk_alloc(&sk->sk_cgrp_data);
@@ -2451,13 +2462,16 @@ static void sk_init_common(struct sock *sk)
}
/**
- * sk_clone_lock - clone a socket, and lock its clone
- * @sk: the socket to clone
- * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * sk_clone - clone a socket
+ * @sk: the socket to clone
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * @lock: if true, lock the cloned sk
*
- * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ * If @lock is true, the clone is locked by bh_lock_sock(), and the
+ * caller must unlock the socket even in the error path with bh_unlock_sock().
*/
-struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
+struct sock *sk_clone(const struct sock *sk, const gfp_t priority,
+ bool lock)
{
struct proto *prot = READ_ONCE(sk->sk_prot);
struct sk_filter *filter;
@@ -2486,16 +2500,19 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
false, priority);
}
+
sk_node_init(&newsk->sk_node);
sock_lock_init(newsk);
- bh_lock_sock(newsk);
+
+ if (lock)
+ bh_lock_sock(newsk);
+
newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
newsk->sk_backlog.len = 0;
atomic_set(&newsk->sk_rmem_alloc, 0);
- /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
- refcount_set(&newsk->sk_wmem_alloc, 1);
+ refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
atomic_set(&newsk->sk_omem_alloc, 0);
sk_init_common(newsk);
@@ -2505,15 +2522,18 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0;
newsk->sk_reserved_mem = 0;
- atomic_set(&newsk->sk_drops, 0);
+ DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
+ sk_drops_reset(newsk);
newsk->sk_send_head = NULL;
newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
atomic_set(&newsk->sk_zckey, 0);
sock_reset_flag(newsk, SOCK_DONE);
+#ifdef CONFIG_MEMCG
/* sk->sk_memcg will be populated at accept() time */
newsk->sk_memcg = NULL;
+#endif
cgroup_sk_clone(&newsk->sk_cgrp_data);
@@ -2577,14 +2597,15 @@ free:
* destructor and make plain sk_free()
*/
newsk->sk_destruct = NULL;
- bh_unlock_sock(newsk);
+ if (lock)
+ bh_unlock_sock(newsk);
sk_free(newsk);
newsk = NULL;
goto out;
}
-EXPORT_SYMBOL_GPL(sk_clone_lock);
+EXPORT_SYMBOL_GPL(sk_clone);
-static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
+static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
{
bool is_ipv6 = false;
u32 max_size;
@@ -2594,8 +2615,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
#endif
/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
- max_size = is_ipv6 ? READ_ONCE(dst_dev(dst)->gso_max_size) :
- READ_ONCE(dst_dev(dst)->gso_ipv4_max_size);
+ max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) :
+ READ_ONCE(dev->gso_ipv4_max_size);
if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
max_size = GSO_LEGACY_MAX_SIZE;
@@ -2604,9 +2625,12 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
+ const struct net_device *dev;
u32 max_segs = 1;
- sk->sk_route_caps = dst_dev(dst)->features;
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ sk->sk_route_caps = dev->features;
if (sk_is_tcp(sk)) {
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2622,13 +2646,14 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
- sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
+ sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
- max_segs = max_t(u32, READ_ONCE(dst_dev(dst)->gso_max_segs), 1);
+ max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
}
}
sk->sk_gso_max_segs = max_segs;
sk_dst_set(sk, dst);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
@@ -2642,16 +2667,18 @@ EXPORT_SYMBOL_GPL(sk_setup_caps);
*/
void sock_wfree(struct sk_buff *skb)
{
- struct sock *sk = skb->sk;
unsigned int len = skb->truesize;
+ struct sock *sk = skb->sk;
bool free;
+ int old;
if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
if (sock_flag(sk, SOCK_RCU_FREE) &&
sk->sk_write_space == sock_def_write_space) {
rcu_read_lock();
- free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
- sock_def_write_space_wfree(sk);
+ free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
+ &old);
+ sock_def_write_space_wfree(sk, old - len);
rcu_read_unlock();
if (unlikely(free))
__sk_free(sk);
@@ -2688,6 +2715,8 @@ void __sock_wfree(struct sk_buff *skb)
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
+ int old_wmem;
+
skb_orphan(skb);
#ifdef CONFIG_INET
if (unlikely(!sk_fullsock(sk)))
@@ -2701,7 +2730,15 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
* is enough to guarantee sk_free() won't free this sock until
* all in-flight packets are completed
*/
- refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+ __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem);
+
+ /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket
+ * is in a host queue (qdisc, NIC queue).
+ * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue
+ * based on XPS for better performance.
+ * Otherwise clear ooo_okay to not risk Out Of Order delivery.
+ */
+ skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS);
}
EXPORT_SYMBOL(skb_set_owner_w);
@@ -2780,28 +2817,6 @@ void sock_pfree(struct sk_buff *skb)
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
-unsigned long __sock_i_ino(struct sock *sk)
-{
- unsigned long ino;
-
- read_lock(&sk->sk_callback_lock);
- ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
- read_unlock(&sk->sk_callback_lock);
- return ino;
-}
-EXPORT_SYMBOL(__sock_i_ino);
-
-unsigned long sock_i_ino(struct sock *sk)
-{
- unsigned long ino;
-
- local_bh_disable();
- ino = __sock_i_ino(sk);
- local_bh_enable();
- return ino;
-}
-EXPORT_SYMBOL(sock_i_ino);
-
/*
* Allocate a skb from the socket's send buffer.
*/
@@ -3151,8 +3166,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
return true;
- sk_enter_memory_pressure(sk);
+ if (!sk->sk_bypass_prot_mem)
+ sk_enter_memory_pressure(sk);
+
sk_stream_moderate_sndbuf(sk);
+
return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
@@ -3180,23 +3198,27 @@ void __release_sock(struct sock *sk)
__acquires(&sk->sk_lock.slock)
{
struct sk_buff *skb, *next;
+ int nb = 0;
while ((skb = sk->sk_backlog.head) != NULL) {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
spin_unlock_bh(&sk->sk_lock.slock);
- do {
+ while (1) {
next = skb->next;
prefetch(next);
DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
skb_mark_not_on_list(skb);
sk_backlog_rcv(sk, skb);
- cond_resched();
-
skb = next;
- } while (skb != NULL);
+ if (!skb)
+ break;
+
+ if (!(++nb & 15))
+ cond_resched();
+ }
spin_lock_bh(&sk->sk_lock.slock);
}
@@ -3263,20 +3285,25 @@ EXPORT_SYMBOL(sk_wait_data);
*/
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
- struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
+ bool memcg_enabled = false, charged = false;
struct proto *prot = sk->sk_prot;
- bool charged = true;
- long allocated;
+ long allocated = 0;
- sk_memory_allocated_add(sk, amt);
- allocated = sk_memory_allocated(sk);
+ if (!sk->sk_bypass_prot_mem) {
+ sk_memory_allocated_add(sk, amt);
+ allocated = sk_memory_allocated(sk);
+ }
- if (memcg) {
- charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge());
+ if (mem_cgroup_sk_enabled(sk)) {
+ memcg_enabled = true;
+ charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
if (!charged)
goto suppress_allocation;
}
+ if (!allocated)
+ return 1;
+
/* Under limit. */
if (allocated <= sk_prot_mem_limits(sk, 0)) {
sk_leave_memory_pressure(sk);
@@ -3346,21 +3373,20 @@ suppress_allocation:
*/
if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
/* Force charge with __GFP_NOFAIL */
- if (memcg && !charged) {
- mem_cgroup_charge_skmem(memcg, amt,
- gfp_memcg_charge() | __GFP_NOFAIL);
- }
+ if (memcg_enabled && !charged)
+ mem_cgroup_sk_charge(sk, amt,
+ gfp_memcg_charge() | __GFP_NOFAIL);
return 1;
}
}
- if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
- trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
+ trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
- sk_memory_allocated_sub(sk, amt);
+ if (allocated)
+ sk_memory_allocated_sub(sk, amt);
- if (memcg && charged)
- mem_cgroup_uncharge_skmem(memcg, amt);
+ if (charged)
+ mem_cgroup_sk_uncharge(sk, amt);
return 0;
}
@@ -3396,10 +3422,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);
*/
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
- sk_memory_allocated_sub(sk, amount);
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_uncharge(sk, amount);
+
+ if (sk->sk_bypass_prot_mem)
+ return;
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
+ sk_memory_allocated_sub(sk, amount);
if (sk_under_global_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
@@ -3419,6 +3448,24 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
}
EXPORT_SYMBOL(__sk_mem_reclaim);
+void __sk_charge(struct sock *sk, gfp_t gfp)
+{
+ int amt;
+
+ gfp |= __GFP_NOFAIL;
+ if (mem_cgroup_from_sk(sk)) {
+ /* The socket has not been accepted yet, no need
+ * to look at newsk->sk_wmem_queued.
+ */
+ amt = sk_mem_pages(sk->sk_forward_alloc +
+ atomic_read(&sk->sk_rmem_alloc));
+ if (amt)
+ mem_cgroup_sk_charge(sk, amt, gfp);
+ }
+
+ kmem_cache_charge(sk, gfp);
+}
+
int sk_set_peek_off(struct sock *sk, int val)
{
WRITE_ONCE(sk->sk_peek_off, val);
@@ -3433,13 +3480,13 @@ EXPORT_SYMBOL_GPL(sk_set_peek_off);
* function, some default processing is provided.
*/
-int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
+int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
{
return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
-int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
+int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr,
int len, int flags)
{
return -EOPNOTSUPP;
@@ -3593,12 +3640,12 @@ static void sock_def_write_space(struct sock *sk)
* for SOCK_RCU_FREE sockets under RCU read section and after putting
* ->sk_wmem_alloc.
*/
-static void sock_def_write_space_wfree(struct sock *sk)
+static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc)
{
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if (sock_writeable(sk)) {
+ if (__sock_writeable(sk, wmem_alloc)) {
struct socket_wq *wq = rcu_dereference(sk->sk_wq);
/* rely on refcount_sub from sock_wfree() */
@@ -3713,7 +3760,7 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
*/
smp_wmb();
refcount_set(&sk->sk_refcnt, 1);
- atomic_set(&sk->sk_drops, 0);
+ sk_drops_reset(sk);
}
EXPORT_SYMBOL(sock_init_data_uid);
@@ -3973,7 +4020,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
- mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
+ mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
}
#ifdef CONFIG_PROC_FS
@@ -4366,7 +4413,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */
-int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
+int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len)
{
if (!sk->sk_prot->bind_add)
return -EOPNOTSUPP;
@@ -4454,7 +4501,9 @@ static int __init sock_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
+#ifdef CONFIG_MEMCG
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
+#endif
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
@@ -4463,31 +4512,34 @@ static int __init sock_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
- CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
- CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
- CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index b23594c767f2..026ce9bd9e5e 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -348,7 +348,7 @@ static struct pernet_operations diag_net_ops = {
static int __init sock_diag_init(void)
{
- broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0);
+ broadcast_wq = alloc_workqueue("sock_diag_events", WQ_PERCPU, 0);
BUG_ON(!broadcast_wq);
return register_pernet_subsys(&diag_net_ops);
}
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 8cf04b57ade1..8d4decb2606f 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -668,6 +668,13 @@ static struct ctl_table netns_core_table[] = {
.proc_handler = proc_dou8vec_minmax,
},
{
+ .procname = "txq_reselection_ms",
+ .data = &init_net.core.sysctl_txq_reselection,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
.procname = "tstamp_allow_data",
.data = &init_net.core.sysctl_tstamp_allow_data,
.maxlen = sizeof(u8),
@@ -676,6 +683,15 @@ static struct ctl_table netns_core_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE
},
+ {
+ .procname = "bypass_prot_mem",
+ .data = &init_net.core.sysctl_bypass_prot_mem,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
/* sysctl_core_net_init() will set the values after this
* to readonly in network namespaces
*/
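Both new entries live in netns_core_table, so each network namespace gets its own copy under /proc/sys/net/core/. Note from the sk_alloc() hunk above that bypass_prot_mem is sampled when a socket is created, so flipping it only affects sockets created afterwards. A trivial userspace toggle, assuming the conventional procfs mount path (hypothetical helper):

#include <fcntl.h>
#include <unistd.h>

/* Enable/disable net.core.bypass_prot_mem (a 0/1 u8 sysctl). */
static int demo_set_bypass_prot_mem(int on)
{
	int fd = open("/proc/sys/net/core/bypass_prot_mem", O_WRONLY);
	char val = on ? '1' : '0';
	int ret = -1;

	if (fd < 0)
		return -1;
	if (write(fd, &val, 1) == 1)
		ret = 0;
	close(fd);
	return ret;
}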
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 491334b9b8be..9100e160113a 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -663,9 +663,8 @@ struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp)
u32 tsize;
tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz;
- xdp_update_skb_shared_info(skb, nr_frags,
- sinfo->xdp_frags_size, tsize,
- xdp_buff_is_frag_pfmemalloc(xdp));
+ xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size,
+ tsize, xdp_buff_get_skb_flags(xdp));
}
skb->protocol = eth_type_trans(skb, rxq->dev);
@@ -692,7 +691,7 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
struct skb_shared_info *sinfo = skb_shinfo(skb);
const struct skb_shared_info *xinfo;
u32 nr_frags, tsize = 0;
- bool pfmemalloc = false;
+ u32 flags = 0;
xinfo = xdp_get_shared_info_from_buff(xdp);
nr_frags = xinfo->nr_frags;
@@ -714,11 +713,12 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
__skb_fill_page_desc_noacc(sinfo, i, page, offset, len);
tsize += truesize;
- pfmemalloc |= page_is_pfmemalloc(page);
+ if (page_is_pfmemalloc(page))
+ flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
}
- xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size,
- tsize, pfmemalloc);
+ xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize,
+ flags);
return true;
}
@@ -823,10 +823,9 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
skb_metadata_set(skb, xdpf->metasize);
if (unlikely(xdp_frame_has_frags(xdpf)))
- xdp_update_skb_shared_info(skb, nr_frags,
- sinfo->xdp_frags_size,
- nr_frags * xdpf->frame_sz,
- xdp_frame_is_frag_pfmemalloc(xdpf));
+ xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size,
+ nr_frags * xdpf->frame_sz,
+ xdp_frame_get_skb_flags(xdpf));
/* Essential SKB info: protocol and skb->dev */
skb->protocol = eth_type_trans(skb, dev);