summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile8
-rw-r--r--net/core/bpf_sk_storage.c53
-rw-r--r--net/core/datagram.c90
-rw-r--r--net/core/dev.c1927
-rw-r--r--net/core/dev.h210
-rw-r--r--net/core/dev_addr_lists.c13
-rw-r--r--net/core/dev_addr_lists_test.c14
-rw-r--r--net/core/dev_ioctl.c95
-rw-r--r--net/core/devmem.c402
-rw-r--r--net/core/devmem.h180
-rw-r--r--net/core/drop_monitor.c60
-rw-r--r--net/core/dst.c23
-rw-r--r--net/core/dst_cache.c13
-rw-r--r--net/core/fib_notifier.c2
-rw-r--r--net/core/fib_rules.c82
-rw-r--r--net/core/filter.c857
-rw-r--r--net/core/flow_dissector.c154
-rw-r--r--net/core/gen_estimator.c2
-rw-r--r--net/core/gro.c89
-rw-r--r--net/core/gro_cells.c3
-rw-r--r--net/core/gso.c4
-rw-r--r--net/core/hotdata.c25
-rw-r--r--net/core/ieee8021q_helpers.c242
-rw-r--r--net/core/link_watch.c28
-rw-r--r--net/core/lwt_bpf.c21
-rw-r--r--net/core/lwtunnel.c65
-rw-r--r--net/core/mp_dmabuf_devmem.h44
-rw-r--r--net/core/neighbour.c467
-rw-r--r--net/core/net-procfs.c58
-rw-r--r--net/core/net-sysfs.c201
-rw-r--r--net/core/net-traces.c2
-rw-r--r--net/core/net_namespace.c213
-rw-r--r--net/core/net_test.c (renamed from net/core/gso_test.c)131
-rw-r--r--net/core/netdev-genl-gen.c69
-rw-r--r--net/core/netdev-genl-gen.h9
-rw-r--r--net/core/netdev-genl.c581
-rw-r--r--net/core/netdev_rx_queue.c82
-rw-r--r--net/core/netmem_priv.h31
-rw-r--r--net/core/netpoll.c118
-rw-r--r--net/core/page_pool.c665
-rw-r--r--net/core/page_pool_priv.h48
-rw-r--r--net/core/page_pool_user.c49
-rw-r--r--net/core/pktgen.c29
-rw-r--r--net/core/rtnetlink.c1565
-rw-r--r--net/core/rtnl_net_debug.c116
-rw-r--r--net/core/scm.c27
-rw-r--r--net/core/skb_fault_injection.c106
-rw-r--r--net/core/skbuff.c673
-rw-r--r--net/core/skmsg.c32
-rw-r--r--net/core/sock.c311
-rw-r--r--net/core/sock_diag.c118
-rw-r--r--net/core/sock_map.c329
-rw-r--r--net/core/sock_reuseport.c5
-rw-r--r--net/core/sysctl_net_core.c201
-rw-r--r--net/core/timestamping.c51
-rw-r--r--net/core/tso.c2
-rw-r--r--net/core/utils.c2
-rw-r--r--net/core/xdp.c351
58 files changed, 8178 insertions, 3170 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 821aec06abf1..d9326600e289 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -18,6 +18,8 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
+obj-y += hotdata.o
+obj-y += netdev_rx_queue.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
@@ -25,6 +27,7 @@ obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o
obj-$(CONFIG_NET_SELFTESTS) += selftests.o
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
@@ -40,4 +43,7 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
obj-$(CONFIG_OF) += of_net.o
-obj-$(CONFIG_NET_TEST) += gso_test.o
+obj-$(CONFIG_NET_TEST) += net_test.o
+obj-$(CONFIG_NET_DEVMEM) += devmem.o
+obj-$(CONFIG_DEBUG_NET_SMALL_RTNL) += rtnl_net_debug.o
+obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 6c4d90b24d46..2e538399757f 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -50,15 +50,16 @@ void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_local_storage *sk_storage;
+ migrate_disable();
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
- if (!sk_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!sk_storage)
+ goto out;
bpf_local_storage_destroy(sk_storage);
+out:
rcu_read_unlock();
+ migrate_enable();
}
static void bpf_sk_storage_map_free(struct bpf_map *map)
@@ -106,7 +107,7 @@ static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
if (sock) {
sdata = bpf_local_storage_update(
sock->sk, (struct bpf_local_storage_map *)map, value,
- map_flags, GFP_ATOMIC);
+ map_flags, false, GFP_ATOMIC);
sockfd_put(sock);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -137,7 +138,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
{
struct bpf_local_storage_elem *copy_selem;
- copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC);
+ copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC);
if (!copy_selem)
return NULL;
@@ -160,6 +161,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
+ migrate_disable();
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
@@ -212,6 +214,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
out:
rcu_read_unlock();
+ migrate_enable();
/* In case of an error, don't free anything explicitly here, the
* caller is responsible to call bpf_sk_storage_free.
@@ -243,7 +246,7 @@ BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
refcount_inc_not_zero(&sk->sk_refcnt)) {
sdata = bpf_local_storage_update(
sk, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, gfp_flags);
+ BPF_NOEXIST, false, gfp_flags);
/* sk must be a fullsock (guaranteed by verifier),
* so sock_gen_put() is unnecessary.
*/
@@ -352,11 +355,6 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
{
- const struct btf *btf_vmlinux;
- const struct btf_type *t;
- const char *tname;
- u32 btf_id;
-
if (prog->aux->dst_prog)
return false;
@@ -371,13 +369,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
return true;
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
- btf_vmlinux = bpf_get_btf_vmlinux();
- if (IS_ERR_OR_NULL(btf_vmlinux))
- return false;
- btf_id = prog->aux->attach_btf_id;
- t = btf_type_by_id(btf_vmlinux, btf_id);
- tname = btf_name_by_offset(btf_vmlinux, t->name_off);
- return !!strncmp(tname, "bpf_sk_storage",
+ return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage",
strlen("bpf_sk_storage"));
default:
return false;
@@ -496,27 +488,22 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
if (!bpf_capable())
return ERR_PTR(-EPERM);
- nla_for_each_nested(nla, nla_stgs, rem) {
- if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) {
- if (nla_len(nla) != sizeof(u32))
- return ERR_PTR(-EINVAL);
- nr_maps++;
- }
+ nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
+ nla_stgs, rem) {
+ if (nla_len(nla) != sizeof(u32))
+ return ERR_PTR(-EINVAL);
+ nr_maps++;
}
diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL);
if (!diag)
return ERR_PTR(-ENOMEM);
- nla_for_each_nested(nla, nla_stgs, rem) {
- struct bpf_map *map;
- int map_fd;
-
- if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
- continue;
+ nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
+ nla_stgs, rem) {
+ int map_fd = nla_get_u32(nla);
+ struct bpf_map *map = bpf_map_get(map_fd);
- map_fd = nla_get_u32(nla);
- map = bpf_map_get(map_fd);
if (IS_ERR(map)) {
err = PTR_ERR(map);
goto err_free;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index a8b625abe242..f0693707aece 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -324,25 +324,6 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(skb_free_datagram);
-void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
-{
- bool slow;
-
- if (!skb_unref(skb)) {
- sk_peek_offset_bwd(sk, len);
- return;
- }
-
- slow = lock_sock_fast(sk);
- sk_peek_offset_bwd(sk, len);
- skb_orphan(skb);
- unlock_sock_fast(sk, slow);
-
- /* skb is now orphaned, can be freed outside of locked section */
- __kfree_skb(skb);
-}
-EXPORT_SYMBOL(__skb_free_datagram_locked);
-
int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
struct sk_buff *skb, unsigned int flags,
void (*destructor)(struct sock *sk,
@@ -426,6 +407,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
return 0;
}
+ if (!skb_frags_readable(skb))
+ goto short_copy;
+
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
@@ -435,15 +419,23 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
- struct page *page = skb_frag_page(frag);
- u8 *vaddr = kmap(page);
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
if (copy > len)
copy = len;
- n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
- vaddr + skb_frag_off(frag) + offset - start,
- copy, data, to);
- kunmap(page);
+
+ n = 0;
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_local_page(p);
+ n += INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ vaddr + p_off, p_len, data, to);
+ kunmap_local(vaddr);
+ }
+
offset += n;
if (n != copy)
goto short_copy;
@@ -629,16 +621,13 @@ fault:
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
-int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
- struct sk_buff *skb, struct iov_iter *from,
- size_t length)
+int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
+ struct iov_iter *from, size_t length)
{
- int frag;
-
- if (msg && msg->msg_ubuf && msg->sg_from_iter)
- return msg->sg_from_iter(sk, skb, from, length);
+ int frag = skb_shinfo(skb)->nr_frags;
- frag = skb_shinfo(skb)->nr_frags;
+ if (!skb_frags_readable(skb))
+ return -EFAULT;
while (length && iov_iter_count(from)) {
struct page *head, *last_head = NULL;
@@ -646,7 +635,6 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
int refs, order, n = 0;
size_t start;
ssize_t copied;
- unsigned long truesize;
if (frag == MAX_SKB_FRAGS)
return -EMSGSIZE;
@@ -658,17 +646,9 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
length -= copied;
- truesize = PAGE_ALIGN(copied + start);
skb->data_len += copied;
skb->len += copied;
- skb->truesize += truesize;
- if (sk && sk->sk_type == SOCK_STREAM) {
- sk_wmem_queued_add(sk, truesize);
- if (!skb_zcopy_pure(skb))
- sk_mem_charge(sk, truesize);
- } else {
- refcount_add(truesize, &skb->sk->sk_wmem_alloc);
- }
+ skb->truesize += PAGE_ALIGN(copied + start);
head = compound_head(pages[n]);
order = compound_order(head);
@@ -711,6 +691,30 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
}
return 0;
}
+
+int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, struct iov_iter *from,
+ size_t length)
+{
+ unsigned long orig_size = skb->truesize;
+ unsigned long truesize;
+ int ret;
+
+ if (msg && msg->msg_ubuf && msg->sg_from_iter)
+ ret = msg->sg_from_iter(skb, from, length);
+ else
+ ret = zerocopy_fill_skb_from_iter(skb, from, length);
+
+ truesize = skb->truesize - orig_size;
+ if (sk && sk->sk_type == SOCK_STREAM) {
+ sk_wmem_queued_add(sk, truesize);
+ if (!skb_zcopy_pure(skb))
+ sk_mem_charge(sk, truesize);
+ } else {
+ refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+ }
+ return ret;
+}
EXPORT_SYMBOL(__zerocopy_sg_from_iter);
/**
diff --git a/net/core/dev.c b/net/core/dev.c
index 76e6438f4858..2f7f5fd9ffec 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -77,7 +77,9 @@
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/isolation.h>
#include <linux/sched/mm.h>
+#include <linux/smpboot.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
@@ -90,6 +92,7 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h>
@@ -103,6 +106,7 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/gro.h>
+#include <net/netdev_queues.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
@@ -153,41 +157,23 @@
#include <linux/prandom.h>
#include <linux/once_lite.h>
#include <net/netdev_rx_queue.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/helpers.h>
+#include <net/rps.h>
+#include <linux/phy_link_topology.h>
#include "dev.h"
+#include "devmem.h"
#include "net-sysfs.h"
static DEFINE_SPINLOCK(ptype_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-struct list_head ptype_all __read_mostly; /* Taps */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_extack(unsigned long val,
struct net_device *dev,
struct netlink_ext_ack *extack);
-/*
- * The @dev_base_head list is protected by @dev_base_lock and the rtnl
- * semaphore.
- *
- * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
- *
- * Writers must hold the rtnl semaphore while they loop through the
- * dev_base_head list, and hold dev_base_lock for writing when they do the
- * actual updates. This allows pure readers to access the list even
- * while a writer is preparing to update it.
- *
- * To put it another way, dev_base_lock is held for writing only to
- * protect against pure readers; the rtnl semaphore provides the
- * protection against other writers.
- *
- * See, for example usages, register_netdevice() and
- * unregister_netdevice(), which must be called with the rtnl
- * semaphore held.
- */
-DEFINE_RWLOCK(dev_base_lock);
-EXPORT_SYMBOL(dev_base_lock);
-
static DEFINE_MUTEX(ifalias_mutex);
/* protects napi_hash addition/deletion and napi_gen_id */
@@ -196,12 +182,11 @@ static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
-static DECLARE_RWSEM(devnet_rename_sem);
-
static inline void dev_base_seq_inc(struct net *net)
{
- while (++net->dev_base_seq == 0)
- ;
+ unsigned int val = net->dev_base_seq + 1;
+
+ WRITE_ONCE(net->dev_base_seq, val ?: 1);
}
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
@@ -216,37 +201,62 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}
-static inline void rps_lock_irqsave(struct softnet_data *sd,
- unsigned long *flags)
+#ifndef CONFIG_PREEMPT_RT
+
+static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
+
+static int __init setup_backlog_napi_threads(char *arg)
+{
+ static_branch_enable(&use_backlog_threads_key);
+ return 0;
+}
+early_param("thread_backlog_napi", setup_backlog_napi_threads);
+
+static bool use_backlog_threads(void)
+{
+ return static_branch_unlikely(&use_backlog_threads_key);
+}
+
+#else
+
+static bool use_backlog_threads(void)
+{
+ return true;
+}
+
+#endif
+
+static inline void backlog_lock_irq_save(struct softnet_data *sd,
+ unsigned long *flags)
{
- if (IS_ENABLED(CONFIG_RPS))
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
- else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ else
local_irq_save(*flags);
}
-static inline void rps_lock_irq_disable(struct softnet_data *sd)
+static inline void backlog_lock_irq_disable(struct softnet_data *sd)
{
- if (IS_ENABLED(CONFIG_RPS))
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_lock_irq(&sd->input_pkt_queue.lock);
- else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ else
local_irq_disable();
}
-static inline void rps_unlock_irq_restore(struct softnet_data *sd,
- unsigned long *flags)
+static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
+ unsigned long *flags)
{
- if (IS_ENABLED(CONFIG_RPS))
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
- else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ else
local_irq_restore(*flags);
}
-static inline void rps_unlock_irq_enable(struct softnet_data *sd)
+static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
{
- if (IS_ENABLED(CONFIG_RPS))
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_unlock_irq(&sd->input_pkt_queue.lock);
- else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ else
local_irq_enable();
}
@@ -341,13 +351,22 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name)
return 0;
}
-static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
+static void netdev_name_node_alt_free(struct rcu_head *head)
{
- list_del(&name_node->list);
+ struct netdev_name_node *name_node =
+ container_of(head, struct netdev_name_node, rcu);
+
kfree(name_node->name);
netdev_name_node_free(name_node);
}
+static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
+{
+ netdev_name_node_del(name_node);
+ list_del(&name_node->list);
+ call_rcu(&name_node->rcu, netdev_name_node_alt_free);
+}
+
int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
{
struct netdev_name_node *name_node;
@@ -362,10 +381,7 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
if (name_node == dev->name_node || name_node->dev != dev)
return -EINVAL;
- netdev_name_node_del(name_node);
- synchronize_rcu();
__netdev_name_node_alt_destroy(name_node);
-
return 0;
}
@@ -373,8 +389,10 @@ static void netdev_name_node_alt_flush(struct net_device *dev)
{
struct netdev_name_node *name_node, *tmp;
- list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
- __netdev_name_node_alt_destroy(name_node);
+ list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
+ list_del(&name_node->list);
+ netdev_name_node_alt_free(&name_node->rcu);
+ }
}
/* Device list insertion */
@@ -385,12 +403,10 @@ static void list_netdevice(struct net_device *dev)
ASSERT_RTNL();
- write_lock(&dev_base_lock);
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
netdev_name_node_add(net, dev->name_node);
hlist_add_head_rcu(&dev->index_hlist,
dev_index_hash(net, dev->ifindex));
- write_unlock(&dev_base_lock);
netdev_for_each_altname(dev, name_node)
netdev_name_node_add(net, name_node);
@@ -404,7 +420,7 @@ static void list_netdevice(struct net_device *dev)
/* Device list removal
* caller must respect a RCU grace period before freeing/reusing dev
*/
-static void unlist_netdevice(struct net_device *dev, bool lock)
+static void unlist_netdevice(struct net_device *dev)
{
struct netdev_name_node *name_node;
struct net *net = dev_net(dev);
@@ -417,13 +433,9 @@ static void unlist_netdevice(struct net_device *dev, bool lock)
netdev_name_node_del(name_node);
/* Unlink dev from the device chain */
- if (lock)
- write_lock(&dev_base_lock);
list_del_rcu(&dev->dev_list);
netdev_name_node_del(dev->name_node);
hlist_del_rcu(&dev->index_hlist);
- if (lock)
- write_unlock(&dev_base_lock);
dev_base_seq_inc(dev_net(dev));
}
@@ -439,9 +451,17 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
* queue in the local softnet handler.
*/
-DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
+DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
+ .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
+};
EXPORT_PER_CPU_SYMBOL(softnet_data);
+/* Page_pool has a lockless array/stack to alloc/recycle pages.
+ * PP consumers must pay attention to run APIs in the appropriate context
+ * (e.g. NAPI context).
+ */
+DEFINE_PER_CPU(struct page_pool *, system_page_pool);
+
#ifdef CONFIG_LOCKDEP
/*
* register_netdevice() inits txq->_xmit_lock and sets lockdep class
@@ -551,7 +571,7 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
if (pt->type == htons(ETH_P_ALL))
- return pt->dev ? &pt->dev->ptype_all : &ptype_all;
+ return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
else
return pt->dev ? &pt->dev->ptype_specific :
&ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
@@ -653,7 +673,7 @@ int dev_get_iflink(const struct net_device *dev)
if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
return dev->netdev_ops->ndo_get_iflink(dev);
- return dev->ifindex;
+ return READ_ONCE(dev->ifindex);
}
EXPORT_SYMBOL(dev_get_iflink);
@@ -733,14 +753,88 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
}
EXPORT_SYMBOL_GPL(dev_fill_forward_path);
+/* must be called under rcu_read_lock(), as we dont take a reference */
+static struct napi_struct *napi_by_id(unsigned int napi_id)
+{
+ unsigned int hash = napi_id % HASH_SIZE(napi_hash);
+ struct napi_struct *napi;
+
+ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
+ if (napi->napi_id == napi_id)
+ return napi;
+
+ return NULL;
+}
+
+/* must be called under rcu_read_lock(), as we dont take a reference */
+static struct napi_struct *
+netdev_napi_by_id(struct net *net, unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ napi = napi_by_id(napi_id);
+ if (!napi)
+ return NULL;
+
+ if (WARN_ON_ONCE(!napi->dev))
+ return NULL;
+ if (!net_eq(net, dev_net(napi->dev)))
+ return NULL;
+
+ return napi;
+}
+
+/**
+ * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it
+ * @net: the applicable net namespace
+ * @napi_id: ID of a NAPI of a target device
+ *
+ * Find a NAPI instance with @napi_id. Lock its device.
+ * The device must be in %NETREG_REGISTERED state for lookup to succeed.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to NAPI, its device with lock held, NULL if not found.
+ */
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
+{
+ struct napi_struct *napi;
+ struct net_device *dev;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ dev = napi->dev;
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev);
+ if (!dev)
+ return NULL;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (napi && napi->dev != dev)
+ napi = NULL;
+ rcu_read_unlock();
+
+ if (!napi)
+ netdev_unlock(dev);
+ return napi;
+}
+
/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
* @name: name to find
*
- * Find an interface by name. Must be called under RTNL semaphore
- * or @dev_base_lock. If the name is found a pointer to the device
- * is returned. If the name is not found then %NULL is returned. The
+ * Find an interface by name. Must be called under RTNL semaphore.
+ * If the name is found a pointer to the device is returned.
+ * If the name is not found then %NULL is returned. The
* reference counters are not incremented so the caller must be
* careful with locks.
*/
@@ -821,8 +915,7 @@ EXPORT_SYMBOL(netdev_get_by_name);
* Search for an interface by index. Returns %NULL if the device
* is not found or a pointer to the device. The device has not
* had its reference counter increased so the caller must be careful
- * about locking. The caller must hold either the RTNL semaphore
- * or @dev_base_lock.
+ * about locking. The caller must hold the RTNL semaphore.
*/
struct net_device *__dev_get_by_index(struct net *net, int ifindex)
@@ -908,7 +1001,6 @@ EXPORT_SYMBOL(netdev_get_by_index);
* its reference counter increased so the caller must be careful
* about locking. The caller must hold RCU lock.
*/
-
struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
struct napi_struct *napi;
@@ -922,7 +1014,85 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id)
return napi ? napi->dev : NULL;
}
-EXPORT_SYMBOL(dev_get_by_napi_id);
+
+/* Release the held reference on the net_device, and if the net_device
+ * is still registered try to lock the instance lock. If device is being
+ * unregistered NULL will be returned (but the reference has been released,
+ * either way!)
+ *
+ * This helper is intended for locking net_device after it has been looked up
+ * using a lockless lookup helper. Lock prevents the instance from going away.
+ */
+struct net_device *__netdev_put_lock(struct net_device *dev)
+{
+ netdev_lock(dev);
+ if (dev->reg_state > NETREG_REGISTERED) {
+ netdev_unlock(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+/**
+ * netdev_get_by_index_lock() - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. If a valid device
+ * with @ifindex is found it will be returned with netdev->lock held.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to a device with lock held, NULL if not found.
+ */
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock(dev);
+}
+
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
+
+static DEFINE_SEQLOCK(netdev_rename_lock);
+
+void netdev_copy_name(struct net_device *dev, char *name)
+{
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&netdev_rename_lock);
+ strscpy(name, dev->name, IFNAMSIZ);
+ } while (read_seqretry(&netdev_rename_lock, seq));
+}
/**
* netdev_get_name - get a netdevice name, knowing its ifindex.
@@ -935,7 +1105,6 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
struct net_device *dev;
int ret;
- down_read(&devnet_rename_sem);
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
@@ -944,15 +1113,20 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
goto out;
}
- strcpy(name, dev->name);
+ netdev_copy_name(dev, name);
ret = 0;
out:
rcu_read_unlock();
- up_read(&devnet_rename_sem);
return ret;
}
+static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
+ const char *ha)
+{
+ return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
+}
+
/**
* dev_getbyhwaddr_rcu - find a device by its hardware address
* @net: the applicable net namespace
@@ -961,7 +1135,7 @@ out:
*
* Search for an interface by MAC address. Returns NULL if the device
* is not found or a pointer to the device.
- * The caller must hold RCU or RTNL.
+ * The caller must hold RCU.
* The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
@@ -973,14 +1147,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
struct net_device *dev;
for_each_netdev_rcu(net, dev)
- if (dev->type == type &&
- !memcmp(dev->dev_addr, ha, dev->addr_len))
+ if (dev_addr_cmp(dev, type, ha))
return dev;
return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
+/**
+ * dev_getbyhwaddr() - find a device by its hardware address
+ * @net: the applicable net namespace
+ * @type: media type of device
+ * @ha: hardware address
+ *
+ * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold
+ * rtnl_lock.
+ *
+ * Context: rtnl_lock() must be held.
+ * Return: pointer to the net_device, or NULL if not found
+ */
+struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
+ const char *ha)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+ for_each_netdev(net, dev)
+ if (dev_addr_cmp(dev, type, ha))
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(dev_getbyhwaddr);
+
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
struct net_device *dev, *ret = NULL;
@@ -1181,61 +1380,50 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev,
*/
int dev_change_name(struct net_device *dev, const char *newname)
{
+ struct net *net = dev_net(dev);
unsigned char old_assign_type;
char oldname[IFNAMSIZ];
int err = 0;
int ret;
- struct net *net;
- ASSERT_RTNL();
- BUG_ON(!dev_net(dev));
+ ASSERT_RTNL_NET(net);
- net = dev_net(dev);
-
- down_write(&devnet_rename_sem);
-
- if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
- up_write(&devnet_rename_sem);
+ if (!strncmp(newname, dev->name, IFNAMSIZ))
return 0;
- }
memcpy(oldname, dev->name, IFNAMSIZ);
+ write_seqlock_bh(&netdev_rename_lock);
err = dev_get_valid_name(net, dev, newname);
- if (err < 0) {
- up_write(&devnet_rename_sem);
+ write_sequnlock_bh(&netdev_rename_lock);
+
+ if (err < 0)
return err;
- }
if (oldname[0] && !strchr(oldname, '%'))
netdev_info(dev, "renamed from %s%s\n", oldname,
dev->flags & IFF_UP ? " (while UP)" : "");
old_assign_type = dev->name_assign_type;
- dev->name_assign_type = NET_NAME_RENAMED;
+ WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
+ write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
- dev->name_assign_type = old_assign_type;
- up_write(&devnet_rename_sem);
+ write_sequnlock_bh(&netdev_rename_lock);
+ WRITE_ONCE(dev->name_assign_type, old_assign_type);
return ret;
}
- up_write(&devnet_rename_sem);
-
netdev_adjacent_rename_links(dev, oldname);
- write_lock(&dev_base_lock);
netdev_name_node_del(dev->name_node);
- write_unlock(&dev_base_lock);
- synchronize_rcu();
+ synchronize_net();
- write_lock(&dev_base_lock);
netdev_name_node_add(net, dev->name_node);
- write_unlock(&dev_base_lock);
ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
ret = notifier_to_errno(ret);
@@ -1244,10 +1432,11 @@ rollback:
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
- down_write(&devnet_rename_sem);
+ write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
memcpy(oldname, newname, IFNAMSIZ);
- dev->name_assign_type = old_assign_type;
+ WRITE_ONCE(dev->name_assign_type, old_assign_type);
old_assign_type = NET_NAME_RENAMED;
goto rollback;
} else {
@@ -1450,7 +1639,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
else {
- dev->flags |= IFF_UP;
+ netif_set_up(dev, true);
dev_set_rx_mode(dev);
dev_activate(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
@@ -1529,7 +1718,7 @@ static void __dev_close_many(struct list_head *head)
if (ops->ndo_stop)
ops->ndo_stop(dev);
- dev->flags &= ~IFF_UP;
+ netif_set_up(dev, false);
netpoll_poll_enable(dev);
}
}
@@ -1739,14 +1928,19 @@ int register_netdevice_notifier(struct notifier_block *nb)
/* Close race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
+
+ /* When RTNL is removed, we need protection for netdev_chain. */
rtnl_lock();
+
err = raw_notifier_chain_register(&netdev_chain, nb);
if (err)
goto unlock;
if (dev_boot_phase)
goto unlock;
for_each_net(net) {
+ __rtnl_net_lock(net);
err = call_netdevice_register_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
if (err)
goto rollback;
}
@@ -1757,8 +1951,11 @@ unlock:
return err;
rollback:
- for_each_net_continue_reverse(net)
+ for_each_net_continue_reverse(net) {
+ __rtnl_net_lock(net);
call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
@@ -1791,8 +1988,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
if (err)
goto unlock;
- for_each_net(net)
+ for_each_net(net) {
+ __rtnl_net_lock(net);
call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
unlock:
rtnl_unlock();
@@ -1856,9 +2056,10 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
int err;
- rtnl_lock();
+ rtnl_net_lock(net);
err = __register_netdevice_notifier_net(net, nb, false);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);
@@ -1884,9 +2085,10 @@ int unregister_netdevice_notifier_net(struct net *net,
{
int err;
- rtnl_lock();
+ rtnl_net_lock(net);
err = __unregister_netdevice_notifier_net(net, nb);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
@@ -1899,19 +2101,56 @@ static void __move_netdevice_notifier_net(struct net *src_net,
__register_netdevice_notifier_net(dst_net, nb, true);
}
+static void rtnl_net_dev_lock(struct net_device *dev)
+{
+ bool again;
+
+ do {
+ struct net *net;
+
+ again = false;
+
+ /* netns might be being dismantled. */
+ rcu_read_lock();
+ net = dev_net_rcu(dev);
+ net_passive_inc(net);
+ rcu_read_unlock();
+
+ rtnl_net_lock(net);
+
+#ifdef CONFIG_NET_NS
+ /* dev might have been moved to another netns. */
+ if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+ again = true;
+ }
+#endif
+ } while (again);
+}
+
+static void rtnl_net_dev_unlock(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+}
+
int register_netdevice_notifier_dev_net(struct net_device *dev,
struct notifier_block *nb,
struct netdev_net_notifier *nn)
{
int err;
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
if (!err) {
nn->nb = nb;
list_add(&nn->list, &dev->net_notifier_list);
}
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
+
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
@@ -1922,10 +2161,11 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev,
{
int err;
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
list_del(&nn->list);
err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
+
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
@@ -2073,6 +2313,11 @@ void net_dec_egress_queue(void)
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
+#ifdef CONFIG_NET_CLS_ACT
+DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key);
+EXPORT_SYMBOL(tcf_sw_enabled_key);
+#endif
+
DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
EXPORT_SYMBOL(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
@@ -2129,7 +2374,7 @@ EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
skb->tstamp = 0;
- skb->mono_delivery_time = 0;
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
if (static_branch_unlikely(&netstamp_needed_key))
skb->tstamp = ktime_get_real();
}
@@ -2242,7 +2487,8 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
*/
bool dev_nit_active(struct net_device *dev)
{
- return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
+ return !list_empty(&net_hotdata.ptype_all) ||
+ !list_empty(&dev->ptype_all);
}
EXPORT_SYMBOL_GPL(dev_nit_active);
@@ -2253,15 +2499,14 @@ EXPORT_SYMBOL_GPL(dev_nit_active);
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
- struct packet_type *ptype;
+ struct list_head *ptype_list = &net_hotdata.ptype_all;
+ struct packet_type *ptype, *pt_prev = NULL;
struct sk_buff *skb2 = NULL;
- struct packet_type *pt_prev = NULL;
- struct list_head *ptype_list = &ptype_all;
rcu_read_lock();
again:
list_for_each_entry_rcu(ptype, ptype_list, list) {
- if (ptype->ignore_outgoing)
+ if (READ_ONCE(ptype->ignore_outgoing))
continue;
/* Never send packets back to the socket
@@ -2302,7 +2547,7 @@ again:
pt_prev = ptype;
}
- if (ptype_list == &ptype_all) {
+ if (ptype_list == &net_hotdata.ptype_all) {
ptype_list = &dev->ptype_all;
goto again;
}
@@ -2914,6 +3159,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->num_tc)
netif_setup_tc(dev, txq);
+ net_shaper_set_real_num_tx_queues(dev, txq);
+
dev_qdisc_change_real_num_tx(dev, txq);
dev->real_num_tx_queues = txq;
@@ -3196,7 +3443,7 @@ void netif_device_attach(struct net_device *dev)
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
netif_tx_wake_all_queues(dev);
- __netdev_watchdog_up(dev);
+ netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_device_attach);
@@ -3277,6 +3524,10 @@ int skb_checksum_help(struct sk_buff *skb)
return -EINVAL;
}
+ if (!skb_frags_readable(skb)) {
+ return -EFAULT;
+ }
+
/* Before computing a checksum, we should make sure no frag could
* be modified by an external entity : checksum could be wrong.
*/
@@ -3353,6 +3604,7 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
out:
return ret;
}
+EXPORT_SYMBOL(skb_crc32c_csum_help);
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
@@ -3398,8 +3650,9 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
if (!(dev->features & NETIF_F_HIGHDMA)) {
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = skb_frag_page(frag);
- if (PageHighMem(skb_frag_page(frag)))
+ if (page && PageHighMem(page))
return 1;
}
}
@@ -3471,7 +3724,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
if (gso_segs > READ_ONCE(dev->gso_max_segs))
return features & ~NETIF_F_GSO_MASK;
- if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
+ if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb)))
return features & ~NETIF_F_GSO_MASK;
if (!skb_shinfo(skb)->gso_type) {
@@ -3598,6 +3851,11 @@ int skb_csum_hwoffload_help(struct sk_buff *skb,
return 0;
if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
+ if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
+ !ipv6_has_hopopt_jumbo(skb))
+ goto sw_checksum;
+
switch (skb->csum_offset) {
case offsetof(struct tcphdr, check):
case offsetof(struct udphdr, check):
@@ -3605,6 +3863,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb,
}
}
+sw_checksum:
return skb_checksum_help(skb);
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
@@ -3613,6 +3872,9 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
{
netdev_features_t features;
+ if (!skb_frags_readable(skb))
+ goto out_kfree_skb;
+
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
@@ -3672,7 +3934,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
next = skb->next;
skb_mark_not_on_list(skb);
- /* in case skb wont be segmented, point to itself */
+ /* in case skb won't be segmented, point to itself */
skb->prev = skb;
skb = validate_xmit_skb(skb, dev, again);
@@ -3717,7 +3979,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
sizeof(_tcphdr), &_tcphdr);
if (likely(th))
hdr_len += __tcp_hdrlen(th);
- } else {
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
struct udphdr _udphdr;
if (skb_header_pointer(skb, hdr_len,
@@ -3725,10 +3987,14 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
hdr_len += sizeof(struct udphdr);
}
- if (shinfo->gso_type & SKB_GSO_DODGY)
- gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
- shinfo->gso_size);
+ if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
+ int payload = skb->len - hdr_len;
+ /* Malicious packet. */
+ if (payload <= 0)
+ return;
+ gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
+ }
qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
}
@@ -3791,6 +4057,10 @@ no_lock_out:
return rc;
}
+ if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
+ return NET_XMIT_DROP;
+ }
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
@@ -3830,7 +4100,9 @@ no_lock_out:
qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
+ WRITE_ONCE(q->owner, smp_processor_id());
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ WRITE_ONCE(q->owner, -1);
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -3903,6 +4175,7 @@ netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
}
+#ifndef CONFIG_PREEMPT_RT
static bool netdev_xmit_txqueue_skipped(void)
{
return __this_cpu_read(softnet_data.xmit.skip_txqueue);
@@ -3913,6 +4186,19 @@ void netdev_xmit_skip_txqueue(bool skip)
__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
}
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+
+#else
+static bool netdev_xmit_txqueue_skipped(void)
+{
+ return current->net_xmit.skip_txqueue;
+}
+
+void netdev_xmit_skip_txqueue(bool skip)
+{
+ current->net_xmit.skip_txqueue = skip;
+}
+EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+#endif
#endif /* CONFIG_NET_EGRESS */
#ifdef CONFIG_NET_XGRESS
@@ -3927,6 +4213,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
if (!miniq)
return ret;
+ /* Global bypass */
+ if (!static_branch_likely(&tcf_sw_enabled_key))
+ return ret;
+
+ /* Block-wise bypass */
+ if (tcf_block_bypass_sw(miniq->block))
+ return ret;
+
tc_skb_cb(skb)->mru = 0;
tc_skb_cb(skb)->post_ct = false;
tcf_set_drop_reason(skb, *drop_reason);
@@ -3987,10 +4281,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int sch_ret;
if (!entry)
return skb;
+
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
@@ -4019,10 +4316,12 @@ ingress_verdict:
break;
}
*ret = NET_RX_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
case TC_ACT_SHOT:
kfree_skb_reason(skb, drop_reason);
*ret = NET_RX_DROP;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
/* used by tc_run */
case TC_ACT_STOLEN:
@@ -4032,8 +4331,10 @@ ingress_verdict:
fallthrough;
case TC_ACT_CONSUMED:
*ret = NET_RX_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
return skb;
}
@@ -4043,11 +4344,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int sch_ret;
if (!entry)
return skb;
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
* already set by the caller.
*/
@@ -4063,10 +4367,12 @@ egress_verdict:
/* No need to push/pop skb's mac_header here on egress! */
skb_do_redirect(skb);
*ret = NET_XMIT_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
case TC_ACT_SHOT:
kfree_skb_reason(skb, drop_reason);
*ret = NET_XMIT_DROP;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
/* used by tc_run */
case TC_ACT_STOLEN:
@@ -4076,8 +4382,10 @@ egress_verdict:
fallthrough;
case TC_ACT_CONSUMED:
*ret = NET_XMIT_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
return skb;
}
@@ -4173,13 +4481,6 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
}
EXPORT_SYMBOL(dev_pick_tx_zero);
-u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
- struct net_device *sb_dev)
-{
- return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
-}
-EXPORT_SYMBOL(dev_pick_tx_cpu_id);
-
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev)
{
@@ -4420,20 +4721,11 @@ EXPORT_SYMBOL(__dev_direct_xmit);
/*************************************************************************
* Receiver routines
*************************************************************************/
+static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
-int netdev_max_backlog __read_mostly = 1000;
-EXPORT_SYMBOL(netdev_max_backlog);
-
-int netdev_tstamp_prequeue __read_mostly = 1;
-unsigned int sysctl_skb_defer_max __read_mostly = 64;
-int netdev_budget __read_mostly = 300;
-/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
-unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
int weight_p __read_mostly = 64; /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
-int dev_rx_weight __read_mostly = 64;
-int dev_tx_weight __read_mostly = 64;
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4452,35 +4744,27 @@ static inline void ____napi_schedule(struct softnet_data *sd,
*/
thread = READ_ONCE(napi->thread);
if (thread) {
- /* Avoid doing set_bit() if the thread is in
- * INTERRUPTIBLE state, cause napi_thread_wait()
- * makes sure to proceed with napi polling
- * if the thread is explicitly woken from here.
- */
- if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
- set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+ if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
+ goto use_local_napi;
+
+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
wake_up_process(thread);
return;
}
}
+use_local_napi:
list_add_tail(&napi->poll_list, &sd->poll_list);
WRITE_ONCE(napi->list_owner, smp_processor_id());
/* If not called from net_rx_action()
* we have to raise NET_RX_SOFTIRQ.
*/
if (!sd->in_net_rx_action)
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
#ifdef CONFIG_RPS
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-u32 rps_cpu_mask __read_mostly;
-EXPORT_SYMBOL(rps_cpu_mask);
-
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
@@ -4491,12 +4775,13 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
{
if (next_cpu < nr_cpu_ids) {
+ u32 head;
#ifdef CONFIG_RFS_ACCEL
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *old_rflow;
- u32 flow_id;
u16 rxq_index;
+ u32 flow_id;
int rc;
/* Should we steer this flow to a different hardware queue? */
@@ -4518,16 +4803,16 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
goto out;
old_rflow = rflow;
rflow = &flow_table->flows[flow_id];
- rflow->filter = rc;
- if (old_rflow->filter == rflow->filter)
- old_rflow->filter = RPS_NO_FILTER;
+ WRITE_ONCE(rflow->filter, rc);
+ if (old_rflow->filter == rc)
+ WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
out:
#endif
- rflow->last_qtail =
- per_cpu(softnet_data, next_cpu).input_queue_head;
+ head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
+ rps_input_queue_tail_save(&rflow->last_qtail, head);
}
- rflow->cpu = next_cpu;
+ WRITE_ONCE(rflow->cpu, next_cpu);
return rflow;
}
@@ -4572,7 +4857,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (!hash)
goto done;
- sock_flow_table = rcu_dereference(rps_sock_flow_table);
+ sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
if (flow_table && sock_flow_table) {
struct rps_dev_flow *rflow;
u32 next_cpu;
@@ -4582,10 +4867,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
* This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
*/
ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
- if ((ident ^ hash) & ~rps_cpu_mask)
+ if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
goto try_rps;
- next_cpu = ident & rps_cpu_mask;
+ next_cpu = ident & net_hotdata.rps_cpu_mask;
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
@@ -4606,7 +4891,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
*/
if (unlikely(tcpu != next_cpu) &&
(tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
- ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+ ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
@@ -4660,9 +4945,9 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
if (flow_table && flow_id <= flow_table->mask) {
rflow = &flow_table->flows[flow_id];
cpu = READ_ONCE(rflow->cpu);
- if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
- ((int)(per_cpu(softnet_data, cpu).input_queue_head -
- rflow->last_qtail) <
+ if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
+ ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
+ READ_ONCE(rflow->last_qtail)) <
(int)(10 * flow_table->mask)))
expire = false;
}
@@ -4709,6 +4994,11 @@ static void napi_schedule_rps(struct softnet_data *sd)
#ifdef CONFIG_RPS
if (sd != mysd) {
+ if (use_backlog_threads()) {
+ __napi_schedule_irqoff(&sd->backlog);
+ return;
+ }
+
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;
@@ -4723,6 +5013,23 @@ static void napi_schedule_rps(struct softnet_data *sd)
__napi_schedule_irqoff(&mysd->backlog);
}
+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
+{
+ unsigned long flags;
+
+ if (use_backlog_threads()) {
+ backlog_lock_irq_save(sd, &flags);
+
+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
+ __napi_schedule_irqoff(&sd->backlog);
+
+ backlog_unlock_irq_restore(sd, &flags);
+
+ } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
+ smp_call_function_single_async(cpu, &sd->defer_csd);
+ }
+}
+
#ifdef CONFIG_NET_FLOW_LIMIT
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
@@ -4734,7 +5041,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
struct softnet_data *sd;
unsigned int old_flow, new_flow;
- if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
+ if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
return false;
sd = this_cpu_ptr(&softnet_data);
@@ -4774,36 +5081,45 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
struct softnet_data *sd;
unsigned long flags;
unsigned int qlen;
+ int max_backlog;
+ u32 tail;
+
+ reason = SKB_DROP_REASON_DEV_READY;
+ if (!netif_running(skb->dev))
+ goto bad_dev;
- reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ reason = SKB_DROP_REASON_CPU_BACKLOG;
sd = &per_cpu(softnet_data, cpu);
- rps_lock_irqsave(sd, &flags);
- if (!netif_running(skb->dev))
- goto drop;
+ qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
+ max_backlog = READ_ONCE(net_hotdata.max_backlog);
+ if (unlikely(qlen > max_backlog))
+ goto cpu_backlog_drop;
+ backlog_lock_irq_save(sd, &flags);
qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
- if (qlen) {
-enqueue:
- __skb_queue_tail(&sd->input_pkt_queue, skb);
- input_queue_tail_incr_save(sd, qtail);
- rps_unlock_irq_restore(sd, &flags);
- return NET_RX_SUCCESS;
+ if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
+ if (!qlen) {
+ /* Schedule NAPI for backlog device. We can use
+ * non atomic operation as we own the queue lock.
+ */
+ if (!__test_and_set_bit(NAPI_STATE_SCHED,
+ &sd->backlog.state))
+ napi_schedule_rps(sd);
}
+ __skb_queue_tail(&sd->input_pkt_queue, skb);
+ tail = rps_input_queue_tail_incr(sd);
+ backlog_unlock_irq_restore(sd, &flags);
- /* Schedule NAPI for backlog device
- * We can use non atomic operation since we own the queue lock
- */
- if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
- napi_schedule_rps(sd);
- goto enqueue;
+ /* save the tail outside of the critical section */
+ rps_input_queue_tail_save(qtail, tail);
+ return NET_RX_SUCCESS;
}
- reason = SKB_DROP_REASON_CPU_BACKLOG;
-drop:
- sd->dropped++;
- rps_unlock_irq_restore(sd, &flags);
+ backlog_unlock_irq_restore(sd, &flags);
+cpu_backlog_drop:
+ atomic_inc(&sd->dropped);
+bad_dev:
dev_core_stats_rx_dropped_inc(skb->dev);
kfree_skb_reason(skb, reason);
return NET_RX_DROP;
@@ -4833,7 +5149,7 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
}
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
@@ -4858,6 +5174,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
skb_headlen(skb) + mac_len, true);
+ if (skb_is_nonlinear(skb)) {
+ skb_shinfo(skb)->xdp_frags_size = skb->data_len;
+ xdp_buff_set_frags_flag(xdp);
+ } else {
+ xdp_buff_clear_frags_flag(xdp);
+ }
orig_data_end = xdp->data_end;
orig_data = xdp->data;
@@ -4887,6 +5209,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
skb->len += off; /* positive on grow, negative on shrink */
}
+ /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
+ * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
+ */
+ if (xdp_buff_has_frags(xdp))
+ skb->data_len = skb_shinfo(skb)->xdp_frags_size;
+ else
+ skb->data_len = 0;
+
/* check if XDP changed eth hdr such SKB needs update */
eth = (struct ethhdr *)xdp->data;
if ((orig_eth_type != eth->h_proto) ||
@@ -4920,11 +5250,35 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
return act;
}
-static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+static int
+netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
+{
+ struct sk_buff *skb = *pskb;
+ int err, hroom, troom;
+
+ if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
+ return 0;
+
+ /* In case we have to go down the path and also linearize,
+ * then lets do the pskb_expand_head() work just once here.
+ */
+ hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+ troom = skb->tail + skb->data_len - skb->end;
+ err = pskb_expand_head(skb,
+ hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+ troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
+ if (err)
+ return err;
+
+ return skb_linearize(skb);
+}
+
+static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
- u32 act = XDP_DROP;
+ struct sk_buff *skb = *pskb;
+ u32 mac_len, act = XDP_DROP;
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
@@ -4932,41 +5286,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
if (skb_is_redirected(skb))
return XDP_PASS;
- /* XDP packets must be linear and must have sufficient headroom
- * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
- * native XDP provides, thus we need to do it here as well.
+ /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
+ * bytes. This is the guarantee that also native XDP provides,
+ * thus we need to do it here as well.
*/
+ mac_len = skb->data - skb_mac_header(skb);
+ __skb_push(skb, mac_len);
+
if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
- int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
- int troom = skb->tail + skb->data_len - skb->end;
-
- /* In case we have to go down the path and also linearize,
- * then lets do the pskb_expand_head() work just once here.
- */
- if (pskb_expand_head(skb,
- hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
- troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
- goto do_drop;
- if (skb_linearize(skb))
+ if (netif_skb_check_for_xdp(pskb, xdp_prog))
goto do_drop;
}
- act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
+ __skb_pull(*pskb, mac_len);
+
+ act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
case XDP_PASS:
break;
default:
- bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
+ bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
fallthrough;
case XDP_ABORTED:
- trace_xdp_exception(skb->dev, xdp_prog, act);
+ trace_xdp_exception((*pskb)->dev, xdp_prog, act);
fallthrough;
case XDP_DROP:
do_drop:
- kfree_skb(skb);
+ kfree_skb(*pskb);
break;
}
@@ -4979,7 +5328,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
* and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
* queues, so they do not have this starvation issue.
*/
-void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -5004,32 +5353,38 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
+int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+
if (xdp_prog) {
struct xdp_buff xdp;
u32 act;
int err;
- act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
if (act != XDP_PASS) {
switch (act) {
case XDP_REDIRECT:
- err = xdp_do_generic_redirect(skb->dev, skb,
+ err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
&xdp, xdp_prog);
if (err)
goto out_redir;
break;
case XDP_TX:
- generic_xdp_tx(skb, xdp_prog);
+ generic_xdp_tx(*pskb, xdp_prog);
break;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
return XDP_DROP;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
}
return XDP_PASS;
out_redir:
- kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
+ bpf_net_ctx_clear(bpf_net_ctx);
+ kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
return XDP_DROP;
}
EXPORT_SYMBOL_GPL(do_xdp_generic);
@@ -5038,7 +5393,7 @@ static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
- net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
+ net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
trace_netif_rx(skb);
@@ -5122,7 +5477,7 @@ int netif_rx(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_rx);
-static __latent_entropy void net_tx_action(struct softirq_action *h)
+static __latent_entropy void net_tx_action(void)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -5144,7 +5499,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
trace_consume_skb(skb, net_tx_action);
else
trace_kfree_skb(skb, net_tx_action,
- get_kfree_skb_cb(skb)->reason);
+ get_kfree_skb_cb(skb)->reason, NULL);
if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
@@ -5330,15 +5685,21 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
int ret = NET_RX_DROP;
__be16 type;
- net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
+ net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
trace_netif_receive_skb(skb);
orig_dev = skb->dev;
skb_reset_network_header(skb);
+#if !defined(CONFIG_DEBUG_NET)
+ /* We plan to no longer reset the transport header here.
+ * Give some time to fuzzers and dev build to catch bugs
+ * in network stacks.
+ */
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
+#endif
skb_reset_mac_len(skb);
pt_prev = NULL;
@@ -5352,7 +5713,8 @@ another_round:
int ret2;
migrate_disable();
- ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
+ ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+ &skb);
migrate_enable();
if (ret2 != XDP_PASS) {
@@ -5373,7 +5735,7 @@ another_round:
if (pfmemalloc)
goto skip_taps;
- list_for_each_entry_rcu(ptype, &ptype_all, list) {
+ list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
@@ -5598,10 +5960,9 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
struct packet_type *pt_curr = NULL;
/* Current (common) orig_dev of sublist */
struct net_device *od_curr = NULL;
- struct list_head sublist;
struct sk_buff *skb, *next;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
struct net_device *orig_dev = skb->dev;
struct packet_type *pt_prev = NULL;
@@ -5713,7 +6074,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
{
int ret;
- net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
+ net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
@@ -5739,11 +6100,11 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
void netif_receive_skb_list_internal(struct list_head *head)
{
struct sk_buff *skb, *next;
- struct list_head sublist;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
- net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
+ net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
+ skb);
skb_list_del_init(skb);
if (!skb_defer_rx_timestamp(skb))
list_add_tail(&skb->list, &sublist);
@@ -5822,8 +6183,6 @@ void netif_receive_skb_list(struct list_head *head)
}
EXPORT_SYMBOL(netif_receive_skb_list);
-static DEFINE_PER_CPU(struct work_struct, flush_works);
-
/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
{
@@ -5833,23 +6192,25 @@ static void flush_backlog(struct work_struct *work)
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
- rps_lock_irq_disable(sd);
+ backlog_lock_irq_disable(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
dev_kfree_skb_irq(skb);
- input_queue_head_incr(sd);
+ rps_input_queue_head_incr(sd);
}
}
- rps_unlock_irq_enable(sd);
+ backlog_unlock_irq_enable(sd);
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
kfree_skb(skb);
- input_queue_head_incr(sd);
+ rps_input_queue_head_incr(sd);
}
}
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
local_bh_enable();
}
@@ -5859,14 +6220,14 @@ static bool flush_required(int cpu)
struct softnet_data *sd = &per_cpu(softnet_data, cpu);
bool do_flush;
- rps_lock_irq_disable(sd);
+ backlog_lock_irq_disable(sd);
/* as insertion into process_queue happens with the rps lock held,
* process_queue access may race only with dequeue
*/
do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
!skb_queue_empty_lockless(&sd->process_queue);
- rps_unlock_irq_enable(sd);
+ backlog_unlock_irq_enable(sd);
return do_flush;
#endif
@@ -5878,36 +6239,54 @@ static bool flush_required(int cpu)
return true;
}
+struct flush_backlogs {
+ cpumask_t flush_cpus;
+ struct work_struct w[];
+};
+
+static struct flush_backlogs *flush_backlogs_alloc(void)
+{
+ return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids),
+ GFP_KERNEL);
+}
+
+static struct flush_backlogs *flush_backlogs_fallback;
+static DEFINE_MUTEX(flush_backlogs_mutex);
+
static void flush_all_backlogs(void)
{
- static cpumask_t flush_cpus;
+ struct flush_backlogs *ptr = flush_backlogs_alloc();
unsigned int cpu;
- /* since we are under rtnl lock protection we can use static data
- * for the cpumask and avoid allocating on stack the possibly
- * large mask
- */
- ASSERT_RTNL();
+ if (!ptr) {
+ mutex_lock(&flush_backlogs_mutex);
+ ptr = flush_backlogs_fallback;
+ }
+ cpumask_clear(&ptr->flush_cpus);
cpus_read_lock();
- cpumask_clear(&flush_cpus);
for_each_online_cpu(cpu) {
if (flush_required(cpu)) {
- queue_work_on(cpu, system_highpri_wq,
- per_cpu_ptr(&flush_works, cpu));
- cpumask_set_cpu(cpu, &flush_cpus);
+ INIT_WORK(&ptr->w[cpu], flush_backlog);
+ queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]);
+ __cpumask_set_cpu(cpu, &ptr->flush_cpus);
}
}
/* we can have in flight packet[s] on the cpus we are not flushing,
* synchronize_net() in unregister_netdevice_many() will take care of
- * them
+ * them.
*/
- for_each_cpu(cpu, &flush_cpus)
- flush_work(per_cpu_ptr(&flush_works, cpu));
+ for_each_cpu(cpu, &ptr->flush_cpus)
+ flush_work(&ptr->w[cpu]);
cpus_read_unlock();
+
+ if (ptr != flush_backlogs_fallback)
+ kfree(ptr);
+ else
+ mutex_unlock(&flush_backlogs_mutex);
}
static void net_rps_send_ipi(struct softnet_data *remsd)
@@ -5932,7 +6311,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
#ifdef CONFIG_RPS
struct softnet_data *remsd = sd->rps_ipi_list;
- if (remsd) {
+ if (!use_backlog_threads() && remsd) {
sd->rps_ipi_list = NULL;
local_irq_enable();
@@ -5947,7 +6326,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- return sd->rps_ipi_list != NULL;
+ return !use_backlog_threads() && sd->rps_ipi_list;
#else
return false;
#endif
@@ -5967,21 +6346,26 @@ static int process_backlog(struct napi_struct *napi, int quota)
net_rps_action_and_irq_enable(sd);
}
- napi->weight = READ_ONCE(dev_rx_weight);
+ napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
while (again) {
struct sk_buff *skb;
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
while ((skb = __skb_dequeue(&sd->process_queue))) {
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
rcu_read_lock();
__netif_receive_skb(skb);
rcu_read_unlock();
- input_queue_head_incr(sd);
- if (++work >= quota)
+ if (++work >= quota) {
+ rps_input_queue_head_add(sd, work);
return work;
+ }
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
}
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
- rps_lock_irq_disable(sd);
+ backlog_lock_irq_disable(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
@@ -5991,15 +6375,19 @@ static int process_backlog(struct napi_struct *napi, int quota)
* We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
- napi->state = 0;
+ napi->state &= NAPIF_STATE_THREADED;
again = false;
} else {
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
}
- rps_unlock_irq_enable(sd);
+ backlog_unlock_irq_enable(sd);
}
+ if (work)
+ rps_input_queue_head_add(sd, work);
return work;
}
@@ -6088,12 +6476,12 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
if (work_done) {
if (n->gro_bitmask)
- timeout = READ_ONCE(n->dev->gro_flush_timeout);
- n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
+ timeout = napi_get_gro_flush_timeout(n);
+ n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
}
if (n->defer_hard_irqs_count > 0) {
n->defer_hard_irqs_count--;
- timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ timeout = napi_get_gro_flush_timeout(n);
if (timeout)
ret = false;
}
@@ -6143,17 +6531,25 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
}
EXPORT_SYMBOL(napi_complete_done);
-/* must be called under rcu_read_lock(), as we dont take a reference */
-struct napi_struct *napi_by_id(unsigned int napi_id)
+static void skb_defer_free_flush(struct softnet_data *sd)
{
- unsigned int hash = napi_id % HASH_SIZE(napi_hash);
- struct napi_struct *napi;
+ struct sk_buff *skb, *next;
- hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
- if (napi->napi_id == napi_id)
- return napi;
+ /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
+ if (!READ_ONCE(sd->defer_list))
+ return;
- return NULL;
+ spin_lock(&sd->defer_lock);
+ skb = sd->defer_list;
+ sd->defer_list = NULL;
+ sd->defer_count = 0;
+ spin_unlock(&sd->defer_lock);
+
+ while (skb != NULL) {
+ next = skb->next;
+ napi_consume_skb(skb, 1);
+ skb = next;
+ }
}
#if defined(CONFIG_NET_RX_BUSY_POLL)
@@ -6177,9 +6573,15 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
clear_bit(NAPI_STATE_SCHED, &napi->state);
}
-static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
- u16 budget)
+enum {
+ NAPI_F_PREFER_BUSY_POLL = 1,
+ NAPI_F_END_ON_RESCHED = 2,
+};
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
+ unsigned flags, u16 budget)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
bool skip_schedule = false;
unsigned long timeout;
int rc;
@@ -6197,10 +6599,11 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool
clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
- if (prefer_busy_poll) {
- napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
- timeout = READ_ONCE(napi->dev->gro_flush_timeout);
+ if (flags & NAPI_F_PREFER_BUSY_POLL) {
+ napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
+ timeout = napi_get_gro_flush_timeout(napi);
if (napi->defer_hard_irqs_count && timeout) {
hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
skip_schedule = true;
@@ -6219,26 +6622,28 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool
netpoll_poll_unlock(have_poll_lock);
if (rc == budget)
__busy_poll_stop(napi, skip_schedule);
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
}
-void napi_busy_loop(unsigned int napi_id,
- bool (*loop_end)(void *, unsigned long),
- void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+static void __napi_busy_loop(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, unsigned flags, u16 budget)
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
void *have_poll_lock = NULL;
struct napi_struct *napi;
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
restart:
napi_poll = NULL;
- rcu_read_lock();
-
napi = napi_by_id(napi_id);
if (!napi)
- goto out;
+ return;
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
@@ -6246,6 +6651,7 @@ restart:
int work = 0;
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (!napi_poll) {
unsigned long val = READ_ONCE(napi->state);
@@ -6254,14 +6660,14 @@ restart:
*/
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
NAPIF_STATE_IN_BUSY_POLL)) {
- if (prefer_busy_poll)
+ if (flags & NAPI_F_PREFER_BUSY_POLL)
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
}
if (cmpxchg(&napi->state, val,
val | NAPIF_STATE_IN_BUSY_POLL |
NAPIF_STATE_SCHED) != val) {
- if (prefer_busy_poll)
+ if (flags & NAPI_F_PREFER_BUSY_POLL)
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
}
@@ -6275,18 +6681,23 @@ count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
LINUX_MIB_BUSYPOLLRXPACKETS, work);
+ skb_defer_free_flush(this_cpu_ptr(&softnet_data));
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
if (!loop_end || loop_end(loop_end_arg, start_time))
break;
if (unlikely(need_resched())) {
+ if (flags & NAPI_F_END_ON_RESCHED)
+ break;
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
+ busy_poll_stop(napi, have_poll_lock, flags, budget);
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
rcu_read_unlock();
cond_resched();
+ rcu_read_lock();
if (loop_end(loop_end_arg, start_time))
return;
goto restart;
@@ -6294,34 +6705,111 @@ count:
cpu_relax();
}
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
+ busy_poll_stop(napi, have_poll_lock, flags, budget);
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
-out:
+}
+
+void napi_busy_loop_rcu(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+{
+ unsigned flags = NAPI_F_END_ON_RESCHED;
+
+ if (prefer_busy_poll)
+ flags |= NAPI_F_PREFER_BUSY_POLL;
+
+ __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
+}
+
+void napi_busy_loop(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+{
+ unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
+
+ rcu_read_lock();
+ __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
rcu_read_unlock();
}
EXPORT_SYMBOL(napi_busy_loop);
+void napi_suspend_irqs(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ rcu_read_lock();
+ napi = napi_by_id(napi_id);
+ if (napi) {
+ unsigned long timeout = napi_get_irq_suspend_timeout(napi);
+
+ if (timeout)
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ }
+ rcu_read_unlock();
+}
+
+void napi_resume_irqs(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ rcu_read_lock();
+ napi = napi_by_id(napi_id);
+ if (napi) {
+ /* If irq_suspend_timeout is set to 0 between the call to
+ * napi_suspend_irqs and now, the original value still
+ * determines the safety timeout as intended and napi_watchdog
+ * will resume irq processing.
+ */
+ if (napi_get_irq_suspend_timeout(napi)) {
+ local_bh_disable();
+ napi_schedule(napi);
+ local_bh_enable();
+ }
+ }
+ rcu_read_unlock();
+}
+
#endif /* CONFIG_NET_RX_BUSY_POLL */
+static void __napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ WRITE_ONCE(napi->napi_id, napi_id);
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+}
+
+static void napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&napi_hash_lock, flags);
+ WARN_ON_ONCE(napi_by_id(napi_id));
+ __napi_hash_add_with_id(napi, napi_id);
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
+}
+
static void napi_hash_add(struct napi_struct *napi)
{
+ unsigned long flags;
+
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
return;
- spin_lock(&napi_hash_lock);
+ spin_lock_irqsave(&napi_hash_lock, flags);
/* 0..NR_CPUS range is reserved for sender_cpu use */
do {
if (unlikely(++napi_gen_id < MIN_NAPI_ID))
napi_gen_id = MIN_NAPI_ID;
} while (napi_by_id(napi_gen_id));
- napi->napi_id = napi_gen_id;
- hlist_add_head_rcu(&napi->napi_hash_node,
- &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+ __napi_hash_add_with_id(napi, napi_gen_id);
- spin_unlock(&napi_hash_lock);
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
}
/* Warning : caller is responsible to make sure rcu grace period
@@ -6329,11 +6817,13 @@ static void napi_hash_add(struct napi_struct *napi)
*/
static void napi_hash_del(struct napi_struct *napi)
{
- spin_lock(&napi_hash_lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&napi_hash_lock, flags);
hlist_del_init_rcu(&napi->napi_hash_node);
- spin_unlock(&napi_hash_lock);
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
}
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
@@ -6370,6 +6860,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
struct napi_struct *napi;
int err = 0;
+ netdev_assert_locked_or_invisible(dev);
+
if (dev->threaded == threaded)
return 0;
@@ -6385,7 +6877,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
}
}
- dev->threaded = threaded;
+ WRITE_ONCE(dev->threaded, threaded);
/* Make sure kthread is created before THREADED bit
* is set.
@@ -6442,9 +6934,83 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
}
EXPORT_SYMBOL(netif_queue_set_napi);
-void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
- int (*poll)(struct napi_struct *, int), int weight)
+static void napi_restore_config(struct napi_struct *n)
+{
+ n->defer_hard_irqs = n->config->defer_hard_irqs;
+ n->gro_flush_timeout = n->config->gro_flush_timeout;
+ n->irq_suspend_timeout = n->config->irq_suspend_timeout;
+ /* a NAPI ID might be stored in the config, if so use it. if not, use
+ * napi_hash_add to generate one for us.
+ */
+ if (n->config->napi_id) {
+ napi_hash_add_with_id(n, n->config->napi_id);
+ } else {
+ napi_hash_add(n);
+ n->config->napi_id = n->napi_id;
+ }
+}
+
+static void napi_save_config(struct napi_struct *n)
+{
+ n->config->defer_hard_irqs = n->defer_hard_irqs;
+ n->config->gro_flush_timeout = n->gro_flush_timeout;
+ n->config->irq_suspend_timeout = n->irq_suspend_timeout;
+ napi_hash_del(n);
+}
+
+/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will
+ * inherit an existing ID try to insert it at the right position.
+ */
+static void
+netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
{
+ unsigned int new_id, pos_id;
+ struct list_head *higher;
+ struct napi_struct *pos;
+
+ new_id = UINT_MAX;
+ if (napi->config && napi->config->napi_id)
+ new_id = napi->config->napi_id;
+
+ higher = &dev->napi_list;
+ list_for_each_entry(pos, &dev->napi_list, dev_list) {
+ if (pos->napi_id >= MIN_NAPI_ID)
+ pos_id = pos->napi_id;
+ else if (pos->config)
+ pos_id = pos->config->napi_id;
+ else
+ pos_id = UINT_MAX;
+
+ if (pos_id <= new_id)
+ break;
+ higher = &pos->dev_list;
+ }
+ list_add_rcu(&napi->dev_list, higher); /* adds after higher */
+}
+
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+static void napi_get_frags_check(struct napi_struct *napi)
+{
+ struct sk_buff *skb;
+
+ local_bh_disable();
+ skb = napi_get_frags(napi);
+ WARN_ON_ONCE(skb && skb->head_frag);
+ napi_free_frags(napi);
+ local_bh_enable();
+}
+
+void netif_napi_add_weight_locked(struct net_device *dev,
+ struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
+{
+ netdev_assert_locked(dev);
if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
return;
@@ -6468,24 +7034,32 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
- list_add_rcu(&napi->dev_list, &dev->napi_list);
- napi_hash_add(napi);
+ netif_napi_dev_list_add(dev, napi);
+
+ /* default settings from sysfs are applied to all NAPIs. any per-NAPI
+ * configuration will be loaded in napi_enable
+ */
+ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+ napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
+
napi_get_frags_check(napi);
/* Create kthread for this napi if dev->threaded is set.
* Clear dev->threaded if kthread creation failed so that
* threaded mode will not be enabled in napi_enable().
*/
if (dev->threaded && napi_kthread_create(napi))
- dev->threaded = 0;
- netif_napi_set_irq(napi, -1);
+ dev->threaded = false;
+ netif_napi_set_irq_locked(napi, -1);
}
-EXPORT_SYMBOL(netif_napi_add_weight);
+EXPORT_SYMBOL(netif_napi_add_weight_locked);
-void napi_disable(struct napi_struct *n)
+void napi_disable_locked(struct napi_struct *n)
{
unsigned long val, new;
might_sleep();
+ netdev_assert_locked(n->dev);
+
set_bit(NAPI_STATE_DISABLE, &n->state);
val = READ_ONCE(n->state);
@@ -6501,21 +7075,40 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
+ if (n->config)
+ napi_save_config(n);
+ else
+ napi_hash_del(n);
+
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
-EXPORT_SYMBOL(napi_disable);
+EXPORT_SYMBOL(napi_disable_locked);
/**
- * napi_enable - enable NAPI scheduling
- * @n: NAPI context
+ * napi_disable() - prevent NAPI from scheduling
+ * @n: NAPI context
*
- * Resume NAPI from being scheduled on this context.
- * Must be paired with napi_disable.
+ * Stop NAPI from being scheduled on this context.
+ * Waits till any outstanding processing completes.
+ * Takes netdev_lock() for associated net_device.
*/
-void napi_enable(struct napi_struct *n)
+void napi_disable(struct napi_struct *n)
+{
+ netdev_lock(n->dev);
+ napi_disable_locked(n);
+ netdev_unlock(n->dev);
+}
+EXPORT_SYMBOL(napi_disable);
+
+void napi_enable_locked(struct napi_struct *n)
{
unsigned long new, val = READ_ONCE(n->state);
+ if (n->config)
+ napi_restore_config(n);
+ else
+ napi_hash_add(n);
+
do {
BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
@@ -6524,6 +7117,22 @@ void napi_enable(struct napi_struct *n)
new |= NAPIF_STATE_THREADED;
} while (!try_cmpxchg(&n->state, &val, new));
}
+EXPORT_SYMBOL(napi_enable_locked);
+
+/**
+ * napi_enable() - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Enable scheduling of a NAPI instance.
+ * Must be paired with napi_disable().
+ * Takes netdev_lock() for associated net_device.
+ */
+void napi_enable(struct napi_struct *n)
+{
+ netdev_lock(n->dev);
+ napi_enable_locked(n);
+ netdev_unlock(n->dev);
+}
EXPORT_SYMBOL(napi_enable);
static void flush_gro_hash(struct napi_struct *napi)
@@ -6540,12 +7149,18 @@ static void flush_gro_hash(struct napi_struct *napi)
}
/* Must be called in process context */
-void __netif_napi_del(struct napi_struct *napi)
+void __netif_napi_del_locked(struct napi_struct *napi)
{
+ netdev_assert_locked(napi->dev);
+
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
return;
- napi_hash_del(napi);
+ if (napi->config) {
+ napi->index = -1;
+ napi->config = NULL;
+ }
+
list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
@@ -6557,7 +7172,7 @@ void __netif_napi_del(struct napi_struct *napi)
napi->thread = NULL;
}
}
-EXPORT_SYMBOL(__netif_napi_del);
+EXPORT_SYMBOL(__netif_napi_del_locked);
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
@@ -6654,8 +7269,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
static int napi_thread_wait(struct napi_struct *napi)
{
- bool woken = false;
-
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
@@ -6664,15 +7277,13 @@ static int napi_thread_wait(struct napi_struct *napi)
* Testing SCHED bit is not enough because SCHED bit might be
* set by some other busy poll thread or by napi_disable().
*/
- if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
+ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
WARN_ON(!list_empty(&napi->poll_list));
__set_current_state(TASK_RUNNING);
return 0;
}
schedule();
- /* woken being true indicates this thread owns this napi. */
- woken = true;
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
@@ -6680,73 +7291,66 @@ static int napi_thread_wait(struct napi_struct *napi)
return -1;
}
-static void skb_defer_free_flush(struct softnet_data *sd)
+static void napi_threaded_poll_loop(struct napi_struct *napi)
{
- struct sk_buff *skb, *next;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ struct softnet_data *sd;
+ unsigned long last_qs = jiffies;
- /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
- if (!READ_ONCE(sd->defer_list))
- return;
+ for (;;) {
+ bool repoll = false;
+ void *have;
- spin_lock(&sd->defer_lock);
- skb = sd->defer_list;
- sd->defer_list = NULL;
- sd->defer_count = 0;
- spin_unlock(&sd->defer_lock);
+ local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
- while (skb != NULL) {
- next = skb->next;
- napi_consume_skb(skb, 1);
- skb = next;
- }
-}
+ sd = this_cpu_ptr(&softnet_data);
+ sd->in_napi_threaded_poll = true;
-static int napi_threaded_poll(void *data)
-{
- struct napi_struct *napi = data;
- struct softnet_data *sd;
- void *have;
+ have = netpoll_poll_lock(napi);
+ __napi_poll(napi, &repoll);
+ netpoll_poll_unlock(have);
- while (!napi_thread_wait(napi)) {
- for (;;) {
- bool repoll = false;
+ sd->in_napi_threaded_poll = false;
+ barrier();
- local_bh_disable();
- sd = this_cpu_ptr(&softnet_data);
- sd->in_napi_threaded_poll = true;
+ if (sd_has_rps_ipi_waiting(sd)) {
+ local_irq_disable();
+ net_rps_action_and_irq_enable(sd);
+ }
+ skb_defer_free_flush(sd);
+ bpf_net_ctx_clear(bpf_net_ctx);
+ local_bh_enable();
- have = netpoll_poll_lock(napi);
- __napi_poll(napi, &repoll);
- netpoll_poll_unlock(have);
+ if (!repoll)
+ break;
- sd->in_napi_threaded_poll = false;
- barrier();
+ rcu_softirq_qs_periodic(last_qs);
+ cond_resched();
+ }
+}
- if (sd_has_rps_ipi_waiting(sd)) {
- local_irq_disable();
- net_rps_action_and_irq_enable(sd);
- }
- skb_defer_free_flush(sd);
- local_bh_enable();
+static int napi_threaded_poll(void *data)
+{
+ struct napi_struct *napi = data;
- if (!repoll)
- break;
+ while (!napi_thread_wait(napi))
+ napi_threaded_poll_loop(napi);
- cond_resched();
- }
- }
return 0;
}
-static __latent_entropy void net_rx_action(struct softirq_action *h)
+static __latent_entropy void net_rx_action(void)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
- usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
- int budget = READ_ONCE(netdev_budget);
+ usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ int budget = READ_ONCE(net_hotdata.netdev_budget);
LIST_HEAD(list);
LIST_HEAD(repoll);
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
start:
sd->in_net_rx_action = true;
local_irq_disable();
@@ -6799,7 +7403,8 @@ start:
sd->in_net_rx_action = false;
net_rps_action_and_irq_enable(sd);
-end:;
+end:
+ bpf_net_ctx_clear(bpf_net_ctx);
}
struct netdev_adjacent {
@@ -8415,27 +9020,29 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags;
+ unsigned int promiscuity, flags;
kuid_t uid;
kgid_t gid;
ASSERT_RTNL();
- dev->flags |= IFF_PROMISC;
- dev->promiscuity += inc;
- if (dev->promiscuity == 0) {
+ promiscuity = dev->promiscuity + inc;
+ if (promiscuity == 0) {
/*
* Avoid overflow.
* If inc causes overflow, untouch promisc and return error.
*/
- if (inc < 0)
- dev->flags &= ~IFF_PROMISC;
- else {
- dev->promiscuity -= inc;
+ if (unlikely(inc > 0)) {
netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
return -EOVERFLOW;
}
+ flags = old_flags & ~IFF_PROMISC;
+ } else {
+ flags = old_flags | IFF_PROMISC;
}
- if (dev->flags != old_flags) {
+ WRITE_ONCE(dev->promiscuity, promiscuity);
+ if (flags != old_flags) {
+ WRITE_ONCE(dev->flags, flags);
netdev_info(dev, "%s promiscuous mode\n",
dev->flags & IFF_PROMISC ? "entered" : "left");
if (audit_enabled) {
@@ -8486,25 +9093,27 @@ EXPORT_SYMBOL(dev_set_promiscuity);
static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
+ unsigned int allmulti, flags;
ASSERT_RTNL();
- dev->flags |= IFF_ALLMULTI;
- dev->allmulti += inc;
- if (dev->allmulti == 0) {
+ allmulti = dev->allmulti + inc;
+ if (allmulti == 0) {
/*
* Avoid overflow.
* If inc causes overflow, untouch allmulti and return error.
*/
- if (inc < 0)
- dev->flags &= ~IFF_ALLMULTI;
- else {
- dev->allmulti -= inc;
+ if (unlikely(inc > 0)) {
netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
return -EOVERFLOW;
}
+ flags = old_flags & ~IFF_ALLMULTI;
+ } else {
+ flags = old_flags | IFF_ALLMULTI;
}
- if (dev->flags ^ old_flags) {
+ WRITE_ONCE(dev->allmulti, allmulti);
+ if (flags != old_flags) {
+ WRITE_ONCE(dev->flags, flags);
netdev_info(dev, "%s allmulticast mode\n",
dev->flags & IFF_ALLMULTI ? "entered" : "left");
dev_change_rx_flags(dev, IFF_ALLMULTI);
@@ -8586,12 +9195,12 @@ unsigned int dev_get_flags(const struct net_device *dev)
{
unsigned int flags;
- flags = (dev->flags & ~(IFF_PROMISC |
+ flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
IFF_ALLMULTI |
IFF_RUNNING |
IFF_LOWER_UP |
IFF_DORMANT)) |
- (dev->gflags & (IFF_PROMISC |
+ (READ_ONCE(dev->gflags) & (IFF_PROMISC |
IFF_ALLMULTI));
if (netif_running(dev)) {
@@ -8830,7 +9439,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
return -ERANGE;
if (new_len != orig_len) {
- dev->tx_queue_len = new_len;
+ WRITE_ONCE(dev->tx_queue_len, new_len);
res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
res = notifier_to_errno(res);
if (res)
@@ -8844,7 +9453,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
err_rollback:
netdev_err(dev, "refused to change device tx_queue_len\n");
- dev->tx_queue_len = orig_len;
+ WRITE_ONCE(dev->tx_queue_len, orig_len);
return res;
}
@@ -8914,7 +9523,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
}
EXPORT_SYMBOL(dev_set_mac_address);
-static DECLARE_RWSEM(dev_addr_sem);
+DECLARE_RWSEM(dev_addr_sem);
int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
struct netlink_ext_ack *extack)
@@ -9082,7 +9691,7 @@ EXPORT_SYMBOL(netdev_port_same_parent_id);
*/
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
- if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
+ if (!dev->change_proto_down)
return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
@@ -9090,7 +9699,7 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
netif_carrier_off(dev);
else
netif_carrier_on(dev);
- dev->proto_down = proto_down;
+ WRITE_ONCE(dev->proto_down, proto_down);
return 0;
}
@@ -9104,18 +9713,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
u32 value)
{
+ u32 proto_down_reason;
int b;
if (!mask) {
- dev->proto_down_reason = value;
+ proto_down_reason = value;
} else {
+ proto_down_reason = dev->proto_down_reason;
for_each_set_bit(b, &mask, 32) {
if (value & (1 << b))
- dev->proto_down_reason |= BIT(b);
+ proto_down_reason |= BIT(b);
else
- dev->proto_down_reason &= ~BIT(b);
+ proto_down_reason &= ~BIT(b);
}
}
+ WRITE_ONCE(dev->proto_down_reason, proto_down_reason);
}
struct bpf_xdp_link {
@@ -9176,6 +9788,40 @@ u8 dev_xdp_prog_count(struct net_device *dev)
}
EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
+u8 dev_xdp_sb_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog &&
+ !dev->xdp_state[i].prog->aux->xdp_has_frags)
+ count++;
+ return count;
+}
+
+int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
+{
+ if (!dev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
+
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ bpf->command == XDP_SETUP_PROG &&
+ bpf->prog && !bpf->prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(bpf->extack,
+ "unable to propagate XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
+ if (dev_get_min_mp_channel_count(dev)) {
+ NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
+ return -EBUSY;
+ }
+
+ return dev->netdev_ops->ndo_bpf(dev, bpf);
+}
+EXPORT_SYMBOL_GPL(dev_xdp_propagate);
+
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
struct bpf_prog *prog = dev_xdp_prog(dev, mode);
@@ -9204,6 +9850,17 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
struct netdev_bpf xdp;
int err;
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ prog && !prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
+ if (dev_get_min_mp_channel_count(dev)) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
+ return -EBUSY;
+ }
+
memset(&xdp, 0, sizeof(xdp));
xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
xdp.extack = extack;
@@ -9352,6 +10009,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
NL_SET_ERR_MSG(extack, "Program bound to different device");
return -EINVAL;
}
+ if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
+ NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
+ return -EINVAL;
+ }
if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
return -EINVAL;
@@ -9628,6 +10289,20 @@ err_out:
return err;
}
+u32 dev_get_min_mp_channel_count(const struct net_device *dev)
+{
+ int i;
+
+ ASSERT_RTNL();
+
+ for (i = dev->real_num_rx_queues - 1; i >= 0; i--)
+ if (dev->_rx[i].mp_params.mp_priv)
+ /* The channel count is the idx plus 1. */
+ return i + 1;
+
+ return 0;
+}
+
/**
* dev_index_reserve() - allocate an ifindex in a namespace
* @net: the applicable net namespace
@@ -9665,14 +10340,23 @@ static void dev_index_release(struct net *net, int ifindex)
WARN_ON(xa_erase(&net->dev_by_index, ifindex));
}
+static bool from_cleanup_net(void)
+{
+#ifdef CONFIG_NET_NS
+ return current == cleanup_net_task;
+#else
+ return false;
+#endif
+}
+
/* Delayed registration/unregisteration */
LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+atomic_t dev_unreg_count = ATOMIC_INIT(0);
static void net_set_todo(struct net_device *dev)
{
list_add_tail(&dev->todo_list, &net_todo_list);
- atomic_inc(&dev_net(dev)->dev_unreg_count);
}
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -9719,6 +10403,15 @@ static void netdev_sync_lower_features(struct net_device *upper,
}
}
+static bool netdev_has_ip_or_hw_csum(netdev_features_t features)
+{
+ netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+ bool ip_csum = (features & ip_csum_mask) == ip_csum_mask;
+ bool hw_csum = features & NETIF_F_HW_CSUM;
+
+ return ip_csum || hw_csum;
+}
+
static netdev_features_t netdev_fix_features(struct net_device *dev,
netdev_features_t features)
{
@@ -9800,15 +10493,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~NETIF_F_LRO;
}
- if (features & NETIF_F_HW_TLS_TX) {
- bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
- (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
- bool hw_csum = features & NETIF_F_HW_CSUM;
-
- if (!ip_csum && !hw_csum) {
- netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
- features &= ~NETIF_F_HW_TLS_TX;
- }
+ if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) {
+ netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_TX;
}
if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
@@ -9816,6 +10503,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~NETIF_F_HW_TLS_RX;
}
+ if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) {
+ netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
+ features &= ~NETIF_F_GSO_UDP_L4;
+ }
+
return features;
}
@@ -10120,6 +10812,17 @@ static void netdev_do_free_pcpu_stats(struct net_device *dev)
}
}
+static void netdev_free_phy_link_topology(struct net_device *dev)
+{
+ struct phy_link_topology *topo = dev->link_topo;
+
+ if (IS_ENABLED(CONFIG_PHYLIB) && topo) {
+ xa_destroy(&topo->phys);
+ kfree(topo);
+ dev->link_topo = NULL;
+ }
+}
+
/**
* register_netdevice() - register a network device
* @dev: device to register
@@ -10149,6 +10852,10 @@ int register_netdevice(struct net_device *dev)
if (ret)
return ret;
+ /* rss ctx ID 0 is reserved for the default context, start from 1 */
+ xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1);
+ mutex_init(&dev->ethtool->rss_lock);
+
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
@@ -10237,9 +10944,11 @@ int register_netdevice(struct net_device *dev)
goto err_ifindex_release;
ret = netdev_register_kobject(dev);
- write_lock(&dev_base_lock);
- dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
- write_unlock(&dev_base_lock);
+
+ netdev_lock(dev);
+ WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
+ netdev_unlock(dev);
+
if (ret)
goto err_uninit_notify;
@@ -10305,49 +11014,26 @@ err_free_name:
}
EXPORT_SYMBOL(register_netdevice);
-/**
- * init_dummy_netdev - init a dummy network device for NAPI
- * @dev: device to init
- *
- * This takes a network device structure and initialize the minimum
- * amount of fields so it can be used to schedule NAPI polls without
- * registering a full blown interface. This is to be used by drivers
- * that need to tie several hardware interfaces to a single NAPI
- * poll scheduler due to HW limitations.
+/* Initialize the core of a dummy net device.
+ * The setup steps dummy netdevs need which normal netdevs get by going
+ * through register_netdevice().
*/
-int init_dummy_netdev(struct net_device *dev)
+static void init_dummy_netdev(struct net_device *dev)
{
- /* Clear everything. Note we don't initialize spinlocks
- * are they aren't supposed to be taken by any of the
- * NAPI code and this dummy netdev is supposed to be
- * only ever used for NAPI polls
- */
- memset(dev, 0, sizeof(struct net_device));
-
/* make sure we BUG if trying to hit standard
* register/unregister code path
*/
dev->reg_state = NETREG_DUMMY;
- /* NAPI wants this */
- INIT_LIST_HEAD(&dev->napi_list);
-
/* a dummy interface is started by default */
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
- /* napi_busy_loop stats accounting wants this */
- dev_net_set(dev, &init_net);
-
/* Note : We dont allocate pcpu_refcnt for dummy devices,
* because users of this 'device' dont need to change
* its refcount.
*/
-
- return 0;
}
-EXPORT_SYMBOL_GPL(init_dummy_netdev);
-
/**
* register_netdev - register a network device
@@ -10364,12 +11050,16 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev);
*/
int register_netdev(struct net_device *dev)
{
+ struct net *net = dev_net(dev);
int err;
- if (rtnl_lock_killable())
+ if (rtnl_net_lock_killable(net))
return -EINTR;
+
err = register_netdevice(dev);
- rtnl_unlock();
+
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdev);
@@ -10446,8 +11136,9 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
rebroadcast_time = jiffies;
}
+ rcu_barrier();
+
if (!wait) {
- rcu_barrier();
wait = WAIT_REFS_MIN_MSECS;
} else {
msleep(wait);
@@ -10499,6 +11190,7 @@ void netdev_run_todo(void)
{
struct net_device *dev, *tmp;
struct list_head list;
+ int cnt;
#ifdef CONFIG_LOCKDEP
struct list_head unlink_list;
@@ -10529,12 +11221,13 @@ void netdev_run_todo(void)
continue;
}
- write_lock(&dev_base_lock);
- dev->reg_state = NETREG_UNREGISTERED;
- write_unlock(&dev_base_lock);
+ netdev_lock(dev);
+ WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
+ netdev_unlock(dev);
linkwatch_sync_dev(dev);
}
+ cnt = 0;
while (!list_empty(&list)) {
dev = netdev_wait_allrefs_any(&list);
list_del(&dev->todo_list);
@@ -10552,12 +11245,61 @@ void netdev_run_todo(void)
if (dev->needs_free_netdev)
free_netdev(dev);
- if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
- wake_up(&netdev_unregistering_wq);
+ cnt++;
/* Free network device */
kobject_put(&dev->dev.kobj);
}
+ if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
+ wake_up(&netdev_unregistering_wq);
+}
+
+/* Collate per-cpu network dstats statistics
+ *
+ * Read per-cpu network statistics from dev->dstats and populate the related
+ * fields in @s.
+ */
+static void dev_fetch_dstats(struct rtnl_link_stats64 *s,
+ const struct pcpu_dstats __percpu *dstats)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ u64 rx_packets, rx_bytes, rx_drops;
+ u64 tx_packets, tx_bytes, tx_drops;
+ const struct pcpu_dstats *stats;
+ unsigned int start;
+
+ stats = per_cpu_ptr(dstats, cpu);
+ do {
+ start = u64_stats_fetch_begin(&stats->syncp);
+ rx_packets = u64_stats_read(&stats->rx_packets);
+ rx_bytes = u64_stats_read(&stats->rx_bytes);
+ rx_drops = u64_stats_read(&stats->rx_drops);
+ tx_packets = u64_stats_read(&stats->tx_packets);
+ tx_bytes = u64_stats_read(&stats->tx_bytes);
+ tx_drops = u64_stats_read(&stats->tx_drops);
+ } while (u64_stats_fetch_retry(&stats->syncp, start));
+
+ s->rx_packets += rx_packets;
+ s->rx_bytes += rx_bytes;
+ s->rx_dropped += rx_drops;
+ s->tx_packets += tx_packets;
+ s->tx_bytes += tx_bytes;
+ s->tx_dropped += tx_drops;
+ }
+}
+
+/* ndo_get_stats64 implementation for dtstats-based accounting.
+ *
+ * Populate @s from dev->stats and dev->dstats. This is used internally by the
+ * core for NETDEV_PCPU_STAT_DSTAT-type stats collection.
+ */
+static void dev_get_dstats64(const struct net_device *dev,
+ struct rtnl_link_stats64 *s)
+{
+ netdev_stats_to_stats64(s, &dev->stats);
+ dev_fetch_dstats(s, dev->dstats);
}
/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
@@ -10608,7 +11350,7 @@ noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
return;
}
- field = (__force unsigned long __percpu *)((__force void *)p + offset);
+ field = (unsigned long __percpu *)((void __percpu *)p + offset);
this_cpu_inc(*field);
}
EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
@@ -10629,11 +11371,29 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
const struct net_device_ops *ops = dev->netdev_ops;
const struct net_device_core_stats __percpu *p;
+ /*
+ * IPv{4,6} and udp tunnels share common stat helpers and use
+ * different stat type (NETDEV_PCPU_STAT_TSTATS vs
+ * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent.
+ */
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
+ offsetof(struct pcpu_dstats, rx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
+ offsetof(struct pcpu_dstats, rx_packets));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
+ offsetof(struct pcpu_dstats, tx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
+ offsetof(struct pcpu_dstats, tx_packets));
+
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
ops->ndo_get_stats64(dev, storage);
} else if (ops->ndo_get_stats) {
netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
+ } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
+ dev_get_tstats64(dev, storage);
+ } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
+ dev_get_dstats64(dev, storage);
} else {
netdev_stats_to_stats64(storage, &dev->stats);
}
@@ -10745,19 +11505,12 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
WARN_ON(dev->reg_state == NETREG_REGISTERED);
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
- dev->gro_flush_timeout = 20000;
- dev->napi_defer_hard_irqs = 1;
+ netdev_set_gro_flush_timeout(dev, 20000);
+ netdev_set_defer_hard_irqs(dev, 1);
}
}
EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
-void netdev_freemem(struct net_device *dev)
-{
- char *addr = (char *)dev - dev->padded;
-
- kvfree(addr);
-}
-
/**
* alloc_netdev_mqs - allocate network device
* @sizeof_priv: size of private data to allocate space for
@@ -10777,8 +11530,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
- unsigned int alloc_size;
- struct net_device *p;
+ size_t napi_config_sz;
+ unsigned int maxqs;
BUG_ON(strlen(name) >= sizeof(dev->name));
@@ -10792,21 +11545,14 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
return NULL;
}
- alloc_size = sizeof(struct net_device);
- if (sizeof_priv) {
- /* ensure 32-byte alignment of private area */
- alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
- alloc_size += sizeof_priv;
- }
- /* ensure 32-byte alignment of whole construct */
- alloc_size += NETDEV_ALIGN - 1;
+ maxqs = max(txqs, rxqs);
- p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
- if (!p)
+ dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
+ GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
+ if (!dev)
return NULL;
- dev = PTR_ALIGN(p, NETDEV_ALIGN);
- dev->padded = (char *)dev - (char *)p;
+ dev->priv_len = sizeof_priv;
ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
#ifdef CONFIG_PCPU_DEV_REFCNT
@@ -10853,6 +11599,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
+
+ mutex_init(&dev->lock);
+
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
@@ -10870,8 +11619,21 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev))
goto free_all;
+ dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT);
+ if (!dev->ethtool)
+ goto free_all;
- strcpy(dev->name, name);
+ dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT);
+ if (!dev->cfg)
+ goto free_all;
+ dev->cfg_pending = dev->cfg;
+
+ napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
+ dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
+ if (!dev->napi_config)
+ goto free_all;
+
+ strscpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
@@ -10890,11 +11652,27 @@ free_pcpu:
free_percpu(dev->pcpu_refcnt);
free_dev:
#endif
- netdev_freemem(dev);
+ kvfree(dev);
return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
+static void netdev_napi_exit(struct net_device *dev)
+{
+ if (!list_empty(&dev->napi_list)) {
+ struct napi_struct *p, *n;
+
+ netdev_lock(dev);
+ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+ __netif_napi_del_locked(p);
+ netdev_unlock(dev);
+
+ synchronize_net();
+ }
+
+ kvfree(dev->napi_config);
+}
+
/**
* free_netdev - free network device
* @dev: device
@@ -10906,8 +11684,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
*/
void free_netdev(struct net_device *dev)
{
- struct napi_struct *p, *n;
-
might_sleep();
/* When called immediately after register_netdevice() failed the unwind
@@ -10920,6 +11696,9 @@ void free_netdev(struct net_device *dev)
return;
}
+ WARN_ON(dev->cfg != dev->cfg_pending);
+ kfree(dev->cfg);
+ kfree(dev->ethtool);
netif_free_tx_queues(dev);
netif_free_rx_queues(dev);
@@ -10928,8 +11707,7 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
- list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
- netif_napi_del(p);
+ netdev_napi_exit(dev);
ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
@@ -10941,14 +11719,19 @@ void free_netdev(struct net_device *dev)
free_percpu(dev->xdp_bulkq);
dev->xdp_bulkq = NULL;
+ netdev_free_phy_link_topology(dev);
+
+ mutex_destroy(&dev->lock);
+
/* Compatibility with error handling in drivers */
- if (dev->reg_state == NETREG_UNINITIALIZED) {
- netdev_freemem(dev);
+ if (dev->reg_state == NETREG_UNINITIALIZED ||
+ dev->reg_state == NETREG_DUMMY) {
+ kvfree(dev);
return;
}
BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
- dev->reg_state = NETREG_RELEASED;
+ WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
/* will free via device release */
put_device(&dev->dev);
@@ -10956,6 +11739,19 @@ void free_netdev(struct net_device *dev)
EXPORT_SYMBOL(free_netdev);
/**
+ * alloc_netdev_dummy - Allocate and initialize a dummy net device.
+ * @sizeof_priv: size of private data to allocate space for
+ *
+ * Return: the allocated net_device on success, NULL otherwise
+ */
+struct net_device *alloc_netdev_dummy(int sizeof_priv)
+{
+ return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
+ init_dummy_netdev);
+}
+EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
+
+/**
* synchronize_net - Synchronize with packet receive processing
*
* Wait for packets currently being received to be done.
@@ -10964,13 +11760,41 @@ EXPORT_SYMBOL(free_netdev);
void synchronize_net(void)
{
might_sleep();
- if (rtnl_is_locked())
+ if (from_cleanup_net() || rtnl_is_locked())
synchronize_rcu_expedited();
else
synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
+static void netdev_rss_contexts_free(struct net_device *dev)
+{
+ struct ethtool_rxfh_context *ctx;
+ unsigned long context;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
+ struct ethtool_rxfh_param rxfh;
+
+ rxfh.indir = ethtool_rxfh_context_indir(ctx);
+ rxfh.key = ethtool_rxfh_context_key(ctx);
+ rxfh.hfunc = ctx->hfunc;
+ rxfh.input_xfrm = ctx->input_xfrm;
+ rxfh.rss_context = context;
+ rxfh.rss_delete = true;
+
+ xa_erase(&dev->ethtool->rss_ctx, context);
+ if (dev->ethtool_ops->create_rxfh_context)
+ dev->ethtool_ops->remove_rxfh_context(dev, ctx,
+ context, NULL);
+ else
+ dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL);
+ kfree(ctx);
+ }
+ xa_destroy(&dev->ethtool->rss_ctx);
+ mutex_unlock(&dev->ethtool->rss_lock);
+}
+
/**
* unregister_netdevice_queue - remove device from the kernel
* @dev: device
@@ -11004,6 +11828,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
{
struct net_device *dev, *tmp;
LIST_HEAD(close_head);
+ int cnt = 0;
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
@@ -11035,10 +11860,10 @@ void unregister_netdevice_many_notify(struct list_head *head,
list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
- write_lock(&dev_base_lock);
- unlist_netdevice(dev, false);
- dev->reg_state = NETREG_UNREGISTERING;
- write_unlock(&dev_base_lock);
+ unlist_netdevice(dev);
+ netdev_lock(dev);
+ WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
+ netdev_unlock(dev);
}
flush_all_backlogs();
@@ -11052,6 +11877,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
dev_tcx_uninstall(dev);
dev_xdp_uninstall(dev);
bpf_dev_bound_netdev_unregister(dev);
+ dev_dmabuf_uninstall(dev);
netdev_offload_xstats_disable_all(dev);
@@ -11075,11 +11901,17 @@ void unregister_netdevice_many_notify(struct list_head *head,
netdev_name_node_alt_flush(dev);
netdev_name_node_free(dev->name_node);
+ netdev_rss_contexts_free(dev);
+
call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
+ mutex_destroy(&dev->ethtool->rss_lock);
+
+ net_shaper_flush_netdev(dev);
+
if (skb)
rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
@@ -11100,7 +11932,9 @@ void unregister_netdevice_many_notify(struct list_head *head,
list_for_each_entry(dev, head, unreg_list) {
netdev_put(dev, &dev->dev_registered_tracker);
net_set_todo(dev);
+ cnt++;
}
+ atomic_add(cnt, &dev_unreg_count);
list_del(head);
}
@@ -11110,7 +11944,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
* @head: list of devices
*
* Note: As most callers use a stack allocated list_head,
- * we force a list_del() to make sure stack wont be corrupted later.
+ * we force a list_del() to make sure stack won't be corrupted later.
*/
void unregister_netdevice_many(struct list_head *head)
{
@@ -11131,9 +11965,9 @@ EXPORT_SYMBOL(unregister_netdevice_many);
*/
void unregister_netdev(struct net_device *dev)
{
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
unregister_netdevice(dev);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
}
EXPORT_SYMBOL(unregister_netdev);
@@ -11165,10 +11999,10 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
/* Don't allow namespace local devices to be moved. */
err = -EINVAL;
- if (dev->features & NETIF_F_NETNS_LOCAL)
+ if (dev->netns_local)
goto out;
- /* Ensure the device has been registrered */
+ /* Ensure the device has been registered */
if (dev->reg_state != NETREG_REGISTERED)
goto out;
@@ -11218,7 +12052,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
dev_close(dev);
/* And unlink it from device chain */
- unlist_netdevice(dev, true);
+ unlist_netdevice(dev);
synchronize_net();
@@ -11257,8 +12091,12 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
dev_net_set(dev, net);
dev->ifindex = new_ifindex;
- if (new_name[0]) /* Rename the netdev to prepared name */
+ if (new_name[0]) {
+ /* Rename the netdev to prepared name */
+ write_seqlock_bh(&netdev_rename_lock);
strscpy(dev->name, new_name, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
+ }
/* Fixup kobjects */
dev_set_uevent_suppress(&dev->dev, 1);
@@ -11333,7 +12171,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
list_del_init(&napi->poll_list);
if (napi->poll == process_backlog)
- napi->state = 0;
+ napi->state &= NAPIF_STATE_THREADED;
else
____napi_schedule(sd, napi);
}
@@ -11341,21 +12179,23 @@ static int dev_cpu_dead(unsigned int oldcpu)
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
+ if (!use_backlog_threads()) {
#ifdef CONFIG_RPS
- remsd = oldsd->rps_ipi_list;
- oldsd->rps_ipi_list = NULL;
+ remsd = oldsd->rps_ipi_list;
+ oldsd->rps_ipi_list = NULL;
#endif
- /* send out pending IPI's on offline CPU */
- net_rps_send_ipi(remsd);
+ /* send out pending IPI's on offline CPU */
+ net_rps_send_ipi(remsd);
+ }
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
netif_rx(skb);
- input_queue_head_incr(oldsd);
+ rps_input_queue_head_incr(oldsd);
}
while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx(skb);
- input_queue_head_incr(oldsd);
+ rps_input_queue_head_incr(oldsd);
}
return 0;
@@ -11541,7 +12381,7 @@ static void __net_exit default_device_exit_net(struct net *net)
char fb_name[IFNAMSIZ];
/* Ignore unmoveable devices (i.e. loopback) */
- if (dev->features & NETIF_F_NETNS_LOCAL)
+ if (dev->netns_local)
continue;
/* Leave virtual devices for the generic cleanup */
@@ -11554,11 +12394,8 @@ static void __net_exit default_device_exit_net(struct net *net)
snprintf(fb_name, IFNAMSIZ, "dev%%d");
netdev_for_each_altname_safe(dev, name_node, tmp)
- if (netdev_name_in_use(&init_net, name_node->name)) {
- netdev_name_node_del(name_node);
- synchronize_rcu();
+ if (netdev_name_in_use(&init_net, name_node->name))
__netdev_name_node_alt_destroy(name_node);
- }
err = dev_change_net_namespace(dev, &init_net, fb_name);
if (err) {
@@ -11605,7 +12442,7 @@ static struct pernet_operations __net_initdata default_device_ops = {
static void __init net_dev_struct_check(void)
{
/* TX read-mostly hotpath */
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
@@ -11631,19 +12468,18 @@ static void __init net_dev_struct_check(void)
/* TXRX read-mostly hotpath */
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
- CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 38);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
/* RX read-mostly hotpath */
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
@@ -11655,7 +12491,7 @@ static void __init net_dev_struct_check(void)
#ifdef CONFIG_NET_XGRESS
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
#endif
- CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
}
/*
@@ -11665,6 +12501,67 @@ static void __init net_dev_struct_check(void)
*
*/
+/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
+#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE)
+
+static int net_page_pool_create(int cpuid)
+{
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+ struct page_pool_params page_pool_params = {
+ .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
+ .flags = PP_FLAG_SYSTEM_POOL,
+ .nid = cpu_to_mem(cpuid),
+ };
+ struct page_pool *pp_ptr;
+ int err;
+
+ pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
+ if (IS_ERR(pp_ptr))
+ return -ENOMEM;
+
+ err = xdp_reg_page_pool(pp_ptr);
+ if (err) {
+ page_pool_destroy(pp_ptr);
+ return err;
+ }
+
+ per_cpu(system_page_pool, cpuid) = pp_ptr;
+#endif
+ return 0;
+}
+
+static int backlog_napi_should_run(unsigned int cpu)
+{
+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
+ struct napi_struct *napi = &sd->backlog;
+
+ return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+}
+
+static void run_backlog_napi(unsigned int cpu)
+{
+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
+
+ napi_threaded_poll_loop(&sd->backlog);
+}
+
+static void backlog_napi_setup(unsigned int cpu)
+{
+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
+ struct napi_struct *napi = &sd->backlog;
+
+ napi->thread = this_cpu_read(backlog_napi);
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+}
+
+static struct smp_hotplug_thread backlog_threads = {
+ .store = &backlog_napi,
+ .thread_should_run = backlog_napi_should_run,
+ .thread_fn = run_backlog_napi,
+ .thread_comm = "backlog_napi/%u",
+ .setup = backlog_napi_setup,
+};
+
/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
@@ -11683,7 +12580,6 @@ static int __init net_dev_init(void)
if (netdev_kobject_init())
goto out;
- INIT_LIST_HEAD(&ptype_all);
for (i = 0; i < PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&ptype_base[i]);
@@ -11694,12 +12590,13 @@ static int __init net_dev_init(void)
* Initialise the packet receive queues.
*/
+ flush_backlogs_fallback = flush_backlogs_alloc();
+ if (!flush_backlogs_fallback)
+ goto out;
+
for_each_possible_cpu(i) {
- struct work_struct *flush = per_cpu_ptr(&flush_works, i);
struct softnet_data *sd = &per_cpu(softnet_data, i);
- INIT_WORK(flush, flush_backlog);
-
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
@@ -11717,7 +12614,13 @@ static int __init net_dev_init(void)
init_gro_hash(&sd->backlog);
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
+ INIT_LIST_HEAD(&sd->backlog.poll_list);
+
+ if (net_page_pool_create(i))
+ goto out;
}
+ if (use_backlog_threads())
+ smpboot_register_percpu_thread(&backlog_threads);
dev_boot_phase = 0;
@@ -11743,7 +12646,25 @@ static int __init net_dev_init(void)
NULL, dev_cpu_dead);
WARN_ON(rc < 0);
rc = 0;
+
+ /* avoid static key IPIs to isolated CPUs */
+ if (housekeeping_enabled(HK_TYPE_MISC))
+ net_enable_timestamp();
out:
+ if (rc < 0) {
+ for_each_possible_cpu(i) {
+ struct page_pool *pp_ptr;
+
+ pp_ptr = per_cpu(system_page_pool, i);
+ if (!pp_ptr)
+ continue;
+
+ xdp_unreg_page_pool(pp_ptr);
+ page_pool_destroy(pp_ptr);
+ per_cpu(system_page_pool, i) = NULL;
+ }
+ }
+
return rc;
}
diff --git a/net/core/dev.h b/net/core/dev.h
index 7480b4c84298..a5b166bbd169 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -2,12 +2,12 @@
#ifndef _NET_CORE_DEV_H
#define _NET_CORE_DEV_H
+#include <linux/cleanup.h>
#include <linux/types.h>
+#include <linux/rwsem.h>
+#include <linux/netdevice.h>
struct net;
-struct net_device;
-struct netdev_bpf;
-struct netdev_phys_item_id;
struct netlink_ext_ack;
struct cpumask;
@@ -23,6 +23,23 @@ struct sd_flow_limit {
extern int netdev_flow_limit_table_len;
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
+struct net_device *dev_get_by_napi_id(unsigned int napi_id);
+
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex);
+struct net_device *__netdev_put_lock(struct net_device *dev);
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index);
+
+DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T));
+
+#define for_each_netdev_lock_scoped(net, var_name, ifindex) \
+ for (struct net_device *var_name __free(netdev_unlock) = NULL; \
+ (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \
+ ifindex++)
+
#ifdef CONFIG_PROC_FS
int __init dev_proc_init(void);
#else
@@ -36,16 +53,24 @@ void dev_addr_flush(struct net_device *dev);
int dev_addr_init(struct net_device *dev);
void dev_addr_check(struct net_device *dev);
+#if IS_ENABLED(CONFIG_NET_SHAPER)
+void net_shaper_flush_netdev(struct net_device *dev);
+void net_shaper_set_real_num_tx_queues(struct net_device *dev,
+ unsigned int txq);
+#else
+static inline void net_shaper_flush_netdev(struct net_device *dev) {}
+static inline void net_shaper_set_real_num_tx_queues(struct net_device *dev,
+ unsigned int txq) {}
+#endif
+
/* sysctls not referred to from outside net/core/ */
-extern int netdev_budget;
-extern unsigned int netdev_budget_usecs;
-extern unsigned int sysctl_skb_defer_max;
-extern int netdev_tstamp_prequeue;
extern int netdev_unregister_timeout_secs;
extern int weight_p;
extern int dev_weight_rx_bias;
extern int dev_weight_tx_bias;
+extern struct rw_semaphore dev_addr_sem;
+
/* rtnl helpers */
extern struct list_head net_todo_list;
void netdev_run_todo(void);
@@ -56,6 +81,7 @@ struct netdev_name_node {
struct list_head list;
struct net_device *dev;
const char *name;
+ struct rcu_head rcu;
};
int netdev_get_name(struct net *net, char *name, int ifindex);
@@ -101,6 +127,18 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
void unregister_netdevice_many_notify(struct list_head *head,
u32 portid, const struct nlmsghdr *nlh);
+static inline void netif_set_up(struct net_device *dev, bool value)
+{
+ if (value)
+ dev->flags |= IFF_UP;
+ else
+ dev->flags &= ~IFF_UP;
+
+ netdev_lock(dev);
+ dev->up = value;
+ netdev_unlock(dev);
+}
+
static inline void netif_set_gso_max_size(struct net_device *dev,
unsigned int size)
{
@@ -140,6 +178,119 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
WRITE_ONCE(dev->gro_ipv4_max_size, size);
}
+/**
+ * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs
+ * @n: napi struct to get the defer_hard_irqs field from
+ *
+ * Return: the per-NAPI value of the defar_hard_irqs field.
+ */
+static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n)
+{
+ return READ_ONCE(n->defer_hard_irqs);
+}
+
+/**
+ * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi
+ * @n: napi_struct to set the defer_hard_irqs field
+ * @defer: the value the field should be set to
+ */
+static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
+{
+ WRITE_ONCE(n->defer_hard_irqs, defer);
+}
+
+/**
+ * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev
+ * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set
+ * @defer: the defer_hard_irqs value to set
+ */
+static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
+ u32 defer)
+{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
+ struct napi_struct *napi;
+ int i;
+
+ WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
+ list_for_each_entry(napi, &netdev->napi_list, dev_list)
+ napi_set_defer_hard_irqs(napi, defer);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].defer_hard_irqs = defer;
+}
+
+/**
+ * napi_get_gro_flush_timeout - get the gro_flush_timeout
+ * @n: napi struct to get the gro_flush_timeout from
+ *
+ * Return: the per-NAPI value of the gro_flush_timeout field.
+ */
+static inline unsigned long
+napi_get_gro_flush_timeout(const struct napi_struct *n)
+{
+ return READ_ONCE(n->gro_flush_timeout);
+}
+
+/**
+ * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi
+ * @n: napi struct to set the gro_flush_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout
+ */
+static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
+ unsigned long timeout)
+{
+ WRITE_ONCE(n->gro_flush_timeout, timeout);
+}
+
+/**
+ * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs
+ * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set
+ * @timeout: the timeout value to set
+ */
+static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
+ unsigned long timeout)
+{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
+ struct napi_struct *napi;
+ int i;
+
+ WRITE_ONCE(netdev->gro_flush_timeout, timeout);
+ list_for_each_entry(napi, &netdev->napi_list, dev_list)
+ napi_set_gro_flush_timeout(napi, timeout);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].gro_flush_timeout = timeout;
+}
+
+/**
+ * napi_get_irq_suspend_timeout - get the irq_suspend_timeout
+ * @n: napi struct to get the irq_suspend_timeout from
+ *
+ * Return: the per-NAPI value of the irq_suspend_timeout field.
+ */
+static inline unsigned long
+napi_get_irq_suspend_timeout(const struct napi_struct *n)
+{
+ return READ_ONCE(n->irq_suspend_timeout);
+}
+
+/**
+ * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi
+ * @n: napi struct to set the irq_suspend_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout
+ */
+static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
+ unsigned long timeout)
+{
+ WRITE_ONCE(n->irq_suspend_timeout, timeout);
+}
+
int rps_cpumask_housekeeping(struct cpumask *mask);
#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
@@ -148,5 +299,48 @@ void xdp_do_check_flushed(struct napi_struct *napi);
static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
#endif
-struct napi_struct *napi_by_id(unsigned int napi_id);
+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
+
+#define XMIT_RECURSION_LIMIT 8
+
+#ifndef CONFIG_PREEMPT_RT
+static inline bool dev_xmit_recursion(void)
+{
+ return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
+ XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+ __this_cpu_inc(softnet_data.xmit.recursion);
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+ __this_cpu_dec(softnet_data.xmit.recursion);
+}
+#else
+static inline bool dev_xmit_recursion(void)
+{
+ return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+ current->net_xmit.recursion++;
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+ current->net_xmit.recursion--;
+}
+#endif
+
+int dev_set_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg,
+ struct netlink_ext_ack *extack);
+int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg);
+int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg);
+
#endif
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index baa63dee2829..90716bd736f3 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -242,9 +242,9 @@ static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list,
__hw_addr_del_entry(from_list, ha, false, false);
}
-static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
- struct netdev_hw_addr_list *from_list,
- int addr_len)
+int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
{
int err = 0;
struct netdev_hw_addr *ha, *tmp;
@@ -260,9 +260,10 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
}
return err;
}
+EXPORT_SYMBOL(__hw_addr_sync_multiple);
/* This function only works where there is a strict 1-1 relationship
- * between source and destionation of they synch. If you ever need to
+ * between source and destination of they synch. If you ever need to
* sync addresses to more then 1 destination, you need to use
* __hw_addr_sync_multiple().
*/
@@ -299,8 +300,8 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
EXPORT_SYMBOL(__hw_addr_unsync);
/**
- * __hw_addr_sync_dev - Synchonize device's multicast list
- * @list: address list to syncronize
+ * __hw_addr_sync_dev - Synchronize device's multicast list
+ * @list: address list to synchronize
* @dev: device to sync
* @sync: function to call if address should be added
* @unsync: function to call if address should be removed
diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c
index 4dbd0dc6aea2..8e1dba825e94 100644
--- a/net/core/dev_addr_lists_test.c
+++ b/net/core/dev_addr_lists_test.c
@@ -49,7 +49,6 @@ static int dev_addr_test_init(struct kunit *test)
KUNIT_FAIL(test, "Can't register netdev %d", err);
}
- rtnl_lock();
return 0;
}
@@ -57,7 +56,6 @@ static void dev_addr_test_exit(struct kunit *test)
{
struct net_device *netdev = test->priv;
- rtnl_unlock();
unregister_netdev(netdev);
free_netdev(netdev);
}
@@ -67,6 +65,7 @@ static void dev_addr_test_basic(struct kunit *test)
struct net_device *netdev = test->priv;
u8 addr[ETH_ALEN];
+ rtnl_lock();
KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr);
memset(addr, 2, sizeof(addr));
@@ -76,6 +75,7 @@ static void dev_addr_test_basic(struct kunit *test)
memset(addr, 3, sizeof(addr));
dev_addr_set(netdev, addr);
KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr));
+ rtnl_unlock();
}
static void dev_addr_test_sync_one(struct kunit *test)
@@ -86,6 +86,7 @@ static void dev_addr_test_sync_one(struct kunit *test)
datp = netdev_priv(netdev);
+ rtnl_lock();
memset(addr, 1, sizeof(addr));
eth_hw_addr_set(netdev, addr);
@@ -103,6 +104,7 @@ static void dev_addr_test_sync_one(struct kunit *test)
* considered synced and we overwrite in place.
*/
KUNIT_EXPECT_EQ(test, 0, datp->addr_seen);
+ rtnl_unlock();
}
static void dev_addr_test_add_del(struct kunit *test)
@@ -114,6 +116,7 @@ static void dev_addr_test_add_del(struct kunit *test)
datp = netdev_priv(netdev);
+ rtnl_lock();
for (i = 1; i < 4; i++) {
memset(addr, i, sizeof(addr));
KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
@@ -143,6 +146,7 @@ static void dev_addr_test_add_del(struct kunit *test)
__hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
dev_addr_test_unsync);
KUNIT_EXPECT_EQ(test, 1, datp->addr_seen);
+ rtnl_unlock();
}
static void dev_addr_test_del_main(struct kunit *test)
@@ -150,6 +154,7 @@ static void dev_addr_test_del_main(struct kunit *test)
struct net_device *netdev = test->priv;
u8 addr[ETH_ALEN];
+ rtnl_lock();
memset(addr, 1, sizeof(addr));
eth_hw_addr_set(netdev, addr);
@@ -161,6 +166,7 @@ static void dev_addr_test_del_main(struct kunit *test)
NETDEV_HW_ADDR_T_LAN));
KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr,
NETDEV_HW_ADDR_T_LAN));
+ rtnl_unlock();
}
static void dev_addr_test_add_set(struct kunit *test)
@@ -172,6 +178,7 @@ static void dev_addr_test_add_set(struct kunit *test)
datp = netdev_priv(netdev);
+ rtnl_lock();
/* There is no external API like dev_addr_add_excl(),
* so shuffle the tree a little bit and exploit aliasing.
*/
@@ -191,6 +198,7 @@ static void dev_addr_test_add_set(struct kunit *test)
__hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
dev_addr_test_unsync);
KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen);
+ rtnl_unlock();
}
static void dev_addr_test_add_excl(struct kunit *test)
@@ -199,6 +207,7 @@ static void dev_addr_test_add_excl(struct kunit *test)
u8 addr[ETH_ALEN];
int i;
+ rtnl_lock();
for (i = 0; i < 10; i++) {
memset(addr, i, sizeof(addr));
KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr));
@@ -213,6 +222,7 @@ static void dev_addr_test_add_excl(struct kunit *test)
memset(addr, i, sizeof(addr));
KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr));
}
+ rtnl_unlock();
}
static struct kunit_case dev_addr_test_cases[] = {
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 9a66cf5015f2..4c2098ac9d72 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -6,6 +6,7 @@
#include <linux/rtnetlink.h>
#include <linux/net_tstamp.h>
#include <linux/phylib_stubs.h>
+#include <linux/ptp_clock_kernel.h>
#include <linux/wireless.h>
#include <linux/if_bridge.h>
#include <net/dsa_stubs.h>
@@ -64,7 +65,7 @@ int dev_ifconf(struct net *net, struct ifconf __user *uifc)
}
/* Loop over the interfaces, and write an info block for each. */
- rtnl_lock();
+ rtnl_net_lock(net);
for_each_netdev(net, dev) {
if (!pos)
done = inet_gifconf(dev, NULL, 0, size);
@@ -72,12 +73,12 @@ int dev_ifconf(struct net *net, struct ifconf __user *uifc)
done = inet_gifconf(dev, pos + total,
len - total, size);
if (done < 0) {
- rtnl_unlock();
+ rtnl_net_unlock(net);
return -EFAULT;
}
total += done;
}
- rtnl_unlock();
+ rtnl_net_unlock(net);
return put_user(total, &uifc->ifc_len);
}
@@ -184,7 +185,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
return err;
}
-static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
+int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
{
enum hwtstamp_tx_types tx_type;
enum hwtstamp_rx_filters rx_filter;
@@ -259,19 +260,32 @@ static int dev_eth_ioctl(struct net_device *dev,
* @dev: Network device
* @cfg: Timestamping configuration structure
*
- * Helper for enforcing a common policy that phylib timestamping, if available,
- * should take precedence in front of hardware timestamping provided by the
- * netdev.
+ * Helper for calling the default hardware provider timestamping.
*
* Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and
* there only exists a phydev->mii_ts->hwtstamp() method. So this will return
* -EOPNOTSUPP for phylib for now, which is still more accurate than letting
* the netdev handle the GET request.
*/
-static int dev_get_hwtstamp_phylib(struct net_device *dev,
- struct kernel_hwtstamp_config *cfg)
+int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg)
{
- if (phy_has_hwtstamp(dev->phydev))
+ struct hwtstamp_provider *hwprov;
+
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ cfg->qualifier = hwprov->desc.qualifier;
+ if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB &&
+ hwprov->phydev)
+ return phy_hwtstamp_get(hwprov->phydev, cfg);
+
+ if (hwprov->source == HWTSTAMP_SOURCE_NETDEV)
+ return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
+
+ return -EOPNOTSUPP;
+ }
+
+ if (phy_is_default_hwtstamp(dev->phydev))
return phy_hwtstamp_get(dev->phydev, cfg);
return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
@@ -319,28 +333,48 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
* should take precedence in front of hardware timestamping provided by the
* netdev. If the netdev driver needs to perform specific actions even for PHY
* timestamping to work properly (a switch port must trap the timestamped
- * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in
- * dev->priv_flags.
+ * frames and not forward them), it must set dev->see_all_hwtstamp_requests.
*/
int dev_set_hwtstamp_phylib(struct net_device *dev,
struct kernel_hwtstamp_config *cfg,
struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
- bool phy_ts = phy_has_hwtstamp(dev->phydev);
struct kernel_hwtstamp_config old_cfg = {};
+ struct hwtstamp_provider *hwprov;
+ struct phy_device *phydev;
bool changed = false;
+ bool phy_ts;
int err;
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB &&
+ hwprov->phydev) {
+ phy_ts = true;
+ phydev = hwprov->phydev;
+ } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) {
+ phy_ts = false;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ cfg->qualifier = hwprov->desc.qualifier;
+ } else {
+ phy_ts = phy_is_default_hwtstamp(dev->phydev);
+ if (phy_ts)
+ phydev = dev->phydev;
+ }
+
cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV;
- if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) {
+ if (phy_ts && dev->see_all_hwtstamp_requests) {
err = ops->ndo_hwtstamp_get(dev, &old_cfg);
if (err)
return err;
}
- if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) {
+ if (!phy_ts || dev->see_all_hwtstamp_requests) {
err = ops->ndo_hwtstamp_set(dev, cfg, extack);
if (err) {
if (extack->_msg)
@@ -349,11 +383,11 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
}
}
- if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS))
+ if (phy_ts && dev->see_all_hwtstamp_requests)
changed = kernel_hwtstamp_config_changed(&old_cfg, cfg);
if (phy_ts) {
- err = phy_hwtstamp_set(dev->phydev, cfg, extack);
+ err = phy_hwtstamp_set(phydev, cfg, extack);
if (err) {
if (changed)
ops->ndo_hwtstamp_set(dev, &old_cfg, NULL);
@@ -363,7 +397,6 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
return 0;
}
-EXPORT_SYMBOL_GPL(dev_set_hwtstamp_phylib);
static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
{
@@ -510,7 +543,7 @@ static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs)
}
/*
- * Perform the SIOCxIFxxx calls, inside rtnl_lock()
+ * Perform the SIOCxIFxxx calls, inside rtnl_net_lock()
*/
static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
unsigned int cmd)
@@ -587,11 +620,14 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return -ENODEV;
if (!netif_is_bridge_master(dev))
return -EOPNOTSUPP;
+
netdev_hold(dev, &dev_tracker, GFP_KERNEL);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL);
+
netdev_put(dev, &dev_tracker);
- rtnl_lock();
+ rtnl_net_lock(net);
return err;
case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15:
@@ -737,9 +773,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
dev_load(net, ifr->ifr_name);
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- rtnl_lock();
+
+ rtnl_net_lock(net);
ret = dev_ifsioc(net, ifr, data, cmd);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
if (colon)
*colon = ':';
return ret;
@@ -783,9 +821,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
case SIOCBONDSLAVEINFOQUERY:
case SIOCBONDINFOQUERY:
dev_load(net, ifr->ifr_name);
- rtnl_lock();
+
+ rtnl_net_lock(net);
ret = dev_ifsioc(net, ifr, data, cmd);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
if (need_copyout)
*need_copyout = false;
return ret;
@@ -808,9 +848,10 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
(cmd >= SIOCDEVPRIVATE &&
cmd <= SIOCDEVPRIVATE + 15)) {
dev_load(net, ifr->ifr_name);
- rtnl_lock();
+
+ rtnl_net_lock(net);
ret = dev_ifsioc(net, ifr, data, cmd);
- rtnl_unlock();
+ rtnl_net_unlock(net);
return ret;
}
return -ENOTTY;
diff --git a/net/core/devmem.c b/net/core/devmem.c
new file mode 100644
index 000000000000..0e5a2c672efd
--- /dev/null
+++ b/net/core/devmem.c
@@ -0,0 +1,402 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Devmem TCP
+ *
+ * Authors: Mina Almasry <almasrymina@google.com>
+ * Willem de Bruijn <willemdebruijn.kernel@gmail.com>
+ * Kaiyuan Zhang <kaiyuanz@google.com
+ */
+
+#include <linux/dma-buf.h>
+#include <linux/ethtool_netlink.h>
+#include <linux/genalloc.h>
+#include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/helpers.h>
+#include <trace/events/page_pool.h>
+
+#include "devmem.h"
+#include "mp_dmabuf_devmem.h"
+#include "page_pool_priv.h"
+
+/* Device memory support */
+
+/* Protected by rtnl_lock() */
+static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
+
+static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
+ struct gen_pool_chunk *chunk,
+ void *not_used)
+{
+ struct dmabuf_genpool_chunk_owner *owner = chunk->owner;
+
+ kvfree(owner->niovs);
+ kfree(owner);
+}
+
+static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
+{
+ struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov);
+
+ return owner->base_dma_addr +
+ ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
+}
+
+void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
+{
+ size_t size, avail;
+
+ gen_pool_for_each_chunk(binding->chunk_pool,
+ net_devmem_dmabuf_free_chunk_owner, NULL);
+
+ size = gen_pool_size(binding->chunk_pool);
+ avail = gen_pool_avail(binding->chunk_pool);
+
+ if (!WARN(size != avail, "can't destroy genpool. size=%zu, avail=%zu",
+ size, avail))
+ gen_pool_destroy(binding->chunk_pool);
+
+ dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
+ DMA_FROM_DEVICE);
+ dma_buf_detach(binding->dmabuf, binding->attachment);
+ dma_buf_put(binding->dmabuf);
+ xa_destroy(&binding->bound_rxqs);
+ kfree(binding);
+}
+
+struct net_iov *
+net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+ struct dmabuf_genpool_chunk_owner *owner;
+ unsigned long dma_addr;
+ struct net_iov *niov;
+ ssize_t offset;
+ ssize_t index;
+
+ dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE,
+ (void **)&owner);
+ if (!dma_addr)
+ return NULL;
+
+ offset = dma_addr - owner->base_dma_addr;
+ index = offset / PAGE_SIZE;
+ niov = &owner->niovs[index];
+
+ niov->pp_magic = 0;
+ niov->pp = NULL;
+ atomic_long_set(&niov->pp_ref_count, 0);
+
+ return niov;
+}
+
+void net_devmem_free_dmabuf(struct net_iov *niov)
+{
+ struct net_devmem_dmabuf_binding *binding = net_iov_binding(niov);
+ unsigned long dma_addr = net_devmem_get_dma_addr(niov);
+
+ if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
+ PAGE_SIZE)))
+ return;
+
+ gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE);
+}
+
+void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+ struct netdev_rx_queue *rxq;
+ unsigned long xa_idx;
+ unsigned int rxq_idx;
+ int err;
+
+ if (binding->list.next)
+ list_del(&binding->list);
+
+ xa_for_each(&binding->bound_rxqs, xa_idx, rxq) {
+ WARN_ON(rxq->mp_params.mp_priv != binding);
+
+ rxq->mp_params.mp_priv = NULL;
+
+ rxq_idx = get_netdev_rx_queue_index(rxq);
+
+ err = netdev_rx_queue_restart(binding->dev, rxq_idx);
+ WARN_ON(err && err != -ENETDOWN);
+ }
+
+ xa_erase(&net_devmem_dmabuf_bindings, binding->id);
+
+ net_devmem_dmabuf_binding_put(binding);
+}
+
+int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+ struct net_devmem_dmabuf_binding *binding,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_rx_queue *rxq;
+ u32 xa_idx;
+ int err;
+
+ if (rxq_idx >= dev->real_num_rx_queues) {
+ NL_SET_ERR_MSG(extack, "rx queue index out of range");
+ return -ERANGE;
+ }
+
+ if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
+ NL_SET_ERR_MSG(extack, "tcp-data-split is disabled");
+ return -EINVAL;
+ }
+
+ if (dev->cfg->hds_thresh) {
+ NL_SET_ERR_MSG(extack, "hds-thresh is not zero");
+ return -EINVAL;
+ }
+
+ rxq = __netif_get_rx_queue(dev, rxq_idx);
+ if (rxq->mp_params.mp_priv) {
+ NL_SET_ERR_MSG(extack, "designated queue already memory provider bound");
+ return -EEXIST;
+ }
+
+#ifdef CONFIG_XDP_SOCKETS
+ if (rxq->pool) {
+ NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP");
+ return -EBUSY;
+ }
+#endif
+
+ err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b,
+ GFP_KERNEL);
+ if (err)
+ return err;
+
+ rxq->mp_params.mp_priv = binding;
+
+ err = netdev_rx_queue_restart(dev, rxq_idx);
+ if (err)
+ goto err_xa_erase;
+
+ return 0;
+
+err_xa_erase:
+ rxq->mp_params.mp_priv = NULL;
+ xa_erase(&binding->bound_rxqs, xa_idx);
+
+ return err;
+}
+
+struct net_devmem_dmabuf_binding *
+net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+ struct netlink_ext_ack *extack)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ static u32 id_alloc_next;
+ struct scatterlist *sg;
+ struct dma_buf *dmabuf;
+ unsigned int sg_idx, i;
+ unsigned long virtual;
+ int err;
+
+ dmabuf = dma_buf_get(dmabuf_fd);
+ if (IS_ERR(dmabuf))
+ return ERR_CAST(dmabuf);
+
+ binding = kzalloc_node(sizeof(*binding), GFP_KERNEL,
+ dev_to_node(&dev->dev));
+ if (!binding) {
+ err = -ENOMEM;
+ goto err_put_dmabuf;
+ }
+
+ binding->dev = dev;
+
+ err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
+ binding, xa_limit_32b, &id_alloc_next,
+ GFP_KERNEL);
+ if (err < 0)
+ goto err_free_binding;
+
+ xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
+
+ refcount_set(&binding->ref, 1);
+
+ binding->dmabuf = dmabuf;
+
+ binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent);
+ if (IS_ERR(binding->attachment)) {
+ err = PTR_ERR(binding->attachment);
+ NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
+ goto err_free_id;
+ }
+
+ binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
+ DMA_FROM_DEVICE);
+ if (IS_ERR(binding->sgt)) {
+ err = PTR_ERR(binding->sgt);
+ NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
+ goto err_detach;
+ }
+
+ /* For simplicity we expect to make PAGE_SIZE allocations, but the
+ * binding can be much more flexible than that. We may be able to
+ * allocate MTU sized chunks here. Leave that for future work...
+ */
+ binding->chunk_pool =
+ gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev));
+ if (!binding->chunk_pool) {
+ err = -ENOMEM;
+ goto err_unmap;
+ }
+
+ virtual = 0;
+ for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) {
+ dma_addr_t dma_addr = sg_dma_address(sg);
+ struct dmabuf_genpool_chunk_owner *owner;
+ size_t len = sg_dma_len(sg);
+ struct net_iov *niov;
+
+ owner = kzalloc_node(sizeof(*owner), GFP_KERNEL,
+ dev_to_node(&dev->dev));
+ if (!owner) {
+ err = -ENOMEM;
+ goto err_free_chunks;
+ }
+
+ owner->base_virtual = virtual;
+ owner->base_dma_addr = dma_addr;
+ owner->num_niovs = len / PAGE_SIZE;
+ owner->binding = binding;
+
+ err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
+ dma_addr, len, dev_to_node(&dev->dev),
+ owner);
+ if (err) {
+ kfree(owner);
+ err = -EINVAL;
+ goto err_free_chunks;
+ }
+
+ owner->niovs = kvmalloc_array(owner->num_niovs,
+ sizeof(*owner->niovs),
+ GFP_KERNEL);
+ if (!owner->niovs) {
+ err = -ENOMEM;
+ goto err_free_chunks;
+ }
+
+ for (i = 0; i < owner->num_niovs; i++) {
+ niov = &owner->niovs[i];
+ niov->owner = owner;
+ page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
+ net_devmem_get_dma_addr(niov));
+ }
+
+ virtual += len;
+ }
+
+ return binding;
+
+err_free_chunks:
+ gen_pool_for_each_chunk(binding->chunk_pool,
+ net_devmem_dmabuf_free_chunk_owner, NULL);
+ gen_pool_destroy(binding->chunk_pool);
+err_unmap:
+ dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
+ DMA_FROM_DEVICE);
+err_detach:
+ dma_buf_detach(dmabuf, binding->attachment);
+err_free_id:
+ xa_erase(&net_devmem_dmabuf_bindings, binding->id);
+err_free_binding:
+ kfree(binding);
+err_put_dmabuf:
+ dma_buf_put(dmabuf);
+ return ERR_PTR(err);
+}
+
+void dev_dmabuf_uninstall(struct net_device *dev)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct netdev_rx_queue *rxq;
+ unsigned long xa_idx;
+ unsigned int i;
+
+ for (i = 0; i < dev->real_num_rx_queues; i++) {
+ binding = dev->_rx[i].mp_params.mp_priv;
+ if (!binding)
+ continue;
+
+ xa_for_each(&binding->bound_rxqs, xa_idx, rxq)
+ if (rxq == &dev->_rx[i]) {
+ xa_erase(&binding->bound_rxqs, xa_idx);
+ break;
+ }
+ }
+}
+
+/*** "Dmabuf devmem memory provider" ***/
+
+int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+
+ if (!binding)
+ return -EINVAL;
+
+ /* dma-buf dma addresses do not need and should not be used with
+ * dma_sync_for_cpu/device. Force disable dma_sync.
+ */
+ pool->dma_sync = false;
+ pool->dma_sync_for_cpu = false;
+
+ if (pool->p.order != 0)
+ return -E2BIG;
+
+ net_devmem_dmabuf_binding_get(binding);
+ return 0;
+}
+
+netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp)
+{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+ struct net_iov *niov;
+ netmem_ref netmem;
+
+ niov = net_devmem_alloc_dmabuf(binding);
+ if (!niov)
+ return 0;
+
+ netmem = net_iov_to_netmem(niov);
+
+ page_pool_set_pp_info(pool, netmem);
+
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
+ return netmem;
+}
+
+void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+
+ net_devmem_dmabuf_binding_put(binding);
+}
+
+bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
+{
+ long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem));
+
+ if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+ return false;
+
+ if (WARN_ON_ONCE(refcount != 1))
+ return false;
+
+ page_pool_clear_pp_info(netmem);
+
+ net_devmem_free_dmabuf(netmem_to_net_iov(netmem));
+
+ /* We don't want the page pool put_page()ing our net_iovs. */
+ return false;
+}
diff --git a/net/core/devmem.h b/net/core/devmem.h
new file mode 100644
index 000000000000..76099ef9c482
--- /dev/null
+++ b/net/core/devmem.h
@@ -0,0 +1,180 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Device memory TCP support
+ *
+ * Authors: Mina Almasry <almasrymina@google.com>
+ * Willem de Bruijn <willemb@google.com>
+ * Kaiyuan Zhang <kaiyuanz@google.com>
+ *
+ */
+#ifndef _NET_DEVMEM_H
+#define _NET_DEVMEM_H
+
+struct netlink_ext_ack;
+
+struct net_devmem_dmabuf_binding {
+ struct dma_buf *dmabuf;
+ struct dma_buf_attachment *attachment;
+ struct sg_table *sgt;
+ struct net_device *dev;
+ struct gen_pool *chunk_pool;
+
+ /* The user holds a ref (via the netlink API) for as long as they want
+ * the binding to remain alive. Each page pool using this binding holds
+ * a ref to keep the binding alive. Each allocated net_iov holds a
+ * ref.
+ *
+ * The binding undos itself and unmaps the underlying dmabuf once all
+ * those refs are dropped and the binding is no longer desired or in
+ * use.
+ */
+ refcount_t ref;
+
+ /* The list of bindings currently active. Used for netlink to notify us
+ * of the user dropping the bind.
+ */
+ struct list_head list;
+
+ /* rxq's this binding is active on. */
+ struct xarray bound_rxqs;
+
+ /* ID of this binding. Globally unique to all bindings currently
+ * active.
+ */
+ u32 id;
+};
+
+#if defined(CONFIG_NET_DEVMEM)
+/* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist
+ * entry from the dmabuf is inserted into the genpool as a chunk, and needs
+ * this owner struct to keep track of some metadata necessary to create
+ * allocations from this chunk.
+ */
+struct dmabuf_genpool_chunk_owner {
+ /* Offset into the dma-buf where this chunk starts. */
+ unsigned long base_virtual;
+
+ /* dma_addr of the start of the chunk. */
+ dma_addr_t base_dma_addr;
+
+ /* Array of net_iovs for this chunk. */
+ struct net_iov *niovs;
+ size_t num_niovs;
+
+ struct net_devmem_dmabuf_binding *binding;
+};
+
+void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding);
+struct net_devmem_dmabuf_binding *
+net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+ struct netlink_ext_ack *extack);
+void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
+int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+ struct net_devmem_dmabuf_binding *binding,
+ struct netlink_ext_ack *extack);
+void dev_dmabuf_uninstall(struct net_device *dev);
+
+static inline struct dmabuf_genpool_chunk_owner *
+net_iov_owner(const struct net_iov *niov)
+{
+ return niov->owner;
+}
+
+static inline unsigned int net_iov_idx(const struct net_iov *niov)
+{
+ return niov - net_iov_owner(niov)->niovs;
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_iov_binding(const struct net_iov *niov)
+{
+ return net_iov_owner(niov)->binding;
+}
+
+static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
+{
+ struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov);
+
+ return owner->base_virtual +
+ ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT);
+}
+
+static inline u32 net_iov_binding_id(const struct net_iov *niov)
+{
+ return net_iov_owner(niov)->binding->id;
+}
+
+static inline void
+net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
+{
+ refcount_inc(&binding->ref);
+}
+
+static inline void
+net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
+{
+ if (!refcount_dec_and_test(&binding->ref))
+ return;
+
+ __net_devmem_dmabuf_binding_free(binding);
+}
+
+struct net_iov *
+net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
+void net_devmem_free_dmabuf(struct net_iov *ppiov);
+
+#else
+struct net_devmem_dmabuf_binding;
+
+static inline void
+__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
+{
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+ struct netlink_ext_ack *extack)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void
+net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+}
+
+static inline int
+net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+ struct net_devmem_dmabuf_binding *binding,
+ struct netlink_ext_ack *extack)
+
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void dev_dmabuf_uninstall(struct net_device *dev)
+{
+}
+
+static inline struct net_iov *
+net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+ return NULL;
+}
+
+static inline void net_devmem_free_dmabuf(struct net_iov *ppiov)
+{
+}
+
+static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
+{
+ return 0;
+}
+
+static inline u32 net_iov_binding_id(const struct net_iov *niov)
+{
+ return 0;
+}
+#endif
+
+#endif /* _NET_DEVMEM_H */
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index b0f221d658be..212f0a048cab 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -37,7 +37,7 @@
#include <trace/events/napi.h>
#include <trace/events/devlink.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#define TRACE_ON 1
#define TRACE_OFF 0
@@ -74,7 +74,7 @@ struct net_dm_hw_entries {
};
struct per_cpu_dm_data {
- spinlock_t lock; /* Protects 'skb', 'hw_entries' and
+ raw_spinlock_t lock; /* Protects 'skb', 'hw_entries' and
* 'send_timer'
*/
union {
@@ -109,7 +109,8 @@ static u32 net_dm_queue_len = 1000;
struct net_dm_alert_ops {
void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb,
void *location,
- enum skb_drop_reason reason);
+ enum skb_drop_reason reason,
+ struct sock *rx_sk);
void (*napi_poll_probe)(void *ignore, struct napi_struct *napi,
int work, int budget);
void (*work_item_func)(struct work_struct *work);
@@ -168,9 +169,9 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
err:
mod_timer(&data->send_timer, jiffies + HZ / 10);
out:
- spin_lock_irqsave(&data->lock, flags);
+ raw_spin_lock_irqsave(&data->lock, flags);
swap(data->skb, skb);
- spin_unlock_irqrestore(&data->lock, flags);
+ raw_spin_unlock_irqrestore(&data->lock, flags);
if (skb) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
@@ -225,7 +226,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
local_irq_save(flags);
data = this_cpu_ptr(&dm_cpu_data);
- spin_lock(&data->lock);
+ raw_spin_lock(&data->lock);
dskb = data->skb;
if (!dskb)
@@ -259,12 +260,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
}
out:
- spin_unlock_irqrestore(&data->lock, flags);
+ raw_spin_unlock_irqrestore(&data->lock, flags);
}
static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb,
void *location,
- enum skb_drop_reason reason)
+ enum skb_drop_reason reason,
+ struct sock *rx_sk)
{
trace_drop_common(skb, location);
}
@@ -314,9 +316,9 @@ net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data)
mod_timer(&hw_data->send_timer, jiffies + HZ / 10);
}
- spin_lock_irqsave(&hw_data->lock, flags);
+ raw_spin_lock_irqsave(&hw_data->lock, flags);
swap(hw_data->hw_entries, hw_entries);
- spin_unlock_irqrestore(&hw_data->lock, flags);
+ raw_spin_unlock_irqrestore(&hw_data->lock, flags);
return hw_entries;
}
@@ -448,7 +450,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
return;
hw_data = this_cpu_ptr(&dm_hw_cpu_data);
- spin_lock_irqsave(&hw_data->lock, flags);
+ raw_spin_lock_irqsave(&hw_data->lock, flags);
hw_entries = hw_data->hw_entries;
if (!hw_entries)
@@ -477,7 +479,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
}
out:
- spin_unlock_irqrestore(&hw_data->lock, flags);
+ raw_spin_unlock_irqrestore(&hw_data->lock, flags);
}
static const struct net_dm_alert_ops net_dm_alert_summary_ops = {
@@ -491,7 +493,8 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = {
static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
struct sk_buff *skb,
void *location,
- enum skb_drop_reason reason)
+ enum skb_drop_reason reason,
+ struct sock *rx_sk)
{
ktime_t tstamp = ktime_get_real();
struct per_cpu_dm_data *data;
@@ -1673,7 +1676,7 @@ static struct notifier_block dropmon_net_notifier = {
static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data)
{
- spin_lock_init(&data->lock);
+ raw_spin_lock_init(&data->lock);
skb_queue_head_init(&data->drop_queue);
u64_stats_init(&data->stats.syncp);
}
@@ -1731,30 +1734,30 @@ static int __init init_net_drop_monitor(void)
return -ENOSPC;
}
- rc = genl_register_family(&net_drop_monitor_family);
- if (rc) {
- pr_err("Could not create drop monitor netlink family\n");
- return rc;
+ for_each_possible_cpu(cpu) {
+ net_dm_cpu_data_init(cpu);
+ net_dm_hw_cpu_data_init(cpu);
}
- WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
rc = register_netdevice_notifier(&dropmon_net_notifier);
if (rc < 0) {
pr_crit("Failed to register netdevice notifier\n");
+ return rc;
+ }
+
+ rc = genl_register_family(&net_drop_monitor_family);
+ if (rc) {
+ pr_err("Could not create drop monitor netlink family\n");
goto out_unreg;
}
+ WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
rc = 0;
- for_each_possible_cpu(cpu) {
- net_dm_cpu_data_init(cpu);
- net_dm_hw_cpu_data_init(cpu);
- }
-
goto out;
out_unreg:
- genl_unregister_family(&net_drop_monitor_family);
+ WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
out:
return rc;
}
@@ -1763,19 +1766,18 @@ static void exit_net_drop_monitor(void)
{
int cpu;
- BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
-
/*
* Because of the module_get/put we do in the trace state change path
* we are guaranteed not to have any current users when we get here
*/
+ BUG_ON(genl_unregister_family(&net_drop_monitor_family));
+
+ BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
for_each_possible_cpu(cpu) {
net_dm_hw_cpu_data_fini(cpu);
net_dm_cpu_data_fini(cpu);
}
-
- BUG_ON(genl_unregister_family(&net_drop_monitor_family));
}
module_init(init_net_drop_monitor);
diff --git a/net/core/dst.c b/net/core/dst.c
index 6838d3212c37..9552a90d4772 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -96,7 +96,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
}
EXPORT_SYMBOL(dst_alloc);
-struct dst_entry *dst_destroy(struct dst_entry * dst)
+static void dst_destroy(struct dst_entry *dst)
{
struct dst_entry *child = NULL;
@@ -109,9 +109,6 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
child = xdst->child;
}
#endif
- if (!(dst->flags & DST_NOCOUNT))
- dst_entries_add(dst->ops, -1);
-
if (dst->ops->destroy)
dst->ops->destroy(dst);
netdev_put(dst->dev, &dst->dev_tracker);
@@ -126,15 +123,13 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
dst = child;
if (dst)
dst_release_immediate(dst);
- return NULL;
}
-EXPORT_SYMBOL(dst_destroy);
static void dst_destroy_rcu(struct rcu_head *head)
{
struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
- dst = dst_destroy(dst);
+ dst_destroy(dst);
}
/* Operations to mark dst as DEAD and clean up the net device referenced
@@ -161,17 +156,27 @@ void dst_dev_put(struct dst_entry *dst)
}
EXPORT_SYMBOL(dst_dev_put);
+static void dst_count_dec(struct dst_entry *dst)
+{
+ if (!(dst->flags & DST_NOCOUNT))
+ dst_entries_add(dst->ops, -1);
+}
+
void dst_release(struct dst_entry *dst)
{
- if (dst && rcuref_put(&dst->__rcuref))
+ if (dst && rcuref_put(&dst->__rcuref)) {
+ dst_count_dec(dst);
call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
+ }
}
EXPORT_SYMBOL(dst_release);
void dst_release_immediate(struct dst_entry *dst)
{
- if (dst && rcuref_put(&dst->__rcuref))
+ if (dst && rcuref_put(&dst->__rcuref)) {
+ dst_count_dec(dst);
dst_destroy(dst);
+ }
}
EXPORT_SYMBOL(dst_release_immediate);
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 0ccfd5fa5cb9..70c634b9e7b0 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -27,6 +27,7 @@ struct dst_cache_pcpu {
static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
struct dst_entry *dst, u32 cookie)
{
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
dst_release(dst_cache->dst);
if (dst)
dst_hold(dst);
@@ -40,6 +41,7 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
{
struct dst_entry *dst;
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
dst = idst->dst;
if (!dst)
goto fail;
@@ -47,7 +49,8 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
/* the cache already hold a dst reference; it can't go away */
dst_hold(dst);
- if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) ||
+ if (unlikely(!time_after(idst->refresh_ts,
+ READ_ONCE(dst_cache->reset_ts)) ||
(dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
dst_cache_per_cpu_dst_set(idst, NULL, 0);
dst_release(dst);
@@ -83,7 +86,7 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
return NULL;
*saddr = idst->in_saddr.s_addr;
- return container_of(dst, struct rtable, dst);
+ return dst_rtable(dst);
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
@@ -111,8 +114,8 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
return;
idst = this_cpu_ptr(dst_cache->cache);
- dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst,
- rt6_get_cookie((struct rt6_info *)dst));
+ dst_cache_per_cpu_dst_set(idst, dst,
+ rt6_get_cookie(dst_rt6_info(dst)));
idst->in6_saddr = *saddr;
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
@@ -170,7 +173,7 @@ void dst_cache_reset_now(struct dst_cache *dst_cache)
if (!dst_cache->cache)
return;
- dst_cache->reset_ts = jiffies;
+ dst_cache_reset(dst_cache);
for_each_possible_cpu(i) {
struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i);
struct dst_entry *dst = idst->dst;
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index fc96259807b6..5cdca49b1d7c 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -43,7 +43,6 @@ static unsigned int fib_seq_sum(struct net *net)
struct fib_notifier_ops *ops;
unsigned int fib_seq = 0;
- rtnl_lock();
rcu_read_lock();
list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) {
if (!try_module_get(ops->owner))
@@ -52,7 +51,6 @@ static unsigned int fib_seq_sum(struct net *net)
module_put(ops->owner);
}
rcu_read_unlock();
- rtnl_unlock();
return fib_seq;
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 3f933ffcefc3..94a7872ab231 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -11,6 +11,7 @@
#include <linux/list.h>
#include <linux/module.h>
#include <net/net_namespace.h>
+#include <net/inet_dscp.h>
#include <net/sock.h>
#include <net/fib_rules.h>
#include <net/ip_tunnels.h>
@@ -36,8 +37,8 @@ static const struct fib_kuid_range fib_kuid_range_unset = {
bool fib_rule_matchall(const struct fib_rule *rule)
{
- if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id ||
- rule->flags)
+ if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) ||
+ rule->mark || rule->tun_id || rule->flags)
return false;
if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1)
return false;
@@ -72,7 +73,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
- /* The lock is not required here, the list in unreacheable
+ /* The lock is not required here, the list in unreachable
* at the moment this function is called */
list_add_tail(&r->list, &ops->rules_list);
return 0;
@@ -100,7 +101,8 @@ static void notify_rule_change(int event, struct fib_rule *rule,
struct fib_rules_ops *ops, struct nlmsghdr *nlh,
u32 pid);
-static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family)
+static struct fib_rules_ops *lookup_rules_ops(const struct net *net,
+ int family)
{
struct fib_rules_ops *ops;
@@ -259,12 +261,14 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
{
- int ret = 0;
+ int iifindex, oifindex, ret = 0;
- if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
+ iifindex = READ_ONCE(rule->iifindex);
+ if (iifindex && (iifindex != fl->flowi_iif))
goto out;
- if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
+ oifindex = READ_ONCE(rule->oifindex);
+ if (oifindex && (oifindex != fl->flowi_oif))
goto out;
if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
@@ -369,7 +373,9 @@ static int call_fib_rule_notifiers(struct net *net,
.rule = rule,
};
- ops->fib_rules_seq++;
+ ASSERT_RTNL();
+ /* Paired with READ_ONCE() in fib_rules_seq() */
+ WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1);
return call_fib_notifiers(net, event_type, &info.info);
}
@@ -396,17 +402,16 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family,
}
EXPORT_SYMBOL_GPL(fib_rules_dump);
-unsigned int fib_rules_seq_read(struct net *net, int family)
+unsigned int fib_rules_seq_read(const struct net *net, int family)
{
unsigned int fib_rules_seq;
struct fib_rules_ops *ops;
- ASSERT_RTNL();
-
ops = lookup_rules_ops(net, family);
if (!ops)
return 0;
- fib_rules_seq = ops->fib_rules_seq;
+ /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */
+ fib_rules_seq = READ_ONCE(ops->fib_rules_seq);
rules_ops_put(ops);
return fib_rules_seq;
@@ -555,8 +560,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
nlrule->pref = fib_default_rule_pref(ops);
}
- nlrule->proto = tb[FRA_PROTOCOL] ?
- nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
+ nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC);
if (tb[FRA_IIFNAME]) {
struct net_device *dev;
@@ -766,7 +770,10 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = {
[FRA_PROTOCOL] = { .type = NLA_U8 },
[FRA_IP_PROTO] = { .type = NLA_U8 },
[FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
- [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }
+ [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
+ [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2),
+ [FRA_FLOWLABEL] = { .type = NLA_BE32 },
+ [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 },
};
int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -1036,14 +1043,14 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
if (rule->iifname[0]) {
if (nla_put_string(skb, FRA_IIFNAME, rule->iifname))
goto nla_put_failure;
- if (rule->iifindex == -1)
+ if (READ_ONCE(rule->iifindex) == -1)
frh->flags |= FIB_RULE_IIF_DETACHED;
}
if (rule->oifname[0]) {
if (nla_put_string(skb, FRA_OIFNAME, rule->oifname))
goto nla_put_failure;
- if (rule->oifindex == -1)
+ if (READ_ONCE(rule->oifindex) == -1)
frh->flags |= FIB_RULE_OIF_DETACHED;
}
@@ -1142,10 +1149,10 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
struct fib_rules_ops *ops;
- int idx = 0, family;
+ int err, idx = 0, family;
if (cb->strict_check) {
- int err = fib_valid_dumprule_req(nlh, cb->extack);
+ err = fib_valid_dumprule_req(nlh, cb->extack);
if (err < 0)
return err;
@@ -1158,17 +1165,17 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
if (ops == NULL)
return -EAFNOSUPPORT;
- dump_rules(skb, cb, ops);
-
- return skb->len;
+ return dump_rules(skb, cb, ops);
}
+ err = 0;
rcu_read_lock();
list_for_each_entry_rcu(ops, &net->rules_ops, list) {
if (idx < cb->args[0] || !try_module_get(ops->owner))
goto skip;
- if (dump_rules(skb, cb, ops) < 0)
+ err = dump_rules(skb, cb, ops);
+ if (err < 0)
break;
cb->args[1] = 0;
@@ -1178,7 +1185,7 @@ skip:
rcu_read_unlock();
cb->args[0] = idx;
- return skb->len;
+ return err;
}
static void notify_rule_change(int event, struct fib_rule *rule,
@@ -1205,8 +1212,7 @@ static void notify_rule_change(int event, struct fib_rule *rule,
rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, ops->nlgroup, err);
+ rtnl_set_sk_err(net, ops->nlgroup, err);
}
static void attach_rules(struct list_head *rules, struct net_device *dev)
@@ -1216,10 +1222,10 @@ static void attach_rules(struct list_head *rules, struct net_device *dev)
list_for_each_entry(rule, rules, list) {
if (rule->iifindex == -1 &&
strcmp(dev->name, rule->iifname) == 0)
- rule->iifindex = dev->ifindex;
+ WRITE_ONCE(rule->iifindex, dev->ifindex);
if (rule->oifindex == -1 &&
strcmp(dev->name, rule->oifname) == 0)
- rule->oifindex = dev->ifindex;
+ WRITE_ONCE(rule->oifindex, dev->ifindex);
}
}
@@ -1229,9 +1235,9 @@ static void detach_rules(struct list_head *rules, struct net_device *dev)
list_for_each_entry(rule, rules, list) {
if (rule->iifindex == dev->ifindex)
- rule->iifindex = -1;
+ WRITE_ONCE(rule->iifindex, -1);
if (rule->oifindex == dev->ifindex)
- rule->oifindex = -1;
+ WRITE_ONCE(rule->oifindex, -1);
}
}
@@ -1288,12 +1294,18 @@ static struct pernet_operations fib_rules_net_ops = {
.exit = fib_rules_net_exit,
};
+static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule},
+ {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule},
+ {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+};
+
static int __init fib_rules_init(void)
{
int err;
- rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0);
+
+ rtnl_register_many(fib_rules_rtnl_msg_handlers);
err = register_pernet_subsys(&fib_rules_net_ops);
if (err < 0)
@@ -1308,9 +1320,7 @@ static int __init fib_rules_init(void)
fail_unregister:
unregister_pernet_subsys(&fib_rules_net_ops);
fail:
- rtnl_unregister(PF_UNSPEC, RTM_NEWRULE);
- rtnl_unregister(PF_UNSPEC, RTM_DELRULE);
- rtnl_unregister(PF_UNSPEC, RTM_GETRULE);
+ rtnl_unregister_many(fib_rules_rtnl_msg_handlers);
return err;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index ef3e78b6a39c..2ec162dd83c4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -42,7 +42,7 @@
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
@@ -84,11 +84,15 @@
#include <net/netkit.h>
#include <linux/un.h>
#include <net/xdp_sock_drv.h>
+#include <net/inet_dscp.h>
#include "dev.h"
+/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */
+static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check");
+
static const struct bpf_func_proto *
-bpf_sk_base_func_proto(enum bpf_func_id func_id);
+bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
{
@@ -778,7 +782,7 @@ jmp_rest:
BPF_EMIT_JMP;
break;
- /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
+ /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
case BPF_LDX | BPF_MSH | BPF_B: {
struct sock_filter tmp = {
.code = BPF_LD | BPF_ABS | BPF_B,
@@ -804,7 +808,7 @@ jmp_rest:
*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
break;
}
- /* RET_K is remaped into 2 insns. RET_A case doesn't need an
+ /* RET_K is remapped into 2 insns. RET_A case doesn't need an
* extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
*/
case BPF_RET | BPF_A:
@@ -1262,8 +1266,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
* so we need to keep the user BPF around until the 2nd
* pass. At this time, the user BPF is stored in fp->insns.
*/
- old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
- GFP_KERNEL | __GFP_NOWARN);
+ old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter),
+ GFP_KERNEL | __GFP_NOWARN);
if (!old_prog) {
err = -ENOMEM;
goto out_err;
@@ -1650,18 +1654,14 @@ void sk_reuseport_prog_free(struct bpf_prog *prog)
bpf_prog_destroy(prog);
}
-struct bpf_scratchpad {
- union {
- __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
- u8 buff[MAX_BPF_STACK];
- };
-};
-
-static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
-
static inline int __bpf_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
+#ifdef CONFIG_DEBUG_NET
+ /* Avoid a splat in pskb_may_pull_reason() */
+ if (write_len > INT_MAX)
+ return -EINVAL;
+#endif
return skb_ensure_writable(skb, write_len);
}
@@ -2010,10 +2010,6 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
__be32 *, to, u32, to_size, __wsum, seed)
{
- struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
- u32 diff_size = from_size + to_size;
- int i, j = 0;
-
/* This is quite flexible, some examples:
*
* from_size == 0, to_size > 0, seed := csum --> pushing data
@@ -2022,16 +2018,19 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
*
* Even for diffing, from_size and to_size don't need to be equal.
*/
- if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
- diff_size > sizeof(sp->diff)))
- return -EINVAL;
- for (i = 0; i < from_size / sizeof(__be32); i++, j++)
- sp->diff[j] = ~from[i];
- for (i = 0; i < to_size / sizeof(__be32); i++, j++)
- sp->diff[j] = to[i];
+ __wsum ret = seed;
+
+ if (from_size && to_size)
+ ret = csum_sub(csum_partial(to, to_size, ret),
+ csum_partial(from, from_size, 0));
+ else if (to_size)
+ ret = csum_partial(to, to_size, ret);
- return csum_partial(sp->diff, diff_size, seed);
+ else if (from_size)
+ ret = ~csum_partial(from, from_size, ~ret);
+
+ return csum_from32to16((__force unsigned int)ret);
}
static const struct bpf_func_proto bpf_csum_diff_proto = {
@@ -2215,7 +2214,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
rcu_read_lock();
if (!nh) {
dst = skb_dst(skb);
- nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
+ nexthop = rt6_nexthop(dst_rt6_info(dst),
&ipv6_hdr(skb)->daddr);
} else {
nexthop = &nh->ipv6_nh;
@@ -2233,7 +2232,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
rcu_read_unlock();
return ret;
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
if (dst)
IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
out_drop:
@@ -2271,12 +2270,12 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
err = bpf_out_neigh_v6(net, skb, dev, nh);
if (unlikely(net_xmit_eval(err)))
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
else
ret = NET_XMIT_SUCCESS;
goto out_xmit;
out_drop:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
out_xmit:
return ret;
@@ -2314,8 +2313,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
rcu_read_lock();
if (!nh) {
- struct dst_entry *dst = skb_dst(skb);
- struct rtable *rt = container_of(dst, struct rtable, dst);
+ struct rtable *rt = skb_rtable(skb);
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
} else if (nh->nh_family == AF_INET6) {
@@ -2357,7 +2355,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
struct flowi4 fl4 = {
.flowi4_flags = FLOWI_FLAG_ANYSRC,
.flowi4_mark = skb->mark,
- .flowi4_tos = RT_TOS(ip4h->tos),
+ .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)),
.flowi4_oif = dev->ifindex,
.flowi4_proto = ip4h->protocol,
.daddr = ip4h->daddr,
@@ -2378,12 +2376,12 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
err = bpf_out_neigh_v4(net, skb, dev, nh);
if (unlikely(net_xmit_eval(err)))
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
else
ret = NET_XMIT_SUCCESS;
goto out_xmit;
out_drop:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
out_xmit:
return ret;
@@ -2423,9 +2421,9 @@ out:
/* Internal, non-exposed redirect flags. */
enum {
- BPF_F_NEIGH = (1ULL << 1),
- BPF_F_PEER = (1ULL << 2),
- BPF_F_NEXTHOP = (1ULL << 3),
+ BPF_F_NEIGH = (1ULL << 16),
+ BPF_F_PEER = (1ULL << 17),
+ BPF_F_NEXTHOP = (1ULL << 18),
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
};
@@ -2435,6 +2433,8 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
struct sk_buff *clone;
int ret;
+ BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS);
+
if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return -EINVAL;
@@ -2469,9 +2469,6 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
.arg3_type = ARG_ANYTHING,
};
-DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
-EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
-
static struct net_device *skb_get_peer_dev(struct net_device *dev)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -2484,7 +2481,7 @@ static struct net_device *skb_get_peer_dev(struct net_device *dev)
int skb_do_redirect(struct sk_buff *skb)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct net *net = dev_net(skb->dev);
struct net_device *dev;
u32 flags = ri->flags;
@@ -2517,7 +2514,7 @@ out_drop:
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return TC_ACT_SHOT;
@@ -2538,7 +2535,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags))
return TC_ACT_SHOT;
@@ -2560,7 +2557,7 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = {
BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
int, plen, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely((plen && plen < sizeof(*params)) || flags))
return TC_ACT_SHOT;
@@ -2607,18 +2604,16 @@ BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
static void sk_msg_reset_curr(struct sk_msg *msg)
{
- u32 i = msg->sg.start;
- u32 len = 0;
-
- do {
- len += sk_msg_elem(msg, i)->length;
- sk_msg_iter_var_next(i);
- if (len >= msg->sg.size)
- break;
- } while (i != msg->sg.end);
+ if (!msg->sg.size) {
+ msg->sg.curr = msg->sg.start;
+ msg->sg.copybreak = 0;
+ } else {
+ u32 i = msg->sg.end;
- msg->sg.curr = i;
- msg->sg.copybreak = 0;
+ sk_msg_iter_var_prev(i);
+ msg->sg.curr = i;
+ msg->sg.copybreak = msg->sg.data[i].length;
+ }
}
static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
@@ -2781,7 +2776,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
- if (start >= offset + l)
+ if (start > offset + l)
return -EINVAL;
space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
@@ -2806,6 +2801,8 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
raw = page_address(page);
+ if (i == msg->sg.end)
+ sk_msg_iter_var_prev(i);
psge = sk_msg_elem(msg, i);
front = start - offset;
back = psge->length - front;
@@ -2822,7 +2819,13 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
}
put_page(sg_page(psge));
- } else if (start - offset) {
+ new = i;
+ goto place_new;
+ }
+
+ if (start - offset) {
+ if (i == msg->sg.end)
+ sk_msg_iter_var_prev(i);
psge = sk_msg_elem(msg, i);
rsge = sk_msg_elem_cpy(msg, i);
@@ -2833,39 +2836,44 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
sk_msg_iter_var_next(i);
sg_unmark_end(psge);
sg_unmark_end(&rsge);
- sk_msg_iter_next(msg, end);
}
/* Slot(s) to place newly allocated data */
+ sk_msg_iter_next(msg, end);
new = i;
+ sk_msg_iter_var_next(i);
+
+ if (i == msg->sg.end) {
+ if (!rsge.length)
+ goto place_new;
+ sk_msg_iter_next(msg, end);
+ goto place_new;
+ }
/* Shift one or two slots as needed */
- if (!copy) {
- sge = sk_msg_elem_cpy(msg, i);
+ sge = sk_msg_elem_cpy(msg, new);
+ sg_unmark_end(&sge);
+ nsge = sk_msg_elem_cpy(msg, i);
+ if (rsge.length) {
sk_msg_iter_var_next(i);
- sg_unmark_end(&sge);
+ nnsge = sk_msg_elem_cpy(msg, i);
sk_msg_iter_next(msg, end);
+ }
- nsge = sk_msg_elem_cpy(msg, i);
+ while (i != msg->sg.end) {
+ msg->sg.data[i] = sge;
+ sge = nsge;
+ sk_msg_iter_var_next(i);
if (rsge.length) {
- sk_msg_iter_var_next(i);
+ nsge = nnsge;
nnsge = sk_msg_elem_cpy(msg, i);
- }
-
- while (i != msg->sg.end) {
- msg->sg.data[i] = sge;
- sge = nsge;
- sk_msg_iter_var_next(i);
- if (rsge.length) {
- nsge = nnsge;
- nnsge = sk_msg_elem_cpy(msg, i);
- } else {
- nsge = sk_msg_elem_cpy(msg, i);
- }
+ } else {
+ nsge = sk_msg_elem_cpy(msg, i);
}
}
+place_new:
/* Place newly allocated data buffer */
sk_mem_charge(msg->sk, len);
msg->sg.size += len;
@@ -2894,8 +2902,10 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = {
static void sk_msg_shift_left(struct sk_msg *msg, int i)
{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
int prev;
+ put_page(sg_page(sge));
do {
prev = i;
sk_msg_iter_var_next(i);
@@ -2932,6 +2942,9 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
if (unlikely(flags))
return -EINVAL;
+ if (unlikely(len == 0))
+ return 0;
+
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
@@ -2944,7 +2957,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
} while (i != msg->sg.end);
/* Bounds checks: start and pop must be inside message */
- if (start >= offset + l || last >= msg->sg.size)
+ if (start >= offset + l || last > msg->sg.size)
return -EINVAL;
space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
@@ -2968,17 +2981,17 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
*
* Then if B is non-zero AND there is no space allocate space and
* compact A, B regions into page. If there is space shift ring to
- * the rigth free'ing the next element in ring to place B, leaving
+ * the right free'ing the next element in ring to place B, leaving
* A untouched except to reduce length.
*/
if (start != offset) {
struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
- int a = start;
+ int a = start - offset;
int b = sge->length - pop - a;
sk_msg_iter_var_next(i);
- if (pop < sge->length - a) {
+ if (b > 0) {
if (space) {
sge->length = a;
sk_msg_shift_right(msg, i);
@@ -2997,7 +3010,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
if (unlikely(!page))
return -ENOMEM;
- sge->length = a;
orig = sg_page(sge);
from = sg_virt(sge);
to = page_address(page);
@@ -3007,7 +3019,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
put_page(orig);
}
pop = 0;
- } else if (pop >= sge->length - a) {
+ } else {
pop -= (sge->length - a);
sge->length = a;
}
@@ -3041,7 +3053,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
pop -= sge->length;
sk_msg_shift_left(msg, i);
}
- sk_msg_iter_var_next(i);
}
sk_mem_uncharge(msg->sk, len - pop);
@@ -3178,6 +3189,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
bpf_push_mac_rcsum(skb);
ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
bpf_pull_mac_rcsum(skb);
+ skb_reset_mac_len(skb);
bpf_compute_data_pointers(skb);
return ret;
@@ -3537,13 +3549,20 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
- /* Due to header grow, MSS needs to be downgraded. */
- if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
- skb_decrease_gso_size(shinfo, len_diff);
-
/* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= gso_type;
shinfo->gso_segs = 0;
+
+ /* Due to header growth, MSS needs to be downgraded.
+ * There is a BUG_ON() when segmenting the frag_list with
+ * head_frag true, so linearize the skb after downgrading
+ * the MSS.
+ */
+ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) {
+ skb_decrease_gso_size(shinfo, len_diff);
+ if (shinfo->frag_list)
+ return skb_linearize(skb);
+ }
}
return 0;
@@ -3715,13 +3734,22 @@ static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
static u32 __bpf_skb_min_len(const struct sk_buff *skb)
{
- u32 min_len = skb_network_offset(skb);
+ int offset = skb_network_offset(skb);
+ u32 min_len = 0;
- if (skb_transport_header_was_set(skb))
- min_len = skb_transport_offset(skb);
- if (skb->ip_summed == CHECKSUM_PARTIAL)
- min_len = skb_checksum_start_offset(skb) +
- skb->csum_offset + sizeof(__sum16);
+ if (offset > 0)
+ min_len = offset;
+ if (skb_transport_header_was_set(skb)) {
+ offset = skb_transport_offset(skb);
+ if (offset > 0)
+ min_len = offset;
+ }
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ offset = skb_checksum_start_offset(skb) +
+ skb->csum_offset + sizeof(__sum16);
+ if (offset > 0)
+ min_len = offset;
+ }
return min_len;
}
@@ -4100,13 +4128,13 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
}
static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
- struct xdp_mem_info *mem_info, bool release)
+ enum xdp_mem_type mem_type, bool release)
{
struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp);
if (release) {
xsk_buff_del_tail(zc_frag);
- __xdp_return(NULL, mem_info, false, zc_frag);
+ __xdp_return(0, mem_type, false, zc_frag);
} else {
zc_frag->data_end -= shrink;
}
@@ -4115,19 +4143,16 @@ static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
int shrink)
{
- struct xdp_mem_info *mem_info = &xdp->rxq->mem;
+ enum xdp_mem_type mem_type = xdp->rxq->mem.type;
bool release = skb_frag_size(frag) == shrink;
- if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
- bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release);
+ if (mem_type == MEM_TYPE_XSK_BUFF_POOL) {
+ bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release);
goto out;
}
- if (release) {
- struct page *page = skb_frag_page(frag);
-
- __xdp_return(page_address(page), mem_info, false, NULL);
- }
+ if (release)
+ __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL);
out:
return release;
@@ -4266,50 +4291,50 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
*/
void xdp_do_flush(void)
{
- __dev_flush();
- __cpu_map_flush();
- __xsk_map_flush();
+ struct list_head *lh_map, *lh_dev, *lh_xsk;
+
+ bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
+ if (lh_dev)
+ __dev_flush(lh_dev);
+ if (lh_map)
+ __cpu_map_flush(lh_map);
+ if (lh_xsk)
+ __xsk_map_flush(lh_xsk);
}
EXPORT_SYMBOL_GPL(xdp_do_flush);
#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
void xdp_do_check_flushed(struct napi_struct *napi)
{
- bool ret;
+ struct list_head *lh_map, *lh_dev, *lh_xsk;
+ bool missed = false;
- ret = dev_check_flush();
- ret |= cpu_map_check_flush();
- ret |= xsk_map_check_flush();
+ bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
+ if (lh_dev) {
+ __dev_flush(lh_dev);
+ missed = true;
+ }
+ if (lh_map) {
+ __cpu_map_flush(lh_map);
+ missed = true;
+ }
+ if (lh_xsk) {
+ __xsk_map_flush(lh_xsk);
+ missed = true;
+ }
- WARN_ONCE(ret, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
+ WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
napi->poll);
}
#endif
-void bpf_clear_redirect_map(struct bpf_map *map)
-{
- struct bpf_redirect_info *ri;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- ri = per_cpu_ptr(&bpf_redirect_info, cpu);
- /* Avoid polluting remote cacheline due to writes if
- * not needed. Once we pass this test, we need the
- * cmpxchg() to make sure it hasn't been changed in
- * the meantime by remote CPU.
- */
- if (unlikely(READ_ONCE(ri->map) == map))
- cmpxchg(&ri->map, map, NULL);
- }
-}
-
DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
u32 xdp_master_redirect(struct xdp_buff *xdp)
{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct net_device *master, *slave;
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
@@ -4329,9 +4354,9 @@ u32 xdp_master_redirect(struct xdp_buff *xdp)
EXPORT_SYMBOL_GPL(xdp_master_redirect);
static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
- struct net_device *dev,
+ const struct net_device *dev,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
@@ -4352,18 +4377,20 @@ err:
return err;
}
-static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
- struct net_device *dev,
- struct xdp_frame *xdpf,
- struct bpf_prog *xdp_prog)
+static __always_inline int
+__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev,
+ struct xdp_frame *xdpf,
+ const struct bpf_prog *xdp_prog)
{
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
u32 map_id = ri->map_id;
+ u32 flags = ri->flags;
struct bpf_map *map;
int err;
ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->flags = 0;
ri->map_type = BPF_MAP_TYPE_UNSPEC;
if (unlikely(!xdpf)) {
@@ -4375,11 +4402,20 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
case BPF_MAP_TYPE_DEVMAP:
fallthrough;
case BPF_MAP_TYPE_DEVMAP_HASH:
- map = READ_ONCE(ri->map);
- if (unlikely(map)) {
+ if (unlikely(flags & BPF_F_BROADCAST)) {
+ map = READ_ONCE(ri->map);
+
+ /* The map pointer is cleared when the map is being torn
+ * down by dev_map_free()
+ */
+ if (unlikely(!map)) {
+ err = -ENOENT;
+ break;
+ }
+
WRITE_ONCE(ri->map, NULL);
err = dev_map_enqueue_multi(xdpf, dev, map,
- ri->flags & BPF_F_EXCLUDE_INGRESS);
+ flags & BPF_F_EXCLUDE_INGRESS);
} else {
err = dev_map_enqueue(fwd, xdpf, dev);
}
@@ -4413,9 +4449,9 @@ err:
}
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
if (map_type == BPF_MAP_TYPE_XSKMAP)
@@ -4427,9 +4463,10 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
EXPORT_SYMBOL_GPL(xdp_do_redirect);
int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
- struct xdp_frame *xdpf, struct bpf_prog *xdp_prog)
+ struct xdp_frame *xdpf,
+ const struct bpf_prog *xdp_prog)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
if (map_type == BPF_MAP_TYPE_XSKMAP)
@@ -4442,11 +4479,11 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);
static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog,
- void *fwd,
- enum bpf_map_type map_type, u32 map_id)
+ const struct bpf_prog *xdp_prog,
+ void *fwd, enum bpf_map_type map_type,
+ u32 map_id, u32 flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct bpf_map *map;
int err;
@@ -4454,11 +4491,20 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
case BPF_MAP_TYPE_DEVMAP:
fallthrough;
case BPF_MAP_TYPE_DEVMAP_HASH:
- map = READ_ONCE(ri->map);
- if (unlikely(map)) {
+ if (unlikely(flags & BPF_F_BROADCAST)) {
+ map = READ_ONCE(ri->map);
+
+ /* The map pointer is cleared when the map is being torn
+ * down by dev_map_free()
+ */
+ if (unlikely(!map)) {
+ err = -ENOENT;
+ break;
+ }
+
WRITE_ONCE(ri->map, NULL);
err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
- ri->flags & BPF_F_EXCLUDE_INGRESS);
+ flags & BPF_F_EXCLUDE_INGRESS);
} else {
err = dev_map_generic_redirect(fwd, skb, xdp_prog);
}
@@ -4489,15 +4535,18 @@ err:
}
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
- struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
+ struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
u32 map_id = ri->map_id;
+ u32 flags = ri->flags;
int err;
ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->flags = 0;
ri->map_type = BPF_MAP_TYPE_UNSPEC;
if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
@@ -4517,7 +4566,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
return 0;
}
- return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id);
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags);
err:
_trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
return err;
@@ -4525,7 +4574,7 @@ err:
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags))
return XDP_ABORTED;
@@ -4662,7 +4711,7 @@ set_compat:
to->tunnel_tos = info->key.tos;
to->tunnel_ttl = info->key.ttl;
if (flags & BPF_F_TUNINFO_FLAGS)
- to->tunnel_flags = info->key.tun_flags;
+ to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);
else
to->tunnel_ext = 0;
@@ -4705,7 +4754,7 @@ BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
int err;
if (unlikely(!info ||
- !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
+ !ip_tunnel_is_options_present(info->key.tun_flags))) {
err = -ENOENT;
goto err_clear;
}
@@ -4775,15 +4824,15 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
memset(info, 0, sizeof(*info));
info->mode = IP_TUNNEL_INFO_TX;
- info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
- if (flags & BPF_F_DONT_FRAGMENT)
- info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
- if (flags & BPF_F_ZERO_CSUM_TX)
- info->key.tun_flags &= ~TUNNEL_CSUM;
- if (flags & BPF_F_SEQ_NUMBER)
- info->key.tun_flags |= TUNNEL_SEQ;
- if (flags & BPF_F_NO_TUNNEL_KEY)
- info->key.tun_flags &= ~TUNNEL_KEY;
+ __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags);
+ __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags,
+ flags & BPF_F_DONT_FRAGMENT);
+ __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags,
+ !(flags & BPF_F_ZERO_CSUM_TX));
+ __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags,
+ flags & BPF_F_SEQ_NUMBER);
+ __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags,
+ !(flags & BPF_F_NO_TUNNEL_KEY));
info->key.tun_id = cpu_to_be64(from->tunnel_id);
info->key.tos = from->tunnel_tos;
@@ -4821,13 +4870,15 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
{
struct ip_tunnel_info *info = skb_tunnel_info(skb);
const struct metadata_dst *md = this_cpu_ptr(md_dst);
+ IP_TUNNEL_DECLARE_FLAGS(present) = { };
if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
return -EINVAL;
if (unlikely(size > IP_TUNNEL_OPTS_MAX))
return -ENOMEM;
- ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);
+ ip_tunnel_set_options_present(present);
+ ip_tunnel_info_opts_set(info, from, size, present);
return 0;
}
@@ -5094,6 +5145,17 @@ static u64 __bpf_get_netns_cookie(struct sock *sk)
return net->net_cookie;
}
+BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb)
+{
+ return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_proto = {
+ .func = bpf_get_netns_cookie,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
{
return __bpf_get_netns_cookie(ctx);
@@ -5236,6 +5298,11 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
return -EINVAL;
inet_csk(sk)->icsk_rto_min = timeout;
break;
+ case TCP_BPF_SOCK_OPS_CB_FLAGS:
+ if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS))
+ return -EINVAL;
+ tp->bpf_sock_ops_cb_flags = val;
+ break;
default:
return -EINVAL;
}
@@ -5324,6 +5391,17 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
if (*optlen < 1)
return -EINVAL;
break;
+ case TCP_BPF_SOCK_OPS_CB_FLAGS:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ if (getopt) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ int cb_flags = tp->bpf_sock_ops_cb_flags;
+
+ memcpy(optval, &cb_flags, *optlen);
+ return 0;
+ }
+ return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
default:
if (getopt)
return -EINVAL;
@@ -5857,7 +5935,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
fl4.flowi4_iif = params->ifindex;
fl4.flowi4_oif = 0;
}
- fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+ fl4.flowi4_tos = params->tos & INET_DSCP_MASK;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = 0;
@@ -5884,7 +5962,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
} else {
- fl4.flowi4_mark = 0;
+ if (flags & BPF_FIB_LOOKUP_MARK)
+ fl4.flowi4_mark = params->mark;
+ else
+ fl4.flowi4_mark = 0;
fl4.flowi4_secid = 0;
fl4.flowi4_tun_key.tun_id = 0;
fl4.flowi4_uid = sock_net_uid(net, NULL);
@@ -5988,7 +6069,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
return -ENODEV;
idev = __in6_dev_get_safely(dev);
- if (unlikely(!idev || !idev->cnf.forwarding))
+ if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding)))
return BPF_FIB_LKUP_RET_FWD_DISABLED;
if (flags & BPF_FIB_LOOKUP_OUTPUT) {
@@ -6027,7 +6108,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
strict);
} else {
- fl6.flowi6_mark = 0;
+ if (flags & BPF_FIB_LOOKUP_MARK)
+ fl6.flowi6_mark = params->mark;
+ else
+ fl6.flowi6_mark = 0;
fl6.flowi6_secid = 0;
fl6.flowi6_tun_key.tun_id = 0;
fl6.flowi6_uid = sock_net_uid(net, NULL);
@@ -6105,7 +6189,7 @@ set_fwd_params:
#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
- BPF_FIB_LOOKUP_SRC)
+ BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK)
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
struct bpf_fib_lookup *, params, int, plen, u32, flags)
@@ -6213,12 +6297,10 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
{
int ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
struct net_device *dev = skb->dev;
- int skb_len, dev_len;
- int mtu;
+ int mtu, dev_len, skb_len;
if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
return -EINVAL;
-
if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
return -EINVAL;
@@ -6227,7 +6309,6 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
return -ENODEV;
mtu = READ_ONCE(dev->mtu);
-
dev_len = mtu + dev->hard_header_len;
/* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
@@ -6245,15 +6326,12 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
*/
if (skb_is_gso(skb)) {
ret = BPF_MTU_CHK_RET_SUCCESS;
-
if (flags & BPF_MTU_CHK_SEGS &&
!skb_gso_validate_network_len(skb, mtu))
ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
}
out:
- /* BPF verifier guarantees valid pointer */
*mtu_len = mtu;
-
return ret;
}
@@ -6274,8 +6352,6 @@ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
return -ENODEV;
mtu = READ_ONCE(dev->mtu);
-
- /* Add L2-header as dev MTU is L3 size */
dev_len = mtu + dev->hard_header_len;
/* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
@@ -6286,9 +6362,7 @@ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
if (xdp_len > dev_len)
ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
- /* BPF verifier guarantees valid pointer */
*mtu_len = mtu;
-
return ret;
}
@@ -6298,7 +6372,8 @@ static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_INT,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
+ .arg3_size = sizeof(u32),
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6309,7 +6384,8 @@ static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_INT,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
+ .arg3_size = sizeof(u32),
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6418,6 +6494,7 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
void *srh_tlvs, *srh_end, *ptr;
int srhoff = 0;
+ lockdep_assert_held(&srh_state->bh_lock);
if (srh == NULL)
return -EINVAL;
@@ -6474,6 +6551,7 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
int hdroff = 0;
int err;
+ lockdep_assert_held(&srh_state->bh_lock);
switch (action) {
case SEG6_LOCAL_ACTION_END_X:
if (!seg6_bpf_has_valid_srh(skb))
@@ -6550,6 +6628,7 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
int srhoff = 0;
int ret;
+ lockdep_assert_held(&srh_state->bh_lock);
if (unlikely(srh == NULL))
return -EINVAL;
@@ -6705,8 +6784,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
* sock refcnt is decremented to prevent a request_sock leak.
*/
- if (!sk_fullsock(sk2))
- sk2 = NULL;
if (sk2 != sk) {
sock_gen_put(sk);
/* Ensure there is no need to bump sk2 refcnt */
@@ -6753,8 +6830,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
* sock refcnt is decremented to prevent a request_sock leak.
*/
- if (!sk_fullsock(sk2))
- sk2 = NULL;
if (sk2 != sk) {
sock_gen_put(sk);
/* Ensure there is no need to bump sk2 refcnt */
@@ -6783,7 +6858,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6802,7 +6877,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6821,7 +6896,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6845,7 +6920,7 @@ static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6869,7 +6944,7 @@ static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6893,7 +6968,7 @@ static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6931,7 +7006,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6955,7 +7030,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6979,7 +7054,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6999,7 +7074,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -7018,7 +7093,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -7037,7 +7112,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -7203,7 +7278,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
{
sk = sk_to_full_sk(sk);
- if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
+ if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
return (unsigned long)sk;
return (unsigned long)NULL;
@@ -7576,7 +7651,7 @@ static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -7694,17 +7769,21 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
return -EOPNOTSUPP;
switch (tstamp_type) {
- case BPF_SKB_TSTAMP_DELIVERY_MONO:
+ case BPF_SKB_CLOCK_REALTIME:
+ skb->tstamp = tstamp;
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
+ break;
+ case BPF_SKB_CLOCK_MONOTONIC:
if (!tstamp)
return -EINVAL;
skb->tstamp = tstamp;
- skb->mono_delivery_time = 1;
+ skb->tstamp_type = SKB_CLOCK_MONOTONIC;
break;
- case BPF_SKB_TSTAMP_UNSPEC:
- if (tstamp)
+ case BPF_SKB_CLOCK_TAI:
+ if (!tstamp)
return -EINVAL;
- skb->tstamp = 0;
- skb->mono_delivery_time = 0;
+ skb->tstamp = tstamp;
+ skb->tstamp_type = SKB_CLOCK_TAI;
break;
default:
return -EINVAL;
@@ -7828,42 +7907,37 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
#endif /* CONFIG_INET */
-bool bpf_helper_changes_pkt_data(void *func)
-{
- if (func == bpf_skb_vlan_push ||
- func == bpf_skb_vlan_pop ||
- func == bpf_skb_store_bytes ||
- func == bpf_skb_change_proto ||
- func == bpf_skb_change_head ||
- func == sk_skb_change_head ||
- func == bpf_skb_change_tail ||
- func == sk_skb_change_tail ||
- func == bpf_skb_adjust_room ||
- func == sk_skb_adjust_room ||
- func == bpf_skb_pull_data ||
- func == sk_skb_pull_data ||
- func == bpf_clone_redirect ||
- func == bpf_l3_csum_replace ||
- func == bpf_l4_csum_replace ||
- func == bpf_xdp_adjust_head ||
- func == bpf_xdp_adjust_meta ||
- func == bpf_msg_pull_data ||
- func == bpf_msg_push_data ||
- func == bpf_msg_pop_data ||
- func == bpf_xdp_adjust_tail ||
-#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
- func == bpf_lwt_seg6_store_bytes ||
- func == bpf_lwt_seg6_adjust_srh ||
- func == bpf_lwt_seg6_action ||
-#endif
-#ifdef CONFIG_INET
- func == bpf_sock_ops_store_hdr_opt ||
-#endif
- func == bpf_lwt_in_push_encap ||
- func == bpf_lwt_xmit_push_encap)
+bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_clone_redirect:
+ case BPF_FUNC_l3_csum_replace:
+ case BPF_FUNC_l4_csum_replace:
+ case BPF_FUNC_lwt_push_encap:
+ case BPF_FUNC_lwt_seg6_action:
+ case BPF_FUNC_lwt_seg6_adjust_srh:
+ case BPF_FUNC_lwt_seg6_store_bytes:
+ case BPF_FUNC_msg_pop_data:
+ case BPF_FUNC_msg_pull_data:
+ case BPF_FUNC_msg_push_data:
+ case BPF_FUNC_skb_adjust_room:
+ case BPF_FUNC_skb_change_head:
+ case BPF_FUNC_skb_change_proto:
+ case BPF_FUNC_skb_change_tail:
+ case BPF_FUNC_skb_pull_data:
+ case BPF_FUNC_skb_store_bytes:
+ case BPF_FUNC_skb_vlan_pop:
+ case BPF_FUNC_skb_vlan_push:
+ case BPF_FUNC_store_hdr_opt:
+ case BPF_FUNC_xdp_adjust_head:
+ case BPF_FUNC_xdp_adjust_meta:
+ case BPF_FUNC_xdp_adjust_tail:
+ /* tail-called program could call any of the above */
+ case BPF_FUNC_tail_call:
return true;
-
- return false;
+ default:
+ return false;
+ }
}
const struct bpf_func_proto bpf_event_output_data_proto __weak;
@@ -7894,7 +7968,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -7987,7 +8061,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return NULL;
}
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
@@ -8006,7 +8080,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_skb_event_output_proto;
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
@@ -8132,6 +8206,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_under_cgroup_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_proto;
case BPF_FUNC_get_socket_uid:
return &bpf_get_socket_uid_proto;
case BPF_FUNC_fib_lookup:
@@ -8193,7 +8269,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#endif
#endif
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
@@ -8252,13 +8328,13 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#endif
#endif
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
/* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The
* kfuncs are defined in two different modules, and we want to be able
- * to use them interchangably with the same BTF type ID. Because modules
+ * to use them interchangeably with the same BTF type ID. Because modules
* can't de-duplicate BTF IDs between each other, we need the type to be
* referenced in the vmlinux BTF or the verifier will get confused about
* the different types. So we add this dummy type reference which will
@@ -8313,7 +8389,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_tcp_sock_proto;
#endif /* CONFIG_INET */
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
@@ -8342,8 +8418,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_event_output_data_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
@@ -8355,7 +8429,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_cgroup_classid_curr_proto;
#endif
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
@@ -8399,7 +8473,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skc_lookup_tcp_proto;
#endif
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
@@ -8410,7 +8484,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skb_load_bytes:
return &bpf_flow_dissector_load_bytes_proto;
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
@@ -8437,7 +8511,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skb_under_cgroup:
return &bpf_skb_under_cgroup_proto;
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
@@ -8526,13 +8600,16 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
if (off + size > offsetofend(struct __sk_buff, cb[4]))
return false;
break;
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ if (info->is_ldsx || size != size_default)
+ return false;
+ break;
case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
- case bpf_ctx_range(struct __sk_buff, data):
- case bpf_ctx_range(struct __sk_buff, data_meta):
- case bpf_ctx_range(struct __sk_buff, data_end):
if (size != size_default)
return false;
break;
@@ -8612,7 +8689,7 @@ static bool cg_skb_is_valid_access(int off, int size,
return false;
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
- if (!bpf_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
return false;
break;
}
@@ -8624,7 +8701,7 @@ static bool cg_skb_is_valid_access(int off, int size,
case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
case bpf_ctx_range(struct __sk_buff, tstamp):
- if (!bpf_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
return false;
break;
default:
@@ -8976,6 +9053,14 @@ static bool xdp_is_valid_access(int off, int size,
}
}
return false;
+ } else {
+ switch (off) {
+ case offsetof(struct xdp_md, data_meta):
+ case offsetof(struct xdp_md, data):
+ case offsetof(struct xdp_md, data_end):
+ if (info->is_ldsx)
+ return false;
+ }
}
switch (off) {
@@ -8993,7 +9078,8 @@ static bool xdp_is_valid_access(int off, int size,
return __is_valid_xdp_access(off, size);
}
-void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act)
+void bpf_warn_invalid_xdp_action(const struct net_device *dev,
+ const struct bpf_prog *prog, u32 act)
{
const u32 act_max = XDP_REDIRECT;
@@ -9301,12 +9387,12 @@ static bool flow_dissector_is_valid_access(int off, int size,
switch (off) {
case bpf_ctx_range(struct __sk_buff, data):
- if (size != size_default)
+ if (info->is_ldsx || size != size_default)
return false;
info->reg_type = PTR_TO_PACKET;
return true;
case bpf_ctx_range(struct __sk_buff, data_end):
- if (size != size_default)
+ if (info->is_ldsx || size != size_default)
return false;
info->reg_type = PTR_TO_PACKET_END;
return true;
@@ -9357,16 +9443,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
{
__u8 value_reg = si->dst_reg;
__u8 skb_reg = si->src_reg;
- /* AX is needed because src_reg and dst_reg could be the same */
- __u8 tmp_reg = BPF_REG_AX;
-
- *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
- SKB_BF_MONO_TC_OFFSET);
- *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
- SKB_MONO_DELIVERY_TIME_MASK, 2);
- *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
- *insn++ = BPF_JMP_A(1);
- *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO);
+ BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI);
+ BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME);
+ BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC);
+ BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI);
+ *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT);
+#else
+ BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1));
+#endif
return insn;
}
@@ -9409,11 +9496,12 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
__u8 tmp_reg = BPF_REG_AX;
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
- *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
- TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK);
- *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg,
- TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2);
- /* skb->tc_at_ingress && skb->mono_delivery_time,
+ /* check if ingress mask bits is set */
+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
+ *insn++ = BPF_JMP_A(4);
+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
+ *insn++ = BPF_JMP_A(2);
+ /* skb->tc_at_ingress && skb->tstamp_type,
* read 0 as the (rcv) timestamp.
*/
*insn++ = BPF_MOV64_IMM(value_reg, 0);
@@ -9438,7 +9526,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
* the bpf prog is aware the tstamp could have delivery time.
* Thus, write skb->tstamp as is if tstamp_type_access is true.
* Otherwise, writing at ingress will have to clear the
- * mono_delivery_time bit also.
+ * skb->tstamp_type bit also.
*/
if (!prog->tstamp_type_access) {
__u8 tmp_reg = BPF_REG_AX;
@@ -9448,8 +9536,8 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
/* goto <store> */
*insn++ = BPF_JMP_A(2);
- /* <clear>: mono_delivery_time */
- *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
+ /* <clear>: skb->tstamp_type */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
}
#endif
@@ -10153,10 +10241,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
} \
} while (0)
-#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \
- SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \
- S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF)
-
static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -11005,7 +11089,6 @@ const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
};
const struct bpf_prog_ops lwt_seg6local_prog_ops = {
- .test_run = bpf_prog_test_run_skb,
};
const struct bpf_verifier_ops cg_sock_verifier_ops = {
@@ -11168,6 +11251,7 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
struct sock_reuseport *reuse;
struct sock *selected_sk;
+ int err;
selected_sk = map->ops->map_lookup_elem(map, key);
if (!selected_sk)
@@ -11175,10 +11259,6 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
if (!reuse) {
- /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
- if (sk_is_refcounted(selected_sk))
- sock_put(selected_sk);
-
/* reuseport_array has only sk with non NULL sk_reuseport_cb.
* The only (!reuse) case here is - the sk has already been
* unhashed (e.g. by close()), so treat it as -ENOENT.
@@ -11186,24 +11266,33 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
* Other maps (e.g. sock_map) do not provide this guarantee and
* the sk may never be in the reuseport group to begin with.
*/
- return is_sockarray ? -ENOENT : -EINVAL;
+ err = is_sockarray ? -ENOENT : -EINVAL;
+ goto error;
}
if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
struct sock *sk = reuse_kern->sk;
- if (sk->sk_protocol != selected_sk->sk_protocol)
- return -EPROTOTYPE;
- else if (sk->sk_family != selected_sk->sk_family)
- return -EAFNOSUPPORT;
-
- /* Catch all. Likely bound to a different sockaddr. */
- return -EBADFD;
+ if (sk->sk_protocol != selected_sk->sk_protocol) {
+ err = -EPROTOTYPE;
+ } else if (sk->sk_family != selected_sk->sk_family) {
+ err = -EAFNOSUPPORT;
+ } else {
+ /* Catch all. Likely bound to a different sockaddr. */
+ err = -EBADFD;
+ }
+ goto error;
}
reuse_kern->selected_sk = selected_sk;
return 0;
+error:
+ /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
+ if (sk_is_refcounted(selected_sk))
+ sock_put(selected_sk);
+
+ return err;
}
static const struct bpf_func_proto sk_select_reuseport_proto = {
@@ -11268,7 +11357,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -11450,7 +11539,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_sk_release:
return &bpf_sk_release_proto;
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
@@ -11784,7 +11873,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = {
};
static const struct bpf_func_proto *
-bpf_sk_base_func_proto(enum bpf_func_id func_id)
+bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
const struct bpf_func_proto *func;
@@ -11813,38 +11902,44 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
- if (!perfmon_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
return NULL;
return func;
}
__bpf_kfunc_start_defs();
-__bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags,
- struct bpf_dynptr_kern *ptr__uninit)
+__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct sk_buff *skb = (struct sk_buff *)s;
+
if (flags) {
- bpf_dynptr_set_null(ptr__uninit);
+ bpf_dynptr_set_null(ptr);
return -EINVAL;
}
- bpf_dynptr_init(ptr__uninit, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
+ bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
return 0;
}
-__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags,
- struct bpf_dynptr_kern *ptr__uninit)
+__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct xdp_buff *xdp = (struct xdp_buff *)x;
+
if (flags) {
- bpf_dynptr_set_null(ptr__uninit);
+ bpf_dynptr_set_null(ptr);
return -EINVAL;
}
- bpf_dynptr_init(ptr__uninit, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp));
+ bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp));
return 0;
}
@@ -11869,33 +11964,136 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
return 0;
}
+
+__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,
+ struct bpf_tcp_req_attrs *attrs, int attrs__sz)
+{
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
+ struct sk_buff *skb = (struct sk_buff *)s;
+ const struct request_sock_ops *ops;
+ struct inet_request_sock *ireq;
+ struct tcp_request_sock *treq;
+ struct request_sock *req;
+ struct net *net;
+ __u16 min_mss;
+ u32 tsoff = 0;
+
+ if (attrs__sz != sizeof(*attrs) ||
+ attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2])
+ return -EINVAL;
+
+ if (!skb_at_tc_ingress(skb))
+ return -EINVAL;
+
+ net = dev_net(skb->dev);
+ if (net != sock_net(sk))
+ return -ENETUNREACH;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ops = &tcp_request_sock_ops;
+ min_mss = 536;
+ break;
+#if IS_BUILTIN(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ ops = &tcp6_request_sock_ops;
+ min_mss = IPV6_MIN_MTU - 60;
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN ||
+ sk_is_mptcp(sk))
+ return -EINVAL;
+
+ if (attrs->mss < min_mss)
+ return -EINVAL;
+
+ if (attrs->wscale_ok) {
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling))
+ return -EINVAL;
+
+ if (attrs->snd_wscale > TCP_MAX_WSCALE ||
+ attrs->rcv_wscale > TCP_MAX_WSCALE)
+ return -EINVAL;
+ }
+
+ if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack))
+ return -EINVAL;
+
+ if (attrs->tstamp_ok) {
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps))
+ return -EINVAL;
+
+ tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns());
+ }
+
+ req = inet_reqsk_alloc(ops, sk, false);
+ if (!req)
+ return -ENOMEM;
+
+ ireq = inet_rsk(req);
+ treq = tcp_rsk(req);
+
+ req->rsk_listener = sk;
+ req->syncookie = 1;
+ req->mss = attrs->mss;
+ req->ts_recent = attrs->rcv_tsval;
+
+ ireq->snd_wscale = attrs->snd_wscale;
+ ireq->rcv_wscale = attrs->rcv_wscale;
+ ireq->tstamp_ok = !!attrs->tstamp_ok;
+ ireq->sack_ok = !!attrs->sack_ok;
+ ireq->wscale_ok = !!attrs->wscale_ok;
+ ireq->ecn_ok = !!attrs->ecn_ok;
+
+ treq->req_usec_ts = !!attrs->usec_ts_ok;
+ treq->ts_off = tsoff;
+
+ skb_orphan(skb);
+ skb->sk = req_to_sk(req);
+ skb->destructor = sock_pfree;
+
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
__bpf_kfunc_end_defs();
-int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
- struct bpf_dynptr_kern *ptr__uninit)
+int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
int err;
err = bpf_dynptr_from_skb(skb, flags, ptr__uninit);
if (err)
return err;
- bpf_dynptr_set_rdonly(ptr__uninit);
+ bpf_dynptr_set_rdonly(ptr);
return 0;
}
-BTF_SET8_START(bpf_kfunc_check_set_skb)
-BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
-BTF_SET8_END(bpf_kfunc_check_set_skb)
+BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_skb)
-BTF_SET8_START(bpf_kfunc_check_set_xdp)
+BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
-BTF_SET8_END(bpf_kfunc_check_set_xdp)
+BTF_KFUNCS_END(bpf_kfunc_check_set_xdp)
-BTF_SET8_START(bpf_kfunc_check_set_sock_addr)
+BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr)
BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
-BTF_SET8_END(bpf_kfunc_check_set_sock_addr)
+BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr)
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
+BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)
static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
.owner = THIS_MODULE,
@@ -11912,6 +12110,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
.set = &bpf_kfunc_check_set_sock_addr,
};
+static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_tcp_reqsk,
+};
+
static int __init bpf_kfunc_init(void)
{
int ret;
@@ -11926,9 +12129,11 @@ static int __init bpf_kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
- return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
- &bpf_kfunc_set_sock_addr);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ &bpf_kfunc_set_sock_addr);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
}
late_initcall(bpf_kfunc_init);
@@ -11968,9 +12173,9 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
__bpf_kfunc_end_defs();
-BTF_SET8_START(bpf_sk_iter_kfunc_ids)
+BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids)
BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
-BTF_SET8_END(bpf_sk_iter_kfunc_ids)
+BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids)
static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 272f09251343..9cd8de6bebb5 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -299,9 +299,10 @@ void skb_flow_dissect_meta(const struct sk_buff *skb,
EXPORT_SYMBOL(skb_flow_dissect_meta);
static void
-skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
- struct flow_dissector *flow_dissector,
- void *target_container)
+skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type,
+ u32 ctrl_flags,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
{
struct flow_dissector_key_control *ctrl;
@@ -312,6 +313,7 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
FLOW_DISSECTOR_KEY_ENC_CONTROL,
target_container);
ctrl->addr_type = type;
+ ctrl->flags = ctrl_flags;
}
void
@@ -367,6 +369,7 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
{
struct ip_tunnel_info *info;
struct ip_tunnel_key *key;
+ u32 ctrl_flags = 0;
/* A quick check to see if there might be something to do. */
if (!dissector_uses_key(flow_dissector,
@@ -391,11 +394,20 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
key = &info->key;
+ if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM;
+ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT;
+ if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM;
+ if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT;
+
switch (ip_tunnel_info_af(info)) {
case AF_INET:
- skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
- flow_dissector,
- target_container);
+ skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ ctrl_flags, flow_dissector,
+ target_container);
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
struct flow_dissector_key_ipv4_addrs *ipv4;
@@ -408,9 +420,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
}
break;
case AF_INET6:
- skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
- flow_dissector,
- target_container);
+ skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ ctrl_flags, flow_dissector,
+ target_container);
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) {
struct flow_dissector_key_ipv6_addrs *ipv6;
@@ -422,6 +434,10 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
ipv6->dst = key->u.ipv6.dst;
}
break;
+ default:
+ skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector,
+ target_container);
+ break;
}
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
@@ -455,17 +471,25 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) {
struct flow_dissector_key_enc_opts *enc_opt;
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ u32 val;
enc_opt = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_ENC_OPTS,
target_container);
- if (info->options_len) {
- enc_opt->len = info->options_len;
- ip_tunnel_info_opts_get(enc_opt->data, info);
- enc_opt->dst_opt_type = info->key.tun_flags &
- TUNNEL_OPTIONS_PRESENT;
- }
+ if (!info->options_len)
+ return;
+
+ enc_opt->len = info->options_len;
+ ip_tunnel_info_opts_get(enc_opt->data, info);
+
+ ip_tunnel_set_options_present(flags);
+ ip_tunnel_flags_and(flags, info->key.tun_flags, flags);
+
+ val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM,
+ IP_TUNNEL_GENEVE_OPT_BIT);
+ enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0;
}
}
EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
@@ -829,23 +853,30 @@ __skb_flow_dissect_ports(const struct sk_buff *skb,
void *target_container, const void *data,
int nhoff, u8 ip_proto, int hlen)
{
- enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX;
- struct flow_dissector_key_ports *key_ports;
+ struct flow_dissector_key_ports_range *key_ports_range = NULL;
+ struct flow_dissector_key_ports *key_ports = NULL;
+ __be32 ports;
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
- dissector_ports = FLOW_DISSECTOR_KEY_PORTS;
- else if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_PORTS_RANGE))
- dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE;
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS,
+ target_container);
- if (dissector_ports == FLOW_DISSECTOR_KEY_MAX)
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE))
+ key_ports_range = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE,
+ target_container);
+
+ if (!key_ports && !key_ports_range)
return;
- key_ports = skb_flow_dissector_target(flow_dissector,
- dissector_ports,
- target_container);
- key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
- data, hlen);
+ ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen);
+
+ if (key_ports)
+ key_ports->ports = ports;
+
+ if (key_ports_range)
+ key_ports_range->tp.ports = ports;
}
static void
@@ -900,6 +931,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
struct flow_dissector *flow_dissector,
void *target_container)
{
+ struct flow_dissector_key_ports_range *key_ports_range = NULL;
struct flow_dissector_key_ports *key_ports = NULL;
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
@@ -944,20 +976,21 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
- if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
key_ports = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_PORTS,
target_container);
- else if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_PORTS_RANGE))
- key_ports = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_PORTS_RANGE,
- target_container);
-
- if (key_ports) {
key_ports->src = flow_keys->sport;
key_ports->dst = flow_keys->dport;
}
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE)) {
+ key_ports_range = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE,
+ target_container);
+ key_ports_range->tp.src = flow_keys->sport;
+ key_ports_range->tp.dst = flow_keys->dport;
+ }
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
@@ -1084,21 +1117,22 @@ bool __skb_flow_dissect(const struct net *net,
FLOW_DISSECTOR_KEY_BASIC,
target_container);
+ rcu_read_lock();
+
if (skb) {
if (!net) {
if (skb->dev)
- net = dev_net(skb->dev);
+ net = dev_net_rcu(skb->dev);
else if (skb->sk)
net = sock_net(skb->sk);
}
}
- WARN_ON_ONCE(!net);
+ DEBUG_NET_WARN_ON_ONCE(!net);
if (net) {
enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
struct bpf_prog_array *run_array;
- rcu_read_lock();
run_array = rcu_dereference(init_net.bpf.run_array[type]);
if (!run_array)
run_array = rcu_dereference(net->bpf.run_array[type]);
@@ -1126,17 +1160,17 @@ bool __skb_flow_dissect(const struct net *net,
prog = READ_ONCE(run_array->items[0].prog);
result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
hlen, flags);
- if (result == BPF_FLOW_DISSECTOR_CONTINUE)
- goto dissect_continue;
- __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
- target_container);
- rcu_read_unlock();
- return result == BPF_OK;
+ if (result != BPF_FLOW_DISSECTOR_CONTINUE) {
+ __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
+ target_container);
+ rcu_read_unlock();
+ return result == BPF_OK;
+ }
}
-dissect_continue:
- rcu_read_unlock();
}
+ rcu_read_unlock();
+
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
struct ethhdr *eth = eth_hdr(skb);
@@ -1784,6 +1818,13 @@ u32 flow_hash_from_keys(struct flow_keys *keys)
}
EXPORT_SYMBOL(flow_hash_from_keys);
+u32 flow_hash_from_keys_seed(struct flow_keys *keys,
+ const siphash_key_t *keyval)
+{
+ return __flow_hash_from_keys(keys, keyval);
+}
+EXPORT_SYMBOL(flow_hash_from_keys_seed);
+
static inline u32 ___skb_get_hash(const struct sk_buff *skb,
struct flow_keys *keys,
const siphash_key_t *keyval)
@@ -1823,22 +1864,23 @@ EXPORT_SYMBOL(make_flow_keys_digest);
static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
-u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
+u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb)
{
struct flow_keys keys;
__flow_hash_secret_init();
memset(&keys, 0, sizeof(keys));
- __skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric,
+ __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric,
&keys, NULL, 0, 0, 0, 0);
return __flow_hash_from_keys(&keys, &hashrnd);
}
-EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
+EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net);
/**
- * __skb_get_hash: calculate a flow hash
+ * __skb_get_hash_net: calculate a flow hash
+ * @net: associated network namespace, derived from @skb if NULL
* @skb: sk_buff to calculate flow hash from
*
* This function calculates a flow hash based on src/dst addresses
@@ -1846,18 +1888,24 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
* on success, zero indicates no valid hash. Also, sets l4_hash in skb
* if hash is a canonical 4-tuple hash over transport ports.
*/
-void __skb_get_hash(struct sk_buff *skb)
+void __skb_get_hash_net(const struct net *net, struct sk_buff *skb)
{
struct flow_keys keys;
u32 hash;
+ memset(&keys, 0, sizeof(keys));
+
+ __skb_flow_dissect(net, skb, &flow_keys_dissector,
+ &keys, NULL, 0, 0, 0,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
__flow_hash_secret_init();
- hash = ___skb_get_hash(skb, &keys, &hashrnd);
+ hash = __flow_hash_from_keys(&keys, &hashrnd);
__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}
-EXPORT_SYMBOL(__skb_get_hash);
+EXPORT_SYMBOL(__skb_get_hash_net);
__u32 skb_get_hash_perturb(const struct sk_buff *skb,
const siphash_key_t *perturb)
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index fae9c4694186..412816076b8b 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -206,7 +206,7 @@ void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
struct net_rate_estimator *est;
- est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+ est = unrcu_pointer(xchg(rate_est, NULL));
if (est) {
timer_shutdown_sync(&est->timer);
kfree_rcu(est, rcu);
diff --git a/net/core/gro.c b/net/core/gro.c
index 0759277dc14e..0ad549b07e03 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -3,16 +3,11 @@
#include <net/dst_metadata.h>
#include <net/busy_poll.h>
#include <trace/events/net.h>
+#include <linux/skbuff_ref.h>
#define MAX_GRO_SKBS 8
-/* This should be increased if a protocol with a bigger head is added. */
-#define GRO_MAX_HEAD (MAX_HEADER + 128)
-
static DEFINE_SPINLOCK(offload_lock);
-struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base);
-/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
-int gro_normal_batch __read_mostly = 8;
/**
* dev_add_offload - register offload handlers
@@ -31,7 +26,7 @@ void dev_add_offload(struct packet_offload *po)
struct packet_offload *elem;
spin_lock(&offload_lock);
- list_for_each_entry(elem, &offload_base, list) {
+ list_for_each_entry(elem, &net_hotdata.offload_base, list) {
if (po->priority < elem->priority)
break;
}
@@ -55,7 +50,7 @@ EXPORT_SYMBOL(dev_add_offload);
*/
static void __dev_remove_offload(struct packet_offload *po)
{
- struct list_head *head = &offload_base;
+ struct list_head *head = &net_hotdata.offload_base;
struct packet_offload *po1;
spin_lock(&offload_lock);
@@ -100,7 +95,6 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
unsigned int headlen = skb_headlen(skb);
unsigned int len = skb_gro_len(skb);
unsigned int delta_truesize;
- unsigned int gro_max_size;
unsigned int new_truesize;
struct sk_buff *lp;
int segs;
@@ -114,12 +108,8 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
if (p->pp_recycle != skb->pp_recycle)
return -ETOOMANYREFS;
- /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
- gro_max_size = p->protocol == htons(ETH_P_IPV6) ?
- READ_ONCE(p->dev->gro_max_size) :
- READ_ONCE(p->dev->gro_ipv4_max_size);
-
- if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
+ if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) ||
+ NAPI_GRO_CB(skb)->flush))
return -E2BIG;
if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
@@ -195,8 +185,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
}
merge:
- /* sk owenrship - if any - completely transferred to the aggregated packet */
+ /* sk ownership - if any - completely transferred to the aggregated packet */
skb->destructor = NULL;
+ skb->sk = NULL;
delta_truesize = skb->truesize;
if (offset > headlen) {
unsigned int eat = offset - headlen;
@@ -232,12 +223,39 @@ done:
return 0;
}
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{
+ if (unlikely(p->len + skb->len >= 65536))
+ return -E2BIG;
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+
+ skb_pull(skb, skb_gro_offset(skb));
+
+ NAPI_GRO_CB(p)->last = skb;
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += skb->len;
+
+ /* sk ownership - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ p->truesize += skb->truesize;
+ p->len += skb->len;
+
+ NAPI_GRO_CB(skb)->same_flow = 1;
+
+ return 0;
+}
+
static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
{
+ struct list_head *head = &net_hotdata.offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
- struct list_head *head = &offload_base;
int err = -ENOENT;
BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
@@ -332,8 +350,6 @@ static void gro_list_prepare(const struct list_head *head,
list_for_each_entry(p, head, list) {
unsigned long diffs;
- NAPI_GRO_CB(p)->flush = 0;
-
if (hash != skb_get_hash_raw(p)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
@@ -350,7 +366,7 @@ static void gro_list_prepare(const struct list_head *head,
skb_mac_header(skb),
maclen);
- /* in most common scenarions 'slow_gro' is 0
+ /* in most common scenarios 'slow_gro' is 0
* otherwise we are already on some slower paths
* either skip all the infrequent tests altogether or
* avoid trying too hard to skip each of them individually
@@ -369,14 +385,22 @@ static void gro_list_prepare(const struct list_head *head,
static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
{
- const struct skb_shared_info *pinfo = skb_shinfo(skb);
- const skb_frag_t *frag0 = &pinfo->frags[0];
+ const struct skb_shared_info *pinfo;
+ const skb_frag_t *frag0;
+ unsigned int headlen;
+ NAPI_GRO_CB(skb)->network_offset = 0;
NAPI_GRO_CB(skb)->data_offset = 0;
- NAPI_GRO_CB(skb)->frag0 = NULL;
- NAPI_GRO_CB(skb)->frag0_len = 0;
+ headlen = skb_headlen(skb);
+ NAPI_GRO_CB(skb)->frag0 = skb->data;
+ NAPI_GRO_CB(skb)->frag0_len = headlen;
+ if (headlen)
+ return;
- if (!skb_headlen(skb) && pinfo->nr_frags &&
+ pinfo = skb_shinfo(skb);
+ frag0 = &pinfo->frags[0];
+
+ if (pinfo->nr_frags && skb_frag_page(frag0) &&
!PageHighMem(skb_frag_page(frag0)) &&
(!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
@@ -438,7 +462,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
{
u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
struct gro_list *gro_list = &napi->gro_hash[bucket];
- struct list_head *head = &offload_base;
+ struct list_head *head = &net_hotdata.offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
struct sk_buff *pp = NULL;
@@ -466,7 +490,6 @@ found_ptype:
sizeof(u32))); /* Avoid slow unaligned acc */
*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
- NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->count = 1;
if (unlikely(skb_is_gso(skb))) {
NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
@@ -544,7 +567,7 @@ normal:
struct packet_offload *gro_find_receive_by_type(__be16 type)
{
- struct list_head *offload_head = &offload_base;
+ struct list_head *offload_head = &net_hotdata.offload_base;
struct packet_offload *ptype;
list_for_each_entry_rcu(ptype, offload_head, list) {
@@ -558,7 +581,7 @@ EXPORT_SYMBOL(gro_find_receive_by_type);
struct packet_offload *gro_find_complete_by_type(__be16 type)
{
- struct list_head *offload_head = &offload_base;
+ struct list_head *offload_head = &net_hotdata.offload_base;
struct packet_offload *ptype;
list_for_each_entry_rcu(ptype, offload_head, list) {
@@ -630,6 +653,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->pkt_type = PACKET_HOST;
skb->encapsulation = 0;
+ skb->ip_summed = CHECKSUM_NONE;
skb_shinfo(skb)->gso_type = 0;
skb_shinfo(skb)->gso_size = 0;
if (unlikely(skb->slow_gro)) {
@@ -700,7 +724,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
skb_reset_mac_header(skb);
skb_gro_reset_offset(skb, hlen);
- if (unlikely(skb_gro_header_hard(skb, hlen))) {
+ if (unlikely(!skb_gro_may_pull(skb, hlen))) {
eth = skb_gro_header_slow(skb, hlen, 0);
if (unlikely(!eth)) {
net_warn_ratelimited("%s: dropping impossible skb from %s\n",
@@ -710,7 +734,10 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
}
} else {
eth = (const struct ethhdr *)skb->data;
- gro_pull_from_frag0(skb, hlen);
+
+ if (NAPI_GRO_CB(skb)->frag0 != skb->data)
+ gro_pull_from_frag0(skb, hlen);
+
NAPI_GRO_CB(skb)->frag0 += hlen;
NAPI_GRO_CB(skb)->frag0_len -= hlen;
}
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
index ed5ec5de47f6..ff8e5b64bf6b 100644
--- a/net/core/gro_cells.c
+++ b/net/core/gro_cells.c
@@ -3,6 +3,7 @@
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <net/gro_cells.h>
+#include <net/hotdata.h>
struct gro_cell {
struct sk_buff_head napi_skbs;
@@ -26,7 +27,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
cell = this_cpu_ptr(gcells->cells);
- if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) {
+ if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) {
drop:
dev_core_stats_rx_dropped_inc(dev);
kfree_skb(skb);
diff --git a/net/core/gso.c b/net/core/gso.c
index 9e1803bfc9c6..bcd156372f4d 100644
--- a/net/core/gso.c
+++ b/net/core/gso.c
@@ -17,7 +17,7 @@ struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
struct packet_offload *ptype;
rcu_read_lock();
- list_for_each_entry_rcu(ptype, &offload_base, list) {
+ list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) {
if (ptype->type == type && ptype->callbacks.gso_segment) {
segs = ptype->callbacks.gso_segment(skb, features);
break;
@@ -48,7 +48,7 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
__skb_pull(skb, vlan_depth);
rcu_read_lock();
- list_for_each_entry_rcu(ptype, &offload_base, list) {
+ list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) {
if (ptype->type == type && ptype->callbacks.gso_segment) {
segs = ptype->callbacks.gso_segment(skb, features);
break;
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
new file mode 100644
index 000000000000..d0aaaaa556f2
--- /dev/null
+++ b/net/core/hotdata.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/cache.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <net/hotdata.h>
+#include <net/proto_memory.h>
+
+struct net_hotdata net_hotdata __cacheline_aligned = {
+ .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base),
+ .ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all),
+ .gro_normal_batch = 8,
+
+ .netdev_budget = 300,
+ /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
+ .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ,
+
+ .tstamp_prequeue = 1,
+ .max_backlog = 1000,
+ .dev_tx_weight = 64,
+ .dev_rx_weight = 64,
+ .sysctl_max_skb_frags = MAX_SKB_FRAGS,
+ .sysctl_skb_defer_max = 64,
+ .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE
+};
+EXPORT_SYMBOL(net_hotdata);
diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c
new file mode 100644
index 000000000000..759a9b9f3f89
--- /dev/null
+++ b/net/core/ieee8021q_helpers.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+
+#include <linux/array_size.h>
+#include <linux/printk.h>
+#include <linux/types.h>
+#include <net/dscp.h>
+#include <net/ieee8021q.h>
+
+/* The following arrays map Traffic Types (TT) to traffic classes (TC) for
+ * different number of queues as shown in the example provided by
+ * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and
+ * Table I-1 "Traffic type to traffic class mapping".
+ */
+static const u8 ieee8021q_8queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0,
+ [IEEE8021Q_TT_BE] = 1,
+ [IEEE8021Q_TT_EE] = 2,
+ [IEEE8021Q_TT_CA] = 3,
+ [IEEE8021Q_TT_VI] = 4,
+ [IEEE8021Q_TT_VO] = 5,
+ [IEEE8021Q_TT_IC] = 6,
+ [IEEE8021Q_TT_NC] = 7,
+};
+
+static const u8 ieee8021q_7queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0,
+ [IEEE8021Q_TT_BE] = 1,
+ [IEEE8021Q_TT_EE] = 2,
+ [IEEE8021Q_TT_CA] = 3,
+ [IEEE8021Q_TT_VI] = 4, [IEEE8021Q_TT_VO] = 4,
+ [IEEE8021Q_TT_IC] = 5,
+ [IEEE8021Q_TT_NC] = 6,
+};
+
+static const u8 ieee8021q_6queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0,
+ [IEEE8021Q_TT_BE] = 1,
+ [IEEE8021Q_TT_EE] = 2, [IEEE8021Q_TT_CA] = 2,
+ [IEEE8021Q_TT_VI] = 3, [IEEE8021Q_TT_VO] = 3,
+ [IEEE8021Q_TT_IC] = 4,
+ [IEEE8021Q_TT_NC] = 5,
+};
+
+static const u8 ieee8021q_5queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1,
+ [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2,
+ [IEEE8021Q_TT_IC] = 3,
+ [IEEE8021Q_TT_NC] = 4,
+};
+
+static const u8 ieee8021q_4queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1,
+ [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2,
+ [IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3,
+};
+
+static const u8 ieee8021q_3queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+ [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1,
+ [IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2,
+};
+
+static const u8 ieee8021q_2queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+ [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1,
+ [IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1,
+};
+
+static const u8 ieee8021q_1queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+ [IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0,
+ [IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0,
+};
+
+/**
+ * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class
+ * @tt: IEEE 802.1Q Traffic Type
+ * @num_queues: Number of queues
+ *
+ * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based
+ * on the number of queues configured on the NIC. The mapping is based on the
+ * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic
+ * class mapping" and Table I-1 "Traffic type to traffic class mapping".
+ *
+ * Return: Traffic Class corresponding to the given Traffic Type or negative
+ * value in case of error.
+ */
+int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues)
+{
+ if (tt < 0 || tt >= IEEE8021Q_TT_MAX) {
+ pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt,
+ IEEE8021Q_TT_MAX);
+ return -EINVAL;
+ }
+
+ switch (num_queues) {
+ case 8:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_8queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_8queue_tt_tc_map != max - 1");
+ return ieee8021q_8queue_tt_tc_map[tt];
+ case 7:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_7queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_7queue_tt_tc_map != max - 1");
+
+ return ieee8021q_7queue_tt_tc_map[tt];
+ case 6:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_6queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_6queue_tt_tc_map != max - 1");
+
+ return ieee8021q_6queue_tt_tc_map[tt];
+ case 5:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_5queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_5queue_tt_tc_map != max - 1");
+
+ return ieee8021q_5queue_tt_tc_map[tt];
+ case 4:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_4queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_4queue_tt_tc_map != max - 1");
+
+ return ieee8021q_4queue_tt_tc_map[tt];
+ case 3:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_3queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_3queue_tt_tc_map != max - 1");
+
+ return ieee8021q_3queue_tt_tc_map[tt];
+ case 2:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_2queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_2queue_tt_tc_map != max - 1");
+
+ return ieee8021q_2queue_tt_tc_map[tt];
+ case 1:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_1queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_1queue_tt_tc_map != max - 1");
+
+ return ieee8021q_1queue_tt_tc_map[tt];
+ }
+
+ pr_err("Invalid number of queues %d\n", num_queues);
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc);
+
+/**
+ * ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type
+ * @dscp: IETF DSCP value
+ *
+ * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT).
+ * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic
+ * Type, this function is inspired by the RFC8325 documentation which describe
+ * the mapping between DSCP and 802.11 User Priority (UP) values.
+ *
+ * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value
+ */
+int ietf_dscp_to_ieee8021q_tt(u8 dscp)
+{
+ switch (dscp) {
+ case DSCP_CS0:
+ /* Comment from RFC8325:
+ * [RFC4594], Section 4.8, recommends High-Throughput Data be marked
+ * AF1x (that is, AF11, AF12, and AF13, according to the rules defined
+ * in [RFC2475]).
+ *
+ * By default (as described in Section 2.3), High-Throughput Data will
+ * map to UP 1 and, thus, to the Background Access Category (AC_BK),
+ * which is contrary to the intent expressed in [RFC4594].
+
+ * Unfortunately, there really is no corresponding fit for the High-
+ * Throughput Data service class within the constrained 4 Access
+ * Category [IEEE.802.11-2016] model. If the High-Throughput Data
+ * service class is assigned to the Best Effort Access Category (AC_BE),
+ * then it would contend with Low-Latency Data (while [RFC4594]
+ * recommends a distinction in servicing between these service classes)
+ * as well as with the default service class; alternatively, if it is
+ * assigned to the Background Access Category (AC_BK), then it would
+ * receive a less-then-best-effort service and contend with Low-Priority
+ * Data (as discussed in Section 4.2.10).
+ *
+ * As such, since there is no directly corresponding fit for the High-
+ * Throughout Data service class within the [IEEE.802.11-2016] model, it
+ * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby
+ * admitting it to the Best Effort Access Category (AC_BE).
+ *
+ * Note: The above text is from RFC8325 which is describing the mapping
+ * between DSCP and 802.11 User Priority (UP) values. The mapping
+ * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but
+ * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q
+ * Traffic Types BE and BK.
+ */
+ case DSCP_AF11:
+ case DSCP_AF12:
+ case DSCP_AF13:
+ return IEEE8021Q_TT_BE;
+ /* Comment from RFC8325:
+ * RFC3662 and RFC4594 both recommend Low-Priority Data be marked
+ * with DSCP CS1. The Low-Priority Data service class loosely
+ * corresponds to the [IEEE.802.11-2016] Background Access Category
+ */
+ case DSCP_CS1:
+ return IEEE8021Q_TT_BK;
+ case DSCP_CS2:
+ case DSCP_AF21:
+ case DSCP_AF22:
+ case DSCP_AF23:
+ return IEEE8021Q_TT_EE;
+ case DSCP_CS3:
+ case DSCP_AF31:
+ case DSCP_AF32:
+ case DSCP_AF33:
+ return IEEE8021Q_TT_CA;
+ case DSCP_CS4:
+ case DSCP_AF41:
+ case DSCP_AF42:
+ case DSCP_AF43:
+ return IEEE8021Q_TT_VI;
+ case DSCP_CS5:
+ case DSCP_EF:
+ case DSCP_VOICE_ADMIT:
+ return IEEE8021Q_TT_VO;
+ case DSCP_CS6:
+ return IEEE8021Q_TT_IC;
+ case DSCP_CS7:
+ return IEEE8021Q_TT_NC;
+ }
+
+ return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp);
+}
+EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 429571c258da..cb04ef2b9807 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -33,7 +33,7 @@ static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
static LIST_HEAD(lweventlist);
static DEFINE_SPINLOCK(lweventlist_lock);
-static unsigned char default_operstate(const struct net_device *dev)
+static unsigned int default_operstate(const struct net_device *dev)
{
if (netif_testing(dev))
return IF_OPER_TESTING;
@@ -42,12 +42,21 @@ static unsigned char default_operstate(const struct net_device *dev)
* first check whether lower is indeed the source of its down state.
*/
if (!netif_carrier_ok(dev)) {
- int iflink = dev_get_iflink(dev);
struct net_device *peer;
+ int iflink;
+
+ /* If called from netdev_run_todo()/linkwatch_sync_dev(),
+ * dev_net(dev) can be already freed, and RTNL is not held.
+ */
+ if (dev->reg_state <= NETREG_REGISTERED)
+ iflink = dev_get_iflink(dev);
+ else
+ iflink = dev->ifindex;
if (iflink == dev->ifindex)
return IF_OPER_DOWN;
+ ASSERT_RTNL();
peer = __dev_get_by_index(dev_net(dev), iflink);
if (!peer)
return IF_OPER_DOWN;
@@ -62,16 +71,13 @@ static unsigned char default_operstate(const struct net_device *dev)
return IF_OPER_UP;
}
-
static void rfc2863_policy(struct net_device *dev)
{
- unsigned char operstate = default_operstate(dev);
+ unsigned int operstate = default_operstate(dev);
- if (operstate == dev->operstate)
+ if (operstate == READ_ONCE(dev->operstate))
return;
- write_lock(&dev_base_lock);
-
switch(dev->link_mode) {
case IF_LINK_MODE_TESTING:
if (operstate == IF_OPER_UP)
@@ -87,9 +93,7 @@ static void rfc2863_policy(struct net_device *dev)
break;
}
- dev->operstate = operstate;
-
- write_unlock(&dev_base_lock);
+ WRITE_ONCE(dev->operstate, operstate);
}
@@ -153,9 +157,9 @@ static void linkwatch_schedule_work(int urgent)
* override the existing timer.
*/
if (test_bit(LW_URGENT, &linkwatch_flags))
- mod_delayed_work(system_wq, &linkwatch_work, 0);
+ mod_delayed_work(system_unbound_wq, &linkwatch_work, 0);
else
- schedule_delayed_work(&linkwatch_work, delay);
+ queue_delayed_work(system_unbound_wq, &linkwatch_work, delay);
}
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 4a0797f0a154..ae74634310a3 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -10,8 +10,10 @@
#include <linux/bpf.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
+#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/ipv6_stubs.h>
+#include <net/inet_dscp.h>
struct bpf_lwt_prog {
struct bpf_prog *prog;
@@ -38,13 +40,14 @@ static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
struct dst_entry *dst, bool can_redirect)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int ret;
- /* Migration disable and BH disable are needed to protect per-cpu
- * redirect_info between BPF prog and skb_do_redirect().
+ /* Disabling BH is needed to protect per-CPU bpf_redirect_info between
+ * BPF prog and skb_do_redirect().
*/
- migrate_disable();
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
bpf_compute_data_pointers(skb);
ret = bpf_prog_run_save_cb(lwt->prog, skb);
@@ -77,24 +80,26 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
break;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
- migrate_enable();
return ret;
}
static int bpf_lwt_input_reroute(struct sk_buff *skb)
{
+ enum skb_drop_reason reason;
int err = -EINVAL;
if (skb->protocol == htons(ETH_P_IP)) {
struct net_device *dev = skb_dst(skb)->dev;
- struct iphdr *iph = ip_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
dev_hold(dev);
skb_dst_drop(skb);
- err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, dev);
+ reason = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), dev);
+ err = reason ? -EINVAL : 0;
dev_put(dev);
} else if (skb->protocol == htons(ETH_P_IPV6)) {
skb_dst_drop(skb);
@@ -204,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
fl4.flowi4_oif = oif;
fl4.flowi4_mark = skb->mark;
fl4.flowi4_uid = sock_net_uid(net, sk);
- fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
fl4.flowi4_proto = iph->protocol;
fl4.daddr = iph->daddr;
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 711cd3b4347a..4417a18b3e95 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -23,6 +23,8 @@
#include <net/ip6_fib.h>
#include <net/rtnh.h>
+#include "dev.h"
+
DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled);
@@ -325,13 +327,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap);
int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
+ goto drop;
+ }
- if (!dst)
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
@@ -341,8 +353,11 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->output))
+ if (likely(ops && ops->output)) {
+ dev_xmit_recursion_inc();
ret = ops->output(net, sk, skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
@@ -359,13 +374,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_output);
int lwtunnel_xmit(struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
+ goto drop;
+ }
- if (!dst)
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
goto drop;
+ }
lwtstate = dst->lwtstate;
@@ -376,8 +401,11 @@ int lwtunnel_xmit(struct sk_buff *skb)
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->xmit))
+ if (likely(ops && ops->xmit)) {
+ dev_xmit_recursion_inc();
ret = ops->xmit(skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
@@ -394,13 +422,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_xmit);
int lwtunnel_input(struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
- if (!dst)
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
@@ -410,8 +448,11 @@ int lwtunnel_input(struct sk_buff *skb)
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->input))
+ if (likely(ops && ops->input)) {
+ dev_xmit_recursion_inc();
ret = ops->input(skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
diff --git a/net/core/mp_dmabuf_devmem.h b/net/core/mp_dmabuf_devmem.h
new file mode 100644
index 000000000000..67cd0dd7319c
--- /dev/null
+++ b/net/core/mp_dmabuf_devmem.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Dmabuf device memory provider.
+ *
+ * Authors: Mina Almasry <almasrymina@google.com>
+ *
+ */
+#ifndef _NET_MP_DMABUF_DEVMEM_H
+#define _NET_MP_DMABUF_DEVMEM_H
+
+#include <net/netmem.h>
+
+#if defined(CONFIG_NET_DEVMEM)
+int mp_dmabuf_devmem_init(struct page_pool *pool);
+
+netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp);
+
+void mp_dmabuf_devmem_destroy(struct page_pool *pool);
+
+bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem);
+#else
+static inline int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline netmem_ref
+mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp)
+{
+ return 0;
+}
+
+static inline void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{
+}
+
+static inline bool
+mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
+{
+ return false;
+}
+#endif
+
+#endif /* _NET_MP_DMABUF_DEVMEM_H */
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 552719c3bbc3..1a620f903c56 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -14,7 +14,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/slab.h>
-#include <linux/kmemleak.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -61,6 +60,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
static const struct seq_operations neigh_stat_seq_ops;
#endif
+static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family)
+{
+ int i;
+
+ switch (family) {
+ default:
+ DEBUG_NET_WARN_ON_ONCE(1);
+ fallthrough; /* to avoid panic by null-ptr-deref */
+ case AF_INET:
+ i = NEIGH_ARP_TABLE;
+ break;
+ case AF_INET6:
+ i = NEIGH_ND_TABLE;
+ break;
+ }
+
+ return &dev->neighbours[i];
+}
+
/*
Neighbour hash table buckets are protected with rwlock tbl->lock.
@@ -205,18 +223,14 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify,
}
}
-static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
- struct neigh_table *tbl)
+bool neigh_remove_one(struct neighbour *n)
{
bool retval = false;
write_lock(&n->lock);
if (refcount_read(&n->refcnt) == 1) {
- struct neighbour *neigh;
-
- neigh = rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock));
- rcu_assign_pointer(*np, neigh);
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
retval = true;
}
@@ -226,29 +240,6 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
return retval;
}
-bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
-{
- struct neigh_hash_table *nht;
- void *pkey = ndel->primary_key;
- u32 hash_val;
- struct neighbour *n;
- struct neighbour __rcu **np;
-
- nht = rcu_dereference_protected(tbl->nht,
- lockdep_is_held(&tbl->lock));
- hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd);
- hash_val = hash_val >> (32 - nht->hash_shift);
-
- np = &nht->hash_buckets[hash_val];
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock)))) {
- if (n == ndel)
- return neigh_del(n, np, tbl);
- np = &n->next;
- }
- return false;
-}
-
static int neigh_forced_gc(struct neigh_table *tbl)
{
int max_clean = atomic_read(&tbl->gc_entries) -
@@ -276,7 +267,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
remove = true;
write_unlock(&n->lock);
- if (remove && neigh_remove_one(n, tbl))
+ if (remove && neigh_remove_one(n))
shrunk++;
if (shrunk >= max_clean)
break;
@@ -380,54 +371,42 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net,
static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
bool skip_perm)
{
- int i;
- struct neigh_hash_table *nht;
+ struct hlist_head *dev_head;
+ struct hlist_node *tmp;
+ struct neighbour *n;
- nht = rcu_dereference_protected(tbl->nht,
- lockdep_is_held(&tbl->lock));
+ dev_head = neigh_get_dev_table(dev, tbl->family);
- for (i = 0; i < (1 << nht->hash_shift); i++) {
- struct neighbour *n;
- struct neighbour __rcu **np = &nht->hash_buckets[i];
+ hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) {
+ if (skip_perm && n->nud_state & NUD_PERMANENT)
+ continue;
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
- if (dev && n->dev != dev) {
- np = &n->next;
- continue;
- }
- if (skip_perm && n->nud_state & NUD_PERMANENT) {
- np = &n->next;
- continue;
- }
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
- write_lock(&n->lock);
- neigh_del_timer(n);
- neigh_mark_dead(n);
- if (refcount_read(&n->refcnt) != 1) {
- /* The most unpleasant situation.
- We must destroy neighbour entry,
- but someone still uses it.
-
- The destroy will be delayed until
- the last user releases us, but
- we must kill timers etc. and move
- it to safe state.
- */
- __skb_queue_purge(&n->arp_queue);
- n->arp_queue_len_bytes = 0;
- WRITE_ONCE(n->output, neigh_blackhole);
- if (n->nud_state & NUD_VALID)
- n->nud_state = NUD_NOARP;
- else
- n->nud_state = NUD_NONE;
- neigh_dbg(2, "neigh %p is stray\n", n);
- }
- write_unlock(&n->lock);
- neigh_cleanup_and_release(n);
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
+ write_lock(&n->lock);
+ neigh_del_timer(n);
+ neigh_mark_dead(n);
+ if (refcount_read(&n->refcnt) != 1) {
+ /* The most unpleasant situation.
+ * We must destroy neighbour entry,
+ * but someone still uses it.
+ *
+ * The destroy will be delayed until
+ * the last user releases us, but
+ * we must kill timers etc. and move
+ * it to safe state.
+ */
+ __skb_queue_purge(&n->arp_queue);
+ n->arp_queue_len_bytes = 0;
+ WRITE_ONCE(n->output, neigh_blackhole);
+ if (n->nud_state & NUD_VALID)
+ n->nud_state = NUD_NOARP;
+ else
+ n->nud_state = NUD_NONE;
+ neigh_dbg(2, "neigh %p is stray\n", n);
}
+ write_unlock(&n->lock);
+ neigh_cleanup_and_release(n);
}
}
@@ -530,27 +509,21 @@ static void neigh_get_hash_rnd(u32 *x)
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
- size_t size = (1 << shift) * sizeof(struct neighbour *);
+ size_t size = (1 << shift) * sizeof(struct hlist_head);
+ struct hlist_head *hash_heads;
struct neigh_hash_table *ret;
- struct neighbour __rcu **buckets;
int i;
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
return NULL;
- if (size <= PAGE_SIZE) {
- buckets = kzalloc(size, GFP_ATOMIC);
- } else {
- buckets = (struct neighbour __rcu **)
- __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
- get_order(size));
- kmemleak_alloc(buckets, size, 1, GFP_ATOMIC);
- }
- if (!buckets) {
+
+ hash_heads = kvzalloc(size, GFP_ATOMIC);
+ if (!hash_heads) {
kfree(ret);
return NULL;
}
- ret->hash_buckets = buckets;
+ ret->hash_heads = hash_heads;
ret->hash_shift = shift;
for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
neigh_get_hash_rnd(&ret->hash_rnd[i]);
@@ -562,15 +535,8 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
struct neigh_hash_table *nht = container_of(head,
struct neigh_hash_table,
rcu);
- size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
- struct neighbour __rcu **buckets = nht->hash_buckets;
- if (size <= PAGE_SIZE) {
- kfree(buckets);
- } else {
- kmemleak_free(buckets);
- free_pages((unsigned long)buckets, get_order(size));
- }
+ kvfree(nht->hash_heads);
kfree(nht);
}
@@ -589,24 +555,17 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
return old_nht;
for (i = 0; i < (1 << old_nht->hash_shift); i++) {
- struct neighbour *n, *next;
+ struct hlist_node *tmp;
+ struct neighbour *n;
- for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
- lockdep_is_held(&tbl->lock));
- n != NULL;
- n = next) {
+ neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) {
hash = tbl->hash(n->primary_key, n->dev,
new_nht->hash_rnd);
hash >>= (32 - new_nht->hash_shift);
- next = rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock));
- rcu_assign_pointer(n->next,
- rcu_dereference_protected(
- new_nht->hash_buckets[hash],
- lockdep_is_held(&tbl->lock)));
- rcu_assign_pointer(new_nht->hash_buckets[hash], n);
+ hlist_del_rcu(&n->hash);
+ hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]);
}
}
@@ -693,11 +652,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_tbl_unlock;
}
- for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
- lockdep_is_held(&tbl->lock));
- n1 != NULL;
- n1 = rcu_dereference_protected(n1->next,
- lockdep_is_held(&tbl->lock))) {
+ neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) {
if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
if (want_ref)
neigh_hold(n1);
@@ -713,10 +668,11 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
list_add_tail(&n->managed_list, &n->tbl->managed_list);
if (want_ref)
neigh_hold(n);
- rcu_assign_pointer(n->next,
- rcu_dereference_protected(nht->hash_buckets[hash_val],
- lockdep_is_held(&tbl->lock)));
- rcu_assign_pointer(nht->hash_buckets[hash_val], n);
+ hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]);
+
+ hlist_add_head_rcu(&n->dev_list,
+ neigh_get_dev_table(dev, tbl->family));
+
write_unlock_bh(&tbl->lock);
neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
@@ -734,7 +690,9 @@ out_neigh_release:
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, bool want_ref)
{
- return ___neigh_create(tbl, pkey, dev, 0, false, want_ref);
+ bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK);
+
+ return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref);
}
EXPORT_SYMBOL(__neigh_create);
@@ -946,10 +904,10 @@ static void neigh_connect(struct neighbour *neigh)
static void neigh_periodic_work(struct work_struct *work)
{
struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
+ struct neigh_hash_table *nht;
+ struct hlist_node *tmp;
struct neighbour *n;
- struct neighbour __rcu **np;
unsigned int i;
- struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
@@ -974,10 +932,7 @@ static void neigh_periodic_work(struct work_struct *work)
goto out;
for (i = 0 ; i < (1 << nht->hash_shift); i++) {
- np = &nht->hash_buckets[i];
-
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
unsigned int state;
write_lock(&n->lock);
@@ -986,7 +941,7 @@ static void neigh_periodic_work(struct work_struct *work)
if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
(n->flags & NTF_EXT_LEARNED)) {
write_unlock(&n->lock);
- goto next_elt;
+ continue;
}
if (time_before(n->used, n->confirmed) &&
@@ -997,18 +952,14 @@ static void neigh_periodic_work(struct work_struct *work)
(state == NUD_FAILED ||
!time_in_range_open(jiffies, n->used,
n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
continue;
}
write_unlock(&n->lock);
-
-next_elt:
- np = &n->next;
}
/*
* It's fine to release lock here, even if hash table
@@ -1769,7 +1720,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms)
static struct lock_class_key neigh_table_proxy_queue_class;
-static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
+static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
void neigh_table_init(int index, struct neigh_table *tbl)
{
@@ -1826,13 +1777,19 @@ void neigh_table_init(int index, struct neigh_table *tbl)
tbl->last_flush = now;
tbl->last_rand = now + tbl->parms.reachable_time * 20;
- neigh_tables[index] = tbl;
+ rcu_assign_pointer(neigh_tables[index], tbl);
}
EXPORT_SYMBOL(neigh_table_init);
+/*
+ * Only called from ndisc_cleanup(), which means this is dead code
+ * because we no longer can unload IPv6 module.
+ */
int neigh_table_clear(int index, struct neigh_table *tbl)
{
- neigh_tables[index] = NULL;
+ RCU_INIT_POINTER(neigh_tables[index], NULL);
+ synchronize_rcu();
+
/* It is not clean... Fix it to unload IPv6 module safely */
cancel_delayed_work_sync(&tbl->managed_work);
cancel_delayed_work_sync(&tbl->gc_work);
@@ -1864,10 +1821,10 @@ static struct neigh_table *neigh_find_table(int family)
switch (family) {
case AF_INET:
- tbl = neigh_tables[NEIGH_ARP_TABLE];
+ tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]);
break;
case AF_INET6:
- tbl = neigh_tables[NEIGH_ND_TABLE];
+ tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]);
break;
}
@@ -1949,7 +1906,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
NETLINK_CB(skb).portid, extack);
write_lock_bh(&tbl->lock);
neigh_release(neigh);
- neigh_remove_one(neigh, tbl);
+ neigh_remove_one(neigh);
write_unlock_bh(&tbl->lock);
out:
@@ -2293,6 +2250,7 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_IFINDEX] = { .type = NLA_U32 },
[NDTPA_QUEUE_LEN] = { .type = NLA_U32 },
+ [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 },
[NDTPA_PROXY_QLEN] = { .type = NLA_U32 },
[NDTPA_APP_PROBES] = { .type = NLA_U32 },
[NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
@@ -2331,7 +2289,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
ndtmsg = nlmsg_data(nlh);
for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
- tbl = neigh_tables[tidx];
+ tbl = rcu_dereference_rtnl(neigh_tables[tidx]);
if (!tbl)
continue;
if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
@@ -2519,7 +2477,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
struct neigh_parms *p;
- tbl = neigh_tables[tidx];
+ tbl = rcu_dereference_rtnl(neigh_tables[tidx]);
if (!tbl)
continue;
@@ -2674,7 +2632,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx)
if (!master_idx)
return false;
- master = dev ? netdev_master_upper_dev_get(dev) : NULL;
+ master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL;
/* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another
* invalid value for ifindex to denote "no master".
@@ -2707,7 +2665,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
{
struct net *net = sock_net(skb->sk);
struct neighbour *n;
- int rc, h, s_h = cb->args[1];
+ int err = 0, h, s_h = cb->args[1];
int idx, s_idx = idx = cb->args[2];
struct neigh_hash_table *nht;
unsigned int flags = NLM_F_MULTI;
@@ -2715,37 +2673,31 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
if (filter->dev_idx || filter->master_idx)
flags |= NLM_F_DUMP_FILTERED;
- rcu_read_lock();
nht = rcu_dereference(tbl->nht);
for (h = s_h; h < (1 << nht->hash_shift); h++) {
if (h > s_h)
s_idx = 0;
- for (n = rcu_dereference(nht->hash_buckets[h]), idx = 0;
- n != NULL;
- n = rcu_dereference(n->next)) {
+ idx = 0;
+ neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) {
if (idx < s_idx || !net_eq(dev_net(n->dev), net))
goto next;
if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
neigh_master_filtered(n->dev, filter->master_idx))
goto next;
- if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH,
- flags) < 0) {
- rc = -1;
+ err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH, flags);
+ if (err < 0)
goto out;
- }
next:
idx++;
}
}
- rc = skb->len;
out:
- rcu_read_unlock();
cb->args[1] = h;
cb->args[2] = idx;
- return rc;
+ return err;
}
static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
@@ -2754,7 +2706,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
{
struct pneigh_entry *n;
struct net *net = sock_net(skb->sk);
- int rc, h, s_h = cb->args[3];
+ int err = 0, h, s_h = cb->args[3];
int idx, s_idx = idx = cb->args[4];
unsigned int flags = NLM_F_MULTI;
@@ -2772,11 +2724,11 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
neigh_master_filtered(n->dev, filter->master_idx))
goto next;
- if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH, flags, tbl) < 0) {
+ err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH, flags, tbl);
+ if (err < 0) {
read_unlock_bh(&tbl->lock);
- rc = -1;
goto out;
}
next:
@@ -2785,12 +2737,10 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
}
read_unlock_bh(&tbl->lock);
- rc = skb->len;
out:
cb->args[3] = h;
cb->args[4] = idx;
- return rc;
-
+ return err;
}
static int neigh_valid_dump_req(const struct nlmsghdr *nlh,
@@ -2875,11 +2825,13 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack);
if (err < 0 && cb->strict_check)
return err;
+ err = 0;
s_t = cb->args[0];
+ rcu_read_lock();
for (t = 0; t < NEIGH_NR_TABLES; t++) {
- tbl = neigh_tables[t];
+ tbl = rcu_dereference(neigh_tables[t]);
if (!tbl)
continue;
@@ -2895,9 +2847,10 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
if (err < 0)
break;
}
+ rcu_read_unlock();
cb->args[0] = t;
- return skb->len;
+ return err;
}
static int neigh_valid_get_req(const struct nlmsghdr *nlh,
@@ -3094,9 +3047,7 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
- for (n = rcu_dereference(nht->hash_buckets[chain]);
- n != NULL;
- n = rcu_dereference(n->next))
+ neigh_for_each_in_bucket(n, &nht->hash_heads[chain])
cb(n, cookie);
}
read_unlock_bh(&tbl->lock);
@@ -3108,29 +3059,25 @@ EXPORT_SYMBOL(neigh_for_each);
void __neigh_for_each_release(struct neigh_table *tbl,
int (*cb)(struct neighbour *))
{
- int chain;
struct neigh_hash_table *nht;
+ int chain;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
+ struct hlist_node *tmp;
struct neighbour *n;
- struct neighbour __rcu **np;
- np = &nht->hash_buckets[chain];
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) {
int release;
write_lock(&n->lock);
release = cb(n);
if (release) {
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
- } else
- np = &n->next;
+ }
write_unlock(&n->lock);
if (release)
neigh_cleanup_and_release(n);
@@ -3143,14 +3090,15 @@ int neigh_xmit(int index, struct net_device *dev,
const void *addr, struct sk_buff *skb)
{
int err = -EAFNOSUPPORT;
+
if (likely(index < NEIGH_NR_TABLES)) {
struct neigh_table *tbl;
struct neighbour *neigh;
- tbl = neigh_tables[index];
- if (!tbl)
- goto out;
rcu_read_lock();
+ tbl = rcu_dereference(neigh_tables[index]);
+ if (!tbl)
+ goto out_unlock;
if (index == NEIGH_ARP_TABLE) {
u32 key = *((u32 *)addr);
@@ -3166,6 +3114,7 @@ int neigh_xmit(int index, struct net_device *dev,
goto out_kfree_skb;
}
err = READ_ONCE(neigh->output)(neigh, skb);
+out_unlock:
rcu_read_unlock();
}
else if (index == NEIGH_LINK_TABLE) {
@@ -3185,43 +3134,53 @@ EXPORT_SYMBOL(neigh_xmit);
#ifdef CONFIG_PROC_FS
-static struct neighbour *neigh_get_first(struct seq_file *seq)
+static struct neighbour *neigh_get_valid(struct seq_file *seq,
+ struct neighbour *n,
+ loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
+
+ if (!net_eq(dev_net(n->dev), net))
+ return NULL;
+
+ if (state->neigh_sub_iter) {
+ loff_t fakep = 0;
+ void *v;
+
+ v = state->neigh_sub_iter(state, n, pos ? pos : &fakep);
+ if (!v)
+ return NULL;
+ if (pos)
+ return v;
+ }
+
+ if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+ return n;
+
+ if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
+ return n;
+
+ return NULL;
+}
+
+static struct neighbour *neigh_get_first(struct seq_file *seq)
+{
+ struct neigh_seq_state *state = seq->private;
struct neigh_hash_table *nht = state->nht;
- struct neighbour *n = NULL;
- int bucket;
+ struct neighbour *n, *tmp;
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
- for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
- n = rcu_dereference(nht->hash_buckets[bucket]);
-
- while (n) {
- if (!net_eq(dev_net(n->dev), net))
- goto next;
- if (state->neigh_sub_iter) {
- loff_t fakep = 0;
- void *v;
- v = state->neigh_sub_iter(state, n, &fakep);
- if (!v)
- goto next;
- }
- if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
- break;
- if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
- break;
-next:
- n = rcu_dereference(n->next);
+ while (++state->bucket < (1 << nht->hash_shift)) {
+ neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) {
+ tmp = neigh_get_valid(seq, n, NULL);
+ if (tmp)
+ return tmp;
}
-
- if (n)
- break;
}
- state->bucket = bucket;
- return n;
+ return NULL;
}
static struct neighbour *neigh_get_next(struct seq_file *seq,
@@ -3229,46 +3188,28 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
- struct net *net = seq_file_net(seq);
- struct neigh_hash_table *nht = state->nht;
+ struct neighbour *tmp;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
+
if (v)
return n;
}
- n = rcu_dereference(n->next);
- while (1) {
- while (n) {
- if (!net_eq(dev_net(n->dev), net))
- goto next;
- if (state->neigh_sub_iter) {
- void *v = state->neigh_sub_iter(state, n, pos);
- if (v)
- return n;
- goto next;
- }
- if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
- break;
-
- if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
- break;
-next:
- n = rcu_dereference(n->next);
+ hlist_for_each_entry_continue(n, hash) {
+ tmp = neigh_get_valid(seq, n, pos);
+ if (tmp) {
+ n = tmp;
+ goto out;
}
-
- if (n)
- break;
-
- if (++state->bucket >= (1 << nht->hash_shift))
- break;
-
- n = rcu_dereference(nht->hash_buckets[state->bucket]);
}
+ n = neigh_get_first(seq);
+out:
if (n && pos)
--(*pos);
+
return n;
}
@@ -3371,7 +3312,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
struct neigh_seq_state *state = seq->private;
state->tbl = tbl;
- state->bucket = 0;
+ state->bucket = -1;
state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
rcu_read_lock();
@@ -3507,10 +3448,12 @@ static const struct seq_operations neigh_stat_seq_ops = {
static void __neigh_notify(struct neighbour *n, int type, int flags,
u32 pid)
{
- struct net *net = dev_net(n->dev);
struct sk_buff *skb;
int err = -ENOBUFS;
+ struct net *net;
+ rcu_read_lock();
+ net = dev_net_rcu(n->dev);
skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
goto errout;
@@ -3523,10 +3466,11 @@ static void __neigh_notify(struct neighbour *n, int type, int flags,
goto errout;
}
rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
- return;
+ goto out;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+out:
+ rcu_read_unlock();
}
void neigh_app_ns(struct neighbour *n)
@@ -3538,7 +3482,7 @@ EXPORT_SYMBOL(neigh_app_ns);
#ifdef CONFIG_SYSCTL
static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
-static int proc_unres_qlen(struct ctl_table *ctl, int write,
+static int proc_unres_qlen(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int size, ret;
@@ -3573,7 +3517,7 @@ static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
rcu_read_unlock();
}
-static void neigh_proc_update(struct ctl_table *ctl, int write)
+static void neigh_proc_update(const struct ctl_table *ctl, int write)
{
struct net_device *dev = ctl->extra1;
struct neigh_parms *p = ctl->extra2;
@@ -3590,7 +3534,7 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
neigh_copy_dflt_parms(net, p, index);
}
-static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
+static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -3605,7 +3549,7 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
return ret;
}
-static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write,
+static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table tmp = *ctl;
@@ -3621,7 +3565,7 @@ static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int wr
return ret;
}
-int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer,
+int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
@@ -3631,7 +3575,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer,
}
EXPORT_SYMBOL(neigh_proc_dointvec);
-int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer,
+int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3641,7 +3585,7 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer,
}
EXPORT_SYMBOL(neigh_proc_dointvec_jiffies);
-static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
+static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -3651,7 +3595,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
return ret;
}
-int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
+int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3661,7 +3605,7 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
}
EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
-static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
+static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -3671,7 +3615,7 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
return ret;
}
-static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
+static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -3728,7 +3672,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
static struct neigh_sysctl_table {
struct ctl_table_header *sysctl_header;
- struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
+ struct ctl_table neigh_vars[NEIGH_VAR_MAX];
} neigh_sysctl_template __read_mostly = {
.neigh_vars = {
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"),
@@ -3779,7 +3723,6 @@ static struct neigh_sysctl_table {
.extra2 = SYSCTL_INT_MAX,
.proc_handler = proc_dointvec_minmax,
},
- {},
},
};
@@ -3807,8 +3750,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
if (dev) {
dev_name_source = dev->name;
/* Terminate the table early */
- memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
- sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1;
} else {
struct neigh_table *tbl = p->tbl;
@@ -3885,16 +3826,18 @@ EXPORT_SYMBOL(neigh_sysctl_unregister);
#endif /* CONFIG_SYSCTL */
+static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWNEIGH, .doit = neigh_add},
+ {.msgtype = RTM_DELNEIGH, .doit = neigh_delete},
+ {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+ {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info},
+ {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set},
+};
+
static int __init neigh_init(void)
{
- rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0);
-
- rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,
- 0);
- rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0);
-
+ rtnl_register_many(neigh_rtnl_msg_handlers);
return 0;
}
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 09f7ed1a04e8..fa6d3969734a 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -3,52 +3,22 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/wext.h>
+#include <net/hotdata.h>
#include "dev.h"
-#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
-
-#define get_bucket(x) ((x) >> BUCKET_SPACE)
-#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
-#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
-
-static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
+static void *dev_seq_from_index(struct seq_file *seq, loff_t *pos)
{
- struct net *net = seq_file_net(seq);
+ unsigned long ifindex = *pos;
struct net_device *dev;
- struct hlist_head *h;
- unsigned int count = 0, offset = get_offset(*pos);
- h = &net->dev_index_head[get_bucket(*pos)];
- hlist_for_each_entry_rcu(dev, h, index_hlist) {
- if (++count == offset)
- return dev;
+ for_each_netdev_dump(seq_file_net(seq), dev, ifindex) {
+ *pos = dev->ifindex;
+ return dev;
}
-
- return NULL;
-}
-
-static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
-{
- struct net_device *dev;
- unsigned int bucket;
-
- do {
- dev = dev_from_same_bucket(seq, pos);
- if (dev)
- return dev;
-
- bucket = get_bucket(*pos) + 1;
- *pos = set_bucket_offset(bucket, 1);
- } while (bucket < NETDEV_HASHENTRIES);
-
return NULL;
}
-/*
- * This is invoked by the /proc filesystem handler to display a device
- * in detail.
- */
static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
@@ -56,16 +26,13 @@ static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
if (!*pos)
return SEQ_START_TOKEN;
- if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
- return NULL;
-
- return dev_from_bucket(seq, pos);
+ return dev_seq_from_index(seq, pos);
}
static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
- return dev_from_bucket(seq, pos);
+ return dev_seq_from_index(seq, pos);
}
static void dev_seq_stop(struct seq_file *seq, void *v)
@@ -177,7 +144,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x\n",
- sd->processed, sd->dropped, sd->time_squeeze, 0,
+ sd->processed, atomic_read(&sd->dropped),
+ sd->time_squeeze, 0,
0, 0, 0, 0, /* was fastroute */
0, /* was cpu_collision */
sd->received_rps, flow_limit_count,
@@ -217,7 +185,7 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
}
}
- list_for_each_entry_rcu(pt, &ptype_all, list) {
+ list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) {
if (i == pos)
return pt;
++i;
@@ -265,13 +233,13 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
}
- nxt = ptype_all.next;
+ nxt = net_hotdata.ptype_all.next;
goto ptype_all;
}
if (pt->type == htons(ETH_P_ALL)) {
ptype_all:
- if (nxt != &ptype_all)
+ if (nxt != &net_hotdata.ptype_all)
goto found;
hash = 0;
nxt = ptype_base[0].next;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index a09d507c5b03..07cb99b114bd 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -24,6 +24,7 @@
#include <linux/of_net.h>
#include <linux/cpu.h>
#include <net/netdev_rx_queue.h>
+#include <net/rps.h>
#include "dev.h"
#include "net-sysfs.h"
@@ -31,13 +32,14 @@
#ifdef CONFIG_SYSFS
static const char fmt_hex[] = "%#x\n";
static const char fmt_dec[] = "%d\n";
+static const char fmt_uint[] = "%u\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
-/* Caller holds RTNL or dev_base_lock */
+/* Caller holds RTNL, netdev->lock or RCU */
static inline int dev_isalive(const struct net_device *dev)
{
- return dev->reg_state <= NETREG_REGISTERED;
+ return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED;
}
/* use same locking rules as GIF* ioctl's */
@@ -48,10 +50,10 @@ static ssize_t netdev_show(const struct device *dev,
struct net_device *ndev = to_net_dev(dev);
ssize_t ret = -EINVAL;
- read_lock(&dev_base_lock);
+ rcu_read_lock();
if (dev_isalive(ndev))
ret = (*format)(ndev, buf);
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
return ret;
}
@@ -60,7 +62,7 @@ static ssize_t netdev_show(const struct device *dev,
#define NETDEVICE_SHOW(field, format_string) \
static ssize_t format_##field(const struct net_device *dev, char *buf) \
{ \
- return sysfs_emit(buf, format_string, dev->field); \
+ return sysfs_emit(buf, format_string, READ_ONCE(dev->field)); \
} \
static ssize_t field##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
@@ -106,6 +108,36 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
return ret;
}
+/* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */
+static ssize_t
+netdev_lock_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len,
+ int (*set)(struct net_device *, unsigned long))
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
+ unsigned long new;
+ int ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 0, &new);
+ if (ret)
+ return ret;
+
+ netdev_lock(netdev);
+
+ if (dev_isalive(netdev)) {
+ ret = (*set)(netdev, new);
+ if (ret == 0)
+ ret = len;
+ }
+ netdev_unlock(netdev);
+
+ return ret;
+}
+
NETDEVICE_SHOW_RO(dev_id, fmt_hex);
NETDEVICE_SHOW_RO(dev_port, fmt_dec);
NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);
@@ -125,7 +157,7 @@ static DEVICE_ATTR_RO(iflink);
static ssize_t format_name_assign_type(const struct net_device *dev, char *buf)
{
- return sysfs_emit(buf, fmt_dec, dev->name_assign_type);
+ return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type));
}
static ssize_t name_assign_type_show(struct device *dev,
@@ -135,24 +167,28 @@ static ssize_t name_assign_type_show(struct device *dev,
struct net_device *ndev = to_net_dev(dev);
ssize_t ret = -EINVAL;
- if (ndev->name_assign_type != NET_NAME_UNKNOWN)
+ if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN)
ret = netdev_show(dev, attr, buf, format_name_assign_type);
return ret;
}
static DEVICE_ATTR_RO(name_assign_type);
-/* use same locking rules as GIFHWADDR ioctl's */
+/* use same locking rules as GIFHWADDR ioctl's (dev_get_mac_address()) */
static ssize_t address_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct net_device *ndev = to_net_dev(dev);
ssize_t ret = -EINVAL;
- read_lock(&dev_base_lock);
+ down_read(&dev_addr_sem);
+
+ rcu_read_lock();
if (dev_isalive(ndev))
ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len);
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
+
+ up_read(&dev_addr_sem);
return ret;
}
static DEVICE_ATTR_RO(address);
@@ -161,10 +197,13 @@ static ssize_t broadcast_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *ndev = to_net_dev(dev);
+ int ret = -EINVAL;
+ rcu_read_lock();
if (dev_isalive(ndev))
- return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len);
- return -EINVAL;
+ ret = sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len);
+ rcu_read_unlock();
+ return ret;
}
static DEVICE_ATTR_RO(broadcast);
@@ -227,7 +266,7 @@ static ssize_t speed_show(struct device *dev,
if (!rtnl_trylock())
return restart_syscall();
- if (netif_running(netdev) && netif_device_present(netdev)) {
+ if (netif_running(netdev)) {
struct ethtool_link_ksettings cmd;
if (!__ethtool_get_link_ksettings(netdev, &cmd))
@@ -318,11 +357,9 @@ static ssize_t operstate_show(struct device *dev,
const struct net_device *netdev = to_net_dev(dev);
unsigned char operstate;
- read_lock(&dev_base_lock);
- operstate = netdev->operstate;
+ operstate = READ_ONCE(netdev->operstate);
if (!netif_running(netdev))
operstate = IF_OPER_DOWN;
- read_unlock(&dev_base_lock);
if (operstate >= ARRAY_SIZE(operstates))
return -EINVAL; /* should not happen */
@@ -402,7 +439,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
{
- WRITE_ONCE(dev->gro_flush_timeout, val);
+ netdev_set_gro_flush_timeout(dev, val);
return 0;
}
@@ -413,13 +450,16 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return netdev_store(dev, attr, buf, len, change_gro_flush_timeout);
+ return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout);
}
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val)
{
- WRITE_ONCE(dev->napi_defer_hard_irqs, val);
+ if (val > S32_MAX)
+ return -ERANGE;
+
+ netdev_set_defer_hard_irqs(dev, (u32)val);
return 0;
}
@@ -430,9 +470,10 @@ static ssize_t napi_defer_hard_irqs_store(struct device *dev,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs);
+ return netdev_lock_store(dev, attr, buf, len,
+ change_napi_defer_hard_irqs);
}
-NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec);
+NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint);
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
@@ -599,13 +640,13 @@ static ssize_t threaded_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
ssize_t ret = -EINVAL;
- if (!rtnl_trylock())
- return restart_syscall();
+ rcu_read_lock();
if (dev_isalive(netdev))
- ret = sysfs_emit(buf, fmt_dec, netdev->threaded);
+ ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded));
+
+ rcu_read_unlock();
- rtnl_unlock();
return ret;
}
@@ -628,7 +669,7 @@ static ssize_t threaded_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return netdev_store(dev, attr, buf, len, modify_napi_threaded);
+ return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded);
}
static DEVICE_ATTR_RW(threaded);
@@ -680,14 +721,14 @@ static ssize_t netstat_show(const struct device *d,
WARN_ON(offset > sizeof(struct rtnl_link_stats64) ||
offset % sizeof(u64) != 0);
- read_lock(&dev_base_lock);
+ rcu_read_lock();
if (dev_isalive(dev)) {
struct rtnl_link_stats64 temp;
const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset));
}
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
return ret;
}
@@ -1050,7 +1091,7 @@ static const void *rx_queue_namespace(const struct kobject *kobj)
struct device *dev = &queue->dev->dev;
const void *ns = NULL;
- if (dev->class && dev->class->ns_type)
+ if (dev->class && dev->class->namespace)
ns = dev->class->namespace(dev);
return ns;
@@ -1409,6 +1450,65 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
= __ATTR(hold_time, 0644,
bql_show_hold_time, bql_set_hold_time);
+static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs));
+}
+
+static ssize_t bql_set_stall_thrs(struct netdev_queue *queue,
+ const char *buf, size_t len)
+{
+ struct dql *dql = &queue->dql;
+ unsigned int value;
+ int err;
+
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+
+ value = msecs_to_jiffies(value);
+ if (value && (value < 4 || value > 4 / 2 * BITS_PER_LONG))
+ return -ERANGE;
+
+ if (!dql->stall_thrs && value)
+ dql->last_reap = jiffies;
+ /* Force last_reap to be live */
+ smp_wmb();
+ dql->stall_thrs = value;
+
+ return len;
+}
+
+static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init =
+ __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs);
+
+static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max));
+}
+
+static ssize_t bql_set_stall_max(struct netdev_queue *queue,
+ const char *buf, size_t len)
+{
+ WRITE_ONCE(queue->dql.stall_max, 0);
+ return len;
+}
+
+static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init =
+ __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max);
+
+static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sysfs_emit(buf, "%lu\n", dql->stall_cnt);
+}
+
+static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init =
+ __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL);
+
static ssize_t bql_show_inflight(struct netdev_queue *queue,
char *buf)
{
@@ -1447,6 +1547,9 @@ static struct attribute *dql_attrs[] __ro_after_init = {
&bql_limit_min_attribute.attr,
&bql_hold_time_attribute.attr,
&bql_inflight_attribute.attr,
+ &bql_stall_thrs_attribute.attr,
+ &bql_stall_cnt_attribute.attr,
+ &bql_stall_max_attribute.attr,
NULL
};
@@ -1454,6 +1557,9 @@ static const struct attribute_group dql_group = {
.name = "byte_queue_limits",
.attrs = dql_attrs,
};
+#else
+/* Fake declaration, all the code using it should be dead */
+static const struct attribute_group dql_group = {};
#endif /* CONFIG_BQL */
#ifdef CONFIG_XPS
@@ -1669,7 +1775,7 @@ static const void *netdev_queue_namespace(const struct kobject *kobj)
struct device *dev = &queue->dev->dev;
const void *ns = NULL;
- if (dev->class && dev->class->ns_type)
+ if (dev->class && dev->class->namespace)
ns = dev->class->namespace(dev);
return ns;
@@ -1691,6 +1797,14 @@ static const struct kobj_type netdev_queue_ktype = {
.get_ownership = netdev_queue_get_ownership,
};
+static bool netdev_uses_bql(const struct net_device *dev)
+{
+ if (dev->lltx || (dev->priv_flags & IFF_NO_QUEUE))
+ return false;
+
+ return IS_ENABLED(CONFIG_BQL);
+}
+
static int netdev_queue_add_kobject(struct net_device *dev, int index)
{
struct netdev_queue *queue = dev->_tx + index;
@@ -1708,11 +1822,11 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
if (error)
goto err;
-#ifdef CONFIG_BQL
- error = sysfs_create_group(kobj, &dql_group);
- if (error)
- goto err;
-#endif
+ if (netdev_uses_bql(dev)) {
+ error = sysfs_create_group(kobj, &dql_group);
+ if (error)
+ goto err;
+ }
kobject_uevent(kobj, KOBJ_ADD);
return 0;
@@ -1733,9 +1847,9 @@ static int tx_queue_change_owner(struct net_device *ndev, int index,
if (error)
return error;
-#ifdef CONFIG_BQL
- error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid);
-#endif
+ if (netdev_uses_bql(ndev))
+ error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid);
+
return error;
}
#endif /* CONFIG_SYSFS */
@@ -1767,9 +1881,10 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
if (!refcount_read(&dev_net(dev)->ns.count))
queue->kobj.uevent_suppress = 1;
-#ifdef CONFIG_BQL
- sysfs_remove_group(&queue->kobj, &dql_group);
-#endif
+
+ if (netdev_uses_bql(dev))
+ sysfs_remove_group(&queue->kobj, &dql_group);
+
kobject_put(&queue->kobj);
}
@@ -1947,7 +2062,7 @@ static void netdev_release(struct device *d)
* device is dead and about to be freed.
*/
kfree(rcu_access_pointer(dev->ifalias));
- netdev_freemem(dev);
+ kvfree(dev);
}
static const void *net_namespace(const struct device *d)
@@ -1965,7 +2080,7 @@ static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid)
net_ns_get_ownership(net, uid, gid);
}
-static struct class net_class __ro_after_init = {
+static const struct class net_class = {
.name = "net",
.dev_release = netdev_release,
.dev_groups = net_class_groups,
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 6aef976bc1da..f2fa34b1d78d 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -23,7 +23,7 @@
#include <linux/net_dropmon.h>
#include <linux/slab.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <asm/bitops.h>
#define CREATE_TRACE_POINTS
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 72799533426b..4303f2a49262 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -56,7 +56,6 @@ static bool init_net_initialized;
* outside.
*/
DECLARE_RWSEM(pernet_ops_rwsem);
-EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
#define MIN_PERNET_OPS_ID \
((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
@@ -69,12 +68,15 @@ DEFINE_COOKIE(net_cookie);
static struct net_generic *net_alloc_generic(void)
{
+ unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs);
+ unsigned int generic_size;
struct net_generic *ng;
- unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+
+ generic_size = offsetof(struct net_generic, ptr[gen_ptrs]);
ng = kzalloc(generic_size, GFP_KERNEL);
if (ng)
- ng->s.len = max_gen_ptrs;
+ ng->s.len = gen_ptrs;
return ng;
}
@@ -122,7 +124,7 @@ static int ops_init(const struct pernet_operations *ops, struct net *net)
int err = -ENOMEM;
void *data = NULL;
- if (ops->id && ops->size) {
+ if (ops->id) {
data = kzalloc(ops->size, GFP_KERNEL);
if (!data)
goto out;
@@ -137,7 +139,7 @@ static int ops_init(const struct pernet_operations *ops, struct net *net)
if (!err)
return 0;
- if (ops->id && ops->size) {
+ if (ops->id) {
ng = rcu_dereference_protected(net->gen,
lockdep_is_held(&pernet_ops_rwsem));
ng->ptr[*ops->id] = NULL;
@@ -179,7 +181,8 @@ static void ops_free_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
struct net *net;
- if (ops->size && ops->id) {
+
+ if (ops->id) {
list_for_each_entry(net, net_exit_list, exit_list)
kfree(net_generic(net, *ops->id));
}
@@ -305,35 +308,55 @@ struct net *get_net_ns_by_id(const struct net *net, int id)
}
EXPORT_SYMBOL_GPL(get_net_ns_by_id);
+static __net_init void preinit_net_sysctl(struct net *net)
+{
+ net->core.sysctl_somaxconn = SOMAXCONN;
+ /* Limits per socket sk_omem_alloc usage.
+ * TCP zerocopy regular usage needs 128 KB.
+ */
+ net->core.sysctl_optmem_max = 128 * 1024;
+ net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
+ net->core.sysctl_tstamp_allow_data = 1;
+}
+
/* init code that must occur even if setup_net() is not called. */
-static __net_init void preinit_net(struct net *net)
+static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
{
+ refcount_set(&net->passive, 1);
+ refcount_set(&net->ns.count, 1);
+ ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt");
+
+ get_random_bytes(&net->hash_mix, sizeof(u32));
+ net->dev_base_seq = 1;
+ net->user_ns = user_ns;
+
+ idr_init(&net->netns_ids);
+ spin_lock_init(&net->nsid_lock);
+ mutex_init(&net->ipv4.ra_mutex);
+
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+ mutex_init(&net->rtnl_mutex);
+ lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
+#endif
+
+ preinit_net_sysctl(net);
}
/*
* setup_net runs the initializers for the network namespace object.
*/
-static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
+static __net_init int setup_net(struct net *net)
{
/* Must be called with pernet_ops_rwsem held */
const struct pernet_operations *ops, *saved_ops;
- int error = 0;
LIST_HEAD(net_exit_list);
+ LIST_HEAD(dev_kill_list);
+ int error = 0;
- refcount_set(&net->ns.count, 1);
- ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
-
- refcount_set(&net->passive, 1);
- get_random_bytes(&net->hash_mix, sizeof(u32));
preempt_disable();
net->net_cookie = gen_cookie_next(&net_cookie);
preempt_enable();
- net->dev_base_seq = 1;
- net->user_ns = user_ns;
- idr_init(&net->netns_ids);
- spin_lock_init(&net->nsid_lock);
- mutex_init(&net->ipv4.ra_mutex);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
@@ -358,6 +381,15 @@ out_undo:
synchronize_rcu();
ops = saved_ops;
+ rtnl_lock();
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
+ if (ops->exit_batch_rtnl)
+ ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
+ }
+ unregister_netdevice_many(&dev_kill_list);
+ rtnl_unlock();
+
+ ops = saved_ops;
list_for_each_entry_continue_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list);
@@ -369,32 +401,6 @@ out_undo:
goto out;
}
-static int __net_init net_defaults_init_net(struct net *net)
-{
- net->core.sysctl_somaxconn = SOMAXCONN;
- /* Limits per socket sk_omem_alloc usage.
- * TCP zerocopy regular usage needs 128 KB.
- */
- net->core.sysctl_optmem_max = 128 * 1024;
- net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
-
- return 0;
-}
-
-static struct pernet_operations net_defaults_ops = {
- .init = net_defaults_init_net,
-};
-
-static __init int net_defaults_init(void)
-{
- if (register_pernet_subsys(&net_defaults_ops))
- panic("Cannot initialize net default settings");
-
- return 0;
-}
-
-core_initcall(net_defaults_init);
-
#ifdef CONFIG_NET_NS
static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
{
@@ -443,7 +449,22 @@ out_free:
goto out;
}
-static void net_free(struct net *net)
+static LLIST_HEAD(defer_free_list);
+
+static void net_complete_free(void)
+{
+ struct llist_node *kill_list;
+ struct net *net, *next;
+
+ /* Get the list of namespaces to free from last round. */
+ kill_list = llist_del_all(&defer_free_list);
+
+ llist_for_each_entry_safe(net, next, kill_list, defer_free_list)
+ kmem_cache_free(net_cachep, net);
+
+}
+
+void net_passive_dec(struct net *net)
{
if (refcount_dec_and_test(&net->passive)) {
kfree(rcu_access_pointer(net->gen));
@@ -451,7 +472,8 @@ static void net_free(struct net *net)
/* There should not be any trackers left there. */
ref_tracker_dir_exit(&net->notrefcnt_tracker);
- kmem_cache_free(net_cachep, net);
+ /* Wait for an extra rcu_barrier() before final free. */
+ llist_add(&net->defer_free_list, &defer_free_list);
}
}
@@ -460,7 +482,7 @@ void net_drop_ns(void *p)
struct net *net = (struct net *)p;
if (net)
- net_free(net);
+ net_passive_dec(net);
}
struct net *copy_net_ns(unsigned long flags,
@@ -483,8 +505,7 @@ struct net *copy_net_ns(unsigned long flags,
goto dec_ucounts;
}
- preinit_net(net);
- refcount_set(&net->passive, 1);
+ preinit_net(net, user_ns);
net->ucounts = ucounts;
get_user_ns(user_ns);
@@ -492,7 +513,7 @@ struct net *copy_net_ns(unsigned long flags,
if (rv < 0)
goto put_userns;
- rv = setup_net(net, user_ns);
+ rv = setup_net(net);
up_read(&pernet_ops_rwsem);
@@ -502,7 +523,7 @@ put_userns:
key_remove_domain(net->key_domain);
#endif
put_user_ns(user_ns);
- net_free(net);
+ net_passive_dec(net);
dec_ucounts:
dec_net_namespaces(ucounts);
return ERR_PTR(rv);
@@ -567,12 +588,17 @@ static void unhash_nsid(struct net *net, struct net *last)
static LLIST_HEAD(cleanup_list);
+struct task_struct *cleanup_net_task;
+
static void cleanup_net(struct work_struct *work)
{
const struct pernet_operations *ops;
struct net *net, *tmp, *last;
struct llist_node *net_kill_list;
LIST_HEAD(net_exit_list);
+ LIST_HEAD(dev_kill_list);
+
+ cleanup_net_task = current;
/* Atomically snapshot the list of namespaces to cleanup */
net_kill_list = llist_del_all(&cleanup_list);
@@ -611,7 +637,15 @@ static void cleanup_net(struct work_struct *work)
* the rcu_barrier() below isn't sufficient alone.
* Also the pre_exit() and exit() methods need this barrier.
*/
- synchronize_rcu();
+ synchronize_rcu_expedited();
+
+ rtnl_lock();
+ list_for_each_entry_reverse(ops, &pernet_list, list) {
+ if (ops->exit_batch_rtnl)
+ ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
+ }
+ unregister_netdevice_many(&dev_kill_list);
+ rtnl_unlock();
/* Run all of the network namespace exit methods */
list_for_each_entry_reverse(ops, &pernet_list, list)
@@ -628,6 +662,8 @@ static void cleanup_net(struct work_struct *work)
*/
rcu_barrier();
+ net_complete_free();
+
/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
@@ -636,8 +672,9 @@ static void cleanup_net(struct work_struct *work)
key_remove_domain(net->key_domain);
#endif
put_user_ns(net->user_ns);
- net_free(net);
+ net_passive_dec(net);
}
+ cleanup_net_task = NULL;
}
/**
@@ -671,30 +708,33 @@ EXPORT_SYMBOL_GPL(__put_net);
* get_net_ns - increment the refcount of the network namespace
* @ns: common namespace (net)
*
- * Returns the net's common namespace.
+ * Returns the net's common namespace or ERR_PTR() if ref is zero.
*/
struct ns_common *get_net_ns(struct ns_common *ns)
{
- return &get_net(container_of(ns, struct net, ns))->ns;
+ struct net *net;
+
+ net = maybe_get_net(container_of(ns, struct net, ns));
+ if (net)
+ return &net->ns;
+ return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(get_net_ns);
struct net *get_net_ns_by_fd(int fd)
{
- struct fd f = fdget(fd);
- struct net *net = ERR_PTR(-EINVAL);
+ CLASS(fd, f)(fd);
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- if (proc_ns_file(f.file)) {
- struct ns_common *ns = get_proc_ns(file_inode(f.file));
+ if (proc_ns_file(fd_file(f))) {
+ struct ns_common *ns = get_proc_ns(file_inode(fd_file(f)));
if (ns->ops == &netns_operations)
- net = get_net(container_of(ns, struct net, ns));
+ return get_net(container_of(ns, struct net, ns));
}
- fdput(f);
- return net;
+ return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
#endif
@@ -1071,7 +1111,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
end:
if (net_cb.fillargs.add_ref)
put_net(net_cb.tgt_net);
- return err < 0 ? err : skb->len;
+ return err;
}
static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
@@ -1140,13 +1180,23 @@ static void __init netns_ipv4_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_early_demux);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_l3mdev_accept);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_reordering);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_rmem);
- CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18);
+ CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22);
}
#endif
+static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid,
+ .dumpit = rtnl_net_dumpid,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+};
+
void __init net_ns_init(void)
{
struct net_generic *ng;
@@ -1172,9 +1222,10 @@ void __init net_ns_init(void)
#ifdef CONFIG_KEYS
init_net.key_domain = &init_net_key_domain;
#endif
+ preinit_net(&init_net, &init_user_ns);
+
down_write(&pernet_ops_rwsem);
- preinit_net(&init_net);
- if (setup_net(&init_net, &init_user_ns))
+ if (setup_net(&init_net))
panic("Could not setup the initial network namespace");
init_net_initialized = true;
@@ -1183,17 +1234,24 @@ void __init net_ns_init(void)
if (register_pernet_subsys(&net_ns_ops))
panic("Could not register network namespace subsystems");
- rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL,
- RTNL_FLAG_DOIT_UNLOCKED);
- rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
- RTNL_FLAG_DOIT_UNLOCKED);
+ rtnl_register_many(net_ns_rtnl_msg_handlers);
}
static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list)
{
ops_pre_exit_list(ops, net_exit_list);
synchronize_rcu();
+
+ if (ops->exit_batch_rtnl) {
+ LIST_HEAD(dev_kill_list);
+
+ rtnl_lock();
+ ops->exit_batch_rtnl(net_exit_list, &dev_kill_list);
+ unregister_netdevice_many(&dev_kill_list);
+ rtnl_unlock();
+ }
ops_exit_list(ops, net_exit_list);
+
ops_free_list(ops, net_exit_list);
}
@@ -1206,7 +1264,7 @@ static int __register_pernet_operations(struct list_head *list,
LIST_HEAD(net_exit_list);
list_add_tail(&ops->list, list);
- if (ops->init || (ops->id && ops->size)) {
+ if (ops->init || ops->id) {
/* We held write locked pernet_ops_rwsem, and parallel
* setup_net() and cleanup_net() are not possible.
*/
@@ -1272,13 +1330,20 @@ static int register_pernet_operations(struct list_head *list,
{
int error;
+ if (WARN_ON(!!ops->id ^ !!ops->size))
+ return -EINVAL;
+
if (ops->id) {
error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
GFP_KERNEL);
if (error < 0)
return error;
*ops->id = error;
- max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
+ /* This does not require READ_ONCE as writers already hold
+ * pernet_ops_rwsem. But WRITE_ONCE is needed to protect
+ * net_alloc_generic.
+ */
+ WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1));
}
error = __register_pernet_operations(list, ops);
if (error) {
diff --git a/net/core/gso_test.c b/net/core/net_test.c
index 4c2e77bd12f4..9c3a590865d2 100644
--- a/net/core/gso_test.c
+++ b/net/core/net_test.c
@@ -1,6 +1,9 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <kunit/test.h>
+
+/* GSO */
+
#include <linux/skbuff.h>
static const char hdr[] = "abcdefgh";
@@ -225,7 +228,7 @@ static void gso_test_func(struct kunit *test)
segs = skb_segment(skb, features);
if (IS_ERR(segs)) {
- KUNIT_FAIL(test, "segs error %lld", PTR_ERR(segs));
+ KUNIT_FAIL(test, "segs error %pe", segs);
goto free_gso_skb;
} else if (!segs) {
KUNIT_FAIL(test, "no segments");
@@ -258,17 +261,127 @@ free_gso_skb:
consume_skb(skb);
}
-static struct kunit_case gso_test_cases[] = {
- KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params),
- {}
+/* IP tunnel flags */
+
+#include <net/ip_tunnels.h>
+
+struct ip_tunnel_flags_test {
+ const char *name;
+
+ const u16 *src_bits;
+ const u16 *exp_bits;
+ u8 src_num;
+ u8 exp_num;
+
+ __be16 exp_val;
+ bool exp_comp;
+};
+
+#define IP_TUNNEL_FLAGS_TEST(n, src, comp, eval, exp) { \
+ .name = (n), \
+ .src_bits = (src), \
+ .src_num = ARRAY_SIZE(src), \
+ .exp_comp = (comp), \
+ .exp_val = (eval), \
+ .exp_bits = (exp), \
+ .exp_num = ARRAY_SIZE(exp), \
+}
+
+/* These are __be16-compatible and can be compared as is */
+static const u16 ip_tunnel_flags_1[] = {
+ IP_TUNNEL_KEY_BIT,
+ IP_TUNNEL_STRICT_BIT,
+ IP_TUNNEL_ERSPAN_OPT_BIT,
+};
+
+/* Due to the previous flags design limitation, setting either
+ * ``IP_TUNNEL_CSUM_BIT`` (on Big Endian) or ``IP_TUNNEL_DONT_FRAGMENT_BIT``
+ * (on Little) also sets VTI/ISATAP bit. In the bitmap implementation, they
+ * correspond to ``BIT(16)``, which is bigger than ``U16_MAX``, but still is
+ * backward-compatible.
+ */
+#ifdef __LITTLE_ENDIAN
+#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_DONT_FRAGMENT_BIT
+#else
+#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_CSUM_BIT
+#endif
+
+static const u16 ip_tunnel_flags_2_src[] = {
+ IP_TUNNEL_CONFLICT_BIT,
+};
+
+static const u16 ip_tunnel_flags_2_exp[] = {
+ IP_TUNNEL_CONFLICT_BIT,
+ IP_TUNNEL_SIT_ISATAP_BIT,
};
-static struct kunit_suite gso_test_suite = {
- .name = "net_core_gso",
- .test_cases = gso_test_cases,
+/* Bits 17 and higher are not compatible with __be16 flags */
+static const u16 ip_tunnel_flags_3_src[] = {
+ IP_TUNNEL_VXLAN_OPT_BIT,
+ 17,
+ 18,
+ 20,
};
-kunit_test_suite(gso_test_suite);
+static const u16 ip_tunnel_flags_3_exp[] = {
+ IP_TUNNEL_VXLAN_OPT_BIT,
+};
+
+static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = {
+ IP_TUNNEL_FLAGS_TEST("compat", ip_tunnel_flags_1, true,
+ cpu_to_be16(BIT(IP_TUNNEL_KEY_BIT) |
+ BIT(IP_TUNNEL_STRICT_BIT) |
+ BIT(IP_TUNNEL_ERSPAN_OPT_BIT)),
+ ip_tunnel_flags_1),
+ IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true,
+ VTI_ISVTI, ip_tunnel_flags_2_exp),
+ IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false,
+ cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)),
+ ip_tunnel_flags_3_exp),
+};
+
+static void
+ip_tunnel_flags_test_case_to_desc(const struct ip_tunnel_flags_test *t,
+ char *desc)
+{
+ strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE);
+}
+KUNIT_ARRAY_PARAM(ip_tunnel_flags_test, ip_tunnel_flags_test,
+ ip_tunnel_flags_test_case_to_desc);
+
+static void ip_tunnel_flags_test_run(struct kunit *test)
+{
+ const struct ip_tunnel_flags_test *t = test->param_value;
+ IP_TUNNEL_DECLARE_FLAGS(src) = { };
+ IP_TUNNEL_DECLARE_FLAGS(exp) = { };
+ IP_TUNNEL_DECLARE_FLAGS(out);
+
+ for (u32 j = 0; j < t->src_num; j++)
+ __set_bit(t->src_bits[j], src);
+ for (u32 j = 0; j < t->exp_num; j++)
+ __set_bit(t->exp_bits[j], exp);
+
+ KUNIT_ASSERT_EQ(test, t->exp_comp,
+ ip_tunnel_flags_is_be16_compat(src));
+ KUNIT_ASSERT_EQ(test, (__force u16)t->exp_val,
+ (__force u16)ip_tunnel_flags_to_be16(src));
+
+ ip_tunnel_flags_from_be16(out, t->exp_val);
+ KUNIT_ASSERT_TRUE(test, __ipt_flag_op(bitmap_equal, exp, out));
+}
+
+static struct kunit_case net_test_cases[] = {
+ KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params),
+ KUNIT_CASE_PARAM(ip_tunnel_flags_test_run,
+ ip_tunnel_flags_test_gen_params),
+ { },
+};
+
+static struct kunit_suite net_test_suite = {
+ .name = "net_core",
+ .test_cases = net_test_cases,
+};
+kunit_test_suite(net_test_suite);
+MODULE_DESCRIPTION("KUnit tests for networking core");
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("KUnit tests for segmentation offload");
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index be7f2ebd61b2..996ac6a449eb 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -9,16 +9,21 @@
#include "netdev-genl-gen.h"
#include <uapi/linux/netdev.h>
+#include <linux/list.h>
/* Integer value ranges */
static const struct netlink_range_validation netdev_a_page_pool_id_range = {
.min = 1ULL,
- .max = 4294967295ULL,
+ .max = U32_MAX,
};
static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = {
.min = 1ULL,
- .max = 2147483647ULL,
+ .max = S32_MAX,
+};
+
+static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = {
+ .max = S32_MAX,
};
/* Common nested types */
@@ -27,6 +32,11 @@ const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFIND
[NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range),
};
+const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = {
+ [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, },
+ [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
+};
+
/* NETDEV_CMD_DEV_GET - do */
static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = {
[NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
@@ -68,6 +78,27 @@ static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFIN
[NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
};
+/* NETDEV_CMD_QSTATS_GET - dump */
+static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = {
+ [NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1),
+};
+
+/* NETDEV_CMD_BIND_RX - do */
+static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] = {
+ [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
+ [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
+};
+
+/* NETDEV_CMD_NAPI_SET - do */
+static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = {
+ [NETDEV_A_NAPI_ID] = { .type = NLA_U32, },
+ [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
+ [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
+ [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
+};
+
/* Ops table for netdev */
static const struct genl_split_ops netdev_nl_ops[] = {
{
@@ -138,6 +169,27 @@ static const struct genl_split_ops netdev_nl_ops[] = {
.maxattr = NETDEV_A_NAPI_IFINDEX,
.flags = GENL_CMD_CAP_DUMP,
},
+ {
+ .cmd = NETDEV_CMD_QSTATS_GET,
+ .dumpit = netdev_nl_qstats_get_dumpit,
+ .policy = netdev_qstats_get_nl_policy,
+ .maxattr = NETDEV_A_QSTATS_SCOPE,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = NETDEV_CMD_BIND_RX,
+ .doit = netdev_nl_bind_rx_doit,
+ .policy = netdev_bind_rx_nl_policy,
+ .maxattr = NETDEV_A_DMABUF_FD,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_NAPI_SET,
+ .doit = netdev_nl_napi_set_doit,
+ .policy = netdev_napi_set_nl_policy,
+ .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
};
static const struct genl_multicast_group netdev_nl_mcgrps[] = {
@@ -145,6 +197,16 @@ static const struct genl_multicast_group netdev_nl_mcgrps[] = {
[NETDEV_NLGRP_PAGE_POOL] = { "page-pool", },
};
+static void __netdev_nl_sock_priv_init(void *priv)
+{
+ netdev_nl_sock_priv_init(priv);
+}
+
+static void __netdev_nl_sock_priv_destroy(void *priv)
+{
+ netdev_nl_sock_priv_destroy(priv);
+}
+
struct genl_family netdev_nl_family __ro_after_init = {
.name = NETDEV_FAMILY_NAME,
.version = NETDEV_FAMILY_VERSION,
@@ -155,4 +217,7 @@ struct genl_family netdev_nl_family __ro_after_init = {
.n_split_ops = ARRAY_SIZE(netdev_nl_ops),
.mcgrps = netdev_nl_mcgrps,
.n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps),
+ .sock_priv_size = sizeof(struct list_head),
+ .sock_priv_init = __netdev_nl_sock_priv_init,
+ .sock_priv_destroy = __netdev_nl_sock_priv_destroy,
};
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index a47f2bcbe4fa..e09dd7539ff2 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -10,9 +10,11 @@
#include <net/genetlink.h>
#include <uapi/linux/netdev.h>
+#include <linux/list.h>
/* Common nested types */
extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1];
+extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1];
int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
@@ -28,6 +30,10 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb);
int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
enum {
NETDEV_NLGRP_MGMT,
@@ -36,4 +42,7 @@ enum {
extern struct genl_family netdev_nl_family;
+void netdev_nl_sock_priv_init(struct list_head *priv);
+void netdev_nl_sock_priv_destroy(struct list_head *priv);
+
#endif /* _LINUX_NETDEV_GEN_H */
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index fd98936da3ae..715f85c6b62e 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -3,15 +3,17 @@
#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/rtnetlink.h>
+#include <net/busy_poll.h>
#include <net/net_namespace.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
#include <net/sock.h>
#include <net/xdp.h>
#include <net/xdp_sock.h>
-#include <net/netdev_rx_queue.h>
-#include <net/busy_poll.h>
-#include "netdev-genl-gen.h"
#include "dev.h"
+#include "devmem.h"
+#include "netdev-genl-gen.h"
struct netdev_nl_dump_ctx {
unsigned long ifindex;
@@ -22,7 +24,7 @@ struct netdev_nl_dump_ctx {
static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb)
{
- NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx);
+ NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx);
return (struct netdev_nl_dump_ctx *)cb->ctx;
}
@@ -58,22 +60,22 @@ XDP_METADATA_KFUNC_xxx
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
xdp_rx_meta, NETDEV_A_DEV_PAD) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES,
- xsk_features, NETDEV_A_DEV_PAD)) {
- genlmsg_cancel(rsp, hdr);
- return -EINVAL;
- }
+ xsk_features, NETDEV_A_DEV_PAD))
+ goto err_cancel_msg;
if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) {
if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
- netdev->xdp_zc_max_segs)) {
- genlmsg_cancel(rsp, hdr);
- return -EINVAL;
- }
+ netdev->xdp_zc_max_segs))
+ goto err_cancel_msg;
}
genlmsg_end(rsp, hdr);
return 0;
+
+err_cancel_msg:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
}
static void
@@ -152,30 +154,27 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
}
rtnl_unlock();
- if (err != -EMSGSIZE)
- return err;
-
- return skb->len;
+ return err;
}
static int
netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
const struct genl_info *info)
{
+ unsigned long irq_suspend_timeout;
+ unsigned long gro_flush_timeout;
+ u32 napi_defer_hard_irqs;
void *hdr;
pid_t pid;
- if (WARN_ON_ONCE(!napi->dev))
- return -EINVAL;
- if (!(napi->dev->flags & IFF_UP))
+ if (!napi->dev->up)
return 0;
hdr = genlmsg_iput(rsp, info);
if (!hdr)
return -EMSGSIZE;
- if (napi->napi_id >= MIN_NAPI_ID &&
- nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id))
+ if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id))
goto nla_put_failure;
if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex))
@@ -190,6 +189,21 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
goto nla_put_failure;
}
+ napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi);
+ if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS,
+ napi_defer_hard_irqs))
+ goto nla_put_failure;
+
+ irq_suspend_timeout = napi_get_irq_suspend_timeout(napi);
+ if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+ irq_suspend_timeout))
+ goto nla_put_failure;
+
+ gro_flush_timeout = napi_get_gro_flush_timeout(napi);
+ if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+ gro_flush_timeout))
+ goto nla_put_failure;
+
genlmsg_end(rsp, hdr);
return 0;
@@ -215,18 +229,21 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info)
if (!rsp)
return -ENOMEM;
- rtnl_lock();
-
- napi = napi_by_id(napi_id);
- if (napi)
+ napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id);
+ if (napi) {
err = netdev_nl_napi_fill_one(rsp, napi, info);
- else
- err = -EINVAL;
-
- rtnl_unlock();
+ netdev_unlock(napi->dev);
+ } else {
+ NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
+ err = -ENOENT;
+ }
- if (err)
+ if (err) {
goto err_free_msg;
+ } else if (!rsp->len) {
+ err = -ENOENT;
+ goto err_free_msg;
+ }
return genlmsg_reply(rsp, info);
@@ -241,12 +258,21 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp,
struct netdev_nl_dump_ctx *ctx)
{
struct napi_struct *napi;
+ unsigned int prev_id;
int err = 0;
- if (!(netdev->flags & IFF_UP))
+ if (!netdev->up)
return err;
+ prev_id = UINT_MAX;
list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+ if (napi->napi_id < MIN_NAPI_ID)
+ continue;
+
+ /* Dump continuation below depends on the list being sorted */
+ WARN_ON_ONCE(napi->napi_id >= prev_id);
+ prev_id = napi->napi_id;
+
if (ctx->napi_id && napi->napi_id >= ctx->napi_id)
continue;
@@ -270,33 +296,79 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
if (info->attrs[NETDEV_A_NAPI_IFINDEX])
ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]);
- rtnl_lock();
if (ifindex) {
- netdev = __dev_get_by_index(net, ifindex);
- if (netdev)
+ netdev = netdev_get_by_index_lock(net, ifindex);
+ if (netdev) {
err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
- else
+ netdev_unlock(netdev);
+ } else {
err = -ENODEV;
+ }
} else {
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
if (err < 0)
break;
ctx->napi_id = 0;
}
}
- rtnl_unlock();
- if (err != -EMSGSIZE)
- return err;
+ return err;
+}
+
+static int
+netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
+{
+ u64 irq_suspend_timeout = 0;
+ u64 gro_flush_timeout = 0;
+ u32 defer = 0;
+
+ if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) {
+ defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]);
+ napi_set_defer_hard_irqs(napi, defer);
+ }
+
+ if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) {
+ irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]);
+ napi_set_irq_suspend_timeout(napi, irq_suspend_timeout);
+ }
+
+ if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) {
+ gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]);
+ napi_set_gro_flush_timeout(napi, gro_flush_timeout);
+ }
+
+ return 0;
+}
+
+int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct napi_struct *napi;
+ unsigned int napi_id;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
+ return -EINVAL;
+
+ napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
+
+ napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id);
+ if (napi) {
+ err = netdev_nl_napi_set_config(napi, info);
+ netdev_unlock(napi->dev);
+ } else {
+ NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
+ err = -ENOENT;
+ }
- return skb->len;
+ return err;
}
static int
netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
u32 q_idx, u32 q_type, const struct genl_info *info)
{
+ struct net_devmem_dmabuf_binding *binding;
struct netdev_rx_queue *rxq;
struct netdev_queue *txq;
void *hdr;
@@ -316,6 +388,12 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID,
rxq->napi->napi_id))
goto nla_put_failure;
+
+ binding = rxq->mp_params.mp_priv;
+ if (binding &&
+ nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id))
+ goto nla_put_failure;
+
break;
case NETDEV_QUEUE_TYPE_TX:
txq = netdev_get_tx_queue(netdev, q_idx);
@@ -352,10 +430,10 @@ static int
netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx,
u32 q_type, const struct genl_info *info)
{
- int err = 0;
+ int err;
- if (!(netdev->flags & IFF_UP))
- return err;
+ if (!netdev->up)
+ return -ENOENT;
err = netdev_nl_queue_validate(netdev, q_idx, q_type);
if (err)
@@ -386,11 +464,13 @@ int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info)
rtnl_lock();
- netdev = __dev_get_by_index(genl_info_net(info), ifindex);
- if (netdev)
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (netdev) {
err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info);
- else
+ netdev_unlock(netdev);
+ } else {
err = -ENODEV;
+ }
rtnl_unlock();
@@ -410,24 +490,21 @@ netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp,
struct netdev_nl_dump_ctx *ctx)
{
int err = 0;
- int i;
- if (!(netdev->flags & IFF_UP))
+ if (!netdev->up)
return err;
- for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) {
- err = netdev_nl_queue_fill_one(rsp, netdev, i,
+ for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) {
+ err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx,
NETDEV_QUEUE_TYPE_RX, info);
if (err)
return err;
- ctx->rxq_idx = i++;
}
- for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) {
- err = netdev_nl_queue_fill_one(rsp, netdev, i,
+ for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) {
+ err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx,
NETDEV_QUEUE_TYPE_TX, info);
if (err)
return err;
- ctx->txq_idx = i++;
}
return err;
@@ -447,13 +524,15 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
rtnl_lock();
if (ifindex) {
- netdev = __dev_get_by_index(net, ifindex);
- if (netdev)
+ netdev = netdev_get_by_index_lock(net, ifindex);
+ if (netdev) {
err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
- else
+ netdev_unlock(netdev);
+ } else {
err = -ENODEV;
+ }
} else {
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
if (err < 0)
break;
@@ -463,10 +542,390 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
}
rtnl_unlock();
- if (err != -EMSGSIZE)
- return err;
+ return err;
+}
- return skb->len;
+#define NETDEV_STAT_NOT_SET (~0ULL)
+
+static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size)
+{
+ const u64 *add = _add;
+ u64 *sum = _sum;
+
+ while (size) {
+ if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET)
+ *sum += *add;
+ sum++;
+ add++;
+ size -= 8;
+ }
+}
+
+static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value)
+{
+ if (value == NETDEV_STAT_NOT_SET)
+ return 0;
+ return nla_put_uint(rsp, attr_id, value);
+}
+
+static int
+netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx)
+{
+ if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int
+netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx)
+{
+ if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int
+netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp,
+ u32 q_type, int i, const struct genl_info *info)
+{
+ const struct netdev_stat_ops *ops = netdev->stat_ops;
+ struct netdev_queue_stats_rx rx;
+ struct netdev_queue_stats_tx tx;
+ void *hdr;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+ if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) ||
+ nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) ||
+ nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i))
+ goto nla_put_failure;
+
+ switch (q_type) {
+ case NETDEV_QUEUE_TYPE_RX:
+ memset(&rx, 0xff, sizeof(rx));
+ ops->get_queue_stats_rx(netdev, i, &rx);
+ if (!memchr_inv(&rx, 0xff, sizeof(rx)))
+ goto nla_cancel;
+ if (netdev_nl_stats_write_rx(rsp, &rx))
+ goto nla_put_failure;
+ break;
+ case NETDEV_QUEUE_TYPE_TX:
+ memset(&tx, 0xff, sizeof(tx));
+ ops->get_queue_stats_tx(netdev, i, &tx);
+ if (!memchr_inv(&tx, 0xff, sizeof(tx)))
+ goto nla_cancel;
+ if (netdev_nl_stats_write_tx(rsp, &tx))
+ goto nla_put_failure;
+ break;
+ }
+
+ genlmsg_end(rsp, hdr);
+ return 0;
+
+nla_cancel:
+ genlmsg_cancel(rsp, hdr);
+ return 0;
+nla_put_failure:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info,
+ struct netdev_nl_dump_ctx *ctx)
+{
+ const struct netdev_stat_ops *ops = netdev->stat_ops;
+ int i, err;
+
+ if (!(netdev->flags & IFF_UP))
+ return 0;
+
+ i = ctx->rxq_idx;
+ while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) {
+ err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX,
+ i, info);
+ if (err)
+ return err;
+ ctx->rxq_idx = ++i;
+ }
+ i = ctx->txq_idx;
+ while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) {
+ err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX,
+ i, info);
+ if (err)
+ return err;
+ ctx->txq_idx = ++i;
+ }
+
+ ctx->rxq_idx = 0;
+ ctx->txq_idx = 0;
+ return 0;
+}
+
+static int
+netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info)
+{
+ struct netdev_queue_stats_rx rx_sum, rx;
+ struct netdev_queue_stats_tx tx_sum, tx;
+ const struct netdev_stat_ops *ops;
+ void *hdr;
+ int i;
+
+ ops = netdev->stat_ops;
+ /* Netdev can't guarantee any complete counters */
+ if (!ops->get_base_stats)
+ return 0;
+
+ memset(&rx_sum, 0xff, sizeof(rx_sum));
+ memset(&tx_sum, 0xff, sizeof(tx_sum));
+
+ ops->get_base_stats(netdev, &rx_sum, &tx_sum);
+
+ /* The op was there, but nothing reported, don't bother */
+ if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) &&
+ !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum)))
+ return 0;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+ if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex))
+ goto nla_put_failure;
+
+ for (i = 0; i < netdev->real_num_rx_queues; i++) {
+ memset(&rx, 0xff, sizeof(rx));
+ if (ops->get_queue_stats_rx)
+ ops->get_queue_stats_rx(netdev, i, &rx);
+ netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx));
+ }
+ for (i = 0; i < netdev->real_num_tx_queues; i++) {
+ memset(&tx, 0xff, sizeof(tx));
+ if (ops->get_queue_stats_tx)
+ ops->get_queue_stats_tx(netdev, i, &tx);
+ netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx));
+ }
+
+ if (netdev_nl_stats_write_rx(rsp, &rx_sum) ||
+ netdev_nl_stats_write_tx(rsp, &tx_sum))
+ goto nla_put_failure;
+
+ genlmsg_end(rsp, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope,
+ struct sk_buff *skb, const struct genl_info *info,
+ struct netdev_nl_dump_ctx *ctx)
+{
+ if (!netdev->stat_ops)
+ return 0;
+
+ switch (scope) {
+ case 0:
+ return netdev_nl_stats_by_netdev(netdev, skb, info);
+ case NETDEV_QSTATS_SCOPE_QUEUE:
+ return netdev_nl_stats_by_queue(netdev, skb, info, ctx);
+ }
+
+ return -EINVAL; /* Should not happen, per netlink policy */
+}
+
+int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
+ const struct genl_info *info = genl_info_dump(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ unsigned int ifindex;
+ unsigned int scope;
+ int err = 0;
+
+ scope = 0;
+ if (info->attrs[NETDEV_A_QSTATS_SCOPE])
+ scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]);
+
+ ifindex = 0;
+ if (info->attrs[NETDEV_A_QSTATS_IFINDEX])
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]);
+
+ rtnl_lock();
+ if (ifindex) {
+ netdev = __dev_get_by_index(net, ifindex);
+ if (netdev && netdev->stat_ops) {
+ err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
+ info, ctx);
+ } else {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[NETDEV_A_QSTATS_IFINDEX]);
+ err = netdev ? -EOPNOTSUPP : -ENODEV;
+ }
+ } else {
+ for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
+ info, ctx);
+ if (err < 0)
+ break;
+ }
+ }
+ rtnl_unlock();
+
+ return err;
+}
+
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
+ struct net_devmem_dmabuf_binding *binding;
+ struct list_head *sock_binding_list;
+ u32 ifindex, dmabuf_fd, rxq_idx;
+ struct net_device *netdev;
+ struct sk_buff *rsp;
+ struct nlattr *attr;
+ int rem, err = 0;
+ void *hdr;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES))
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+ dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+
+ sock_binding_list = genl_sk_priv_get(&netdev_nl_family,
+ NETLINK_CB(skb).sk);
+ if (IS_ERR(sock_binding_list))
+ return PTR_ERR(sock_binding_list);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_genlmsg_free;
+ }
+
+ rtnl_lock();
+
+ netdev = __dev_get_by_index(genl_info_net(info), ifindex);
+ if (!netdev || !netif_device_present(netdev)) {
+ err = -ENODEV;
+ goto err_unlock;
+ }
+
+ if (dev_xdp_prog_count(netdev)) {
+ NL_SET_ERR_MSG(info->extack, "unable to bind dmabuf to device with XDP program attached");
+ err = -EEXIST;
+ goto err_unlock;
+ }
+
+ binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ goto err_unlock;
+ }
+
+ nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
+ genlmsg_data(info->genlhdr),
+ genlmsg_len(info->genlhdr), rem) {
+ err = nla_parse_nested(
+ tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr,
+ netdev_queue_id_nl_policy, info->extack);
+ if (err < 0)
+ goto err_unbind;
+
+ if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
+ NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) {
+ err = -EINVAL;
+ goto err_unbind;
+ }
+
+ if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
+ NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
+ err = -EINVAL;
+ goto err_unbind;
+ }
+
+ rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
+
+ err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding,
+ info->extack);
+ if (err)
+ goto err_unbind;
+ }
+
+ list_add(&binding->list, sock_binding_list);
+
+ nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
+ genlmsg_end(rsp, hdr);
+
+ err = genlmsg_reply(rsp, info);
+ if (err)
+ goto err_unbind;
+
+ rtnl_unlock();
+
+ return 0;
+
+err_unbind:
+ net_devmem_unbind_dmabuf(binding);
+err_unlock:
+ rtnl_unlock();
+err_genlmsg_free:
+ nlmsg_free(rsp);
+ return err;
+}
+
+void netdev_nl_sock_priv_init(struct list_head *priv)
+{
+ INIT_LIST_HEAD(priv);
+}
+
+void netdev_nl_sock_priv_destroy(struct list_head *priv)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct net_devmem_dmabuf_binding *temp;
+
+ list_for_each_entry_safe(binding, temp, priv, list) {
+ rtnl_lock();
+ net_devmem_unbind_dmabuf(binding);
+ rtnl_unlock();
+ }
}
static int netdev_genl_netdevice_event(struct notifier_block *nb,
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
new file mode 100644
index 000000000000..db82786fa0c4
--- /dev/null
+++ b/net/core/netdev_rx_queue.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/netdevice.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+
+#include "page_pool_priv.h"
+
+int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
+{
+ struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx);
+ void *new_mem, *old_mem;
+ int err;
+
+ if (!dev->queue_mgmt_ops || !dev->queue_mgmt_ops->ndo_queue_stop ||
+ !dev->queue_mgmt_ops->ndo_queue_mem_free ||
+ !dev->queue_mgmt_ops->ndo_queue_mem_alloc ||
+ !dev->queue_mgmt_ops->ndo_queue_start)
+ return -EOPNOTSUPP;
+
+ ASSERT_RTNL();
+
+ new_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL);
+ if (!new_mem)
+ return -ENOMEM;
+
+ old_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL);
+ if (!old_mem) {
+ err = -ENOMEM;
+ goto err_free_new_mem;
+ }
+
+ err = dev->queue_mgmt_ops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx);
+ if (err)
+ goto err_free_old_mem;
+
+ err = page_pool_check_memory_provider(dev, rxq);
+ if (err)
+ goto err_free_new_queue_mem;
+
+ err = dev->queue_mgmt_ops->ndo_queue_stop(dev, old_mem, rxq_idx);
+ if (err)
+ goto err_free_new_queue_mem;
+
+ err = dev->queue_mgmt_ops->ndo_queue_start(dev, new_mem, rxq_idx);
+ if (err)
+ goto err_start_queue;
+
+ dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem);
+
+ kvfree(old_mem);
+ kvfree(new_mem);
+
+ return 0;
+
+err_start_queue:
+ /* Restarting the queue with old_mem should be successful as we haven't
+ * changed any of the queue configuration, and there is not much we can
+ * do to recover from a failure here.
+ *
+ * WARN if we fail to recover the old rx queue, and at least free
+ * old_mem so we don't also leak that.
+ */
+ if (dev->queue_mgmt_ops->ndo_queue_start(dev, old_mem, rxq_idx)) {
+ WARN(1,
+ "Failed to restart old queue in error path. RX queue %d may be unhealthy.",
+ rxq_idx);
+ dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem);
+ }
+
+err_free_new_queue_mem:
+ dev->queue_mgmt_ops->ndo_queue_mem_free(dev, new_mem);
+
+err_free_old_mem:
+ kvfree(old_mem);
+
+err_free_new_mem:
+ kvfree(new_mem);
+
+ return err;
+}
+EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
new file mode 100644
index 000000000000..7eadb8393e00
--- /dev/null
+++ b/net/core/netmem_priv.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __NETMEM_PRIV_H
+#define __NETMEM_PRIV_H
+
+static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
+{
+ return __netmem_clear_lsb(netmem)->pp_magic;
+}
+
+static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
+{
+ __netmem_clear_lsb(netmem)->pp_magic |= pp_magic;
+}
+
+static inline void netmem_clear_pp_magic(netmem_ref netmem)
+{
+ __netmem_clear_lsb(netmem)->pp_magic = 0;
+}
+
+static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
+{
+ __netmem_clear_lsb(netmem)->pp = pool;
+}
+
+static inline void netmem_set_dma_addr(netmem_ref netmem,
+ unsigned long dma_addr)
+{
+ __netmem_clear_lsb(netmem)->dma_addr = dma_addr;
+}
+#endif
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 543007f159f9..0ab722d95a2d 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -34,7 +34,7 @@
#include <net/addrconf.h>
#include <net/ndisc.h>
#include <net/ip6_checksum.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <trace/events/napi.h>
#include <linux/kconfig.h>
@@ -45,11 +45,6 @@
#define MAX_UDP_CHUNK 1460
#define MAX_SKBS 32
-
-static struct sk_buff_head skb_pool;
-
-DEFINE_STATIC_SRCU(netpoll_srcu);
-
#define USEC_PER_POLL 50
#define MAX_SKB_SIZE \
@@ -162,7 +157,7 @@ static void poll_one_napi(struct napi_struct *napi)
if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state))
return;
- /* We explicilty pass the polling call a budget of 0 to
+ /* We explicitly pass the polling call a budget of 0 to
* indicate that we are clearing the Tx path only.
*/
work = napi->poll(napi, 0);
@@ -220,41 +215,39 @@ EXPORT_SYMBOL(netpoll_poll_dev);
void netpoll_poll_disable(struct net_device *dev)
{
struct netpoll_info *ni;
- int idx;
+
might_sleep();
- idx = srcu_read_lock(&netpoll_srcu);
- ni = srcu_dereference(dev->npinfo, &netpoll_srcu);
+ ni = rtnl_dereference(dev->npinfo);
if (ni)
down(&ni->dev_lock);
- srcu_read_unlock(&netpoll_srcu, idx);
}
-EXPORT_SYMBOL(netpoll_poll_disable);
void netpoll_poll_enable(struct net_device *dev)
{
struct netpoll_info *ni;
- rcu_read_lock();
- ni = rcu_dereference(dev->npinfo);
+
+ ni = rtnl_dereference(dev->npinfo);
if (ni)
up(&ni->dev_lock);
- rcu_read_unlock();
}
-EXPORT_SYMBOL(netpoll_poll_enable);
-static void refill_skbs(void)
+static void refill_skbs(struct netpoll *np)
{
+ struct sk_buff_head *skb_pool;
struct sk_buff *skb;
unsigned long flags;
- spin_lock_irqsave(&skb_pool.lock, flags);
- while (skb_pool.qlen < MAX_SKBS) {
+ skb_pool = &np->skb_pool;
+
+ spin_lock_irqsave(&skb_pool->lock, flags);
+ while (skb_pool->qlen < MAX_SKBS) {
skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
if (!skb)
break;
- __skb_queue_tail(&skb_pool, skb);
+ __skb_queue_tail(skb_pool, skb);
}
- spin_unlock_irqrestore(&skb_pool.lock, flags);
+ spin_unlock_irqrestore(&skb_pool->lock, flags);
}
static void zap_completion_queue(void)
@@ -291,12 +284,12 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
struct sk_buff *skb;
zap_completion_queue();
- refill_skbs();
+ refill_skbs(np);
repeat:
skb = alloc_skb(len, GFP_ATOMIC);
if (!skb)
- skb = skb_dequeue(&skb_pool);
+ skb = skb_dequeue(&np->skb_pool);
if (!skb) {
if (++count < 10) {
@@ -316,7 +309,7 @@ static int netpoll_owner_active(struct net_device *dev)
struct napi_struct *napi;
list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
- if (napi->poll_owner == smp_processor_id())
+ if (READ_ONCE(napi->poll_owner) == smp_processor_id())
return 1;
}
return 0;
@@ -326,6 +319,7 @@ static int netpoll_owner_active(struct net_device *dev)
static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
{
netdev_tx_t status = NETDEV_TX_BUSY;
+ netdev_tx_t ret = NET_XMIT_DROP;
struct net_device *dev;
unsigned long tries;
/* It is up to the caller to keep npinfo alive. */
@@ -334,11 +328,12 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
lockdep_assert_irqs_disabled();
dev = np->dev;
+ rcu_read_lock();
npinfo = rcu_dereference_bh(dev->npinfo);
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
dev_kfree_skb_irq(skb);
- return NET_XMIT_DROP;
+ goto out;
}
/* don't get messages out of order, and no recursion */
@@ -377,7 +372,10 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
skb_queue_tail(&npinfo->txq, skb);
schedule_delayed_work(&npinfo->tx_work,0);
}
- return NETDEV_TX_OK;
+ ret = NETDEV_TX_OK;
+out:
+ rcu_read_unlock();
+ return ret;
}
netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
@@ -397,7 +395,7 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
}
EXPORT_SYMBOL(netpoll_send_skb);
-void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
+int netpoll_send_udp(struct netpoll *np, const char *msg, int len)
{
int total_len, ip_len, udp_len;
struct sk_buff *skb;
@@ -421,7 +419,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
skb = find_skb(np, total_len + np->dev->needed_tailroom,
total_len - len);
if (!skb)
- return;
+ return -ENOMEM;
skb_copy_to_linear_data(skb, msg, len);
skb_put(skb, len);
@@ -497,7 +495,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
skb->dev = np->dev;
- netpoll_send_skb(np, skb);
+ return (int)netpoll_send_skb(np, skb);
}
EXPORT_SYMBOL(netpoll_send_udp);
@@ -538,6 +536,14 @@ static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr)
return -1;
}
+static void skb_pool_flush(struct netpoll *np)
+{
+ struct sk_buff_head *skb_pool;
+
+ skb_pool = &np->skb_pool;
+ skb_queue_purge_reason(skb_pool, SKB_CONSUMED);
+}
+
int netpoll_parse_options(struct netpoll *np, char *opt)
{
char *cur=opt, *delim;
@@ -626,17 +632,17 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
const struct net_device_ops *ops;
int err;
- np->dev = ndev;
- strscpy(np->dev_name, ndev->name, IFNAMSIZ);
+ skb_queue_head_init(&np->skb_pool);
if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
np_err(np, "%s doesn't support polling, aborting\n",
- np->dev_name);
+ ndev->name);
err = -ENOTSUPP;
goto out;
}
- if (!ndev->npinfo) {
+ npinfo = rtnl_dereference(ndev->npinfo);
+ if (!npinfo) {
npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
if (!npinfo) {
err = -ENOMEM;
@@ -649,19 +655,23 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
refcount_set(&npinfo->refcnt, 1);
- ops = np->dev->netdev_ops;
+ ops = ndev->netdev_ops;
if (ops->ndo_netpoll_setup) {
- err = ops->ndo_netpoll_setup(ndev, npinfo);
+ err = ops->ndo_netpoll_setup(ndev);
if (err)
goto free_npinfo;
}
} else {
- npinfo = rtnl_dereference(ndev->npinfo);
refcount_inc(&npinfo->refcnt);
}
+ np->dev = ndev;
+ strscpy(np->dev_name, ndev->name, IFNAMSIZ);
npinfo->netpoll = np;
+ /* fill up the skb queue */
+ refill_skbs(np);
+
/* last thing to do is link it to the net device structure */
rcu_assign_pointer(ndev->npinfo, npinfo);
@@ -677,6 +687,7 @@ EXPORT_SYMBOL_GPL(__netpoll_setup);
int netpoll_setup(struct netpoll *np)
{
struct net_device *ndev = NULL;
+ bool ip_overwritten = false;
struct in_device *in_dev;
int err;
@@ -741,6 +752,7 @@ put_noaddr:
}
np->local_ip.ip = ifa->ifa_local;
+ ip_overwritten = true;
np_info(np, "local IP %pI4\n", &np->local_ip.ip);
} else {
#if IS_ENABLED(CONFIG_IPV6)
@@ -757,6 +769,7 @@ put_noaddr:
!!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL))
continue;
np->local_ip.in6 = ifp->addr;
+ ip_overwritten = true;
err = 0;
break;
}
@@ -777,16 +790,18 @@ put_noaddr:
}
}
- /* fill up the skb queue */
- refill_skbs();
-
err = __netpoll_setup(np, ndev);
if (err)
- goto put;
+ goto flush;
rtnl_unlock();
return 0;
+flush:
+ skb_pool_flush(np);
put:
+ DEBUG_NET_WARN_ON_ONCE(np->dev);
+ if (ip_overwritten)
+ memset(&np->local_ip, 0, sizeof(np->local_ip));
netdev_put(ndev, &np->dev_tracker);
unlock:
rtnl_unlock();
@@ -794,13 +809,6 @@ unlock:
}
EXPORT_SYMBOL(netpoll_setup);
-static int __init netpoll_init(void)
-{
- skb_queue_head_init(&skb_pool);
- return 0;
-}
-core_initcall(netpoll_init);
-
static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
{
struct netpoll_info *npinfo =
@@ -826,8 +834,6 @@ void __netpoll_cleanup(struct netpoll *np)
if (!npinfo)
return;
- synchronize_srcu(&netpoll_srcu);
-
if (refcount_dec_and_test(&npinfo->refcnt)) {
const struct net_device_ops *ops;
@@ -839,6 +845,8 @@ void __netpoll_cleanup(struct netpoll *np)
call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info);
} else
RCU_INIT_POINTER(np->dev->npinfo, NULL);
+
+ skb_pool_flush(np);
}
EXPORT_SYMBOL_GPL(__netpoll_cleanup);
@@ -853,14 +861,20 @@ void __netpoll_free(struct netpoll *np)
}
EXPORT_SYMBOL_GPL(__netpoll_free);
+void do_netpoll_cleanup(struct netpoll *np)
+{
+ __netpoll_cleanup(np);
+ netdev_put(np->dev, &np->dev_tracker);
+ np->dev = NULL;
+}
+EXPORT_SYMBOL(do_netpoll_cleanup);
+
void netpoll_cleanup(struct netpoll *np)
{
rtnl_lock();
if (!np->dev)
goto out;
- __netpoll_cleanup(np);
- netdev_put(np->dev, &np->dev_tracker);
- np->dev = NULL;
+ do_netpoll_cleanup(np);
out:
rtnl_unlock();
}
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 4933762e5a6b..f5e908c9e7ad 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -5,11 +5,13 @@
* Copyright (C) 2016 Red Hat, Inc.
*/
+#include <linux/error-injection.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>
+#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
#include <net/xdp.h>
@@ -23,14 +25,20 @@
#include <trace/events/page_pool.h>
+#include "mp_dmabuf_devmem.h"
+#include "netmem_priv.h"
#include "page_pool_priv.h"
+DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
+
#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)
#define BIAS_MAX (LONG_MAX >> 1)
#ifdef CONFIG_PAGE_POOL_STATS
+static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
+
/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
@@ -121,9 +129,9 @@ int page_pool_ethtool_stats_get_count(void)
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
-u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
{
- struct page_pool_stats *pool_stats = stats;
+ const struct page_pool_stats *pool_stats = stats;
*data++ = pool_stats->alloc_stats.fast;
*data++ = pool_stats->alloc_stats.slow;
@@ -170,16 +178,33 @@ static void page_pool_producer_unlock(struct page_pool *pool,
spin_unlock_bh(&pool->ring.producer_lock);
}
+static void page_pool_struct_check(void)
+{
+ CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
+ CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
+ PAGE_POOL_FRAG_GROUP_ALIGN);
+}
+
static int page_pool_init(struct page_pool *pool,
- const struct page_pool_params *params)
+ const struct page_pool_params *params,
+ int cpuid)
{
unsigned int ring_qsize = 1024; /* Default */
+ struct netdev_rx_queue *rxq;
+ int err;
+
+ page_pool_struct_check();
memcpy(&pool->p, &params->fast, sizeof(pool->p));
memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
+ pool->cpuid = cpuid;
+ pool->dma_sync_for_cpu = true;
+
/* Validate only known flags were used */
- if (pool->p.flags & ~(PP_FLAG_ALL))
+ if (pool->slow.flags & ~PP_FLAG_ALL)
return -EINVAL;
if (pool->p.pool_size)
@@ -193,22 +218,26 @@ static int page_pool_init(struct page_pool *pool,
* DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
* which is the XDP_TX use-case.
*/
- if (pool->p.flags & PP_FLAG_DMA_MAP) {
+ if (pool->slow.flags & PP_FLAG_DMA_MAP) {
if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
(pool->p.dma_dir != DMA_BIDIRECTIONAL))
return -EINVAL;
+
+ pool->dma_map = true;
}
- if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
+ if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
/* In order to request DMA-sync-for-device the page
* needs to be mapped
*/
- if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
return -EINVAL;
if (!pool->p.max_len)
return -EINVAL;
+ pool->dma_sync = true;
+
/* pool->p.offset has to be set according to the address
* offset used by the DMA engine to start copying rx data
*/
@@ -217,14 +246,24 @@ static int page_pool_init(struct page_pool *pool,
pool->has_init_callback = !!pool->slow.init_callback;
#ifdef CONFIG_PAGE_POOL_STATS
- pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
- if (!pool->recycle_stats)
- return -ENOMEM;
+ if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
+ pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
+ if (!pool->recycle_stats)
+ return -ENOMEM;
+ } else {
+ /* For system page pool instance we use a singular stats object
+ * instead of allocating a separate percpu variable for each
+ * (also percpu) page pool instance.
+ */
+ pool->recycle_stats = &pp_system_recycle_stats;
+ pool->system = true;
+ }
#endif
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
- free_percpu(pool->recycle_stats);
+ if (!pool->system)
+ free_percpu(pool->recycle_stats);
#endif
return -ENOMEM;
}
@@ -234,29 +273,65 @@ static int page_pool_init(struct page_pool *pool,
/* Driver calling page_pool_create() also call page_pool_destroy() */
refcount_set(&pool->user_cnt, 1);
- if (pool->p.flags & PP_FLAG_DMA_MAP)
+ if (pool->dma_map)
get_device(pool->p.dev);
+ if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
+ /* We rely on rtnl_lock()ing to make sure netdev_rx_queue
+ * configuration doesn't change while we're initializing
+ * the page_pool.
+ */
+ ASSERT_RTNL();
+ rxq = __netif_get_rx_queue(pool->slow.netdev,
+ pool->slow.queue_idx);
+ pool->mp_priv = rxq->mp_params.mp_priv;
+ }
+
+ if (pool->mp_priv) {
+ if (!pool->dma_map || !pool->dma_sync)
+ return -EOPNOTSUPP;
+
+ err = mp_dmabuf_devmem_init(pool);
+ if (err) {
+ pr_warn("%s() mem-provider init failed %d\n", __func__,
+ err);
+ goto free_ptr_ring;
+ }
+
+ static_branch_inc(&page_pool_mem_providers);
+ }
+
return 0;
+
+free_ptr_ring:
+ ptr_ring_cleanup(&pool->ring, NULL);
+#ifdef CONFIG_PAGE_POOL_STATS
+ if (!pool->system)
+ free_percpu(pool->recycle_stats);
+#endif
+ return err;
}
static void page_pool_uninit(struct page_pool *pool)
{
ptr_ring_cleanup(&pool->ring, NULL);
- if (pool->p.flags & PP_FLAG_DMA_MAP)
+ if (pool->dma_map)
put_device(pool->p.dev);
#ifdef CONFIG_PAGE_POOL_STATS
- free_percpu(pool->recycle_stats);
+ if (!pool->system)
+ free_percpu(pool->recycle_stats);
#endif
}
/**
- * page_pool_create() - create a page pool.
+ * page_pool_create_percpu() - create a page pool for a given cpu.
* @params: parameters, see struct page_pool_params
+ * @cpuid: cpu identifier
*/
-struct page_pool *page_pool_create(const struct page_pool_params *params)
+struct page_pool *
+page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
{
struct page_pool *pool;
int err;
@@ -265,7 +340,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
if (!pool)
return ERR_PTR(-ENOMEM);
- err = page_pool_init(pool, params);
+ err = page_pool_init(pool, params, cpuid);
if (err < 0)
goto err_free;
@@ -282,21 +357,30 @@ err_free:
kfree(pool);
return ERR_PTR(err);
}
+EXPORT_SYMBOL(page_pool_create_percpu);
+
+/**
+ * page_pool_create() - create a page pool
+ * @params: parameters, see struct page_pool_params
+ */
+struct page_pool *page_pool_create(const struct page_pool_params *params)
+{
+ return page_pool_create_percpu(params, -1);
+}
EXPORT_SYMBOL(page_pool_create);
-static void page_pool_return_page(struct page_pool *pool, struct page *page);
+static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
-noinline
-static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
+static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
{
struct ptr_ring *r = &pool->ring;
- struct page *page;
+ netmem_ref netmem;
int pref_nid; /* preferred NUMA node */
/* Quicker fallback, avoid locks when ring is empty */
if (__ptr_ring_empty(r)) {
alloc_stat_inc(pool, empty);
- return NULL;
+ return 0;
}
/* Softirq guarantee CPU and thus NUMA node is stable. This,
@@ -311,64 +395,74 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
/* Refill alloc array, but only if NUMA match */
do {
- page = __ptr_ring_consume(r);
- if (unlikely(!page))
+ netmem = (__force netmem_ref)__ptr_ring_consume(r);
+ if (unlikely(!netmem))
break;
- if (likely(page_to_nid(page) == pref_nid)) {
- pool->alloc.cache[pool->alloc.count++] = page;
+ if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
+ pool->alloc.cache[pool->alloc.count++] = netmem;
} else {
/* NUMA mismatch;
* (1) release 1 page to page-allocator and
* (2) break out to fallthrough to alloc_pages_node.
* This limit stress on page buddy alloactor.
*/
- page_pool_return_page(pool, page);
+ page_pool_return_page(pool, netmem);
alloc_stat_inc(pool, waive);
- page = NULL;
+ netmem = 0;
break;
}
} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
/* Return last page */
if (likely(pool->alloc.count > 0)) {
- page = pool->alloc.cache[--pool->alloc.count];
+ netmem = pool->alloc.cache[--pool->alloc.count];
alloc_stat_inc(pool, refill);
}
- return page;
+ return netmem;
}
/* fast path */
-static struct page *__page_pool_get_cached(struct page_pool *pool)
+static netmem_ref __page_pool_get_cached(struct page_pool *pool)
{
- struct page *page;
+ netmem_ref netmem;
/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
if (likely(pool->alloc.count)) {
/* Fast-path */
- page = pool->alloc.cache[--pool->alloc.count];
+ netmem = pool->alloc.cache[--pool->alloc.count];
alloc_stat_inc(pool, fast);
} else {
- page = page_pool_refill_alloc_cache(pool);
+ netmem = page_pool_refill_alloc_cache(pool);
}
- return page;
+ return netmem;
}
-static void page_pool_dma_sync_for_device(struct page_pool *pool,
- struct page *page,
- unsigned int dma_sync_size)
+static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
+ netmem_ref netmem,
+ u32 dma_sync_size)
{
- dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
+ dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
dma_sync_size = min(dma_sync_size, pool->p.max_len);
- dma_sync_single_range_for_device(pool->p.dev, dma_addr,
- pool->p.offset, dma_sync_size,
- pool->p.dma_dir);
+ __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
+ dma_sync_size, pool->p.dma_dir);
+#endif
+}
+
+static __always_inline void
+page_pool_dma_sync_for_device(const struct page_pool *pool,
+ netmem_ref netmem,
+ u32 dma_sync_size)
+{
+ if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
+ __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
}
-static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
+static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
{
dma_addr_t dma;
@@ -377,52 +471,28 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
* into page private data (i.e 32bit cpu with 64bit DMA caps)
* This mapping is kept for lifetime of page, until leaving pool.
*/
- dma = dma_map_page_attrs(pool->p.dev, page, 0,
- (PAGE_SIZE << pool->p.order),
- pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
- DMA_ATTR_WEAK_ORDERING);
+ dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
+ (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
+ DMA_ATTR_SKIP_CPU_SYNC |
+ DMA_ATTR_WEAK_ORDERING);
if (dma_mapping_error(pool->p.dev, dma))
return false;
- if (page_pool_set_dma_addr(page, dma))
+ if (page_pool_set_dma_addr_netmem(netmem, dma))
goto unmap_failed;
- if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
- page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
+ page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
return true;
unmap_failed:
- WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
+ WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
return false;
}
-static void page_pool_set_pp_info(struct page_pool *pool,
- struct page *page)
-{
- page->pp = pool;
- page->pp_magic |= PP_SIGNATURE;
-
- /* Ensuring all pages have been split into one fragment initially:
- * page_pool_set_pp_info() is only called once for every page when it
- * is allocated from the page allocator and page_pool_fragment_page()
- * is dirtying the same cache line as the page->pp_magic above, so
- * the overhead is negligible.
- */
- page_pool_fragment_page(page, 1);
- if (pool->has_init_callback)
- pool->slow.init_callback(page, pool->slow.init_arg);
-}
-
-static void page_pool_clear_pp_info(struct page *page)
-{
- page->pp_magic = 0;
- page->pp = NULL;
-}
-
static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
gfp_t gfp)
{
@@ -433,94 +503,102 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
if (unlikely(!page))
return NULL;
- if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
- unlikely(!page_pool_dma_map(pool, page))) {
+ if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
put_page(page);
return NULL;
}
alloc_stat_inc(pool, slow_high_order);
- page_pool_set_pp_info(pool, page);
+ page_pool_set_pp_info(pool, page_to_netmem(page));
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
- trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+ trace_page_pool_state_hold(pool, page_to_netmem(page),
+ pool->pages_state_hold_cnt);
return page;
}
/* slow path */
-noinline
-static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
- gfp_t gfp)
+static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
+ gfp_t gfp)
{
const int bulk = PP_ALLOC_CACHE_REFILL;
- unsigned int pp_flags = pool->p.flags;
unsigned int pp_order = pool->p.order;
- struct page *page;
+ bool dma_map = pool->dma_map;
+ netmem_ref netmem;
int i, nr_pages;
/* Don't support bulk alloc for high-order pages */
if (unlikely(pp_order))
- return __page_pool_alloc_page_order(pool, gfp);
+ return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
/* Unnecessary as alloc cache is empty, but guarantees zero count */
if (unlikely(pool->alloc.count > 0))
return pool->alloc.cache[--pool->alloc.count];
- /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
+ /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */
memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
- nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
- pool->alloc.cache);
+ nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk,
+ (struct page **)pool->alloc.cache);
if (unlikely(!nr_pages))
- return NULL;
+ return 0;
/* Pages have been filled into alloc.cache array, but count is zero and
* page element have not been (possibly) DMA mapped.
*/
for (i = 0; i < nr_pages; i++) {
- page = pool->alloc.cache[i];
- if ((pp_flags & PP_FLAG_DMA_MAP) &&
- unlikely(!page_pool_dma_map(pool, page))) {
- put_page(page);
+ netmem = pool->alloc.cache[i];
+ if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
+ put_page(netmem_to_page(netmem));
continue;
}
- page_pool_set_pp_info(pool, page);
- pool->alloc.cache[pool->alloc.count++] = page;
+ page_pool_set_pp_info(pool, netmem);
+ pool->alloc.cache[pool->alloc.count++] = netmem;
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
- trace_page_pool_state_hold(pool, page,
+ trace_page_pool_state_hold(pool, netmem,
pool->pages_state_hold_cnt);
}
/* Return last page */
if (likely(pool->alloc.count > 0)) {
- page = pool->alloc.cache[--pool->alloc.count];
+ netmem = pool->alloc.cache[--pool->alloc.count];
alloc_stat_inc(pool, slow);
} else {
- page = NULL;
+ netmem = 0;
}
/* When page just alloc'ed is should/must have refcnt 1. */
- return page;
+ return netmem;
}
/* For using page_pool replace: alloc_pages() API calls, but provide
* synchronization guarantee for allocation side.
*/
-struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp)
{
- struct page *page;
+ netmem_ref netmem;
/* Fast-path: Get a page from cache */
- page = __page_pool_get_cached(pool);
- if (page)
- return page;
+ netmem = __page_pool_get_cached(pool);
+ if (netmem)
+ return netmem;
/* Slow-path: cache empty, do real allocation */
- page = __page_pool_alloc_pages_slow(pool, gfp);
- return page;
+ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
+ netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp);
+ else
+ netmem = __page_pool_alloc_pages_slow(pool, gfp);
+ return netmem;
+}
+EXPORT_SYMBOL(page_pool_alloc_netmems);
+ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL);
+
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+{
+ return netmem_to_page(page_pool_alloc_netmems(pool, gfp));
}
EXPORT_SYMBOL(page_pool_alloc_pages);
@@ -548,24 +626,46 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict)
return inflight;
}
-static __always_inline
-void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
+{
+ netmem_set_pp(netmem, pool);
+ netmem_or_pp_magic(netmem, PP_SIGNATURE);
+
+ /* Ensuring all pages have been split into one fragment initially:
+ * page_pool_set_pp_info() is only called once for every page when it
+ * is allocated from the page allocator and page_pool_fragment_page()
+ * is dirtying the same cache line as the page->pp_magic above, so
+ * the overhead is negligible.
+ */
+ page_pool_fragment_netmem(netmem, 1);
+ if (pool->has_init_callback)
+ pool->slow.init_callback(netmem, pool->slow.init_arg);
+}
+
+void page_pool_clear_pp_info(netmem_ref netmem)
+{
+ netmem_clear_pp_magic(netmem);
+ netmem_set_pp(netmem, NULL);
+}
+
+static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
+ netmem_ref netmem)
{
dma_addr_t dma;
- if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ if (!pool->dma_map)
/* Always account for inflight pages, even if we didn't
* map them
*/
return;
- dma = page_pool_get_dma_addr(page);
+ dma = page_pool_get_dma_addr_netmem(netmem);
/* When page is unmapped, it cannot be returned to our pool */
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
- page_pool_set_dma_addr(page, 0);
+ page_pool_set_dma_addr_netmem(netmem, 0);
}
/* Disconnects a page (from a page_pool). API users can have a need
@@ -573,35 +673,41 @@ void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
* a regular page (that will eventually be returned to the normal
* page-allocator via put_page).
*/
-void page_pool_return_page(struct page_pool *pool, struct page *page)
+void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
{
int count;
+ bool put;
- __page_pool_release_page_dma(pool, page);
-
- page_pool_clear_pp_info(page);
+ put = true;
+ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
+ put = mp_dmabuf_devmem_release_page(pool, netmem);
+ else
+ __page_pool_release_page_dma(pool, netmem);
/* This may be the last page returned, releasing the pool, so
* it is not safe to reference pool afterwards.
*/
count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
- trace_page_pool_state_release(pool, page, count);
+ trace_page_pool_state_release(pool, netmem, count);
- put_page(page);
+ if (put) {
+ page_pool_clear_pp_info(netmem);
+ put_page(netmem_to_page(netmem));
+ }
/* An optimization would be to call __free_pages(page, pool->p.order)
* knowing page is not part of page-cache (thus avoiding a
* __page_cache_release() call).
*/
}
-static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
+static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
{
int ret;
/* BH protection not needed if current is softirq */
if (in_softirq())
- ret = ptr_ring_produce(&pool->ring, page);
+ ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
else
- ret = ptr_ring_produce_bh(&pool->ring, page);
+ ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);
if (!ret) {
recycle_stat_inc(pool, ring);
@@ -616,7 +722,7 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
*
* Caller must provide appropriate safe context.
*/
-static bool page_pool_recycle_in_cache(struct page *page,
+static bool page_pool_recycle_in_cache(netmem_ref netmem,
struct page_pool *pool)
{
if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
@@ -625,19 +731,26 @@ static bool page_pool_recycle_in_cache(struct page *page,
}
/* Caller MUST have verified/know (page_ref_count(page) == 1) */
- pool->alloc.cache[pool->alloc.count++] = page;
+ pool->alloc.cache[pool->alloc.count++] = netmem;
recycle_stat_inc(pool, cached);
return true;
}
+static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
+{
+ return netmem_is_net_iov(netmem) ||
+ (page_ref_count(netmem_to_page(netmem)) == 1 &&
+ !page_is_pfmemalloc(netmem_to_page(netmem)));
+}
+
/* If the page refcnt == 1, this will try to recycle the page.
- * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
+ * If pool->dma_sync is set, we'll try to sync the DMA area for
* the configured size min(dma_sync_size, pool->max_len).
* If the page refcnt != 1, then the page will be returned to memory
* subsystem.
*/
-static __always_inline struct page *
-__page_pool_put_page(struct page_pool *pool, struct page *page,
+static __always_inline netmem_ref
+__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
unsigned int dma_sync_size, bool allow_direct)
{
lockdep_assert_no_hardirq();
@@ -651,20 +764,18 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
* page is NOT reusable when allocated when system is under
* some pressure. (page_is_pfmemalloc)
*/
- if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
+ if (likely(__page_pool_page_can_be_recycled(netmem))) {
/* Read barrier done in page_ref_count / READ_ONCE */
- if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
- page_pool_dma_sync_for_device(pool, page,
- dma_sync_size);
+ page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
- if (allow_direct && in_softirq() &&
- page_pool_recycle_in_cache(page, pool))
- return NULL;
+ if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
+ return 0;
/* Page found as candidate for recycling */
- return page;
+ return netmem;
}
+
/* Fallback/non-XDP mode: API user have elevated refcnt.
*
* Many drivers split up the page into fragments, and some
@@ -679,174 +790,253 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
* will be invoking put_page.
*/
recycle_stat_inc(pool, released_refcnt);
- page_pool_return_page(pool, page);
+ page_pool_return_page(pool, netmem);
- return NULL;
+ return 0;
}
-void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
- unsigned int dma_sync_size, bool allow_direct)
+static bool page_pool_napi_local(const struct page_pool *pool)
{
- page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
- if (page && !page_pool_recycle_in_ring(pool, page)) {
+ const struct napi_struct *napi;
+ u32 cpuid;
+
+ if (unlikely(!in_softirq()))
+ return false;
+
+ /* Allow direct recycle if we have reasons to believe that we are
+ * in the same context as the consumer would run, so there's
+ * no possible race.
+ * __page_pool_put_page() makes sure we're not in hardirq context
+ * and interrupts are enabled prior to accessing the cache.
+ */
+ cpuid = smp_processor_id();
+ if (READ_ONCE(pool->cpuid) == cpuid)
+ return true;
+
+ napi = READ_ONCE(pool->p.napi);
+
+ return napi && READ_ONCE(napi->list_owner) == cpuid;
+}
+
+void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
+ unsigned int dma_sync_size, bool allow_direct)
+{
+ if (!allow_direct)
+ allow_direct = page_pool_napi_local(pool);
+
+ netmem =
+ __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
+ if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
/* Cache full, fallback to free pages */
recycle_stat_inc(pool, ring_full);
- page_pool_return_page(pool, page);
+ page_pool_return_page(pool, netmem);
}
}
+EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
+
+void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
+{
+ page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
+ allow_direct);
+}
EXPORT_SYMBOL(page_pool_put_unrefed_page);
-/**
- * page_pool_put_page_bulk() - release references on multiple pages
- * @pool: pool from which pages were allocated
- * @data: array holding page pointers
- * @count: number of pages in @data
- *
- * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
- * producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
- * will release leftover pages to the page allocator.
- * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
- * completion loop for the XDP_REDIRECT use case.
- *
- * Please note the caller must not use data area after running
- * page_pool_put_page_bulk(), as this function overwrites it.
- */
-void page_pool_put_page_bulk(struct page_pool *pool, void **data,
- int count)
+static void page_pool_recycle_ring_bulk(struct page_pool *pool,
+ netmem_ref *bulk,
+ u32 bulk_len)
{
- int i, bulk_len = 0;
bool in_softirq;
+ u32 i;
- for (i = 0; i < count; i++) {
- struct page *page = virt_to_head_page(data[i]);
-
- /* It is not the last user for the page frag case */
- if (!page_pool_is_last_ref(page))
- continue;
-
- page = __page_pool_put_page(pool, page, -1, false);
- /* Approved for bulk recycling in ptr_ring cache */
- if (page)
- data[bulk_len++] = page;
- }
-
- if (unlikely(!bulk_len))
- return;
-
- /* Bulk producer into ptr_ring page_pool cache */
+ /* Bulk produce into ptr_ring page_pool cache */
in_softirq = page_pool_producer_lock(pool);
+
for (i = 0; i < bulk_len; i++) {
- if (__ptr_ring_produce(&pool->ring, data[i])) {
+ if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) {
/* ring full */
recycle_stat_inc(pool, ring_full);
break;
}
}
- recycle_stat_add(pool, ring, i);
+
page_pool_producer_unlock(pool, in_softirq);
+ recycle_stat_add(pool, ring, i);
- /* Hopefully all pages was return into ptr_ring */
+ /* Hopefully all pages were returned into ptr_ring */
if (likely(i == bulk_len))
return;
- /* ptr_ring cache full, free remaining pages outside producer lock
- * since put_page() with refcnt == 1 can be an expensive operation
+ /*
+ * ptr_ring cache is full, free remaining pages outside producer lock
+ * since put_page() with refcnt == 1 can be an expensive operation.
*/
for (; i < bulk_len; i++)
- page_pool_return_page(pool, data[i]);
+ page_pool_return_page(pool, bulk[i]);
}
-EXPORT_SYMBOL(page_pool_put_page_bulk);
-static struct page *page_pool_drain_frag(struct page_pool *pool,
- struct page *page)
+/**
+ * page_pool_put_netmem_bulk() - release references on multiple netmems
+ * @data: array holding netmem references
+ * @count: number of entries in @data
+ *
+ * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring
+ * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk()
+ * will release leftover netmems to the memory provider.
+ * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx
+ * completion loop for the XDP_REDIRECT use case.
+ *
+ * Please note the caller must not use data area after running
+ * page_pool_put_netmem_bulk(), as this function overwrites it.
+ */
+void page_pool_put_netmem_bulk(netmem_ref *data, u32 count)
+{
+ u32 bulk_len = 0;
+
+ for (u32 i = 0; i < count; i++) {
+ netmem_ref netmem = netmem_compound_head(data[i]);
+
+ if (page_pool_unref_and_test(netmem))
+ data[bulk_len++] = netmem;
+ }
+
+ count = bulk_len;
+ while (count) {
+ netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
+ struct page_pool *pool = NULL;
+ bool allow_direct;
+ u32 foreign = 0;
+
+ bulk_len = 0;
+
+ for (u32 i = 0; i < count; i++) {
+ struct page_pool *netmem_pp;
+ netmem_ref netmem = data[i];
+
+ netmem_pp = netmem_get_pp(netmem);
+ if (unlikely(!pool)) {
+ pool = netmem_pp;
+ allow_direct = page_pool_napi_local(pool);
+ } else if (netmem_pp != pool) {
+ /*
+ * If the netmem belongs to a different
+ * page_pool, save it for another round.
+ */
+ data[foreign++] = netmem;
+ continue;
+ }
+
+ netmem = __page_pool_put_page(pool, netmem, -1,
+ allow_direct);
+ /* Approved for bulk recycling in ptr_ring cache */
+ if (netmem)
+ bulk[bulk_len++] = netmem;
+ }
+
+ if (bulk_len)
+ page_pool_recycle_ring_bulk(pool, bulk, bulk_len);
+
+ count = foreign;
+ }
+}
+EXPORT_SYMBOL(page_pool_put_netmem_bulk);
+
+static netmem_ref page_pool_drain_frag(struct page_pool *pool,
+ netmem_ref netmem)
{
long drain_count = BIAS_MAX - pool->frag_users;
/* Some user is still using the page frag */
- if (likely(page_pool_unref_page(page, drain_count)))
- return NULL;
+ if (likely(page_pool_unref_netmem(netmem, drain_count)))
+ return 0;
- if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
- if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
- page_pool_dma_sync_for_device(pool, page, -1);
-
- return page;
+ if (__page_pool_page_can_be_recycled(netmem)) {
+ page_pool_dma_sync_for_device(pool, netmem, -1);
+ return netmem;
}
- page_pool_return_page(pool, page);
- return NULL;
+ page_pool_return_page(pool, netmem);
+ return 0;
}
static void page_pool_free_frag(struct page_pool *pool)
{
long drain_count = BIAS_MAX - pool->frag_users;
- struct page *page = pool->frag_page;
+ netmem_ref netmem = pool->frag_page;
- pool->frag_page = NULL;
+ pool->frag_page = 0;
- if (!page || page_pool_unref_page(page, drain_count))
+ if (!netmem || page_pool_unref_netmem(netmem, drain_count))
return;
- page_pool_return_page(pool, page);
+ page_pool_return_page(pool, netmem);
}
-struct page *page_pool_alloc_frag(struct page_pool *pool,
- unsigned int *offset,
- unsigned int size, gfp_t gfp)
+netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
+ unsigned int *offset, unsigned int size,
+ gfp_t gfp)
{
unsigned int max_size = PAGE_SIZE << pool->p.order;
- struct page *page = pool->frag_page;
+ netmem_ref netmem = pool->frag_page;
if (WARN_ON(size > max_size))
- return NULL;
+ return 0;
size = ALIGN(size, dma_get_cache_alignment());
*offset = pool->frag_offset;
- if (page && *offset + size > max_size) {
- page = page_pool_drain_frag(pool, page);
- if (page) {
+ if (netmem && *offset + size > max_size) {
+ netmem = page_pool_drain_frag(pool, netmem);
+ if (netmem) {
+ recycle_stat_inc(pool, cached);
alloc_stat_inc(pool, fast);
goto frag_reset;
}
}
- if (!page) {
- page = page_pool_alloc_pages(pool, gfp);
- if (unlikely(!page)) {
- pool->frag_page = NULL;
- return NULL;
+ if (!netmem) {
+ netmem = page_pool_alloc_netmems(pool, gfp);
+ if (unlikely(!netmem)) {
+ pool->frag_page = 0;
+ return 0;
}
- pool->frag_page = page;
+ pool->frag_page = netmem;
frag_reset:
pool->frag_users = 1;
*offset = 0;
pool->frag_offset = size;
- page_pool_fragment_page(page, BIAS_MAX);
- return page;
+ page_pool_fragment_netmem(netmem, BIAS_MAX);
+ return netmem;
}
pool->frag_users++;
pool->frag_offset = *offset + size;
- alloc_stat_inc(pool, fast);
- return page;
+ return netmem;
+}
+EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
+
+struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
+ unsigned int size, gfp_t gfp)
+{
+ return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
+ gfp));
}
EXPORT_SYMBOL(page_pool_alloc_frag);
static void page_pool_empty_ring(struct page_pool *pool)
{
- struct page *page;
+ netmem_ref netmem;
/* Empty recycle ring */
- while ((page = ptr_ring_consume_bh(&pool->ring))) {
+ while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
/* Verify the refcnt invariant of cached pages */
- if (!(page_ref_count(page) == 1))
+ if (!(netmem_ref_count(netmem) == 1))
pr_crit("%s() page_pool refcnt %d violation\n",
- __func__, page_ref_count(page));
+ __func__, netmem_ref_count(netmem));
- page_pool_return_page(pool, page);
+ page_pool_return_page(pool, netmem);
}
}
@@ -857,12 +1047,18 @@ static void __page_pool_destroy(struct page_pool *pool)
page_pool_unlist(pool);
page_pool_uninit(pool);
+
+ if (pool->mp_priv) {
+ mp_dmabuf_devmem_destroy(pool);
+ static_branch_dec(&page_pool_mem_providers);
+ }
+
kfree(pool);
}
static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
- struct page *page;
+ netmem_ref netmem;
if (pool->destroy_cnt)
return;
@@ -872,8 +1068,8 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
* call concurrently.
*/
while (pool->alloc.count) {
- page = pool->alloc.cache[--pool->alloc.count];
- page_pool_return_page(pool, page);
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ page_pool_return_page(pool, netmem);
}
}
@@ -927,27 +1123,34 @@ static void page_pool_release_retry(struct work_struct *wq)
}
void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
- struct xdp_mem_info *mem)
+ const struct xdp_mem_info *mem)
{
refcount_inc(&pool->user_cnt);
pool->disconnect = disconnect;
pool->xdp_mem_id = mem->id;
}
-void page_pool_unlink_napi(struct page_pool *pool)
+void page_pool_disable_direct_recycling(struct page_pool *pool)
{
+ /* Disable direct recycling based on pool->cpuid.
+ * Paired with READ_ONCE() in page_pool_napi_local().
+ */
+ WRITE_ONCE(pool->cpuid, -1);
+
if (!pool->p.napi)
return;
/* To avoid races with recycling and additional barriers make sure
* pool and NAPI are unlinked when NAPI is disabled.
*/
- WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
- READ_ONCE(pool->p.napi->list_owner) != -1);
+ WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
+ WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);
+ mutex_lock(&page_pools_lock);
WRITE_ONCE(pool->p.napi, NULL);
+ mutex_unlock(&page_pools_lock);
}
-EXPORT_SYMBOL(page_pool_unlink_napi);
+EXPORT_SYMBOL(page_pool_disable_direct_recycling);
void page_pool_destroy(struct page_pool *pool)
{
@@ -957,7 +1160,7 @@ void page_pool_destroy(struct page_pool *pool)
if (!page_pool_put(pool))
return;
- page_pool_unlink_napi(pool);
+ page_pool_disable_direct_recycling(pool);
page_pool_free_frag(pool);
if (!page_pool_release(pool))
@@ -975,15 +1178,15 @@ EXPORT_SYMBOL(page_pool_destroy);
/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
- struct page *page;
+ netmem_ref netmem;
trace_page_pool_update_nid(pool, new_nid);
pool->p.nid = new_nid;
/* Flush pool alloc cache, as refill will check NUMA node */
while (pool->alloc.count) {
- page = pool->alloc.cache[--pool->alloc.count];
- page_pool_return_page(pool, page);
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ page_pool_return_page(pool, netmem);
}
}
EXPORT_SYMBOL(page_pool_update_nid);
diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h
index 90665d40f1eb..2fb06d5f6d55 100644
--- a/net/core/page_pool_priv.h
+++ b/net/core/page_pool_priv.h
@@ -3,10 +3,58 @@
#ifndef __PAGE_POOL_PRIV_H
#define __PAGE_POOL_PRIV_H
+#include <net/page_pool/helpers.h>
+
+#include "netmem_priv.h"
+
+extern struct mutex page_pools_lock;
+
s32 page_pool_inflight(const struct page_pool *pool, bool strict);
int page_pool_list(struct page_pool *pool);
void page_pool_detached(struct page_pool *pool);
void page_pool_unlist(struct page_pool *pool);
+static inline bool
+page_pool_set_dma_addr_netmem(netmem_ref netmem, dma_addr_t addr)
+{
+ if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
+ netmem_set_dma_addr(netmem, addr >> PAGE_SHIFT);
+
+ /* We assume page alignment to shave off bottom bits,
+ * if this "compression" doesn't work we need to drop.
+ */
+ return addr != (dma_addr_t)netmem_get_dma_addr(netmem)
+ << PAGE_SHIFT;
+ }
+
+ netmem_set_dma_addr(netmem, addr);
+ return false;
+}
+
+static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
+{
+ return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr);
+}
+
+#if defined(CONFIG_PAGE_POOL)
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem);
+void page_pool_clear_pp_info(netmem_ref netmem);
+int page_pool_check_memory_provider(struct net_device *dev,
+ struct netdev_rx_queue *rxq);
+#else
+static inline void page_pool_set_pp_info(struct page_pool *pool,
+ netmem_ref netmem)
+{
+}
+static inline void page_pool_clear_pp_info(netmem_ref netmem)
+{
+}
+static inline int page_pool_check_memory_provider(struct net_device *dev,
+ struct netdev_rx_queue *rxq)
+{
+ return 0;
+}
+#endif
+
#endif
diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
index 278294aca66a..6677e0c2e256 100644
--- a/net/core/page_pool_user.c
+++ b/net/core/page_pool_user.c
@@ -3,19 +3,23 @@
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/xarray.h>
+#include <net/busy_poll.h>
#include <net/net_debug.h>
-#include <net/page_pool/types.h>
+#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
+#include <net/page_pool/types.h>
#include <net/sock.h>
+#include "devmem.h"
#include "page_pool_priv.h"
#include "netdev-genl-gen.h"
static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1);
-/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user.
+/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev,
+ * pool->user.
* Ordering: inside rtnl_lock
*/
-static DEFINE_MUTEX(page_pools_lock);
+DEFINE_MUTEX(page_pools_lock);
/* Page pools are only reachable from user space (via netlink) if they are
* linked to a netdev at creation time. Following page pool "visibility"
@@ -103,8 +107,6 @@ out:
mutex_unlock(&page_pools_lock);
rtnl_unlock();
- if (skb->len && err == -EMSGSIZE)
- return skb->len;
return err;
}
@@ -214,7 +216,9 @@ static int
page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
const struct genl_info *info)
{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
size_t inflight, refsz;
+ unsigned int napi_id;
void *hdr;
hdr = genlmsg_iput(rsp, info);
@@ -228,8 +232,10 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX,
pool->slow.netdev->ifindex))
goto err_cancel;
- if (pool->user.napi_id &&
- nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id))
+
+ napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0;
+ if (napi_id >= MIN_NAPI_ID &&
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id))
goto err_cancel;
inflight = page_pool_inflight(pool, false);
@@ -243,6 +249,9 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
pool->user.detach_time))
goto err_cancel;
+ if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id))
+ goto err_cancel;
+
genlmsg_end(rsp, hdr);
return 0;
@@ -315,8 +324,6 @@ int page_pool_list(struct page_pool *pool)
if (pool->slow.netdev) {
hlist_add_head(&pool->user.list,
&pool->slow.netdev->page_pools);
- pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0;
-
netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF);
}
@@ -346,6 +353,30 @@ void page_pool_unlist(struct page_pool *pool)
mutex_unlock(&page_pools_lock);
}
+int page_pool_check_memory_provider(struct net_device *dev,
+ struct netdev_rx_queue *rxq)
+{
+ struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv;
+ struct page_pool *pool;
+ struct hlist_node *n;
+
+ if (!binding)
+ return 0;
+
+ mutex_lock(&page_pools_lock);
+ hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) {
+ if (pool->mp_priv != binding)
+ continue;
+
+ if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) {
+ mutex_unlock(&page_pools_lock);
+ return 0;
+ }
+ }
+ mutex_unlock(&page_pools_lock);
+ return -ENODATA;
+}
+
static void page_pool_unreg_netdev_wipe(struct net_device *netdev)
{
struct page_pool *pool;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index ea55a758a475..82b6a2c3c141 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -69,7 +69,7 @@
*
* By design there should only be *one* "controlling" process. In practice
* multiple write accesses gives unpredictable result. Understood by "write"
- * to /proc gives result code thats should be read be the "writer".
+ * to /proc gives result code that should be read be the "writer".
* For practical use this should be no problem.
*
* Note when adding devices to a specific CPU there good idea to also assign
@@ -851,6 +851,9 @@ static ssize_t get_imix_entries(const char __user *buffer,
unsigned long weight;
unsigned long size;
+ if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES)
+ return -E2BIG;
+
len = num_arg(&buffer[i], max_digits, &size);
if (len < 0)
return len;
@@ -880,9 +883,6 @@ static ssize_t get_imix_entries(const char __user *buffer,
i++;
pkt_dev->n_imix_entries++;
-
- if (pkt_dev->n_imix_entries > MAX_IMIX_ENTRIES)
- return -E2BIG;
} while (c == ' ');
return i;
@@ -2285,7 +2285,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
s64 remaining;
struct hrtimer_sleeper t;
- hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
hrtimer_set_expires(&t.timer, spin_until);
remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
@@ -2371,11 +2371,11 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
if (pkt_dev->spi) {
/* We need as quick as possible to find the right SA
- * Searching with minimum criteria to archieve this.
+ * Searching with minimum criteria to achieve, this.
*/
x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
} else {
- /* slow path: we dont already have xfrm_state */
+ /* slow path: we don't already have xfrm_state */
x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0,
(xfrm_address_t *)&pkt_dev->cur_daddr,
(xfrm_address_t *)&pkt_dev->cur_saddr,
@@ -3654,7 +3654,7 @@ static int pktgen_thread_worker(void *arg)
struct pktgen_dev *pkt_dev = NULL;
int cpu = t->cpu;
- WARN_ON(smp_processor_id() != cpu);
+ WARN_ON_ONCE(smp_processor_id() != cpu);
init_waitqueue_head(&t->queue);
complete(&t->start_done);
@@ -3838,8 +3838,8 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;
pkt_dev->ipsproto = IPPROTO_ESP;
- /* xfrm tunnel mode needs additional dst to extract outter
- * ip header protocol/ttl/id field, here creat a phony one.
+ /* xfrm tunnel mode needs additional dst to extract outer
+ * ip header protocol/ttl/id field, here create a phony one.
* instead of looking for a valid rt, which definitely hurting
* performance under such circumstance.
*/
@@ -3883,17 +3883,14 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
list_add_tail(&t->th_list, &pn->pktgen_threads);
init_completion(&t->start_done);
- p = kthread_create_on_node(pktgen_thread_worker,
- t,
- cpu_to_node(cpu),
- "kpktgend_%d", cpu);
+ p = kthread_create_on_cpu(pktgen_thread_worker, t, cpu, "kpktgend_%d");
if (IS_ERR(p)) {
pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu);
list_del(&t->th_list);
kfree(t);
return PTR_ERR(p);
}
- kthread_bind(p, cpu);
+
t->tsk = p;
pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,
@@ -3989,6 +3986,7 @@ static int __net_init pg_net_init(struct net *net)
goto remove;
}
+ cpus_read_lock();
for_each_online_cpu(cpu) {
int err;
@@ -3997,6 +3995,7 @@ static int __net_init pg_net_init(struct net *net)
pr_warn("Cannot create thread for cpu %d (%d)\n",
cpu, err);
}
+ cpus_read_unlock();
if (list_empty(&pn->pktgen_threads)) {
pr_err("Initialization failed for all threads\n");
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bd50e9fe3234..d1e559fce918 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -84,7 +84,6 @@ int rtnl_lock_killable(void)
{
return mutex_lock_killable(&rtnl_mutex);
}
-EXPORT_SYMBOL(rtnl_lock_killable);
static struct sk_buff *defer_kfree_skb_list;
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
@@ -179,6 +178,176 @@ bool lockdep_rtnl_is_held(void)
EXPORT_SYMBOL(lockdep_rtnl_is_held);
#endif /* #ifdef CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+void __rtnl_net_lock(struct net *net)
+{
+ ASSERT_RTNL();
+
+ mutex_lock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_lock);
+
+void __rtnl_net_unlock(struct net *net)
+{
+ ASSERT_RTNL();
+
+ mutex_unlock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_unlock);
+
+void rtnl_net_lock(struct net *net)
+{
+ rtnl_lock();
+ __rtnl_net_lock(net);
+}
+EXPORT_SYMBOL(rtnl_net_lock);
+
+void rtnl_net_unlock(struct net *net)
+{
+ __rtnl_net_unlock(net);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(rtnl_net_unlock);
+
+int rtnl_net_trylock(struct net *net)
+{
+ int ret = rtnl_trylock();
+
+ if (ret)
+ __rtnl_net_lock(net);
+
+ return ret;
+}
+EXPORT_SYMBOL(rtnl_net_trylock);
+
+int rtnl_net_lock_killable(struct net *net)
+{
+ int ret = rtnl_lock_killable();
+
+ if (!ret)
+ __rtnl_net_lock(net);
+
+ return ret;
+}
+
+static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
+{
+ if (net_eq(net_a, net_b))
+ return 0;
+
+ /* always init_net first */
+ if (net_eq(net_a, &init_net))
+ return -1;
+
+ if (net_eq(net_b, &init_net))
+ return 1;
+
+ /* otherwise lock in ascending order */
+ return net_a < net_b ? -1 : 1;
+}
+
+int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b)
+{
+ const struct net *net_a, *net_b;
+
+ net_a = container_of(a, struct net, rtnl_mutex.dep_map);
+ net_b = container_of(b, struct net, rtnl_mutex.dep_map);
+
+ return rtnl_net_cmp_locks(net_a, net_b);
+}
+
+bool rtnl_net_is_locked(struct net *net)
+{
+ return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_net_is_locked);
+
+bool lockdep_rtnl_net_is_held(struct net *net)
+{
+ return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(lockdep_rtnl_net_is_held);
+#else
+static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
+{
+ /* No need to swap */
+ return -1;
+}
+#endif
+
+struct rtnl_nets {
+ /* ->newlink() needs to freeze 3 netns at most;
+ * 2 for the new device, 1 for its peer.
+ */
+ struct net *net[3];
+ unsigned char len;
+};
+
+static void rtnl_nets_init(struct rtnl_nets *rtnl_nets)
+{
+ memset(rtnl_nets, 0, sizeof(*rtnl_nets));
+}
+
+static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets)
+{
+ int i;
+
+ for (i = 0; i < rtnl_nets->len; i++) {
+ put_net(rtnl_nets->net[i]);
+ rtnl_nets->net[i] = NULL;
+ }
+
+ rtnl_nets->len = 0;
+}
+
+/**
+ * rtnl_nets_add - Add netns to be locked before ->newlink().
+ *
+ * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net().
+ * @net: netns pointer with an extra refcnt held.
+ *
+ * The extra refcnt is released in rtnl_nets_destroy().
+ */
+static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net)
+{
+ int i;
+
+ DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net));
+
+ for (i = 0; i < rtnl_nets->len; i++) {
+ switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) {
+ case 0:
+ put_net(net);
+ return;
+ case 1:
+ swap(rtnl_nets->net[i], net);
+ }
+ }
+
+ rtnl_nets->net[i] = net;
+ rtnl_nets->len++;
+}
+
+static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets)
+{
+ int i;
+
+ rtnl_lock();
+
+ for (i = 0; i < rtnl_nets->len; i++)
+ __rtnl_net_lock(rtnl_nets->net[i]);
+}
+
+static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets)
+{
+ int i;
+
+ for (i = 0; i < rtnl_nets->len; i++)
+ __rtnl_net_unlock(rtnl_nets->net[i]);
+
+ rtnl_unlock();
+}
+
static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
static inline int rtm_msgindex(int msgtype)
@@ -269,64 +438,13 @@ unlock:
}
/**
- * rtnl_register_module - Register a rtnetlink message type
- *
- * @owner: module registering the hook (THIS_MODULE)
- * @protocol: Protocol family or PF_UNSPEC
- * @msgtype: rtnetlink message type
- * @doit: Function pointer called for each request message
- * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
- *
- * Like rtnl_register, but for use by removable modules.
- */
-int rtnl_register_module(struct module *owner,
- int protocol, int msgtype,
- rtnl_doit_func doit, rtnl_dumpit_func dumpit,
- unsigned int flags)
-{
- return rtnl_register_internal(owner, protocol, msgtype,
- doit, dumpit, flags);
-}
-EXPORT_SYMBOL_GPL(rtnl_register_module);
-
-/**
- * rtnl_register - Register a rtnetlink message type
- * @protocol: Protocol family or PF_UNSPEC
- * @msgtype: rtnetlink message type
- * @doit: Function pointer called for each request message
- * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
- *
- * Registers the specified function pointers (at least one of them has
- * to be non-NULL) to be called whenever a request message for the
- * specified protocol family and message type is received.
- *
- * The special protocol family PF_UNSPEC may be used to define fallback
- * function pointers for the case when no entry for the specific protocol
- * family exists.
- */
-void rtnl_register(int protocol, int msgtype,
- rtnl_doit_func doit, rtnl_dumpit_func dumpit,
- unsigned int flags)
-{
- int err;
-
- err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit,
- flags);
- if (err)
- pr_err("Unable to register rtnetlink message handler, "
- "protocol = %d, message type = %d\n", protocol, msgtype);
-}
-
-/**
* rtnl_unregister - Unregister a rtnetlink message type
* @protocol: Protocol family or PF_UNSPEC
* @msgtype: rtnetlink message type
*
* Returns 0 on success or a negative error code.
*/
-int rtnl_unregister(int protocol, int msgtype)
+static int rtnl_unregister(int protocol, int msgtype)
{
struct rtnl_link __rcu **tab;
struct rtnl_link *link;
@@ -349,7 +467,6 @@ int rtnl_unregister(int protocol, int msgtype)
return 0;
}
-EXPORT_SYMBOL_GPL(rtnl_unregister);
/**
* rtnl_unregister_all - Unregister all rtnetlink message type of a protocol
@@ -384,46 +501,86 @@ void rtnl_unregister_all(int protocol)
}
EXPORT_SYMBOL_GPL(rtnl_unregister_all);
-static LIST_HEAD(link_ops);
-
-static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+/**
+ * __rtnl_register_many - Register rtnetlink message types
+ * @handlers: Array of struct rtnl_msg_handlers
+ * @n: The length of @handlers
+ *
+ * Registers the specified function pointers (at least one of them has
+ * to be non-NULL) to be called whenever a request message for the
+ * specified protocol family and message type is received.
+ *
+ * The special protocol family PF_UNSPEC may be used to define fallback
+ * function pointers for the case when no entry for the specific protocol
+ * family exists.
+ *
+ * When one element of @handlers fails to register,
+ * 1) built-in: panics.
+ * 2) modules : the previous successful registrations are unwinded
+ * and an error is returned.
+ *
+ * Use rtnl_register_many().
+ */
+int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n)
{
- const struct rtnl_link_ops *ops;
+ const struct rtnl_msg_handler *handler;
+ int i, err;
- list_for_each_entry(ops, &link_ops, list) {
- if (!strcmp(ops->kind, kind))
- return ops;
+ for (i = 0, handler = handlers; i < n; i++, handler++) {
+ err = rtnl_register_internal(handler->owner, handler->protocol,
+ handler->msgtype, handler->doit,
+ handler->dumpit, handler->flags);
+ if (err) {
+ if (!handler->owner)
+ panic("Unable to register rtnetlink message "
+ "handlers, %pS\n", handlers);
+
+ __rtnl_unregister_many(handlers, i);
+ break;
+ }
}
- return NULL;
+
+ return err;
}
+EXPORT_SYMBOL_GPL(__rtnl_register_many);
-/**
- * __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
- * @ops: struct rtnl_link_ops * to register
- *
- * The caller must hold the rtnl_mutex. This function should be used
- * by drivers that create devices during module initialization. It
- * must be called before registering the devices.
- *
- * Returns 0 on success or a negative error code.
- */
-int __rtnl_link_register(struct rtnl_link_ops *ops)
+void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n)
{
- if (rtnl_link_ops_get(ops->kind))
- return -EEXIST;
+ const struct rtnl_msg_handler *handler;
+ int i;
- /* The check for alloc/setup is here because if ops
- * does not have that filled up, it is not possible
- * to use the ops for creating device. So do not
- * fill up dellink as well. That disables rtnl_dellink.
- */
- if ((ops->alloc || ops->setup) && !ops->dellink)
- ops->dellink = unregister_netdevice_queue;
+ for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--)
+ rtnl_unregister(handler->protocol, handler->msgtype);
+}
+EXPORT_SYMBOL_GPL(__rtnl_unregister_many);
- list_add_tail(&ops->list, &link_ops);
- return 0;
+static DEFINE_MUTEX(link_ops_mutex);
+static LIST_HEAD(link_ops);
+
+static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index)
+{
+ struct rtnl_link_ops *ops;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(ops, &link_ops, list) {
+ if (!strcmp(ops->kind, kind)) {
+ *srcu_index = srcu_read_lock(&ops->srcu);
+ goto unlock;
+ }
+ }
+
+ ops = NULL;
+unlock:
+ rcu_read_unlock();
+
+ return ops;
+}
+
+static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index)
+{
+ srcu_read_unlock(&ops->srcu, srcu_index);
}
-EXPORT_SYMBOL_GPL(__rtnl_link_register);
/**
* rtnl_link_register - Register rtnl_link_ops with rtnetlink.
@@ -433,6 +590,7 @@ EXPORT_SYMBOL_GPL(__rtnl_link_register);
*/
int rtnl_link_register(struct rtnl_link_ops *ops)
{
+ struct rtnl_link_ops *tmp;
int err;
/* Sanity-check max sizes to avoid stack buffer overflow. */
@@ -440,9 +598,31 @@ int rtnl_link_register(struct rtnl_link_ops *ops)
ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
return -EINVAL;
- rtnl_lock();
- err = __rtnl_link_register(ops);
- rtnl_unlock();
+ /* The check for alloc/setup is here because if ops
+ * does not have that filled up, it is not possible
+ * to use the ops for creating device. So do not
+ * fill up dellink as well. That disables rtnl_dellink.
+ */
+ if ((ops->alloc || ops->setup) && !ops->dellink)
+ ops->dellink = unregister_netdevice_queue;
+
+ err = init_srcu_struct(&ops->srcu);
+ if (err)
+ return err;
+
+ mutex_lock(&link_ops_mutex);
+
+ list_for_each_entry(tmp, &link_ops, list) {
+ if (!strcmp(ops->kind, tmp->kind)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+ }
+
+ list_add_tail_rcu(&ops->list, &link_ops);
+unlock:
+ mutex_unlock(&link_ops_mutex);
+
return err;
}
EXPORT_SYMBOL_GPL(rtnl_link_register);
@@ -459,48 +639,20 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
unregister_netdevice_many(&list_kill);
}
-/**
- * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
- * @ops: struct rtnl_link_ops * to unregister
- *
- * The caller must hold the rtnl_mutex and guarantee net_namespace_list
- * integrity (hold pernet_ops_rwsem for writing to close the race
- * with setup_net() and cleanup_net()).
- */
-void __rtnl_link_unregister(struct rtnl_link_ops *ops)
-{
- struct net *net;
-
- for_each_net(net) {
- __rtnl_kill_links(net, ops);
- }
- list_del(&ops->list);
-}
-EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
-
/* Return with the rtnl_lock held when there are no network
* devices unregistering in any network namespace.
*/
static void rtnl_lock_unregistering_all(void)
{
- struct net *net;
- bool unregistering;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
add_wait_queue(&netdev_unregistering_wq, &wait);
for (;;) {
- unregistering = false;
rtnl_lock();
/* We held write locked pernet_ops_rwsem, and parallel
* setup_net() and cleanup_net() are not possible.
*/
- for_each_net(net) {
- if (atomic_read(&net->dev_unreg_count) > 0) {
- unregistering = true;
- break;
- }
- }
- if (!unregistering)
+ if (!atomic_read(&dev_unreg_count))
break;
__rtnl_unlock();
@@ -515,10 +667,22 @@ static void rtnl_lock_unregistering_all(void)
*/
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
+ struct net *net;
+
+ mutex_lock(&link_ops_mutex);
+ list_del_rcu(&ops->list);
+ mutex_unlock(&link_ops_mutex);
+
+ synchronize_srcu(&ops->srcu);
+ cleanup_srcu_struct(&ops->srcu);
+
/* Close the race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
rtnl_lock_unregistering_all();
- __rtnl_link_unregister(ops);
+
+ for_each_net(net)
+ __rtnl_kill_links(net, ops);
+
rtnl_unlock();
up_write(&pernet_ops_rwsem);
}
@@ -575,31 +739,51 @@ static size_t rtnl_link_get_size(const struct net_device *dev)
static LIST_HEAD(rtnl_af_ops);
-static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
+static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index)
{
- const struct rtnl_af_ops *ops;
+ struct rtnl_af_ops *ops;
ASSERT_RTNL();
- list_for_each_entry(ops, &rtnl_af_ops, list) {
- if (ops->family == family)
- return ops;
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(ops, &rtnl_af_ops, list) {
+ if (ops->family == family) {
+ *srcu_index = srcu_read_lock(&ops->srcu);
+ goto unlock;
+ }
}
- return NULL;
+ ops = NULL;
+unlock:
+ rcu_read_unlock();
+
+ return ops;
+}
+
+static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index)
+{
+ srcu_read_unlock(&ops->srcu, srcu_index);
}
/**
* rtnl_af_register - Register rtnl_af_ops with rtnetlink.
* @ops: struct rtnl_af_ops * to register
*
- * Returns 0 on success or a negative error code.
+ * Return: 0 on success or a negative error code.
*/
-void rtnl_af_register(struct rtnl_af_ops *ops)
+int rtnl_af_register(struct rtnl_af_ops *ops)
{
+ int err = init_srcu_struct(&ops->srcu);
+
+ if (err)
+ return err;
+
rtnl_lock();
list_add_tail_rcu(&ops->list, &rtnl_af_ops);
rtnl_unlock();
+
+ return 0;
}
EXPORT_SYMBOL_GPL(rtnl_af_register);
@@ -614,6 +798,8 @@ void rtnl_af_unregister(struct rtnl_af_ops *ops)
rtnl_unlock();
synchronize_rcu();
+ synchronize_srcu(&ops->srcu);
+ cleanup_srcu_struct(&ops->srcu);
}
EXPORT_SYMBOL_GPL(rtnl_af_unregister);
@@ -851,9 +1037,22 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
}
EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
+void netdev_set_operstate(struct net_device *dev, int newstate)
+{
+ unsigned int old = READ_ONCE(dev->operstate);
+
+ do {
+ if (old == newstate)
+ return;
+ } while (!try_cmpxchg(&dev->operstate, &old, newstate));
+
+ netdev_state_change(dev);
+}
+EXPORT_SYMBOL(netdev_set_operstate);
+
static void set_operstate(struct net_device *dev, unsigned char transition)
{
- unsigned char operstate = dev->operstate;
+ unsigned char operstate = READ_ONCE(dev->operstate);
switch (transition) {
case IF_OPER_UP:
@@ -875,12 +1074,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition)
break;
}
- if (dev->operstate != operstate) {
- write_lock(&dev_base_lock);
- dev->operstate = operstate;
- write_unlock(&dev_base_lock);
- netdev_state_change(dev);
- }
+ netdev_set_operstate(dev, operstate);
}
static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
@@ -1037,8 +1231,8 @@ static size_t rtnl_proto_down_size(const struct net_device *dev)
{
size_t size = nla_total_size(1);
- if (dev->proto_down_reason)
- size += nla_total_size(0) + nla_total_size(4);
+ /* Assume dev->proto_down_reason is not zero. */
+ size += nla_total_size(0) + nla_total_size(4);
return size;
}
@@ -1119,6 +1313,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
+ rtnl_devlink_port_size(dev)
+ rtnl_dpll_pin_size(dev)
+ + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */
+ 0;
}
@@ -1456,17 +1651,18 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb,
return 0;
}
-static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
+static int rtnl_fill_link_ifmap(struct sk_buff *skb,
+ const struct net_device *dev)
{
struct rtnl_link_ifmap map;
memset(&map, 0, sizeof(map));
- map.mem_start = dev->mem_start;
- map.mem_end = dev->mem_end;
- map.base_addr = dev->base_addr;
- map.irq = dev->irq;
- map.dma = dev->dma;
- map.port = dev->if_port;
+ map.mem_start = READ_ONCE(dev->mem_start);
+ map.mem_end = READ_ONCE(dev->mem_end);
+ map.base_addr = READ_ONCE(dev->base_addr);
+ map.irq = READ_ONCE(dev->irq);
+ map.dma = READ_ONCE(dev->dma);
+ map.port = READ_ONCE(dev->if_port);
if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD))
return -EMSGSIZE;
@@ -1477,13 +1673,15 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
static u32 rtnl_xdp_prog_skb(struct net_device *dev)
{
const struct bpf_prog *generic_xdp_prog;
+ u32 res = 0;
- ASSERT_RTNL();
+ rcu_read_lock();
+ generic_xdp_prog = rcu_dereference(dev->xdp_prog);
+ if (generic_xdp_prog)
+ res = generic_xdp_prog->aux->id;
+ rcu_read_unlock();
- generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
- if (!generic_xdp_prog)
- return 0;
- return generic_xdp_prog->aux->id;
+ return res;
}
static u32 rtnl_xdp_prog_drv(struct net_device *dev)
@@ -1603,7 +1801,8 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev)
upper_dev = netdev_master_upper_dev_get_rcu(dev);
if (upper_dev)
- ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex);
+ ret = nla_put_u32(skb, IFLA_MASTER,
+ READ_ONCE(upper_dev->ifindex));
rcu_read_unlock();
return ret;
@@ -1612,10 +1811,10 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev)
static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev,
bool force)
{
- int ifindex = dev_get_iflink(dev);
+ int iflink = dev_get_iflink(dev);
- if (force || dev->ifindex != ifindex)
- return nla_put_u32(skb, IFLA_LINK, ifindex);
+ if (force || READ_ONCE(dev->ifindex) != iflink)
+ return nla_put_u32(skb, IFLA_LINK, iflink);
return 0;
}
@@ -1699,7 +1898,7 @@ static int rtnl_fill_alt_ifnames(struct sk_buff *skb,
struct netdev_name_node *name_node;
int count = 0;
- list_for_each_entry(name_node, &dev->name_node->list, list) {
+ list_for_each_entry_rcu(name_node, &dev->name_node->list, list) {
if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name))
return -EMSGSIZE;
count++;
@@ -1707,6 +1906,7 @@ static int rtnl_fill_alt_ifnames(struct sk_buff *skb,
return count;
}
+/* RCU protected. */
static int rtnl_fill_prop_list(struct sk_buff *skb,
const struct net_device *dev)
{
@@ -1735,10 +1935,10 @@ static int rtnl_fill_proto_down(struct sk_buff *skb,
struct nlattr *pr;
u32 preason;
- if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
+ if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down)))
goto nla_put_failure;
- preason = dev->proto_down_reason;
+ preason = READ_ONCE(dev->proto_down_reason);
if (!preason)
return 0;
@@ -1811,6 +2011,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
u32 event, int *new_nsid, int new_ifindex,
int tgt_netnsid, gfp_t gfp)
{
+ char devname[IFNAMSIZ];
struct ifinfomsg *ifm;
struct nlmsghdr *nlh;
struct Qdisc *qdisc;
@@ -1823,41 +2024,53 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
ifm = nlmsg_data(nlh);
ifm->ifi_family = AF_UNSPEC;
ifm->__ifi_pad = 0;
- ifm->ifi_type = dev->type;
- ifm->ifi_index = dev->ifindex;
+ ifm->ifi_type = READ_ONCE(dev->type);
+ ifm->ifi_index = READ_ONCE(dev->ifindex);
ifm->ifi_flags = dev_get_flags(dev);
ifm->ifi_change = change;
if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))
goto nla_put_failure;
- qdisc = rtnl_dereference(dev->qdisc);
- if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
- nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) ||
+ netdev_copy_name(dev, devname);
+ if (nla_put_string(skb, IFLA_IFNAME, devname))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) ||
nla_put_u8(skb, IFLA_OPERSTATE,
- netif_running(dev) ? dev->operstate : IF_OPER_DOWN) ||
- nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) ||
- nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
- nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) ||
- nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) ||
- nla_put_u32(skb, IFLA_GROUP, dev->group) ||
- nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
- nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) ||
- nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
- nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) ||
- nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
- nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) ||
- nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) ||
- nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) ||
- nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) ||
- nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) ||
+ netif_running(dev) ? READ_ONCE(dev->operstate) :
+ IF_OPER_DOWN) ||
+ nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) ||
+ nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) ||
+ nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) ||
+ nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) ||
+ nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) ||
+ nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) ||
+ nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) ||
+ nla_put_u32(skb, IFLA_NUM_TX_QUEUES,
+ READ_ONCE(dev->num_tx_queues)) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SEGS,
+ READ_ONCE(dev->gso_max_segs)) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SIZE,
+ READ_ONCE(dev->gso_max_size)) ||
+ nla_put_u32(skb, IFLA_GRO_MAX_SIZE,
+ READ_ONCE(dev->gro_max_size)) ||
+ nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE,
+ READ_ONCE(dev->gso_ipv4_max_size)) ||
+ nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE,
+ READ_ONCE(dev->gro_ipv4_max_size)) ||
+ nla_put_u32(skb, IFLA_TSO_MAX_SIZE,
+ READ_ONCE(dev->tso_max_size)) ||
+ nla_put_u32(skb, IFLA_TSO_MAX_SEGS,
+ READ_ONCE(dev->tso_max_segs)) ||
+ nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON,
+ READ_ONCE(dev->max_pacing_offload_horizon)) ||
#ifdef CONFIG_RPS
- nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
+ nla_put_u32(skb, IFLA_NUM_RX_QUEUES,
+ READ_ONCE(dev->num_rx_queues)) ||
#endif
put_master_ifindex(skb, dev) ||
nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
- (qdisc &&
- nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) ||
nla_put_ifalias(skb, dev) ||
nla_put_u32(skb, IFLA_CARRIER_CHANGES,
atomic_read(&dev->carrier_up_count) +
@@ -1876,9 +2089,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure;
}
- if (rtnl_fill_link_ifmap(skb, dev))
- goto nla_put_failure;
-
if (dev->addr_len) {
if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) ||
nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast))
@@ -1911,9 +2121,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure;
}
- if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp))
- goto nla_put_failure;
-
if (new_nsid &&
nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)
goto nla_put_failure;
@@ -1926,12 +2133,18 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure;
rcu_read_lock();
+ if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC))
+ goto nla_put_failure_rcu;
+ qdisc = rcu_dereference(dev->qdisc);
+ if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id))
+ goto nla_put_failure_rcu;
if (rtnl_fill_link_af(skb, dev, ext_filter_mask))
goto nla_put_failure_rcu;
- rcu_read_unlock();
-
+ if (rtnl_fill_link_ifmap(skb, dev))
+ goto nla_put_failure_rcu;
if (rtnl_fill_prop_list(skb, dev))
- goto nla_put_failure;
+ goto nla_put_failure_rcu;
+ rcu_read_unlock();
if (dev->dev.parent &&
nla_put_string(skb, IFLA_PARENT_DEV_NAME,
@@ -1960,6 +2173,7 @@ nla_put_failure:
}
static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+ [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN },
[IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
[IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
@@ -1988,7 +2202,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
[IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
[IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 },
- [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_GSO_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
[IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
[IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
@@ -2013,7 +2227,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT },
[IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT },
[IFLA_ALLMULTI] = { .type = NLA_REJECT },
- [IFLA_GSO_IPV4_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
[IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 },
};
@@ -2068,10 +2282,11 @@ static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
[IFLA_XDP_PROG_ID] = { .type = NLA_U32 },
};
-static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
+static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla,
+ int *ops_srcu_index)
{
- const struct rtnl_link_ops *ops = NULL;
struct nlattr *linfo[IFLA_INFO_MAX + 1];
+ struct rtnl_link_ops *ops = NULL;
if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0)
return NULL;
@@ -2080,7 +2295,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla
char kind[MODULE_NAME_LEN];
nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
- ops = rtnl_link_ops_get(kind);
+ ops = rtnl_link_ops_get(kind, ops_srcu_index);
}
return ops;
@@ -2201,24 +2416,22 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
struct netlink_ext_ack *extack = cb->extack;
+ struct rtnl_link_ops *kind_ops = NULL;
const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
- struct net *tgt_net = net;
- int h, s_h;
- int idx = 0, s_idx;
- struct net_device *dev;
- struct hlist_head *head;
+ unsigned int flags = NLM_F_MULTI;
struct nlattr *tb[IFLA_MAX+1];
+ struct {
+ unsigned long ifindex;
+ } *ctx = (void *)cb->ctx;
+ struct net *tgt_net = net;
u32 ext_filter_mask = 0;
- const struct rtnl_link_ops *kind_ops = NULL;
- unsigned int flags = NLM_F_MULTI;
+ struct net_device *dev;
+ int ops_srcu_index;
int master_idx = 0;
int netnsid = -1;
int err, i;
- s_h = cb->args[0];
- s_idx = cb->args[1];
-
err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
if (err < 0) {
if (cb->strict_check)
@@ -2238,7 +2451,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
if (IS_ERR(tgt_net)) {
NL_SET_ERR_MSG(extack, "Invalid target network namespace id");
- return PTR_ERR(tgt_net);
+ err = PTR_ERR(tgt_net);
+ netnsid = -1;
+ goto out;
}
break;
case IFLA_EXT_MASK:
@@ -2248,12 +2463,13 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
master_idx = nla_get_u32(tb[i]);
break;
case IFLA_LINKINFO:
- kind_ops = linkinfo_to_kind_ops(tb[i]);
+ kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index);
break;
default:
if (cb->strict_check) {
NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request");
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
}
}
@@ -2262,38 +2478,27 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
flags |= NLM_F_DUMP_FILTERED;
walk_entries:
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &tgt_net->dev_index_head[h];
- hlist_for_each_entry(dev, head, index_hlist) {
- if (link_dump_filtered(dev, master_idx, kind_ops))
- goto cont;
- if (idx < s_idx)
- goto cont;
- err = rtnl_fill_ifinfo(skb, dev, net,
- RTM_NEWLINK,
- NETLINK_CB(cb->skb).portid,
- nlh->nlmsg_seq, 0, flags,
- ext_filter_mask, 0, NULL, 0,
- netnsid, GFP_KERNEL);
-
- if (err < 0) {
- if (likely(skb->len))
- goto out;
-
- goto out_err;
- }
-cont:
- idx++;
- }
+ err = 0;
+ for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
+ if (link_dump_filtered(dev, master_idx, kind_ops))
+ continue;
+ err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq, 0, flags,
+ ext_filter_mask, 0, NULL, 0,
+ netnsid, GFP_KERNEL);
+ if (err < 0)
+ break;
}
-out:
- err = skb->len;
-out_err:
- cb->args[1] = idx;
- cb->args[0] = h;
+
+
cb->seq = tgt_net->dev_base_seq;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+
+out:
+
+ if (kind_ops)
+ rtnl_link_ops_put(kind_ops, ops_srcu_index);
if (netnsid >= 0)
put_net(tgt_net);
@@ -2322,9 +2527,10 @@ int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer,
}
EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg);
-struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[])
{
- struct net *net;
+ struct net *net = NULL;
+
/* Examine the link attributes and figure out which
* network namespace we are talking about.
*/
@@ -2332,8 +2538,17 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
else if (tb[IFLA_NET_NS_FD])
net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
- else
+
+ return net;
+}
+
+struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+{
+ struct net *net = rtnl_link_get_net_ifla(tb);
+
+ if (!net)
net = get_net(src_net);
+
return net;
}
EXPORT_SYMBOL(rtnl_link_get_net);
@@ -2473,20 +2688,24 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
int rem, err;
nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
- const struct rtnl_af_ops *af_ops;
+ struct rtnl_af_ops *af_ops;
+ int af_ops_srcu_index;
- af_ops = rtnl_af_lookup(nla_type(af));
+ af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index);
if (!af_ops)
return -EAFNOSUPPORT;
if (!af_ops->set_link_af)
- return -EOPNOTSUPP;
-
- if (af_ops->validate_link_af) {
+ err = -EOPNOTSUPP;
+ else if (af_ops->validate_link_af)
err = af_ops->validate_link_af(dev, af, extack);
- if (err < 0)
- return err;
- }
+ else
+ err = 0;
+
+ rtnl_af_put(af_ops, af_ops_srcu_index);
+
+ if (err < 0)
+ return err;
}
}
@@ -2552,7 +2771,7 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
- nla_len(attr) < NLA_HDRLEN) {
+ nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) {
return -EINVAL;
}
if (len >= MAX_VLAN_LIST_LEN)
@@ -2730,7 +2949,7 @@ static int do_set_proto_down(struct net_device *dev,
bool proto_down;
int err;
- if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) {
+ if (!dev->change_proto_down) {
NL_SET_ERR_MSG(extack, "Protodown not supported by device");
return -EOPNOTSUPP;
}
@@ -2777,8 +2996,8 @@ static int do_set_proto_down(struct net_device *dev,
#define DO_SETLINK_MODIFIED 0x01
/* notify flag means notify + modified. */
#define DO_SETLINK_NOTIFY 0x03
-static int do_setlink(const struct sk_buff *skb,
- struct net_device *dev, struct ifinfomsg *ifm,
+static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
+ struct net *tgt_net, struct ifinfomsg *ifm,
struct netlink_ext_ack *extack,
struct nlattr **tb, int status)
{
@@ -2786,32 +3005,25 @@ static int do_setlink(const struct sk_buff *skb,
char ifname[IFNAMSIZ];
int err;
+ err = validate_linkmsg(dev, tb, extack);
+ if (err < 0)
+ goto errout;
+
if (tb[IFLA_IFNAME])
nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
else
ifname[0] = '\0';
- if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) {
+ if (!net_eq(tgt_net, dev_net(dev))) {
const char *pat = ifname[0] ? ifname : NULL;
- struct net *net;
int new_ifindex;
- net = rtnl_link_get_net_capable(skb, dev_net(dev),
- tb, CAP_NET_ADMIN);
- if (IS_ERR(net)) {
- err = PTR_ERR(net);
- goto errout;
- }
-
- if (tb[IFLA_NEW_IFINDEX])
- new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]);
- else
- new_ifindex = 0;
+ new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0);
- err = __dev_change_net_namespace(dev, net, pat, new_ifindex);
- put_net(net);
+ err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex);
if (err)
goto errout;
+
status |= DO_SETLINK_MODIFIED;
}
@@ -2983,11 +3195,9 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_LINKMODE]) {
unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);
- write_lock(&dev_base_lock);
if (dev->link_mode ^ value)
status |= DO_SETLINK_NOTIFY;
- dev->link_mode = value;
- write_unlock(&dev_base_lock);
+ WRITE_ONCE(dev->link_mode, value);
}
if (tb[IFLA_VFINFO_LIST]) {
@@ -3072,11 +3282,18 @@ static int do_setlink(const struct sk_buff *skb,
int rem;
nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
- const struct rtnl_af_ops *af_ops;
+ struct rtnl_af_ops *af_ops;
+ int af_ops_srcu_index;
- BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af))));
+ af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index);
+ if (!af_ops) {
+ err = -EAFNOSUPPORT;
+ goto errout;
+ }
err = af_ops->set_link_af(dev, af, extack);
+ rtnl_af_put(af_ops, af_ops_srcu_index);
+
if (err < 0)
goto errout;
@@ -3173,11 +3390,13 @@ static struct net_device *rtnl_dev_get(struct net *net,
static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ struct ifinfomsg *ifm = nlmsg_data(nlh);
struct net *net = sock_net(skb->sk);
- struct ifinfomsg *ifm;
- struct net_device *dev;
- int err;
struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *dev = NULL;
+ struct rtnl_nets rtnl_nets;
+ struct net *tgt_net;
+ int err;
err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
ifla_policy, extack);
@@ -3188,25 +3407,32 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout;
- err = -EINVAL;
- ifm = nlmsg_data(nlh);
+ tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
+ if (IS_ERR(tgt_net)) {
+ err = PTR_ERR(tgt_net);
+ goto errout;
+ }
+
+ rtnl_nets_init(&rtnl_nets);
+ rtnl_nets_add(&rtnl_nets, get_net(net));
+ rtnl_nets_add(&rtnl_nets, tgt_net);
+
+ rtnl_nets_lock(&rtnl_nets);
+
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(net, ifm->ifi_index);
else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
dev = rtnl_dev_get(net, tb);
else
- goto errout;
+ err = -EINVAL;
- if (dev == NULL) {
+ if (dev)
+ err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0);
+ else if (!err)
err = -ENODEV;
- goto errout;
- }
- err = validate_linkmsg(dev, tb, extack);
- if (err < 0)
- goto errout;
-
- err = do_setlink(skb, dev, ifm, extack, tb, 0);
+ rtnl_nets_unlock(&rtnl_nets);
+ rtnl_nets_destroy(&rtnl_nets);
errout:
return err;
}
@@ -3266,14 +3492,14 @@ EXPORT_SYMBOL_GPL(rtnl_delete_link);
static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ struct ifinfomsg *ifm = nlmsg_data(nlh);
struct net *net = sock_net(skb->sk);
u32 portid = NETLINK_CB(skb).portid;
- struct net *tgt_net = net;
- struct net_device *dev = NULL;
- struct ifinfomsg *ifm;
struct nlattr *tb[IFLA_MAX+1];
- int err;
+ struct net_device *dev = NULL;
+ struct net *tgt_net = net;
int netnsid = -1;
+ int err;
err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
ifla_policy, extack);
@@ -3291,27 +3517,24 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
return PTR_ERR(tgt_net);
}
- err = -EINVAL;
- ifm = nlmsg_data(nlh);
+ rtnl_net_lock(tgt_net);
+
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
- dev = rtnl_dev_get(net, tb);
+ dev = rtnl_dev_get(tgt_net, tb);
+
+ if (dev)
+ err = rtnl_delete_link(dev, portid, nlh);
+ else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ err = -ENODEV;
else if (tb[IFLA_GROUP])
err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));
else
- goto out;
-
- if (!dev) {
- if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME] || ifm->ifi_index > 0)
- err = -ENODEV;
-
- goto out;
- }
+ err = -EINVAL;
- err = rtnl_delete_link(dev, portid, nlh);
+ rtnl_net_unlock(tgt_net);
-out:
if (netnsid >= 0)
put_net(tgt_net);
@@ -3438,21 +3661,90 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
}
EXPORT_SYMBOL(rtnl_create_link);
+struct rtnl_newlink_tbs {
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct nlattr *linkinfo[IFLA_INFO_MAX + 1];
+ struct nlattr *attr[RTNL_MAX_TYPE + 1];
+ struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
+};
+
+static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh,
+ const struct rtnl_link_ops *ops,
+ struct net_device *dev, struct net *tgt_net,
+ struct rtnl_newlink_tbs *tbs,
+ struct nlattr **data,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr ** const linkinfo = tbs->linkinfo;
+ struct nlattr ** const tb = tbs->tb;
+ int status = 0;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ if (linkinfo[IFLA_INFO_DATA]) {
+ if (!ops || ops != dev->rtnl_link_ops || !ops->changelink)
+ return -EOPNOTSUPP;
+
+ err = ops->changelink(dev, tb, data, extack);
+ if (err < 0)
+ return err;
+
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ const struct rtnl_link_ops *m_ops = NULL;
+ struct nlattr **slave_data = NULL;
+ struct net_device *master_dev;
+
+ master_dev = netdev_master_upper_dev_get(dev);
+ if (master_dev)
+ m_ops = master_dev->rtnl_link_ops;
+
+ if (!m_ops || !m_ops->slave_changelink)
+ return -EOPNOTSUPP;
+
+ if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
+ return -EINVAL;
+
+ if (m_ops->slave_maxtype) {
+ err = nla_parse_nested_deprecated(tbs->slave_attr,
+ m_ops->slave_maxtype,
+ linkinfo[IFLA_INFO_SLAVE_DATA],
+ m_ops->slave_policy, extack);
+ if (err < 0)
+ return err;
+
+ slave_data = tbs->slave_attr;
+ }
+
+ err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack);
+ if (err < 0)
+ return err;
+
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status);
+}
+
static int rtnl_group_changelink(const struct sk_buff *skb,
- struct net *net, int group,
- struct ifinfomsg *ifm,
- struct netlink_ext_ack *extack,
- struct nlattr **tb)
+ struct net *net, struct net *tgt_net,
+ int group, struct ifinfomsg *ifm,
+ struct netlink_ext_ack *extack,
+ struct nlattr **tb)
{
struct net_device *dev, *aux;
int err;
for_each_netdev_safe(net, dev, aux) {
if (dev->group == group) {
- err = validate_linkmsg(dev, tb, extack);
- if (err < 0)
- return err;
- err = do_setlink(skb, dev, ifm, extack, tb, 0);
+ err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0);
if (err < 0)
return err;
}
@@ -3463,6 +3755,8 @@ static int rtnl_group_changelink(const struct sk_buff *skb,
static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
const struct rtnl_link_ops *ops,
+ struct net *tgt_net, struct net *link_net,
+ struct net *peer_net,
const struct nlmsghdr *nlh,
struct nlattr **tb, struct nlattr **data,
struct netlink_ext_ack *extack)
@@ -3470,7 +3764,6 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
unsigned char name_assign_type = NET_NAME_USER;
struct net *net = sock_net(skb->sk);
u32 portid = NETLINK_CB(skb).portid;
- struct net *dest_net, *link_net;
struct net_device *dev;
char ifname[IFNAMSIZ];
int err;
@@ -3485,27 +3778,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
name_assign_type = NET_NAME_ENUM;
}
- dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
- if (IS_ERR(dest_net))
- return PTR_ERR(dest_net);
-
- if (tb[IFLA_LINK_NETNSID]) {
- int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
-
- link_net = get_net_ns_by_id(dest_net, id);
- if (!link_net) {
- NL_SET_ERR_MSG(extack, "Unknown network namespace id");
- err = -EINVAL;
- goto out;
- }
- err = -EPERM;
- if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN))
- goto out;
- } else {
- link_net = NULL;
- }
-
- dev = rtnl_create_link(link_net ? : dest_net, ifname,
+ dev = rtnl_create_link(link_net ? : tgt_net, ifname,
name_assign_type, ops, tb, extack);
if (IS_ERR(dev)) {
err = PTR_ERR(dev);
@@ -3514,8 +3787,13 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
dev->ifindex = ifm->ifi_index;
+ if (link_net)
+ net = link_net;
+ if (peer_net)
+ net = peer_net;
+
if (ops->newlink)
- err = ops->newlink(link_net ? : net, dev, tb, data, extack);
+ err = ops->newlink(net, dev, tb, data, extack);
else
err = register_netdevice(dev);
if (err < 0) {
@@ -3527,7 +3805,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
if (err < 0)
goto out_unregister;
if (link_net) {
- err = dev_change_net_namespace(dev, dest_net, ifname);
+ err = dev_change_net_namespace(dev, tgt_net, ifname);
if (err < 0)
goto out_unregister;
}
@@ -3537,9 +3815,6 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
goto out_unregister;
}
out:
- if (link_net)
- put_net(link_net);
- put_net(dest_net);
return err;
out_unregister:
if (ops->newlink) {
@@ -3553,41 +3828,43 @@ out_unregister:
goto out;
}
-struct rtnl_newlink_tbs {
+static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops,
+ struct nlattr *tbp[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
struct nlattr *tb[IFLA_MAX + 1];
- struct nlattr *attr[RTNL_MAX_TYPE + 1];
- struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
-};
+ int err;
+
+ if (!data || !data[ops->peer_type])
+ return rtnl_link_get_net_ifla(tbp);
+
+ err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (ops->validate) {
+ err = ops->validate(tb, NULL, extack);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ return rtnl_link_get_net_ifla(tb);
+}
static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ const struct rtnl_link_ops *ops,
+ struct net *tgt_net, struct net *link_net,
+ struct net *peer_net,
struct rtnl_newlink_tbs *tbs,
+ struct nlattr **data,
struct netlink_ext_ack *extack)
{
- struct nlattr *linkinfo[IFLA_INFO_MAX + 1];
struct nlattr ** const tb = tbs->tb;
- const struct rtnl_link_ops *m_ops;
- struct net_device *master_dev;
struct net *net = sock_net(skb->sk);
- const struct rtnl_link_ops *ops;
- struct nlattr **slave_data;
- char kind[MODULE_NAME_LEN];
struct net_device *dev;
struct ifinfomsg *ifm;
- struct nlattr **data;
bool link_specified;
- int err;
-
-#ifdef CONFIG_MODULES
-replay:
-#endif
- err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
- ifla_policy, extack);
- if (err < 0)
- return err;
-
- err = rtnl_ensure_unique_netns(tb, extack, false);
- if (err < 0)
- return err;
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0) {
@@ -3604,151 +3881,153 @@ replay:
dev = NULL;
}
- master_dev = NULL;
- m_ops = NULL;
- if (dev) {
- master_dev = netdev_master_upper_dev_get(dev);
- if (master_dev)
- m_ops = master_dev->rtnl_link_ops;
+ if (dev)
+ return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack);
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ /* No dev found and NLM_F_CREATE not set. Requested dev does not exist,
+ * or it's for a group
+ */
+ if (link_specified || !tb[IFLA_GROUP])
+ return -ENODEV;
+
+ return rtnl_group_changelink(skb, net, tgt_net,
+ nla_get_u32(tb[IFLA_GROUP]),
+ ifm, extack, tb);
+ }
+
+ if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
+ return -EOPNOTSUPP;
+
+ if (!ops) {
+ NL_SET_ERR_MSG(extack, "Unknown device type");
+ return -EOPNOTSUPP;
}
+ return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh,
+ tb, data, extack);
+}
+
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *tgt_net, *link_net = NULL, *peer_net = NULL;
+ struct nlattr **tb, **linkinfo, **data = NULL;
+ struct rtnl_link_ops *ops = NULL;
+ struct rtnl_newlink_tbs *tbs;
+ struct rtnl_nets rtnl_nets;
+ int ops_srcu_index;
+ int ret;
+
+ tbs = kmalloc(sizeof(*tbs), GFP_KERNEL);
+ if (!tbs)
+ return -ENOMEM;
+
+ tb = tbs->tb;
+ ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb,
+ IFLA_MAX, ifla_policy, extack);
+ if (ret < 0)
+ goto free;
+
+ ret = rtnl_ensure_unique_netns(tb, extack, false);
+ if (ret < 0)
+ goto free;
+
+ linkinfo = tbs->linkinfo;
if (tb[IFLA_LINKINFO]) {
- err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX,
+ ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX,
tb[IFLA_LINKINFO],
ifla_info_policy, NULL);
- if (err < 0)
- return err;
- } else
- memset(linkinfo, 0, sizeof(linkinfo));
+ if (ret < 0)
+ goto free;
+ } else {
+ memset(linkinfo, 0, sizeof(tbs->linkinfo));
+ }
if (linkinfo[IFLA_INFO_KIND]) {
+ char kind[MODULE_NAME_LEN];
+
nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
- ops = rtnl_link_ops_get(kind);
- } else {
- kind[0] = '\0';
- ops = NULL;
+ ops = rtnl_link_ops_get(kind, &ops_srcu_index);
+#ifdef CONFIG_MODULES
+ if (!ops) {
+ request_module("rtnl-link-%s", kind);
+ ops = rtnl_link_ops_get(kind, &ops_srcu_index);
+ }
+#endif
}
- data = NULL;
+ rtnl_nets_init(&rtnl_nets);
+
if (ops) {
- if (ops->maxtype > RTNL_MAX_TYPE)
- return -EINVAL;
+ if (ops->maxtype > RTNL_MAX_TYPE) {
+ ret = -EINVAL;
+ goto put_ops;
+ }
if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
- err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype,
+ ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype,
linkinfo[IFLA_INFO_DATA],
ops->policy, extack);
- if (err < 0)
- return err;
+ if (ret < 0)
+ goto put_ops;
+
data = tbs->attr;
}
+
if (ops->validate) {
- err = ops->validate(tb, data, extack);
- if (err < 0)
- return err;
+ ret = ops->validate(tb, data, extack);
+ if (ret < 0)
+ goto put_ops;
}
- }
- slave_data = NULL;
- if (m_ops) {
- if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
- return -EINVAL;
-
- if (m_ops->slave_maxtype &&
- linkinfo[IFLA_INFO_SLAVE_DATA]) {
- err = nla_parse_nested_deprecated(tbs->slave_attr,
- m_ops->slave_maxtype,
- linkinfo[IFLA_INFO_SLAVE_DATA],
- m_ops->slave_policy,
- extack);
- if (err < 0)
- return err;
- slave_data = tbs->slave_attr;
+ if (ops->peer_type) {
+ peer_net = rtnl_get_peer_net(ops, tb, data, extack);
+ if (IS_ERR(peer_net)) {
+ ret = PTR_ERR(peer_net);
+ goto put_ops;
+ }
+ if (peer_net)
+ rtnl_nets_add(&rtnl_nets, peer_net);
}
}
- if (dev) {
- int status = 0;
-
- if (nlh->nlmsg_flags & NLM_F_EXCL)
- return -EEXIST;
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
- return -EOPNOTSUPP;
-
- err = validate_linkmsg(dev, tb, extack);
- if (err < 0)
- return err;
-
- if (linkinfo[IFLA_INFO_DATA]) {
- if (!ops || ops != dev->rtnl_link_ops ||
- !ops->changelink)
- return -EOPNOTSUPP;
+ tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN);
+ if (IS_ERR(tgt_net)) {
+ ret = PTR_ERR(tgt_net);
+ goto put_net;
+ }
- err = ops->changelink(dev, tb, data, extack);
- if (err < 0)
- return err;
- status |= DO_SETLINK_NOTIFY;
- }
+ rtnl_nets_add(&rtnl_nets, tgt_net);
- if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
- if (!m_ops || !m_ops->slave_changelink)
- return -EOPNOTSUPP;
+ if (tb[IFLA_LINK_NETNSID]) {
+ int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
- err = m_ops->slave_changelink(master_dev, dev, tb,
- slave_data, extack);
- if (err < 0)
- return err;
- status |= DO_SETLINK_NOTIFY;
+ link_net = get_net_ns_by_id(tgt_net, id);
+ if (!link_net) {
+ NL_SET_ERR_MSG(extack, "Unknown network namespace id");
+ ret = -EINVAL;
+ goto put_net;
}
- return do_setlink(skb, dev, ifm, extack, tb, status);
- }
-
- if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
- /* No dev found and NLM_F_CREATE not set. Requested dev does not exist,
- * or it's for a group
- */
- if (link_specified)
- return -ENODEV;
- if (tb[IFLA_GROUP])
- return rtnl_group_changelink(skb, net,
- nla_get_u32(tb[IFLA_GROUP]),
- ifm, extack, tb);
- return -ENODEV;
- }
-
- if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
- return -EOPNOTSUPP;
+ rtnl_nets_add(&rtnl_nets, link_net);
- if (!ops) {
-#ifdef CONFIG_MODULES
- if (kind[0]) {
- __rtnl_unlock();
- request_module("rtnl-link-%s", kind);
- rtnl_lock();
- ops = rtnl_link_ops_get(kind);
- if (ops)
- goto replay;
+ if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ goto put_net;
}
-#endif
- NL_SET_ERR_MSG(extack, "Unknown device type");
- return -EOPNOTSUPP;
}
- return rtnl_newlink_create(skb, ifm, ops, nlh, tb, data, extack);
-}
+ rtnl_nets_lock(&rtnl_nets);
+ ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack);
+ rtnl_nets_unlock(&rtnl_nets);
-static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
-{
- struct rtnl_newlink_tbs *tbs;
- int ret;
-
- tbs = kmalloc(sizeof(*tbs), GFP_KERNEL);
- if (!tbs)
- return -ENOMEM;
-
- ret = __rtnl_newlink(skb, nlh, tbs, extack);
+put_net:
+ rtnl_nets_destroy(&rtnl_nets);
+put_ops:
+ if (ops)
+ rtnl_link_ops_put(ops, ops_srcu_index);
+free:
kfree(tbs);
return ret;
}
@@ -3977,22 +4256,28 @@ static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack);
}
-static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
+static noinline_for_stack u32 rtnl_calcit(struct sk_buff *skb,
+ struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
size_t min_ifinfo_dump_size = 0;
- struct nlattr *tb[IFLA_MAX+1];
u32 ext_filter_mask = 0;
struct net_device *dev;
- int hdrlen;
+ struct nlattr *nla;
+ int hdrlen, rem;
/* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
- if (nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) {
- if (tb[IFLA_EXT_MASK])
- ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+ if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
+ return NLMSG_GOODSIZE;
+
+ nla_for_each_attr_type(nla, IFLA_EXT_MASK,
+ nlmsg_attrdata(nlh, hdrlen),
+ nlmsg_attrlen(nlh, hdrlen), rem) {
+ if (nla_len(nla) == sizeof(u32))
+ ext_filter_mask = nla_get_u32(nla);
}
if (!ext_filter_mask)
@@ -4089,8 +4374,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
}
return skb;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
return NULL;
}
@@ -4315,9 +4599,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
const struct net_device_ops *ops = br_dev->netdev_ops;
+ bool notified = false;
err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
- nlh->nlmsg_flags, extack);
+ nlh->nlmsg_flags, &notified, extack);
if (err)
goto out;
else
@@ -4326,16 +4611,18 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Embedded bridge, macvlan, and any other device support */
if ((ndm->ndm_flags & NTF_SELF)) {
+ bool notified = false;
+
if (dev->netdev_ops->ndo_fdb_add)
err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
vid,
nlh->nlmsg_flags,
- extack);
+ &notified, extack);
else
err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
nlh->nlmsg_flags);
- if (!err) {
+ if (!err && !notified) {
rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH,
ndm->ndm_state);
ndm->ndm_flags &= ~NTF_SELF;
@@ -4435,11 +4722,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ bool notified = false;
ops = br_dev->netdev_ops;
if (!del_bulk) {
if (ops->ndo_fdb_del)
- err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid,
+ &notified, extack);
} else {
if (ops->ndo_fdb_del_bulk)
err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
@@ -4453,10 +4742,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Embedded bridge, macvlan, and any other device support */
if (ndm->ndm_flags & NTF_SELF) {
+ bool notified = false;
+
ops = dev->netdev_ops;
if (!del_bulk) {
if (ops->ndo_fdb_del)
- err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid,
+ &notified, extack);
else
err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
} else {
@@ -4467,7 +4759,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
}
if (!err) {
- if (!del_bulk)
+ if (!del_bulk && !notified)
rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
ndm->ndm_state);
ndm->ndm_flags &= ~NTF_SELF;
@@ -4483,15 +4775,16 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
int *idx,
struct netdev_hw_addr_list *list)
{
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
struct netdev_hw_addr *ha;
- int err;
u32 portid, seq;
+ int err;
portid = NETLINK_CB(cb->skb).portid;
seq = cb->nlh->nlmsg_seq;
list_for_each_entry(ha, &list->list, list) {
- if (*idx < cb->args[2])
+ if (*idx < ctx->fdb_idx)
goto skip;
err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
@@ -4630,18 +4923,16 @@ static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh,
static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct net_device *dev;
- struct net_device *br_dev = NULL;
- const struct net_device_ops *ops = NULL;
- const struct net_device_ops *cops = NULL;
+ const struct net_device_ops *ops = NULL, *cops = NULL;
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
+ struct net_device *dev, *br_dev = NULL;
struct net *net = sock_net(skb->sk);
- struct hlist_head *head;
int brport_idx = 0;
int br_idx = 0;
- int h, s_h;
- int idx = 0, s_idx;
- int err = 0;
int fidx = 0;
+ int err;
+
+ NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context);
if (cb->strict_check)
err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx,
@@ -4660,70 +4951,51 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
ops = br_dev->netdev_ops;
}
- s_h = cb->args[0];
- s_idx = cb->args[1];
-
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry(dev, head, index_hlist) {
-
- if (brport_idx && (dev->ifindex != brport_idx))
- continue;
-
- if (!br_idx) { /* user did not specify a specific bridge */
- if (netif_is_bridge_port(dev)) {
- br_dev = netdev_master_upper_dev_get(dev);
- cops = br_dev->netdev_ops;
- }
- } else {
- if (dev != br_dev &&
- !netif_is_bridge_port(dev))
- continue;
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ if (brport_idx && (dev->ifindex != brport_idx))
+ continue;
- if (br_dev != netdev_master_upper_dev_get(dev) &&
- !netif_is_bridge_master(dev))
- continue;
- cops = ops;
+ if (!br_idx) { /* user did not specify a specific bridge */
+ if (netif_is_bridge_port(dev)) {
+ br_dev = netdev_master_upper_dev_get(dev);
+ cops = br_dev->netdev_ops;
}
+ } else {
+ if (dev != br_dev &&
+ !netif_is_bridge_port(dev))
+ continue;
- if (idx < s_idx)
- goto cont;
+ if (br_dev != netdev_master_upper_dev_get(dev) &&
+ !netif_is_bridge_master(dev))
+ continue;
+ cops = ops;
+ }
- if (netif_is_bridge_port(dev)) {
- if (cops && cops->ndo_fdb_dump) {
- err = cops->ndo_fdb_dump(skb, cb,
- br_dev, dev,
- &fidx);
- if (err == -EMSGSIZE)
- goto out;
- }
+ if (netif_is_bridge_port(dev)) {
+ if (cops && cops->ndo_fdb_dump) {
+ err = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
+ &fidx);
+ if (err == -EMSGSIZE)
+ break;
}
+ }
- if (dev->netdev_ops->ndo_fdb_dump)
- err = dev->netdev_ops->ndo_fdb_dump(skb, cb,
- dev, NULL,
- &fidx);
- else
- err = ndo_dflt_fdb_dump(skb, cb, dev, NULL,
- &fidx);
- if (err == -EMSGSIZE)
- goto out;
+ if (dev->netdev_ops->ndo_fdb_dump)
+ err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
+ &fidx);
+ else
+ err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx);
+ if (err == -EMSGSIZE)
+ break;
- cops = NULL;
+ cops = NULL;
- /* reset fdb offset to 0 for rest of the interfaces */
- cb->args[2] = 0;
- fidx = 0;
-cont:
- idx++;
- }
+ /* reset fdb offset to 0 for rest of the interfaces */
+ ctx->fdb_idx = 0;
+ fidx = 0;
}
-out:
- cb->args[0] = h;
- cb->args[1] = idx;
- cb->args[2] = fidx;
+ ctx->fdb_idx = fidx;
return skb->len;
}
@@ -5269,15 +5541,14 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
if (br_spec) {
- nla_for_each_nested(attr, br_spec, rem) {
- if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
- if (nla_len(attr) < sizeof(flags))
- return -EINVAL;
+ nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec,
+ rem) {
+ if (nla_len(attr) < sizeof(flags))
+ return -EINVAL;
- have_flags = true;
- flags = nla_get_u16(attr);
- break;
- }
+ have_flags = true;
+ flags = nla_get_u16(attr);
+ break;
}
}
@@ -5986,19 +6257,17 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct netlink_ext_ack *extack = cb->extack;
- int h, s_h, err, s_idx, s_idxattr, s_prividx;
struct rtnl_stats_dump_filters filters;
struct net *net = sock_net(skb->sk);
unsigned int flags = NLM_F_MULTI;
struct if_stats_msg *ifsm;
- struct hlist_head *head;
+ struct {
+ unsigned long ifindex;
+ int idxattr;
+ int prividx;
+ } *ctx = (void *)cb->ctx;
struct net_device *dev;
- int idx = 0;
-
- s_h = cb->args[0];
- s_idx = cb->args[1];
- s_idxattr = cb->args[2];
- s_prividx = cb->args[3];
+ int err;
cb->seq = net->dev_base_seq;
@@ -6017,39 +6286,26 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (err)
return err;
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry(dev, head, index_hlist) {
- if (idx < s_idx)
- goto cont;
- err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, 0,
- flags, &filters,
- &s_idxattr, &s_prividx,
- extack);
- /* If we ran out of room on the first message,
- * we're in trouble
- */
- WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ flags, &filters,
+ &ctx->idxattr, &ctx->prividx,
+ extack);
+ /* If we ran out of room on the first message,
+ * we're in trouble.
+ */
+ WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
- if (err < 0)
- goto out;
- s_prividx = 0;
- s_idxattr = 0;
- nl_dump_check_consistent(cb, nlmsg_hdr(skb));
-cont:
- idx++;
- }
+ if (err < 0)
+ break;
+ ctx->prividx = 0;
+ ctx->idxattr = 0;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
}
-out:
- cb->args[3] = s_prividx;
- cb->args[2] = s_idxattr;
- cb->args[1] = idx;
- cb->args[0] = h;
- return skb->len;
+ return err;
}
void rtnl_offload_xstats_notify(struct net_device *dev)
@@ -6188,7 +6444,7 @@ static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
int idx, s_idx;
int err;
- NL_ASSERT_DUMP_CTX_FITS(struct rtnl_mdb_dump_ctx);
+ NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx);
if (cb->strict_check) {
err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack);
@@ -6508,6 +6764,52 @@ static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Process one rtnetlink message. */
+static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED);
+ rtnl_dumpit_func dumpit = cb->data;
+ int err;
+
+ /* Previous iteration have already finished, avoid calling->dumpit()
+ * again, it may not expect to be called after it reached the end.
+ */
+ if (!dumpit)
+ return 0;
+
+ if (needs_lock)
+ rtnl_lock();
+ err = dumpit(skb, cb);
+ if (needs_lock)
+ rtnl_unlock();
+
+ /* Old dump handlers used to send NLM_DONE as in a separate recvmsg().
+ * Some applications which parse netlink manually depend on this.
+ */
+ if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) {
+ if (err < 0 && err != -EMSGSIZE)
+ return err;
+ if (!err)
+ cb->data = NULL;
+
+ return skb->len;
+ }
+ return err;
+}
+
+static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct netlink_dump_control *control)
+{
+ if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE ||
+ !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) {
+ WARN_ON(control->data);
+ control->data = control->dump;
+ control->dump = rtnl_dumpit;
+ }
+
+ return netlink_dump_start(ssk, skb, nlh, control);
+}
+
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -6552,6 +6854,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
}
owner = link->owner;
dumpit = link->dumpit;
+ flags = link->flags;
if (type == RTM_GETLINK - RTM_BASE)
min_dump_alloc = rtnl_calcit(skb, nlh);
@@ -6569,8 +6872,9 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
.dump = dumpit,
.min_dump_alloc = min_dump_alloc,
.module = owner,
+ .flags = flags,
};
- err = netlink_dump_start(rtnl, skb, nlh, &c);
+ err = rtnetlink_dump_start(rtnl, skb, nlh, &c);
/* netlink_dump_start() will keep a reference on
* module if dump is still in progress.
*/
@@ -6685,7 +6989,6 @@ static int __net_init rtnetlink_net_init(struct net *net)
struct netlink_kernel_cfg cfg = {
.groups = RTNLGRP_MAX,
.input = rtnetlink_rcv,
- .cb_mutex = &rtnl_mutex,
.flags = NL_CFG_F_NONROOT_RECV,
.bind = rtnetlink_bind,
};
@@ -6708,6 +7011,41 @@ static struct pernet_operations rtnetlink_net_ops = {
.exit = rtnetlink_net_exit,
};
+static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_DELLINK, .doit = rtnl_dellink,
+ .flags = RTNL_FLAG_DOIT_PERNET_WIP},
+ {.msgtype = RTM_GETLINK, .doit = rtnl_getlink,
+ .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
+ {.msgtype = RTM_SETLINK, .doit = rtnl_setlink,
+ .flags = RTNL_FLAG_DOIT_PERNET_WIP},
+ {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all},
+ {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all},
+ {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all},
+ {.msgtype = RTM_GETSTATS, .doit = rtnl_stats_get,
+ .dumpit = rtnl_stats_dump},
+ {.msgtype = RTM_SETSTATS, .doit = rtnl_stats_set},
+ {.msgtype = RTM_NEWLINKPROP, .doit = rtnl_newlinkprop},
+ {.msgtype = RTM_DELLINKPROP, .doit = rtnl_dellinkprop},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_GETLINK,
+ .dumpit = rtnl_bridge_getlink},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_DELLINK,
+ .doit = rtnl_bridge_dellink},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_SETLINK,
+ .doit = rtnl_bridge_setlink},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_NEWNEIGH, .doit = rtnl_fdb_add},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_DELNEIGH, .doit = rtnl_fdb_del,
+ .flags = RTNL_FLAG_BULK_DEL_SUPPORTED},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_GETNEIGH, .doit = rtnl_fdb_get,
+ .dumpit = rtnl_fdb_dump},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB, .doit = rtnl_mdb_add},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_DELMDB, .doit = rtnl_mdb_del,
+ .flags = RTNL_FLAG_BULK_DEL_SUPPORTED},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_GETMDB, .doit = rtnl_mdb_get,
+ .dumpit = rtnl_mdb_dump},
+};
+
void __init rtnetlink_init(void)
{
if (register_pernet_subsys(&rtnetlink_net_ops))
@@ -6715,34 +7053,5 @@ void __init rtnetlink_init(void)
register_netdevice_notifier(&rtnetlink_dev_notifier);
- rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
- rtnl_dump_ifinfo, 0);
- rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0);
-
- rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0);
- rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0);
- rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0);
-
- rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0);
-
- rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL,
- RTNL_FLAG_BULK_DEL_SUPPORTED);
- rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0);
-
- rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0);
- rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0);
-
- rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump,
- 0);
- rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0);
-
- rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0);
- rtnl_register(PF_BRIDGE, RTM_NEWMDB, rtnl_mdb_add, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL,
- RTNL_FLAG_BULK_DEL_SUPPORTED);
+ rtnl_register_many(rtnetlink_rtnl_msg_handlers);
}
diff --git a/net/core/rtnl_net_debug.c b/net/core/rtnl_net_debug.c
new file mode 100644
index 000000000000..7ecd28cc1c22
--- /dev/null
+++ b/net/core/rtnl_net_debug.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright Amazon.com Inc. or its affiliates. */
+
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+static int rtnl_net_debug_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ enum netdev_cmd cmd = event;
+
+ /* Keep enum and don't add default to trigger -Werror=switch */
+ switch (cmd) {
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ case NETDEV_REBOOT:
+ case NETDEV_CHANGE:
+ case NETDEV_REGISTER:
+ case NETDEV_UNREGISTER:
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGEADDR:
+ case NETDEV_PRE_CHANGEADDR:
+ case NETDEV_GOING_DOWN:
+ case NETDEV_FEAT_CHANGE:
+ case NETDEV_BONDING_FAILOVER:
+ case NETDEV_PRE_UP:
+ case NETDEV_PRE_TYPE_CHANGE:
+ case NETDEV_POST_TYPE_CHANGE:
+ case NETDEV_POST_INIT:
+ case NETDEV_PRE_UNINIT:
+ case NETDEV_RELEASE:
+ case NETDEV_NOTIFY_PEERS:
+ case NETDEV_JOIN:
+ case NETDEV_CHANGEUPPER:
+ case NETDEV_RESEND_IGMP:
+ case NETDEV_PRECHANGEMTU:
+ case NETDEV_CHANGEINFODATA:
+ case NETDEV_BONDING_INFO:
+ case NETDEV_PRECHANGEUPPER:
+ case NETDEV_CHANGELOWERSTATE:
+ case NETDEV_UDP_TUNNEL_PUSH_INFO:
+ case NETDEV_UDP_TUNNEL_DROP_INFO:
+ case NETDEV_CHANGE_TX_QUEUE_LEN:
+ case NETDEV_CVLAN_FILTER_PUSH_INFO:
+ case NETDEV_CVLAN_FILTER_DROP_INFO:
+ case NETDEV_SVLAN_FILTER_PUSH_INFO:
+ case NETDEV_SVLAN_FILTER_DROP_INFO:
+ case NETDEV_OFFLOAD_XSTATS_ENABLE:
+ case NETDEV_OFFLOAD_XSTATS_DISABLE:
+ case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
+ case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
+ case NETDEV_XDP_FEAT_CHANGE:
+ ASSERT_RTNL();
+ break;
+
+ case NETDEV_CHANGENAME:
+ ASSERT_RTNL_NET(net);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int rtnl_net_debug_net_id;
+
+static int __net_init rtnl_net_debug_net_init(struct net *net)
+{
+ struct notifier_block *nb;
+
+ nb = net_generic(net, rtnl_net_debug_net_id);
+ nb->notifier_call = rtnl_net_debug_event;
+
+ return register_netdevice_notifier_net(net, nb);
+}
+
+static void __net_exit rtnl_net_debug_net_exit(struct net *net)
+{
+ struct notifier_block *nb;
+
+ nb = net_generic(net, rtnl_net_debug_net_id);
+ unregister_netdevice_notifier_net(net, nb);
+}
+
+static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = {
+ .init = rtnl_net_debug_net_init,
+ .exit = rtnl_net_debug_net_exit,
+ .id = &rtnl_net_debug_net_id,
+ .size = sizeof(struct notifier_block),
+};
+
+static struct notifier_block rtnl_net_debug_block = {
+ .notifier_call = rtnl_net_debug_event,
+};
+
+static int __init rtnl_net_debug_init(void)
+{
+ int ret;
+
+ ret = register_pernet_device(&rtnl_net_debug_net_ops);
+ if (ret)
+ return ret;
+
+ ret = register_netdevice_notifier(&rtnl_net_debug_block);
+ if (ret)
+ unregister_pernet_subsys(&rtnl_net_debug_net_ops);
+
+ return ret;
+}
+
+subsys_initcall(rtnl_net_debug_init);
diff --git a/net/core/scm.c b/net/core/scm.c
index d0e0852a24d5..733c0cbd393d 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -36,6 +36,7 @@
#include <net/compat.h>
#include <net/scm.h>
#include <net/cls_cgroup.h>
+#include <net/af_unix.h>
/*
@@ -85,8 +86,15 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
return -ENOMEM;
*fplp = fpl;
fpl->count = 0;
+ fpl->count_unix = 0;
fpl->max = SCM_MAX_FD;
fpl->user = NULL;
+#if IS_ENABLED(CONFIG_UNIX)
+ fpl->inflight = false;
+ fpl->dead = false;
+ fpl->edges = NULL;
+ INIT_LIST_HEAD(&fpl->vertices);
+#endif
}
fpp = &fpl->fp[fpl->count];
@@ -109,6 +117,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
fput(file);
return -EINVAL;
}
+ if (unix_get_socket(file))
+ fpl->count_unix++;
+
*fpp++ = file;
fpl->count++;
}
@@ -271,6 +282,16 @@ efault:
}
EXPORT_SYMBOL(put_cmsg);
+int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len,
+ void *data)
+{
+ /* Don't produce truncated CMSGs */
+ if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len))
+ return -ETOOSMALL;
+
+ return put_cmsg(msg, level, type, len, data);
+}
+
void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal)
{
struct scm_timestamping64 tss;
@@ -371,8 +392,14 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
+
new_fpl->max = new_fpl->count;
new_fpl->user = get_uid(fpl->user);
+#if IS_ENABLED(CONFIG_UNIX)
+ new_fpl->inflight = false;
+ new_fpl->edges = NULL;
+ INIT_LIST_HEAD(&new_fpl->vertices);
+#endif
}
return new_fpl;
}
diff --git a/net/core/skb_fault_injection.c b/net/core/skb_fault_injection.c
new file mode 100644
index 000000000000..4235db6bdfad
--- /dev/null
+++ b/net/core/skb_fault_injection.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/debugfs.h>
+#include <linux/fault-inject.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+static struct {
+ struct fault_attr attr;
+ char devname[IFNAMSIZ];
+ bool filtered;
+} skb_realloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .filtered = false,
+};
+
+static bool should_fail_net_realloc_skb(struct sk_buff *skb)
+{
+ struct net_device *net = skb->dev;
+
+ if (skb_realloc.filtered &&
+ strncmp(net->name, skb_realloc.devname, IFNAMSIZ))
+ /* device name filter set, but names do not match */
+ return false;
+
+ if (!should_fail(&skb_realloc.attr, 1))
+ return false;
+
+ return true;
+}
+ALLOW_ERROR_INJECTION(should_fail_net_realloc_skb, TRUE);
+
+void skb_might_realloc(struct sk_buff *skb)
+{
+ if (!should_fail_net_realloc_skb(skb))
+ return;
+
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_might_realloc);
+
+static int __init fail_skb_realloc_setup(char *str)
+{
+ return setup_fault_attr(&skb_realloc.attr, str);
+}
+__setup("fail_skb_realloc=", fail_skb_realloc_setup);
+
+static void reset_settings(void)
+{
+ skb_realloc.filtered = false;
+ memset(&skb_realloc.devname, 0, IFNAMSIZ);
+}
+
+static ssize_t devname_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ ssize_t ret;
+
+ reset_settings();
+ ret = simple_write_to_buffer(&skb_realloc.devname, IFNAMSIZ,
+ ppos, buffer, count);
+ if (ret < 0)
+ return ret;
+
+ skb_realloc.devname[IFNAMSIZ - 1] = '\0';
+ /* Remove a possible \n at the end of devname */
+ strim(skb_realloc.devname);
+
+ if (strnlen(skb_realloc.devname, IFNAMSIZ))
+ skb_realloc.filtered = true;
+
+ return count;
+}
+
+static ssize_t devname_read(struct file *file,
+ char __user *buffer,
+ size_t size, loff_t *ppos)
+{
+ if (!skb_realloc.filtered)
+ return 0;
+
+ return simple_read_from_buffer(buffer, size, ppos, &skb_realloc.devname,
+ strlen(skb_realloc.devname));
+}
+
+static const struct file_operations devname_ops = {
+ .write = devname_write,
+ .read = devname_read,
+};
+
+static int __init fail_skb_realloc_debugfs(void)
+{
+ umode_t mode = S_IFREG | 0600;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_skb_realloc", NULL,
+ &skb_realloc.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ debugfs_create_file("devname", mode, dir, NULL, &devname_ops);
+
+ return 0;
+}
+
+late_initcall(fail_skb_realloc_debugfs);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index edbbef563d4d..b1c81687e9d8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -51,6 +51,7 @@
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
@@ -68,7 +69,9 @@
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/gro.h>
#include <net/gso.h>
+#include <net/hotdata.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
@@ -86,18 +89,16 @@
#include <linux/textsearch.h>
#include "dev.h"
+#include "netmem_priv.h"
#include "sock_destructor.h"
-struct kmem_cache *skbuff_cache __ro_after_init;
-static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
-
-static struct kmem_cache *skb_small_head_cache __ro_after_init;
-
-#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
+#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
+ GRO_MAX_HEAD_PAD))
/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
* This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
@@ -112,8 +113,23 @@ static struct kmem_cache *skb_small_head_cache __ro_after_init;
#define SKB_SMALL_HEAD_HEADROOM \
SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
-int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
-EXPORT_SYMBOL(sysctl_max_skb_frags);
+/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
+ * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the
+ * netmem is a page.
+ */
+static_assert(offsetof(struct bio_vec, bv_page) ==
+ offsetof(skb_frag_t, netmem));
+static_assert(sizeof_field(struct bio_vec, bv_page) ==
+ sizeof_field(skb_frag_t, netmem));
+
+static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
+static_assert(sizeof_field(struct bio_vec, bv_len) ==
+ sizeof_field(skb_frag_t, len));
+
+static_assert(offsetof(struct bio_vec, bv_offset) ==
+ offsetof(skb_frag_t, offset));
+static_assert(sizeof_field(struct bio_vec, bv_offset) ==
+ sizeof_field(skb_frag_t, offset));
#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
@@ -207,97 +223,31 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
#define NAPI_SKB_CACHE_BULK 16
#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
-#if PAGE_SIZE == SZ_4K
-
-#define NAPI_HAS_SMALL_PAGE_FRAG 1
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc)
-
-/* specialized page frag allocator using a single order 0 page
- * and slicing it into 1K sized fragment. Constrained to systems
- * with a very limited amount of 1K fragments fitting a single
- * page - to avoid excessive truesize underestimation
- */
-
-struct page_frag_1k {
- void *va;
- u16 offset;
- bool pfmemalloc;
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
-{
- struct page *page;
- int offset;
-
- offset = nc->offset - SZ_1K;
- if (likely(offset >= 0))
- goto use_frag;
-
- page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
- if (!page)
- return NULL;
-
- nc->va = page_address(page);
- nc->pfmemalloc = page_is_pfmemalloc(page);
- offset = PAGE_SIZE - SZ_1K;
- page_ref_add(page, offset / SZ_1K);
-
-use_frag:
- nc->offset = offset;
- return nc->va + offset;
-}
-#else
-
-/* the small page is actually unused in this build; add dummy helpers
- * to please the compiler and avoid later preprocessor's conditionals
- */
-#define NAPI_HAS_SMALL_PAGE_FRAG 0
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false
-
-struct page_frag_1k {
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
-{
- return NULL;
-}
-
-#endif
-
struct napi_alloc_cache {
+ local_lock_t bh_lock;
struct page_frag_cache page;
- struct page_frag_1k page_small;
unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
-
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-void napi_get_frags_check(struct napi_struct *napi)
-{
- struct sk_buff *skb;
-
- local_bh_disable();
- skb = napi_get_frags(napi);
- WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
- napi_free_frags(napi);
- local_bh_enable();
-}
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ void *data;
fragsz = SKB_DATA_ALIGN(fragsz);
- return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ data = __page_frag_alloc_align(&nc->page, fragsz,
+ GFP_ATOMIC | __GFP_NOWARN, align_mask);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ return data;
+
}
EXPORT_SYMBOL(__napi_alloc_frag_align);
@@ -305,17 +255,16 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
void *data;
- fragsz = SKB_DATA_ALIGN(fragsz);
if (in_hardirq() || irqs_disabled()) {
struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
- data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
+ fragsz = SKB_DATA_ALIGN(fragsz);
+ data = __page_frag_alloc_align(nc, fragsz,
+ GFP_ATOMIC | __GFP_NOWARN,
+ align_mask);
} else {
- struct napi_alloc_cache *nc;
-
local_bh_disable();
- nc = this_cpu_ptr(&napi_alloc_cache);
- data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
+ data = __napi_alloc_frag_align(fragsz, align_mask);
local_bh_enable();
}
return data;
@@ -327,17 +276,21 @@ static struct sk_buff *napi_skb_cache_get(void)
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!nc->skb_count)) {
- nc->skb_count = kmem_cache_alloc_bulk(skbuff_cache,
- GFP_ATOMIC,
+ nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN,
NAPI_SKB_CACHE_BULK,
nc->skb_cache);
- if (unlikely(!nc->skb_count))
+ if (unlikely(!nc->skb_count)) {
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
return NULL;
+ }
}
skb = nc->skb_cache[--nc->skb_count];
- kasan_mempool_unpoison_object(skb, kmem_cache_size(skbuff_cache));
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));
return skb;
}
@@ -395,7 +348,8 @@ struct sk_buff *slab_build_skb(void *data)
struct sk_buff *skb;
unsigned int size;
- skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!skb))
return NULL;
@@ -446,7 +400,8 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb;
- skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!skb))
return NULL;
@@ -557,7 +512,7 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
obj_size = SKB_HEAD_ALIGN(*size);
if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
!(flags & KMALLOC_NOT_NORMAL_BITS)) {
- obj = kmem_cache_alloc_node(skb_small_head_cache,
+ obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
node);
*size = SKB_SMALL_HEAD_CACHE_SIZE;
@@ -565,7 +520,7 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
goto out;
/* Try again but now we are using pfmemalloc reserves */
ret_pfmemalloc = true;
- obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node);
+ obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
goto out;
}
@@ -628,7 +583,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
u8 *data;
cache = (flags & SKB_ALLOC_FCLONE)
- ? skbuff_fclone_cache : skbuff_cache;
+ ? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache;
if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
gfp_mask |= __GFP_MEMALLOC;
@@ -709,7 +664,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
*/
- if (len <= SKB_WITH_OVERHEAD(1024) ||
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
@@ -726,12 +681,16 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
+ pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
+ pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
+
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
local_bh_enable();
}
@@ -758,10 +717,9 @@ skb_fail:
EXPORT_SYMBOL(__netdev_alloc_skb);
/**
- * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
* @napi: napi instance this buffer was allocated for
* @len: length to allocate
- * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
*
* Allocate a new sk_buff for use in NAPI receive. This buffer will
* attempt to allocate the head from a special reserved region used
@@ -770,9 +728,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
*
* %NULL is returned if there is no free memory.
*/
-struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
- gfp_t gfp_mask)
+struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
{
+ gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
struct napi_alloc_cache *nc;
struct sk_buff *skb;
bool pfmemalloc;
@@ -783,10 +741,8 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
- * When the small frag allocator is available, prefer it over kmalloc
- * for small fragments
*/
- if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -796,32 +752,17 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
goto skb_success;
}
- nc = this_cpu_ptr(&napi_alloc_cache);
+ len = SKB_HEAD_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
- /* we are artificially inflating the allocation size, but
- * that is not as bad as it may look like, as:
- * - 'len' less than GRO_MAX_HEAD makes little sense
- * - On most systems, larger 'len' values lead to fragment
- * size above 512 bytes
- * - kmalloc would use the kmalloc-1k slab for such values
- * - Builds with smaller GRO_MAX_HEAD will very likely do
- * little networking, as that implies no WiFi and no
- * tunnels support, and 32 bits arches.
- */
- len = SZ_1K;
-
- data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
- pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
- } else {
- len = SKB_HEAD_ALIGN(len);
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ nc = this_cpu_ptr(&napi_alloc_cache);
- data = page_frag_alloc(&nc->page, len, gfp_mask);
- pfmemalloc = nc->page.pfmemalloc;
- }
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
+ pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!data))
return NULL;
@@ -843,19 +784,19 @@ skb_success:
skb_fail:
return skb;
}
-EXPORT_SYMBOL(__napi_alloc_skb);
+EXPORT_SYMBOL(napi_alloc_skb);
-void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
- int size, unsigned int truesize)
+void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
+ int off, int size, unsigned int truesize)
{
DEBUG_NET_WARN_ON_ONCE(size > truesize);
- skb_fill_page_desc(skb, i, page, off, size);
+ skb_fill_netmem_desc(skb, i, netmem, off, size);
skb->len += size;
skb->data_len += size;
skb->truesize += truesize;
}
-EXPORT_SYMBOL(skb_add_rx_frag);
+EXPORT_SYMBOL(skb_add_rx_frag_netmem);
void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
unsigned int truesize)
@@ -890,18 +831,107 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
-static bool is_pp_page(struct page *page)
+static bool is_pp_netmem(netmem_ref netmem)
{
- return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
+ return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE;
}
+int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
+ unsigned int headroom)
+{
#if IS_ENABLED(CONFIG_PAGE_POOL)
-bool napi_pp_put_page(struct page *page, bool napi_safe)
+ u32 size, truesize, len, max_head_size, off;
+ struct sk_buff *skb = *pskb, *nskb;
+ int err, i, head_off;
+ void *data;
+
+ /* XDP does not support fraglist so we need to linearize
+ * the skb.
+ */
+ if (skb_has_frag_list(skb))
+ return -EOPNOTSUPP;
+
+ max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
+ if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
+ return -ENOMEM;
+
+ size = min_t(u32, skb->len, max_head_size);
+ truesize = SKB_HEAD_ALIGN(size) + headroom;
+ data = page_pool_dev_alloc_va(pool, &truesize);
+ if (!data)
+ return -ENOMEM;
+
+ nskb = napi_build_skb(data, truesize);
+ if (!nskb) {
+ page_pool_free_va(pool, data, true);
+ return -ENOMEM;
+ }
+
+ skb_reserve(nskb, headroom);
+ skb_copy_header(nskb, skb);
+ skb_mark_for_recycle(nskb);
+
+ err = skb_copy_bits(skb, 0, nskb->data, size);
+ if (err) {
+ consume_skb(nskb);
+ return err;
+ }
+ skb_put(nskb, size);
+
+ head_off = skb_headroom(nskb) - skb_headroom(skb);
+ skb_headers_offset_update(nskb, head_off);
+
+ off = size;
+ len = skb->len - off;
+ for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
+ struct page *page;
+ u32 page_off;
+
+ size = min_t(u32, len, PAGE_SIZE);
+ truesize = size;
+
+ page = page_pool_dev_alloc(pool, &page_off, &truesize);
+ if (!page) {
+ consume_skb(nskb);
+ return -ENOMEM;
+ }
+
+ skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
+ err = skb_copy_bits(skb, off, page_address(page) + page_off,
+ size);
+ if (err) {
+ consume_skb(nskb);
+ return err;
+ }
+
+ len -= size;
+ off += size;
+ }
+
+ consume_skb(skb);
+ *pskb = nskb;
+
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+EXPORT_SYMBOL(skb_pp_cow_data);
+
+int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
+ const struct bpf_prog *prog)
{
- bool allow_direct = false;
- struct page_pool *pp;
+ if (!prog->aux->xdp_has_frags)
+ return -EINVAL;
- page = compound_head(page);
+ return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);
+}
+EXPORT_SYMBOL(skb_cow_data_for_xdp);
+
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+bool napi_pp_put_page(netmem_ref netmem)
+{
+ netmem = netmem_compound_head(netmem);
/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
* in order to preserve any existing bits, such as bit 0 for the
@@ -910,41 +940,21 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)
* and page_is_pfmemalloc() is checked in __page_pool_put_page()
* to avoid recycling the pfmemalloc page.
*/
- if (unlikely(!is_pp_page(page)))
+ if (unlikely(!is_pp_netmem(netmem)))
return false;
- pp = page->pp;
-
- /* Allow direct recycle if we have reasons to believe that we are
- * in the same context as the consumer would run, so there's
- * no possible race.
- * __page_pool_put_page() makes sure we're not in hardirq context
- * and interrupts are enabled prior to accessing the cache.
- */
- if (napi_safe || in_softirq()) {
- const struct napi_struct *napi = READ_ONCE(pp->p.napi);
-
- allow_direct = napi &&
- READ_ONCE(napi->list_owner) == smp_processor_id();
- }
-
- /* Driver set this to memory recycling info. Reset it on recycle.
- * This will *not* work for NIC using a split-page memory model.
- * The page will be returned to the pool here regardless of the
- * 'flipped' fragment being in use or not.
- */
- page_pool_put_full_page(pp, page, allow_direct);
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
return true;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif
-static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
return false;
- return napi_pp_put_page(virt_to_page(data), napi_safe);
+ return napi_pp_put_page(page_to_netmem(virt_to_page(data)));
}
/**
@@ -960,7 +970,7 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
static int skb_pp_frag_ref(struct sk_buff *skb)
{
struct skb_shared_info *shinfo;
- struct page *head_page;
+ netmem_ref head_netmem;
int i;
if (!skb->pp_recycle)
@@ -969,11 +979,11 @@ static int skb_pp_frag_ref(struct sk_buff *skb)
shinfo = skb_shinfo(skb);
for (i = 0; i < shinfo->nr_frags; i++) {
- head_page = compound_head(skb_frag_page(&shinfo->frags[i]));
- if (likely(is_pp_page(head_page)))
- page_pool_ref_page(head_page);
+ head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
+ if (likely(is_pp_netmem(head_netmem)))
+ page_pool_ref_netmem(head_netmem);
else
- page_ref_inc(head_page);
+ page_ref_inc(netmem_to_page(head_netmem));
}
return 0;
}
@@ -981,17 +991,17 @@ static int skb_pp_frag_ref(struct sk_buff *skb)
static void skb_kfree_head(void *head, unsigned int end_offset)
{
if (end_offset == SKB_SMALL_HEAD_HEADROOM)
- kmem_cache_free(skb_small_head_cache, head);
+ kmem_cache_free(net_hotdata.skb_small_head_cache, head);
else
kfree(head);
}
-static void skb_free_head(struct sk_buff *skb, bool napi_safe)
+static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;
if (skb->head_frag) {
- if (skb_pp_recycle(skb, head, napi_safe))
+ if (skb_pp_recycle(skb, head))
return;
skb_free_frag(head);
} else {
@@ -999,15 +1009,12 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe)
}
}
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
- bool napi_safe)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int i;
- if (skb->cloned &&
- atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
- &shinfo->dataref))
+ if (!skb_data_unref(skb, shinfo))
goto exit;
if (skb_zcopy(skb)) {
@@ -1019,13 +1026,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
}
for (i = 0; i < shinfo->nr_frags; i++)
- napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
+ __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
free_head:
if (shinfo->frag_list)
kfree_skb_list_reason(shinfo->frag_list, reason);
- skb_free_head(skb, napi_safe);
+ skb_free_head(skb);
exit:
/* When we clone an SKB we copy the reycling bit. The pp_recycle
* bit is only set on the head though, so in order to avoid races
@@ -1048,7 +1055,7 @@ static void kfree_skbmem(struct sk_buff *skb)
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
- kmem_cache_free(skbuff_cache, skb);
+ kmem_cache_free(net_hotdata.skbuff_cache, skb);
return;
case SKB_FCLONE_ORIG:
@@ -1069,7 +1076,7 @@ static void kfree_skbmem(struct sk_buff *skb)
if (!refcount_dec_and_test(&fclones->fclone_ref))
return;
fastpath:
- kmem_cache_free(skbuff_fclone_cache, fclones);
+ kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
}
void skb_release_head_state(struct sk_buff *skb)
@@ -1086,12 +1093,11 @@ void skb_release_head_state(struct sk_buff *skb)
}
/* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
- bool napi_safe)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
skb_release_head_state(skb);
if (likely(skb->head))
- skb_release_data(skb, reason, napi_safe);
+ skb_release_data(skb, reason);
}
/**
@@ -1105,13 +1111,14 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
void __kfree_skb(struct sk_buff *skb)
{
- skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
+ skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
static __always_inline
-bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason reason)
{
if (unlikely(!skb_unref(skb)))
return false;
@@ -1124,26 +1131,27 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
if (reason == SKB_CONSUMED)
trace_consume_skb(skb, __builtin_return_address(0));
else
- trace_kfree_skb(skb, __builtin_return_address(0), reason);
+ trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
return true;
}
/**
- * kfree_skb_reason - free an sk_buff with special reason
+ * sk_skb_reason_drop - free an sk_buff with special reason
+ * @sk: the socket to receive @skb, or NULL if not applicable
* @skb: buffer to free
* @reason: reason why this skb is dropped
*
- * Drop a reference to the buffer and free it if the usage count has
- * hit zero. Meanwhile, pass the drop reason to 'kfree_skb'
- * tracepoint.
+ * Drop a reference to the buffer and free it if the usage count has hit
+ * zero. Meanwhile, pass the receiving socket and drop reason to
+ * 'kfree_skb' tracepoint.
*/
void __fix_address
-kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
- if (__kfree_skb_reason(skb, reason))
+ if (__sk_skb_reason_drop(sk, skb, reason))
__kfree_skb(skb);
}
-EXPORT_SYMBOL(kfree_skb_reason);
+EXPORT_SYMBOL(sk_skb_reason_drop);
#define KFREE_SKB_BULK_SIZE 16
@@ -1162,11 +1170,11 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
return;
}
- skb_release_all(skb, reason, false);
+ skb_release_all(skb, reason);
sa->skb_array[sa->skb_count++] = skb;
if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
- kmem_cache_free_bulk(skbuff_cache, KFREE_SKB_BULK_SIZE,
+ kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE,
sa->skb_array);
sa->skb_count = 0;
}
@@ -1182,7 +1190,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
while (segs) {
struct sk_buff *next = segs->next;
- if (__kfree_skb_reason(segs, reason)) {
+ if (__sk_skb_reason_drop(NULL, segs, reason)) {
skb_poison_list(segs);
kfree_skb_add_bulk(segs, &sa, reason);
}
@@ -1191,7 +1199,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
}
if (sa.skb_count)
- kmem_cache_free_bulk(skbuff_cache, sa.skb_count, sa.skb_array);
+ kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array);
}
EXPORT_SYMBOL(kfree_skb_list_reason);
@@ -1223,22 +1231,28 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
has_trans = skb_transport_header_was_set(skb);
printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
- "mac=(%d,%d) net=(%d,%d) trans=%d\n"
+ "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
"shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
- "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
- "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
+ "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
+ "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
+ "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
+ "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
level, skb->len, headroom, skb_headlen(skb), tailroom,
has_mac ? skb->mac_header : -1,
has_mac ? skb_mac_header_len(skb) : -1,
+ skb->mac_len,
skb->network_header,
has_trans ? skb_network_header_len(skb) : -1,
has_trans ? skb->transport_header : -1,
sh->tx_flags, sh->nr_frags,
sh->gso_size, sh->gso_type, sh->gso_segs,
- skb->csum, skb->ip_summed, skb->csum_complete_sw,
- skb->csum_valid, skb->csum_level,
+ skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
+ skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
skb->hash, skb->sw_hash, skb->l4_hash,
- ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
+ ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
+ skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
+ skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
+ skb->inner_network_header, skb->inner_transport_header);
if (dev)
printk("%sdev name=%s feat=%pNF\n",
@@ -1267,6 +1281,14 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
struct page *p;
u8 *vaddr;
+ if (skb_frag_is_net_iov(frag)) {
+ printk("%sskb frag %d: not readable\n", level, i);
+ len -= skb_frag_size(frag);
+ if (!len)
+ break;
+ continue;
+ }
+
skb_frag_foreach_page(frag, skb_frag_off(frag),
skb_frag_size(frag), p, p_off, p_len,
copied) {
@@ -1336,7 +1358,7 @@ EXPORT_SYMBOL(consume_skb);
void __consume_stateless_skb(struct sk_buff *skb)
{
trace_consume_skb(skb, __builtin_return_address(0));
- skb_release_data(skb, SKB_CONSUMED, false);
+ skb_release_data(skb, SKB_CONSUMED);
kfree_skbmem(skb);
}
@@ -1348,22 +1370,24 @@ static void napi_skb_cache_put(struct sk_buff *skb)
if (!kasan_mempool_poison_object(skb))
return;
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
nc->skb_cache[nc->skb_count++] = skb;
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
kasan_mempool_unpoison_object(nc->skb_cache[i],
- kmem_cache_size(skbuff_cache));
+ kmem_cache_size(net_hotdata.skbuff_cache));
- kmem_cache_free_bulk(skbuff_cache, NAPI_SKB_CACHE_HALF,
+ kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF,
nc->skb_cache + NAPI_SKB_CACHE_HALF);
nc->skb_count = NAPI_SKB_CACHE_HALF;
}
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}
void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
{
- skb_release_all(skb, reason, true);
+ skb_release_all(skb, reason);
napi_skb_cache_put(skb);
}
@@ -1401,7 +1425,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- skb_release_all(skb, SKB_CONSUMED, !!budget);
+ skb_release_all(skb, SKB_CONSUMED);
napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
@@ -1532,7 +1556,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
*/
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
- skb_release_all(dst, SKB_CONSUMED, false);
+ skb_release_all(dst, SKB_CONSUMED);
return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
@@ -1600,7 +1624,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
return NULL;
}
- uarg->ubuf.callback = msg_zerocopy_callback;
+ uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;
uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
uarg->len = 1;
uarg->bytelen = size;
@@ -1626,7 +1650,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
u32 bytelen, next;
/* there might be non MSG_ZEROCOPY users */
- if (uarg->callback != msg_zerocopy_callback)
+ if (uarg->ops != &msg_zerocopy_ubuf_ops)
return NULL;
/* realloc only when socket is locked (TCP, UDP cork),
@@ -1737,8 +1761,8 @@ release:
sock_put(sk);
}
-void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
- bool success)
+static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
+ bool success)
{
struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
@@ -1747,7 +1771,6 @@ void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
if (refcount_dec_and_test(&uarg->refcnt))
__msg_zerocopy_callback(uarg_zc);
}
-EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
@@ -1757,22 +1780,35 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
uarg_to_msgzc(uarg)->len--;
if (have_uref)
- msg_zerocopy_callback(NULL, uarg, true);
+ msg_zerocopy_complete(NULL, uarg, true);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
+const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
+ .complete = msg_zerocopy_complete,
+};
+EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
+
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
struct ubuf_info *uarg)
{
- struct ubuf_info *orig_uarg = skb_zcopy(skb);
int err, orig_len = skb->len;
- /* An skb can only point to one uarg. This edge case happens when
- * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
- */
- if (orig_uarg && uarg != orig_uarg)
- return -EEXIST;
+ if (uarg->ops->link_skb) {
+ err = uarg->ops->link_skb(skb, uarg);
+ if (err)
+ return err;
+ } else {
+ struct ubuf_info *orig_uarg = skb_zcopy(skb);
+
+ /* An skb can only point to one uarg. This edge case happens
+ * when TCP appends to an skb, but zerocopy_realloc triggered
+ * a new alloc.
+ */
+ if (orig_uarg && uarg != orig_uarg)
+ return -EEXIST;
+ }
err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
@@ -1846,6 +1882,9 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
return -EINVAL;
+ if (!skb_frags_readable(skb))
+ return -EFAULT;
+
if (!num_frags)
goto release;
@@ -1906,10 +1945,11 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
/* skb frags point to kernel buffers */
for (i = 0; i < new_frags - 1; i++) {
- __skb_fill_page_desc(skb, i, head, 0, psize);
+ __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize);
head = (struct page *)page_private(head);
}
- __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
+ __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0,
+ d_off);
skb_shinfo(skb)->nr_frags = new_frags;
release:
@@ -1951,7 +1991,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- n = kmem_cache_alloc(skbuff_cache, gfp_mask);
+ n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask);
if (!n)
return NULL;
@@ -2014,11 +2054,20 @@ static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
- int headerlen = skb_headroom(skb);
- unsigned int size = skb_end_offset(skb) + skb->data_len;
- struct sk_buff *n = __alloc_skb(size, gfp_mask,
- skb_alloc_rx_flag(skb), NUMA_NO_NODE);
+ struct sk_buff *n;
+ unsigned int size;
+ int headerlen;
+
+ if (!skb_frags_readable(skb))
+ return NULL;
+
+ if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
+ return NULL;
+ headerlen = skb_headroom(skb);
+ size = skb_end_offset(skb) + skb->data_len;
+ n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
return NULL;
@@ -2163,9 +2212,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb, SKB_CONSUMED, false);
+ skb_release_data(skb, SKB_CONSUMED);
} else {
- skb_free_head(skb, false);
+ skb_free_head(skb);
}
off = (data + nhead) - skb->head;
@@ -2346,12 +2395,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
/*
* Allocate the copy buffer
*/
- struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
- gfp_mask, skb_alloc_rx_flag(skb),
- NUMA_NO_NODE);
- int oldheadroom = skb_headroom(skb);
int head_copy_len, head_copy_off;
+ struct sk_buff *n;
+ int oldheadroom;
+
+ if (!skb_frags_readable(skb))
+ return NULL;
+ if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
+ return NULL;
+
+ oldheadroom = skb_headroom(skb);
+ n = __alloc_skb(newheadroom + skb->len + newtailroom,
+ gfp_mask, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
if (!n)
return NULL;
@@ -2689,6 +2746,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
*/
int i, k, eat = (skb->tail + delta) - skb->end;
+ if (!skb_frags_readable(skb))
+ return NULL;
+
if (eat > 0 || skb_cloned(skb)) {
if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
GFP_ATOMIC))
@@ -2842,6 +2902,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
to += copy;
}
+ if (!skb_frags_readable(skb))
+ goto fault;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
@@ -3030,9 +3093,15 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
/*
* then map the fragments
*/
+ if (!skb_frags_readable(skb))
+ return false;
+
for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
+ if (WARN_ON_ONCE(!skb_frag_page(f)))
+ return false;
+
if (__splice_segment(skb_frag_page(f),
skb_frag_off(f), skb_frag_size(f),
offset, len, spd, false, sk, pipe))
@@ -3250,6 +3319,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
from += copy;
}
+ if (!skb_frags_readable(skb))
+ goto fault;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
int end;
@@ -3329,6 +3401,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
pos = copy;
}
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
@@ -3429,6 +3504,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
pos = copy;
}
+ if (!skb_frags_readable(skb))
+ return 0;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
@@ -3647,7 +3725,8 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
if (plen) {
page = virt_to_head_page(from->head);
offset = from->data - (unsigned char *)page_address(page);
- __skb_fill_page_desc(to, 0, page, offset, plen);
+ __skb_fill_netmem_desc(to, 0, page_to_netmem(page),
+ offset, plen);
get_page(page);
j = 1;
len -= plen;
@@ -3919,6 +3998,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb,
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
+ skb1->unreadable = skb->unreadable;
skb_shinfo(skb)->nr_frags = 0;
skb1->data_len = skb->data_len;
skb1->len += skb1->data_len;
@@ -3966,6 +4046,8 @@ static inline void skb_split_no_header(struct sk_buff *skb,
pos += size;
}
skb_shinfo(skb1)->nr_frags = k;
+
+ skb1->unreadable = skb->unreadable;
}
/**
@@ -4029,6 +4111,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
if (skb_zcopy(tgt) || skb_zcopy(skb))
return 0;
+ DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle);
+ DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb));
+
todo = shiftlen;
from = 0;
to = skb_shinfo(tgt)->nr_frags;
@@ -4037,8 +4122,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
/* Actual merge is delayed until the point when we know we can
* commit all, so that we don't have to undo partial changes
*/
- if (!to ||
- !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
+ if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
skb_frag_off(fragfrom))) {
merge = -1;
} else {
@@ -4201,6 +4285,9 @@ next_skb:
return block_limit - abs_offset;
}
+ if (!skb_frags_readable(st->cur_skb))
+ return 0;
+
if (st->frag_idx == 0 && !st->frag_data)
st->stepped_offset += skb_headlen(st->cur_skb);
@@ -4277,6 +4364,41 @@ void skb_abort_seq_read(struct skb_seq_state *st)
}
EXPORT_SYMBOL(skb_abort_seq_read);
+/**
+ * skb_copy_seq_read() - copy from a skb_seq_state to a buffer
+ * @st: source skb_seq_state
+ * @offset: offset in source
+ * @to: destination buffer
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from @offset bytes into the source @st to the destination
+ * buffer @to. `offset` should increase (or be unchanged) with each subsequent
+ * call to this function. If offset needs to decrease from the previous use `st`
+ * should be reset first.
+ *
+ * Return: 0 on success or -EINVAL if the copy ended early
+ */
+int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len)
+{
+ const u8 *data;
+ u32 sqlen;
+
+ for (;;) {
+ sqlen = skb_seq_read(offset, &data, st);
+ if (sqlen == 0)
+ return -EINVAL;
+ if (sqlen >= len) {
+ memcpy(to, data, len);
+ return 0;
+ }
+ memcpy(to, data, sqlen);
+ to += sqlen;
+ offset += sqlen;
+ len -= sqlen;
+ }
+}
+EXPORT_SYMBOL(skb_copy_seq_read);
+
#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
@@ -4889,7 +5011,7 @@ static void skb_extensions_init(void) {}
void __init skb_init(void)
{
- skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
+ net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
sizeof(struct sk_buff),
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|
@@ -4897,7 +5019,7 @@ void __init skb_init(void)
offsetof(struct sk_buff, cb),
sizeof_field(struct sk_buff, cb),
NULL);
- skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+ net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
sizeof(struct sk_buff_fclones),
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
@@ -4906,7 +5028,7 @@ void __init skb_init(void)
* struct skb_shared_info is located at the end of skb->head,
* and should not be copied to/from user.
*/
- skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head",
+ net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head",
SKB_SMALL_HEAD_CACHE_SIZE,
0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC,
@@ -5029,7 +5151,7 @@ EXPORT_SYMBOL_GPL(skb_to_sgvec);
* 3. sg_unmark_end
* 4. skb_to_sgvec(payload2)
*
- * When mapping mutilple payload conditionally, skb_to_sgvec_nomark
+ * When mapping multiple payload conditionally, skb_to_sgvec_nomark
* is more preferable.
*/
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
@@ -5294,7 +5416,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
bool ret;
- if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly))
+ if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data)))
return true;
read_lock_bh(&sk->sk_callback_lock);
@@ -5779,7 +5901,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
if (head_stolen) {
skb_release_head_state(skb);
- kmem_cache_free(skbuff_cache, skb);
+ kmem_cache_free(net_hotdata.skbuff_cache, skb);
} else {
__kfree_skb(skb);
}
@@ -5813,7 +5935,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (to->pp_recycle != from->pp_recycle)
return false;
- if (len <= skb_tailroom(to)) {
+ if (skb_frags_readable(from) != skb_frags_readable(to))
+ return false;
+
+ if (len <= skb_tailroom(to) && skb_frags_readable(from)) {
if (len)
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
*delta_truesize = 0;
@@ -5887,7 +6012,7 @@ EXPORT_SYMBOL(skb_try_coalesce);
* @skb: buffer to clean
* @xnet: packet is crossing netns
*
- * skb_scrub_packet can be used after encapsulating or decapsulting a packet
+ * skb_scrub_packet can be used after encapsulating or decapsulating a packet
* into/from a tunnel. Some information have to be cleared during these
* operations.
* skb_scrub_packet can also be used to clean a skb before injecting it in
@@ -5908,11 +6033,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->offload_fwd_mark = 0;
skb->offload_l3_fwd_mark = 0;
#endif
+ ipvs_reset(skb);
if (!xnet)
return;
- ipvs_reset(skb);
skb->mark = 0;
skb_clear_tstamp(skb);
}
@@ -5990,6 +6115,9 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
if (!pskb_may_pull(skb, write_len))
return -ENOMEM;
+ if (!skb_frags_readable(skb))
+ return -EFAULT;
+
if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
return 0;
@@ -6109,7 +6237,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
return err;
skb->protocol = skb->vlan_proto;
- skb->mac_len += VLAN_HLEN;
+ skb->network_header -= VLAN_HLEN;
skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
}
@@ -6465,12 +6593,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb, SKB_CONSUMED, false);
+ skb_release_data(skb, SKB_CONSUMED);
} else {
/* we can reuse existing recount- all we did was
* relocate values
*/
- skb_free_head(skb, false);
+ skb_free_head(skb);
}
skb->head = data;
@@ -6605,7 +6733,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb_kfree_head(data, size);
return -ENOMEM;
}
- skb_release_data(skb, SKB_CONSUMED, false);
+ skb_release_data(skb, SKB_CONSUMED);
skb->head = data;
skb->head_frag = 0;
@@ -6669,7 +6797,7 @@ void skb_condense(struct sk_buff *skb)
{
if (skb->data_len) {
if (skb->data_len > skb->end - skb->tail ||
- skb_cloned(skb))
+ skb_cloned(skb) || !skb_frags_readable(skb))
return;
/* Nice, we can free page frag(s) right now */
@@ -6737,6 +6865,14 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
xfrm_state_hold(sp->xvec[i]);
}
#endif
+#ifdef CONFIG_MCTP_FLOWS
+ if (old_active & (1 << SKB_EXT_MCTP)) {
+ struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP);
+
+ if (flow->key)
+ refcount_inc(&flow->key->refs);
+ }
+#endif
__skb_ext_put(old);
return new;
}
@@ -6877,6 +7013,19 @@ free_now:
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */
+static void kfree_skb_napi_cache(struct sk_buff *skb)
+{
+ /* if SKB is a clone, don't handle this case */
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ local_bh_disable();
+ __napi_kfree_skb(skb, SKB_CONSUMED);
+ local_bh_enable();
+}
+
/**
* skb_attempt_defer_free - queue skb for remote freeing
* @skb: buffer
@@ -6892,10 +7041,10 @@ void skb_attempt_defer_free(struct sk_buff *skb)
unsigned int defer_max;
bool kick;
- if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
- !cpu_online(cpu) ||
- cpu == raw_smp_processor_id()) {
-nodefer: __kfree_skb(skb);
+ if (cpu == raw_smp_processor_id() ||
+ WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
+ !cpu_online(cpu)) {
+nodefer: kfree_skb_napi_cache(skb);
return;
}
@@ -6903,7 +7052,7 @@ nodefer: __kfree_skb(skb);
DEBUG_NET_WARN_ON_ONCE(skb->destructor);
sd = &per_cpu(softnet_data, cpu);
- defer_max = READ_ONCE(sysctl_skb_defer_max);
+ defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
if (READ_ONCE(sd->defer_count) >= defer_max)
goto nodefer;
@@ -6921,8 +7070,8 @@ nodefer: __kfree_skb(skb);
/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
* if we are unlucky enough (this seems very unlikely).
*/
- if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
- smp_call_function_single_async(cpu, &sd->defer_csd);
+ if (unlikely(kick))
+ kick_defer_list_purge(sd, cpu);
}
static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
@@ -6955,7 +7104,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
ssize_t maxsize, gfp_t gfp)
{
- size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
+ size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
struct page *pages[8], **ppages = pages;
ssize_t spliced = 0, ret = 0;
unsigned int i;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 4d75ef9d24bf..0ddc4c718833 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -293,7 +293,7 @@ out:
/* If we trim data a full sg elem before curr pointer update
* copybreak and current so that any future copy operations
* start at new copy location.
- * However trimed data that has not yet been used in a copy op
+ * However trimmed data that has not yet been used in a copy op
* does not require an update.
*/
if (!msg->sg.size) {
@@ -369,8 +369,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
struct sk_msg *msg, u32 bytes)
{
int ret = -ENOSPC, i = msg->sg.curr;
+ u32 copy, buf_size, copied = 0;
struct scatterlist *sge;
- u32 copy, buf_size;
void *to;
do {
@@ -397,6 +397,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
goto out;
}
bytes -= copy;
+ copied += copy;
if (!bytes)
break;
msg->sg.copybreak = 0;
@@ -404,7 +405,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
} while (i != msg->sg.end);
out:
msg->sg.curr = i;
- return ret;
+ return (ret < 0) ? ret : copied;
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
@@ -434,7 +435,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
page = sg_page(sge);
if (copied + copy > len)
copy = len - copied;
- copy = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (copy)
+ copy = copy_page_to_iter(page, sge->offset, copy, iter);
if (!copy) {
copied = copied ? copied : -EFAULT;
goto out;
@@ -444,8 +446,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
if (likely(!peek)) {
sge->offset += copy;
sge->length -= copy;
- if (!msg_rx->skb)
+ if (!msg_rx->skb) {
sk_mem_uncharge(sk, copy);
+ atomic_sub(copy, &sk->sk_rmem_alloc);
+ }
msg_rx->sg.size -= copy;
if (!sge->length) {
@@ -545,6 +549,9 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
return num_sge;
}
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+ psock->ingress_bytes += len;
+#endif
copied = len;
msg->sg.start = 0;
msg->sg.size = copied;
@@ -771,6 +778,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
list_del(&msg->list);
+ if (!msg->skb)
+ atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
sk_msg_free(psock->sk, msg);
kfree(msg);
}
@@ -1116,9 +1125,9 @@ static void sk_psock_strp_data_ready(struct sock *sk)
if (tls_sw_has_ctx_rx(sk)) {
psock->saved_data_ready(sk);
} else {
- write_lock_bh(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
strp_data_ready(&psock->strp);
- write_unlock_bh(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
}
rcu_read_unlock();
@@ -1138,6 +1147,10 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
if (!ret)
sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED);
+ if (sk_is_tcp(sk)) {
+ psock->strp.cb.read_sock = tcp_bpf_strp_read_sock;
+ psock->copied_seq = tcp_sk(sk)->copied_seq;
+ }
return ret;
}
@@ -1226,11 +1239,8 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
rcu_read_lock();
psock = sk_psock(sk);
- if (psock) {
- read_lock_bh(&sk->sk_callback_lock);
+ if (psock)
sk_psock_data_ready(sk, psock);
- read_unlock_bh(&sk->sk_callback_lock);
- }
rcu_read_unlock();
}
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 5e78798456fd..6c0e87f97fa4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -85,7 +85,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
@@ -124,9 +124,11 @@
#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
+#include <net/proto_memory.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
@@ -284,8 +286,6 @@ EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
-int sysctl_tstamp_allow_data __read_mostly = 1;
-
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
@@ -454,6 +454,13 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
return 0;
}
+static bool sk_set_prio_allowed(const struct sock *sk, int val)
+{
+ return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
+}
+
static bool sock_needs_netstamp(const struct sock *sk)
{
switch (sk->sk_family) {
@@ -481,7 +488,7 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
- if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
+ if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
atomic_inc(&sk->sk_drops);
trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
@@ -551,7 +558,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
skb->dev = NULL;
- if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
+ if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
atomic_inc(&sk->sk_drops);
goto discard_and_relse;
}
@@ -820,14 +827,11 @@ EXPORT_SYMBOL(sock_set_sndtimeo);
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
+ sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
+ sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
if (val) {
sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
- sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
- sock_set_flag(sk, SOCK_RCVTSTAMP);
sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- } else {
- sock_reset_flag(sk, SOCK_RCVTSTAMP);
- sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
}
}
@@ -1048,6 +1052,75 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
return 0;
}
+#ifdef CONFIG_PAGE_POOL
+
+/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
+ * in 1 syscall. The limit exists to limit the amount of memory the kernel
+ * allocates to copy these tokens, and to prevent looping over the frags for
+ * too long.
+ */
+#define MAX_DONTNEED_TOKENS 128
+#define MAX_DONTNEED_FRAGS 1024
+
+static noinline_for_stack int
+sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+ unsigned int num_tokens, i, j, k, netmem_num = 0;
+ struct dmabuf_token *tokens;
+ int ret = 0, num_frags = 0;
+ netmem_ref netmems[16];
+
+ if (!sk_is_tcp(sk))
+ return -EBADF;
+
+ if (optlen % sizeof(*tokens) ||
+ optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
+ return -EINVAL;
+
+ num_tokens = optlen / sizeof(*tokens);
+ tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
+ if (!tokens)
+ return -ENOMEM;
+
+ if (copy_from_sockptr(tokens, optval, optlen)) {
+ kvfree(tokens);
+ return -EFAULT;
+ }
+
+ xa_lock_bh(&sk->sk_user_frags);
+ for (i = 0; i < num_tokens; i++) {
+ for (j = 0; j < tokens[i].token_count; j++) {
+ if (++num_frags > MAX_DONTNEED_FRAGS)
+ goto frag_limit_reached;
+
+ netmem_ref netmem = (__force netmem_ref)__xa_erase(
+ &sk->sk_user_frags, tokens[i].token_start + j);
+
+ if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+ continue;
+
+ netmems[netmem_num++] = netmem;
+ if (netmem_num == ARRAY_SIZE(netmems)) {
+ xa_unlock_bh(&sk->sk_user_frags);
+ for (k = 0; k < netmem_num; k++)
+ WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
+ netmem_num = 0;
+ xa_lock_bh(&sk->sk_user_frags);
+ }
+ ret++;
+ }
+ }
+
+frag_limit_reached:
+ xa_unlock_bh(&sk->sk_user_frags);
+ for (k = 0; k < netmem_num; k++)
+ WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
+
+ kvfree(tokens);
+ return ret;
+}
+#endif
+
void sockopt_lock_sock(struct sock *sk)
{
/* When current->bpf_ctx is set, the setsockopt is called from
@@ -1082,6 +1155,17 @@ bool sockopt_capable(int cap)
}
EXPORT_SYMBOL(sockopt_capable);
+static int sockopt_validate_clockid(__kernel_clockid_t value)
+{
+ switch (value) {
+ case CLOCK_REALTIME:
+ case CLOCK_MONOTONIC:
+ case CLOCK_TAI:
+ return 0;
+ }
+ return -EINVAL;
+}
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
@@ -1116,9 +1200,7 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
/* handle options which do not require locking the socket. */
switch (optname) {
case SO_PRIORITY:
- if ((val >= 0 && val <= 6) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (sk_set_prio_allowed(sk, val)) {
sock_set_priority(sk, val);
return 0;
}
@@ -1199,6 +1281,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
ret = -EOPNOTSUPP;
return ret;
}
+#ifdef CONFIG_PAGE_POOL
+ case SO_DEVMEM_DONTNEED:
+ return sock_devmem_dontneed(sk, optval, optlen);
+#endif
}
sockopt_lock_sock(sk);
@@ -1214,7 +1300,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
break;
case SO_REUSEPORT:
- sk->sk_reuseport = valbool;
+ if (valbool && !sk_is_inet(sk))
+ ret = -EOPNOTSUPP;
+ else
+ sk->sk_reuseport = valbool;
break;
case SO_DONTROUTE:
sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
@@ -1433,6 +1522,10 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
break;
+ case SO_RCVPRIORITY:
+ sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
+ break;
+
case SO_RXQ_OVFL:
sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
break;
@@ -1496,6 +1589,11 @@ set_sndbuf:
ret = -EPERM;
break;
}
+
+ ret = sockopt_validate_clockid(sk_txtime.clockid);
+ if (ret)
+ break;
+
sock_valbool_flag(sk, SOCK_TXTIME, true);
sk->sk_clockid = sk_txtime.clockid;
sk->sk_txtime_deadline_mode =
@@ -1856,6 +1954,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
v.val = sock_flag(sk, SOCK_RCVMARK);
break;
+ case SO_RCVPRIORITY:
+ v.val = sock_flag(sk, SOCK_RCVPRIORITY);
+ break;
+
case SO_RXQ_OVFL:
v.val = sock_flag(sk, SOCK_RXQ_OVFL);
break;
@@ -2031,7 +2133,7 @@ static inline void sock_lock_init(struct sock *sk)
/*
* Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
- * even temporarly, because of RCU lookups. sk_node should also be left as is.
+ * even temporarily, because of RCU lookups. sk_node should also be left as is.
* We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
*/
static void sock_copy(struct sock *nsk, const struct sock *osk)
@@ -2052,8 +2154,9 @@ static void sock_copy(struct sock *nsk, const struct sock *osk)
memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
- memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
- prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+ unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
+ prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
+ /* alloc is larger than struct, see sk_prot_alloc() */);
#ifdef CONFIG_SECURITY_NETWORK
nsk->sk_security = sptr;
@@ -2143,6 +2246,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
get_net_track(net, &sk->ns_tracker, priority);
sock_inuse_add(net, 1);
} else {
+ net_passive_inc(net);
__netns_tracker_alloc(net, &sk->ns_tracker,
false, priority);
}
@@ -2167,6 +2271,7 @@ EXPORT_SYMBOL(sk_alloc);
static void __sk_destruct(struct rcu_head *head)
{
struct sock *sk = container_of(head, struct sock, sk_rcu);
+ struct net *net = sock_net(sk);
struct sk_filter *filter;
if (sk->sk_destruct)
@@ -2198,14 +2303,28 @@ static void __sk_destruct(struct rcu_head *head)
put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
- if (likely(sk->sk_net_refcnt))
- put_net_track(sock_net(sk), &sk->ns_tracker);
- else
- __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
-
+ if (likely(sk->sk_net_refcnt)) {
+ put_net_track(net, &sk->ns_tracker);
+ } else {
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ net_passive_dec(net);
+ }
sk_prot_free(sk->sk_prot_creator, sk);
}
+void sk_net_refcnt_upgrade(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+
+ WARN_ON_ONCE(sk->sk_net_refcnt);
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ net_passive_dec(net);
+ sk->sk_net_refcnt = 1;
+ get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
+}
+EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
+
void sk_destruct(struct sock *sk)
{
bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
@@ -2260,7 +2379,12 @@ static void sk_init_common(struct sock *sk)
lockdep_set_class_and_name(&sk->sk_error_queue.lock,
af_elock_keys + sk->sk_family,
af_family_elock_key_strings[sk->sk_family]);
- lockdep_set_class_and_name(&sk->sk_callback_lock,
+ if (sk->sk_kern_sock)
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
+ af_kern_callback_keys + sk->sk_family,
+ af_family_kern_clock_key_strings[sk->sk_family]);
+ else
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
}
@@ -2297,6 +2421,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
* is not properly dismantling its kernel sockets at netns
* destroy time.
*/
+ net_passive_inc(sock_net(newsk));
__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
false, priority);
}
@@ -2503,19 +2628,16 @@ void __sock_wfree(struct sk_buff *skb)
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
skb_orphan(skb);
- skb->sk = sk;
#ifdef CONFIG_INET
- if (unlikely(!sk_fullsock(sk))) {
- skb->destructor = sock_edemux;
- sock_hold(sk);
- return;
- }
+ if (unlikely(!sk_fullsock(sk)))
+ return skb_set_owner_edemux(skb, sk);
#endif
+ skb->sk = sk;
skb->destructor = sock_wfree;
skb_set_hash_from_sk(skb, sk);
/*
* We used to take a refcount on sk, but following operation
- * is enough to guarantee sk_free() wont free this sock until
+ * is enough to guarantee sk_free() won't free this sock until
* all in-flight packets are completed
*/
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
@@ -2524,13 +2646,12 @@ EXPORT_SYMBOL(skb_set_owner_w);
static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
-#ifdef CONFIG_TLS_DEVICE
/* Drivers depend on in-order delivery for crypto offload,
* partial orphan breaks out-of-order-OK logic.
*/
- if (skb->decrypted)
+ if (skb_is_decrypted(skb))
return false;
-#endif
+
return (skb->destructor == sock_wfree ||
(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}
@@ -2582,8 +2703,18 @@ EXPORT_SYMBOL(sock_efree);
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
- if (sk_is_refcounted(skb->sk))
- sock_gen_put(skb->sk);
+ struct sock *sk = skb->sk;
+
+ if (!sk_is_refcounted(sk))
+ return;
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
+ inet_reqsk(sk)->rsk_listener = NULL;
+ reqsk_free(inet_reqsk(sk));
+ return;
+ }
+
+ sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
@@ -2799,6 +2930,8 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
{
u32 tsflags;
+ BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
+
switch (cmsg->cmsg_type) {
case SO_MARK:
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
@@ -2827,10 +2960,28 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
return -EINVAL;
sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
break;
+ case SCM_TS_OPT_ID:
+ if (sk_is_tcp(sk))
+ return -EINVAL;
+ tsflags = READ_ONCE(sk->sk_tsflags);
+ if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
+ return -EINVAL;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
+ sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
+ break;
/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
case SCM_RIGHTS:
case SCM_CREDENTIALS:
break;
+ case SO_PRIORITY:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
+ return -EPERM;
+ sockc->priority = *(u32 *)CMSG_DATA(cmsg);
+ break;
default:
return -EINVAL;
}
@@ -3230,8 +3381,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
}
EXPORT_SYMBOL(sock_no_socketpair);
-int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
- bool kern)
+int sock_no_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
return -EOPNOTSUPP;
}
@@ -3326,7 +3477,7 @@ static void sock_def_error_report(struct sock *sk)
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_poll(&wq->wait, EPOLLERR);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+ sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
rcu_read_unlock();
}
@@ -3341,7 +3492,7 @@ void sock_def_readable(struct sock *sk)
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
EPOLLRDNORM | EPOLLRDBAND);
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
@@ -3361,7 +3512,7 @@ static void sock_def_write_space(struct sock *sk)
EPOLLWRNORM | EPOLLWRBAND);
/* Should agree with poll, otherwise some programs break */
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
rcu_read_unlock();
@@ -3386,7 +3537,7 @@ static void sock_def_write_space_wfree(struct sock *sk)
EPOLLWRNORM | EPOLLWRBAND);
/* Should agree with poll, otherwise some programs break */
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
}
@@ -3397,7 +3548,7 @@ static void sock_def_destruct(struct sock *sk)
void sk_send_sigurg(struct sock *sk)
{
if (sk->sk_socket && sk->sk_socket->file)
- if (send_sigurg(&sk->sk_socket->file->f_owner))
+ if (send_sigurg(sk->sk_socket->file))
sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);
@@ -3449,18 +3600,6 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
}
sk->sk_uid = uid;
- rwlock_init(&sk->sk_callback_lock);
- if (sk->sk_kern_sock)
- lockdep_set_class_and_name(
- &sk->sk_callback_lock,
- af_kern_callback_keys + sk->sk_family,
- af_family_kern_clock_key_strings[sk->sk_family]);
- else
- lockdep_set_class_and_name(
- &sk->sk_callback_lock,
- af_callback_keys + sk->sk_family,
- af_family_clock_key_strings[sk->sk_family]);
-
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
@@ -3677,7 +3816,7 @@ EXPORT_SYMBOL(sock_recv_errqueue);
*
* FIX: POSIX 1003.1g is very ambiguous here. It states that
* asynchronous errors should be reported by getsockopt. We assume
- * this means if you specify SO_ERROR (otherwise whats the point of it).
+ * this means if you specify SO_ERROR (otherwise what is the point of it).
*/
int sock_common_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
@@ -4223,3 +4362,65 @@ int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
return sock_ioctl_out(sk, cmd, arg);
}
EXPORT_SYMBOL(sk_ioctl);
+
+static int __init sock_struct_check(void)
+{
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
+ return 0;
+}
+
+core_initcall(sock_struct_check);
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index b1e29e18d1d6..a08eed9b9142 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -16,9 +16,10 @@
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
-static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
-static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
-static DEFINE_MUTEX(sock_diag_table_mutex);
+static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX];
+
+static const struct sock_diag_inet_compat __rcu *inet_rcv_compat;
+
static struct workqueue_struct *broadcast_wq;
DEFINE_COOKIE(sock_cookie);
@@ -122,6 +123,24 @@ static size_t sock_diag_nlmsg_size(void)
+ nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
}
+static const struct sock_diag_handler *sock_diag_lock_handler(int family)
+{
+ const struct sock_diag_handler *handler;
+
+ rcu_read_lock();
+ handler = rcu_dereference(sock_diag_handlers[family]);
+ if (handler && !try_module_get(handler->owner))
+ handler = NULL;
+ rcu_read_unlock();
+
+ return handler;
+}
+
+static void sock_diag_unlock_handler(const struct sock_diag_handler *handler)
+{
+ module_put(handler->owner);
+}
+
static void sock_diag_broadcast_destroy_work(struct work_struct *work)
{
struct broadcast_sk *bsk =
@@ -138,12 +157,12 @@ static void sock_diag_broadcast_destroy_work(struct work_struct *work)
if (!skb)
goto out;
- mutex_lock(&sock_diag_table_mutex);
- hndl = sock_diag_handlers[sk->sk_family];
- if (hndl && hndl->get_info)
- err = hndl->get_info(skb, sk);
- mutex_unlock(&sock_diag_table_mutex);
-
+ hndl = sock_diag_lock_handler(sk->sk_family);
+ if (hndl) {
+ if (hndl->get_info)
+ err = hndl->get_info(skb, sk);
+ sock_diag_unlock_handler(hndl);
+ }
if (!err)
nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group,
GFP_KERNEL);
@@ -166,51 +185,43 @@ void sock_diag_broadcast_destroy(struct sock *sk)
queue_work(broadcast_wq, &bsk->work);
}
-void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr)
{
- mutex_lock(&sock_diag_table_mutex);
- inet_rcv_compat = fn;
- mutex_unlock(&sock_diag_table_mutex);
+ xchg(&inet_rcv_compat, RCU_INITIALIZER(ptr));
}
EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
-void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr)
{
- mutex_lock(&sock_diag_table_mutex);
- inet_rcv_compat = NULL;
- mutex_unlock(&sock_diag_table_mutex);
+ const struct sock_diag_inet_compat *old;
+
+ old = unrcu_pointer(xchg(&inet_rcv_compat, NULL));
+ WARN_ON_ONCE(old != ptr);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
int sock_diag_register(const struct sock_diag_handler *hndl)
{
- int err = 0;
+ int family = hndl->family;
- if (hndl->family >= AF_MAX)
+ if (family >= AF_MAX)
return -EINVAL;
- mutex_lock(&sock_diag_table_mutex);
- if (sock_diag_handlers[hndl->family])
- err = -EBUSY;
- else
- sock_diag_handlers[hndl->family] = hndl;
- mutex_unlock(&sock_diag_table_mutex);
-
- return err;
+ return !cmpxchg((const struct sock_diag_handler **)
+ &sock_diag_handlers[family],
+ NULL, hndl) ? 0 : -EBUSY;
}
EXPORT_SYMBOL_GPL(sock_diag_register);
-void sock_diag_unregister(const struct sock_diag_handler *hnld)
+void sock_diag_unregister(const struct sock_diag_handler *hndl)
{
- int family = hnld->family;
+ int family = hndl->family;
if (family >= AF_MAX)
return;
- mutex_lock(&sock_diag_table_mutex);
- BUG_ON(sock_diag_handlers[family] != hnld);
- sock_diag_handlers[family] = NULL;
- mutex_unlock(&sock_diag_table_mutex);
+ xchg((const struct sock_diag_handler **)&sock_diag_handlers[family],
+ NULL);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister);
@@ -227,20 +238,20 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);
- if (sock_diag_handlers[req->sdiag_family] == NULL)
+ if (!rcu_access_pointer(sock_diag_handlers[req->sdiag_family]))
sock_load_diag_module(req->sdiag_family, 0);
- mutex_lock(&sock_diag_table_mutex);
- hndl = sock_diag_handlers[req->sdiag_family];
+ hndl = sock_diag_lock_handler(req->sdiag_family);
if (hndl == NULL)
- err = -ENOENT;
- else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
+ return -ENOENT;
+
+ if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
err = hndl->dump(skb, nlh);
else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy)
err = hndl->destroy(skb, nlh);
else
err = -EOPNOTSUPP;
- mutex_unlock(&sock_diag_table_mutex);
+ sock_diag_unlock_handler(hndl);
return err;
}
@@ -248,20 +259,27 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ const struct sock_diag_inet_compat *ptr;
int ret;
switch (nlh->nlmsg_type) {
case TCPDIAG_GETSOCK:
case DCCPDIAG_GETSOCK:
- if (inet_rcv_compat == NULL)
+
+ if (!rcu_access_pointer(inet_rcv_compat))
sock_load_diag_module(AF_INET, 0);
- mutex_lock(&sock_diag_table_mutex);
- if (inet_rcv_compat != NULL)
- ret = inet_rcv_compat(skb, nlh);
- else
- ret = -EOPNOTSUPP;
- mutex_unlock(&sock_diag_table_mutex);
+ rcu_read_lock();
+ ptr = rcu_dereference(inet_rcv_compat);
+ if (ptr && !try_module_get(ptr->owner))
+ ptr = NULL;
+ rcu_read_unlock();
+
+ ret = -EOPNOTSUPP;
+ if (ptr) {
+ ret = ptr->fn(skb, nlh);
+ module_put(ptr->owner);
+ }
return ret;
case SOCK_DIAG_BY_FAMILY:
@@ -272,13 +290,9 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
}
}
-static DEFINE_MUTEX(sock_diag_mutex);
-
static void sock_diag_rcv(struct sk_buff *skb)
{
- mutex_lock(&sock_diag_mutex);
netlink_rcv_skb(skb, &sock_diag_rcv_msg);
- mutex_unlock(&sock_diag_mutex);
}
static int sock_diag_bind(struct net *net, int group)
@@ -286,12 +300,12 @@ static int sock_diag_bind(struct net *net, int group)
switch (group) {
case SKNLGRP_INET_TCP_DESTROY:
case SKNLGRP_INET_UDP_DESTROY:
- if (!sock_diag_handlers[AF_INET])
+ if (!rcu_access_pointer(sock_diag_handlers[AF_INET]))
sock_load_diag_module(AF_INET, 0);
break;
case SKNLGRP_INET6_TCP_DESTROY:
case SKNLGRP_INET6_UDP_DESTROY:
- if (!sock_diag_handlers[AF_INET6])
+ if (!rcu_access_pointer(sock_diag_handlers[AF_INET6]))
sock_load_diag_module(AF_INET6, 0);
break;
}
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 27d733c0f65e..82a14f131d00 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -24,8 +24,16 @@ struct bpf_stab {
#define SOCK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+/* This mutex is used to
+ * - protect race between prog/link attach/detach and link prog update, and
+ * - protect race between releasing and accessing map in bpf_link.
+ * A single global mutex lock is used since it is expected contention is low.
+ */
+static DEFINE_MUTEX(sockmap_mutex);
+
static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
- struct bpf_prog *old, u32 which);
+ struct bpf_prog *old, struct bpf_link *link,
+ u32 which);
static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
@@ -59,55 +67,50 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
{
- u32 ufd = attr->target_fd;
struct bpf_map *map;
- struct fd f;
int ret;
if (attr->attach_flags || attr->replace_bpf_fd)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->target_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);
- fdput(f);
+ mutex_lock(&sockmap_mutex);
+ ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type);
+ mutex_unlock(&sockmap_mutex);
return ret;
}
int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
{
- u32 ufd = attr->target_fd;
struct bpf_prog *prog;
struct bpf_map *map;
- struct fd f;
int ret;
if (attr->attach_flags || attr->replace_bpf_fd)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->target_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
prog = bpf_prog_get(attr->attach_bpf_fd);
- if (IS_ERR(prog)) {
- ret = PTR_ERR(prog);
- goto put_map;
- }
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
if (prog->type != ptype) {
ret = -EINVAL;
goto put_prog;
}
- ret = sock_map_prog_update(map, NULL, prog, attr->attach_type);
+ mutex_lock(&sockmap_mutex);
+ ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type);
+ mutex_unlock(&sockmap_mutex);
put_prog:
bpf_prog_put(prog);
-put_map:
- fdput(f);
return ret;
}
@@ -156,6 +159,7 @@ static void sock_map_del_link(struct sock *sk,
verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
+ break;
}
}
spin_unlock_bh(&psock->link_lock);
@@ -299,7 +303,10 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
write_lock_bh(&sk->sk_callback_lock);
if (stream_parser && stream_verdict && !psock->saved_data_ready) {
- ret = sk_psock_init_strp(sk, psock);
+ if (sk_is_tcp(sk))
+ ret = sk_psock_init_strp(sk, psock);
+ else
+ ret = -EOPNOTSUPP;
if (ret) {
write_unlock_bh(&sk->sk_callback_lock);
sk_psock_put(sk, psock);
@@ -408,12 +415,11 @@ static void *sock_map_lookup_sys(struct bpf_map *map, void *key)
static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
struct sock **psk)
{
- struct sock *sk;
+ struct sock *sk = NULL;
int err = 0;
spin_lock_bh(&stab->lock);
- sk = *psk;
- if (!sk_test || sk_test == sk)
+ if (!sk_test || sk_test == *psk)
sk = xchg(psk, NULL);
if (likely(sk))
@@ -538,6 +544,9 @@ static bool sock_map_sk_state_allowed(const struct sock *sk)
return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
if (sk_is_stream_unix(sk))
return (1 << sk->sk_state) & TCPF_ESTABLISHED;
+ if (sk_is_vsock(sk) &&
+ (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET))
+ return (1 << sk->sk_state) & TCPF_ESTABLISHED;
return true;
}
@@ -644,6 +653,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
sk = __sock_map_lookup_elem(map, key);
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+ if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk))
+ return SK_DROP;
skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
return SK_PASS;
@@ -672,6 +683,8 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
return SK_DROP;
if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
return SK_DROP;
+ if (sk_is_vsock(sk))
+ return SK_DROP;
msg->flags = flags;
msg->sk_redir = sk;
@@ -1171,6 +1184,7 @@ static void sock_hash_free(struct bpf_map *map)
sock_put(elem->sk);
sock_hash_free_elem(htab, elem);
}
+ cond_resched();
}
/* wait for psock readers accessing its map link */
@@ -1245,6 +1259,8 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
sk = __sock_hash_lookup_elem(map, key);
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+ if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk))
+ return SK_DROP;
skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
return SK_PASS;
@@ -1273,6 +1289,8 @@ BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
return SK_DROP;
if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
return SK_DROP;
+ if (sk_is_vsock(sk))
+ return SK_DROP;
msg->flags = flags;
msg->sk_redir = sk;
@@ -1454,80 +1472,108 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
return NULL;
}
-static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
- u32 which)
+static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
+ struct bpf_link ***plink, u32 which)
{
struct sk_psock_progs *progs = sock_map_progs(map);
+ struct bpf_prog **cur_pprog;
+ struct bpf_link **cur_plink;
if (!progs)
return -EOPNOTSUPP;
switch (which) {
case BPF_SK_MSG_VERDICT:
- *pprog = &progs->msg_parser;
+ cur_pprog = &progs->msg_parser;
+ cur_plink = &progs->msg_parser_link;
break;
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
case BPF_SK_SKB_STREAM_PARSER:
- *pprog = &progs->stream_parser;
+ cur_pprog = &progs->stream_parser;
+ cur_plink = &progs->stream_parser_link;
break;
#endif
case BPF_SK_SKB_STREAM_VERDICT:
if (progs->skb_verdict)
return -EBUSY;
- *pprog = &progs->stream_verdict;
+ cur_pprog = &progs->stream_verdict;
+ cur_plink = &progs->stream_verdict_link;
break;
case BPF_SK_SKB_VERDICT:
if (progs->stream_verdict)
return -EBUSY;
- *pprog = &progs->skb_verdict;
+ cur_pprog = &progs->skb_verdict;
+ cur_plink = &progs->skb_verdict_link;
break;
default:
return -EOPNOTSUPP;
}
+ *pprog = cur_pprog;
+ if (plink)
+ *plink = cur_plink;
return 0;
}
+/* Handle the following four cases:
+ * prog_attach: prog != NULL, old == NULL, link == NULL
+ * prog_detach: prog == NULL, old != NULL, link == NULL
+ * link_attach: prog != NULL, old == NULL, link != NULL
+ * link_detach: prog == NULL, old != NULL, link != NULL
+ */
static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
- struct bpf_prog *old, u32 which)
+ struct bpf_prog *old, struct bpf_link *link,
+ u32 which)
{
struct bpf_prog **pprog;
+ struct bpf_link **plink;
int ret;
- ret = sock_map_prog_lookup(map, &pprog, which);
+ ret = sock_map_prog_link_lookup(map, &pprog, &plink, which);
if (ret)
return ret;
- if (old)
- return psock_replace_prog(pprog, prog, old);
+ /* for prog_attach/prog_detach/link_attach, return error if a bpf_link
+ * exists for that prog.
+ */
+ if ((!link || prog) && *plink)
+ return -EBUSY;
- psock_set_prog(pprog, prog);
- return 0;
+ if (old) {
+ ret = psock_replace_prog(pprog, prog, old);
+ if (!ret)
+ *plink = NULL;
+ } else {
+ psock_set_prog(pprog, prog);
+ if (link)
+ *plink = link;
+ }
+
+ return ret;
}
int sock_map_bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
- u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd;
+ u32 prog_cnt = 0, flags = 0;
struct bpf_prog **pprog;
struct bpf_prog *prog;
struct bpf_map *map;
- struct fd f;
u32 id = 0;
int ret;
if (attr->query.query_flags)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->target_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
rcu_read_lock();
- ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type);
+ ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type);
if (ret)
goto end;
@@ -1552,7 +1598,6 @@ end:
copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
ret = -EFAULT;
- fdput(f);
return ret;
}
@@ -1633,19 +1678,23 @@ void sock_map_close(struct sock *sk, long timeout)
lock_sock(sk);
rcu_read_lock();
- psock = sk_psock_get(sk);
- if (unlikely(!psock)) {
- rcu_read_unlock();
- release_sock(sk);
- saved_close = READ_ONCE(sk->sk_prot)->close;
- } else {
+ psock = sk_psock(sk);
+ if (likely(psock)) {
saved_close = psock->saved_close;
sock_map_remove_links(sk, psock);
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ goto no_psock;
rcu_read_unlock();
sk_psock_stop(psock);
release_sock(sk);
cancel_delayed_work_sync(&psock->work);
sk_psock_put(sk, psock);
+ } else {
+ saved_close = READ_ONCE(sk->sk_prot)->close;
+no_psock:
+ rcu_read_unlock();
+ release_sock(sk);
}
/* Make sure we do not recurse. This is a bug.
@@ -1657,6 +1706,200 @@ void sock_map_close(struct sock *sk, long timeout)
}
EXPORT_SYMBOL_GPL(sock_map_close);
+struct sockmap_link {
+ struct bpf_link link;
+ struct bpf_map *map;
+ enum bpf_attach_type attach_type;
+};
+
+static void sock_map_link_release(struct bpf_link *link)
+{
+ struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+
+ mutex_lock(&sockmap_mutex);
+ if (!sockmap_link->map)
+ goto out;
+
+ WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link,
+ sockmap_link->attach_type));
+
+ bpf_map_put_with_uref(sockmap_link->map);
+ sockmap_link->map = NULL;
+out:
+ mutex_unlock(&sockmap_mutex);
+}
+
+static int sock_map_link_detach(struct bpf_link *link)
+{
+ sock_map_link_release(link);
+ return 0;
+}
+
+static void sock_map_link_dealloc(struct bpf_link *link)
+{
+ kfree(link);
+}
+
+/* Handle the following two cases:
+ * case 1: link != NULL, prog != NULL, old != NULL
+ * case 2: link != NULL, prog != NULL, old == NULL
+ */
+static int sock_map_link_update_prog(struct bpf_link *link,
+ struct bpf_prog *prog,
+ struct bpf_prog *old)
+{
+ const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+ struct bpf_prog **pprog, *old_link_prog;
+ struct bpf_link **plink;
+ int ret = 0;
+
+ mutex_lock(&sockmap_mutex);
+
+ /* If old prog is not NULL, ensure old prog is the same as link->prog. */
+ if (old && link->prog != old) {
+ ret = -EPERM;
+ goto out;
+ }
+ /* Ensure link->prog has the same type/attach_type as the new prog. */
+ if (link->prog->type != prog->type ||
+ link->prog->expected_attach_type != prog->expected_attach_type) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!sockmap_link->map) {
+ ret = -ENOLINK;
+ goto out;
+ }
+
+ ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink,
+ sockmap_link->attach_type);
+ if (ret)
+ goto out;
+
+ /* return error if the stored bpf_link does not match the incoming bpf_link. */
+ if (link != *plink) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (old) {
+ ret = psock_replace_prog(pprog, prog, old);
+ if (ret)
+ goto out;
+ } else {
+ psock_set_prog(pprog, prog);
+ }
+
+ bpf_prog_inc(prog);
+ old_link_prog = xchg(&link->prog, prog);
+ bpf_prog_put(old_link_prog);
+
+out:
+ mutex_unlock(&sockmap_mutex);
+ return ret;
+}
+
+static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link)
+{
+ u32 map_id = 0;
+
+ mutex_lock(&sockmap_mutex);
+ if (sockmap_link->map)
+ map_id = sockmap_link->map->id;
+ mutex_unlock(&sockmap_mutex);
+ return map_id;
+}
+
+static int sock_map_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+ u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+ info->sockmap.map_id = map_id;
+ info->sockmap.attach_type = sockmap_link->attach_type;
+ return 0;
+}
+
+static void sock_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+ u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+ seq_printf(seq, "map_id:\t%u\n", map_id);
+ seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type);
+}
+
+static const struct bpf_link_ops sock_map_link_ops = {
+ .release = sock_map_link_release,
+ .dealloc = sock_map_link_dealloc,
+ .detach = sock_map_link_detach,
+ .update_prog = sock_map_link_update_prog,
+ .fill_link_info = sock_map_link_fill_info,
+ .show_fdinfo = sock_map_link_show_fdinfo,
+};
+
+int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct sockmap_link *sockmap_link;
+ enum bpf_attach_type attach_type;
+ struct bpf_map *map;
+ int ret;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ map = bpf_map_get_with_uref(attr->link_create.target_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER);
+ if (!sockmap_link) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ attach_type = attr->link_create.attach_type;
+ bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog);
+ sockmap_link->map = map;
+ sockmap_link->attach_type = attach_type;
+
+ ret = bpf_link_prime(&sockmap_link->link, &link_primer);
+ if (ret) {
+ kfree(sockmap_link);
+ goto out;
+ }
+
+ mutex_lock(&sockmap_mutex);
+ ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type);
+ mutex_unlock(&sockmap_mutex);
+ if (ret) {
+ bpf_link_cleanup(&link_primer);
+ goto out;
+ }
+
+ /* Increase refcnt for the prog since when old prog is replaced with
+ * psock_replace_prog() and psock_set_prog() its refcnt will be decreased.
+ *
+ * Actually, we do not need to increase refcnt for the prog since bpf_link
+ * will hold a reference. But in order to have less complexity w.r.t.
+ * replacing/setting prog, let us increase the refcnt to make things simpler.
+ */
+ bpf_prog_inc(prog);
+
+ return bpf_link_settle(&link_primer);
+
+out:
+ bpf_map_put_with_uref(map);
+ return ret;
+}
+
static int sock_map_iter_attach_target(struct bpf_prog *prog,
union bpf_iter_link_info *linfo,
struct bpf_iter_aux_info *aux)
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 5a165286e4d8..4211710393a8 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -173,10 +173,9 @@ static bool __reuseport_detach_closed_sock(struct sock *sk,
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
- unsigned int size = sizeof(struct sock_reuseport) +
- sizeof(struct sock *) * max_socks;
- struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);
+ struct sock_reuseport *reuse;
+ reuse = kzalloc(struct_size(reuse, socks, max_socks), GFP_ATOMIC);
if (!reuse)
return NULL;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0f0cb1465e08..c7769ee0d9c5 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -23,6 +23,9 @@
#include <net/net_ratelimit.h>
#include <net/busy_poll.h>
#include <net/pkt_sched.h>
+#include <net/hotdata.h>
+#include <net/proto_memory.h>
+#include <net/rps.h>
#include "dev.h"
@@ -30,6 +33,8 @@ static int int_3600 = 3600;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;
+static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE;
+static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ;
static int net_msg_warn; /* Unused, but still a sysctl */
@@ -47,29 +52,45 @@ int sysctl_devconf_inherit_init_net __read_mostly;
EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS)
-static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos,
- struct cpumask *mask)
+static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos,
+ struct cpumask *mask)
{
- char kbuf[128];
+ char *kbuf;
int len;
if (*ppos || !*lenp) {
*lenp = 0;
- return;
+ return 0;
+ }
+
+ /* CPUs are displayed as a hex bitmap + a comma between each groups of 8
+ * nibbles (except the last one which has a newline instead).
+ * Guesstimate the buffer size at the group granularity level.
+ */
+ len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp);
+ kbuf = kmalloc(len, GFP_KERNEL);
+ if (!kbuf) {
+ *lenp = 0;
+ return -ENOMEM;
}
- len = min(sizeof(kbuf) - 1, *lenp);
len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
if (!len) {
*lenp = 0;
- return;
+ goto free_buf;
}
- if (len < *lenp)
- kbuf[len++] = '\n';
+ /* scnprintf writes a trailing null char not counted in the returned
+ * length, override it with a newline.
+ */
+ kbuf[len++] = '\n';
memcpy(buffer, kbuf, len);
*lenp = len;
*ppos += len;
+
+free_buf:
+ kfree(kbuf);
+ return 0;
}
#endif
@@ -91,7 +112,7 @@ static struct cpumask *rps_default_mask_cow_alloc(struct net *net)
return rps_default_mask;
}
-static int rps_default_mask_sysctl(struct ctl_table *table, int write,
+static int rps_default_mask_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = (struct net *)table->data;
@@ -113,8 +134,8 @@ static int rps_default_mask_sysctl(struct ctl_table *table, int write,
if (err)
goto done;
} else {
- dump_cpumask(buffer, lenp, ppos,
- net->core.rps_default_mask ? : cpu_none_mask);
+ err = dump_cpumask(buffer, lenp, ppos,
+ net->core.rps_default_mask ? : cpu_none_mask);
}
done:
@@ -122,7 +143,7 @@ done:
return err;
}
-static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
+static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int orig_size, size;
@@ -137,7 +158,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
mutex_lock(&sock_flow_mutex);
- orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
+ orig_sock_table = rcu_dereference_protected(
+ net_hotdata.rps_sock_flow_table,
lockdep_is_held(&sock_flow_mutex));
size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
@@ -158,7 +180,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
mutex_unlock(&sock_flow_mutex);
return -ENOMEM;
}
- rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
+ net_hotdata.rps_cpu_mask =
+ roundup_pow_of_two(nr_cpu_ids) - 1;
sock_table->mask = size - 1;
} else
sock_table = orig_sock_table;
@@ -169,7 +192,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
sock_table = NULL;
if (sock_table != orig_sock_table) {
- rcu_assign_pointer(rps_sock_flow_table, sock_table);
+ rcu_assign_pointer(net_hotdata.rps_sock_flow_table,
+ sock_table);
if (sock_table) {
static_branch_inc(&rps_needed);
static_branch_inc(&rfs_needed);
@@ -191,7 +215,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_NET_FLOW_LIMIT
static DEFINE_MUTEX(flow_limit_update_mutex);
-static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
+static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct sd_flow_limit *cur;
@@ -240,7 +264,7 @@ write_unlock:
}
rcu_read_unlock();
- dump_cpumask(buffer, lenp, ppos, mask);
+ ret = dump_cpumask(buffer, lenp, ppos, mask);
}
done:
@@ -248,7 +272,7 @@ done:
return ret;
}
-static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
+static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int old, *ptr;
@@ -270,7 +294,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
#endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_SCHED
-static int set_default_qdisc(struct ctl_table *table, int write,
+static int set_default_qdisc(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
char id[IFNAMSIZ];
@@ -289,25 +313,25 @@ static int set_default_qdisc(struct ctl_table *table, int write,
}
#endif
-static int proc_do_dev_weight(struct ctl_table *table, int write,
+static int proc_do_dev_weight(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
static DEFINE_MUTEX(dev_weight_mutex);
int ret, weight;
mutex_lock(&dev_weight_mutex);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
weight = READ_ONCE(weight_p);
- WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias);
- WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias);
+ WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias);
+ WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias);
}
mutex_unlock(&dev_weight_mutex);
return ret;
}
-static int proc_do_rss_key(struct ctl_table *table, int write,
+static int proc_do_rss_key(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table fake_table;
@@ -320,7 +344,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write,
}
#ifdef CONFIG_BPF_JIT
-static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
+static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -353,7 +377,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
# ifdef CONFIG_HAVE_EBPF_JIT
static int
-proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
if (!capable(CAP_SYS_ADMIN))
@@ -364,7 +388,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
# endif /* CONFIG_HAVE_EBPF_JIT */
static int
-proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
if (!capable(CAP_SYS_ADMIN))
@@ -376,36 +400,12 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
static struct ctl_table net_core_table[] = {
{
- .procname = "wmem_max",
- .data = &sysctl_wmem_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_sndbuf,
- },
- {
- .procname = "rmem_max",
- .data = &sysctl_rmem_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_rcvbuf,
- },
- {
- .procname = "wmem_default",
- .data = &sysctl_wmem_default,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_sndbuf,
- },
- {
- .procname = "rmem_default",
- .data = &sysctl_rmem_default,
+ .procname = "mem_pcpu_rsv",
+ .data = &net_hotdata.sysctl_mem_pcpu_rsv,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &min_rcvbuf,
+ .extra1 = &min_mem_pcpu_rsv,
},
{
.procname = "dev_weight",
@@ -413,6 +413,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "dev_weight_rx_bias",
@@ -420,6 +421,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "dev_weight_tx_bias",
@@ -427,10 +429,11 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "netdev_max_backlog",
- .data = &netdev_max_backlog,
+ .data = &net_hotdata.max_backlog,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
@@ -489,7 +492,7 @@ static struct ctl_table net_core_table[] = {
#endif
{
.procname = "netdev_tstamp_prequeue",
- .data = &netdev_tstamp_prequeue,
+ .data = &net_hotdata.tstamp_prequeue,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
@@ -508,15 +511,6 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "tstamp_allow_data",
- .data = &sysctl_tstamp_allow_data,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE
- },
#ifdef CONFIG_RPS
{
.procname = "rps_sock_flow_entries",
@@ -567,7 +561,7 @@ static struct ctl_table net_core_table[] = {
#endif
{
.procname = "netdev_budget",
- .data = &netdev_budget,
+ .data = &net_hotdata.netdev_budget,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
@@ -581,7 +575,7 @@ static struct ctl_table net_core_table[] = {
},
{
.procname = "max_skb_frags",
- .data = &sysctl_max_skb_frags,
+ .data = &net_hotdata.sysctl_max_skb_frags,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -590,11 +584,11 @@ static struct ctl_table net_core_table[] = {
},
{
.procname = "netdev_budget_usecs",
- .data = &netdev_budget_usecs,
+ .data = &net_hotdata.netdev_budget_usecs,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .extra1 = &netdev_budget_usecs_min,
},
{
.procname = "fb_tunnels_only_for_init_net",
@@ -623,7 +617,7 @@ static struct ctl_table net_core_table[] = {
},
{
.procname = "gro_normal_batch",
- .data = &gro_normal_batch,
+ .data = &net_hotdata.gro_normal_batch,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -640,13 +634,12 @@ static struct ctl_table net_core_table[] = {
},
{
.procname = "skb_defer_max",
- .data = &sysctl_skb_defer_max,
+ .data = &net_hotdata.sysctl_skb_defer_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
- { }
};
static struct ctl_table netns_core_table[] = {
@@ -683,7 +676,50 @@ static struct ctl_table netns_core_table[] = {
.extra2 = SYSCTL_ONE,
.proc_handler = proc_dou8vec_minmax,
},
- { }
+ {
+ .procname = "tstamp_allow_data",
+ .data = &init_net.core.sysctl_tstamp_allow_data,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ /* sysctl_core_net_init() will set the values after this
+ * to readonly in network namespaces
+ */
+ {
+ .procname = "wmem_max",
+ .data = &sysctl_wmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem_max",
+ .data = &sysctl_rmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
+ },
+ {
+ .procname = "wmem_default",
+ .data = &sysctl_wmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem_default",
+ .data = &sysctl_rmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
+ },
};
static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str)
@@ -701,20 +737,27 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
static __net_init int sysctl_core_net_init(struct net *net)
{
- struct ctl_table *tbl, *tmp;
+ size_t table_size = ARRAY_SIZE(netns_core_table);
+ struct ctl_table *tbl;
tbl = netns_core_table;
if (!net_eq(net, &init_net)) {
+ int i;
tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
if (tbl == NULL)
goto err_dup;
- for (tmp = tbl; tmp->procname; tmp++)
- tmp->data += (char *)net - (char *)&init_net;
+ for (i = 0; i < table_size; ++i) {
+ if (tbl[i].data == &sysctl_wmem_max)
+ break;
+
+ tbl[i].data += (char *)net - (char *)&init_net;
+ }
+ for (; i < table_size; ++i)
+ tbl[i].mode &= ~0222;
}
- net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl,
- ARRAY_SIZE(netns_core_table));
+ net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size);
if (net->core.sysctl_hdr == NULL)
goto err_reg;
@@ -729,7 +772,7 @@ err_dup:
static __net_exit void sysctl_core_net_exit(struct net *net)
{
- struct ctl_table *tbl;
+ const struct ctl_table *tbl;
tbl = net->core.sysctl_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->core.sysctl_hdr);
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 04840697fe79..a50a7ef49ae8 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -9,6 +9,7 @@
#include <linux/ptp_classify.h>
#include <linux/skbuff.h>
#include <linux/export.h>
+#include <linux/ptp_clock_kernel.h>
static unsigned int classify(const struct sk_buff *skb)
{
@@ -21,18 +22,39 @@ static unsigned int classify(const struct sk_buff *skb)
void skb_clone_tx_timestamp(struct sk_buff *skb)
{
+ struct hwtstamp_provider *hwprov;
struct mii_timestamper *mii_ts;
+ struct phy_device *phydev;
struct sk_buff *clone;
unsigned int type;
- if (!skb->sk)
+ if (!skb->sk || !skb->dev)
return;
+ rcu_read_lock();
+ hwprov = rcu_dereference(skb->dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB ||
+ !hwprov->phydev) {
+ rcu_read_unlock();
+ return;
+ }
+
+ phydev = hwprov->phydev;
+ } else {
+ phydev = skb->dev->phydev;
+ if (!phy_is_default_hwtstamp(phydev)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
type = classify(skb);
if (type == PTP_CLASS_NONE)
return;
- mii_ts = skb->dev->phydev->mii_ts;
+ mii_ts = phydev->mii_ts;
if (likely(mii_ts->txtstamp)) {
clone = skb_clone_sk(skb);
if (!clone)
@@ -44,12 +66,33 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
+ struct hwtstamp_provider *hwprov;
struct mii_timestamper *mii_ts;
+ struct phy_device *phydev;
unsigned int type;
- if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts)
+ if (!skb->dev)
return false;
+ rcu_read_lock();
+ hwprov = rcu_dereference(skb->dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB ||
+ !hwprov->phydev) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ phydev = hwprov->phydev;
+ } else {
+ phydev = skb->dev->phydev;
+ if (!phy_is_default_hwtstamp(phydev)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+
if (skb_headroom(skb) < ETH_HLEN)
return false;
@@ -62,7 +105,7 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
if (type == PTP_CLASS_NONE)
return false;
- mii_ts = skb->dev->phydev->mii_ts;
+ mii_ts = phydev->mii_ts;
if (likely(mii_ts->rxtstamp))
return mii_ts->rxtstamp(mii_ts, skb, type);
diff --git a/net/core/tso.c b/net/core/tso.c
index e00796e3b146..6df997b9076e 100644
--- a/net/core/tso.c
+++ b/net/core/tso.c
@@ -3,7 +3,7 @@
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/tso.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso,
int size, bool is_last)
diff --git a/net/core/utils.c b/net/core/utils.c
index c994e95172ac..27f4cffaae05 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Generic address resultion entity
+ * Generic address resolution entity
*
* Authors:
* net_random Alan Cox
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 4869c1c2d8f3..2c6ab6fb452f 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -16,6 +16,7 @@
#include <linux/bug.h>
#include <net/page_pool/helpers.h>
+#include <net/hotdata.h>
#include <net/xdp.h>
#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
#include <trace/events/xdp.h>
@@ -75,7 +76,7 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
xa = container_of(rcu, struct xdp_mem_allocator, rcu);
/* Allow this ID to be reused */
- ida_simple_remove(&mem_id_pool, xa->mem.id);
+ ida_free(&mem_id_pool, xa->mem.id);
kfree(xa);
}
@@ -126,10 +127,8 @@ void xdp_unreg_mem_model(struct xdp_mem_info *mem)
return;
if (type == MEM_TYPE_PAGE_POOL) {
- rcu_read_lock();
- xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
page_pool_destroy(xa->page_pool);
- rcu_read_unlock();
}
}
EXPORT_SYMBOL_GPL(xdp_unreg_mem_model);
@@ -187,7 +186,6 @@ int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
xdp_rxq_info_init(xdp_rxq);
xdp_rxq->dev = dev;
xdp_rxq->queue_index = queue_index;
- xdp_rxq->napi_id = napi_id;
xdp_rxq->frag_size = frag_size;
xdp_rxq->reg_state = REG_STATE_REGISTERED;
@@ -242,7 +240,7 @@ static int __mem_id_cyclic_get(gfp_t gfp)
int id;
again:
- id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp);
+ id = ida_alloc_range(&mem_id_pool, mem_id_next, MEM_ID_MAX - 1, gfp);
if (id < 0) {
if (id == -ENOSPC) {
/* Cyclic allocator, reset next id */
@@ -294,10 +292,8 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem,
mutex_lock(&mem_id_lock);
ret = __mem_id_init_hash_table();
mutex_unlock(&mem_id_lock);
- if (ret < 0) {
- WARN_ON(1);
+ if (ret < 0)
return ERR_PTR(ret);
- }
}
xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
@@ -317,7 +313,7 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem,
/* Insert allocator into ID lookup table */
ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
if (IS_ERR(ptr)) {
- ida_simple_remove(&mem_id_pool, mem->id);
+ ida_free(&mem_id_pool, mem->id);
mem->id = 0;
errno = PTR_ERR(ptr);
goto err;
@@ -361,6 +357,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
if (IS_ERR(xdp_alloc))
return PTR_ERR(xdp_alloc);
+ if (type == MEM_TYPE_XSK_BUFF_POOL && allocator)
+ xsk_pool_set_rxq_info(allocator, xdp_rxq);
+
if (trace_mem_connect_enabled() && xdp_alloc)
trace_mem_connect(xdp_alloc, xdp_rxq);
return 0;
@@ -368,33 +367,87 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
+/**
+ * xdp_reg_page_pool - register &page_pool as a memory provider for XDP
+ * @pool: &page_pool to register
+ *
+ * Can be used to register pools manually without connecting to any XDP RxQ
+ * info, so that the XDP layer will be aware of them. Then, they can be
+ * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool().
+ *
+ * Return: %0 on success, -errno on error.
+ */
+int xdp_reg_page_pool(struct page_pool *pool)
+{
+ struct xdp_mem_info mem;
+
+ return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool);
+}
+EXPORT_SYMBOL_GPL(xdp_reg_page_pool);
+
+/**
+ * xdp_unreg_page_pool - unregister &page_pool from the memory providers list
+ * @pool: &page_pool to unregister
+ *
+ * A shorthand for manual unregistering page pools. If the pool was previously
+ * attached to an RxQ info, it must be detached first.
+ */
+void xdp_unreg_page_pool(const struct page_pool *pool)
+{
+ struct xdp_mem_info mem = {
+ .type = MEM_TYPE_PAGE_POOL,
+ .id = pool->xdp_mem_id,
+ };
+
+ xdp_unreg_mem_model(&mem);
+}
+EXPORT_SYMBOL_GPL(xdp_unreg_page_pool);
+
+/**
+ * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info
+ * @xdp_rxq: XDP RxQ info to attach the pool to
+ * @pool: pool to attach
+ *
+ * If the pool was registered manually, this function must be called instead
+ * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info.
+ */
+void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq,
+ const struct page_pool *pool)
+{
+ struct xdp_mem_info mem = {
+ .type = MEM_TYPE_PAGE_POOL,
+ .id = pool->xdp_mem_id,
+ };
+
+ xdp_rxq_info_attach_mem_model(xdp_rxq, &mem);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool);
+
/* XDP RX runs under NAPI protection, and in different delivery error
* scenarios (e.g. queue full), it is possible to return the xdp_frame
* while still leveraging this protection. The @napi_direct boolean
* is used for those calls sites. Thus, allowing for faster recycling
* of xdp_frames/pages in those cases.
*/
-void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
- struct xdp_buff *xdp)
+void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
+ bool napi_direct, struct xdp_buff *xdp)
{
- struct page *page;
-
- switch (mem->type) {
+ switch (mem_type) {
case MEM_TYPE_PAGE_POOL:
- page = virt_to_head_page(data);
+ netmem = netmem_compound_head(netmem);
if (napi_direct && xdp_return_frame_no_direct())
napi_direct = false;
/* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
* as mem->type knows this a page_pool page
*/
- page_pool_put_full_page(page->pp, page, napi_direct);
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem,
+ napi_direct);
break;
case MEM_TYPE_PAGE_SHARED:
- page_frag_free(data);
+ page_frag_free(__netmem_address(netmem));
break;
case MEM_TYPE_PAGE_ORDER0:
- page = virt_to_page(data); /* Assumes order0 page*/
- put_page(page);
+ put_page(__netmem_to_page(netmem));
break;
case MEM_TYPE_XSK_BUFF_POOL:
/* NB! Only valid from an xdp_buff! */
@@ -402,7 +455,7 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
break;
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
- WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
+ WARN(1, "Incorrect XDP memory type (%d) usage", mem_type);
break;
}
}
@@ -410,38 +463,34 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
void xdp_return_frame(struct xdp_frame *xdpf)
{
struct skb_shared_info *sinfo;
- int i;
if (likely(!xdp_frame_has_frags(xdpf)))
goto out;
sinfo = xdp_get_shared_info_from_frame(xdpf);
- for (i = 0; i < sinfo->nr_frags; i++) {
- struct page *page = skb_frag_page(&sinfo->frags[i]);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+ false, NULL);
- __xdp_return(page_address(page), &xdpf->mem, false, NULL);
- }
out:
- __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
struct skb_shared_info *sinfo;
- int i;
if (likely(!xdp_frame_has_frags(xdpf)))
goto out;
sinfo = xdp_get_shared_info_from_frame(xdpf);
- for (i = 0; i < sinfo->nr_frags; i++) {
- struct page *page = skb_frag_page(&sinfo->frags[i]);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+ true, NULL);
- __xdp_return(page_address(page), &xdpf->mem, true, NULL);
- }
out:
- __xdp_return(xdpf->data, &xdpf->mem, true, NULL);
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
@@ -455,46 +504,19 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
* xdp_frame_bulk is usually stored/allocated on the function
* call-stack to avoid locking penalties.
*/
-void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
-{
- struct xdp_mem_allocator *xa = bq->xa;
-
- if (unlikely(!xa || !bq->count))
- return;
-
- page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count);
- /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */
- bq->count = 0;
-}
-EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk);
/* Must be called with rcu_read_lock held */
void xdp_return_frame_bulk(struct xdp_frame *xdpf,
struct xdp_frame_bulk *bq)
{
- struct xdp_mem_info *mem = &xdpf->mem;
- struct xdp_mem_allocator *xa;
-
- if (mem->type != MEM_TYPE_PAGE_POOL) {
+ if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) {
xdp_return_frame(xdpf);
return;
}
- xa = bq->xa;
- if (unlikely(!xa)) {
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- bq->count = 0;
- bq->xa = xa;
- }
-
if (bq->count == XDP_BULK_QUEUE_SIZE)
xdp_flush_frame_bulk(bq);
- if (unlikely(mem->id != xa->mem.id)) {
- xdp_flush_frame_bulk(bq);
- bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- }
-
if (unlikely(xdp_frame_has_frags(xdpf))) {
struct skb_shared_info *sinfo;
int i;
@@ -503,31 +525,40 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf,
for (i = 0; i < sinfo->nr_frags; i++) {
skb_frag_t *frag = &sinfo->frags[i];
- bq->q[bq->count++] = skb_frag_address(frag);
+ bq->q[bq->count++] = skb_frag_netmem(frag);
if (bq->count == XDP_BULK_QUEUE_SIZE)
xdp_flush_frame_bulk(bq);
}
}
- bq->q[bq->count++] = xdpf->data;
+ bq->q[bq->count++] = virt_to_netmem(xdpf->data);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
+/**
+ * xdp_return_frag -- free one XDP frag or decrement its refcount
+ * @netmem: network memory reference to release
+ * @xdp: &xdp_buff to release the frag for
+ */
+void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp)
+{
+ __xdp_return(netmem, xdp->rxq->mem.type, true, NULL);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frag);
+
void xdp_return_buff(struct xdp_buff *xdp)
{
struct skb_shared_info *sinfo;
- int i;
if (likely(!xdp_buff_has_frags(xdp)))
goto out;
sinfo = xdp_get_shared_info_from_buff(xdp);
- for (i = 0; i < sinfo->nr_frags; i++) {
- struct page *page = skb_frag_page(&sinfo->frags[i]);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]),
+ xdp->rxq->mem.type, true, xdp);
- __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp);
- }
out:
- __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
+ __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp);
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
@@ -573,7 +604,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
xdpf->headroom = 0;
xdpf->metasize = metasize;
xdpf->frame_sz = PAGE_SIZE;
- xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+ xdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
xsk_buff_free(xdp);
return xdpf;
@@ -589,7 +620,7 @@ EXPORT_SYMBOL_GPL(xdp_warn);
int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp)
{
- n_skb = kmem_cache_alloc_bulk(skbuff_cache, gfp, n_skb, skbs);
+ n_skb = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, n_skb, skbs);
if (unlikely(!n_skb))
return -ENOMEM;
@@ -597,6 +628,173 @@ int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk);
+/**
+ * xdp_build_skb_from_buff - create an skb from &xdp_buff
+ * @xdp: &xdp_buff to convert to an skb
+ *
+ * Perform common operations to create a new skb to pass up the stack from
+ * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize
+ * skb data pointers and offsets, set the recycle bit if the buff is
+ * PP-backed, Rx queue index, protocol and update frags info.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp)
+{
+ const struct xdp_rxq_info *rxq = xdp->rxq;
+ const struct skb_shared_info *sinfo;
+ struct sk_buff *skb;
+ u32 nr_frags = 0;
+ int metalen;
+
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = sinfo->nr_frags;
+ }
+
+ skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ __skb_put(skb, xdp->data_end - xdp->data);
+
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen > 0)
+ skb_metadata_set(skb, metalen);
+
+ if (rxq->mem.type == MEM_TYPE_PAGE_POOL)
+ skb_mark_for_recycle(skb);
+
+ skb_record_rx_queue(skb, rxq->queue_index);
+
+ if (unlikely(nr_frags)) {
+ u32 tsize;
+
+ tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz;
+ xdp_update_skb_shared_info(skb, nr_frags,
+ sinfo->xdp_frags_size, tsize,
+ xdp_buff_is_frag_pfmemalloc(xdp));
+ }
+
+ skb->protocol = eth_type_trans(skb, rxq->dev);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff);
+
+/**
+ * xdp_copy_frags_from_zc - copy frags from XSk buff to skb
+ * @skb: skb to copy frags to
+ * @xdp: XSk &xdp_buff from which the frags will be copied
+ * @pp: &page_pool backing page allocation, if available
+ *
+ * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack.
+ * Allocate a new buffer for each frag, copy it and attach to the skb.
+ *
+ * Return: true on success, false on netmem allocation fail.
+ */
+static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
+ const struct xdp_buff *xdp,
+ struct page_pool *pp)
+{
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+ const struct skb_shared_info *xinfo;
+ u32 nr_frags, tsize = 0;
+ bool pfmemalloc = false;
+
+ xinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = xinfo->nr_frags;
+
+ for (u32 i = 0; i < nr_frags; i++) {
+ u32 len = skb_frag_size(&xinfo->frags[i]);
+ u32 offset, truesize = len;
+ netmem_ref netmem;
+
+ netmem = page_pool_dev_alloc_netmem(pp, &offset, &truesize);
+ if (unlikely(!netmem)) {
+ sinfo->nr_frags = i;
+ return false;
+ }
+
+ memcpy(__netmem_address(netmem),
+ __netmem_address(xinfo->frags[i].netmem),
+ LARGEST_ALIGN(len));
+ __skb_fill_netmem_desc_noacc(sinfo, i, netmem, offset, len);
+
+ tsize += truesize;
+ pfmemalloc |= netmem_is_pfmemalloc(netmem);
+ }
+
+ xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size,
+ tsize, pfmemalloc);
+
+ return true;
+}
+
+/**
+ * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff
+ * @xdp: source XSk buff
+ *
+ * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb
+ * head, new buffer for the head, copy the data and initialize the skb fields.
+ * If there are frags, allocate new buffers for them and copy.
+ * Buffers are allocated from the system percpu pools to try recycling them.
+ * If new skb was built successfully, @xdp is returned to XSk pool's freelist.
+ * On error, it remains untouched and the caller must take care of this.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
+{
+ struct page_pool *pp = this_cpu_read(system_page_pool);
+ const struct xdp_rxq_info *rxq = xdp->rxq;
+ u32 len = xdp->data_end - xdp->data_meta;
+ u32 truesize = xdp->frame_sz;
+ struct sk_buff *skb;
+ int metalen;
+ void *data;
+
+ if (!IS_ENABLED(CONFIG_PAGE_POOL))
+ return NULL;
+
+ data = page_pool_dev_alloc_va(pp, &truesize);
+ if (unlikely(!data))
+ return NULL;
+
+ skb = napi_build_skb(data, truesize);
+ if (unlikely(!skb)) {
+ page_pool_free_va(pp, data, true);
+ return NULL;
+ }
+
+ skb_mark_for_recycle(skb);
+ skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
+
+ memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len));
+
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen > 0) {
+ skb_metadata_set(skb, metalen);
+ __skb_pull(skb, metalen);
+ }
+
+ skb_record_rx_queue(skb, rxq->queue_index);
+
+ if (unlikely(xdp_buff_has_frags(xdp)) &&
+ unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) {
+ napi_consume_skb(skb, true);
+ return NULL;
+ }
+
+ xsk_buff_free(xdp);
+
+ skb->protocol = eth_type_trans(skb, rxq->dev);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc);
+
struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct sk_buff *skb,
struct net_device *dev)
@@ -643,7 +841,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
* - RX ring dev queue index (skb_record_rx_queue)
*/
- if (xdpf->mem.type == MEM_TYPE_PAGE_POOL)
+ if (xdpf->mem_type == MEM_TYPE_PAGE_POOL)
skb_mark_for_recycle(skb);
/* Allow SKB to reuse area used by xdp_frame */
@@ -658,7 +856,7 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
{
struct sk_buff *skb;
- skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
if (unlikely(!skb))
return NULL;
@@ -690,8 +888,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
nxdpf = addr;
nxdpf->data = addr + headroom;
nxdpf->frame_sz = PAGE_SIZE;
- nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
- nxdpf->mem.id = 0;
+ nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
return nxdpf;
}
@@ -771,11 +968,11 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
__bpf_kfunc_end_defs();
-BTF_SET8_START(xdp_metadata_kfunc_ids)
+BTF_KFUNCS_START(xdp_metadata_kfunc_ids)
#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC
-BTF_SET8_END(xdp_metadata_kfunc_ids)
+BTF_KFUNCS_END(xdp_metadata_kfunc_ids)
static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
.owner = THIS_MODULE,