Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile                       |    9
-rw-r--r--  net/core/bpf_sk_storage.c               |   53
-rw-r--r--  net/core/datagram.c                     |  178
-rw-r--r--  net/core/dev.c                          | 2380
-rw-r--r--  net/core/dev.h                          |  251
-rw-r--r--  net/core/dev_addr_lists.c               |   13
-rw-r--r--  net/core/dev_addr_lists_test.c          |   14
-rw-r--r--  net/core/dev_api.c                      |  369
-rw-r--r--  net/core/dev_ioctl.c                    |  178
-rw-r--r--  net/core/devmem.c                       |  495
-rw-r--r--  net/core/devmem.h                       |  246
-rw-r--r--  net/core/drop_monitor.c                 |   70
-rw-r--r--  net/core/dst.c                          |   31
-rw-r--r--  net/core/dst_cache.c                    |   43
-rw-r--r--  net/core/fib_notifier.c                 |    2
-rw-r--r--  net/core/fib_rules.c                    |  351
-rw-r--r--  net/core/filter.c                       |  914
-rw-r--r--  net/core/flow_dissector.c               |  162
-rw-r--r--  net/core/gen_estimator.c                |    6
-rw-r--r--  net/core/gro.c                          |  154
-rw-r--r--  net/core/hotdata.c                      |    8
-rw-r--r--  net/core/ieee8021q_helpers.c            |  242
-rw-r--r--  net/core/link_watch.c                   |   43
-rw-r--r--  net/core/lock_debug.c                   |  122
-rw-r--r--  net/core/lwt_bpf.c                      |   21
-rw-r--r--  net/core/lwtunnel.c                     |  105
-rw-r--r--  net/core/mp_dmabuf_devmem.h             |   44
-rw-r--r--  net/core/neighbour.c                    |  506
-rw-r--r--  net/core/net-procfs.c                   |   36
-rw-r--r--  net/core/net-sysfs.c                    |  488
-rw-r--r--  net/core/net-traces.c                   |    2
-rw-r--r--  net/core/net_namespace.c                |  353
-rw-r--r--  net/core/net_test.c (renamed from net/core/gso_test.c) |  129
-rw-r--r--  net/core/netdev-genl-gen.c              |   70
-rw-r--r--  net/core/netdev-genl-gen.h              |    8
-rw-r--r--  net/core/netdev-genl.c                  |  627
-rw-r--r--  net/core/netdev_rx_queue.c              |  187
-rw-r--r--  net/core/netmem_priv.h                  |   62
-rw-r--r--  net/core/netpoll.c                      |  185
-rw-r--r--  net/core/page_pool.c                    |  752
-rw-r--r--  net/core/page_pool_priv.h               |   48
-rw-r--r--  net/core/page_pool_user.c               |   46
-rw-r--r--  net/core/pktgen.c                       |  476
-rw-r--r--  net/core/rtnetlink.c                    | 1595
-rw-r--r--  net/core/scm.c                          |  144
-rw-r--r--  net/core/secure_seq.c                   |   44
-rw-r--r--  net/core/selftests.c                    |   27
-rw-r--r--  net/core/skb_fault_injection.c          |  106
-rw-r--r--  net/core/skbuff.c                       |  823
-rw-r--r--  net/core/skmsg.c                        |   88
-rw-r--r--  net/core/sock.c                         |  405
-rw-r--r--  net/core/sock_diag.c                    |   10
-rw-r--r--  net/core/sock_map.c                     |  329
-rw-r--r--  net/core/sock_reuseport.c               |    5
-rw-r--r--  net/core/sysctl_net_core.c              |  185
-rw-r--r--  net/core/timestamping.c                 |   51
-rw-r--r--  net/core/tso.c                          |    2
-rw-r--r--  net/core/utils.c                        |   14
-rw-r--r--  net/core/xdp.c                          |  384
59 files changed, 10599 insertions, 4092 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 6e6548011fae..b2a76ce33932 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,7 +9,7 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
-obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
+obj-y += dev.o dev_api.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
fib_notifier.o xdp.o flow_offload.o gro.o \
@@ -19,6 +19,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
obj-y += hotdata.o
+obj-y += netdev_rx_queue.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
@@ -26,6 +27,7 @@ obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o
obj-$(CONFIG_NET_SELFTESTS) += selftests.o
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
@@ -41,4 +43,7 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
obj-$(CONFIG_OF) += of_net.o
-obj-$(CONFIG_NET_TEST) += gso_test.o
+obj-$(CONFIG_NET_TEST) += net_test.o
+obj-$(CONFIG_NET_DEVMEM) += devmem.o
+obj-$(CONFIG_DEBUG_NET) += lock_debug.o
+obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 6c4d90b24d46..2e538399757f 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -50,15 +50,16 @@ void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_local_storage *sk_storage;
+ migrate_disable();
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
- if (!sk_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!sk_storage)
+ goto out;
bpf_local_storage_destroy(sk_storage);
+out:
rcu_read_unlock();
+ migrate_enable();
}
static void bpf_sk_storage_map_free(struct bpf_map *map)
@@ -106,7 +107,7 @@ static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
if (sock) {
sdata = bpf_local_storage_update(
sock->sk, (struct bpf_local_storage_map *)map, value,
- map_flags, GFP_ATOMIC);
+ map_flags, false, GFP_ATOMIC);
sockfd_put(sock);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -137,7 +138,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
{
struct bpf_local_storage_elem *copy_selem;
- copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC);
+ copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC);
if (!copy_selem)
return NULL;
@@ -160,6 +161,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
+ migrate_disable();
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
@@ -212,6 +214,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
out:
rcu_read_unlock();
+ migrate_enable();
/* In case of an error, don't free anything explicitly here, the
* caller is responsible to call bpf_sk_storage_free.
@@ -243,7 +246,7 @@ BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
refcount_inc_not_zero(&sk->sk_refcnt)) {
sdata = bpf_local_storage_update(
sk, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, gfp_flags);
+ BPF_NOEXIST, false, gfp_flags);
/* sk must be a fullsock (guaranteed by verifier),
* so sock_gen_put() is unnecessary.
*/
@@ -352,11 +355,6 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
{
- const struct btf *btf_vmlinux;
- const struct btf_type *t;
- const char *tname;
- u32 btf_id;
-
if (prog->aux->dst_prog)
return false;
@@ -371,13 +369,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
return true;
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
- btf_vmlinux = bpf_get_btf_vmlinux();
- if (IS_ERR_OR_NULL(btf_vmlinux))
- return false;
- btf_id = prog->aux->attach_btf_id;
- t = btf_type_by_id(btf_vmlinux, btf_id);
- tname = btf_name_by_offset(btf_vmlinux, t->name_off);
- return !!strncmp(tname, "bpf_sk_storage",
+ return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage",
strlen("bpf_sk_storage"));
default:
return false;
@@ -496,27 +488,22 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
if (!bpf_capable())
return ERR_PTR(-EPERM);
- nla_for_each_nested(nla, nla_stgs, rem) {
- if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) {
- if (nla_len(nla) != sizeof(u32))
- return ERR_PTR(-EINVAL);
- nr_maps++;
- }
+ nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
+ nla_stgs, rem) {
+ if (nla_len(nla) != sizeof(u32))
+ return ERR_PTR(-EINVAL);
+ nr_maps++;
}
diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL);
if (!diag)
return ERR_PTR(-ENOMEM);
- nla_for_each_nested(nla, nla_stgs, rem) {
- struct bpf_map *map;
- int map_fd;
-
- if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
- continue;
+ nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
+ nla_stgs, rem) {
+ int map_fd = nla_get_u32(nla);
+ struct bpf_map *map = bpf_map_get(map_fd);
- map_fd = nla_get_u32(nla);
- map = bpf_map_get(map_fd);
if (IS_ERR(map)) {
err = PTR_ERR(map);
goto err_free;
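
The last hunk above replaces the open-coded nla_type() filter with nla_for_each_nested_type(), which visits only nested attributes of the requested type and keeps the nla_len(nla) != sizeof(u32) length check in one place. Below is a minimal userspace sketch of that filtered type/length/value walk; the struct and function names (tlv, tlv_walk_type) are purely illustrative and are not the kernel netlink API.

/*
 * Sketch of a filtered TLV walk: iterate a packed type/length/value
 * buffer and visit only entries of one type, rejecting truncated or
 * trailing data (roughly what -EINVAL covers in the hunk above).
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct tlv {
	uint16_t type;
	uint16_t len;	/* payload length, excludes this header */
};

static int tlv_walk_type(const void *buf, size_t size, uint16_t want,
			 void (*cb)(const void *payload, uint16_t len))
{
	const unsigned char *p = buf;
	struct tlv hdr;

	while (size >= sizeof(hdr)) {
		memcpy(&hdr, p, sizeof(hdr));
		if (size < sizeof(hdr) + hdr.len)
			return -1;		/* truncated attribute */
		if (hdr.type == want)
			cb(p + sizeof(hdr), hdr.len);
		p += sizeof(hdr) + hdr.len;
		size -= sizeof(hdr) + hdr.len;
	}
	return size ? -1 : 0;			/* trailing garbage */
}

static void print_u32(const void *payload, uint16_t len)
{
	uint32_t v;

	if (len != sizeof(v))	/* mirrors the nla_len() != sizeof(u32) check */
		return;
	memcpy(&v, payload, sizeof(v));
	printf("map fd attribute: %u\n", v);
}

int main(void)
{
	unsigned char buf[64];
	struct tlv t = { .type = 1, .len = 4 };
	uint32_t fd = 3;
	size_t off = 0;

	memcpy(buf + off, &t, sizeof(t));   off += sizeof(t);
	memcpy(buf + off, &fd, sizeof(fd)); off += sizeof(fd);
	t.type = 2;			/* different type: must be skipped */
	memcpy(buf + off, &t, sizeof(t));   off += sizeof(t);
	memcpy(buf + off, &fd, sizeof(fd)); off += sizeof(fd);

	return tlv_walk_type(buf, off, 1, print_u32);
}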
diff --git a/net/core/datagram.c b/net/core/datagram.c
index a8b625abe242..94cc4705e91d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -52,6 +52,7 @@
#include <linux/pagemap.h>
#include <linux/iov_iter.h>
#include <linux/indirect_call_wrapper.h>
+#include <linux/crc32.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -61,7 +62,8 @@
#include <net/tcp_states.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
-#include <crypto/hash.h>
+
+#include "devmem.h"
/*
* Is a socket 'connection oriented' ?
@@ -163,8 +165,7 @@ done:
return skb;
}
-struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
- struct sk_buff_head *queue,
+struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue,
unsigned int flags,
int *off, int *err,
struct sk_buff **last)
@@ -261,7 +262,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
* However, this function was correct in any case. 8)
*/
spin_lock_irqsave(&queue->lock, cpu_flags);
- skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error,
+ skb = __skb_try_recv_from_queue(queue, flags, off, &error,
last);
spin_unlock_irqrestore(&queue->lock, cpu_flags);
if (error)
@@ -324,25 +325,6 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(skb_free_datagram);
-void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
-{
- bool slow;
-
- if (!skb_unref(skb)) {
- sk_peek_offset_bwd(sk, len);
- return;
- }
-
- slow = lock_sock_fast(sk);
- sk_peek_offset_bwd(sk, len);
- skb_orphan(skb);
- unlock_sock_fast(sk, slow);
-
- /* skb is now orphaned, can be freed outside of locked section */
- __kfree_skb(skb);
-}
-EXPORT_SYMBOL(__skb_free_datagram_locked);
-
int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
struct sk_buff *skb, unsigned int flags,
void (*destructor)(struct sock *sk,
@@ -426,6 +408,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
return 0;
}
+ if (!skb_frags_readable(skb))
+ goto short_copy;
+
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
@@ -435,15 +420,23 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
- struct page *page = skb_frag_page(frag);
- u8 *vaddr = kmap(page);
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
if (copy > len)
copy = len;
- n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
- vaddr + skb_frag_off(frag) + offset - start,
- copy, data, to);
- kunmap(page);
+
+ n = 0;
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_local_page(p);
+ n += INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ vaddr + p_off, p_len, data, to);
+ kunmap_local(vaddr);
+ }
+
offset += n;
if (n != copy)
goto short_copy;
@@ -490,41 +483,37 @@ short_copy:
return 0;
}
-static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
- struct iov_iter *i)
+#ifdef CONFIG_NET_CRC32C
+static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes,
+ void *_crcp, struct iov_iter *i)
{
-#ifdef CONFIG_CRYPTO_HASH
- struct ahash_request *hash = hashp;
- struct scatterlist sg;
+ u32 *crcp = _crcp;
size_t copied;
copied = copy_to_iter(addr, bytes, i);
- sg_init_one(&sg, addr, copied);
- ahash_request_set_crypt(hash, &sg, NULL, copied);
- crypto_ahash_update(hash);
+ *crcp = crc32c(*crcp, addr, copied);
return copied;
-#else
- return 0;
-#endif
}
/**
- * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator
- * and update a hash.
+ * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator
+ * and update a CRC32C value.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
* @to: iovec iterator to copy to
* @len: amount of data to copy from buffer to iovec
- * @hash: hash request to update
+ * @crcp: pointer to CRC32C value to update
+ *
+ * Return: 0 on success, -EFAULT if there was a fault during copy.
*/
-int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
- struct iov_iter *to, int len,
- struct ahash_request *hash)
+int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len, u32 *crcp)
{
return __skb_datagram_iter(skb, offset, to, len, true,
- hash_and_copy_to_iter, hash);
+ crc32c_and_copy_to_iter, crcp);
}
-EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter);
+EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter);
+#endif /* CONFIG_NET_CRC32C */
static size_t simple_copy_to_iter(const void *addr, size_t bytes,
void *data __always_unused, struct iov_iter *i)
@@ -629,16 +618,13 @@ fault:
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
-int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
- struct sk_buff *skb, struct iov_iter *from,
- size_t length)
+int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
+ struct iov_iter *from, size_t length)
{
- int frag;
-
- if (msg && msg->msg_ubuf && msg->sg_from_iter)
- return msg->sg_from_iter(sk, skb, from, length);
+ int frag = skb_shinfo(skb)->nr_frags;
- frag = skb_shinfo(skb)->nr_frags;
+ if (!skb_frags_readable(skb))
+ return -EFAULT;
while (length && iov_iter_count(from)) {
struct page *head, *last_head = NULL;
@@ -646,7 +632,6 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
int refs, order, n = 0;
size_t start;
ssize_t copied;
- unsigned long truesize;
if (frag == MAX_SKB_FRAGS)
return -EMSGSIZE;
@@ -658,17 +643,9 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
length -= copied;
- truesize = PAGE_ALIGN(copied + start);
skb->data_len += copied;
skb->len += copied;
- skb->truesize += truesize;
- if (sk && sk->sk_type == SOCK_STREAM) {
- sk_wmem_queued_add(sk, truesize);
- if (!skb_zcopy_pure(skb))
- sk_mem_charge(sk, truesize);
- } else {
- refcount_add(truesize, &skb->sk->sk_wmem_alloc);
- }
+ skb->truesize += PAGE_ALIGN(copied + start);
head = compound_head(pages[n]);
order = compound_order(head);
@@ -711,6 +688,73 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
}
return 0;
}
+
+static int
+zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
+ int length,
+ struct net_devmem_dmabuf_binding *binding)
+{
+ int i = skb_shinfo(skb)->nr_frags;
+ size_t virt_addr, size, off;
+ struct net_iov *niov;
+
+ /* Devmem filling works by taking an IOVEC from the user where the
+ * iov_addrs are interpreted as an offset in bytes into the dma-buf to
+ * send from. We do not support other iter types.
+ */
+ if (iov_iter_type(from) != ITER_IOVEC &&
+ iov_iter_type(from) != ITER_UBUF)
+ return -EFAULT;
+
+ while (length && iov_iter_count(from)) {
+ if (i == MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+
+ virt_addr = (size_t)iter_iov_addr(from);
+ niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
+ if (!niov)
+ return -EFAULT;
+
+ size = min_t(size_t, size, length);
+ size = min_t(size_t, size, iter_iov_len(from));
+
+ get_netmem(net_iov_to_netmem(niov));
+ skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
+ size, PAGE_SIZE);
+ iov_iter_advance(from, size);
+ length -= size;
+ i++;
+ }
+
+ return 0;
+}
+
+int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, struct iov_iter *from,
+ size_t length,
+ struct net_devmem_dmabuf_binding *binding)
+{
+ unsigned long orig_size = skb->truesize;
+ unsigned long truesize;
+ int ret;
+
+ if (msg && msg->msg_ubuf && msg->sg_from_iter)
+ ret = msg->sg_from_iter(skb, from, length);
+ else if (binding)
+ ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
+ else
+ ret = zerocopy_fill_skb_from_iter(skb, from, length);
+
+ truesize = skb->truesize - orig_size;
+ if (sk && sk->sk_type == SOCK_STREAM) {
+ sk_wmem_queued_add(sk, truesize);
+ if (!skb_zcopy_pure(skb))
+ sk_mem_charge(sk, truesize);
+ } else {
+ refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+ }
+ return ret;
+}
EXPORT_SYMBOL(__zerocopy_sg_from_iter);
/**
@@ -731,7 +775,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
if (skb_copy_datagram_from_iter(skb, 0, from, copy))
return -EFAULT;
- return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
+ return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
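
The datagram.c hunks above drop the crypto_ahash-based helper and instead fold each copied chunk into a running CRC-32C (crc32c_and_copy_to_iter(), gated by CONFIG_NET_CRC32C). The following is a minimal userspace sketch of that copy-and-accumulate pattern, assuming a plain bitwise CRC-32C (reflected Castagnoli polynomial 0x82F63B78) in place of the kernel's crc32c() helper.

/*
 * Userspace sketch of "copy a chunk, then fold it into a running
 * CRC-32C", the pattern used by crc32c_and_copy_to_iter() above.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32c_update(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1));
	}
	return crc;
}

/* Copy @len bytes to @dst in small chunks, updating *@crcp as we go. */
static size_t copy_and_crc32c(void *dst, const void *src, size_t len,
			      uint32_t *crcp)
{
	size_t done = 0;

	while (done < len) {
		size_t chunk = len - done > 16 ? 16 : len - done;

		memcpy((char *)dst + done, (const char *)src + done, chunk);
		*crcp = crc32c_update(*crcp, (const char *)src + done, chunk);
		done += chunk;
	}
	return done;
}

int main(void)
{
	const char msg[] = "123456789";
	char out[sizeof(msg)];
	uint32_t crc = 0xFFFFFFFFu;

	copy_and_crc32c(out, msg, strlen(msg), &crc);
	/* Standard CRC-32C check value for "123456789" is 0xE3069283. */
	printf("crc32c=%08x ok=%d\n", ~crc, ~crc == 0xE3069283u);
	return 0;
}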
diff --git a/net/core/dev.c b/net/core/dev.c
index 9a67003e49db..be97c440ecd5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -77,7 +77,9 @@
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/isolation.h>
#include <linux/sched/mm.h>
+#include <linux/smpboot.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
@@ -90,6 +92,7 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h>
@@ -103,6 +106,7 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/gro.h>
+#include <net/netdev_queues.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
@@ -152,12 +156,16 @@
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
+#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
#include <net/rps.h>
+#include <linux/phy_link_topology.h>
#include "dev.h"
+#include "devmem.h"
#include "net-sysfs.h"
static DEFINE_SPINLOCK(ptype_lock);
@@ -176,8 +184,6 @@ static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
-static DECLARE_RWSEM(devnet_rename_sem);
-
static inline void dev_base_seq_inc(struct net *net)
{
unsigned int val = net->dev_base_seq + 1;
@@ -197,37 +203,62 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}
-static inline void rps_lock_irqsave(struct softnet_data *sd,
- unsigned long *flags)
+#ifndef CONFIG_PREEMPT_RT
+
+static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
+
+static int __init setup_backlog_napi_threads(char *arg)
+{
+ static_branch_enable(&use_backlog_threads_key);
+ return 0;
+}
+early_param("thread_backlog_napi", setup_backlog_napi_threads);
+
+static bool use_backlog_threads(void)
{
- if (IS_ENABLED(CONFIG_RPS))
+ return static_branch_unlikely(&use_backlog_threads_key);
+}
+
+#else
+
+static bool use_backlog_threads(void)
+{
+ return true;
+}
+
+#endif
+
+static inline void backlog_lock_irq_save(struct softnet_data *sd,
+ unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
- else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ else
local_irq_save(*flags);
}
-static inline void rps_lock_irq_disable(struct softnet_data *sd)
+static inline void backlog_lock_irq_disable(struct softnet_data *sd)
{
- if (IS_ENABLED(CONFIG_RPS))
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_lock_irq(&sd->input_pkt_queue.lock);
- else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ else
local_irq_disable();
}
-static inline void rps_unlock_irq_restore(struct softnet_data *sd,
- unsigned long *flags)
+static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
+ unsigned long *flags)
{
- if (IS_ENABLED(CONFIG_RPS))
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
- else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ else
local_irq_restore(*flags);
}
-static inline void rps_unlock_irq_enable(struct softnet_data *sd)
+static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
{
- if (IS_ENABLED(CONFIG_RPS))
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
spin_unlock_irq(&sd->input_pkt_queue.lock);
- else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ else
local_irq_enable();
}
@@ -422,14 +453,18 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
* queue in the local softnet handler.
*/
-DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
+DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
+ .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
+};
EXPORT_PER_CPU_SYMBOL(softnet_data);
/* Page_pool has a lockless array/stack to alloc/recycle pages.
* PP consumers must pay attention to run APIs in the appropriate context
* (e.g. NAPI context).
*/
-static DEFINE_PER_CPU_ALIGNED(struct page_pool *, system_page_pool);
+DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
#ifdef CONFIG_LOCKDEP
/*
@@ -539,10 +574,18 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
- if (pt->type == htons(ETH_P_ALL))
- return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
- else
- return pt->dev ? &pt->dev->ptype_specific :
+ if (pt->type == htons(ETH_P_ALL)) {
+ if (!pt->af_packet_net && !pt->dev)
+ return NULL;
+
+ return pt->dev ? &pt->dev->ptype_all :
+ &pt->af_packet_net->ptype_all;
+ }
+
+ if (pt->dev)
+ return &pt->dev->ptype_specific;
+
+ return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
&ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
@@ -563,6 +606,9 @@ void dev_add_pack(struct packet_type *pt)
{
struct list_head *head = ptype_head(pt);
+ if (WARN_ON_ONCE(!head))
+ return;
+
spin_lock(&ptype_lock);
list_add_rcu(&pt->list, head);
spin_unlock(&ptype_lock);
@@ -587,6 +633,9 @@ void __dev_remove_pack(struct packet_type *pt)
struct list_head *head = ptype_head(pt);
struct packet_type *pt1;
+ if (!head)
+ return;
+
spin_lock(&ptype_lock);
list_for_each_entry(pt1, head, list) {
@@ -722,6 +771,80 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
}
EXPORT_SYMBOL_GPL(dev_fill_forward_path);
+/* must be called under rcu_read_lock(), as we don't take a reference */
+static struct napi_struct *napi_by_id(unsigned int napi_id)
+{
+ unsigned int hash = napi_id % HASH_SIZE(napi_hash);
+ struct napi_struct *napi;
+
+ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
+ if (napi->napi_id == napi_id)
+ return napi;
+
+ return NULL;
+}
+
+/* must be called under rcu_read_lock(), as we don't take a reference */
+static struct napi_struct *
+netdev_napi_by_id(struct net *net, unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ napi = napi_by_id(napi_id);
+ if (!napi)
+ return NULL;
+
+ if (WARN_ON_ONCE(!napi->dev))
+ return NULL;
+ if (!net_eq(net, dev_net(napi->dev)))
+ return NULL;
+
+ return napi;
+}
+
+/**
+ * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it
+ * @net: the applicable net namespace
+ * @napi_id: ID of a NAPI of a target device
+ *
+ * Find a NAPI instance with @napi_id. Lock its device.
+ * The device must be in %NETREG_REGISTERED state for lookup to succeed.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to NAPI, its device with lock held, NULL if not found.
+ */
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
+{
+ struct napi_struct *napi;
+ struct net_device *dev;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ dev = napi->dev;
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev, net);
+ if (!dev)
+ return NULL;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (napi && napi->dev != dev)
+ napi = NULL;
+ rcu_read_unlock();
+
+ if (!napi)
+ netdev_unlock(dev);
+ return napi;
+}
+
/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
@@ -896,21 +1019,150 @@ EXPORT_SYMBOL(netdev_get_by_index);
* its reference counter increased so the caller must be careful
* about locking. The caller must hold RCU lock.
*/
-
struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
struct napi_struct *napi;
WARN_ON_ONCE(!rcu_read_lock_held());
- if (napi_id < MIN_NAPI_ID)
+ if (!napi_id_valid(napi_id))
return NULL;
napi = napi_by_id(napi_id);
return napi ? napi->dev : NULL;
}
-EXPORT_SYMBOL(dev_get_by_napi_id);
+
+/* Release the held reference on the net_device, and if the net_device
+ * is still registered try to lock the instance lock. If device is being
+ * unregistered NULL will be returned (but the reference has been released,
+ * either way!)
+ *
+ * This helper is intended for locking net_device after it has been looked up
+ * using a lockless lookup helper. Lock prevents the instance from going away.
+ */
+struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net)
+{
+ netdev_lock(dev);
+ if (dev->reg_state > NETREG_REGISTERED ||
+ dev->moving_ns || !net_eq(dev_net(dev), net)) {
+ netdev_unlock(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+static struct net_device *
+__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net)
+{
+ netdev_lock_ops_compat(dev);
+ if (dev->reg_state > NETREG_REGISTERED ||
+ dev->moving_ns || !net_eq(dev_net(dev), net)) {
+ netdev_unlock_ops_compat(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+/**
+ * netdev_get_by_index_lock() - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. If a valid device
+ * with @ifindex is found it will be returned with netdev->lock held.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to a device with lock held, NULL if not found.
+ */
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock(dev, net);
+}
+
+struct net_device *
+netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock_ops_compat(dev, net);
+}
+
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev, net);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
+
+struct net_device *
+netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock_ops_compat(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock_ops_compat(dev, net);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
+
+static DEFINE_SEQLOCK(netdev_rename_lock);
+
+void netdev_copy_name(struct net_device *dev, char *name)
+{
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&netdev_rename_lock);
+ strscpy(name, dev->name, IFNAMSIZ);
+ } while (read_seqretry(&netdev_rename_lock, seq));
+}
/**
* netdev_get_name - get a netdevice name, knowing its ifindex.
@@ -923,7 +1175,6 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
struct net_device *dev;
int ret;
- down_read(&devnet_rename_sem);
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
@@ -932,15 +1183,20 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
goto out;
}
- strcpy(name, dev->name);
+ netdev_copy_name(dev, name);
ret = 0;
out:
rcu_read_unlock();
- up_read(&devnet_rename_sem);
return ret;
}
+static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
+ const char *ha)
+{
+ return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
+}
+
/**
* dev_getbyhwaddr_rcu - find a device by its hardware address
* @net: the applicable net namespace
@@ -949,7 +1205,7 @@ out:
*
* Search for an interface by MAC address. Returns NULL if the device
* is not found or a pointer to the device.
- * The caller must hold RCU or RTNL.
+ * The caller must hold RCU.
* The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
@@ -961,14 +1217,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
struct net_device *dev;
for_each_netdev_rcu(net, dev)
- if (dev->type == type &&
- !memcmp(dev->dev_addr, ha, dev->addr_len))
+ if (dev_addr_cmp(dev, type, ha))
return dev;
return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
+/**
+ * dev_getbyhwaddr() - find a device by its hardware address
+ * @net: the applicable net namespace
+ * @type: media type of device
+ * @ha: hardware address
+ *
+ * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold
+ * rtnl_lock.
+ *
+ * Context: rtnl_lock() must be held.
+ * Return: pointer to the net_device, or NULL if not found
+ */
+struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
+ const char *ha)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+ for_each_netdev(net, dev)
+ if (dev_addr_cmp(dev, type, ha))
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(dev_getbyhwaddr);
+
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
struct net_device *dev, *ret = NULL;
@@ -1159,41 +1440,27 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev,
return ret < 0 ? ret : 0;
}
-/**
- * dev_change_name - change name of a device
- * @dev: device
- * @newname: name (or format string) must be at least IFNAMSIZ
- *
- * Change name of a device, can pass format strings "eth%d".
- * for wildcarding.
- */
-int dev_change_name(struct net_device *dev, const char *newname)
+int netif_change_name(struct net_device *dev, const char *newname)
{
+ struct net *net = dev_net(dev);
unsigned char old_assign_type;
char oldname[IFNAMSIZ];
int err = 0;
int ret;
- struct net *net;
-
- ASSERT_RTNL();
- BUG_ON(!dev_net(dev));
-
- net = dev_net(dev);
- down_write(&devnet_rename_sem);
+ ASSERT_RTNL_NET(net);
- if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
- up_write(&devnet_rename_sem);
+ if (!strncmp(newname, dev->name, IFNAMSIZ))
return 0;
- }
memcpy(oldname, dev->name, IFNAMSIZ);
+ write_seqlock_bh(&netdev_rename_lock);
err = dev_get_valid_name(net, dev, newname);
- if (err < 0) {
- up_write(&devnet_rename_sem);
+ write_sequnlock_bh(&netdev_rename_lock);
+
+ if (err < 0)
return err;
- }
if (oldname[0] && !strchr(oldname, '%'))
netdev_info(dev, "renamed from %s%s\n", oldname,
@@ -1205,14 +1472,13 @@ int dev_change_name(struct net_device *dev, const char *newname)
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
+ write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
WRITE_ONCE(dev->name_assign_type, old_assign_type);
- up_write(&devnet_rename_sem);
return ret;
}
- up_write(&devnet_rename_sem);
-
netdev_adjacent_rename_links(dev, oldname);
netdev_name_node_del(dev->name_node);
@@ -1228,8 +1494,9 @@ rollback:
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
- down_write(&devnet_rename_sem);
+ write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
memcpy(oldname, newname, IFNAMSIZ);
WRITE_ONCE(dev->name_assign_type, old_assign_type);
old_assign_type = NET_NAME_RENAMED;
@@ -1243,15 +1510,7 @@ rollback:
return err;
}
-/**
- * dev_set_alias - change ifalias of a device
- * @dev: device
- * @alias: name up to IFALIASZ
- * @len: limit of bytes to copy from info
- *
- * Set ifalias for a device,
- */
-int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+int netif_set_alias(struct net_device *dev, const char *alias, size_t len)
{
struct dev_ifalias *new_alias = NULL;
@@ -1277,7 +1536,6 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
return len;
}
-EXPORT_SYMBOL(dev_set_alias);
/**
* dev_get_alias - get ifalias of a device
@@ -1314,16 +1572,10 @@ void netdev_features_change(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_features_change);
-/**
- * netdev_state_change - device changes state
- * @dev: device to cause notification
- *
- * Called to indicate a device has changed state. This function calls
- * the notifier chains for netdev_chain and sends a NEWLINK message
- * to the routing socket.
- */
-void netdev_state_change(struct net_device *dev)
+void netif_state_change(struct net_device *dev)
{
+ netdev_ops_assert_locked_or_invisible(dev);
+
if (dev->flags & IFF_UP) {
struct netdev_notifier_change_info change_info = {
.info.dev = dev,
@@ -1334,7 +1586,6 @@ void netdev_state_change(struct net_device *dev)
rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
}
}
-EXPORT_SYMBOL(netdev_state_change);
/**
* __netdev_notify_peers - notify network peers about existence of @dev,
@@ -1423,6 +1674,8 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
set_bit(__LINK_STATE_START, &dev->state);
+ netdev_ops_assert_locked(dev);
+
if (ops->ndo_validate_addr)
ret = ops->ndo_validate_addr(dev);
@@ -1434,7 +1687,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
else {
- dev->flags |= IFF_UP;
+ netif_set_up(dev, true);
dev_set_rx_mode(dev);
dev_activate(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
@@ -1443,20 +1696,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
return ret;
}
-/**
- * dev_open - prepare an interface for use.
- * @dev: device to open
- * @extack: netlink extended ack
- *
- * Takes a device from down to up state. The device's private open
- * function is invoked and then the multicast lists are loaded. Finally
- * the device is moved into the up state and a %NETDEV_UP message is
- * sent to the netdev notifier chain.
- *
- * Calling this function on an active interface is a nop. On a failure
- * a negative errno code is returned.
- */
-int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
+int netif_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
int ret;
@@ -1472,7 +1712,6 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
return ret;
}
-EXPORT_SYMBOL(dev_open);
static void __dev_close_many(struct list_head *head)
{
@@ -1510,10 +1749,13 @@ static void __dev_close_many(struct list_head *head)
* We allow it to be called even after a DETACH hot-plug
* event.
*/
+
+ netdev_ops_assert_locked(dev);
+
if (ops->ndo_stop)
ops->ndo_stop(dev);
- dev->flags &= ~IFF_UP;
+ netif_set_up(dev, false);
netpoll_poll_enable(dev);
}
}
@@ -1547,16 +1789,7 @@ void dev_close_many(struct list_head *head, bool unlink)
}
EXPORT_SYMBOL(dev_close_many);
-/**
- * dev_close - shutdown an interface.
- * @dev: device to shutdown
- *
- * This function moves an active device into down state. A
- * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
- * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
- * chain.
- */
-void dev_close(struct net_device *dev)
+void netif_close(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
LIST_HEAD(single);
@@ -1566,18 +1799,9 @@ void dev_close(struct net_device *dev)
list_del(&single);
}
}
-EXPORT_SYMBOL(dev_close);
-
+EXPORT_SYMBOL(netif_close);
-/**
- * dev_disable_lro - disable Large Receive Offload on a device
- * @dev: device
- *
- * Disable Large Receive Offload (LRO) on a net device. Must be
- * called under RTNL. This is needed if received packets may be
- * forwarded to another interface.
- */
-void dev_disable_lro(struct net_device *dev)
+void netif_disable_lro(struct net_device *dev)
{
struct net_device *lower_dev;
struct list_head *iter;
@@ -1588,10 +1812,13 @@ void dev_disable_lro(struct net_device *dev)
if (unlikely(dev->features & NETIF_F_LRO))
netdev_WARN(dev, "failed to disable LRO!\n");
- netdev_for_each_lower_dev(dev, lower_dev, iter)
- dev_disable_lro(lower_dev);
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ netdev_lock_ops(lower_dev);
+ netif_disable_lro(lower_dev);
+ netdev_unlock_ops(lower_dev);
+ }
}
-EXPORT_SYMBOL(dev_disable_lro);
+EXPORT_IPV6_MOD(netif_disable_lro);
/**
* dev_disable_gro_hw - disable HW Generic Receive Offload on a device
@@ -1679,7 +1906,9 @@ static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
int err;
for_each_netdev(net, dev) {
+ netdev_lock_ops(dev);
err = call_netdevice_register_notifiers(nb, dev);
+ netdev_unlock_ops(dev);
if (err)
goto rollback;
}
@@ -1723,14 +1952,19 @@ int register_netdevice_notifier(struct notifier_block *nb)
/* Close race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
+
+ /* When RTNL is removed, we need protection for netdev_chain. */
rtnl_lock();
+
err = raw_notifier_chain_register(&netdev_chain, nb);
if (err)
goto unlock;
if (dev_boot_phase)
goto unlock;
for_each_net(net) {
+ __rtnl_net_lock(net);
err = call_netdevice_register_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
if (err)
goto rollback;
}
@@ -1741,8 +1975,11 @@ unlock:
return err;
rollback:
- for_each_net_continue_reverse(net)
+ for_each_net_continue_reverse(net) {
+ __rtnl_net_lock(net);
call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
@@ -1775,8 +2012,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
if (err)
goto unlock;
- for_each_net(net)
+ for_each_net(net) {
+ __rtnl_net_lock(net);
call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
unlock:
rtnl_unlock();
@@ -1840,9 +2080,10 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
int err;
- rtnl_lock();
+ rtnl_net_lock(net);
err = __register_netdevice_notifier_net(net, nb, false);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);
@@ -1868,9 +2109,10 @@ int unregister_netdevice_notifier_net(struct net *net,
{
int err;
- rtnl_lock();
+ rtnl_net_lock(net);
err = __unregister_netdevice_notifier_net(net, nb);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
@@ -1883,19 +2125,56 @@ static void __move_netdevice_notifier_net(struct net *src_net,
__register_netdevice_notifier_net(dst_net, nb, true);
}
+static void rtnl_net_dev_lock(struct net_device *dev)
+{
+ bool again;
+
+ do {
+ struct net *net;
+
+ again = false;
+
+ /* netns might be being dismantled. */
+ rcu_read_lock();
+ net = dev_net_rcu(dev);
+ net_passive_inc(net);
+ rcu_read_unlock();
+
+ rtnl_net_lock(net);
+
+#ifdef CONFIG_NET_NS
+ /* dev might have been moved to another netns. */
+ if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+ again = true;
+ }
+#endif
+ } while (again);
+}
+
+static void rtnl_net_dev_unlock(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+}
+
int register_netdevice_notifier_dev_net(struct net_device *dev,
struct notifier_block *nb,
struct netdev_net_notifier *nn)
{
int err;
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
if (!err) {
nn->nb = nb;
list_add(&nn->list, &dev->net_notifier_list);
}
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
+
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
@@ -1906,10 +2185,11 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev,
{
int err;
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
list_del(&nn->list);
err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
+
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
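
rtnl_net_dev_lock() in the hunk above has to cope with the device's netns changing, or being dismantled, while the caller sleeps on the per-netns lock: it takes a passive reference on the netns it currently sees, locks it, then re-checks that the device still belongs to that netns and retries otherwise. Below is a minimal pthread sketch of that lock-then-revalidate loop; the names (struct owner, struct obj, owner_lock_for) are purely illustrative and the passive refcounting is omitted.

/*
 * Sketch: snapshot the object's current owner, lock that owner, then
 * re-check that the object was not moved to a different owner while we
 * slept; unlock and retry if it was.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct owner {
	pthread_mutex_t lock;
	const char *name;
};

struct obj {
	_Atomic(struct owner *) owner;	/* may be switched concurrently */
};

/* Returns with the object's current owner locked. */
static struct owner *owner_lock_for(struct obj *o)
{
	for (;;) {
		struct owner *ow = atomic_load(&o->owner);

		pthread_mutex_lock(&ow->lock);
		if (atomic_load(&o->owner) == ow)
			return ow;	/* still the right owner */
		/* Object moved while we were waiting: unlock and retry. */
		pthread_mutex_unlock(&ow->lock);
	}
}

int main(void)
{
	struct owner a = { PTHREAD_MUTEX_INITIALIZER, "netns-a" };
	struct owner b = { PTHREAD_MUTEX_INITIALIZER, "netns-b" };
	struct obj dev = { &a };
	struct owner *ow;

	atomic_store(&dev.owner, &b);	/* simulate a move between owners */
	ow = owner_lock_for(&dev);
	printf("locked owner: %s\n", ow->name);
	pthread_mutex_unlock(&ow->lock);
	return 0;
}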
@@ -2057,6 +2337,11 @@ void net_dec_egress_queue(void)
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
+#ifdef CONFIG_NET_CLS_ACT
+DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key);
+EXPORT_SYMBOL(tcf_sw_enabled_key);
+#endif
+
DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
EXPORT_SYMBOL(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
@@ -2113,7 +2398,7 @@ EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
skb->tstamp = 0;
- skb->mono_delivery_time = 0;
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
if (static_branch_unlikely(&netstamp_needed_key))
skb->tstamp = ktime_get_real();
}
@@ -2220,16 +2505,21 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
}
/**
- * dev_nit_active - return true if any network interface taps are in use
+ * dev_nit_active_rcu - return true if any network interface taps are in use
+ *
+ * The caller must hold the RCU lock
*
* @dev: network device to check for the presence of taps
*/
-bool dev_nit_active(struct net_device *dev)
+bool dev_nit_active_rcu(const struct net_device *dev)
{
- return !list_empty(&net_hotdata.ptype_all) ||
+ /* Callers may hold either RCU or RCU BH lock */
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+
+ return !list_empty(&dev_net(dev)->ptype_all) ||
!list_empty(&dev->ptype_all);
}
-EXPORT_SYMBOL_GPL(dev_nit_active);
+EXPORT_SYMBOL_GPL(dev_nit_active_rcu);
/*
* Support routine. Sends outgoing frames to any network
@@ -2238,11 +2528,12 @@ EXPORT_SYMBOL_GPL(dev_nit_active);
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
- struct list_head *ptype_list = &net_hotdata.ptype_all;
struct packet_type *ptype, *pt_prev = NULL;
+ struct list_head *ptype_list;
struct sk_buff *skb2 = NULL;
rcu_read_lock();
+ ptype_list = &dev_net_rcu(dev)->ptype_all;
again:
list_for_each_entry_rcu(ptype, ptype_list, list) {
if (READ_ONCE(ptype->ignore_outgoing))
@@ -2286,7 +2577,7 @@ again:
pt_prev = ptype;
}
- if (ptype_list == &net_hotdata.ptype_all) {
+ if (ptype_list != &dev->ptype_all) {
ptype_list = &dev->ptype_all;
goto again;
}
@@ -2889,6 +3180,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->reg_state == NETREG_REGISTERED ||
dev->reg_state == NETREG_UNREGISTERING) {
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
txq);
@@ -2898,6 +3190,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->num_tc)
netif_setup_tc(dev, txq);
+ net_shaper_set_real_num_tx_queues(dev, txq);
+
dev_qdisc_change_real_num_tx(dev, txq);
dev->real_num_tx_queues = txq;
@@ -2917,7 +3211,6 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
-#ifdef CONFIG_SYSFS
/**
* netif_set_real_num_rx_queues - set actual number of RX queues used
* @dev: Network device
@@ -2937,6 +3230,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
if (dev->reg_state == NETREG_REGISTERED) {
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
rxq);
@@ -2948,7 +3242,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
-#endif
/**
* netif_set_real_num_queues - set actual number of RX and TX queues used
@@ -3180,7 +3473,7 @@ void netif_device_attach(struct net_device *dev)
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
netif_tx_wake_all_queues(dev);
- __netdev_watchdog_up(dev);
+ netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_device_attach);
@@ -3261,6 +3554,10 @@ int skb_checksum_help(struct sk_buff *skb)
return -EINVAL;
}
+ if (!skb_frags_readable(skb)) {
+ return -EFAULT;
+ }
+
/* Before computing a checksum, we should make sure no frag could
* be modified by an external entity : checksum could be wrong.
*/
@@ -3299,9 +3596,10 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
+#ifdef CONFIG_NET_CRC32C
int skb_crc32c_csum_help(struct sk_buff *skb)
{
- __le32 crc32c_csum;
+ u32 crc;
int ret = 0, offset, start;
if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -3329,14 +3627,14 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
if (ret)
goto out;
- crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
- skb->len - start, ~(__u32)0,
- crc32c_csum_stub));
- *(__le32 *)(skb->data + offset) = crc32c_csum;
+ crc = ~skb_crc32c(skb, start, skb->len - start, ~0);
+ *(__le32 *)(skb->data + offset) = cpu_to_le32(crc);
skb_reset_csum_not_inet(skb);
out:
return ret;
}
+EXPORT_SYMBOL(skb_crc32c_csum_help);
+#endif /* CONFIG_NET_CRC32C */
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
@@ -3382,8 +3680,9 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
if (!(dev->features & NETIF_F_HIGHDMA)) {
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = skb_frag_page(frag);
- if (PageHighMem(skb_frag_page(frag)))
+ if (page && PageHighMem(page))
return 1;
}
}
@@ -3455,7 +3754,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
if (gso_segs > READ_ONCE(dev->gso_max_segs))
return features & ~NETIF_F_GSO_MASK;
- if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
+ if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb)))
return features & ~NETIF_F_GSO_MASK;
if (!skb_shinfo(skb)->gso_type) {
@@ -3523,7 +3822,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
unsigned int len;
int rc;
- if (dev_nit_active(dev))
+ if (dev_nit_active_rcu(dev))
dev_queue_xmit_nit(skb, dev);
len = skb->len;
@@ -3582,6 +3881,11 @@ int skb_csum_hwoffload_help(struct sk_buff *skb,
return 0;
if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
+ if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
+ !ipv6_has_hopopt_jumbo(skb))
+ goto sw_checksum;
+
switch (skb->csum_offset) {
case offsetof(struct tcphdr, check):
case offsetof(struct udphdr, check):
@@ -3589,14 +3893,48 @@ int skb_csum_hwoffload_help(struct sk_buff *skb,
}
}
+sw_checksum:
return skb_checksum_help(skb);
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
+static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct skb_shared_info *shinfo;
+ struct net_iov *niov;
+
+ if (likely(skb_frags_readable(skb)))
+ goto out;
+
+ if (!dev->netmem_tx)
+ goto out_free;
+
+ shinfo = skb_shinfo(skb);
+
+ if (shinfo->nr_frags > 0) {
+ niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
+ if (net_is_devmem_iov(niov) &&
+ net_devmem_iov_binding(niov)->dev != dev)
+ goto out_free;
+ }
+
+out:
+ return skb;
+
+out_free:
+ kfree_skb(skb);
+ return NULL;
+}
+
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
{
netdev_features_t features;
+ skb = validate_xmit_unreadable_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
+
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
@@ -3656,7 +3994,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
next = skb->next;
skb_mark_not_on_list(skb);
- /* in case skb wont be segmented, point to itself */
+ /* in case skb won't be segmented, point to itself */
skb->prev = skb;
skb = validate_xmit_skb(skb, dev, again);
@@ -3701,7 +4039,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
sizeof(_tcphdr), &_tcphdr);
if (likely(th))
hdr_len += __tcp_hdrlen(th);
- } else {
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
struct udphdr _udphdr;
if (skb_header_pointer(skb, hdr_len,
@@ -3709,10 +4047,14 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
hdr_len += sizeof(struct udphdr);
}
- if (shinfo->gso_type & SKB_GSO_DODGY)
- gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
- shinfo->gso_size);
+ if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
+ int payload = skb->len - hdr_len;
+ /* Malicious packet. */
+ if (payload <= 0)
+ return;
+ gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
+ }
qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
}
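
In the qdisc_pkt_len_init() hunk above, gso_segs is recomputed from the payload length only for untrusted (SKB_GSO_DODGY) packets, and a packet whose claimed headers already cover its whole length (payload <= 0) is now rejected before the division; pkt_len is then charged one extra copy of the headers per additional segment. A small worked example of that arithmetic follows, as a standalone sketch rather than the kernel function.

/*
 * Worked example: gso_segs = ceil(payload / gso_size), and the qdisc
 * packet length is skb->len plus (gso_segs - 1) * hdr_len; malformed
 * packets (payload <= 0) are left unadjusted.
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned int qdisc_pkt_len(unsigned int skb_len, unsigned int hdr_len,
				  unsigned int gso_size)
{
	int payload = (int)skb_len - (int)hdr_len;
	unsigned int gso_segs;

	if (payload <= 0 || !gso_size)	/* malformed / malicious packet */
		return skb_len;

	gso_segs = DIV_ROUND_UP((unsigned int)payload, gso_size);
	return skb_len + (gso_segs - 1) * hdr_len;
}

int main(void)
{
	/* 66 bytes of headers, 2800 bytes of payload, MSS 1400:
	 * 2 segments, so one extra copy of the headers is accounted.
	 */
	printf("%u\n", qdisc_pkt_len(2866, 66, 1400));	/* 2932 */
	printf("%u\n", qdisc_pkt_len(60, 66, 1400));	/* 60: payload <= 0 */
	return 0;
}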
@@ -3775,6 +4117,10 @@ no_lock_out:
return rc;
}
+ if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
+ return NET_XMIT_DROP;
+ }
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
@@ -3814,7 +4160,9 @@ no_lock_out:
qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
+ WRITE_ONCE(q->owner, smp_processor_id());
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ WRITE_ONCE(q->owner, -1);
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -3887,6 +4235,7 @@ netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
}
+#ifndef CONFIG_PREEMPT_RT
static bool netdev_xmit_txqueue_skipped(void)
{
return __this_cpu_read(softnet_data.xmit.skip_txqueue);
@@ -3897,6 +4246,19 @@ void netdev_xmit_skip_txqueue(bool skip)
__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
}
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+
+#else
+static bool netdev_xmit_txqueue_skipped(void)
+{
+ return current->net_xmit.skip_txqueue;
+}
+
+void netdev_xmit_skip_txqueue(bool skip)
+{
+ current->net_xmit.skip_txqueue = skip;
+}
+EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+#endif
#endif /* CONFIG_NET_EGRESS */
#ifdef CONFIG_NET_XGRESS
@@ -3911,6 +4273,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
if (!miniq)
return ret;
+ /* Global bypass */
+ if (!static_branch_likely(&tcf_sw_enabled_key))
+ return ret;
+
+ /* Block-wise bypass */
+ if (tcf_block_bypass_sw(miniq->block))
+ return ret;
+
tc_skb_cb(skb)->mru = 0;
tc_skb_cb(skb)->post_ct = false;
tcf_set_drop_reason(skb, *drop_reason);
@@ -3971,10 +4341,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int sch_ret;
if (!entry)
return skb;
+
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
@@ -4003,10 +4376,12 @@ ingress_verdict:
break;
}
*ret = NET_RX_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
case TC_ACT_SHOT:
kfree_skb_reason(skb, drop_reason);
*ret = NET_RX_DROP;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
/* used by tc_run */
case TC_ACT_STOLEN:
@@ -4016,8 +4391,10 @@ ingress_verdict:
fallthrough;
case TC_ACT_CONSUMED:
*ret = NET_RX_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
return skb;
}
@@ -4027,11 +4404,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int sch_ret;
if (!entry)
return skb;
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
* already set by the caller.
*/
@@ -4047,10 +4427,12 @@ egress_verdict:
/* No need to push/pop skb's mac_header here on egress! */
skb_do_redirect(skb);
*ret = NET_XMIT_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
case TC_ACT_SHOT:
kfree_skb_reason(skb, drop_reason);
*ret = NET_XMIT_DROP;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
/* used by tc_run */
case TC_ACT_STOLEN:
@@ -4060,8 +4442,10 @@ egress_verdict:
fallthrough;
case TC_ACT_CONSUMED:
*ret = NET_XMIT_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
return skb;
}
@@ -4157,13 +4541,6 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
}
EXPORT_SYMBOL(dev_pick_tx_zero);
-u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
- struct net_device *sb_dev)
-{
- return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
-}
-EXPORT_SYMBOL(dev_pick_tx_cpu_id);
-
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev)
{
@@ -4251,7 +4628,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_reset_mac_header(skb);
skb_assert_len(skb);
- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
+ if (unlikely(skb_shinfo(skb)->tx_flags &
+ (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* Disable soft irqs for various locks below. Also
@@ -4404,8 +4782,8 @@ EXPORT_SYMBOL(__dev_direct_xmit);
/*************************************************************************
* Receiver routines
*************************************************************************/
+static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
-unsigned int sysctl_skb_defer_max __read_mostly = 64;
int weight_p __read_mostly = 64; /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
@@ -4427,25 +4805,24 @@ static inline void ____napi_schedule(struct softnet_data *sd,
*/
thread = READ_ONCE(napi->thread);
if (thread) {
- /* Avoid doing set_bit() if the thread is in
- * INTERRUPTIBLE state, cause napi_thread_wait()
- * makes sure to proceed with napi polling
- * if the thread is explicitly woken from here.
- */
- if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
- set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+ if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
+ goto use_local_napi;
+
+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
wake_up_process(thread);
return;
}
}
+use_local_napi:
+ DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list));
list_add_tail(&napi->poll_list, &sd->poll_list);
WRITE_ONCE(napi->list_owner, smp_processor_id());
/* If not called from net_rx_action()
* we have to raise NET_RX_SOFTIRQ.
*/
if (!sd->in_net_rx_action)
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
#ifdef CONFIG_RPS
@@ -4455,17 +4832,23 @@ EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
+static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
+{
+ return hash_32(hash, flow_table->log);
+}
+
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
{
if (next_cpu < nr_cpu_ids) {
+ u32 head;
#ifdef CONFIG_RFS_ACCEL
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *old_rflow;
- u32 flow_id;
u16 rxq_index;
+ u32 flow_id;
int rc;
/* Should we steer this flow to a different hardware queue? */
@@ -4480,23 +4863,23 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = skb_get_hash(skb) & flow_table->mask;
+ flow_id = rfs_slot(skb_get_hash(skb), flow_table);
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
goto out;
old_rflow = rflow;
rflow = &flow_table->flows[flow_id];
- rflow->filter = rc;
- if (old_rflow->filter == rflow->filter)
- old_rflow->filter = RPS_NO_FILTER;
+ WRITE_ONCE(rflow->filter, rc);
+ if (old_rflow->filter == rc)
+ WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
out:
#endif
- rflow->last_qtail =
- per_cpu(softnet_data, next_cpu).input_queue_head;
+ head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
+ rps_input_queue_tail_save(&rflow->last_qtail, head);
}
- rflow->cpu = next_cpu;
+ WRITE_ONCE(rflow->cpu, next_cpu);
return rflow;
}
@@ -4559,7 +4942,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
- rflow = &flow_table->flows[hash & flow_table->mask];
+ rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
tcpu = rflow->cpu;
/*
@@ -4575,7 +4958,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
*/
if (unlikely(tcpu != next_cpu) &&
(tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
- ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+ ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
@@ -4626,13 +5009,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
- if (flow_table && flow_id <= flow_table->mask) {
+ if (flow_table && flow_id < (1UL << flow_table->log)) {
rflow = &flow_table->flows[flow_id];
cpu = READ_ONCE(rflow->cpu);
- if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
- ((int)(per_cpu(softnet_data, cpu).input_queue_head -
- rflow->last_qtail) <
- (int)(10 * flow_table->mask)))
+ if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
+ ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
+ READ_ONCE(rflow->last_qtail)) <
+ (int)(10 << flow_table->log)))
expire = false;
}
rcu_read_unlock();
@@ -4648,7 +5031,8 @@ static void rps_trigger_softirq(void *data)
struct softnet_data *sd = data;
____napi_schedule(sd, &sd->backlog);
- sd->received_rps++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(sd->received_rps, sd->received_rps + 1);
}
#endif /* CONFIG_RPS */
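
[editorial sketch] The received_rps update above (like the time_squeeze and flow-limit count updates later in this patch) follows a single-writer, lockless-reader convention against softnet_seq_show(). A self-contained sketch of the pairing, with WRITE_ONCE()/READ_ONCE() reduced to their volatile-access core and the structure trimmed to one field:

struct softnet_stats_sketch {
	unsigned int received_rps;
};

#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))

static void writer_side(struct softnet_stats_sketch *sd)
{
	/* single writer (softirq on this CPU); a plain ++ would still be
	 * a data race with the /proc reader, hence WRITE_ONCE()
	 */
	WRITE_ONCE(sd->received_rps, sd->received_rps + 1);
}

static unsigned int reader_side(struct softnet_stats_sketch *sd)
{
	/* pairs with the WRITE_ONCE() above: a possibly stale but never
	 * torn value, read without any lock
	 */
	return READ_ONCE(sd->received_rps);
}

int main(void)
{
	struct softnet_stats_sketch sd = { 0 };

	writer_side(&sd);
	return (int)reader_side(&sd);	/* 1 */
}
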
@@ -4678,6 +5062,11 @@ static void napi_schedule_rps(struct softnet_data *sd)
#ifdef CONFIG_RPS
if (sd != mysd) {
+ if (use_backlog_threads()) {
+ __napi_schedule_irqoff(&sd->backlog);
+ return;
+ }
+
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;
@@ -4692,6 +5081,23 @@ static void napi_schedule_rps(struct softnet_data *sd)
__napi_schedule_irqoff(&mysd->backlog);
}
+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
+{
+ unsigned long flags;
+
+ if (use_backlog_threads()) {
+ backlog_lock_irq_save(sd, &flags);
+
+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
+ __napi_schedule_irqoff(&sd->backlog);
+
+ backlog_unlock_irq_restore(sd, &flags);
+
+ } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
+ smp_call_function_single_async(cpu, &sd->defer_csd);
+ }
+}
+
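
[editorial sketch] In kick_defer_list_purge() above, the non-threaded branch uses cmpxchg() as a one-shot latch so that at most one defer-free IPI is outstanding per softnet_data. A self-contained sketch of that latch; trigger_remote() is a no-op placeholder standing in for smp_call_function_single_async():

static int defer_ipi_scheduled;	/* lives in softnet_data in the real code */

static void trigger_remote(unsigned int cpu)
{
	/* placeholder for smp_call_function_single_async(cpu, &sd->defer_csd) */
	(void)cpu;
}

static void kick_sketch(unsigned int cpu)
{
	/* only the caller that flips 0 -> 1 sends the IPI; the remote
	 * handler is expected to reset the latch once it has run
	 */
	if (!__sync_val_compare_and_swap(&defer_ipi_scheduled, 0, 1))
		trigger_remote(cpu);
}

int main(void)
{
	kick_sketch(2);		/* wins the 0 -> 1 race, "sends" the IPI  */
	kick_sketch(2);		/* latch already set: no second IPI queued */
	return 0;
}
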
#ifdef CONFIG_NET_FLOW_LIMIT
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
@@ -4711,7 +5117,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
if (fl) {
- new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
+ new_flow = hash_32(skb_get_hash(skb), fl->log_buckets);
old_flow = fl->history[fl->history_head];
fl->history[fl->history_head] = new_flow;
@@ -4722,7 +5128,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
fl->buckets[old_flow]--;
if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
- fl->count++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(fl->count, fl->count + 1);
rcu_read_unlock();
return true;
}
@@ -4743,37 +5150,45 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
struct softnet_data *sd;
unsigned long flags;
unsigned int qlen;
+ int max_backlog;
+ u32 tail;
+
+ reason = SKB_DROP_REASON_DEV_READY;
+ if (!netif_running(skb->dev))
+ goto bad_dev;
- reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ reason = SKB_DROP_REASON_CPU_BACKLOG;
sd = &per_cpu(softnet_data, cpu);
- rps_lock_irqsave(sd, &flags);
- if (!netif_running(skb->dev))
- goto drop;
+ qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
+ max_backlog = READ_ONCE(net_hotdata.max_backlog);
+ if (unlikely(qlen > max_backlog))
+ goto cpu_backlog_drop;
+ backlog_lock_irq_save(sd, &flags);
qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen <= READ_ONCE(net_hotdata.max_backlog) &&
- !skb_flow_limit(skb, qlen)) {
- if (qlen) {
-enqueue:
- __skb_queue_tail(&sd->input_pkt_queue, skb);
- input_queue_tail_incr_save(sd, qtail);
- rps_unlock_irq_restore(sd, &flags);
- return NET_RX_SUCCESS;
- }
-
- /* Schedule NAPI for backlog device
- * We can use non atomic operation since we own the queue lock
- */
- if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
- napi_schedule_rps(sd);
- goto enqueue;
+ if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
+ if (!qlen) {
+ /* Schedule NAPI for backlog device. We can use
+ * non atomic operation as we own the queue lock.
+ */
+ if (!__test_and_set_bit(NAPI_STATE_SCHED,
+ &sd->backlog.state))
+ napi_schedule_rps(sd);
+ }
+ __skb_queue_tail(&sd->input_pkt_queue, skb);
+ tail = rps_input_queue_tail_incr(sd);
+ backlog_unlock_irq_restore(sd, &flags);
+
+ /* save the tail outside of the critical section */
+ rps_input_queue_tail_save(qtail, tail);
+ return NET_RX_SUCCESS;
}
- reason = SKB_DROP_REASON_CPU_BACKLOG;
-drop:
- sd->dropped++;
- rps_unlock_irq_restore(sd, &flags);
+ backlog_unlock_irq_restore(sd, &flags);
+cpu_backlog_drop:
+ atomic_inc(&sd->dropped);
+bad_dev:
dev_core_stats_rx_dropped_inc(skb->dev);
kfree_skb_reason(skb, reason);
return NET_RX_DROP;
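
[editorial sketch] Read linearly, the reworked enqueue_to_backlog() above rejects the cheap failure cases before taking the backlog lock and saves the queue tail only after dropping it. A condensed control-flow summary (kernel-style pseudocode, not compilable on its own; drop() stands for the kfree_skb_reason() tail, with the CPU-backlog case additionally bumping sd->dropped):

if (!netif_running(skb->dev))
	return drop(SKB_DROP_REASON_DEV_READY);	/* no lock taken at all */

if (skb_queue_len_lockless(&sd->input_pkt_queue) >
    READ_ONCE(net_hotdata.max_backlog))
	return drop(SKB_DROP_REASON_CPU_BACKLOG);	/* still lockless */

backlog_lock_irq_save(sd, &flags);
qlen = skb_queue_len(&sd->input_pkt_queue);	/* re-check under the lock */
if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
	if (!qlen)				/* queue was empty: arm NAPI */
		napi_schedule_rps(sd);
	__skb_queue_tail(&sd->input_pkt_queue, skb);
	tail = rps_input_queue_tail_incr(sd);
	backlog_unlock_irq_restore(sd, &flags);
	rps_input_queue_tail_save(qtail, tail);	/* outside the critical section */
	return NET_RX_SUCCESS;
}
backlog_unlock_irq_restore(sd, &flags);
return drop(SKB_DROP_REASON_CPU_BACKLOG);	/* full or flow-limited */
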
@@ -4803,7 +5218,7 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
}
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
@@ -4905,12 +5320,15 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
}
static int
-netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
+netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
{
struct sk_buff *skb = *pskb;
int err, hroom, troom;
- if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog);
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
+ if (!err)
return 0;
/* In case we have to go down the path and also linearize,
@@ -4929,7 +5347,7 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
struct sk_buff *skb = *pskb;
u32 mac_len, act = XDP_DROP;
@@ -4982,7 +5400,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
* and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
* queues, so they do not have this starvation issue.
*/
-void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -5007,13 +5425,16 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
+int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+
if (xdp_prog) {
struct xdp_buff xdp;
u32 act;
int err;
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
if (act != XDP_PASS) {
switch (act) {
@@ -5027,11 +5448,14 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
generic_xdp_tx(*pskb, xdp_prog);
break;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
return XDP_DROP;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
}
return XDP_PASS;
out_redir:
+ bpf_net_ctx_clear(bpf_net_ctx);
kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
return XDP_DROP;
}
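
[editorial sketch] do_xdp_generic() above now brackets the generic-XDP run with an on-stack bpf_net_context, clearing it on every exit path (pass, drop and redirect alike). Reduced to the pairing alone (kernel-style sketch, not compilable on its own; the context contents do not matter here):

struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;

bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);	/* on-stack, per invocation */
act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
/* ... act handling as in the hunk above ... */
bpf_net_ctx_clear(bpf_net_ctx);			/* matched on every return */
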
@@ -5125,7 +5549,7 @@ int netif_rx(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_rx);
-static __latent_entropy void net_tx_action(struct softirq_action *h)
+static __latent_entropy void net_tx_action(void)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -5147,7 +5571,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
trace_consume_skb(skb, net_tx_action);
else
trace_kfree_skb(skb, net_tx_action,
- get_kfree_skb_cb(skb)->reason);
+ get_kfree_skb_cb(skb)->reason, NULL);
if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
@@ -5340,8 +5764,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
orig_dev = skb->dev;
skb_reset_network_header(skb);
+#if !defined(CONFIG_DEBUG_NET)
+ /* We plan to no longer reset the transport header here.
+ * Give some time to fuzzers and dev build to catch bugs
+ * in network stacks.
+ */
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
+#endif
skb_reset_mac_len(skb);
pt_prev = NULL;
@@ -5377,7 +5807,8 @@ another_round:
if (pfmemalloc)
goto skip_taps;
- list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
+ list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
+ list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
@@ -5489,6 +5920,14 @@ check_vlan_id:
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
+
+ /* orig_dev and skb->dev could belong to different netns;
+ * Even in such case we need to traverse only the list
+ * coming from skb->dev, as the ptype owner (packet socket)
+ * will use dev_net(skb->dev) to do namespace filtering.
+ */
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &dev_net_rcu(skb->dev)->ptype_specific);
}
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
@@ -5602,10 +6041,9 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
struct packet_type *pt_curr = NULL;
/* Current (common) orig_dev of sublist */
struct net_device *od_curr = NULL;
- struct list_head sublist;
struct sk_buff *skb, *next;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
struct net_device *orig_dev = skb->dev;
struct packet_type *pt_prev = NULL;
@@ -5700,7 +6138,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
static_branch_dec(&generic_xdp_needed_key);
} else if (new && !old) {
static_branch_inc(&generic_xdp_needed_key);
- dev_disable_lro(dev);
+ netif_disable_lro(dev);
dev_disable_gro_hw(dev);
}
break;
@@ -5743,9 +6181,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
void netif_receive_skb_list_internal(struct list_head *head)
{
struct sk_buff *skb, *next;
- struct list_head sublist;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
skb);
@@ -5827,35 +6264,39 @@ void netif_receive_skb_list(struct list_head *head)
}
EXPORT_SYMBOL(netif_receive_skb_list);
-static DEFINE_PER_CPU(struct work_struct, flush_works);
-
/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
{
struct sk_buff *skb, *tmp;
+ struct sk_buff_head list;
struct softnet_data *sd;
+ __skb_queue_head_init(&list);
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
- rps_lock_irq_disable(sd);
+ backlog_lock_irq_disable(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
- if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
- dev_kfree_skb_irq(skb);
- input_queue_head_incr(sd);
+ __skb_queue_tail(&list, skb);
+ rps_input_queue_head_incr(sd);
}
}
- rps_unlock_irq_enable(sd);
+ backlog_unlock_irq_enable(sd);
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
- if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
- kfree_skb(skb);
- input_queue_head_incr(sd);
+ __skb_queue_tail(&list, skb);
+ rps_input_queue_head_incr(sd);
}
}
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
local_bh_enable();
+
+ __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY);
}
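
[editorial sketch] flush_backlog() above now unlinks matching skbs onto a private list while holding the backlog lock and frees them afterwards via __skb_queue_purge_reason(), rather than freeing one by one with IRQs disabled. A minimal sketch of that collect-then-purge shape (kernel-style, not compilable on its own; should_drop(), lock() and unlock() are placeholders):

struct sk_buff_head list;

__skb_queue_head_init(&list);

lock();
skb_queue_walk_safe(&queue, skb, tmp) {
	if (should_drop(skb)) {			/* placeholder predicate */
		__skb_unlink(skb, &queue);
		__skb_queue_tail(&list, skb);	/* defer the actual free */
	}
}
unlock();

__skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY);
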
static bool flush_required(int cpu)
@@ -5864,14 +6305,14 @@ static bool flush_required(int cpu)
struct softnet_data *sd = &per_cpu(softnet_data, cpu);
bool do_flush;
- rps_lock_irq_disable(sd);
+ backlog_lock_irq_disable(sd);
/* as insertion into process_queue happens with the rps lock held,
* process_queue access may race only with dequeue
*/
do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
!skb_queue_empty_lockless(&sd->process_queue);
- rps_unlock_irq_enable(sd);
+ backlog_unlock_irq_enable(sd);
return do_flush;
#endif
@@ -5883,36 +6324,54 @@ static bool flush_required(int cpu)
return true;
}
+struct flush_backlogs {
+ cpumask_t flush_cpus;
+ struct work_struct w[];
+};
+
+static struct flush_backlogs *flush_backlogs_alloc(void)
+{
+ return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids),
+ GFP_KERNEL);
+}
+
+static struct flush_backlogs *flush_backlogs_fallback;
+static DEFINE_MUTEX(flush_backlogs_mutex);
+
static void flush_all_backlogs(void)
{
- static cpumask_t flush_cpus;
+ struct flush_backlogs *ptr = flush_backlogs_alloc();
unsigned int cpu;
- /* since we are under rtnl lock protection we can use static data
- * for the cpumask and avoid allocating on stack the possibly
- * large mask
- */
- ASSERT_RTNL();
+ if (!ptr) {
+ mutex_lock(&flush_backlogs_mutex);
+ ptr = flush_backlogs_fallback;
+ }
+ cpumask_clear(&ptr->flush_cpus);
cpus_read_lock();
- cpumask_clear(&flush_cpus);
for_each_online_cpu(cpu) {
if (flush_required(cpu)) {
- queue_work_on(cpu, system_highpri_wq,
- per_cpu_ptr(&flush_works, cpu));
- cpumask_set_cpu(cpu, &flush_cpus);
+ INIT_WORK(&ptr->w[cpu], flush_backlog);
+ queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]);
+ __cpumask_set_cpu(cpu, &ptr->flush_cpus);
}
}
/* we can have in flight packet[s] on the cpus we are not flushing,
* synchronize_net() in unregister_netdevice_many() will take care of
- * them
+ * them.
*/
- for_each_cpu(cpu, &flush_cpus)
- flush_work(per_cpu_ptr(&flush_works, cpu));
+ for_each_cpu(cpu, &ptr->flush_cpus)
+ flush_work(&ptr->w[cpu]);
cpus_read_unlock();
+
+ if (ptr != flush_backlogs_fallback)
+ kfree(ptr);
+ else
+ mutex_unlock(&flush_backlogs_mutex);
}
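
[editorial sketch] flush_all_backlogs() above replaces the static per-CPU work items with one heap allocation sized for nr_cpu_ids work_structs. A self-contained sketch of the flexible-array allocation it relies on, mirroring struct_size_t() minus its overflow saturation, with a toy work type standing in for struct work_struct:

#include <stdio.h>
#include <stdlib.h>

struct work { int cpu; };		/* stand-in for struct work_struct */

struct flush_backlogs_sketch {
	unsigned int nr;		/* stand-in for the cpumask */
	struct work w[];		/* flexible array member */
};

static struct flush_backlogs_sketch *alloc_sketch(unsigned int nr_cpu_ids)
{
	/* mirrors struct_size_t(struct flush_backlogs, w, nr_cpu_ids) */
	struct flush_backlogs_sketch *p =
		malloc(sizeof(*p) + nr_cpu_ids * sizeof(p->w[0]));

	if (p)
		p->nr = nr_cpu_ids;
	return p;
}

int main(void)
{
	struct flush_backlogs_sketch *p = alloc_sketch(8);

	if (!p)
		return 1;
	for (unsigned int i = 0; i < p->nr; i++)
		p->w[i].cpu = (int)i;	/* one work item per possible CPU */
	printf("allocated %u work items\n", p->nr);
	free(p);
	return 0;
}
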
static void net_rps_send_ipi(struct softnet_data *remsd)
@@ -5937,7 +6396,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
#ifdef CONFIG_RPS
struct softnet_data *remsd = sd->rps_ipi_list;
- if (remsd) {
+ if (!use_backlog_threads() && remsd) {
sd->rps_ipi_list = NULL;
local_irq_enable();
@@ -5952,7 +6411,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- return sd->rps_ipi_list != NULL;
+ return !use_backlog_threads() && sd->rps_ipi_list;
#else
return false;
#endif
@@ -5976,17 +6435,22 @@ static int process_backlog(struct napi_struct *napi, int quota)
while (again) {
struct sk_buff *skb;
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
while ((skb = __skb_dequeue(&sd->process_queue))) {
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
rcu_read_lock();
__netif_receive_skb(skb);
rcu_read_unlock();
- input_queue_head_incr(sd);
- if (++work >= quota)
+ if (++work >= quota) {
+ rps_input_queue_head_add(sd, work);
return work;
+ }
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
}
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
- rps_lock_irq_disable(sd);
+ backlog_lock_irq_disable(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
@@ -5996,15 +6460,19 @@ static int process_backlog(struct napi_struct *napi, int quota)
* We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
- napi->state = 0;
+ napi->state &= NAPIF_STATE_THREADED;
again = false;
} else {
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
}
- rps_unlock_irq_enable(sd);
+ backlog_unlock_irq_enable(sd);
}
+ if (work)
+ rps_input_queue_head_add(sd, work);
return work;
}
@@ -6092,25 +6560,24 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
return false;
if (work_done) {
- if (n->gro_bitmask)
- timeout = READ_ONCE(n->dev->gro_flush_timeout);
- n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
+ if (n->gro.bitmask)
+ timeout = napi_get_gro_flush_timeout(n);
+ n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
}
if (n->defer_hard_irqs_count > 0) {
n->defer_hard_irqs_count--;
- timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ timeout = napi_get_gro_flush_timeout(n);
if (timeout)
ret = false;
}
- if (n->gro_bitmask) {
- /* When the NAPI instance uses a timeout and keeps postponing
- * it, we need to bound somehow the time packets are kept in
- * the GRO layer
- */
- napi_gro_flush(n, !!timeout);
- }
- gro_normal_list(n);
+ /*
+ * When the NAPI instance uses a timeout and keeps postponing
+ * it, we need to bound somehow the time packets are kept in
+ * the GRO layer.
+ */
+ gro_flush(&n->gro, !!timeout);
+ gro_normal_list(&n->gro);
if (unlikely(!list_empty(&n->poll_list))) {
/* If n->poll_list is not empty, we need to mask irqs */
@@ -6148,19 +6615,6 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
}
EXPORT_SYMBOL(napi_complete_done);
-/* must be called under rcu_read_lock(), as we dont take a reference */
-struct napi_struct *napi_by_id(unsigned int napi_id)
-{
- unsigned int hash = napi_id % HASH_SIZE(napi_hash);
- struct napi_struct *napi;
-
- hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
- if (napi->napi_id == napi_id)
- return napi;
-
- return NULL;
-}
-
static void skb_defer_free_flush(struct softnet_data *sd)
{
struct sk_buff *skb, *next;
@@ -6187,19 +6641,15 @@ static void skb_defer_free_flush(struct softnet_data *sd)
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
{
if (!skip_schedule) {
- gro_normal_list(napi);
+ gro_normal_list(&napi->gro);
__napi_schedule(napi);
return;
}
- if (napi->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(napi, HZ >= 1000);
- }
+ /* Flush too old packets. If HZ < 1000, flush all packets */
+ gro_flush(&napi->gro, HZ >= 1000);
+ gro_normal_list(&napi->gro);
- gro_normal_list(napi);
clear_bit(NAPI_STATE_SCHED, &napi->state);
}
@@ -6211,6 +6661,7 @@ enum {
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
unsigned flags, u16 budget)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
bool skip_schedule = false;
unsigned long timeout;
int rc;
@@ -6228,10 +6679,11 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (flags & NAPI_F_PREFER_BUSY_POLL) {
- napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
- timeout = READ_ONCE(napi->dev->gro_flush_timeout);
+ napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
+ timeout = napi_get_gro_flush_timeout(napi);
if (napi->defer_hard_irqs_count && timeout) {
hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
skip_schedule = true;
@@ -6250,6 +6702,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
netpoll_poll_unlock(have_poll_lock);
if (rc == budget)
__busy_poll_stop(napi, skip_schedule);
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
}
@@ -6259,6 +6712,7 @@ static void __napi_busy_loop(unsigned int napi_id,
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
void *have_poll_lock = NULL;
struct napi_struct *napi;
@@ -6277,6 +6731,7 @@ restart:
int work = 0;
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (!napi_poll) {
unsigned long val = READ_ONCE(napi->state);
@@ -6301,12 +6756,13 @@ restart:
}
work = napi_poll(napi, budget);
trace_napi_poll(napi, work, budget);
- gro_normal_list(napi);
+ gro_normal_list(&napi->gro);
count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
LINUX_MIB_BUSYPOLLRXPACKETS, work);
skb_defer_free_flush(this_cpu_ptr(&softnet_data));
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
if (!loop_end || loop_end(loop_end_arg, start_time))
@@ -6358,26 +6814,84 @@ void napi_busy_loop(unsigned int napi_id,
}
EXPORT_SYMBOL(napi_busy_loop);
+void napi_suspend_irqs(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ rcu_read_lock();
+ napi = napi_by_id(napi_id);
+ if (napi) {
+ unsigned long timeout = napi_get_irq_suspend_timeout(napi);
+
+ if (timeout)
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ }
+ rcu_read_unlock();
+}
+
+void napi_resume_irqs(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ rcu_read_lock();
+ napi = napi_by_id(napi_id);
+ if (napi) {
+ /* If irq_suspend_timeout is set to 0 between the call to
+ * napi_suspend_irqs and now, the original value still
+ * determines the safety timeout as intended and napi_watchdog
+ * will resume irq processing.
+ */
+ if (napi_get_irq_suspend_timeout(napi)) {
+ local_bh_disable();
+ napi_schedule(napi);
+ local_bh_enable();
+ }
+ }
+ rcu_read_unlock();
+}
+
#endif /* CONFIG_NET_RX_BUSY_POLL */
+static void __napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ napi->gro.cached_napi_id = napi_id;
+
+ WRITE_ONCE(napi->napi_id, napi_id);
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+}
+
+static void napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&napi_hash_lock, flags);
+ WARN_ON_ONCE(napi_by_id(napi_id));
+ __napi_hash_add_with_id(napi, napi_id);
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
+}
+
static void napi_hash_add(struct napi_struct *napi)
{
+ unsigned long flags;
+
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
return;
- spin_lock(&napi_hash_lock);
+ spin_lock_irqsave(&napi_hash_lock, flags);
/* 0..NR_CPUS range is reserved for sender_cpu use */
do {
- if (unlikely(++napi_gen_id < MIN_NAPI_ID))
+ if (unlikely(!napi_id_valid(++napi_gen_id)))
napi_gen_id = MIN_NAPI_ID;
} while (napi_by_id(napi_gen_id));
- napi->napi_id = napi_gen_id;
- hlist_add_head_rcu(&napi->napi_hash_node,
- &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+ __napi_hash_add_with_id(napi, napi_gen_id);
- spin_unlock(&napi_hash_lock);
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
}
/* Warning : caller is responsible to make sure rcu grace period
@@ -6385,11 +6899,13 @@ static void napi_hash_add(struct napi_struct *napi)
*/
static void napi_hash_del(struct napi_struct *napi)
{
- spin_lock(&napi_hash_lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&napi_hash_lock, flags);
hlist_del_init_rcu(&napi->napi_hash_node);
- spin_unlock(&napi_hash_lock);
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
}
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
@@ -6410,22 +6926,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void init_gro_hash(struct napi_struct *napi)
-{
- int i;
-
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- INIT_LIST_HEAD(&napi->gro_hash[i].list);
- napi->gro_hash[i].count = 0;
- }
- napi->gro_bitmask = 0;
-}
-
int dev_set_threaded(struct net_device *dev, bool threaded)
{
struct napi_struct *napi;
int err = 0;
+ netdev_assert_locked_or_invisible(dev);
+
if (dev->threaded == threaded)
return 0;
@@ -6441,7 +6948,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
}
}
- dev->threaded = threaded;
+ WRITE_ONCE(dev->threaded, threaded);
/* Make sure kthread is created before THREADED bit
* is set.
@@ -6480,8 +6987,7 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
if (WARN_ON_ONCE(napi && !napi->dev))
return;
- if (dev->reg_state >= NETREG_REGISTERED)
- ASSERT_RTNL();
+ netdev_ops_assert_locked_or_invisible(dev);
switch (type) {
case NETDEV_QUEUE_TYPE_RX:
@@ -6498,20 +7004,255 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
}
EXPORT_SYMBOL(netif_queue_set_napi);
-void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
- int (*poll)(struct napi_struct *, int), int weight)
+static void
+netif_napi_irq_notify(struct irq_affinity_notify *notify,
+ const cpumask_t *mask)
{
+ struct napi_struct *napi =
+ container_of(notify, struct napi_struct, notify);
+#ifdef CONFIG_RFS_ACCEL
+ struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
+ int err;
+#endif
+
+ if (napi->config && napi->dev->irq_affinity_auto)
+ cpumask_copy(&napi->config->affinity_mask, mask);
+
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask);
+ if (err)
+ netdev_warn(napi->dev, "RMAP update failed (%d)\n",
+ err);
+ }
+#endif
+}
+
+#ifdef CONFIG_RFS_ACCEL
+static void netif_napi_affinity_release(struct kref *ref)
+{
+ struct napi_struct *napi =
+ container_of(ref, struct napi_struct, notify.kref);
+ struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
+
+ netdev_assert_locked(napi->dev);
+ WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER,
+ &napi->state));
+
+ if (!napi->dev->rx_cpu_rmap_auto)
+ return;
+ rmap->obj[napi->napi_rmap_idx] = NULL;
+ napi->napi_rmap_idx = -1;
+ cpu_rmap_put(rmap);
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+ if (dev->rx_cpu_rmap_auto)
+ return 0;
+
+ dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
+ if (!dev->rx_cpu_rmap)
+ return -ENOMEM;
+
+ dev->rx_cpu_rmap_auto = true;
+ return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+
+static void netif_del_cpu_rmap(struct net_device *dev)
+{
+ struct cpu_rmap *rmap = dev->rx_cpu_rmap;
+
+ if (!dev->rx_cpu_rmap_auto)
+ return;
+
+ /* Free the rmap */
+ cpu_rmap_put(rmap);
+ dev->rx_cpu_rmap = NULL;
+ dev->rx_cpu_rmap_auto = false;
+}
+
+#else
+static void netif_napi_affinity_release(struct kref *ref)
+{
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+ return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+
+static void netif_del_cpu_rmap(struct net_device *dev)
+{
+}
+#endif
+
+void netif_set_affinity_auto(struct net_device *dev)
+{
+ unsigned int i, maxqs, numa;
+
+ maxqs = max(dev->num_tx_queues, dev->num_rx_queues);
+ numa = dev_to_node(&dev->dev);
+
+ for (i = 0; i < maxqs; i++)
+ cpumask_set_cpu(cpumask_local_spread(i, numa),
+ &dev->napi_config[i].affinity_mask);
+
+ dev->irq_affinity_auto = true;
+}
+EXPORT_SYMBOL(netif_set_affinity_auto);
+
+void netif_napi_set_irq_locked(struct napi_struct *napi, int irq)
+{
+ int rc;
+
+ netdev_assert_locked_or_invisible(napi->dev);
+
+ if (napi->irq == irq)
+ return;
+
+ /* Remove existing resources */
+ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
+ irq_set_affinity_notifier(napi->irq, NULL);
+
+ napi->irq = irq;
+ if (irq < 0 ||
+ (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto))
+ return;
+
+ /* Abort for buggy drivers */
+ if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config))
+ return;
+
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi);
+ if (rc < 0)
+ return;
+
+ cpu_rmap_get(napi->dev->rx_cpu_rmap);
+ napi->napi_rmap_idx = rc;
+ }
+#endif
+
+ /* Use core IRQ notifier */
+ napi->notify.notify = netif_napi_irq_notify;
+ napi->notify.release = netif_napi_affinity_release;
+ rc = irq_set_affinity_notifier(irq, &napi->notify);
+ if (rc) {
+ netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n",
+ rc);
+ goto put_rmap;
+ }
+
+ set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state);
+ return;
+
+put_rmap:
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL;
+ cpu_rmap_put(napi->dev->rx_cpu_rmap);
+ napi->napi_rmap_idx = -1;
+ }
+#endif
+ napi->notify.notify = NULL;
+ napi->notify.release = NULL;
+}
+EXPORT_SYMBOL(netif_napi_set_irq_locked);
+
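
[editorial sketch] netif_napi_set_irq_locked() above wires a NAPI's IRQ into the core affinity notifier, optionally updating the ARFS cpu_rmap and the persisted per-NAPI affinity mask. A hedged sketch of how a driver might opt in, assuming one NAPI per MSI-X vector and that irqs[] already holds the Linux IRQ numbers; not taken from any real driver, error unwinding omitted:

static int sketch_enable_core_affinity(struct net_device *dev,
				       struct napi_struct *napis,
				       const int *irqs, int nvecs)
{
	int err, i;

	err = netif_enable_cpu_rmap(dev, nvecs);  /* ARFS rmap, freed by core */
	if (err)
		return err;

	netif_set_affinity_auto(dev);	/* core spreads and persists masks */

	for (i = 0; i < nvecs; i++)	/* registers the affinity notifier */
		netif_napi_set_irq(&napis[i], irqs[i]);

	return 0;
}
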
+static void napi_restore_config(struct napi_struct *n)
+{
+ n->defer_hard_irqs = n->config->defer_hard_irqs;
+ n->gro_flush_timeout = n->config->gro_flush_timeout;
+ n->irq_suspend_timeout = n->config->irq_suspend_timeout;
+
+ if (n->dev->irq_affinity_auto &&
+ test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state))
+ irq_set_affinity(n->irq, &n->config->affinity_mask);
+
+ /* a NAPI ID might be stored in the config, if so use it. if not, use
+ * napi_hash_add to generate one for us.
+ */
+ if (n->config->napi_id) {
+ napi_hash_add_with_id(n, n->config->napi_id);
+ } else {
+ napi_hash_add(n);
+ n->config->napi_id = n->napi_id;
+ }
+}
+
+static void napi_save_config(struct napi_struct *n)
+{
+ n->config->defer_hard_irqs = n->defer_hard_irqs;
+ n->config->gro_flush_timeout = n->gro_flush_timeout;
+ n->config->irq_suspend_timeout = n->irq_suspend_timeout;
+ napi_hash_del(n);
+}
+
+/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will
+ * inherit an existing ID try to insert it at the right position.
+ */
+static void
+netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
+{
+ unsigned int new_id, pos_id;
+ struct list_head *higher;
+ struct napi_struct *pos;
+
+ new_id = UINT_MAX;
+ if (napi->config && napi->config->napi_id)
+ new_id = napi->config->napi_id;
+
+ higher = &dev->napi_list;
+ list_for_each_entry(pos, &dev->napi_list, dev_list) {
+ if (napi_id_valid(pos->napi_id))
+ pos_id = pos->napi_id;
+ else if (pos->config)
+ pos_id = pos->config->napi_id;
+ else
+ pos_id = UINT_MAX;
+
+ if (pos_id <= new_id)
+ break;
+ higher = &pos->dev_list;
+ }
+ list_add_rcu(&napi->dev_list, higher); /* adds after higher */
+}
+
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+static void napi_get_frags_check(struct napi_struct *napi)
+{
+ struct sk_buff *skb;
+
+ local_bh_disable();
+ skb = napi_get_frags(napi);
+ WARN_ON_ONCE(skb && skb->head_frag);
+ napi_free_frags(napi);
+ local_bh_enable();
+}
+
+void netif_napi_add_weight_locked(struct net_device *dev,
+ struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
+{
+ netdev_assert_locked(dev);
if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
return;
INIT_LIST_HEAD(&napi->poll_list);
INIT_HLIST_NODE(&napi->napi_hash_node);
- hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
- napi->timer.function = napi_watchdog;
- init_gro_hash(napi);
+ hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ gro_init(&napi->gro);
napi->skb = NULL;
- INIT_LIST_HEAD(&napi->rx_list);
- napi->rx_count = 0;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
netdev_err_once(dev, "%s() called with weight %d\n", __func__,
@@ -6524,24 +7265,32 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
- list_add_rcu(&napi->dev_list, &dev->napi_list);
- napi_hash_add(napi);
+ netif_napi_dev_list_add(dev, napi);
+
+ /* default settings from sysfs are applied to all NAPIs. any per-NAPI
+ * configuration will be loaded in napi_enable
+ */
+ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+ napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
+
napi_get_frags_check(napi);
/* Create kthread for this napi if dev->threaded is set.
* Clear dev->threaded if kthread creation failed so that
* threaded mode will not be enabled in napi_enable().
*/
if (dev->threaded && napi_kthread_create(napi))
- dev->threaded = 0;
- netif_napi_set_irq(napi, -1);
+ dev->threaded = false;
+ netif_napi_set_irq_locked(napi, -1);
}
-EXPORT_SYMBOL(netif_napi_add_weight);
+EXPORT_SYMBOL(netif_napi_add_weight_locked);
-void napi_disable(struct napi_struct *n)
+void napi_disable_locked(struct napi_struct *n)
{
unsigned long val, new;
might_sleep();
+ netdev_assert_locked(n->dev);
+
set_bit(NAPI_STATE_DISABLE, &n->state);
val = READ_ONCE(n->state);
@@ -6557,21 +7306,40 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
+ if (n->config)
+ napi_save_config(n);
+ else
+ napi_hash_del(n);
+
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
-EXPORT_SYMBOL(napi_disable);
+EXPORT_SYMBOL(napi_disable_locked);
/**
- * napi_enable - enable NAPI scheduling
- * @n: NAPI context
+ * napi_disable() - prevent NAPI from scheduling
+ * @n: NAPI context
*
- * Resume NAPI from being scheduled on this context.
- * Must be paired with napi_disable.
+ * Stop NAPI from being scheduled on this context.
+ * Waits till any outstanding processing completes.
+ * Takes netdev_lock() for associated net_device.
*/
-void napi_enable(struct napi_struct *n)
+void napi_disable(struct napi_struct *n)
+{
+ netdev_lock(n->dev);
+ napi_disable_locked(n);
+ netdev_unlock(n->dev);
+}
+EXPORT_SYMBOL(napi_disable);
+
+void napi_enable_locked(struct napi_struct *n)
{
unsigned long new, val = READ_ONCE(n->state);
+ if (n->config)
+ napi_restore_config(n);
+ else
+ napi_hash_add(n);
+
do {
BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
@@ -6580,40 +7348,54 @@ void napi_enable(struct napi_struct *n)
new |= NAPIF_STATE_THREADED;
} while (!try_cmpxchg(&n->state, &val, new));
}
-EXPORT_SYMBOL(napi_enable);
+EXPORT_SYMBOL(napi_enable_locked);
-static void flush_gro_hash(struct napi_struct *napi)
+/**
+ * napi_enable() - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Enable scheduling of a NAPI instance.
+ * Must be paired with napi_disable().
+ * Takes netdev_lock() for associated net_device.
+ */
+void napi_enable(struct napi_struct *n)
{
- int i;
-
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- struct sk_buff *skb, *n;
-
- list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
- kfree_skb(skb);
- napi->gro_hash[i].count = 0;
- }
+ netdev_lock(n->dev);
+ napi_enable_locked(n);
+ netdev_unlock(n->dev);
}
+EXPORT_SYMBOL(napi_enable);
/* Must be called in process context */
-void __netif_napi_del(struct napi_struct *napi)
+void __netif_napi_del_locked(struct napi_struct *napi)
{
+ netdev_assert_locked(napi->dev);
+
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
return;
- napi_hash_del(napi);
+ /* Make sure NAPI is disabled (or was never enabled). */
+ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
+
+ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
+ irq_set_affinity_notifier(napi->irq, NULL);
+
+ if (napi->config) {
+ napi->index = -1;
+ napi->config = NULL;
+ }
+
list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
- flush_gro_hash(napi);
- napi->gro_bitmask = 0;
+ gro_cleanup(&napi->gro);
if (napi->thread) {
kthread_stop(napi->thread);
napi->thread = NULL;
}
}
-EXPORT_SYMBOL(__netif_napi_del);
+EXPORT_SYMBOL(__netif_napi_del_locked);
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
@@ -6665,14 +7447,9 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
return work;
}
- if (n->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(n, HZ >= 1000);
- }
-
- gro_normal_list(n);
+ /* Flush too old packets. If HZ < 1000, flush all packets */
+ gro_flush(&n->gro, HZ >= 1000);
+ gro_normal_list(&n->gro);
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
@@ -6700,9 +7477,14 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
work = __napi_poll(n, &do_repoll);
- if (do_repoll)
+ if (do_repoll) {
+#if defined(CONFIG_DEBUG_NET)
+ if (unlikely(!napi_is_scheduled(n)))
+ pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n",
+ n->dev->name, n->poll);
+#endif
list_add_tail(&n->poll_list, repoll);
-
+ }
netpoll_poll_unlock(have);
return work;
@@ -6710,8 +7492,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
static int napi_thread_wait(struct napi_struct *napi)
{
- bool woken = false;
-
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
@@ -6720,15 +7500,13 @@ static int napi_thread_wait(struct napi_struct *napi)
* Testing SCHED bit is not enough because SCHED bit might be
* set by some other busy poll thread or by napi_disable().
*/
- if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
+ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
WARN_ON(!list_empty(&napi->poll_list));
__set_current_state(TASK_RUNNING);
return 0;
}
schedule();
- /* woken being true indicates this thread owns this napi. */
- woken = true;
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
@@ -6736,55 +7514,66 @@ static int napi_thread_wait(struct napi_struct *napi)
return -1;
}
-static int napi_threaded_poll(void *data)
+static void napi_threaded_poll_loop(struct napi_struct *napi)
{
- struct napi_struct *napi = data;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
struct softnet_data *sd;
- void *have;
+ unsigned long last_qs = jiffies;
- while (!napi_thread_wait(napi)) {
- unsigned long last_qs = jiffies;
+ for (;;) {
+ bool repoll = false;
+ void *have;
- for (;;) {
- bool repoll = false;
+ local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
- local_bh_disable();
- sd = this_cpu_ptr(&softnet_data);
- sd->in_napi_threaded_poll = true;
+ sd = this_cpu_ptr(&softnet_data);
+ sd->in_napi_threaded_poll = true;
- have = netpoll_poll_lock(napi);
- __napi_poll(napi, &repoll);
- netpoll_poll_unlock(have);
+ have = netpoll_poll_lock(napi);
+ __napi_poll(napi, &repoll);
+ netpoll_poll_unlock(have);
- sd->in_napi_threaded_poll = false;
- barrier();
+ sd->in_napi_threaded_poll = false;
+ barrier();
- if (sd_has_rps_ipi_waiting(sd)) {
- local_irq_disable();
- net_rps_action_and_irq_enable(sd);
- }
- skb_defer_free_flush(sd);
- local_bh_enable();
+ if (sd_has_rps_ipi_waiting(sd)) {
+ local_irq_disable();
+ net_rps_action_and_irq_enable(sd);
+ }
+ skb_defer_free_flush(sd);
+ bpf_net_ctx_clear(bpf_net_ctx);
+ local_bh_enable();
- if (!repoll)
- break;
+ if (!repoll)
+ break;
- rcu_softirq_qs_periodic(last_qs);
- cond_resched();
- }
+ rcu_softirq_qs_periodic(last_qs);
+ cond_resched();
}
+}
+
+static int napi_threaded_poll(void *data)
+{
+ struct napi_struct *napi = data;
+
+ while (!napi_thread_wait(napi))
+ napi_threaded_poll_loop(napi);
+
return 0;
}
-static __latent_entropy void net_rx_action(struct softirq_action *h)
+static __latent_entropy void net_rx_action(void)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int budget = READ_ONCE(net_hotdata.netdev_budget);
LIST_HEAD(list);
LIST_HEAD(repoll);
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
start:
sd->in_net_rx_action = true;
local_irq_disable();
@@ -6821,7 +7610,8 @@ start:
*/
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
- sd->time_squeeze++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1);
break;
}
}
@@ -6837,7 +7627,8 @@ start:
sd->in_net_rx_action = false;
net_rps_action_and_irq_enable(sd);
-end:;
+end:
+ bpf_net_ctx_clear(bpf_net_ctx);
}
struct netdev_adjacent {
@@ -8453,27 +9244,29 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags;
+ unsigned int promiscuity, flags;
kuid_t uid;
kgid_t gid;
ASSERT_RTNL();
- dev->flags |= IFF_PROMISC;
- dev->promiscuity += inc;
- if (dev->promiscuity == 0) {
+ promiscuity = dev->promiscuity + inc;
+ if (promiscuity == 0) {
/*
* Avoid overflow.
* If inc causes overflow, untouch promisc and return error.
*/
- if (inc < 0)
- dev->flags &= ~IFF_PROMISC;
- else {
- dev->promiscuity -= inc;
+ if (unlikely(inc > 0)) {
netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
return -EOVERFLOW;
}
+ flags = old_flags & ~IFF_PROMISC;
+ } else {
+ flags = old_flags | IFF_PROMISC;
}
- if (dev->flags != old_flags) {
+ WRITE_ONCE(dev->promiscuity, promiscuity);
+ if (flags != old_flags) {
+ WRITE_ONCE(dev->flags, flags);
netdev_info(dev, "%s promiscuous mode\n",
dev->flags & IFF_PROMISC ? "entered" : "left");
if (audit_enabled) {
@@ -8491,23 +9284,20 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
dev_change_rx_flags(dev, IFF_PROMISC);
}
- if (notify)
+ if (notify) {
+ /* The ops lock is only required to ensure consistent locking
+ * for `NETDEV_CHANGE` notifiers. This function is sometimes
+ * called without the lock, even for devices that are ops
+ * locked, such as in `dev_uc_sync_multiple` when using
+ * bonding or teaming.
+ */
+ netdev_ops_assert_locked(dev);
__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
+ }
return 0;
}
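
[editorial sketch] __dev_set_promiscuity() above now computes the new reference count and flags into locals and publishes them with WRITE_ONCE(), detecting wrap-around before touching the device. A self-contained sketch of just the overflow test on the unsigned counter:

#include <stdio.h>

/* "count + inc == 0" with a positive inc can only mean the unsigned
 * counter wrapped; with a negative inc it is the legitimate last
 * reference going away.
 */
static int apply_inc(unsigned int *count, int inc)
{
	unsigned int next = *count + (unsigned int)inc;

	if (next == 0 && inc > 0)
		return -1;		/* -EOVERFLOW in the real code */
	*count = next;
	return 0;
}

int main(void)
{
	unsigned int promiscuity = 0;

	apply_inc(&promiscuity, 1);	/* first user: enter promisc mode */
	apply_inc(&promiscuity, -1);	/* last user: leave promisc mode  */

	promiscuity = ~0u;		/* force the wrap case */
	printf("overflow detected: %d\n", apply_inc(&promiscuity, 1));
	return 0;
}
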
-/**
- * dev_set_promiscuity - update promiscuity count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove promiscuity from a device. While the count in the device
- * remains above zero the interface remains promiscuous. Once it hits zero
- * the device reverts back to normal filtering operation. A negative inc
- * value is used to drop promiscuity on the device.
- * Return 0 if successful or a negative errno code on error.
- */
-int dev_set_promiscuity(struct net_device *dev, int inc)
+int netif_set_promiscuity(struct net_device *dev, int inc)
{
unsigned int old_flags = dev->flags;
int err;
@@ -8519,30 +9309,31 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
dev_set_rx_mode(dev);
return err;
}
-EXPORT_SYMBOL(dev_set_promiscuity);
-static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
+int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
+ unsigned int allmulti, flags;
ASSERT_RTNL();
- dev->flags |= IFF_ALLMULTI;
- dev->allmulti += inc;
- if (dev->allmulti == 0) {
+ allmulti = dev->allmulti + inc;
+ if (allmulti == 0) {
/*
* Avoid overflow.
* If inc causes overflow, untouch allmulti and return error.
*/
- if (inc < 0)
- dev->flags &= ~IFF_ALLMULTI;
- else {
- dev->allmulti -= inc;
+ if (unlikely(inc > 0)) {
netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
return -EOVERFLOW;
}
+ flags = old_flags & ~IFF_ALLMULTI;
+ } else {
+ flags = old_flags | IFF_ALLMULTI;
}
- if (dev->flags ^ old_flags) {
+ WRITE_ONCE(dev->allmulti, allmulti);
+ if (flags != old_flags) {
+ WRITE_ONCE(dev->flags, flags);
netdev_info(dev, "%s allmulticast mode\n",
dev->flags & IFF_ALLMULTI ? "entered" : "left");
dev_change_rx_flags(dev, IFF_ALLMULTI);
@@ -8554,25 +9345,6 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
return 0;
}
-/**
- * dev_set_allmulti - update allmulti count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove reception of all multicast frames to a device. While the
- * count in the device remains above zero the interface remains listening
- * to all interfaces. Once it hits zero the device reverts back to normal
- * filtering operation. A negative @inc value is used to drop the counter
- * when releasing a resource needing all multicasts.
- * Return 0 if successful or a negative errno code on error.
- */
-
-int dev_set_allmulti(struct net_device *dev, int inc)
-{
- return __dev_set_allmulti(dev, inc, true);
-}
-EXPORT_SYMBOL(dev_set_allmulti);
-
/*
* Upload unicast and multicast address lists to device and
* configure RX filtering. When the device doesn't support unicast
@@ -8688,7 +9460,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags,
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
- unsigned int old_flags = dev->flags;
+ old_flags = dev->flags;
dev->gflags ^= IFF_PROMISC;
@@ -8705,7 +9477,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags,
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
dev->gflags ^= IFF_ALLMULTI;
- __dev_set_allmulti(dev, inc, false);
+ netif_set_allmulti(dev, inc, false);
}
return ret;
@@ -8740,17 +9512,8 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
}
}
-/**
- * dev_change_flags - change device settings
- * @dev: device
- * @flags: device state flags
- * @extack: netlink extended ack
- *
- * Change settings on device based state flags. The flags are
- * in the userspace exported format.
- */
-int dev_change_flags(struct net_device *dev, unsigned int flags,
- struct netlink_ext_ack *extack)
+int netif_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
{
int ret;
unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
@@ -8763,7 +9526,6 @@ int dev_change_flags(struct net_device *dev, unsigned int flags,
__dev_notify_flags(dev, old_flags, changes, 0, NULL);
return ret;
}
-EXPORT_SYMBOL(dev_change_flags);
int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
@@ -8795,15 +9557,15 @@ int dev_validate_mtu(struct net_device *dev, int new_mtu,
}
/**
- * dev_set_mtu_ext - Change maximum transfer unit
+ * netif_set_mtu_ext - Change maximum transfer unit
* @dev: device
* @new_mtu: new transfer unit
* @extack: netlink extended ack
*
* Change the maximum transfer size of the network device.
*/
-int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
- struct netlink_ext_ack *extack)
+int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
{
int err, orig_mtu;
@@ -8841,25 +9603,20 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
return err;
}
-int dev_set_mtu(struct net_device *dev, int new_mtu)
+int netif_set_mtu(struct net_device *dev, int new_mtu)
{
struct netlink_ext_ack extack;
int err;
memset(&extack, 0, sizeof(extack));
- err = dev_set_mtu_ext(dev, new_mtu, &extack);
+ err = netif_set_mtu_ext(dev, new_mtu, &extack);
if (err && extack._msg)
net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
return err;
}
-EXPORT_SYMBOL(dev_set_mtu);
+EXPORT_SYMBOL(netif_set_mtu);
-/**
- * dev_change_tx_queue_len - Change TX queue length of a netdevice
- * @dev: device
- * @new_len: new tx queue length
- */
-int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
unsigned int orig_len = dev->tx_queue_len;
int res;
@@ -8868,7 +9625,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
return -ERANGE;
if (new_len != orig_len) {
- dev->tx_queue_len = new_len;
+ WRITE_ONCE(dev->tx_queue_len, new_len);
res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
res = notifier_to_errno(res);
if (res)
@@ -8882,16 +9639,11 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
err_rollback:
netdev_err(dev, "refused to change device tx_queue_len\n");
- dev->tx_queue_len = orig_len;
+ WRITE_ONCE(dev->tx_queue_len, orig_len);
return res;
}
-/**
- * dev_set_group - Change group this device belongs to
- * @dev: device
- * @new_group: group this device should belong to
- */
-void dev_set_group(struct net_device *dev, int new_group)
+void netif_set_group(struct net_device *dev, int new_group)
{
dev->group = new_group;
}
@@ -8917,31 +9669,23 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
}
EXPORT_SYMBOL(dev_pre_changeaddr_notify);
-/**
- * dev_set_mac_address - Change Media Access Control Address
- * @dev: device
- * @sa: new address
- * @extack: netlink extended ack
- *
- * Change the hardware (MAC) address of the device
- */
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
- struct netlink_ext_ack *extack)
+int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
if (!ops->ndo_set_mac_address)
return -EOPNOTSUPP;
- if (sa->sa_family != dev->type)
+ if (ss->ss_family != dev->type)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
+ err = dev_pre_changeaddr_notify(dev, ss->__data, extack);
if (err)
return err;
- if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
- err = ops->ndo_set_mac_address(dev, sa);
+ if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) {
+ err = ops->ndo_set_mac_address(dev, ss);
if (err)
return err;
}
@@ -8950,22 +9694,10 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
add_device_randomness(dev->dev_addr, dev->addr_len);
return 0;
}
-EXPORT_SYMBOL(dev_set_mac_address);
DECLARE_RWSEM(dev_addr_sem);
-int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
- struct netlink_ext_ack *extack)
-{
- int ret;
-
- down_write(&dev_addr_sem);
- ret = dev_set_mac_address(dev, sa, extack);
- up_write(&dev_addr_sem);
- return ret;
-}
-EXPORT_SYMBOL(dev_set_mac_address_user);
-
+/* "sa" is a true struct sockaddr with limited "sa_data" member. */
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
{
size_t size = sizeof(sa->sa_data_min);
@@ -8994,14 +9726,7 @@ unlock:
}
EXPORT_SYMBOL(dev_get_mac_address);
-/**
- * dev_change_carrier - Change device carrier
- * @dev: device
- * @new_carrier: new value
- *
- * Change device carrier
- */
-int dev_change_carrier(struct net_device *dev, bool new_carrier)
+int netif_change_carrier(struct net_device *dev, bool new_carrier)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -9112,15 +9837,9 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
}
EXPORT_SYMBOL(netdev_port_same_parent_id);
-/**
- * dev_change_proto_down - set carrier according to proto_down.
- *
- * @dev: device
- * @proto_down: new value
- */
-int dev_change_proto_down(struct net_device *dev, bool proto_down)
+int netif_change_proto_down(struct net_device *dev, bool proto_down)
{
- if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
+ if (!dev->change_proto_down)
return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
@@ -9128,32 +9847,35 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
netif_carrier_off(dev);
else
netif_carrier_on(dev);
- dev->proto_down = proto_down;
+ WRITE_ONCE(dev->proto_down, proto_down);
return 0;
}
/**
- * dev_change_proto_down_reason - proto down reason
+ * netdev_change_proto_down_reason_locked - proto down reason
*
* @dev: device
* @mask: proto down mask
* @value: proto down value
*/
-void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
- u32 value)
+void netdev_change_proto_down_reason_locked(struct net_device *dev,
+ unsigned long mask, u32 value)
{
+ u32 proto_down_reason;
int b;
if (!mask) {
- dev->proto_down_reason = value;
+ proto_down_reason = value;
} else {
+ proto_down_reason = dev->proto_down_reason;
for_each_set_bit(b, &mask, 32) {
if (value & (1 << b))
- dev->proto_down_reason |= BIT(b);
+ proto_down_reason |= BIT(b);
else
- dev->proto_down_reason &= ~BIT(b);
+ proto_down_reason &= ~BIT(b);
}
}
+ WRITE_ONCE(dev->proto_down_reason, proto_down_reason);
}
struct bpf_xdp_link {
@@ -9214,6 +9936,40 @@ u8 dev_xdp_prog_count(struct net_device *dev)
}
EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
+u8 dev_xdp_sb_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog &&
+ !dev->xdp_state[i].prog->aux->xdp_has_frags)
+ count++;
+ return count;
+}
+
+int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
+{
+ if (!dev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
+
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ bpf->command == XDP_SETUP_PROG &&
+ bpf->prog && !bpf->prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(bpf->extack,
+ "unable to propagate XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
+ if (dev_get_min_mp_channel_count(dev)) {
+ NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
+ return -EBUSY;
+ }
+
+ return dev->netdev_ops->ndo_bpf(dev, bpf);
+}
+EXPORT_SYMBOL_GPL(netif_xdp_propagate);
+
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
struct bpf_prog *prog = dev_xdp_prog(dev, mode);
@@ -9242,6 +9998,19 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
struct netdev_bpf xdp;
int err;
+ netdev_ops_assert_locked(dev);
+
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ prog && !prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
+ if (dev_get_min_mp_channel_count(dev)) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
+ return -EBUSY;
+ }
+
memset(&xdp, 0, sizeof(xdp));
xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
xdp.extack = extack;
@@ -9390,6 +10159,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
NL_SET_ERR_MSG(extack, "Program bound to different device");
return -EINVAL;
}
+ if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
+ NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
+ return -EINVAL;
+ }
if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
return -EINVAL;
@@ -9459,7 +10232,9 @@ static void bpf_xdp_link_release(struct bpf_link *link)
* already NULL, in which case link was already auto-detached
*/
if (xdp_link->dev) {
+ netdev_lock_ops(xdp_link->dev);
WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
+ netdev_unlock_ops(xdp_link->dev);
xdp_link->dev = NULL;
}
@@ -9541,10 +10316,12 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
goto out_unlock;
}
+ netdev_lock_ops(xdp_link->dev);
mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
xdp_link->flags, new_prog);
+ netdev_unlock_ops(xdp_link->dev);
if (err)
goto out_unlock;
@@ -9597,7 +10374,9 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
goto unlock;
}
+ netdev_lock_ops(dev);
err = dev_xdp_attach_link(dev, &extack, link);
+ netdev_unlock_ops(dev);
rtnl_unlock();
if (err) {
@@ -9666,6 +10445,20 @@ err_out:
return err;
}
+u32 dev_get_min_mp_channel_count(const struct net_device *dev)
+{
+ int i;
+
+ netdev_ops_assert_locked(dev);
+
+ for (i = dev->real_num_rx_queues - 1; i >= 0; i--)
+ if (dev->_rx[i].mp_params.mp_priv)
+ /* The channel count is the idx plus 1. */
+ return i + 1;
+
+ return 0;
+}
+
/**
* dev_index_reserve() - allocate an ifindex in a namespace
* @net: the applicable net namespace
@@ -9703,6 +10496,15 @@ static void dev_index_release(struct net *net, int ifindex)
WARN_ON(xa_erase(&net->dev_by_index, ifindex));
}
+static bool from_cleanup_net(void)
+{
+#ifdef CONFIG_NET_NS
+ return current == READ_ONCE(cleanup_net_task);
+#else
+ return false;
+#endif
+}
+
/* Delayed registration/unregisteration */
LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
@@ -9745,6 +10547,7 @@ static void netdev_sync_lower_features(struct net_device *upper,
if (!(features & feature) && (lower->features & feature)) {
netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
&feature, lower->name);
+ netdev_lock_ops(lower);
lower->wanted_features &= ~feature;
__netdev_update_features(lower);
@@ -9753,10 +10556,20 @@ static void netdev_sync_lower_features(struct net_device *upper,
&feature, lower->name);
else
netdev_features_change(lower);
+ netdev_unlock_ops(lower);
}
}
}
+static bool netdev_has_ip_or_hw_csum(netdev_features_t features)
+{
+ netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+ bool ip_csum = (features & ip_csum_mask) == ip_csum_mask;
+ bool hw_csum = features & NETIF_F_HW_CSUM;
+
+ return ip_csum || hw_csum;
+}
+
static netdev_features_t netdev_fix_features(struct net_device *dev,
netdev_features_t features)
{
@@ -9838,15 +10651,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~NETIF_F_LRO;
}
- if (features & NETIF_F_HW_TLS_TX) {
- bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
- (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
- bool hw_csum = features & NETIF_F_HW_CSUM;
-
- if (!ip_csum && !hw_csum) {
- netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
- features &= ~NETIF_F_HW_TLS_TX;
- }
+ if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) {
+ netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_TX;
}
if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
@@ -9854,6 +10661,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~NETIF_F_HW_TLS_RX;
}
+ if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) {
+ netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
+ features &= ~NETIF_F_GSO_UDP_L4;
+ }
+
return features;
}
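
[editorial sketch] netdev_fix_features() above factors the "has any checksum offload" test into netdev_has_ip_or_hw_csum() and applies it to both TLS TX and the newly covered UDP GSO (USO) feature. A self-contained sketch of that dependency fixup on plain bit flags; the flag values are illustrative, not the real NETIF_F_* bits:

#include <stdio.h>

#define F_IP_CSUM	(1u << 0)
#define F_IPV6_CSUM	(1u << 1)
#define F_HW_CSUM	(1u << 2)
#define F_HW_TLS_TX	(1u << 3)
#define F_GSO_UDP_L4	(1u << 4)

static int has_ip_or_hw_csum(unsigned int f)
{
	unsigned int ip_mask = F_IP_CSUM | F_IPV6_CSUM;

	return (f & ip_mask) == ip_mask || (f & F_HW_CSUM);
}

static unsigned int fix_features(unsigned int f)
{
	/* features that need the device to produce checksums are dropped
	 * when no checksum offload is advertised
	 */
	if ((f & F_HW_TLS_TX) && !has_ip_or_hw_csum(f))
		f &= ~F_HW_TLS_TX;
	if ((f & F_GSO_UDP_L4) && !has_ip_or_hw_csum(f))
		f &= ~F_GSO_UDP_L4;
	return f;
}

int main(void)
{
	unsigned int f = F_HW_TLS_TX | F_GSO_UDP_L4;	/* no csum offload */

	printf("fixed features: %#x\n", fix_features(f));	/* -> 0 */
	return 0;
}
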
@@ -9865,6 +10677,7 @@ int __netdev_update_features(struct net_device *dev)
int err = -1;
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
features = netdev_get_wanted_features(dev);
@@ -10158,6 +10971,17 @@ static void netdev_do_free_pcpu_stats(struct net_device *dev)
}
}
+static void netdev_free_phy_link_topology(struct net_device *dev)
+{
+ struct phy_link_topology *topo = dev->link_topo;
+
+ if (IS_ENABLED(CONFIG_PHYLIB) && topo) {
+ xa_destroy(&topo->phys);
+ kfree(topo);
+ dev->link_topo = NULL;
+ }
+}
+
/**
* register_netdevice() - register a network device
* @dev: device to register
@@ -10187,6 +11011,10 @@ int register_netdevice(struct net_device *dev)
if (ret)
return ret;
+ /* rss ctx ID 0 is reserved for the default context, start from 1 */
+ xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1);
+ mutex_init(&dev->ethtool->rss_lock);
+
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
@@ -10276,12 +11104,16 @@ int register_netdevice(struct net_device *dev)
ret = netdev_register_kobject(dev);
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
+ netdev_unlock(dev);
if (ret)
goto err_uninit_notify;
+ netdev_lock_ops(dev);
__netdev_update_features(dev);
+ netdev_unlock_ops(dev);
/*
* Default initial state at registry is that the
@@ -10307,7 +11139,9 @@ int register_netdevice(struct net_device *dev)
memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
/* Notify protocols, that a new device appeared. */
+ netdev_lock_ops(dev);
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ netdev_unlock_ops(dev);
ret = notifier_to_errno(ret);
if (ret) {
/* Expect explicit free_netdev() on failure */
@@ -10319,8 +11153,7 @@ int register_netdevice(struct net_device *dev)
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
out:
@@ -10343,47 +11176,26 @@ err_free_name:
}
EXPORT_SYMBOL(register_netdevice);
-/**
- * init_dummy_netdev - init a dummy network device for NAPI
- * @dev: device to init
- *
- * This takes a network device structure and initialize the minimum
- * amount of fields so it can be used to schedule NAPI polls without
- * registering a full blown interface. This is to be used by drivers
- * that need to tie several hardware interfaces to a single NAPI
- * poll scheduler due to HW limitations.
+/* Initialize the core of a dummy net device.
+ * These are the setup steps dummy netdevs need, which normal netdevs
+ * otherwise get by going through register_netdevice().
*/
-void init_dummy_netdev(struct net_device *dev)
+static void init_dummy_netdev(struct net_device *dev)
{
- /* Clear everything. Note we don't initialize spinlocks
- * are they aren't supposed to be taken by any of the
- * NAPI code and this dummy netdev is supposed to be
- * only ever used for NAPI polls
- */
- memset(dev, 0, sizeof(struct net_device));
-
/* make sure we BUG if trying to hit standard
* register/unregister code path
*/
dev->reg_state = NETREG_DUMMY;
- /* NAPI wants this */
- INIT_LIST_HEAD(&dev->napi_list);
-
/* a dummy interface is started by default */
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
- /* napi_busy_loop stats accounting wants this */
- dev_net_set(dev, &init_net);
-
/* Note : We dont allocate pcpu_refcnt for dummy devices,
* because users of this 'device' dont need to change
* its refcount.
*/
}
-EXPORT_SYMBOL_GPL(init_dummy_netdev);
-
/**
* register_netdev - register a network device
@@ -10400,12 +11212,16 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev);
*/
int register_netdev(struct net_device *dev)
{
+ struct net *net = dev_net(dev);
int err;
- if (rtnl_lock_killable())
+ if (rtnl_net_lock_killable(net))
return -EINTR;
+
err = register_netdevice(dev);
- rtnl_unlock();
+
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdev);
@@ -10482,8 +11298,9 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
rebroadcast_time = jiffies;
}
+ rcu_barrier();
+
if (!wait) {
- rcu_barrier();
wait = WAIT_REFS_MIN_MSECS;
} else {
msleep(wait);
@@ -10542,9 +11359,8 @@ void netdev_run_todo(void)
list_replace_init(&net_unlink_list, &unlink_list);
while (!list_empty(&unlink_list)) {
- struct net_device *dev = list_first_entry(&unlink_list,
- struct net_device,
- unlink_list);
+ dev = list_first_entry(&unlink_list, struct net_device,
+ unlink_list);
list_del_init(&dev->unlink_list);
dev->nested_level = dev->lower_level - 1;
}
@@ -10566,7 +11382,9 @@ void netdev_run_todo(void)
continue;
}
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
+ netdev_unlock(dev);
linkwatch_sync_dev(dev);
}
@@ -10597,6 +11415,54 @@ void netdev_run_todo(void)
wake_up(&netdev_unregistering_wq);
}
+/* Collate per-cpu network dstats statistics
+ *
+ * Read per-cpu network statistics from dev->dstats and populate the related
+ * fields in @s.
+ */
+static void dev_fetch_dstats(struct rtnl_link_stats64 *s,
+ const struct pcpu_dstats __percpu *dstats)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ u64 rx_packets, rx_bytes, rx_drops;
+ u64 tx_packets, tx_bytes, tx_drops;
+ const struct pcpu_dstats *stats;
+ unsigned int start;
+
+ stats = per_cpu_ptr(dstats, cpu);
+ do {
+ start = u64_stats_fetch_begin(&stats->syncp);
+ rx_packets = u64_stats_read(&stats->rx_packets);
+ rx_bytes = u64_stats_read(&stats->rx_bytes);
+ rx_drops = u64_stats_read(&stats->rx_drops);
+ tx_packets = u64_stats_read(&stats->tx_packets);
+ tx_bytes = u64_stats_read(&stats->tx_bytes);
+ tx_drops = u64_stats_read(&stats->tx_drops);
+ } while (u64_stats_fetch_retry(&stats->syncp, start));
+
+ s->rx_packets += rx_packets;
+ s->rx_bytes += rx_bytes;
+ s->rx_dropped += rx_drops;
+ s->tx_packets += tx_packets;
+ s->tx_bytes += tx_bytes;
+ s->tx_dropped += tx_drops;
+ }
+}
+
+/* ndo_get_stats64 implementation for dstats-based accounting.
+ *
+ * Populate @s from dev->stats and dev->dstats. This is used internally by the
+ * core for NETDEV_PCPU_STAT_DSTAT-type stats collection.
+ */
+static void dev_get_dstats64(const struct net_device *dev,
+ struct rtnl_link_stats64 *s)
+{
+ netdev_stats_to_stats64(s, &dev->stats);
+ dev_fetch_dstats(s, dev->dstats);
+}
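
A simplified user-space sketch of the fold done by dev_fetch_dstats(); the sequence-counter retry below is only illustrative (the real code relies on u64_stats_fetch_begin()/_retry() with proper memory ordering):

#include <stdio.h>

struct cpu_stats {
        unsigned int seq;               /* odd while a writer is mid-update */
        unsigned long long rx_packets;
        unsigned long long rx_bytes;
};

static void fold_stats(const struct cpu_stats *per_cpu, int ncpus,
                       unsigned long long *pkts, unsigned long long *bytes)
{
        int cpu;

        for (cpu = 0; cpu < ncpus; cpu++) {
                unsigned long long p, b;
                unsigned int start;

                do {    /* u64_stats_fetch_begin()/_retry() analogue */
                        start = per_cpu[cpu].seq;
                        p = per_cpu[cpu].rx_packets;
                        b = per_cpu[cpu].rx_bytes;
                } while ((start & 1) || per_cpu[cpu].seq != start);

                *pkts += p;
                *bytes += b;
        }
}

int main(void)
{
        struct cpu_stats stats[2] = {
                { .rx_packets = 10, .rx_bytes = 1000 },
                { .rx_packets = 5,  .rx_bytes = 500  },
        };
        unsigned long long pkts = 0, bytes = 0;

        fold_stats(stats, 2, &pkts, &bytes);
        printf("%llu packets, %llu bytes\n", pkts, bytes); /* 15, 1500 */
        return 0;
}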
+
/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
* all the same fields in the same order as net_device_stats, with only
* the type differing, but rtnl_link_stats64 may have additional fields
@@ -10645,7 +11511,7 @@ noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
return;
}
- field = (__force unsigned long __percpu *)((__force void *)p + offset);
+ field = (unsigned long __percpu *)((void __percpu *)p + offset);
this_cpu_inc(*field);
}
EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
@@ -10666,6 +11532,20 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
const struct net_device_ops *ops = dev->netdev_ops;
const struct net_device_core_stats __percpu *p;
+ /*
+ * IPv{4,6} and UDP tunnels share common stat helpers and use
+ * different stat types (NETDEV_PCPU_STAT_TSTATS vs
+ * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent.
+ */
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
+ offsetof(struct pcpu_dstats, rx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
+ offsetof(struct pcpu_dstats, rx_packets));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
+ offsetof(struct pcpu_dstats, tx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
+ offsetof(struct pcpu_dstats, tx_packets));
+
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
ops->ndo_get_stats64(dev, storage);
@@ -10673,6 +11553,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
} else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
dev_get_tstats64(dev, storage);
+ } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
+ dev_get_dstats64(dev, storage);
} else {
netdev_stats_to_stats64(storage, &dev->stats);
}
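
The BUILD_BUG_ON() layout checks added above can be approximated in plain C11 with offsetof() and _Static_assert(); the structs below are hypothetical stand-ins for pcpu_sw_netstats / pcpu_dstats:

#include <stddef.h>

struct sw_netstats_stub { unsigned long long rx_packets, rx_bytes; };
struct dstats_stub      { unsigned long long rx_packets, rx_bytes, rx_drops; };

/* Compilation fails if the shared fields ever drift apart. */
_Static_assert(offsetof(struct sw_netstats_stub, rx_packets) ==
               offsetof(struct dstats_stub, rx_packets),
               "rx_packets offset mismatch");
_Static_assert(offsetof(struct sw_netstats_stub, rx_bytes) ==
               offsetof(struct dstats_stub, rx_bytes),
               "rx_bytes offset mismatch");

int main(void)
{
        return 0;
}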
@@ -10784,19 +11666,12 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
WARN_ON(dev->reg_state == NETREG_REGISTERED);
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
- dev->gro_flush_timeout = 20000;
- dev->napi_defer_hard_irqs = 1;
+ netdev_set_gro_flush_timeout(dev, 20000);
+ netdev_set_defer_hard_irqs(dev, 1);
}
}
EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
-void netdev_freemem(struct net_device *dev)
-{
- char *addr = (char *)dev - dev->padded;
-
- kvfree(addr);
-}
-
/**
* alloc_netdev_mqs - allocate network device
* @sizeof_priv: size of private data to allocate space for
@@ -10816,8 +11691,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
- unsigned int alloc_size;
- struct net_device *p;
+ size_t napi_config_sz;
+ unsigned int maxqs;
BUG_ON(strlen(name) >= sizeof(dev->name));
@@ -10831,21 +11706,14 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
return NULL;
}
- alloc_size = sizeof(struct net_device);
- if (sizeof_priv) {
- /* ensure 32-byte alignment of private area */
- alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
- alloc_size += sizeof_priv;
- }
- /* ensure 32-byte alignment of whole construct */
- alloc_size += NETDEV_ALIGN - 1;
+ maxqs = max(txqs, rxqs);
- p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
- if (!p)
+ dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
+ GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
+ if (!dev)
return NULL;
- dev = PTR_ALIGN(p, NETDEV_ALIGN);
- dev->padded = (char *)dev - (char *)p;
+ dev->priv_len = sizeof_priv;
ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
#ifdef CONFIG_PCPU_DEV_REFCNT
@@ -10892,6 +11760,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
+
+ mutex_init(&dev->lock);
+
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
@@ -10909,8 +11780,21 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev))
goto free_all;
+ dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT);
+ if (!dev->ethtool)
+ goto free_all;
+
+ dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT);
+ if (!dev->cfg)
+ goto free_all;
+ dev->cfg_pending = dev->cfg;
- strcpy(dev->name, name);
+ napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
+ dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
+ if (!dev->napi_config)
+ goto free_all;
+
+ strscpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
@@ -10929,11 +11813,27 @@ free_pcpu:
free_percpu(dev->pcpu_refcnt);
free_dev:
#endif
- netdev_freemem(dev);
+ kvfree(dev);
return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
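
The allocation rework above replaces the manual NETDEV_ALIGN padding with a single flexible-array allocation. A user-space approximation (calloc() standing in for kvzalloc()/struct_size(); no overflow check shown):

#include <stdio.h>
#include <stdlib.h>

struct device_stub {
        char name[16];
        unsigned int priv_len;
        unsigned char priv[];   /* private area follows the struct directly */
};

static struct device_stub *alloc_dev(unsigned int sizeof_priv)
{
        /* one zeroed allocation for header + priv, as struct_size() computes */
        struct device_stub *dev = calloc(1, sizeof(*dev) + sizeof_priv);

        if (!dev)
                return NULL;
        dev->priv_len = sizeof_priv;
        return dev;
}

int main(void)
{
        struct device_stub *dev = alloc_dev(128);

        if (!dev)
                return 1;
        snprintf(dev->name, sizeof(dev->name), "dummy0");
        printf("%s: %u priv bytes at %p\n", dev->name, dev->priv_len,
               (void *)dev->priv);
        free(dev);      /* a single kvfree(dev) on the kernel side */
        return 0;
}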
+static void netdev_napi_exit(struct net_device *dev)
+{
+ if (!list_empty(&dev->napi_list)) {
+ struct napi_struct *p, *n;
+
+ netdev_lock(dev);
+ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+ __netif_napi_del_locked(p);
+ netdev_unlock(dev);
+
+ synchronize_net();
+ }
+
+ kvfree(dev->napi_config);
+}
+
/**
* free_netdev - free network device
* @dev: device
@@ -10945,8 +11845,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
*/
void free_netdev(struct net_device *dev)
{
- struct napi_struct *p, *n;
-
might_sleep();
/* When called immediately after register_netdevice() failed the unwind
@@ -10959,6 +11857,9 @@ void free_netdev(struct net_device *dev)
return;
}
+ WARN_ON(dev->cfg != dev->cfg_pending);
+ kfree(dev->cfg);
+ kfree(dev->ethtool);
netif_free_tx_queues(dev);
netif_free_rx_queues(dev);
@@ -10967,8 +11868,9 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
- list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
- netif_napi_del(p);
+ netdev_napi_exit(dev);
+
+ netif_del_cpu_rmap(dev);
ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
@@ -10980,9 +11882,14 @@ void free_netdev(struct net_device *dev)
free_percpu(dev->xdp_bulkq);
dev->xdp_bulkq = NULL;
+ netdev_free_phy_link_topology(dev);
+
+ mutex_destroy(&dev->lock);
+
/* Compatibility with error handling in drivers */
- if (dev->reg_state == NETREG_UNINITIALIZED) {
- netdev_freemem(dev);
+ if (dev->reg_state == NETREG_UNINITIALIZED ||
+ dev->reg_state == NETREG_DUMMY) {
+ kvfree(dev);
return;
}
@@ -10995,6 +11902,19 @@ void free_netdev(struct net_device *dev)
EXPORT_SYMBOL(free_netdev);
/**
+ * alloc_netdev_dummy - Allocate and initialize a dummy net device.
+ * @sizeof_priv: size of private data to allocate space for
+ *
+ * Return: the allocated net_device on success, NULL otherwise
+ */
+struct net_device *alloc_netdev_dummy(int sizeof_priv)
+{
+ return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
+ init_dummy_netdev);
+}
+EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
+
+/**
* synchronize_net - Synchronize with packet receive processing
*
* Wait for packets currently being received to be done.
@@ -11003,13 +11923,41 @@ EXPORT_SYMBOL(free_netdev);
void synchronize_net(void)
{
might_sleep();
- if (rtnl_is_locked())
+ if (from_cleanup_net() || rtnl_is_locked())
synchronize_rcu_expedited();
else
synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
+static void netdev_rss_contexts_free(struct net_device *dev)
+{
+ struct ethtool_rxfh_context *ctx;
+ unsigned long context;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
+ struct ethtool_rxfh_param rxfh;
+
+ rxfh.indir = ethtool_rxfh_context_indir(ctx);
+ rxfh.key = ethtool_rxfh_context_key(ctx);
+ rxfh.hfunc = ctx->hfunc;
+ rxfh.input_xfrm = ctx->input_xfrm;
+ rxfh.rss_context = context;
+ rxfh.rss_delete = true;
+
+ xa_erase(&dev->ethtool->rss_ctx, context);
+ if (dev->ethtool_ops->create_rxfh_context)
+ dev->ethtool_ops->remove_rxfh_context(dev, ctx,
+ context, NULL);
+ else
+ dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL);
+ kfree(ctx);
+ }
+ xa_destroy(&dev->ethtool->rss_ctx);
+ mutex_unlock(&dev->ethtool->rss_lock);
+}
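
A stand-alone sketch of the teardown loop above, with a small pointer table standing in for the ethtool xarray and a hypothetical driver callback:

#include <stdio.h>
#include <stdlib.h>

#define MAX_CTX 4

struct rss_ctx_stub { unsigned int id; };

static void driver_remove_ctx(struct rss_ctx_stub *ctx)  /* hypothetical hook */
{
        printf("driver: removing rss context %u\n", ctx->id);
}

static void rss_contexts_free(struct rss_ctx_stub *table[MAX_CTX])
{
        unsigned int i;

        for (i = 0; i < MAX_CTX; i++) {
                if (!table[i])
                        continue;
                driver_remove_ctx(table[i]);  /* remove_rxfh_context()/set_rxfh() step */
                free(table[i]);
                table[i] = NULL;              /* xa_erase() analogue */
        }
}

int main(void)
{
        struct rss_ctx_stub *table[MAX_CTX] = { NULL };

        table[1] = malloc(sizeof(*table[1]));
        if (table[1])
                table[1]->id = 1;       /* ctx 0 is the reserved default context */
        rss_contexts_free(table);
        return 0;
}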
+
/**
* unregister_netdevice_queue - remove device from the kernel
* @dev: device
@@ -11038,6 +11986,19 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
}
EXPORT_SYMBOL(unregister_netdevice_queue);
+static void dev_memory_provider_uninstall(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->real_num_rx_queues; i++) {
+ struct netdev_rx_queue *rxq = &dev->_rx[i];
+ struct pp_memory_provider_params *p = &rxq->mp_params;
+
+ if (p->mp_ops && p->mp_ops->uninstall)
+ p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq);
+ }
+}
+
void unregister_netdevice_many_notify(struct list_head *head,
u32 portid, const struct nlmsghdr *nlh)
{
@@ -11068,15 +12029,29 @@ void unregister_netdevice_many_notify(struct list_head *head,
BUG_ON(dev->reg_state != NETREG_REGISTERED);
}
- /* If device is running, close it first. */
- list_for_each_entry(dev, head, unreg_list)
- list_add_tail(&dev->close_list, &close_head);
+ /* If device is running, close it first. Start with ops locked... */
+ list_for_each_entry(dev, head, unreg_list) {
+ if (netdev_need_ops_lock(dev)) {
+ list_add_tail(&dev->close_list, &close_head);
+ netdev_lock(dev);
+ }
+ }
+ dev_close_many(&close_head, true);
+ /* ... now unlock them and go over the rest. */
+ list_for_each_entry(dev, head, unreg_list) {
+ if (netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+ else
+ list_add_tail(&dev->close_list, &close_head);
+ }
dev_close_many(&close_head, true);
list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
unlist_netdevice(dev);
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
+ netdev_unlock(dev);
}
flush_all_backlogs();
@@ -11086,9 +12061,12 @@ void unregister_netdevice_many_notify(struct list_head *head,
struct sk_buff *skb = NULL;
/* Shutdown queueing discipline. */
+ netdev_lock_ops(dev);
dev_shutdown(dev);
dev_tcx_uninstall(dev);
dev_xdp_uninstall(dev);
+ dev_memory_provider_uninstall(dev);
+ netdev_unlock_ops(dev);
bpf_dev_bound_netdev_unregister(dev);
netdev_offload_xstats_disable_all(dev);
@@ -11098,8 +12076,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
GFP_KERNEL, NULL, 0,
portid, nlh);
@@ -11113,11 +12090,17 @@ void unregister_netdevice_many_notify(struct list_head *head,
netdev_name_node_alt_flush(dev);
netdev_name_node_free(dev->name_node);
+ netdev_rss_contexts_free(dev);
+
call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
+ mutex_destroy(&dev->ethtool->rss_lock);
+
+ net_shaper_flush_netdev(dev);
+
if (skb)
rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
@@ -11150,7 +12133,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
* @head: list of devices
*
* Note: As most callers use a stack allocated list_head,
- * we force a list_del() to make sure stack wont be corrupted later.
+ * we force a list_del() to make sure stack won't be corrupted later.
*/
void unregister_netdevice_many(struct list_head *head)
{
@@ -11171,30 +12154,15 @@ EXPORT_SYMBOL(unregister_netdevice_many);
*/
void unregister_netdev(struct net_device *dev)
{
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
unregister_netdevice(dev);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
}
EXPORT_SYMBOL(unregister_netdev);
-/**
- * __dev_change_net_namespace - move device to different nethost namespace
- * @dev: device
- * @net: network namespace
- * @pat: If not NULL name pattern to try if the current device name
- * is already taken in the destination network namespace.
- * @new_ifindex: If not zero, specifies device index in the target
- * namespace.
- *
- * This function shuts down a device interface and moves it
- * to a new network namespace. On success 0 is returned, on
- * a failure a netagive errno code is returned.
- *
- * Callers must hold the rtnl semaphore.
- */
-
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
- const char *pat, int new_ifindex)
+ const char *pat, int new_ifindex,
+ struct netlink_ext_ack *extack)
{
struct netdev_name_node *name_node;
struct net *net_old = dev_net(dev);
@@ -11205,12 +12173,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
/* Don't allow namespace local devices to be moved. */
err = -EINVAL;
- if (dev->features & NETIF_F_NETNS_LOCAL)
+ if (dev->netns_immutable) {
+ NL_SET_ERR_MSG(extack, "The interface netns is immutable");
goto out;
+ }
- /* Ensure the device has been registrered */
- if (dev->reg_state != NETREG_REGISTERED)
+ /* Ensure the device has been registered */
+ if (dev->reg_state != NETREG_REGISTERED) {
+ NL_SET_ERR_MSG(extack, "The interface isn't registered");
goto out;
+ }
/* Get out if there is nothing todo */
err = 0;
@@ -11223,30 +12195,49 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
err = -EEXIST;
if (netdev_name_in_use(net, dev->name)) {
/* We get here if we can't use the current device name */
- if (!pat)
+ if (!pat) {
+ NL_SET_ERR_MSG(extack,
+ "An interface with the same name exists in the target netns");
goto out;
+ }
err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Unable to use '%s' for the new interface name in the target netns",
+ pat);
goto out;
+ }
}
/* Check that none of the altnames conflicts. */
err = -EEXIST;
- netdev_for_each_altname(dev, name_node)
- if (netdev_name_in_use(net, name_node->name))
+ netdev_for_each_altname(dev, name_node) {
+ if (netdev_name_in_use(net, name_node->name)) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "An interface with the altname %s exists in the target netns",
+ name_node->name);
goto out;
+ }
+ }
/* Check that new_ifindex isn't used yet. */
if (new_ifindex) {
err = dev_index_reserve(net, new_ifindex);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "The ifindex %d is not available in the target netns",
+ new_ifindex);
goto out;
+ }
} else {
/* If there is an ifindex conflict assign a new one */
err = dev_index_reserve(net, dev->ifindex);
if (err == -EBUSY)
err = dev_index_reserve(net, 0);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Unable to allocate a new ifindex in the target netns");
goto out;
+ }
new_ifindex = err;
}
@@ -11254,16 +12245,23 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
* And now a mini version of register_netdevice unregister_netdevice.
*/
+ netdev_lock_ops(dev);
/* If device is running close it first. */
- dev_close(dev);
-
+ netif_close(dev);
/* And unlink it from device chain */
unlist_netdevice(dev);
+ if (!netdev_need_ops_lock(dev))
+ netdev_lock(dev);
+ dev->moving_ns = true;
+ netdev_unlock(dev);
+
synchronize_net();
/* Shutdown queueing discipline. */
+ netdev_lock_ops(dev);
dev_shutdown(dev);
+ netdev_unlock_ops(dev);
/* Notify protocols, that we are about to destroy
* this device. They should clean all the things.
@@ -11294,11 +12292,17 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
move_netdevice_notifiers_dev_net(dev, net);
/* Actually switch the network namespace */
+ netdev_lock(dev);
dev_net_set(dev, net);
+ netdev_unlock(dev);
dev->ifindex = new_ifindex;
- if (new_name[0]) /* Rename the netdev to prepared name */
+ if (new_name[0]) {
+ /* Rename the netdev to prepared name */
+ write_seqlock_bh(&netdev_rename_lock);
strscpy(dev->name, new_name, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
+ }
/* Fixup kobjects */
dev_set_uevent_suppress(&dev->dev, 1);
@@ -11316,11 +12320,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
err = netdev_change_owner(dev, net_old, net);
WARN_ON(err);
+ netdev_lock(dev);
+ dev->moving_ns = false;
+ if (!netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+
/* Add the device back in the hashes */
list_netdevice(dev);
-
/* Notify protocols, that a new device appeared. */
call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ netdev_unlock_ops(dev);
/*
* Prevent userspace races by waiting until the network
@@ -11333,7 +12342,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
out:
return err;
}
-EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
static int dev_cpu_dead(unsigned int oldcpu)
{
@@ -11373,7 +12381,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
list_del_init(&napi->poll_list);
if (napi->poll == process_backlog)
- napi->state = 0;
+ napi->state &= NAPIF_STATE_THREADED;
else
____napi_schedule(sd, napi);
}
@@ -11381,21 +12389,23 @@ static int dev_cpu_dead(unsigned int oldcpu)
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
+ if (!use_backlog_threads()) {
#ifdef CONFIG_RPS
- remsd = oldsd->rps_ipi_list;
- oldsd->rps_ipi_list = NULL;
+ remsd = oldsd->rps_ipi_list;
+ oldsd->rps_ipi_list = NULL;
#endif
- /* send out pending IPI's on offline CPU */
- net_rps_send_ipi(remsd);
+ /* send out pending IPI's on offline CPU */
+ net_rps_send_ipi(remsd);
+ }
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
netif_rx(skb);
- input_queue_head_incr(oldsd);
+ rps_input_queue_head_incr(oldsd);
}
while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx(skb);
- input_queue_head_incr(oldsd);
+ rps_input_queue_head_incr(oldsd);
}
return 0;
@@ -11446,7 +12456,7 @@ static struct hlist_head * __net_init netdev_create_hash(void)
static int __net_init netdev_init(struct net *net)
{
BUILD_BUG_ON(GRO_HASH_BUCKETS >
- 8 * sizeof_field(struct napi_struct, gro_bitmask));
+ BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask));
INIT_LIST_HEAD(&net->dev_base_head);
@@ -11581,7 +12591,7 @@ static void __net_exit default_device_exit_net(struct net *net)
char fb_name[IFNAMSIZ];
/* Ignore unmoveable devices (i.e. loopback) */
- if (dev->features & NETIF_F_NETNS_LOCAL)
+ if (dev->netns_immutable)
continue;
/* Leave virtual devices for the generic cleanup */
@@ -11642,7 +12652,7 @@ static struct pernet_operations __net_initdata default_device_ops = {
static void __init net_dev_struct_check(void)
{
/* TX read-mostly hotpath */
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
@@ -11680,8 +12690,6 @@ static void __init net_dev_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
@@ -11693,7 +12701,7 @@ static void __init net_dev_struct_check(void)
#ifdef CONFIG_NET_XGRESS
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
#endif
- CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
}
/*
@@ -11712,19 +12720,58 @@ static int net_page_pool_create(int cpuid)
struct page_pool_params page_pool_params = {
.pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
.flags = PP_FLAG_SYSTEM_POOL,
- .nid = NUMA_NO_NODE,
+ .nid = cpu_to_mem(cpuid),
};
struct page_pool *pp_ptr;
+ int err;
pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
if (IS_ERR(pp_ptr))
return -ENOMEM;
- per_cpu(system_page_pool, cpuid) = pp_ptr;
+ err = xdp_reg_page_pool(pp_ptr);
+ if (err) {
+ page_pool_destroy(pp_ptr);
+ return err;
+ }
+
+ per_cpu(system_page_pool.pool, cpuid) = pp_ptr;
#endif
return 0;
}
+static int backlog_napi_should_run(unsigned int cpu)
+{
+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
+ struct napi_struct *napi = &sd->backlog;
+
+ return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+}
+
+static void run_backlog_napi(unsigned int cpu)
+{
+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
+
+ napi_threaded_poll_loop(&sd->backlog);
+}
+
+static void backlog_napi_setup(unsigned int cpu)
+{
+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
+ struct napi_struct *napi = &sd->backlog;
+
+ napi->thread = this_cpu_read(backlog_napi);
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+}
+
+static struct smp_hotplug_thread backlog_threads = {
+ .store = &backlog_napi,
+ .thread_should_run = backlog_napi_should_run,
+ .thread_fn = run_backlog_napi,
+ .thread_comm = "backlog_napi/%u",
+ .setup = backlog_napi_setup,
+};
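
Roughly, the smpboot registration above gives each CPU a parked "backlog_napi/N" thread that runs only when its backlog NAPI is scheduled. A loose user-space analogue with pthreads, polling a flag instead of parking (compile with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NR_CPUS 2

static atomic_bool should_run[NR_CPUS]; /* NAPI_STATE_SCHED_THREADED stand-in */
static atomic_bool stop;

static void *backlog_worker(void *arg)
{
        int cpu = (int)(long)arg;

        while (!atomic_load(&stop)) {
                if (atomic_exchange(&should_run[cpu], false))
                        printf("cpu%d: processing backlog\n", cpu);
                usleep(1000);   /* smpboot parks the thread instead of polling */
        }
        return NULL;
}

int main(void)
{
        pthread_t tid[NR_CPUS];
        long cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                pthread_create(&tid[cpu], NULL, backlog_worker, (void *)cpu);

        atomic_store(&should_run[1], true);     /* "schedule" the backlog on cpu1 */
        usleep(10000);
        atomic_store(&stop, true);
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                pthread_join(tid[cpu], NULL);
        return 0;
}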
+
/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
@@ -11753,12 +12800,13 @@ static int __init net_dev_init(void)
* Initialise the packet receive queues.
*/
+ flush_backlogs_fallback = flush_backlogs_alloc();
+ if (!flush_backlogs_fallback)
+ goto out;
+
for_each_possible_cpu(i) {
- struct work_struct *flush = per_cpu_ptr(&flush_works, i);
struct softnet_data *sd = &per_cpu(softnet_data, i);
- INIT_WORK(flush, flush_backlog);
-
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
@@ -11773,13 +12821,16 @@ static int __init net_dev_init(void)
INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
spin_lock_init(&sd->defer_lock);
- init_gro_hash(&sd->backlog);
+ gro_init(&sd->backlog.gro);
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
+ INIT_LIST_HEAD(&sd->backlog.poll_list);
if (net_page_pool_create(i))
goto out;
}
+ if (use_backlog_threads())
+ smpboot_register_percpu_thread(&backlog_threads);
dev_boot_phase = 0;
@@ -11805,17 +12856,22 @@ static int __init net_dev_init(void)
NULL, dev_cpu_dead);
WARN_ON(rc < 0);
rc = 0;
+
+ /* avoid static key IPIs to isolated CPUs */
+ if (housekeeping_enabled(HK_TYPE_MISC))
+ net_enable_timestamp();
out:
if (rc < 0) {
for_each_possible_cpu(i) {
struct page_pool *pp_ptr;
- pp_ptr = per_cpu(system_page_pool, i);
+ pp_ptr = per_cpu(system_page_pool.pool, i);
if (!pp_ptr)
continue;
+ xdp_unreg_page_pool(pp_ptr);
page_pool_destroy(pp_ptr);
- per_cpu(system_page_pool, i) = NULL;
+ per_cpu(system_page_pool.pool, i) = NULL;
}
}
diff --git a/net/core/dev.h b/net/core/dev.h
index 2bcaf8eee50c..e93f36b7ddf3 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -2,21 +2,22 @@
#ifndef _NET_CORE_DEV_H
#define _NET_CORE_DEV_H
+#include <linux/cleanup.h>
#include <linux/types.h>
#include <linux/rwsem.h>
+#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
struct net;
-struct net_device;
-struct netdev_bpf;
-struct netdev_phys_item_id;
struct netlink_ext_ack;
struct cpumask;
/* Random bits of netdevice that don't need to be exposed */
#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */
struct sd_flow_limit {
- u64 count;
- unsigned int num_buckets;
+ struct rcu_head rcu;
+ unsigned int count;
+ u8 log_buckets;
unsigned int history_head;
u16 history[FLOW_LIMIT_HISTORY];
u8 buckets[];
@@ -24,6 +25,38 @@ struct sd_flow_limit {
extern int netdev_flow_limit_table_len;
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
+struct net_device *dev_get_by_napi_id(unsigned int napi_id);
+
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex);
+struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index);
+
+DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T));
+
+#define for_each_netdev_lock_scoped(net, var_name, ifindex) \
+ for (struct net_device *var_name __free(netdev_unlock) = NULL; \
+ (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \
+ ifindex++)
+
+struct net_device *
+netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex);
+struct net_device *
+netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
+ unsigned long *index);
+
+DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *,
+ if (_T) netdev_unlock_ops_compat(_T));
+
+#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \
+ for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \
+ (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \
+ &ifindex)); \
+ ifindex++)
+
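
The DEFINE_FREE()/__free() based loops above lean on the compiler's cleanup attribute; a minimal stand-alone sketch of that idiom, with a pthread mutex standing in for netdev_lock()/netdev_unlock():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

static void unlock_cleanup(pthread_mutex_t **m)
{
        if (*m)
                pthread_mutex_unlock(*m);       /* runs when 'guard' leaves scope */
}

int main(void)
{
        {
                pthread_mutex_t *guard __attribute__((cleanup(unlock_cleanup))) = &demo_lock;

                pthread_mutex_lock(guard);
                puts("locked; unlock happens automatically at the closing brace");
        }
        return 0;
}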
#ifdef CONFIG_PROC_FS
int __init dev_proc_init(void);
#else
@@ -37,8 +70,17 @@ void dev_addr_flush(struct net_device *dev);
int dev_addr_init(struct net_device *dev);
void dev_addr_check(struct net_device *dev);
+#if IS_ENABLED(CONFIG_NET_SHAPER)
+void net_shaper_flush_netdev(struct net_device *dev);
+void net_shaper_set_real_num_tx_queues(struct net_device *dev,
+ unsigned int txq);
+#else
+static inline void net_shaper_flush_netdev(struct net_device *dev) {}
+static inline void net_shaper_set_real_num_tx_queues(struct net_device *dev,
+ unsigned int txq) {}
+#endif
+
/* sysctls not referred to from outside net/core/ */
-extern unsigned int sysctl_skb_defer_max;
extern int netdev_unregister_timeout_secs;
extern int weight_p;
extern int dev_weight_rx_bias;
@@ -60,6 +102,7 @@ struct netdev_name_node {
};
int netdev_get_name(struct net *net, char *name, int ifindex);
+int netif_change_name(struct net_device *dev, const char *newname);
int dev_change_name(struct net_device *dev, const char *newname);
#define netdev_for_each_altname(dev, namenode) \
@@ -73,24 +116,28 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);
int dev_validate_mtu(struct net_device *dev, int mtu,
struct netlink_ext_ack *extack);
-int dev_set_mtu_ext(struct net_device *dev, int mtu,
- struct netlink_ext_ack *extack);
+int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack);
int dev_get_phys_port_id(struct net_device *dev,
struct netdev_phys_item_id *ppid);
int dev_get_phys_port_name(struct net_device *dev,
char *name, size_t len);
+int netif_change_proto_down(struct net_device *dev, bool proto_down);
int dev_change_proto_down(struct net_device *dev, bool proto_down);
-void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
- u32 value);
+void netdev_change_proto_down_reason_locked(struct net_device *dev,
+ unsigned long mask, u32 value);
typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
int fd, int expected_fd, u32 flags);
+int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len);
int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len);
+void netif_set_group(struct net_device *dev, int new_group);
void dev_set_group(struct net_device *dev, int new_group);
+int netif_change_carrier(struct net_device *dev, bool new_carrier);
int dev_change_carrier(struct net_device *dev, bool new_carrier);
void __dev_set_rx_mode(struct net_device *dev);
@@ -102,6 +149,20 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
void unregister_netdevice_many_notify(struct list_head *head,
u32 portid, const struct nlmsghdr *nlh);
+static inline void netif_set_up(struct net_device *dev, bool value)
+{
+ if (value)
+ dev->flags |= IFF_UP;
+ else
+ dev->flags &= ~IFF_UP;
+
+ if (!netdev_need_ops_lock(dev))
+ netdev_lock(dev);
+ dev->up = value;
+ if (!netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+}
+
static inline void netif_set_gso_max_size(struct net_device *dev,
unsigned int size)
{
@@ -141,6 +202,119 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
WRITE_ONCE(dev->gro_ipv4_max_size, size);
}
+/**
+ * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs
+ * @n: napi struct to get the defer_hard_irqs field from
+ *
+ * Return: the per-NAPI value of the defer_hard_irqs field.
+ */
+static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n)
+{
+ return READ_ONCE(n->defer_hard_irqs);
+}
+
+/**
+ * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi
+ * @n: napi_struct to set the defer_hard_irqs field
+ * @defer: the value the field should be set to
+ */
+static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
+{
+ WRITE_ONCE(n->defer_hard_irqs, defer);
+}
+
+/**
+ * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev
+ * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set
+ * @defer: the defer_hard_irqs value to set
+ */
+static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
+ u32 defer)
+{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
+ struct napi_struct *napi;
+ int i;
+
+ WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
+ list_for_each_entry(napi, &netdev->napi_list, dev_list)
+ napi_set_defer_hard_irqs(napi, defer);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].defer_hard_irqs = defer;
+}
+
+/**
+ * napi_get_gro_flush_timeout - get the gro_flush_timeout
+ * @n: napi struct to get the gro_flush_timeout from
+ *
+ * Return: the per-NAPI value of the gro_flush_timeout field.
+ */
+static inline unsigned long
+napi_get_gro_flush_timeout(const struct napi_struct *n)
+{
+ return READ_ONCE(n->gro_flush_timeout);
+}
+
+/**
+ * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi
+ * @n: napi struct to set the gro_flush_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout
+ */
+static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
+ unsigned long timeout)
+{
+ WRITE_ONCE(n->gro_flush_timeout, timeout);
+}
+
+/**
+ * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs
+ * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set
+ * @timeout: the timeout value to set
+ */
+static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
+ unsigned long timeout)
+{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
+ struct napi_struct *napi;
+ int i;
+
+ WRITE_ONCE(netdev->gro_flush_timeout, timeout);
+ list_for_each_entry(napi, &netdev->napi_list, dev_list)
+ napi_set_gro_flush_timeout(napi, timeout);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].gro_flush_timeout = timeout;
+}
+
+/**
+ * napi_get_irq_suspend_timeout - get the irq_suspend_timeout
+ * @n: napi struct to get the irq_suspend_timeout from
+ *
+ * Return: the per-NAPI value of the irq_suspend_timeout field.
+ */
+static inline unsigned long
+napi_get_irq_suspend_timeout(const struct napi_struct *n)
+{
+ return READ_ONCE(n->irq_suspend_timeout);
+}
+
+/**
+ * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi
+ * @n: napi struct to set the irq_suspend_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout
+ */
+static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
+ unsigned long timeout)
+{
+ WRITE_ONCE(n->irq_suspend_timeout, timeout);
+}
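
These accessors are thin READ_ONCE()/WRITE_ONCE() wrappers so the datapath and configuration paths can exchange per-NAPI tunables without extra locking. A user-space sketch of the same pattern, with relaxed C11 atomics standing in for READ_ONCE()/WRITE_ONCE() and a made-up struct:

#include <stdatomic.h>
#include <stdio.h>

struct napi_stub {
        _Atomic unsigned long gro_flush_timeout;        /* illustrative fields only */
        _Atomic unsigned int defer_hard_irqs;
};

static unsigned long napi_stub_get_timeout(struct napi_stub *n)
{
        return atomic_load_explicit(&n->gro_flush_timeout, memory_order_relaxed);
}

static void napi_stub_set_timeout(struct napi_stub *n, unsigned long t)
{
        atomic_store_explicit(&n->gro_flush_timeout, t, memory_order_relaxed);
}

int main(void)
{
        struct napi_stub n = { 0 };

        napi_stub_set_timeout(&n, 20000);  /* e.g. the 20000 ns default set by
                                              netdev_sw_irq_coalesce_default_on() */
        printf("gro_flush_timeout=%lu\n", napi_stub_get_timeout(&n));
        return 0;
}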
+
int rps_cpumask_housekeeping(struct cpumask *mask);
#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
@@ -149,5 +323,60 @@ void xdp_do_check_flushed(struct napi_struct *napi);
static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
#endif
-struct napi_struct *napi_by_id(unsigned int napi_id);
+/* Best effort check that NAPI is not idle (can't be scheduled to run) */
+static inline void napi_assert_will_not_race(const struct napi_struct *napi)
+{
+ /* uninitialized instance, can't race */
+ if (!napi->poll_list.next)
+ return;
+
+ /* SCHED bit is set on disabled instances */
+ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
+ WARN_ON(READ_ONCE(napi->list_owner) != -1);
+}
+
+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
+
+#define XMIT_RECURSION_LIMIT 8
+
+#ifndef CONFIG_PREEMPT_RT
+static inline bool dev_xmit_recursion(void)
+{
+ return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
+ XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+ __this_cpu_inc(softnet_data.xmit.recursion);
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+ __this_cpu_dec(softnet_data.xmit.recursion);
+}
+#else
+static inline bool dev_xmit_recursion(void)
+{
+ return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+ current->net_xmit.recursion++;
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+ current->net_xmit.recursion--;
+}
+#endif
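
A stand-alone sketch of the recursion guard above; thread-local storage stands in for the per-CPU (or, on PREEMPT_RT, per-task) counter:

#include <stdbool.h>
#include <stdio.h>

#define XMIT_RECURSION_LIMIT 8

static _Thread_local unsigned int xmit_recursion;

static bool xmit_recursion_exceeded(void)
{
        return xmit_recursion > XMIT_RECURSION_LIMIT;
}

static void fake_xmit(int depth)
{
        if (xmit_recursion_exceeded()) {
                printf("depth %d: drop, recursion limit hit\n", depth);
                return;
        }
        xmit_recursion++;
        fake_xmit(depth + 1);   /* e.g. a tunnel device transmitting on itself */
        xmit_recursion--;
}

int main(void)
{
        fake_xmit(0);
        return 0;
}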
+
+int dev_set_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg,
+ struct netlink_ext_ack *extack);
+int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg);
+int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg);
+
#endif
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index baa63dee2829..90716bd736f3 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -242,9 +242,9 @@ static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list,
__hw_addr_del_entry(from_list, ha, false, false);
}
-static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
- struct netdev_hw_addr_list *from_list,
- int addr_len)
+int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
{
int err = 0;
struct netdev_hw_addr *ha, *tmp;
@@ -260,9 +260,10 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
}
return err;
}
+EXPORT_SYMBOL(__hw_addr_sync_multiple);
/* This function only works where there is a strict 1-1 relationship
- * between source and destionation of they synch. If you ever need to
+ * between source and destination of the sync. If you ever need to
* sync addresses to more then 1 destination, you need to use
* __hw_addr_sync_multiple().
*/
@@ -299,8 +300,8 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
EXPORT_SYMBOL(__hw_addr_unsync);
/**
- * __hw_addr_sync_dev - Synchonize device's multicast list
- * @list: address list to syncronize
+ * __hw_addr_sync_dev - Synchronize device's multicast list
+ * @list: address list to synchronize
* @dev: device to sync
* @sync: function to call if address should be added
* @unsync: function to call if address should be removed
diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c
index 4dbd0dc6aea2..8e1dba825e94 100644
--- a/net/core/dev_addr_lists_test.c
+++ b/net/core/dev_addr_lists_test.c
@@ -49,7 +49,6 @@ static int dev_addr_test_init(struct kunit *test)
KUNIT_FAIL(test, "Can't register netdev %d", err);
}
- rtnl_lock();
return 0;
}
@@ -57,7 +56,6 @@ static void dev_addr_test_exit(struct kunit *test)
{
struct net_device *netdev = test->priv;
- rtnl_unlock();
unregister_netdev(netdev);
free_netdev(netdev);
}
@@ -67,6 +65,7 @@ static void dev_addr_test_basic(struct kunit *test)
struct net_device *netdev = test->priv;
u8 addr[ETH_ALEN];
+ rtnl_lock();
KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr);
memset(addr, 2, sizeof(addr));
@@ -76,6 +75,7 @@ static void dev_addr_test_basic(struct kunit *test)
memset(addr, 3, sizeof(addr));
dev_addr_set(netdev, addr);
KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr));
+ rtnl_unlock();
}
static void dev_addr_test_sync_one(struct kunit *test)
@@ -86,6 +86,7 @@ static void dev_addr_test_sync_one(struct kunit *test)
datp = netdev_priv(netdev);
+ rtnl_lock();
memset(addr, 1, sizeof(addr));
eth_hw_addr_set(netdev, addr);
@@ -103,6 +104,7 @@ static void dev_addr_test_sync_one(struct kunit *test)
* considered synced and we overwrite in place.
*/
KUNIT_EXPECT_EQ(test, 0, datp->addr_seen);
+ rtnl_unlock();
}
static void dev_addr_test_add_del(struct kunit *test)
@@ -114,6 +116,7 @@ static void dev_addr_test_add_del(struct kunit *test)
datp = netdev_priv(netdev);
+ rtnl_lock();
for (i = 1; i < 4; i++) {
memset(addr, i, sizeof(addr));
KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
@@ -143,6 +146,7 @@ static void dev_addr_test_add_del(struct kunit *test)
__hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
dev_addr_test_unsync);
KUNIT_EXPECT_EQ(test, 1, datp->addr_seen);
+ rtnl_unlock();
}
static void dev_addr_test_del_main(struct kunit *test)
@@ -150,6 +154,7 @@ static void dev_addr_test_del_main(struct kunit *test)
struct net_device *netdev = test->priv;
u8 addr[ETH_ALEN];
+ rtnl_lock();
memset(addr, 1, sizeof(addr));
eth_hw_addr_set(netdev, addr);
@@ -161,6 +166,7 @@ static void dev_addr_test_del_main(struct kunit *test)
NETDEV_HW_ADDR_T_LAN));
KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr,
NETDEV_HW_ADDR_T_LAN));
+ rtnl_unlock();
}
static void dev_addr_test_add_set(struct kunit *test)
@@ -172,6 +178,7 @@ static void dev_addr_test_add_set(struct kunit *test)
datp = netdev_priv(netdev);
+ rtnl_lock();
/* There is no external API like dev_addr_add_excl(),
* so shuffle the tree a little bit and exploit aliasing.
*/
@@ -191,6 +198,7 @@ static void dev_addr_test_add_set(struct kunit *test)
__hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
dev_addr_test_unsync);
KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen);
+ rtnl_unlock();
}
static void dev_addr_test_add_excl(struct kunit *test)
@@ -199,6 +207,7 @@ static void dev_addr_test_add_excl(struct kunit *test)
u8 addr[ETH_ALEN];
int i;
+ rtnl_lock();
for (i = 0; i < 10; i++) {
memset(addr, i, sizeof(addr));
KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr));
@@ -213,6 +222,7 @@ static void dev_addr_test_add_excl(struct kunit *test)
memset(addr, i, sizeof(addr));
KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr));
}
+ rtnl_unlock();
}
static struct kunit_case dev_addr_test_cases[] = {
diff --git a/net/core/dev_api.c b/net/core/dev_api.c
new file mode 100644
index 000000000000..1bf0153195f2
--- /dev/null
+++ b/net/core/dev_api.c
@@ -0,0 +1,369 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
+
+#include "dev.h"
+
+/**
+ * dev_change_name() - change name of a device
+ * @dev: device
+ * @newname: name (or format string) must be at least IFNAMSIZ
+ *
+ * Change name of a device, can pass format strings "eth%d".
+ * for wildcarding.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_name(struct net_device *dev, const char *newname)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_name(dev, newname);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
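
Nearly every wrapper in dev_api.c follows this shape: take the per-device instance lock, call the locked netif_*() implementation, drop the lock. A stand-alone sketch with a pthread mutex standing in for the netdev ops lock and a hypothetical locked helper:

#include <pthread.h>
#include <stdio.h>

struct dev_stub {
        pthread_mutex_t lock;
        char name[16];
};

static int locked_change_name(struct dev_stub *dev, const char *newname)
{
        /* caller must hold dev->lock, mirroring netif_change_name() */
        snprintf(dev->name, sizeof(dev->name), "%s", newname);
        return 0;
}

static int change_name(struct dev_stub *dev, const char *newname)
{
        int ret;

        pthread_mutex_lock(&dev->lock);         /* netdev_lock_ops(dev) */
        ret = locked_change_name(dev, newname);
        pthread_mutex_unlock(&dev->lock);       /* netdev_unlock_ops(dev) */
        return ret;
}

int main(void)
{
        struct dev_stub dev = { .lock = PTHREAD_MUTEX_INITIALIZER, .name = "eth0" };

        change_name(&dev, "lan0");
        printf("renamed to %s\n", dev.name);
        return 0;
}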
+
+/**
+ * dev_set_alias() - change ifalias of a device
+ * @dev: device
+ * @alias: name up to IFALIASZ
+ * @len: limit of bytes to copy from info
+ *
+ * Set ifalias for a device.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_alias(dev, alias, len);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_alias);
+
+/**
+ * dev_change_flags() - change device settings
+ * @dev: device
+ * @flags: device state flags
+ * @extack: netlink extended ack
+ *
+ * Change settings on a device based on the given state flags. The flags are
+ * in the userspace exported format.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_flags(dev, flags, extack);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_change_flags);
+
+/**
+ * dev_set_group() - change group this device belongs to
+ * @dev: device
+ * @new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+ netdev_lock_ops(dev);
+ netif_set_group(dev, new_group);
+ netdev_unlock_ops(dev);
+}
+
+int dev_set_mac_address_user(struct net_device *dev,
+ struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ down_write(&dev_addr_sem);
+ netdev_lock_ops(dev);
+ ret = netif_set_mac_address(dev, ss, extack);
+ netdev_unlock_ops(dev);
+ up_write(&dev_addr_sem);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address_user);
+
+/**
+ * dev_change_net_namespace() - move device to a different network namespace
+ * @dev: device
+ * @net: network namespace
+ * @pat: If not NULL name pattern to try if the current device name
+ * is already taken in the destination network namespace.
+ *
+ * This function shuts down a device interface and moves it
+ * to a new network namespace. On success 0 is returned, on
+ * a failure a negative errno code is returned.
+ *
+ * Callers must hold the rtnl semaphore.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_net_namespace(struct net_device *dev, struct net *net,
+ const char *pat)
+{
+ return __dev_change_net_namespace(dev, net, pat, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(dev_change_net_namespace);
+
+/**
+ * dev_change_carrier() - change device carrier
+ * @dev: device
+ * @new_carrier: new value
+ *
+ * Change device carrier
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_carrier(struct net_device *dev, bool new_carrier)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_carrier(dev, new_carrier);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_change_tx_queue_len() - change TX queue length of a netdevice
+ * @dev: device
+ * @new_len: new tx queue length
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_tx_queue_len(dev, new_len);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_change_proto_down() - set carrier according to proto_down
+ * @dev: device
+ * @proto_down: new value
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_proto_down(struct net_device *dev, bool proto_down)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_proto_down(dev, proto_down);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_open() - prepare an interface for use
+ * @dev: device to open
+ * @extack: netlink extended ack
+ *
+ * Takes a device from down to up state. The device's private open
+ * function is invoked and then the multicast lists are loaded. Finally
+ * the device is moved into the up state and a %NETDEV_UP message is
+ * sent to the netdev notifier chain.
+ *
+ * Calling this function on an active interface is a nop. On a failure
+ * a negative errno code is returned.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_open(dev, extack);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_open);
+
+/**
+ * dev_close() - shutdown an interface
+ * @dev: device to shutdown
+ *
+ * This function moves an active device into down state. A
+ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
+ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
+ * chain.
+ */
+void dev_close(struct net_device *dev)
+{
+ netdev_lock_ops(dev);
+ netif_close(dev);
+ netdev_unlock_ops(dev);
+}
+EXPORT_SYMBOL(dev_close);
+
+int dev_eth_ioctl(struct net_device *dev,
+ struct ifreq *ifr, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int ret = -ENODEV;
+
+ if (!ops->ndo_eth_ioctl)
+ return -EOPNOTSUPP;
+
+ netdev_lock_ops(dev);
+ if (netif_device_present(dev))
+ ret = ops->ndo_eth_ioctl(dev, ifr, cmd);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_eth_ioctl);
+
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_mtu(dev, new_mtu);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mtu);
+
+/**
+ * dev_disable_lro() - disable Large Receive Offload on a device
+ * @dev: device
+ *
+ * Disable Large Receive Offload (LRO) on a net device. Must be
+ * called under RTNL. This is needed if received packets may be
+ * forwarded to another interface.
+ */
+void dev_disable_lro(struct net_device *dev)
+{
+ netdev_lock_ops(dev);
+ netif_disable_lro(dev);
+ netdev_unlock_ops(dev);
+}
+EXPORT_SYMBOL(dev_disable_lro);
+
+/**
+ * dev_set_promiscuity() - update promiscuity count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove promiscuity from a device. While the count in the device
+ * remains above zero the interface remains promiscuous. Once it hits zero
+ * the device reverts back to normal filtering operation. A negative inc
+ * value is used to drop promiscuity on the device.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_set_promiscuity(struct net_device *dev, int inc)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_promiscuity(dev, inc);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_promiscuity);
+
+/**
+ * dev_set_allmulti() - update allmulti count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove reception of all multicast frames to a device. While the
+ * count in the device remains above zero the interface remains listening
+ * to all multicast frames. Once it hits zero the device reverts back to normal
+ * filtering operation. A negative @inc value is used to drop the counter
+ * when releasing a resource needing all multicasts.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+
+int dev_set_allmulti(struct net_device *dev, int inc)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_allmulti(dev, inc, true);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_allmulti);
+
+/**
+ * dev_set_mac_address() - change Media Access Control Address
+ * @dev: device
+ * @ss: new address
+ * @extack: netlink extended ack
+ *
+ * Change the hardware (MAC) address of the device
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_mac_address(dev, ss, extack);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address);
+
+int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_xdp_propagate(dev, bpf);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_xdp_propagate);
+
+/**
+ * netdev_state_change() - device changes state
+ * @dev: device to cause notification
+ *
+ * Called to indicate a device has changed state. This function calls
+ * the notifier chains for netdev_chain and sends a NEWLINK message
+ * to the routing socket.
+ */
+void netdev_state_change(struct net_device *dev)
+{
+ netdev_lock_ops(dev);
+ netif_state_change(dev);
+ netdev_unlock_ops(dev);
+}
+EXPORT_SYMBOL(netdev_state_change);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 9a66cf5015f2..616479e71466 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -6,9 +6,11 @@
#include <linux/rtnetlink.h>
#include <linux/net_tstamp.h>
#include <linux/phylib_stubs.h>
+#include <linux/ptp_clock_kernel.h>
#include <linux/wireless.h>
#include <linux/if_bridge.h>
#include <net/dsa_stubs.h>
+#include <net/netdev_lock.h>
#include <net/wext.h>
#include "dev.h"
@@ -64,7 +66,7 @@ int dev_ifconf(struct net *net, struct ifconf __user *uifc)
}
/* Loop over the interfaces, and write an info block for each. */
- rtnl_lock();
+ rtnl_net_lock(net);
for_each_netdev(net, dev) {
if (!pos)
done = inet_gifconf(dev, NULL, 0, size);
@@ -72,12 +74,12 @@ int dev_ifconf(struct net *net, struct ifconf __user *uifc)
done = inet_gifconf(dev, pos + total,
len - total, size);
if (done < 0) {
- rtnl_unlock();
+ rtnl_net_unlock(net);
return -EFAULT;
}
total += done;
}
- rtnl_unlock();
+ rtnl_net_unlock(net);
return put_user(total, &uifc->ifc_len);
}
@@ -109,7 +111,7 @@ static int dev_getifmap(struct net_device *dev, struct ifreq *ifr)
return 0;
}
-static int dev_setifmap(struct net_device *dev, struct ifreq *ifr)
+static int netif_setifmap(struct net_device *dev, struct ifreq *ifr)
{
struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map;
@@ -184,7 +186,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
return err;
}
-static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
+int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
{
enum hwtstamp_tx_types tx_type;
enum hwtstamp_rx_filters rx_filter;
@@ -239,39 +241,38 @@ static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
return 0;
}
-static int dev_eth_ioctl(struct net_device *dev,
- struct ifreq *ifr, unsigned int cmd)
-{
- const struct net_device_ops *ops = dev->netdev_ops;
-
- if (!ops->ndo_eth_ioctl)
- return -EOPNOTSUPP;
-
- if (!netif_device_present(dev))
- return -ENODEV;
-
- return ops->ndo_eth_ioctl(dev, ifr, cmd);
-}
-
/**
* dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC
* or of attached phylib PHY
* @dev: Network device
* @cfg: Timestamping configuration structure
*
- * Helper for enforcing a common policy that phylib timestamping, if available,
- * should take precedence in front of hardware timestamping provided by the
- * netdev.
+ * Helper for calling the default hardware timestamping provider.
*
* Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and
* there only exists a phydev->mii_ts->hwtstamp() method. So this will return
* -EOPNOTSUPP for phylib for now, which is still more accurate than letting
* the netdev handle the GET request.
*/
-static int dev_get_hwtstamp_phylib(struct net_device *dev,
- struct kernel_hwtstamp_config *cfg)
+int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg)
{
- if (phy_has_hwtstamp(dev->phydev))
+ struct hwtstamp_provider *hwprov;
+
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ cfg->qualifier = hwprov->desc.qualifier;
+ if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB &&
+ hwprov->phydev)
+ return phy_hwtstamp_get(hwprov->phydev, cfg);
+
+ if (hwprov->source == HWTSTAMP_SOURCE_NETDEV)
+ return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
+
+ return -EOPNOTSUPP;
+ }
+
+ if (phy_is_default_hwtstamp(dev->phydev))
return phy_hwtstamp_get(dev->phydev, cfg);
return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
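
The added lookup picks the timestamping backend in priority order: an explicitly selected provider first, then a default phylib PHY, then the netdev's own ndo. A compact user-space sketch of that dispatch with made-up enums and callbacks:

#include <stdio.h>

enum ts_source { TS_NONE, TS_PHYLIB, TS_NETDEV };

struct dev_stub {
        enum ts_source selected;        /* rtnl_dereference(dev->hwprov) analogue */
        int has_default_phy;            /* phy_is_default_hwtstamp() analogue */
};

static const char *pick_hwtstamp_backend(const struct dev_stub *dev)
{
        if (dev->selected == TS_PHYLIB)
                return "selected phylib provider";
        if (dev->selected == TS_NETDEV)
                return "selected netdev provider";
        if (dev->has_default_phy)
                return "default phylib PHY";
        return "netdev ndo_hwtstamp_get";
}

int main(void)
{
        struct dev_stub a = { .selected = TS_NONE, .has_default_phy = 1 };
        struct dev_stub b = { .selected = TS_NETDEV };

        printf("%s\n", pick_hwtstamp_backend(&a));      /* default phylib PHY */
        printf("%s\n", pick_hwtstamp_backend(&b));      /* selected netdev provider */
        return 0;
}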
@@ -291,7 +292,9 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
return -ENODEV;
kernel_cfg.ifr = ifr;
+ netdev_lock_ops(dev);
err = dev_get_hwtstamp_phylib(dev, &kernel_cfg);
+ netdev_unlock_ops(dev);
if (err)
return err;
@@ -319,28 +322,48 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
* should take precedence in front of hardware timestamping provided by the
* netdev. If the netdev driver needs to perform specific actions even for PHY
* timestamping to work properly (a switch port must trap the timestamped
- * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in
- * dev->priv_flags.
+ * frames and not forward them), it must set dev->see_all_hwtstamp_requests.
*/
int dev_set_hwtstamp_phylib(struct net_device *dev,
struct kernel_hwtstamp_config *cfg,
struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
- bool phy_ts = phy_has_hwtstamp(dev->phydev);
struct kernel_hwtstamp_config old_cfg = {};
+ struct hwtstamp_provider *hwprov;
+ struct phy_device *phydev;
bool changed = false;
+ bool phy_ts;
int err;
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB &&
+ hwprov->phydev) {
+ phy_ts = true;
+ phydev = hwprov->phydev;
+ } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) {
+ phy_ts = false;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ cfg->qualifier = hwprov->desc.qualifier;
+ } else {
+ phy_ts = phy_is_default_hwtstamp(dev->phydev);
+ if (phy_ts)
+ phydev = dev->phydev;
+ }
+
cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV;
- if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) {
+ if (phy_ts && dev->see_all_hwtstamp_requests) {
err = ops->ndo_hwtstamp_get(dev, &old_cfg);
if (err)
return err;
}
- if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) {
+ if (!phy_ts || dev->see_all_hwtstamp_requests) {
err = ops->ndo_hwtstamp_set(dev, cfg, extack);
if (err) {
if (extack->_msg)
@@ -349,11 +372,11 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
}
}
- if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS))
+ if (phy_ts && dev->see_all_hwtstamp_requests)
changed = kernel_hwtstamp_config_changed(&old_cfg, cfg);
if (phy_ts) {
- err = phy_hwtstamp_set(dev->phydev, cfg, extack);
+ err = phy_hwtstamp_set(phydev, cfg, extack);
if (err) {
if (changed)
ops->ndo_hwtstamp_set(dev, &old_cfg, NULL);
@@ -363,7 +386,6 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
return 0;
}
-EXPORT_SYMBOL_GPL(dev_set_hwtstamp_phylib);
static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
{
@@ -396,7 +418,9 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
if (!netif_device_present(dev))
return -ENODEV;
+ netdev_lock_ops(dev);
err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack);
+ netdev_unlock_ops(dev);
if (err)
return err;
@@ -471,10 +495,14 @@ static int dev_siocbond(struct net_device *dev,
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_siocbond) {
+ int ret = -ENODEV;
+
+ netdev_lock_ops(dev);
if (netif_device_present(dev))
- return ops->ndo_siocbond(dev, ifr, cmd);
- else
- return -ENODEV;
+ ret = ops->ndo_siocbond(dev, ifr, cmd);
+ netdev_unlock_ops(dev);
+
+ return ret;
}
return -EOPNOTSUPP;
@@ -486,10 +514,14 @@ static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_siocdevprivate) {
+ int ret = -ENODEV;
+
+ netdev_lock_ops(dev);
if (netif_device_present(dev))
- return ops->ndo_siocdevprivate(dev, ifr, data, cmd);
- else
- return -ENODEV;
+ ret = ops->ndo_siocdevprivate(dev, ifr, data, cmd);
+ netdev_unlock_ops(dev);
+
+ return ret;
}
return -EOPNOTSUPP;
@@ -500,17 +532,21 @@ static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs)
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_siocwandev) {
+ int ret = -ENODEV;
+
+ netdev_lock_ops(dev);
if (netif_device_present(dev))
- return ops->ndo_siocwandev(dev, ifs);
- else
- return -ENODEV;
+ ret = ops->ndo_siocwandev(dev, ifs);
+ netdev_unlock_ops(dev);
+
+ return ret;
}
return -EOPNOTSUPP;
}
/*
- * Perform the SIOCxIFxxx calls, inside rtnl_lock()
+ * Perform the SIOCxIFxxx calls, inside rtnl_net_lock()
*/
static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
unsigned int cmd)
@@ -518,7 +554,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
int err;
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
const struct net_device_ops *ops;
- netdevice_tracker dev_tracker;
if (!dev)
return -ENODEV;
@@ -537,9 +572,11 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return dev_set_mtu(dev, ifr->ifr_mtu);
case SIOCSIFHWADDR:
- if (dev->addr_len > sizeof(struct sockaddr))
+ if (dev->addr_len > sizeof(ifr->ifr_hwaddr))
return -EINVAL;
- return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL);
+ return dev_set_mac_address_user(dev,
+ (struct sockaddr_storage *)&ifr->ifr_hwaddr,
+ NULL);
case SIOCSIFHWBROADCAST:
if (ifr->ifr_hwaddr.sa_family != dev->type)
@@ -547,11 +584,16 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
min(sizeof(ifr->ifr_hwaddr.sa_data_min),
(size_t)dev->addr_len));
+ netdev_lock_ops(dev);
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ netdev_unlock_ops(dev);
return 0;
case SIOCSIFMAP:
- return dev_setifmap(dev, ifr);
+ netdev_lock_ops(dev);
+ err = netif_setifmap(dev, ifr);
+ netdev_unlock_ops(dev);
+ return err;
case SIOCADDMULTI:
if (!ops->ndo_set_rx_mode ||
@@ -559,7 +601,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_lock_ops(dev);
+ err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_unlock_ops(dev);
+ return err;
case SIOCDELMULTI:
if (!ops->ndo_set_rx_mode ||
@@ -567,7 +612,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_lock_ops(dev);
+ err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_unlock_ops(dev);
+ return err;
case SIOCSIFTXQLEN:
if (ifr->ifr_qlen < 0)
@@ -581,19 +629,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
case SIOCWANDEV:
return dev_siocwandev(dev, &ifr->ifr_settings);
- case SIOCBRADDIF:
- case SIOCBRDELIF:
- if (!netif_device_present(dev))
- return -ENODEV;
- if (!netif_is_bridge_master(dev))
- return -EOPNOTSUPP;
- netdev_hold(dev, &dev_tracker, GFP_KERNEL);
- rtnl_unlock();
- err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL);
- netdev_put(dev, &dev_tracker);
- rtnl_lock();
- return err;
-
case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15:
return dev_siocdevprivate(dev, ifr, data, cmd);
@@ -737,9 +772,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
dev_load(net, ifr->ifr_name);
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- rtnl_lock();
+
+ rtnl_net_lock(net);
ret = dev_ifsioc(net, ifr, data, cmd);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
if (colon)
*colon = ':';
return ret;
@@ -774,8 +811,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
case SIOCBONDRELEASE:
case SIOCBONDSETHWADDR:
case SIOCBONDCHANGEACTIVE:
- case SIOCBRADDIF:
- case SIOCBRDELIF:
case SIOCSHWTSTAMP:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -783,9 +818,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
case SIOCBONDSLAVEINFOQUERY:
case SIOCBONDINFOQUERY:
dev_load(net, ifr->ifr_name);
- rtnl_lock();
+
+ rtnl_net_lock(net);
ret = dev_ifsioc(net, ifr, data, cmd);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
if (need_copyout)
*need_copyout = false;
return ret;
@@ -808,9 +845,10 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
(cmd >= SIOCDEVPRIVATE &&
cmd <= SIOCDEVPRIVATE + 15)) {
dev_load(net, ifr->ifr_name);
- rtnl_lock();
+
+ rtnl_net_lock(net);
ret = dev_ifsioc(net, ifr, data, cmd);
- rtnl_unlock();
+ rtnl_net_unlock(net);
return ret;
}
return -ENOTTY;
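
For illustration, a minimal user-space sketch of how one of the SIOCxIFxxx requests handled by dev_ifsioc() above (here SIOCSIFMTU) reaches the kernel; after this change it is served under rtnl_net_lock() rather than the global rtnl_lock(). The interface name "eth0" and the MTU value are placeholders, and the call requires CAP_NET_ADMIN.

/* sketch: set an interface MTU through the legacy ioctl path */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
    struct ifreq ifr;
    int fd = socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0)
        return 1;

    memset(&ifr, 0, sizeof(ifr));
    strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);   /* placeholder ifname */
    ifr.ifr_mtu = 1400;                            /* placeholder MTU */

    /* lands in dev_ioctl() -> dev_ifsioc() -> dev_set_mtu() */
    if (ioctl(fd, SIOCSIFMTU, &ifr) < 0)
        perror("SIOCSIFMTU");

    close(fd);
    return 0;
}
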
diff --git a/net/core/devmem.c b/net/core/devmem.c
new file mode 100644
index 000000000000..b3a62ca0df65
--- /dev/null
+++ b/net/core/devmem.c
@@ -0,0 +1,495 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Devmem TCP
+ *
+ * Authors: Mina Almasry <almasrymina@google.com>
+ * Willem de Bruijn <willemdebruijn.kernel@gmail.com>
+ * Kaiyuan Zhang <kaiyuanz@google.com>
+ */
+
+#include <linux/dma-buf.h>
+#include <linux/genalloc.h>
+#include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/sock.h>
+#include <trace/events/page_pool.h>
+
+#include "devmem.h"
+#include "mp_dmabuf_devmem.h"
+#include "page_pool_priv.h"
+
+/* Device memory support */
+
+static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
+
+static const struct memory_provider_ops dmabuf_devmem_ops;
+
+bool net_is_devmem_iov(struct net_iov *niov)
+{
+ return niov->type == NET_IOV_DMABUF;
+}
+
+static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
+ struct gen_pool_chunk *chunk,
+ void *not_used)
+{
+ struct dmabuf_genpool_chunk_owner *owner = chunk->owner;
+
+ kvfree(owner->area.niovs);
+ kfree(owner);
+}
+
+static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
+{
+ struct dmabuf_genpool_chunk_owner *owner;
+
+ owner = net_devmem_iov_to_chunk_owner(niov);
+ return owner->base_dma_addr +
+ ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
+}
+
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
+{
+ struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w);
+
+ size_t size, avail;
+
+ gen_pool_for_each_chunk(binding->chunk_pool,
+ net_devmem_dmabuf_free_chunk_owner, NULL);
+
+ size = gen_pool_size(binding->chunk_pool);
+ avail = gen_pool_avail(binding->chunk_pool);
+
+ if (!WARN(size != avail, "can't destroy genpool. size=%zu, avail=%zu",
+ size, avail))
+ gen_pool_destroy(binding->chunk_pool);
+
+ dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
+ DMA_FROM_DEVICE);
+ dma_buf_detach(binding->dmabuf, binding->attachment);
+ dma_buf_put(binding->dmabuf);
+ xa_destroy(&binding->bound_rxqs);
+ kvfree(binding->tx_vec);
+ kfree(binding);
+}
+EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free);
+
+struct net_iov *
+net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+ struct dmabuf_genpool_chunk_owner *owner;
+ unsigned long dma_addr;
+ struct net_iov *niov;
+ ssize_t offset;
+ ssize_t index;
+
+ dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE,
+ (void **)&owner);
+ if (!dma_addr)
+ return NULL;
+
+ offset = dma_addr - owner->base_dma_addr;
+ index = offset / PAGE_SIZE;
+ niov = &owner->area.niovs[index];
+
+ niov->pp_magic = 0;
+ niov->pp = NULL;
+ atomic_long_set(&niov->pp_ref_count, 0);
+
+ return niov;
+}
+
+void net_devmem_free_dmabuf(struct net_iov *niov)
+{
+ struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov);
+ unsigned long dma_addr = net_devmem_get_dma_addr(niov);
+
+ if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
+ PAGE_SIZE)))
+ return;
+
+ gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE);
+}
+
+void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+ struct netdev_rx_queue *rxq;
+ unsigned long xa_idx;
+ unsigned int rxq_idx;
+
+ xa_erase(&net_devmem_dmabuf_bindings, binding->id);
+
+ /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the
+ * erase.
+ */
+ synchronize_net();
+
+ if (binding->list.next)
+ list_del(&binding->list);
+
+ xa_for_each(&binding->bound_rxqs, xa_idx, rxq) {
+ const struct pp_memory_provider_params mp_params = {
+ .mp_priv = binding,
+ .mp_ops = &dmabuf_devmem_ops,
+ };
+
+ rxq_idx = get_netdev_rx_queue_index(rxq);
+
+ __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
+ }
+
+ net_devmem_dmabuf_binding_put(binding);
+}
+
+int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+ struct net_devmem_dmabuf_binding *binding,
+ struct netlink_ext_ack *extack)
+{
+ struct pp_memory_provider_params mp_params = {
+ .mp_priv = binding,
+ .mp_ops = &dmabuf_devmem_ops,
+ };
+ struct netdev_rx_queue *rxq;
+ u32 xa_idx;
+ int err;
+
+ err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack);
+ if (err)
+ return err;
+
+ rxq = __netif_get_rx_queue(dev, rxq_idx);
+ err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b,
+ GFP_KERNEL);
+ if (err)
+ goto err_close_rxq;
+
+ return 0;
+
+err_close_rxq:
+ __net_mp_close_rxq(dev, rxq_idx, &mp_params);
+ return err;
+}
+
+struct net_devmem_dmabuf_binding *
+net_devmem_bind_dmabuf(struct net_device *dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
+ struct netlink_ext_ack *extack)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ static u32 id_alloc_next;
+ struct scatterlist *sg;
+ struct dma_buf *dmabuf;
+ unsigned int sg_idx, i;
+ unsigned long virtual;
+ int err;
+
+ dmabuf = dma_buf_get(dmabuf_fd);
+ if (IS_ERR(dmabuf))
+ return ERR_CAST(dmabuf);
+
+ binding = kzalloc_node(sizeof(*binding), GFP_KERNEL,
+ dev_to_node(&dev->dev));
+ if (!binding) {
+ err = -ENOMEM;
+ goto err_put_dmabuf;
+ }
+
+ binding->dev = dev;
+ xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
+
+ refcount_set(&binding->ref, 1);
+
+ mutex_init(&binding->lock);
+
+ binding->dmabuf = dmabuf;
+
+ binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent);
+ if (IS_ERR(binding->attachment)) {
+ err = PTR_ERR(binding->attachment);
+ NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
+ goto err_free_binding;
+ }
+
+ binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
+ direction);
+ if (IS_ERR(binding->sgt)) {
+ err = PTR_ERR(binding->sgt);
+ NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
+ goto err_detach;
+ }
+
+ if (direction == DMA_TO_DEVICE) {
+ binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+ sizeof(struct net_iov *),
+ GFP_KERNEL);
+ if (!binding->tx_vec) {
+ err = -ENOMEM;
+ goto err_unmap;
+ }
+ }
+
+ /* For simplicity we expect to make PAGE_SIZE allocations, but the
+ * binding can be much more flexible than that. We may be able to
+ * allocate MTU sized chunks here. Leave that for future work...
+ */
+ binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
+ dev_to_node(&dev->dev));
+ if (!binding->chunk_pool) {
+ err = -ENOMEM;
+ goto err_tx_vec;
+ }
+
+ virtual = 0;
+ for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) {
+ dma_addr_t dma_addr = sg_dma_address(sg);
+ struct dmabuf_genpool_chunk_owner *owner;
+ size_t len = sg_dma_len(sg);
+ struct net_iov *niov;
+
+ owner = kzalloc_node(sizeof(*owner), GFP_KERNEL,
+ dev_to_node(&dev->dev));
+ if (!owner) {
+ err = -ENOMEM;
+ goto err_free_chunks;
+ }
+
+ owner->area.base_virtual = virtual;
+ owner->base_dma_addr = dma_addr;
+ owner->area.num_niovs = len / PAGE_SIZE;
+ owner->binding = binding;
+
+ err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
+ dma_addr, len, dev_to_node(&dev->dev),
+ owner);
+ if (err) {
+ kfree(owner);
+ err = -EINVAL;
+ goto err_free_chunks;
+ }
+
+ owner->area.niovs = kvmalloc_array(owner->area.num_niovs,
+ sizeof(*owner->area.niovs),
+ GFP_KERNEL);
+ if (!owner->area.niovs) {
+ err = -ENOMEM;
+ goto err_free_chunks;
+ }
+
+ for (i = 0; i < owner->area.num_niovs; i++) {
+ niov = &owner->area.niovs[i];
+ niov->type = NET_IOV_DMABUF;
+ niov->owner = &owner->area;
+ page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
+ net_devmem_get_dma_addr(niov));
+ if (direction == DMA_TO_DEVICE)
+ binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
+ }
+
+ virtual += len;
+ }
+
+ err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
+ binding, xa_limit_32b, &id_alloc_next,
+ GFP_KERNEL);
+ if (err < 0)
+ goto err_free_chunks;
+
+ list_add(&binding->list, &priv->bindings);
+
+ return binding;
+
+err_free_chunks:
+ gen_pool_for_each_chunk(binding->chunk_pool,
+ net_devmem_dmabuf_free_chunk_owner, NULL);
+ gen_pool_destroy(binding->chunk_pool);
+err_tx_vec:
+ kvfree(binding->tx_vec);
+err_unmap:
+ dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
+ DMA_FROM_DEVICE);
+err_detach:
+ dma_buf_detach(dmabuf, binding->attachment);
+err_free_binding:
+ kfree(binding);
+err_put_dmabuf:
+ dma_buf_put(dmabuf);
+ return ERR_PTR(err);
+}
+
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+ struct net_devmem_dmabuf_binding *binding;
+
+ rcu_read_lock();
+ binding = xa_load(&net_devmem_dmabuf_bindings, id);
+ if (binding) {
+ if (!net_devmem_dmabuf_binding_get(binding))
+ binding = NULL;
+ }
+ rcu_read_unlock();
+
+ return binding;
+}
+
+void net_devmem_get_net_iov(struct net_iov *niov)
+{
+ net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov));
+}
+
+void net_devmem_put_net_iov(struct net_iov *niov)
+{
+ net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov));
+}
+
+struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
+ unsigned int dmabuf_id)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct dst_entry *dst = __sk_dst_get(sk);
+ int err = 0;
+
+ binding = net_devmem_lookup_dmabuf(dmabuf_id);
+ if (!binding || !binding->tx_vec) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ /* The dma-addrs in this binding are only reachable by the corresponding
+ * net_device.
+ */
+ if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) {
+ err = -ENODEV;
+ goto out_err;
+ }
+
+ return binding;
+
+out_err:
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
+
+ return ERR_PTR(err);
+}
+
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
+ size_t virt_addr, size_t *off, size_t *size)
+{
+ if (virt_addr >= binding->dmabuf->size)
+ return NULL;
+
+ *off = virt_addr % PAGE_SIZE;
+ *size = PAGE_SIZE - *off;
+
+ return binding->tx_vec[virt_addr / PAGE_SIZE];
+}
+
+/*** "Dmabuf devmem memory provider" ***/
+
+int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+
+ if (!binding)
+ return -EINVAL;
+
+ /* dma-buf dma addresses do not need and should not be used with
+ * dma_sync_for_cpu/device. Force disable dma_sync.
+ */
+ pool->dma_sync = false;
+ pool->dma_sync_for_cpu = false;
+
+ if (pool->p.order != 0)
+ return -E2BIG;
+
+ net_devmem_dmabuf_binding_get(binding);
+ return 0;
+}
+
+netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp)
+{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+ struct net_iov *niov;
+ netmem_ref netmem;
+
+ niov = net_devmem_alloc_dmabuf(binding);
+ if (!niov)
+ return 0;
+
+ netmem = net_iov_to_netmem(niov);
+
+ page_pool_set_pp_info(pool, netmem);
+
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
+ return netmem;
+}
+
+void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+
+ net_devmem_dmabuf_binding_put(binding);
+}
+
+bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
+{
+ long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem));
+
+ if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+ return false;
+
+ if (WARN_ON_ONCE(refcount != 1))
+ return false;
+
+ page_pool_clear_pp_info(netmem);
+
+ net_devmem_free_dmabuf(netmem_to_net_iov(netmem));
+
+ /* We don't want the page pool put_page()ing our net_iovs. */
+ return false;
+}
+
+static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp,
+ struct netdev_rx_queue *rxq)
+{
+ const struct net_devmem_dmabuf_binding *binding = mp_priv;
+ int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF;
+
+ return nla_put_u32(rsp, type, binding->id);
+}
+
+static void mp_dmabuf_devmem_uninstall(void *mp_priv,
+ struct netdev_rx_queue *rxq)
+{
+ struct net_devmem_dmabuf_binding *binding = mp_priv;
+ struct netdev_rx_queue *bound_rxq;
+ unsigned long xa_idx;
+
+ xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) {
+ if (bound_rxq == rxq) {
+ xa_erase(&binding->bound_rxqs, xa_idx);
+ if (xa_empty(&binding->bound_rxqs)) {
+ mutex_lock(&binding->lock);
+ binding->dev = NULL;
+ mutex_unlock(&binding->lock);
+ }
+ break;
+ }
+ }
+}
+
+static const struct memory_provider_ops dmabuf_devmem_ops = {
+ .init = mp_dmabuf_devmem_init,
+ .destroy = mp_dmabuf_devmem_destroy,
+ .alloc_netmems = mp_dmabuf_devmem_alloc_netmems,
+ .release_netmem = mp_dmabuf_devmem_release_page,
+ .nl_fill = mp_dmabuf_devmem_nl_fill,
+ .uninstall = mp_dmabuf_devmem_uninstall,
+};
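
The binding code above carves every scatterlist entry into PAGE_SIZE slots of a genpool chunk: net_devmem_get_dma_addr() maps a slot index to base_dma_addr + (index << PAGE_SHIFT), and net_devmem_alloc_dmabuf() maps an allocated address back to an index. A stand-alone sketch of that arithmetic, with made-up base address and chunk length:

/* sketch: index <-> DMA address conversion for one dma-buf chunk */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SHIFT 12
#define SKETCH_PAGE_SIZE (1ULL << SKETCH_PAGE_SHIFT)

int main(void)
{
    uint64_t base_dma_addr = 0x80000000ULL;      /* start of one sg entry */
    uint64_t chunk_len = 16 * SKETCH_PAGE_SIZE;  /* sg_dma_len(sg) */
    uint64_t num_niovs = chunk_len / SKETCH_PAGE_SIZE;

    for (uint64_t idx = 0; idx < num_niovs; idx++) {
        /* net_devmem_get_dma_addr(): index -> DMA address */
        uint64_t dma_addr = base_dma_addr + (idx << SKETCH_PAGE_SHIFT);

        /* net_devmem_alloc_dmabuf(): DMA address -> index */
        uint64_t back = (dma_addr - base_dma_addr) / SKETCH_PAGE_SIZE;

        assert(back == idx);
    }

    printf("%llu page-sized slots map 1:1 onto the chunk\n",
           (unsigned long long)num_niovs);
    return 0;
}
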
diff --git a/net/core/devmem.h b/net/core/devmem.h
new file mode 100644
index 000000000000..0a3b28ba5c13
--- /dev/null
+++ b/net/core/devmem.h
@@ -0,0 +1,246 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Device memory TCP support
+ *
+ * Authors: Mina Almasry <almasrymina@google.com>
+ * Willem de Bruijn <willemb@google.com>
+ * Kaiyuan Zhang <kaiyuanz@google.com>
+ *
+ */
+#ifndef _NET_DEVMEM_H
+#define _NET_DEVMEM_H
+
+#include <net/netmem.h>
+#include <net/netdev_netlink.h>
+
+struct netlink_ext_ack;
+
+struct net_devmem_dmabuf_binding {
+ struct dma_buf *dmabuf;
+ struct dma_buf_attachment *attachment;
+ struct sg_table *sgt;
+ struct net_device *dev;
+ struct gen_pool *chunk_pool;
+ /* Protect dev */
+ struct mutex lock;
+
+ /* The user holds a ref (via the netlink API) for as long as they want
+ * the binding to remain alive. Each page pool using this binding holds
+ * a ref to keep the binding alive. The page_pool does not release the
+ * ref until all the net_iovs allocated from this binding are released
+ * back to the page_pool.
+ *
+ * The binding undoes itself and unmaps the underlying dmabuf once all
+ * those refs are dropped and the binding is no longer desired or in
+ * use.
+ *
+ * net_devmem_get_net_iov() on dmabuf net_iovs will increment this
+ * reference, making sure that the binding remains alive until all the
+ * net_iovs are no longer used. net_iovs allocated from this binding
+ * that are stuck in the TX path for any reason (such as awaiting
+ * retransmits) hold a reference to the binding until the skb holding
+ * them is freed.
+ */
+ refcount_t ref;
+
+ /* The list of bindings currently active. Used for netlink to notify us
+ * of the user dropping the bind.
+ */
+ struct list_head list;
+
+ /* rxq's this binding is active on. */
+ struct xarray bound_rxqs;
+
+ /* ID of this binding. Globally unique to all bindings currently
+ * active.
+ */
+ u32 id;
+
+ /* Array of net_iov pointers for this binding, sorted by virtual
+ * address. This array is convenient to map the virtual addresses to
+ * net_iovs in the TX path.
+ */
+ struct net_iov **tx_vec;
+
+ struct work_struct unbind_w;
+};
+
+#if defined(CONFIG_NET_DEVMEM)
+/* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist
+ * entry from the dmabuf is inserted into the genpool as a chunk, and needs
+ * this owner struct to keep track of some metadata necessary to create
+ * allocations from this chunk.
+ */
+struct dmabuf_genpool_chunk_owner {
+ struct net_iov_area area;
+ struct net_devmem_dmabuf_binding *binding;
+
+ /* dma_addr of the start of the chunk. */
+ dma_addr_t base_dma_addr;
+};
+
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
+struct net_devmem_dmabuf_binding *
+net_devmem_bind_dmabuf(struct net_device *dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
+ struct netlink_ext_ack *extack);
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id);
+void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
+int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+ struct net_devmem_dmabuf_binding *binding,
+ struct netlink_ext_ack *extack);
+void net_devmem_bind_tx_release(struct sock *sk);
+
+static inline struct dmabuf_genpool_chunk_owner *
+net_devmem_iov_to_chunk_owner(const struct net_iov *niov)
+{
+ struct net_iov_area *owner = net_iov_owner(niov);
+
+ return container_of(owner, struct dmabuf_genpool_chunk_owner, area);
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_iov_binding(const struct net_iov *niov)
+{
+ return net_devmem_iov_to_chunk_owner(niov)->binding;
+}
+
+static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
+{
+ return net_devmem_iov_binding(niov)->id;
+}
+
+static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
+{
+ struct net_iov_area *owner = net_iov_owner(niov);
+
+ return owner->base_virtual +
+ ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT);
+}
+
+static inline bool
+net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
+{
+ return refcount_inc_not_zero(&binding->ref);
+}
+
+static inline void
+net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
+{
+ if (!refcount_dec_and_test(&binding->ref))
+ return;
+
+ INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
+ schedule_work(&binding->unbind_w);
+}
+
+void net_devmem_get_net_iov(struct net_iov *niov);
+void net_devmem_put_net_iov(struct net_iov *niov);
+
+struct net_iov *
+net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
+void net_devmem_free_dmabuf(struct net_iov *ppiov);
+
+bool net_is_devmem_iov(struct net_iov *niov);
+struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id);
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+ size_t *off, size_t *size);
+
+#else
+struct net_devmem_dmabuf_binding;
+
+static inline void
+net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
+{
+}
+
+static inline void net_devmem_get_net_iov(struct net_iov *niov)
+{
+}
+
+static inline void net_devmem_put_net_iov(struct net_iov *niov)
+{
+}
+
+static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
+{
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_bind_dmabuf(struct net_device *dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd,
+ struct netdev_nl_sock *priv,
+ struct netlink_ext_ack *extack)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+ return NULL;
+}
+
+static inline void
+net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+}
+
+static inline int
+net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+ struct net_devmem_dmabuf_binding *binding,
+ struct netlink_ext_ack *extack)
+
+{
+ return -EOPNOTSUPP;
+}
+
+static inline struct net_iov *
+net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+ return NULL;
+}
+
+static inline void net_devmem_free_dmabuf(struct net_iov *ppiov)
+{
+}
+
+static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
+{
+ return 0;
+}
+
+static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
+{
+ return 0;
+}
+
+static inline bool net_is_devmem_iov(struct net_iov *niov)
+{
+ return false;
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+ size_t *off, size_t *size)
+{
+ return NULL;
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_iov_binding(const struct net_iov *niov)
+{
+ return NULL;
+}
+#endif
+
+#endif /* _NET_DEVMEM_H */
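
A user-space sketch, using C11 atomics, of the reference-counting discipline described in the struct net_devmem_dmabuf_binding comment above: a lookup only succeeds while some reference is already held (the inc-not-zero behaviour of net_devmem_dmabuf_binding_get()), and the final put defers the teardown, as net_devmem_dmabuf_binding_put() does by scheduling unbind_w. All names in the sketch are invented.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct binding_sketch {
    atomic_long ref;
};

static bool binding_get(struct binding_sketch *b)
{
    long old = atomic_load(&b->ref);

    /* inc-not-zero: never resurrect an object already going away */
    while (old != 0)
        if (atomic_compare_exchange_weak(&b->ref, &old, old + 1))
            return true;
    return false;
}

static void binding_put(struct binding_sketch *b)
{
    if (atomic_fetch_sub(&b->ref, 1) == 1)
        printf("last ref dropped: run the deferred unbind/free\n");
}

int main(void)
{
    struct binding_sketch b = { .ref = 1 };  /* ref held by the netlink user */

    if (binding_get(&b))    /* e.g. a page pool starts using the binding */
        binding_put(&b);    /* ...and later releases its last net_iov */

    binding_put(&b);        /* the user drops the netlink binding */
    return 0;
}
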
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index b0f221d658be..60d31c2feed3 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -37,7 +37,7 @@
#include <trace/events/napi.h>
#include <trace/events/devlink.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#define TRACE_ON 1
#define TRACE_OFF 0
@@ -74,7 +74,7 @@ struct net_dm_hw_entries {
};
struct per_cpu_dm_data {
- spinlock_t lock; /* Protects 'skb', 'hw_entries' and
+ raw_spinlock_t lock; /* Protects 'skb', 'hw_entries' and
* 'send_timer'
*/
union {
@@ -109,7 +109,8 @@ static u32 net_dm_queue_len = 1000;
struct net_dm_alert_ops {
void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb,
void *location,
- enum skb_drop_reason reason);
+ enum skb_drop_reason reason,
+ struct sock *rx_sk);
void (*napi_poll_probe)(void *ignore, struct napi_struct *napi,
int work, int budget);
void (*work_item_func)(struct work_struct *work);
@@ -168,9 +169,9 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
err:
mod_timer(&data->send_timer, jiffies + HZ / 10);
out:
- spin_lock_irqsave(&data->lock, flags);
+ raw_spin_lock_irqsave(&data->lock, flags);
swap(data->skb, skb);
- spin_unlock_irqrestore(&data->lock, flags);
+ raw_spin_unlock_irqrestore(&data->lock, flags);
if (skb) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
@@ -207,7 +208,7 @@ static void send_dm_alert(struct work_struct *work)
*/
static void sched_send_work(struct timer_list *t)
{
- struct per_cpu_dm_data *data = from_timer(data, t, send_timer);
+ struct per_cpu_dm_data *data = timer_container_of(data, t, send_timer);
schedule_work(&data->dm_alert_work);
}
@@ -225,7 +226,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
local_irq_save(flags);
data = this_cpu_ptr(&dm_cpu_data);
- spin_lock(&data->lock);
+ raw_spin_lock(&data->lock);
dskb = data->skb;
if (!dskb)
@@ -259,12 +260,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
}
out:
- spin_unlock_irqrestore(&data->lock, flags);
+ raw_spin_unlock_irqrestore(&data->lock, flags);
}
static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb,
void *location,
- enum skb_drop_reason reason)
+ enum skb_drop_reason reason,
+ struct sock *rx_sk)
{
trace_drop_common(skb, location);
}
@@ -314,9 +316,9 @@ net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data)
mod_timer(&hw_data->send_timer, jiffies + HZ / 10);
}
- spin_lock_irqsave(&hw_data->lock, flags);
+ raw_spin_lock_irqsave(&hw_data->lock, flags);
swap(hw_data->hw_entries, hw_entries);
- spin_unlock_irqrestore(&hw_data->lock, flags);
+ raw_spin_unlock_irqrestore(&hw_data->lock, flags);
return hw_entries;
}
@@ -448,7 +450,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
return;
hw_data = this_cpu_ptr(&dm_hw_cpu_data);
- spin_lock_irqsave(&hw_data->lock, flags);
+ raw_spin_lock_irqsave(&hw_data->lock, flags);
hw_entries = hw_data->hw_entries;
if (!hw_entries)
@@ -477,7 +479,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
}
out:
- spin_unlock_irqrestore(&hw_data->lock, flags);
+ raw_spin_unlock_irqrestore(&hw_data->lock, flags);
}
static const struct net_dm_alert_ops net_dm_alert_summary_ops = {
@@ -491,7 +493,8 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = {
static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
struct sk_buff *skb,
void *location,
- enum skb_drop_reason reason)
+ enum skb_drop_reason reason,
+ struct sock *rx_sk)
{
ktime_t tstamp = ktime_get_real();
struct per_cpu_dm_data *data;
@@ -1085,7 +1088,7 @@ err_module_put:
struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
struct sk_buff *skb;
- del_timer_sync(&hw_data->send_timer);
+ timer_delete_sync(&hw_data->send_timer);
cancel_work_sync(&hw_data->dm_alert_work);
while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
struct devlink_trap_metadata *hw_metadata;
@@ -1119,7 +1122,7 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack)
struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
struct sk_buff *skb;
- del_timer_sync(&hw_data->send_timer);
+ timer_delete_sync(&hw_data->send_timer);
cancel_work_sync(&hw_data->dm_alert_work);
while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
struct devlink_trap_metadata *hw_metadata;
@@ -1180,7 +1183,7 @@ err_module_put:
struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
struct sk_buff *skb;
- del_timer_sync(&data->send_timer);
+ timer_delete_sync(&data->send_timer);
cancel_work_sync(&data->dm_alert_work);
while ((skb = __skb_dequeue(&data->drop_queue)))
consume_skb(skb);
@@ -1208,7 +1211,7 @@ static void net_dm_trace_off_set(void)
struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
struct sk_buff *skb;
- del_timer_sync(&data->send_timer);
+ timer_delete_sync(&data->send_timer);
cancel_work_sync(&data->dm_alert_work);
while ((skb = __skb_dequeue(&data->drop_queue)))
consume_skb(skb);
@@ -1673,7 +1676,7 @@ static struct notifier_block dropmon_net_notifier = {
static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data)
{
- spin_lock_init(&data->lock);
+ raw_spin_lock_init(&data->lock);
skb_queue_head_init(&data->drop_queue);
u64_stats_init(&data->stats.syncp);
}
@@ -1731,30 +1734,30 @@ static int __init init_net_drop_monitor(void)
return -ENOSPC;
}
- rc = genl_register_family(&net_drop_monitor_family);
- if (rc) {
- pr_err("Could not create drop monitor netlink family\n");
- return rc;
+ for_each_possible_cpu(cpu) {
+ net_dm_cpu_data_init(cpu);
+ net_dm_hw_cpu_data_init(cpu);
}
- WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
rc = register_netdevice_notifier(&dropmon_net_notifier);
if (rc < 0) {
pr_crit("Failed to register netdevice notifier\n");
+ return rc;
+ }
+
+ rc = genl_register_family(&net_drop_monitor_family);
+ if (rc) {
+ pr_err("Could not create drop monitor netlink family\n");
goto out_unreg;
}
+ WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
rc = 0;
- for_each_possible_cpu(cpu) {
- net_dm_cpu_data_init(cpu);
- net_dm_hw_cpu_data_init(cpu);
- }
-
goto out;
out_unreg:
- genl_unregister_family(&net_drop_monitor_family);
+ WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
out:
return rc;
}
@@ -1763,19 +1766,18 @@ static void exit_net_drop_monitor(void)
{
int cpu;
- BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
-
/*
* Because of the module_get/put we do in the trace state change path
* we are guaranteed not to have any current users when we get here
*/
+ BUG_ON(genl_unregister_family(&net_drop_monitor_family));
+
+ BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
for_each_possible_cpu(cpu) {
net_dm_hw_cpu_data_fini(cpu);
net_dm_cpu_data_fini(cpu);
}
-
- BUG_ON(genl_unregister_family(&net_drop_monitor_family));
}
module_init(init_net_drop_monitor);
diff --git a/net/core/dst.c b/net/core/dst.c
index 95f533844f17..795ca07e28a4 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -109,9 +109,6 @@ static void dst_destroy(struct dst_entry *dst)
child = xdst->child;
}
#endif
- if (!(dst->flags & DST_NOCOUNT))
- dst_entries_add(dst->ops, -1);
-
if (dst->ops->destroy)
dst->ops->destroy(dst);
netdev_put(dst->dev, &dst->dev_tracker);
@@ -159,17 +156,35 @@ void dst_dev_put(struct dst_entry *dst)
}
EXPORT_SYMBOL(dst_dev_put);
+static void dst_count_dec(struct dst_entry *dst)
+{
+ if (!(dst->flags & DST_NOCOUNT))
+ dst_entries_add(dst->ops, -1);
+}
+
void dst_release(struct dst_entry *dst)
{
- if (dst && rcuref_put(&dst->__rcuref))
+ if (dst && rcuref_put(&dst->__rcuref)) {
+#ifdef CONFIG_DST_CACHE
+ if (dst->flags & DST_METADATA) {
+ struct metadata_dst *md_dst = (struct metadata_dst *)dst;
+
+ if (md_dst->type == METADATA_IP_TUNNEL)
+ dst_cache_reset_now(&md_dst->u.tun_info.dst_cache);
+ }
+#endif
+ dst_count_dec(dst);
call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
+ }
}
EXPORT_SYMBOL(dst_release);
void dst_release_immediate(struct dst_entry *dst)
{
- if (dst && rcuref_put(&dst->__rcuref))
+ if (dst && rcuref_put(&dst->__rcuref)) {
+ dst_count_dec(dst);
dst_destroy(dst);
+ }
}
EXPORT_SYMBOL(dst_release_immediate);
@@ -279,7 +294,8 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
{
struct metadata_dst *md_dst;
- md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
+ md_dst = kmalloc(struct_size(md_dst, u.tun_info.options, optslen),
+ flags);
if (!md_dst)
return NULL;
@@ -307,7 +323,8 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
int cpu;
struct metadata_dst __percpu *md_dst;
- md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
+ md_dst = __alloc_percpu_gfp(struct_size(md_dst, u.tun_info.options,
+ optslen),
__alignof__(struct metadata_dst), flags);
if (!md_dst)
return NULL;
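
The metadata_dst_alloc() and metadata_dst_alloc_percpu() hunks above replace the open-coded `sizeof(*md_dst) + optslen` with struct_size(), which sizes a struct with a trailing flexible array without risking integer overflow (the kernel macro saturates on overflow). A stand-alone illustration of the same idea; the struct and helper here are invented and simply refuse the allocation instead of saturating.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct md_sketch {
    int type;
    unsigned char options[];    /* trailing flexible array, like tun_info options */
};

static void *alloc_md_sketch(size_t optslen)
{
    /* open-coded equivalent of struct_size(md, options, optslen) */
    if (optslen > SIZE_MAX - sizeof(struct md_sketch))
        return NULL;            /* would overflow; the kernel macro saturates */

    return malloc(sizeof(struct md_sketch) + optslen);
}

int main(void)
{
    struct md_sketch *md = alloc_md_sketch(64);

    if (!md)
        return 1;
    printf("allocated header plus 64 option bytes\n");
    free(md);
    return 0;
}
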
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 0ccfd5fa5cb9..93a04d18e505 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -17,6 +17,7 @@
struct dst_cache_pcpu {
unsigned long refresh_ts;
struct dst_entry *dst;
+ local_lock_t bh_lock;
u32 cookie;
union {
struct in_addr in_saddr;
@@ -27,6 +28,7 @@ struct dst_cache_pcpu {
static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
struct dst_entry *dst, u32 cookie)
{
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
dst_release(dst_cache->dst);
if (dst)
dst_hold(dst);
@@ -40,6 +42,7 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
{
struct dst_entry *dst;
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
dst = idst->dst;
if (!dst)
goto fail;
@@ -47,7 +50,8 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
/* the cache already holds a dst reference; it can't go away */
dst_hold(dst);
- if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) ||
+ if (unlikely(!time_after(idst->refresh_ts,
+ READ_ONCE(dst_cache->reset_ts)) ||
(dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
dst_cache_per_cpu_dst_set(idst, NULL, 0);
dst_release(dst);
@@ -62,10 +66,15 @@ fail:
struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
{
+ struct dst_entry *dst;
+
if (!dst_cache->cache)
return NULL;
- return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+ dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+ return dst;
}
EXPORT_SYMBOL_GPL(dst_cache_get);
@@ -77,13 +86,17 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
if (!dst_cache->cache)
return NULL;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
idst = this_cpu_ptr(dst_cache->cache);
dst = dst_cache_per_cpu_get(dst_cache, idst);
- if (!dst)
+ if (!dst) {
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return NULL;
+ }
*saddr = idst->in_saddr.s_addr;
- return container_of(dst, struct rtable, dst);
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+ return dst_rtable(dst);
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
@@ -95,9 +108,11 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
if (!dst_cache->cache)
return;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
idst = this_cpu_ptr(dst_cache->cache);
dst_cache_per_cpu_dst_set(idst, dst, 0);
idst->in_saddr.s_addr = saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
@@ -110,10 +125,13 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
if (!dst_cache->cache)
return;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+
idst = this_cpu_ptr(dst_cache->cache);
- dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst,
- rt6_get_cookie((struct rt6_info *)dst));
+ dst_cache_per_cpu_dst_set(idst, dst,
+ rt6_get_cookie(dst_rt6_info(dst)));
idst->in6_saddr = *saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
@@ -126,12 +144,17 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
if (!dst_cache->cache)
return NULL;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+
idst = this_cpu_ptr(dst_cache->cache);
dst = dst_cache_per_cpu_get(dst_cache, idst);
- if (!dst)
+ if (!dst) {
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return NULL;
+ }
*saddr = idst->in6_saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return dst;
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
@@ -139,10 +162,14 @@ EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
{
+ unsigned int i;
+
dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
gfp | __GFP_ZERO);
if (!dst_cache->cache)
return -ENOMEM;
+ for_each_possible_cpu(i)
+ local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock);
dst_cache_reset(dst_cache);
return 0;
@@ -170,7 +197,7 @@ void dst_cache_reset_now(struct dst_cache *dst_cache)
if (!dst_cache->cache)
return;
- dst_cache->reset_ts = jiffies;
+ dst_cache_reset(dst_cache);
for_each_possible_cpu(i) {
struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i);
struct dst_entry *dst = idst->dst;
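
dst_cache invalidation is lazy: dst_cache_reset() only updates reset_ts (which the hunks above now read with READ_ONCE() and reuse from dst_cache_reset_now()), and each per-CPU entry is dropped on its next lookup if its refresh stamp is not newer than the reset stamp. A stand-alone sketch of that scheme, with a generation counter standing in for jiffies; all names are invented.

#include <stdbool.h>
#include <stdio.h>

struct cache_entry_sketch {
    unsigned long refresh_gen;  /* stamped when the entry is (re)filled */
    int value;
    bool valid;
};

static unsigned long reset_gen; /* bumped by the reset operation */

static void entry_set(struct cache_entry_sketch *e, int value)
{
    e->value = value;
    e->valid = true;
    e->refresh_gen = reset_gen;
}

static bool entry_get(struct cache_entry_sketch *e, int *value)
{
    /* stale if the entry was filled before the last reset
     * (the kernel compares jiffies stamps with time_after())
     */
    if (!e->valid || e->refresh_gen < reset_gen) {
        e->valid = false;       /* drop the stale entry lazily */
        return false;
    }
    *value = e->value;
    return true;
}

int main(void)
{
    struct cache_entry_sketch e = { 0 };
    int v = -1;

    entry_set(&e, 42);
    printf("hit: %d\n", entry_get(&e, &v) ? v : -1);

    reset_gen++;    /* reset is cheap: no walk over the per-CPU entries */
    printf("after reset, hit? %s\n", entry_get(&e, &v) ? "yes" : "no");
    return 0;
}
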
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index fc96259807b6..5cdca49b1d7c 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -43,7 +43,6 @@ static unsigned int fib_seq_sum(struct net *net)
struct fib_notifier_ops *ops;
unsigned int fib_seq = 0;
- rtnl_lock();
rcu_read_lock();
list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) {
if (!try_module_get(ops->owner))
@@ -52,7 +51,6 @@ static unsigned int fib_seq_sum(struct net *net)
module_put(ops->owner);
}
rcu_read_unlock();
- rtnl_unlock();
return fib_seq;
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 3f933ffcefc3..8ca634964e36 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -11,6 +11,7 @@
#include <linux/list.h>
#include <linux/module.h>
#include <net/net_namespace.h>
+#include <net/inet_dscp.h>
#include <net/sock.h>
#include <net/fib_rules.h>
#include <net/ip_tunnels.h>
@@ -36,8 +37,8 @@ static const struct fib_kuid_range fib_kuid_range_unset = {
bool fib_rule_matchall(const struct fib_rule *rule)
{
- if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id ||
- rule->flags)
+ if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) ||
+ rule->mark || rule->tun_id || rule->flags)
return false;
if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1)
return false;
@@ -72,7 +73,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
- /* The lock is not required here, the list in unreacheable
+ /* The lock is not required here, the list is unreachable
* at the moment this function is called */
list_add_tail(&r->list, &ops->rules_list);
return 0;
@@ -100,7 +101,8 @@ static void notify_rule_change(int event, struct fib_rule *rule,
struct fib_rules_ops *ops, struct nlmsghdr *nlh,
u32 pid);
-static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family)
+static struct fib_rules_ops *lookup_rules_ops(const struct net *net,
+ int family)
{
struct fib_rules_ops *ops;
@@ -255,16 +257,36 @@ static int nla_put_port_range(struct sk_buff *skb, int attrtype,
return nla_put(skb, attrtype, sizeof(*range), range);
}
+static bool fib_rule_iif_match(const struct fib_rule *rule, int iifindex,
+ const struct flowi *fl)
+{
+ u8 iif_is_l3_master = READ_ONCE(rule->iif_is_l3_master);
+
+ return iif_is_l3_master ? l3mdev_fib_rule_iif_match(fl, iifindex) :
+ fl->flowi_iif == iifindex;
+}
+
+static bool fib_rule_oif_match(const struct fib_rule *rule, int oifindex,
+ const struct flowi *fl)
+{
+ u8 oif_is_l3_master = READ_ONCE(rule->oif_is_l3_master);
+
+ return oif_is_l3_master ? l3mdev_fib_rule_oif_match(fl, oifindex) :
+ fl->flowi_oif == oifindex;
+}
+
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
{
- int ret = 0;
+ int iifindex, oifindex, ret = 0;
- if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
+ iifindex = READ_ONCE(rule->iifindex);
+ if (iifindex && !fib_rule_iif_match(rule, iifindex, fl))
goto out;
- if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
+ oifindex = READ_ONCE(rule->oifindex);
+ if (oifindex && !fib_rule_oif_match(rule, oifindex, fl))
goto out;
if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
@@ -369,7 +391,10 @@ static int call_fib_rule_notifiers(struct net *net,
.rule = rule,
};
- ops->fib_rules_seq++;
+ ASSERT_RTNL_NET(net);
+
+ /* Paired with READ_ONCE() in fib_rules_seq() */
+ WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1);
return call_fib_notifiers(net, event_type, &info.info);
}
@@ -396,17 +421,16 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family,
}
EXPORT_SYMBOL_GPL(fib_rules_dump);
-unsigned int fib_rules_seq_read(struct net *net, int family)
+unsigned int fib_rules_seq_read(const struct net *net, int family)
{
unsigned int fib_rules_seq;
struct fib_rules_ops *ops;
- ASSERT_RTNL();
-
ops = lookup_rules_ops(net, family);
if (!ops)
return 0;
- fib_rules_seq = ops->fib_rules_seq;
+ /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */
+ fib_rules_seq = READ_ONCE(ops->fib_rules_seq);
rules_ops_put(ops);
return fib_rules_seq;
@@ -456,9 +480,6 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
if (rule->tun_id && r->tun_id != rule->tun_id)
continue;
- if (r->fr_net != rule->fr_net)
- continue;
-
if (rule->l3mdev && r->l3mdev != rule->l3mdev)
continue;
@@ -478,11 +499,17 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
&rule->sport_range))
continue;
+ if (rule->sport_mask && r->sport_mask != rule->sport_mask)
+ continue;
+
if (fib_rule_port_range_set(&rule->dport_range) &&
!fib_rule_port_range_compare(&r->dport_range,
&rule->dport_range))
continue;
+ if (rule->dport_mask && r->dport_mask != rule->dport_mask)
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return r;
@@ -512,14 +539,40 @@ static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
}
#endif
-static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
+static int fib_nl2rule_port_mask(const struct nlattr *mask_attr,
+ const struct fib_rule_port_range *range,
+ u16 *port_mask,
+ struct netlink_ext_ack *extack)
+{
+ if (!fib_rule_port_range_valid(range)) {
+ NL_SET_ERR_MSG_ATTR(extack, mask_attr,
+ "Cannot specify port mask without port value");
+ return -EINVAL;
+ }
+
+ if (fib_rule_port_is_range(range)) {
+ NL_SET_ERR_MSG_ATTR(extack, mask_attr,
+ "Cannot specify port mask for port range");
+ return -EINVAL;
+ }
+
+ if (range->start & ~nla_get_u16(mask_attr)) {
+ NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask");
+ return -EINVAL;
+ }
+
+ *port_mask = nla_get_u16(mask_attr);
+
+ return 0;
+}
+
+static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack,
struct fib_rules_ops *ops,
struct nlattr *tb[],
struct fib_rule **rule,
bool *user_priority)
{
- struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
struct fib_rule *nlrule = NULL;
int err = -EINVAL;
@@ -551,31 +604,18 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[FRA_PRIORITY]) {
nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
*user_priority = true;
- } else {
- nlrule->pref = fib_default_rule_pref(ops);
}
- nlrule->proto = tb[FRA_PROTOCOL] ?
- nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
+ nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC);
if (tb[FRA_IIFNAME]) {
- struct net_device *dev;
-
nlrule->iifindex = -1;
nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, nlrule->iifname);
- if (dev)
- nlrule->iifindex = dev->ifindex;
}
if (tb[FRA_OIFNAME]) {
- struct net_device *dev;
-
nlrule->oifindex = -1;
nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, nlrule->oifname);
- if (dev)
- nlrule->oifindex = dev->ifindex;
}
if (tb[FRA_FWMARK]) {
@@ -617,11 +657,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
}
nlrule->target = nla_get_u32(tb[FRA_GOTO]);
- /* Backward jumps are prohibited to avoid endless loops */
- if (nlrule->target <= nlrule->pref) {
- NL_SET_ERR_MSG(extack, "Backward goto not supported");
- goto errout_free;
- }
} else if (nlrule->action == FR_ACT_GOTO) {
NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
goto errout_free;
@@ -660,6 +695,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
NL_SET_ERR_MSG(extack, "Invalid sport range");
goto errout_free;
}
+ if (!fib_rule_port_is_range(&nlrule->sport_range))
+ nlrule->sport_mask = U16_MAX;
+ }
+
+ if (tb[FRA_SPORT_MASK]) {
+ err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK],
+ &nlrule->sport_range,
+ &nlrule->sport_mask, extack);
+ if (err)
+ goto errout_free;
}
if (tb[FRA_DPORT_RANGE]) {
@@ -669,6 +714,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
NL_SET_ERR_MSG(extack, "Invalid dport range");
goto errout_free;
}
+ if (!fib_rule_port_is_range(&nlrule->dport_range))
+ nlrule->dport_mask = U16_MAX;
+ }
+
+ if (tb[FRA_DPORT_MASK]) {
+ err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK],
+ &nlrule->dport_range,
+ &nlrule->dport_mask, extack);
+ if (err)
+ goto errout_free;
}
*rule = nlrule;
@@ -681,6 +736,43 @@ errout:
return err;
}
+static int fib_nl2rule_rtnl(struct fib_rule *nlrule,
+ struct fib_rules_ops *ops,
+ struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ if (!tb[FRA_PRIORITY])
+ nlrule->pref = fib_default_rule_pref(ops);
+
+ /* Backward jumps are prohibited to avoid endless loops */
+ if (tb[FRA_GOTO] && nlrule->target <= nlrule->pref) {
+ NL_SET_ERR_MSG(extack, "Backward goto not supported");
+ return -EINVAL;
+ }
+
+ if (tb[FRA_IIFNAME]) {
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname);
+ if (dev) {
+ nlrule->iifindex = dev->ifindex;
+ nlrule->iif_is_l3_master = netif_is_l3_master(dev);
+ }
+ }
+
+ if (tb[FRA_OIFNAME]) {
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname);
+ if (dev) {
+ nlrule->oifindex = dev->ifindex;
+ nlrule->oif_is_l3_master = netif_is_l3_master(dev);
+ }
+ }
+
+ return 0;
+}
+
static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
struct nlattr **tb, struct fib_rule *rule)
{
@@ -717,9 +809,6 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
if (r->tun_id != rule->tun_id)
continue;
- if (r->fr_net != rule->fr_net)
- continue;
-
if (r->l3mdev != rule->l3mdev)
continue;
@@ -737,10 +826,16 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
&rule->sport_range))
continue;
+ if (r->sport_mask != rule->sport_mask)
+ continue;
+
if (!fib_rule_port_range_compare(&r->dport_range,
&rule->dport_range))
continue;
+ if (r->dport_mask != rule->dport_mask)
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return 1;
@@ -766,21 +861,27 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = {
[FRA_PROTOCOL] = { .type = NLA_U8 },
[FRA_IP_PROTO] = { .type = NLA_U8 },
[FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
- [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }
+ [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
+ [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2),
+ [FRA_FLOWLABEL] = { .type = NLA_BE32 },
+ [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 },
+ [FRA_SPORT_MASK] = { .type = NLA_U16 },
+ [FRA_DPORT_MASK] = { .type = NLA_U16 },
+ [FRA_DSCP_MASK] = NLA_POLICY_MASK(NLA_U8, INET_DSCP_MASK >> 2),
};
-int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, bool rtnl_held)
{
- struct net *net = sock_net(skb->sk);
- struct fib_rule_hdr *frh = nlmsg_data(nlh);
- struct fib_rules_ops *ops = NULL;
struct fib_rule *rule = NULL, *r, *last = NULL;
- struct nlattr *tb[FRA_MAX + 1];
int err = -EINVAL, unresolved = 0;
+ struct fib_rules_ops *ops = NULL;
+ struct nlattr *tb[FRA_MAX + 1];
bool user_priority = false;
+ struct fib_rule_hdr *frh;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
}
@@ -799,10 +900,17 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
+ err = fib_nl2rule(net, nlh, extack, ops, tb, &rule, &user_priority);
if (err)
goto errout;
+ if (!rtnl_held)
+ rtnl_net_lock(net);
+
+ err = fib_nl2rule_rtnl(rule, ops, tb, extack);
+ if (err)
+ goto errout_free;
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
rule_exists(ops, frh, tb, rule)) {
err = -EEXIST;
@@ -864,31 +972,45 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
if (rule->tun_id)
ip_tunnel_need_metadata();
+ fib_rule_get(rule);
+
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
+
notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
+ fib_rule_put(rule);
flush_route_cache(ops);
rules_ops_put(ops);
return 0;
errout_free:
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
kfree(rule);
errout:
rules_ops_put(ops);
return err;
}
-EXPORT_SYMBOL_GPL(fib_nl_newrule);
+EXPORT_SYMBOL_GPL(fib_newrule);
-int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
- struct net *net = sock_net(skb->sk);
- struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ return fib_newrule(sock_net(skb->sk), skb, nlh, extack, false);
+}
+
+int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, bool rtnl_held)
+{
+ struct fib_rule *rule = NULL, *nlrule = NULL;
struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule = NULL, *r, *nlrule = NULL;
struct nlattr *tb[FRA_MAX+1];
- int err = -EINVAL;
bool user_priority = false;
+ struct fib_rule_hdr *frh;
+ int err = -EINVAL;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
}
@@ -907,25 +1029,32 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
+ err = fib_nl2rule(net, nlh, extack, ops, tb, &nlrule, &user_priority);
if (err)
goto errout;
+ if (!rtnl_held)
+ rtnl_net_lock(net);
+
+ err = fib_nl2rule_rtnl(nlrule, ops, tb, extack);
+ if (err)
+ goto errout_free;
+
rule = rule_find(ops, frh, tb, nlrule, user_priority);
if (!rule) {
err = -ENOENT;
- goto errout;
+ goto errout_free;
}
if (rule->flags & FIB_RULE_PERMANENT) {
err = -EPERM;
- goto errout;
+ goto errout_free;
}
if (ops->delete) {
err = ops->delete(rule);
if (err)
- goto errout;
+ goto errout_free;
}
if (rule->tun_id)
@@ -947,7 +1076,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
* current if it is goto rule, have actually been added.
*/
if (ops->nr_goto_rules > 0) {
- struct fib_rule *n;
+ struct fib_rule *n, *r;
n = list_next_entry(rule, list);
if (&n->list == &ops->rules_list || n->pref != rule->pref)
@@ -961,22 +1090,33 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
}
}
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
- NULL);
- notify_rule_change(RTM_DELRULE, rule, ops, nlh,
- NETLINK_CB(skb).portid);
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL);
+
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
+
+ notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
fib_rule_put(rule);
flush_route_cache(ops);
rules_ops_put(ops);
kfree(nlrule);
return 0;
-errout:
+errout_free:
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
kfree(nlrule);
+errout:
rules_ops_put(ops);
return err;
}
-EXPORT_SYMBOL_GPL(fib_nl_delrule);
+EXPORT_SYMBOL_GPL(fib_delrule);
+
+static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ return fib_delrule(sock_net(skb->sk), skb, nlh, extack, false);
+}
static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
struct fib_rule *rule)
@@ -995,7 +1135,9 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(1) /* FRA_PROTOCOL */
+ nla_total_size(1) /* FRA_IP_PROTO */
+ nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */
- + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */
+ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */
+ + nla_total_size(2) /* FRA_SPORT_MASK */
+ + nla_total_size(2); /* FRA_DPORT_MASK */
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -1036,14 +1178,14 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
if (rule->iifname[0]) {
if (nla_put_string(skb, FRA_IIFNAME, rule->iifname))
goto nla_put_failure;
- if (rule->iifindex == -1)
+ if (READ_ONCE(rule->iifindex) == -1)
frh->flags |= FIB_RULE_IIF_DETACHED;
}
if (rule->oifname[0]) {
if (nla_put_string(skb, FRA_OIFNAME, rule->oifname))
goto nla_put_failure;
- if (rule->oifindex == -1)
+ if (READ_ONCE(rule->oifindex) == -1)
frh->flags |= FIB_RULE_OIF_DETACHED;
}
@@ -1063,8 +1205,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
nla_put_uid_range(skb, &rule->uid_range)) ||
(fib_rule_port_range_set(&rule->sport_range) &&
nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) ||
+ (rule->sport_mask && nla_put_u16(skb, FRA_SPORT_MASK,
+ rule->sport_mask)) ||
(fib_rule_port_range_set(&rule->dport_range) &&
nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) ||
+ (rule->dport_mask && nla_put_u16(skb, FRA_DPORT_MASK,
+ rule->dport_mask)) ||
(rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto)))
goto nla_put_failure;
@@ -1116,12 +1262,12 @@ static int fib_valid_dumprule_req(const struct nlmsghdr *nlh,
{
struct fib_rule_hdr *frh;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request");
return -EINVAL;
}
- frh = nlmsg_data(nlh);
if (frh->dst_len || frh->src_len || frh->tos || frh->table ||
frh->res1 || frh->res2 || frh->action || frh->flags) {
NL_SET_ERR_MSG(extack,
@@ -1142,10 +1288,10 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
struct fib_rules_ops *ops;
- int idx = 0, family;
+ int err, idx = 0, family;
if (cb->strict_check) {
- int err = fib_valid_dumprule_req(nlh, cb->extack);
+ err = fib_valid_dumprule_req(nlh, cb->extack);
if (err < 0)
return err;
@@ -1158,17 +1304,17 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
if (ops == NULL)
return -EAFNOSUPPORT;
- dump_rules(skb, cb, ops);
-
- return skb->len;
+ return dump_rules(skb, cb, ops);
}
+ err = 0;
rcu_read_lock();
list_for_each_entry_rcu(ops, &net->rules_ops, list) {
if (idx < cb->args[0] || !try_module_get(ops->owner))
goto skip;
- if (dump_rules(skb, cb, ops) < 0)
+ err = dump_rules(skb, cb, ops);
+ if (err < 0)
break;
cb->args[1] = 0;
@@ -1178,7 +1324,7 @@ skip:
rcu_read_unlock();
cb->args[0] = idx;
- return skb->len;
+ return err;
}
static void notify_rule_change(int event, struct fib_rule *rule,
@@ -1205,8 +1351,7 @@ static void notify_rule_change(int event, struct fib_rule *rule,
rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, ops->nlgroup, err);
+ rtnl_set_sk_err(net, ops->nlgroup, err);
}
static void attach_rules(struct list_head *rules, struct net_device *dev)
@@ -1215,11 +1360,17 @@ static void attach_rules(struct list_head *rules, struct net_device *dev)
list_for_each_entry(rule, rules, list) {
if (rule->iifindex == -1 &&
- strcmp(dev->name, rule->iifname) == 0)
- rule->iifindex = dev->ifindex;
+ strcmp(dev->name, rule->iifname) == 0) {
+ WRITE_ONCE(rule->iifindex, dev->ifindex);
+ WRITE_ONCE(rule->iif_is_l3_master,
+ netif_is_l3_master(dev));
+ }
if (rule->oifindex == -1 &&
- strcmp(dev->name, rule->oifname) == 0)
- rule->oifindex = dev->ifindex;
+ strcmp(dev->name, rule->oifname) == 0) {
+ WRITE_ONCE(rule->oifindex, dev->ifindex);
+ WRITE_ONCE(rule->oif_is_l3_master,
+ netif_is_l3_master(dev));
+ }
}
}
@@ -1228,10 +1379,14 @@ static void detach_rules(struct list_head *rules, struct net_device *dev)
struct fib_rule *rule;
list_for_each_entry(rule, rules, list) {
- if (rule->iifindex == dev->ifindex)
- rule->iifindex = -1;
- if (rule->oifindex == dev->ifindex)
- rule->oifindex = -1;
+ if (rule->iifindex == dev->ifindex) {
+ WRITE_ONCE(rule->iifindex, -1);
+ WRITE_ONCE(rule->iif_is_l3_master, false);
+ }
+ if (rule->oifindex == dev->ifindex) {
+ WRITE_ONCE(rule->oifindex, -1);
+ WRITE_ONCE(rule->oif_is_l3_master, false);
+ }
}
}
@@ -1288,12 +1443,20 @@ static struct pernet_operations fib_rules_net_ops = {
.exit = fib_rules_net_exit,
};
+static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+};
+
static int __init fib_rules_init(void)
{
int err;
- rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0);
+
+ rtnl_register_many(fib_rules_rtnl_msg_handlers);
err = register_pernet_subsys(&fib_rules_net_ops);
if (err < 0)
@@ -1308,9 +1471,7 @@ static int __init fib_rules_init(void)
fail_unregister:
unregister_pernet_subsys(&fib_rules_net_ops);
fail:
- rtnl_unregister(PF_UNSPEC, RTM_NEWRULE);
- rtnl_unregister(PF_UNSPEC, RTM_DELRULE);
- rtnl_unregister(PF_UNSPEC, RTM_GETRULE);
+ rtnl_unregister_many(fib_rules_rtnl_msg_handlers);
return err;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 8adf95765cdd..7a72f766aacf 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -42,7 +42,7 @@
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
@@ -84,9 +84,13 @@
#include <net/netkit.h>
#include <linux/un.h>
#include <net/xdp_sock_drv.h>
+#include <net/inet_dscp.h>
#include "dev.h"
+/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */
+static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check");
+
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
@@ -214,24 +218,36 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
return 0;
}
+static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset)
+{
+ if (likely(offset >= 0))
+ return offset;
+
+ if (offset >= SKF_NET_OFF)
+ return offset - SKF_NET_OFF + skb_network_offset(skb);
+
+ if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb))
+ return offset - SKF_LL_OFF + skb_mac_offset(skb);
+
+ return INT_MIN;
+}
+
BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- u8 tmp, *ptr;
+ u8 tmp;
const int len = sizeof(tmp);
- if (offset >= 0) {
- if (headlen - offset >= len)
- return *(u8 *)(data + offset);
- if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
- return tmp;
- } else {
- ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
- if (likely(ptr))
- return *(u8 *)ptr;
- }
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
- return -EFAULT;
+ if (headlen - offset >= len)
+ return *(u8 *)(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return tmp;
+ else
+ return -EFAULT;
}
BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
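The negative-offset handling above exists for classic BPF filters, which address packet data through the virtual SKF_NET_OFF and SKF_LL_OFF bases. As an illustration only (not part of this diff), a minimal user-space filter that reads the IP protocol byte through SKF_NET_OFF could look like the sketch below; the AF_PACKET socket setup around it is assumed.

#include <linux/filter.h>	/* struct sock_filter, BPF_STMT, SKF_NET_OFF */
#include <netinet/in.h>		/* IPPROTO_TCP */
#include <sys/socket.h>		/* setsockopt, SO_ATTACH_FILTER */

/* Accept only TCP: load one byte at net-header offset 9 (iphdr->protocol). */
static struct sock_filter tcp_only[] = {
	BPF_STMT(BPF_LD  | BPF_B | BPF_ABS, SKF_NET_OFF + 9),
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, IPPROTO_TCP, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, 0xffff),	/* accept */
	BPF_STMT(BPF_RET | BPF_K, 0),		/* drop   */
};

static int attach_tcp_only(int sock_fd)
{
	struct sock_fprog fprog = {
		.len	= sizeof(tcp_only) / sizeof(tcp_only[0]),
		.filter	= tcp_only,
	};

	return setsockopt(sock_fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &fprog, sizeof(fprog));
}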
@@ -244,21 +260,19 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- __be16 tmp, *ptr;
+ __be16 tmp;
const int len = sizeof(tmp);
- if (offset >= 0) {
- if (headlen - offset >= len)
- return get_unaligned_be16(data + offset);
- if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
- return be16_to_cpu(tmp);
- } else {
- ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
- if (likely(ptr))
- return get_unaligned_be16(ptr);
- }
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
- return -EFAULT;
+ if (headlen - offset >= len)
+ return get_unaligned_be16(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be16_to_cpu(tmp);
+ else
+ return -EFAULT;
}
BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
@@ -271,21 +285,19 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- __be32 tmp, *ptr;
+ __be32 tmp;
const int len = sizeof(tmp);
- if (likely(offset >= 0)) {
- if (headlen - offset >= len)
- return get_unaligned_be32(data + offset);
- if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
- return be32_to_cpu(tmp);
- } else {
- ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
- if (likely(ptr))
- return get_unaligned_be32(ptr);
- }
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
- return -EFAULT;
+ if (headlen - offset >= len)
+ return get_unaligned_be32(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be32_to_cpu(tmp);
+ else
+ return -EFAULT;
}
BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
@@ -1262,8 +1274,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
* so we need to keep the user BPF around until the 2nd
* pass. At this time, the user BPF is stored in fp->insns.
*/
- old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
- GFP_KERNEL | __GFP_NOWARN);
+ old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter),
+ GFP_KERNEL | __GFP_NOWARN);
if (!old_prog) {
err = -ENOMEM;
goto out_err;
@@ -1650,18 +1662,14 @@ void sk_reuseport_prog_free(struct bpf_prog *prog)
bpf_prog_destroy(prog);
}
-struct bpf_scratchpad {
- union {
- __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
- u8 buff[MAX_BPF_STACK];
- };
-};
-
-static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
-
static inline int __bpf_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
+#ifdef CONFIG_DEBUG_NET
+ /* Avoid a splat in pskb_may_pull_reason() */
+ if (write_len > INT_MAX)
+ return -EINVAL;
+#endif
return skb_ensure_writable(skb, write_len);
}
@@ -1960,10 +1968,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
bool do_mforce = flags & BPF_F_MARK_ENFORCE;
+ bool is_ipv6 = flags & BPF_F_IPV6;
__sum16 *ptr;
if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
- BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
+ BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6)))
return -EINVAL;
if (unlikely(offset > 0xffff || offset & 1))
return -EFAULT;
@@ -1979,7 +1988,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
if (unlikely(from != 0))
return -EINVAL;
- inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
+ inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6);
break;
case 2:
inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
@@ -2010,10 +2019,6 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
__be32 *, to, u32, to_size, __wsum, seed)
{
- struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
- u32 diff_size = from_size + to_size;
- int i, j = 0;
-
/* This is quite flexible, some examples:
*
* from_size == 0, to_size > 0, seed := csum --> pushing data
@@ -2022,16 +2027,19 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
*
* Even for diffing, from_size and to_size don't need to be equal.
*/
- if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
- diff_size > sizeof(sp->diff)))
- return -EINVAL;
- for (i = 0; i < from_size / sizeof(__be32); i++, j++)
- sp->diff[j] = ~from[i];
- for (i = 0; i < to_size / sizeof(__be32); i++, j++)
- sp->diff[j] = to[i];
+ __wsum ret = seed;
- return csum_partial(sp->diff, diff_size, seed);
+ if (from_size && to_size)
+ ret = csum_sub(csum_partial(to, to_size, ret),
+ csum_partial(from, from_size, 0));
+ else if (to_size)
+ ret = csum_partial(to, to_size, ret);
+
+ else if (from_size)
+ ret = ~csum_partial(from, from_size, ~ret);
+
+ return csum_from32to16((__force unsigned int)ret);
}
static const struct bpf_func_proto bpf_csum_diff_proto = {
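For context, a common consumer of bpf_csum_diff() is a tc program that rewrites an address and then feeds the folded difference to bpf_l3_csum_replace()/bpf_l4_csum_replace() with a field size of 0, which the case 0 handling above accepts. The sketch below is illustrative only; it assumes Ethernet plus option-less IPv4/TCP framing and skips the protocol checks, and NEW_DADDR is a made-up constant.

#include <stddef.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#define NEW_DADDR	bpf_htonl(0xc0a80102)	/* 192.168.1.2, arbitrary */

SEC("tc")
int rewrite_daddr(struct __sk_buff *skb)
{
	const int daddr_off = ETH_HLEN + offsetof(struct iphdr, daddr);
	const int l3_csum_off = ETH_HLEN + offsetof(struct iphdr, check);
	const int l4_csum_off = ETH_HLEN + sizeof(struct iphdr) +
				offsetof(struct tcphdr, check);
	__be32 old_daddr, new_daddr = NEW_DADDR;
	__s64 diff;

	if (bpf_skb_load_bytes(skb, daddr_off, &old_daddr, sizeof(old_daddr)))
		return TC_ACT_OK;

	diff = bpf_csum_diff(&old_daddr, sizeof(old_daddr),
			     &new_daddr, sizeof(new_daddr), 0);

	/* size bits == 0: 'to' already holds a checksum difference */
	bpf_l4_csum_replace(skb, l4_csum_off, 0, diff, BPF_F_PSEUDO_HDR);
	bpf_l3_csum_replace(skb, l3_csum_off, 0, diff, 0);
	bpf_skb_store_bytes(skb, daddr_off, &new_daddr, sizeof(new_daddr), 0);

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";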
@@ -2215,7 +2223,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
rcu_read_lock();
if (!nh) {
dst = skb_dst(skb);
- nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
+ nexthop = rt6_nexthop(dst_rt6_info(dst),
&ipv6_hdr(skb)->daddr);
} else {
nexthop = &nh->ipv6_nh;
@@ -2233,7 +2241,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
rcu_read_unlock();
return ret;
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
if (dst)
IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
out_drop:
@@ -2271,12 +2279,12 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
err = bpf_out_neigh_v6(net, skb, dev, nh);
if (unlikely(net_xmit_eval(err)))
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
else
ret = NET_XMIT_SUCCESS;
goto out_xmit;
out_drop:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
out_xmit:
return ret;
@@ -2314,8 +2322,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
rcu_read_lock();
if (!nh) {
- struct dst_entry *dst = skb_dst(skb);
- struct rtable *rt = container_of(dst, struct rtable, dst);
+ struct rtable *rt = skb_rtable(skb);
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
} else if (nh->nh_family == AF_INET6) {
@@ -2357,7 +2364,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
struct flowi4 fl4 = {
.flowi4_flags = FLOWI_FLAG_ANYSRC,
.flowi4_mark = skb->mark,
- .flowi4_tos = RT_TOS(ip4h->tos),
+ .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)),
.flowi4_oif = dev->ifindex,
.flowi4_proto = ip4h->protocol,
.daddr = ip4h->daddr,
@@ -2378,12 +2385,12 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
err = bpf_out_neigh_v4(net, skb, dev, nh);
if (unlikely(net_xmit_eval(err)))
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
else
ret = NET_XMIT_SUCCESS;
goto out_xmit;
out_drop:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
out_xmit:
return ret;
@@ -2423,9 +2430,9 @@ out:
/* Internal, non-exposed redirect flags. */
enum {
- BPF_F_NEIGH = (1ULL << 1),
- BPF_F_PEER = (1ULL << 2),
- BPF_F_NEXTHOP = (1ULL << 3),
+ BPF_F_NEIGH = (1ULL << 16),
+ BPF_F_PEER = (1ULL << 17),
+ BPF_F_NEXTHOP = (1ULL << 18),
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
};
@@ -2435,6 +2442,8 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
struct sk_buff *clone;
int ret;
+ BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS);
+
if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return -EINVAL;
@@ -2469,9 +2478,6 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
.arg3_type = ARG_ANYTHING,
};
-DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
-EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
-
static struct net_device *skb_get_peer_dev(struct net_device *dev)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -2484,7 +2490,7 @@ static struct net_device *skb_get_peer_dev(struct net_device *dev)
int skb_do_redirect(struct sk_buff *skb)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct net *net = dev_net(skb->dev);
struct net_device *dev;
u32 flags = ri->flags;
@@ -2504,6 +2510,7 @@ int skb_do_redirect(struct sk_buff *skb)
goto out_drop;
skb->dev = dev;
dev_sw_netstats_rx_add(dev, skb->len);
+ skb_scrub_packet(skb, false);
return -EAGAIN;
}
return flags & BPF_F_NEIGH ?
@@ -2517,7 +2524,7 @@ out_drop:
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return TC_ACT_SHOT;
@@ -2538,7 +2545,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags))
return TC_ACT_SHOT;
@@ -2560,7 +2567,7 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = {
BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
int, plen, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely((plen && plen < sizeof(*params)) || flags))
return TC_ACT_SHOT;
@@ -2607,18 +2614,16 @@ BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
static void sk_msg_reset_curr(struct sk_msg *msg)
{
- u32 i = msg->sg.start;
- u32 len = 0;
-
- do {
- len += sk_msg_elem(msg, i)->length;
- sk_msg_iter_var_next(i);
- if (len >= msg->sg.size)
- break;
- } while (i != msg->sg.end);
+ if (!msg->sg.size) {
+ msg->sg.curr = msg->sg.start;
+ msg->sg.copybreak = 0;
+ } else {
+ u32 i = msg->sg.end;
- msg->sg.curr = i;
- msg->sg.copybreak = 0;
+ sk_msg_iter_var_prev(i);
+ msg->sg.curr = i;
+ msg->sg.copybreak = msg->sg.data[i].length;
+ }
}
static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
@@ -2781,7 +2786,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
- if (start >= offset + l)
+ if (start > offset + l)
return -EINVAL;
space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
@@ -2806,6 +2811,8 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
raw = page_address(page);
+ if (i == msg->sg.end)
+ sk_msg_iter_var_prev(i);
psge = sk_msg_elem(msg, i);
front = start - offset;
back = psge->length - front;
@@ -2822,7 +2829,13 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
}
put_page(sg_page(psge));
- } else if (start - offset) {
+ new = i;
+ goto place_new;
+ }
+
+ if (start - offset) {
+ if (i == msg->sg.end)
+ sk_msg_iter_var_prev(i);
psge = sk_msg_elem(msg, i);
rsge = sk_msg_elem_cpy(msg, i);
@@ -2833,39 +2846,44 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
sk_msg_iter_var_next(i);
sg_unmark_end(psge);
sg_unmark_end(&rsge);
- sk_msg_iter_next(msg, end);
}
/* Slot(s) to place newly allocated data */
+ sk_msg_iter_next(msg, end);
new = i;
+ sk_msg_iter_var_next(i);
+
+ if (i == msg->sg.end) {
+ if (!rsge.length)
+ goto place_new;
+ sk_msg_iter_next(msg, end);
+ goto place_new;
+ }
/* Shift one or two slots as needed */
- if (!copy) {
- sge = sk_msg_elem_cpy(msg, i);
+ sge = sk_msg_elem_cpy(msg, new);
+ sg_unmark_end(&sge);
+ nsge = sk_msg_elem_cpy(msg, i);
+ if (rsge.length) {
sk_msg_iter_var_next(i);
- sg_unmark_end(&sge);
+ nnsge = sk_msg_elem_cpy(msg, i);
sk_msg_iter_next(msg, end);
+ }
- nsge = sk_msg_elem_cpy(msg, i);
+ while (i != msg->sg.end) {
+ msg->sg.data[i] = sge;
+ sge = nsge;
+ sk_msg_iter_var_next(i);
if (rsge.length) {
- sk_msg_iter_var_next(i);
+ nsge = nnsge;
nnsge = sk_msg_elem_cpy(msg, i);
- }
-
- while (i != msg->sg.end) {
- msg->sg.data[i] = sge;
- sge = nsge;
- sk_msg_iter_var_next(i);
- if (rsge.length) {
- nsge = nnsge;
- nnsge = sk_msg_elem_cpy(msg, i);
- } else {
- nsge = sk_msg_elem_cpy(msg, i);
- }
+ } else {
+ nsge = sk_msg_elem_cpy(msg, i);
}
}
+place_new:
/* Place newly allocated data buffer */
sk_mem_charge(msg->sk, len);
msg->sg.size += len;
@@ -2894,8 +2912,10 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = {
static void sk_msg_shift_left(struct sk_msg *msg, int i)
{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
int prev;
+ put_page(sg_page(sge));
do {
prev = i;
sk_msg_iter_var_next(i);
@@ -2932,6 +2952,9 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
if (unlikely(flags))
return -EINVAL;
+ if (unlikely(len == 0))
+ return 0;
+
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
@@ -2944,7 +2967,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
} while (i != msg->sg.end);
/* Bounds checks: start and pop must be inside message */
- if (start >= offset + l || last >= msg->sg.size)
+ if (start >= offset + l || last > msg->sg.size)
return -EINVAL;
space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
@@ -2973,12 +2996,12 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
*/
if (start != offset) {
struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
- int a = start;
+ int a = start - offset;
int b = sge->length - pop - a;
sk_msg_iter_var_next(i);
- if (pop < sge->length - a) {
+ if (b > 0) {
if (space) {
sge->length = a;
sk_msg_shift_right(msg, i);
@@ -2997,7 +3020,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
if (unlikely(!page))
return -ENOMEM;
- sge->length = a;
orig = sg_page(sge);
from = sg_virt(sge);
to = page_address(page);
@@ -3007,7 +3029,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
put_page(orig);
}
pop = 0;
- } else if (pop >= sge->length - a) {
+ } else {
pop -= (sge->length - a);
sge->length = a;
}
@@ -3041,7 +3063,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
pop -= sge->length;
sk_msg_shift_left(msg, i);
}
- sk_msg_iter_var_next(i);
}
sk_mem_uncharge(msg->sk, len - pop);
@@ -3178,6 +3199,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
bpf_push_mac_rcsum(skb);
ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
bpf_pull_mac_rcsum(skb);
+ skb_reset_mac_len(skb);
bpf_compute_data_pointers(skb);
return ret;
@@ -3211,6 +3233,13 @@ static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
+{
+ skb->protocol = htons(proto);
+ if (skb_valid_dst(skb))
+ skb_dst_drop(skb);
+}
+
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
/* Caller already did skb_cow() with len as headroom,
@@ -3307,7 +3336,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
}
}
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
skb_clear_hash(skb);
return 0;
@@ -3337,7 +3366,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
}
}
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
skb_clear_hash(skb);
return 0;
@@ -3528,22 +3557,29 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
/* Match skb->protocol to new outer l3 protocol */
if (skb->protocol == htons(ETH_P_IP) &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
else if (skb->protocol == htons(ETH_P_IPV6) &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
}
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
- /* Due to header grow, MSS needs to be downgraded. */
- if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
- skb_decrease_gso_size(shinfo, len_diff);
-
/* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= gso_type;
shinfo->gso_segs = 0;
+
+ /* Due to header growth, MSS needs to be downgraded.
+ * There is a BUG_ON() when segmenting the frag_list with
+ * head_frag true, so linearize the skb after downgrading
+ * the MSS.
+ */
+ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) {
+ skb_decrease_gso_size(shinfo, len_diff);
+ if (shinfo->frag_list)
+ return skb_linearize(skb);
+ }
}
return 0;
@@ -3577,10 +3613,10 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
/* Match skb->protocol to new outer l3 protocol */
if (skb->protocol == htons(ETH_P_IP) &&
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
else if (skb->protocol == htons(ETH_P_IPV6) &&
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
@@ -3715,13 +3751,22 @@ static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
static u32 __bpf_skb_min_len(const struct sk_buff *skb)
{
- u32 min_len = skb_network_offset(skb);
+ int offset = skb_network_offset(skb);
+ u32 min_len = 0;
- if (skb_transport_header_was_set(skb))
- min_len = skb_transport_offset(skb);
- if (skb->ip_summed == CHECKSUM_PARTIAL)
- min_len = skb_checksum_start_offset(skb) +
- skb->csum_offset + sizeof(__sum16);
+ if (offset > 0)
+ min_len = offset;
+ if (skb_transport_header_was_set(skb)) {
+ offset = skb_transport_offset(skb);
+ if (offset > 0)
+ min_len = offset;
+ }
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ offset = skb_checksum_start_offset(skb) +
+ skb->csum_offset + sizeof(__sum16);
+ if (offset > 0)
+ min_len = offset;
+ }
return min_len;
}
@@ -4100,13 +4145,13 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
}
static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
- struct xdp_mem_info *mem_info, bool release)
+ enum xdp_mem_type mem_type, bool release)
{
struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp);
if (release) {
xsk_buff_del_tail(zc_frag);
- __xdp_return(NULL, mem_info, false, zc_frag);
+ __xdp_return(0, mem_type, false, zc_frag);
} else {
zc_frag->data_end -= shrink;
}
@@ -4115,19 +4160,16 @@ static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
int shrink)
{
- struct xdp_mem_info *mem_info = &xdp->rxq->mem;
+ enum xdp_mem_type mem_type = xdp->rxq->mem.type;
bool release = skb_frag_size(frag) == shrink;
- if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
- bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release);
+ if (mem_type == MEM_TYPE_XSK_BUFF_POOL) {
+ bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release);
goto out;
}
- if (release) {
- struct page *page = skb_frag_page(frag);
-
- __xdp_return(page_address(page), mem_info, false, NULL);
- }
+ if (release)
+ __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL);
out:
return release;
@@ -4266,50 +4308,50 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
*/
void xdp_do_flush(void)
{
- __dev_flush();
- __cpu_map_flush();
- __xsk_map_flush();
+ struct list_head *lh_map, *lh_dev, *lh_xsk;
+
+ bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
+ if (lh_dev)
+ __dev_flush(lh_dev);
+ if (lh_map)
+ __cpu_map_flush(lh_map);
+ if (lh_xsk)
+ __xsk_map_flush(lh_xsk);
}
EXPORT_SYMBOL_GPL(xdp_do_flush);
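Drivers that run XDP_REDIRECT are expected to call xdp_do_flush() once per NAPI poll, before interrupts are re-armed; that is the invariant xdp_do_check_flushed() warns about under CONFIG_DEBUG_NET. A hypothetical poll routine (the foo_* names are placeholders, not a real driver) might look like:

/* Only the xdp_do_flush() placement reflects the real contract;
 * everything else is an assumed driver skeleton.
 */
static int foo_napi_poll(struct napi_struct *napi, int budget)
{
	struct foo_rx_ring *ring = container_of(napi, struct foo_rx_ring, napi);
	int work_done;

	work_done = foo_clean_rx_irq(ring, budget);	/* may run XDP_REDIRECT */

	/* Flush deferred devmap/cpumap/xskmap queues for this NAPI cycle. */
	xdp_do_flush();

	if (work_done < budget && napi_complete_done(napi, work_done))
		foo_irq_enable(ring);

	return work_done;
}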
#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
void xdp_do_check_flushed(struct napi_struct *napi)
{
- bool ret;
+ struct list_head *lh_map, *lh_dev, *lh_xsk;
+ bool missed = false;
- ret = dev_check_flush();
- ret |= cpu_map_check_flush();
- ret |= xsk_map_check_flush();
+ bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
+ if (lh_dev) {
+ __dev_flush(lh_dev);
+ missed = true;
+ }
+ if (lh_map) {
+ __cpu_map_flush(lh_map);
+ missed = true;
+ }
+ if (lh_xsk) {
+ __xsk_map_flush(lh_xsk);
+ missed = true;
+ }
- WARN_ONCE(ret, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
+ WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
napi->poll);
}
#endif
-void bpf_clear_redirect_map(struct bpf_map *map)
-{
- struct bpf_redirect_info *ri;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- ri = per_cpu_ptr(&bpf_redirect_info, cpu);
- /* Avoid polluting remote cacheline due to writes if
- * not needed. Once we pass this test, we need the
- * cmpxchg() to make sure it hasn't been changed in
- * the meantime by remote CPU.
- */
- if (unlikely(READ_ONCE(ri->map) == map))
- cmpxchg(&ri->map, map, NULL);
- }
-}
-
DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
u32 xdp_master_redirect(struct xdp_buff *xdp)
{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct net_device *master, *slave;
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
@@ -4329,9 +4371,9 @@ u32 xdp_master_redirect(struct xdp_buff *xdp)
EXPORT_SYMBOL_GPL(xdp_master_redirect);
static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
- struct net_device *dev,
+ const struct net_device *dev,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
@@ -4352,18 +4394,20 @@ err:
return err;
}
-static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
- struct net_device *dev,
- struct xdp_frame *xdpf,
- struct bpf_prog *xdp_prog)
+static __always_inline int
+__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev,
+ struct xdp_frame *xdpf,
+ const struct bpf_prog *xdp_prog)
{
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
u32 map_id = ri->map_id;
+ u32 flags = ri->flags;
struct bpf_map *map;
int err;
ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->flags = 0;
ri->map_type = BPF_MAP_TYPE_UNSPEC;
if (unlikely(!xdpf)) {
@@ -4375,11 +4419,20 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
case BPF_MAP_TYPE_DEVMAP:
fallthrough;
case BPF_MAP_TYPE_DEVMAP_HASH:
- map = READ_ONCE(ri->map);
- if (unlikely(map)) {
+ if (unlikely(flags & BPF_F_BROADCAST)) {
+ map = READ_ONCE(ri->map);
+
+ /* The map pointer is cleared when the map is being torn
+ * down by dev_map_free()
+ */
+ if (unlikely(!map)) {
+ err = -ENOENT;
+ break;
+ }
+
WRITE_ONCE(ri->map, NULL);
err = dev_map_enqueue_multi(xdpf, dev, map,
- ri->flags & BPF_F_EXCLUDE_INGRESS);
+ flags & BPF_F_EXCLUDE_INGRESS);
} else {
err = dev_map_enqueue(fwd, xdpf, dev);
}
@@ -4413,9 +4466,9 @@ err:
}
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
if (map_type == BPF_MAP_TYPE_XSKMAP)
@@ -4427,9 +4480,10 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
EXPORT_SYMBOL_GPL(xdp_do_redirect);
int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
- struct xdp_frame *xdpf, struct bpf_prog *xdp_prog)
+ struct xdp_frame *xdpf,
+ const struct bpf_prog *xdp_prog)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
if (map_type == BPF_MAP_TYPE_XSKMAP)
@@ -4442,11 +4496,11 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);
static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog,
- void *fwd,
- enum bpf_map_type map_type, u32 map_id)
+ const struct bpf_prog *xdp_prog,
+ void *fwd, enum bpf_map_type map_type,
+ u32 map_id, u32 flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct bpf_map *map;
int err;
@@ -4454,11 +4508,20 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
case BPF_MAP_TYPE_DEVMAP:
fallthrough;
case BPF_MAP_TYPE_DEVMAP_HASH:
- map = READ_ONCE(ri->map);
- if (unlikely(map)) {
+ if (unlikely(flags & BPF_F_BROADCAST)) {
+ map = READ_ONCE(ri->map);
+
+ /* The map pointer is cleared when the map is being torn
+ * down by dev_map_free()
+ */
+ if (unlikely(!map)) {
+ err = -ENOENT;
+ break;
+ }
+
WRITE_ONCE(ri->map, NULL);
err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
- ri->flags & BPF_F_EXCLUDE_INGRESS);
+ flags & BPF_F_EXCLUDE_INGRESS);
} else {
err = dev_map_generic_redirect(fwd, skb, xdp_prog);
}
@@ -4489,15 +4552,18 @@ err:
}
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
- struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
+ struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
u32 map_id = ri->map_id;
+ u32 flags = ri->flags;
int err;
ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->flags = 0;
ri->map_type = BPF_MAP_TYPE_UNSPEC;
if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
@@ -4517,7 +4583,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
return 0;
}
- return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id);
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags);
err:
_trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
return err;
@@ -4525,7 +4591,7 @@ err:
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags))
return XDP_ABORTED;
@@ -4662,7 +4728,7 @@ set_compat:
to->tunnel_tos = info->key.tos;
to->tunnel_ttl = info->key.ttl;
if (flags & BPF_F_TUNINFO_FLAGS)
- to->tunnel_flags = info->key.tun_flags;
+ to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);
else
to->tunnel_ext = 0;
@@ -4705,7 +4771,7 @@ BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
int err;
if (unlikely(!info ||
- !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
+ !ip_tunnel_is_options_present(info->key.tun_flags))) {
err = -ENOENT;
goto err_clear;
}
@@ -4775,15 +4841,15 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
memset(info, 0, sizeof(*info));
info->mode = IP_TUNNEL_INFO_TX;
- info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
- if (flags & BPF_F_DONT_FRAGMENT)
- info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
- if (flags & BPF_F_ZERO_CSUM_TX)
- info->key.tun_flags &= ~TUNNEL_CSUM;
- if (flags & BPF_F_SEQ_NUMBER)
- info->key.tun_flags |= TUNNEL_SEQ;
- if (flags & BPF_F_NO_TUNNEL_KEY)
- info->key.tun_flags &= ~TUNNEL_KEY;
+ __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags);
+ __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags,
+ flags & BPF_F_DONT_FRAGMENT);
+ __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags,
+ !(flags & BPF_F_ZERO_CSUM_TX));
+ __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags,
+ flags & BPF_F_SEQ_NUMBER);
+ __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags,
+ !(flags & BPF_F_NO_TUNNEL_KEY));
info->key.tun_id = cpu_to_be64(from->tunnel_id);
info->key.tos = from->tunnel_tos;
@@ -4821,13 +4887,15 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
{
struct ip_tunnel_info *info = skb_tunnel_info(skb);
const struct metadata_dst *md = this_cpu_ptr(md_dst);
+ IP_TUNNEL_DECLARE_FLAGS(present) = { };
if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
return -EINVAL;
if (unlikely(size > IP_TUNNEL_OPTS_MAX))
return -ENOMEM;
- ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);
+ ip_tunnel_set_options_present(present);
+ ip_tunnel_info_opts_set(info, from, size, present);
return 0;
}
@@ -5094,6 +5162,17 @@ static u64 __bpf_get_netns_cookie(struct sock *sk)
return net->net_cookie;
}
+BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb)
+{
+ return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_proto = {
+ .func = bpf_get_netns_cookie,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
{
return __bpf_get_netns_cookie(ctx);
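With the skb-context bpf_get_netns_cookie proto added above, tc and socket-filter programs can key per-namespace state on the cookie. A minimal, illustrative sketch follows; the map layout and section names are choices of this example, not mandated by the kernel.

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

/* Count packets per network namespace, keyed by the netns cookie. */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024);
	__type(key, __u64);
	__type(value, __u64);
} pkts_per_netns SEC(".maps");

SEC("tc")
int count_by_netns(struct __sk_buff *skb)
{
	__u64 cookie = bpf_get_netns_cookie(skb);
	__u64 one = 1, *cnt;

	cnt = bpf_map_lookup_elem(&pkts_per_netns, &cookie);
	if (cnt)
		__sync_fetch_and_add(cnt, 1);
	else
		bpf_map_update_elem(&pkts_per_netns, &cookie, &one, BPF_ANY);

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";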
@@ -5160,6 +5239,25 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt)
+{
+ u32 sk_bpf_cb_flags;
+
+ if (getopt) {
+ *(u32 *)optval = sk->sk_bpf_cb_flags;
+ return 0;
+ }
+
+ sk_bpf_cb_flags = *(u32 *)optval;
+
+ if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
+ return -EINVAL;
+
+ sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
+
+ return 0;
+}
+
static int sol_socket_sockopt(struct sock *sk, int optname,
char *optval, int *optlen,
bool getopt)
@@ -5176,6 +5274,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
case SO_MAX_PACING_RATE:
case SO_BINDTOIFINDEX:
case SO_TXREHASH:
+ case SK_BPF_CB_FLAGS:
if (*optlen != sizeof(int))
return -EINVAL;
break;
@@ -5185,6 +5284,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
return -EINVAL;
}
+ if (optname == SK_BPF_CB_FLAGS)
+ return sk_bpf_set_get_cb_flags(sk, optval, getopt);
+
if (getopt) {
if (optname == SO_BINDTODEVICE)
return -EINVAL;
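The SK_BPF_CB_FLAGS socket option handled above is meant to be driven through bpf_setsockopt(). The sketch below assumes the SK_BPF_CB_TX_TIMESTAMPING flag bit from the same series; treat the flag name and the chosen callback as illustrative, not as the only valid usage.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_SOCKET
#define SOL_SOCKET 1
#endif

/* Sketch: turn on BPF-driven TX timestamp callbacks for outgoing
 * connections. SK_BPF_CB_TX_TIMESTAMPING is assumed to be the flag
 * defined next to SK_BPF_CB_FLAGS.
 */
SEC("sockops")
int enable_bpf_tx_tstamp(struct bpf_sock_ops *skops)
{
	int flags = SK_BPF_CB_TX_TIMESTAMPING;

	if (skops->op == BPF_SOCK_OPS_TCP_CONNECT_CB)
		bpf_setsockopt(skops, SOL_SOCKET, SK_BPF_CB_FLAGS,
			       &flags, sizeof(flags));
	return 1;
}

char _license[] SEC("license") = "GPL";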
@@ -5197,6 +5299,38 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
KERNEL_SOCKPTR(optval), *optlen);
}
+static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname,
+ char *optval, int optlen)
+{
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ switch (optname) {
+ case TCP_BPF_SOCK_OPS_CB_FLAGS: {
+ int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags;
+
+ memcpy(optval, &cb_flags, optlen);
+ break;
+ }
+ case TCP_BPF_RTO_MIN: {
+ int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min);
+
+ memcpy(optval, &rto_min_us, optlen);
+ break;
+ }
+ case TCP_BPF_DELACK_MAX: {
+ int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max);
+
+ memcpy(optval, &delack_max_us, optlen);
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
char *optval, int optlen)
{
@@ -5236,6 +5370,11 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
return -EINVAL;
inet_csk(sk)->icsk_rto_min = timeout;
break;
+ case TCP_BPF_SOCK_OPS_CB_FLAGS:
+ if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS))
+ return -EINVAL;
+ tp->bpf_sock_ops_cb_flags = val;
+ break;
default:
return -EINVAL;
}
@@ -5315,6 +5454,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
case TCP_USER_TIMEOUT:
case TCP_NOTSENT_LOWAT:
case TCP_SAVE_SYN:
+ case TCP_RTO_MAX_MS:
if (*optlen != sizeof(int))
return -EINVAL;
break;
@@ -5326,7 +5466,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
break;
default:
if (getopt)
- return -EINVAL;
+ return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen);
return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
}
@@ -5422,6 +5562,11 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname,
return -EINVAL;
}
+static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock)
+{
+ return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+}
+
static int _bpf_setsockopt(struct sock *sk, int level, int optname,
char *optval, int optlen)
{
@@ -5572,6 +5717,9 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
}
@@ -5657,6 +5805,9 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
int ret, copy_len = 0;
@@ -5699,6 +5850,9 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
struct sock *sk = bpf_sock->sk;
int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
return -EINVAL;
@@ -5857,7 +6011,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
fl4.flowi4_iif = params->ifindex;
fl4.flowi4_oif = 0;
}
- fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+ fl4.flowi4_tos = params->tos & INET_DSCP_MASK;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = 0;
@@ -5884,7 +6038,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
} else {
- fl4.flowi4_mark = 0;
+ if (flags & BPF_FIB_LOOKUP_MARK)
+ fl4.flowi4_mark = params->mark;
+ else
+ fl4.flowi4_mark = 0;
fl4.flowi4_secid = 0;
fl4.flowi4_tun_key.tun_id = 0;
fl4.flowi4_uid = sock_net_uid(net, NULL);
@@ -6027,7 +6184,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
strict);
} else {
- fl6.flowi6_mark = 0;
+ if (flags & BPF_FIB_LOOKUP_MARK)
+ fl6.flowi6_mark = params->mark;
+ else
+ fl6.flowi6_mark = 0;
fl6.flowi6_secid = 0;
fl6.flowi6_tun_key.tun_id = 0;
fl6.flowi6_uid = sock_net_uid(net, NULL);
@@ -6105,7 +6265,7 @@ set_fwd_params:
#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
- BPF_FIB_LOOKUP_SRC)
+ BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK)
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
struct bpf_fib_lookup *, params, int, plen, u32, flags)
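The new BPF_FIB_LOOKUP_MARK flag lets a lookup honour fwmark-based policy routing rules. An illustrative XDP sketch follows; the mark value 0x17 and the minimal header parsing are assumptions of this example, not part of the diff.

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#ifndef AF_INET
#define AF_INET 2
#endif

SEC("xdp")
int fwd_with_mark(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;
	struct iphdr *iph = data + sizeof(*eth);
	struct bpf_fib_lookup fib = {};
	int rc;

	if ((void *)(iph + 1) > data_end || eth->h_proto != bpf_htons(ETH_P_IP))
		return XDP_PASS;

	fib.family	= AF_INET;
	fib.ifindex	= ctx->ingress_ifindex;
	fib.ipv4_src	= iph->saddr;
	fib.ipv4_dst	= iph->daddr;
	fib.mark	= 0x17;		/* route against fwmark 0x17 rules */

	rc = bpf_fib_lookup(ctx, &fib, sizeof(fib), BPF_FIB_LOOKUP_MARK);
	if (rc != BPF_FIB_LKUP_RET_SUCCESS)
		return XDP_PASS;

	__builtin_memcpy(eth->h_dest, fib.dmac, ETH_ALEN);
	__builtin_memcpy(eth->h_source, fib.smac, ETH_ALEN);
	return bpf_redirect(fib.ifindex, 0);
}

char _license[] SEC("license") = "GPL";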
@@ -6213,12 +6373,10 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
{
int ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
struct net_device *dev = skb->dev;
- int skb_len, dev_len;
- int mtu;
+ int mtu, dev_len, skb_len;
if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
return -EINVAL;
-
if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
return -EINVAL;
@@ -6227,7 +6385,6 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
return -ENODEV;
mtu = READ_ONCE(dev->mtu);
-
dev_len = mtu + dev->hard_header_len;
/* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
@@ -6245,15 +6402,12 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
*/
if (skb_is_gso(skb)) {
ret = BPF_MTU_CHK_RET_SUCCESS;
-
if (flags & BPF_MTU_CHK_SEGS &&
!skb_gso_validate_network_len(skb, mtu))
ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
}
out:
- /* BPF verifier guarantees valid pointer */
*mtu_len = mtu;
-
return ret;
}
@@ -6274,8 +6428,6 @@ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
return -ENODEV;
mtu = READ_ONCE(dev->mtu);
-
- /* Add L2-header as dev MTU is L3 size */
dev_len = mtu + dev->hard_header_len;
/* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
@@ -6286,9 +6438,7 @@ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
if (xdp_len > dev_len)
ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
- /* BPF verifier guarantees valid pointer */
*mtu_len = mtu;
-
return ret;
}
@@ -6298,7 +6448,8 @@ static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_INT,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
+ .arg3_size = sizeof(u32),
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6309,7 +6460,8 @@ static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_INT,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
+ .arg3_size = sizeof(u32),
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6418,6 +6570,7 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
void *srh_tlvs, *srh_end, *ptr;
int srhoff = 0;
+ lockdep_assert_held(&srh_state->bh_lock);
if (srh == NULL)
return -EINVAL;
@@ -6474,6 +6627,7 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
int hdroff = 0;
int err;
+ lockdep_assert_held(&srh_state->bh_lock);
switch (action) {
case SEG6_LOCAL_ACTION_END_X:
if (!seg6_bpf_has_valid_srh(skb))
@@ -6550,6 +6704,7 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
int srhoff = 0;
int ret;
+ lockdep_assert_held(&srh_state->bh_lock);
if (unlikely(srh == NULL))
return -EINVAL;
@@ -6705,8 +6860,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
* sock refcnt is decremented to prevent a request_sock leak.
*/
- if (!sk_fullsock(sk2))
- sk2 = NULL;
if (sk2 != sk) {
sock_gen_put(sk);
/* Ensure there is no need to bump sk2 refcnt */
@@ -6753,8 +6906,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
* sock refcnt is decremented to prevent a request_sock leak.
*/
- if (!sk_fullsock(sk2))
- sk2 = NULL;
if (sk2 != sk) {
sock_gen_put(sk);
/* Ensure there is no need to bump sk2 refcnt */
@@ -6783,7 +6934,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6802,7 +6953,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6821,7 +6972,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6845,7 +6996,7 @@ static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6869,7 +7020,7 @@ static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6893,7 +7044,7 @@ static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6931,7 +7082,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6955,7 +7106,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6979,7 +7130,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -6999,7 +7150,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -7018,7 +7169,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -7037,7 +7188,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
@@ -7203,7 +7354,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
{
sk = sk_to_full_sk(sk);
- if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
+ if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
return (unsigned long)sk;
return (unsigned long)NULL;
@@ -7511,6 +7662,9 @@ BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
u8 search_kind, search_len, copy_len, magic_len;
int ret;
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
/* 2 byte is the minimal option len except TCPOPT_NOP and
* TCPOPT_EOL which are useless for the bpf prog to learn
* and this helper disallow loading them also.
@@ -7576,7 +7730,7 @@ static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -7694,17 +7848,21 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
return -EOPNOTSUPP;
switch (tstamp_type) {
- case BPF_SKB_TSTAMP_DELIVERY_MONO:
+ case BPF_SKB_CLOCK_REALTIME:
+ skb->tstamp = tstamp;
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
+ break;
+ case BPF_SKB_CLOCK_MONOTONIC:
if (!tstamp)
return -EINVAL;
skb->tstamp = tstamp;
- skb->mono_delivery_time = 1;
+ skb->tstamp_type = SKB_CLOCK_MONOTONIC;
break;
- case BPF_SKB_TSTAMP_UNSPEC:
- if (tstamp)
+ case BPF_SKB_CLOCK_TAI:
+ if (!tstamp)
return -EINVAL;
- skb->tstamp = 0;
- skb->mono_delivery_time = 0;
+ skb->tstamp = tstamp;
+ skb->tstamp_type = SKB_CLOCK_TAI;
break;
default:
return -EINVAL;
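The reworked clock types above pair naturally with EDT-style pacing from tc egress. A minimal sketch, assuming an fq or other EDT-aware qdisc downstream; the 1 ms offset is arbitrary.

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int set_delivery_time(struct __sk_buff *skb)
{
	__u64 now = bpf_ktime_get_ns();		/* CLOCK_MONOTONIC base */

	/* Ask the qdisc to release this packet 1 ms from now. */
	bpf_skb_set_tstamp(skb, now + 1000000, BPF_SKB_CLOCK_MONOTONIC);

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";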
@@ -7828,42 +7986,37 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
#endif /* CONFIG_INET */
-bool bpf_helper_changes_pkt_data(void *func)
-{
- if (func == bpf_skb_vlan_push ||
- func == bpf_skb_vlan_pop ||
- func == bpf_skb_store_bytes ||
- func == bpf_skb_change_proto ||
- func == bpf_skb_change_head ||
- func == sk_skb_change_head ||
- func == bpf_skb_change_tail ||
- func == sk_skb_change_tail ||
- func == bpf_skb_adjust_room ||
- func == sk_skb_adjust_room ||
- func == bpf_skb_pull_data ||
- func == sk_skb_pull_data ||
- func == bpf_clone_redirect ||
- func == bpf_l3_csum_replace ||
- func == bpf_l4_csum_replace ||
- func == bpf_xdp_adjust_head ||
- func == bpf_xdp_adjust_meta ||
- func == bpf_msg_pull_data ||
- func == bpf_msg_push_data ||
- func == bpf_msg_pop_data ||
- func == bpf_xdp_adjust_tail ||
-#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
- func == bpf_lwt_seg6_store_bytes ||
- func == bpf_lwt_seg6_adjust_srh ||
- func == bpf_lwt_seg6_action ||
-#endif
-#ifdef CONFIG_INET
- func == bpf_sock_ops_store_hdr_opt ||
-#endif
- func == bpf_lwt_in_push_encap ||
- func == bpf_lwt_xmit_push_encap)
+bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_clone_redirect:
+ case BPF_FUNC_l3_csum_replace:
+ case BPF_FUNC_l4_csum_replace:
+ case BPF_FUNC_lwt_push_encap:
+ case BPF_FUNC_lwt_seg6_action:
+ case BPF_FUNC_lwt_seg6_adjust_srh:
+ case BPF_FUNC_lwt_seg6_store_bytes:
+ case BPF_FUNC_msg_pop_data:
+ case BPF_FUNC_msg_pull_data:
+ case BPF_FUNC_msg_push_data:
+ case BPF_FUNC_skb_adjust_room:
+ case BPF_FUNC_skb_change_head:
+ case BPF_FUNC_skb_change_proto:
+ case BPF_FUNC_skb_change_tail:
+ case BPF_FUNC_skb_pull_data:
+ case BPF_FUNC_skb_store_bytes:
+ case BPF_FUNC_skb_vlan_pop:
+ case BPF_FUNC_skb_vlan_push:
+ case BPF_FUNC_store_hdr_opt:
+ case BPF_FUNC_xdp_adjust_head:
+ case BPF_FUNC_xdp_adjust_meta:
+ case BPF_FUNC_xdp_adjust_tail:
+ /* tail-called program could call any of the above */
+ case BPF_FUNC_tail_call:
return true;
-
- return false;
+ default:
+ return false;
+ }
}
const struct bpf_func_proto bpf_event_output_data_proto __weak;
@@ -7878,10 +8031,6 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_proto;
@@ -7907,10 +8056,6 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
case BPF_FUNC_bind:
switch (prog->expected_attach_type) {
@@ -8001,6 +8146,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_load_bytes_relative_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_proto;
case BPF_FUNC_get_socket_uid:
return &bpf_get_socket_uid_proto;
case BPF_FUNC_perf_event_output:
@@ -8132,6 +8279,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_under_cgroup_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_proto;
case BPF_FUNC_get_socket_uid:
return &bpf_get_socket_uid_proto;
case BPF_FUNC_fib_lookup:
@@ -8340,20 +8489,12 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_msg_pop_data_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
case BPF_FUNC_get_netns_cookie:
return &bpf_get_netns_cookie_sk_msg_proto;
-#ifdef CONFIG_CGROUP_NET_CLASSID
- case BPF_FUNC_get_cgroup_classid:
- return &bpf_get_cgroup_classid_curr_proto;
-#endif
default:
return bpf_sk_base_func_proto(func_id, prog);
}
@@ -8526,13 +8667,16 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
if (off + size > offsetofend(struct __sk_buff, cb[4]))
return false;
break;
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ if (info->is_ldsx || size != size_default)
+ return false;
+ break;
case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
- case bpf_ctx_range(struct __sk_buff, data):
- case bpf_ctx_range(struct __sk_buff, data_meta):
- case bpf_ctx_range(struct __sk_buff, data_end):
if (size != size_default)
return false;
break;
@@ -8976,6 +9120,14 @@ static bool xdp_is_valid_access(int off, int size,
}
}
return false;
+ } else {
+ switch (off) {
+ case offsetof(struct xdp_md, data_meta):
+ case offsetof(struct xdp_md, data):
+ case offsetof(struct xdp_md, data_end):
+ if (info->is_ldsx)
+ return false;
+ }
}
switch (off) {
@@ -8993,7 +9145,8 @@ static bool xdp_is_valid_access(int off, int size,
return __is_valid_xdp_access(off, size);
}
-void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act)
+void bpf_warn_invalid_xdp_action(const struct net_device *dev,
+ const struct bpf_prog *prog, u32 act)
{
const u32 act_max = XDP_REDIRECT;
@@ -9301,12 +9454,12 @@ static bool flow_dissector_is_valid_access(int off, int size,
switch (off) {
case bpf_ctx_range(struct __sk_buff, data):
- if (size != size_default)
+ if (info->is_ldsx || size != size_default)
return false;
info->reg_type = PTR_TO_PACKET;
return true;
case bpf_ctx_range(struct __sk_buff, data_end):
- if (size != size_default)
+ if (info->is_ldsx || size != size_default)
return false;
info->reg_type = PTR_TO_PACKET_END;
return true;
@@ -9357,16 +9510,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
{
__u8 value_reg = si->dst_reg;
__u8 skb_reg = si->src_reg;
- /* AX is needed because src_reg and dst_reg could be the same */
- __u8 tmp_reg = BPF_REG_AX;
-
- *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
- SKB_BF_MONO_TC_OFFSET);
- *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
- SKB_MONO_DELIVERY_TIME_MASK, 2);
- *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
- *insn++ = BPF_JMP_A(1);
- *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO);
+ BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI);
+ BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME);
+ BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC);
+ BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI);
+ *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT);
+#else
+ BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1));
+#endif
return insn;
}
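
The rewritten conversion above surfaces skb->tstamp_type to programs as the BPF_SKB_CLOCK_* values pinned down by the BUILD_BUG_ON()s, instead of the old UNSPEC/DELIVERY_MONO pair. A minimal sketch of a tc classifier consuming the field, assuming libbpf's bpf_helpers.h and linux/pkt_cls.h are available; the program name and drop policy are illustrative only:

SEC("tc")
int drop_stale_mono(struct __sk_buff *skb)
{
	/* skb->tstamp_type now reports a BPF_SKB_CLOCK_* value; only a
	 * monotonic tstamp is comparable with bpf_ktime_get_ns().
	 */
	if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC &&
	    skb->tstamp < bpf_ktime_get_ns())
		return TC_ACT_SHOT;	/* past its intended delivery time */

	return TC_ACT_OK;
}
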
@@ -9409,11 +9563,12 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
__u8 tmp_reg = BPF_REG_AX;
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
- *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
- TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK);
- *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg,
- TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2);
- /* skb->tc_at_ingress && skb->mono_delivery_time,
+	/* check if the ingress mask bit is set */
+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
+ *insn++ = BPF_JMP_A(4);
+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
+ *insn++ = BPF_JMP_A(2);
+ /* skb->tc_at_ingress && skb->tstamp_type,
* read 0 as the (rcv) timestamp.
*/
*insn++ = BPF_MOV64_IMM(value_reg, 0);
@@ -9438,7 +9593,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
* the bpf prog is aware the tstamp could have delivery time.
* Thus, write skb->tstamp as is if tstamp_type_access is true.
* Otherwise, writing at ingress will have to clear the
- * mono_delivery_time bit also.
+ * skb->tstamp_type bit also.
*/
if (!prog->tstamp_type_access) {
__u8 tmp_reg = BPF_REG_AX;
@@ -9448,8 +9603,8 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
/* goto <store> */
*insn++ = BPF_JMP_A(2);
- /* <clear>: mono_delivery_time */
- *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
+ /* <clear>: skb->tstamp_type */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
}
#endif
@@ -9547,7 +9702,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, queue_mapping):
if (type == BPF_WRITE) {
- u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);
+ u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);
if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) {
*insn++ = BPF_JMP_A(0); /* noop */
@@ -9556,7 +9711,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
if (BPF_CLASS(si->code) == BPF_STX)
*insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
- *insn++ = BPF_EMIT_STORE(BPF_H, si, off);
+ *insn++ = BPF_EMIT_STORE(BPF_H, si, offset);
} else {
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
bpf_target_off(struct sk_buff,
@@ -10153,10 +10308,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
} \
} while (0)
-#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \
- SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \
- S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF)
-
static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -10274,10 +10425,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
} \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
- is_fullsock), \
+ is_locked_tcp_sock), \
fullsock_reg, si->src_reg, \
offsetof(struct bpf_sock_ops_kern, \
- is_fullsock)); \
+ is_locked_tcp_sock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
if (si->dst_reg == si->src_reg) \
*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
@@ -10362,10 +10513,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
temp)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
- is_fullsock), \
+ is_locked_tcp_sock), \
reg, si->dst_reg, \
offsetof(struct bpf_sock_ops_kern, \
- is_fullsock)); \
+ is_locked_tcp_sock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, sk),\
@@ -11005,7 +11156,6 @@ const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
};
const struct bpf_prog_ops lwt_seg6local_prog_ops = {
- .test_run = bpf_prog_test_run_skb,
};
const struct bpf_verifier_ops cg_sock_verifier_ops = {
@@ -11168,6 +11318,7 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
struct sock_reuseport *reuse;
struct sock *selected_sk;
+ int err;
selected_sk = map->ops->map_lookup_elem(map, key);
if (!selected_sk)
@@ -11175,10 +11326,6 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
if (!reuse) {
- /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
- if (sk_is_refcounted(selected_sk))
- sock_put(selected_sk);
-
/* reuseport_array has only sk with non NULL sk_reuseport_cb.
* The only (!reuse) case here is - the sk has already been
* unhashed (e.g. by close()), so treat it as -ENOENT.
@@ -11186,24 +11333,33 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
* Other maps (e.g. sock_map) do not provide this guarantee and
* the sk may never be in the reuseport group to begin with.
*/
- return is_sockarray ? -ENOENT : -EINVAL;
+ err = is_sockarray ? -ENOENT : -EINVAL;
+ goto error;
}
if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
struct sock *sk = reuse_kern->sk;
- if (sk->sk_protocol != selected_sk->sk_protocol)
- return -EPROTOTYPE;
- else if (sk->sk_family != selected_sk->sk_family)
- return -EAFNOSUPPORT;
-
- /* Catch all. Likely bound to a different sockaddr. */
- return -EBADFD;
+ if (sk->sk_protocol != selected_sk->sk_protocol) {
+ err = -EPROTOTYPE;
+ } else if (sk->sk_family != selected_sk->sk_family) {
+ err = -EAFNOSUPPORT;
+ } else {
+ /* Catch all. Likely bound to a different sockaddr. */
+ err = -EBADFD;
+ }
+ goto error;
}
reuse_kern->selected_sk = selected_sk;
return 0;
+error:
+ /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
+ if (sk_is_refcounted(selected_sk))
+ sock_put(selected_sk);
+
+ return err;
}
static const struct bpf_func_proto sk_select_reuseport_proto = {
@@ -11823,28 +11979,34 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
__bpf_kfunc_start_defs();
-__bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags,
- struct bpf_dynptr_kern *ptr__uninit)
+__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct sk_buff *skb = (struct sk_buff *)s;
+
if (flags) {
- bpf_dynptr_set_null(ptr__uninit);
+ bpf_dynptr_set_null(ptr);
return -EINVAL;
}
- bpf_dynptr_init(ptr__uninit, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
+ bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
return 0;
}
-__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags,
- struct bpf_dynptr_kern *ptr__uninit)
+__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct xdp_buff *xdp = (struct xdp_buff *)x;
+
if (flags) {
- bpf_dynptr_set_null(ptr__uninit);
+ bpf_dynptr_set_null(ptr);
return -EINVAL;
}
- bpf_dynptr_init(ptr__uninit, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp));
+ bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp));
return 0;
}
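
The kfuncs above now take the UAPI-facing context and dynptr types (struct __sk_buff, struct xdp_md, struct bpf_dynptr) that BPF programs already use, so no cast is needed on the program side. A minimal sketch for the skb variant, assuming libbpf headers; the __ksym declaration simply mirrors the new prototype and the program name is illustrative:

extern int bpf_dynptr_from_skb(struct __sk_buff *skb, u64 flags,
			       struct bpf_dynptr *ptr) __ksym;

SEC("tc")
int parse_eth(struct __sk_buff *skb)
{
	unsigned char eth[14];
	struct bpf_dynptr ptr;

	if (bpf_dynptr_from_skb(skb, 0, &ptr))	/* flags must be 0 */
		return TC_ACT_OK;

	/* Copy the Ethernet header out through the dynptr. */
	if (bpf_dynptr_read(eth, sizeof(eth), &ptr, 0, 0))
		return TC_ACT_OK;

	return TC_ACT_OK;
}
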
@@ -11870,10 +12032,11 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
return 0;
}
-__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk,
+__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,
struct bpf_tcp_req_attrs *attrs, int attrs__sz)
{
#if IS_ENABLED(CONFIG_SYN_COOKIES)
+ struct sk_buff *skb = (struct sk_buff *)s;
const struct request_sock_ops *ops;
struct inet_request_sock *ireq;
struct tcp_request_sock *treq;
@@ -11966,24 +12129,44 @@ __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk,
#endif
}
+__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
+ u64 flags)
+{
+ struct sk_buff *skb;
+
+ if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
+ return -EOPNOTSUPP;
+
+ if (flags)
+ return -EINVAL;
+
+ skb = skops->skb;
+ skb_shinfo(skb)->tx_flags |= SKBTX_BPF;
+ TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF;
+ skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+
+ return 0;
+}
+
__bpf_kfunc_end_defs();
-int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
- struct bpf_dynptr_kern *ptr__uninit)
+int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
int err;
err = bpf_dynptr_from_skb(skb, flags, ptr__uninit);
if (err)
return err;
- bpf_dynptr_set_rdonly(ptr__uninit);
+ bpf_dynptr_set_rdonly(ptr);
return 0;
}
BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
-BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_skb)
BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
@@ -11998,6 +12181,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)
+BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
+BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)
+
static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
.owner = THIS_MODULE,
.set = &bpf_kfunc_check_set_skb,
@@ -12018,6 +12205,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
.set = &bpf_kfunc_check_set_tcp_reqsk,
};
+static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_sock_ops,
+};
+
static int __init bpf_kfunc_init(void)
{
int ret;
@@ -12032,10 +12224,12 @@ static int __init bpf_kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
&bpf_kfunc_set_sock_addr);
- return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops);
}
late_initcall(bpf_kfunc_init);
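
bpf_sock_ops_enable_tx_tstamp(), registered just above for BPF_PROG_TYPE_SOCK_OPS, lets a sockops program opt an outgoing skb into BPF TX timestamping from the BPF_SOCK_OPS_TSTAMP_SENDMSG_CB callback. A hedged sketch of the program side, assuming vmlinux.h plus libbpf headers and the usual bpf_cast_to_kern_ctx() pattern for kfuncs that take kernel-side context types; the program name is illustrative:

extern void *bpf_cast_to_kern_ctx(void *obj) __ksym;
extern int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
					 u64 flags) __ksym;

SEC("sockops")
int request_tx_tstamp(struct bpf_sock_ops *skops)
{
	struct bpf_sock_ops_kern *kctx;

	/* The kfunc only accepts this callback and flags == 0. */
	if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
		return 1;

	kctx = bpf_cast_to_kern_ctx(skops);
	bpf_sock_ops_enable_tx_tstamp(kctx, 0);

	return 1;
}
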
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 272f09251343..1b61bb25ba0e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -106,7 +106,7 @@ int flow_dissector_bpf_prog_attach_check(struct net *net,
#endif /* CONFIG_BPF_SYSCALL */
/**
- * __skb_flow_get_ports - extract the upper layer ports and return them
+ * skb_flow_get_ports - extract the upper layer ports and return them
* @skb: sk_buff to extract the ports from
* @thoff: transport header offset
* @ip_proto: protocol for which to get port offset
@@ -116,8 +116,8 @@ int flow_dissector_bpf_prog_attach_check(struct net *net,
* The function will try to retrieve the ports at offset thoff + poff where poff
* is the protocol port offset returned from proto_ports_offset
*/
-__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
- const void *data, int hlen)
+__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+ const void *data, int hlen)
{
int poff = proto_ports_offset(ip_proto);
@@ -137,7 +137,7 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
return 0;
}
-EXPORT_SYMBOL(__skb_flow_get_ports);
+EXPORT_SYMBOL(skb_flow_get_ports);
static bool icmp_has_id(u8 type)
{
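
With the old inline wrapper gone, callers of the renamed skb_flow_get_ports() pass NULL/0 for data/hlen to fall back to the skb itself. A hedged sketch for an IPv4 TCP/UDP packet whose transport header offset is already set; example_dump_ports() is hypothetical, and the returned __be32 follows the struct flow_dissector_key_ports layout (source port first):

static void example_dump_ports(const struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct flow_dissector_key_ports key;

	/* NULL/0 makes the helper read from the skb itself. */
	key.ports = skb_flow_get_ports(skb, skb_transport_offset(skb),
				       iph->protocol, NULL, 0);
	if (key.ports)
		pr_debug("sport=%u dport=%u\n", ntohs(key.src), ntohs(key.dst));
}
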
@@ -299,9 +299,10 @@ void skb_flow_dissect_meta(const struct sk_buff *skb,
EXPORT_SYMBOL(skb_flow_dissect_meta);
static void
-skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
- struct flow_dissector *flow_dissector,
- void *target_container)
+skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type,
+ u32 ctrl_flags,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
{
struct flow_dissector_key_control *ctrl;
@@ -312,6 +313,7 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
FLOW_DISSECTOR_KEY_ENC_CONTROL,
target_container);
ctrl->addr_type = type;
+ ctrl->flags = ctrl_flags;
}
void
@@ -367,6 +369,7 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
{
struct ip_tunnel_info *info;
struct ip_tunnel_key *key;
+ u32 ctrl_flags = 0;
/* A quick check to see if there might be something to do. */
if (!dissector_uses_key(flow_dissector,
@@ -391,11 +394,20 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
key = &info->key;
+ if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM;
+ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT;
+ if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM;
+ if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT;
+
switch (ip_tunnel_info_af(info)) {
case AF_INET:
- skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
- flow_dissector,
- target_container);
+ skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ ctrl_flags, flow_dissector,
+ target_container);
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
struct flow_dissector_key_ipv4_addrs *ipv4;
@@ -408,9 +420,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
}
break;
case AF_INET6:
- skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
- flow_dissector,
- target_container);
+ skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ ctrl_flags, flow_dissector,
+ target_container);
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) {
struct flow_dissector_key_ipv6_addrs *ipv6;
@@ -422,6 +434,10 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
ipv6->dst = key->u.ipv6.dst;
}
break;
+ default:
+ skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector,
+ target_container);
+ break;
}
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
@@ -455,17 +471,25 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) {
struct flow_dissector_key_enc_opts *enc_opt;
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ u32 val;
enc_opt = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_ENC_OPTS,
target_container);
- if (info->options_len) {
- enc_opt->len = info->options_len;
- ip_tunnel_info_opts_get(enc_opt->data, info);
- enc_opt->dst_opt_type = info->key.tun_flags &
- TUNNEL_OPTIONS_PRESENT;
- }
+ if (!info->options_len)
+ return;
+
+ enc_opt->len = info->options_len;
+ ip_tunnel_info_opts_get(enc_opt->data, info);
+
+ ip_tunnel_set_options_present(flags);
+ ip_tunnel_flags_and(flags, info->key.tun_flags, flags);
+
+ val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM,
+ IP_TUNNEL_GENEVE_OPT_BIT);
+ enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0;
}
}
EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
@@ -829,23 +853,30 @@ __skb_flow_dissect_ports(const struct sk_buff *skb,
void *target_container, const void *data,
int nhoff, u8 ip_proto, int hlen)
{
- enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX;
- struct flow_dissector_key_ports *key_ports;
+ struct flow_dissector_key_ports_range *key_ports_range = NULL;
+ struct flow_dissector_key_ports *key_ports = NULL;
+ __be32 ports;
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
- dissector_ports = FLOW_DISSECTOR_KEY_PORTS;
- else if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_PORTS_RANGE))
- dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE;
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS,
+ target_container);
- if (dissector_ports == FLOW_DISSECTOR_KEY_MAX)
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE))
+ key_ports_range = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE,
+ target_container);
+
+ if (!key_ports && !key_ports_range)
return;
- key_ports = skb_flow_dissector_target(flow_dissector,
- dissector_ports,
- target_container);
- key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
- data, hlen);
+ ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen);
+
+ if (key_ports)
+ key_ports->ports = ports;
+
+ if (key_ports_range)
+ key_ports_range->tp.ports = ports;
}
static void
@@ -900,6 +931,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
struct flow_dissector *flow_dissector,
void *target_container)
{
+ struct flow_dissector_key_ports_range *key_ports_range = NULL;
struct flow_dissector_key_ports *key_ports = NULL;
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
@@ -944,20 +976,21 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
- if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
key_ports = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_PORTS,
target_container);
- else if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_PORTS_RANGE))
- key_ports = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_PORTS_RANGE,
- target_container);
-
- if (key_ports) {
key_ports->src = flow_keys->sport;
key_ports->dst = flow_keys->dport;
}
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE)) {
+ key_ports_range = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE,
+ target_container);
+ key_ports_range->tp.src = flow_keys->sport;
+ key_ports_range->tp.dst = flow_keys->dport;
+ }
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
@@ -1084,21 +1117,22 @@ bool __skb_flow_dissect(const struct net *net,
FLOW_DISSECTOR_KEY_BASIC,
target_container);
+ rcu_read_lock();
+
if (skb) {
if (!net) {
if (skb->dev)
- net = dev_net(skb->dev);
+ net = dev_net_rcu(skb->dev);
else if (skb->sk)
net = sock_net(skb->sk);
}
}
- WARN_ON_ONCE(!net);
+ DEBUG_NET_WARN_ON_ONCE(!net);
if (net) {
enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
struct bpf_prog_array *run_array;
- rcu_read_lock();
run_array = rcu_dereference(init_net.bpf.run_array[type]);
if (!run_array)
run_array = rcu_dereference(net->bpf.run_array[type]);
@@ -1126,17 +1160,17 @@ bool __skb_flow_dissect(const struct net *net,
prog = READ_ONCE(run_array->items[0].prog);
result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
hlen, flags);
- if (result == BPF_FLOW_DISSECTOR_CONTINUE)
- goto dissect_continue;
- __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
- target_container);
- rcu_read_unlock();
- return result == BPF_OK;
+ if (result != BPF_FLOW_DISSECTOR_CONTINUE) {
+ __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
+ target_container);
+ rcu_read_unlock();
+ return result == BPF_OK;
+ }
}
-dissect_continue:
- rcu_read_unlock();
}
+ rcu_read_unlock();
+
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
struct ethhdr *eth = eth_hdr(skb);
@@ -1784,6 +1818,13 @@ u32 flow_hash_from_keys(struct flow_keys *keys)
}
EXPORT_SYMBOL(flow_hash_from_keys);
+u32 flow_hash_from_keys_seed(struct flow_keys *keys,
+ const siphash_key_t *keyval)
+{
+ return __flow_hash_from_keys(keys, keyval);
+}
+EXPORT_SYMBOL(flow_hash_from_keys_seed);
+
static inline u32 ___skb_get_hash(const struct sk_buff *skb,
struct flow_keys *keys,
const siphash_key_t *keyval)
@@ -1823,22 +1864,23 @@ EXPORT_SYMBOL(make_flow_keys_digest);
static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
-u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
+u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb)
{
struct flow_keys keys;
__flow_hash_secret_init();
memset(&keys, 0, sizeof(keys));
- __skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric,
+ __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric,
&keys, NULL, 0, 0, 0, 0);
return __flow_hash_from_keys(&keys, &hashrnd);
}
-EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
+EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net);
/**
- * __skb_get_hash: calculate a flow hash
+ * __skb_get_hash_net: calculate a flow hash
+ * @net: associated network namespace, derived from @skb if NULL
* @skb: sk_buff to calculate flow hash from
*
* This function calculates a flow hash based on src/dst addresses
@@ -1846,18 +1888,24 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
* on success, zero indicates no valid hash. Also, sets l4_hash in skb
* if hash is a canonical 4-tuple hash over transport ports.
*/
-void __skb_get_hash(struct sk_buff *skb)
+void __skb_get_hash_net(const struct net *net, struct sk_buff *skb)
{
struct flow_keys keys;
u32 hash;
+ memset(&keys, 0, sizeof(keys));
+
+ __skb_flow_dissect(net, skb, &flow_keys_dissector,
+ &keys, NULL, 0, 0, 0,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
__flow_hash_secret_init();
- hash = ___skb_get_hash(skb, &keys, &hashrnd);
+ hash = __flow_hash_from_keys(&keys, &hashrnd);
__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}
-EXPORT_SYMBOL(__skb_get_hash);
+EXPORT_SYMBOL(__skb_get_hash_net);
__u32 skb_get_hash_perturb(const struct sk_buff *skb,
const siphash_key_t *perturb)
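
__skb_get_hash_net() lets the caller name the namespace explicitly, so that namespace's flow-dissector BPF program is honoured even when skb->dev and skb->sk are not yet populated. A minimal sketch mirroring the l4_hash/sw_hash caching convention used by skb_get_hash(); example_flow_hash() is hypothetical:

static u32 example_flow_hash(const struct net *net, struct sk_buff *skb)
{
	if (!skb->l4_hash && !skb->sw_hash)
		__skb_get_hash_net(net, skb);

	return skb->hash;
}
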
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index fae9c4694186..7d426a8e29f3 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -75,7 +75,7 @@ static void est_fetch_counters(struct net_rate_estimator *e,
static void est_timer(struct timer_list *t)
{
- struct net_rate_estimator *est = from_timer(est, t, timer);
+ struct net_rate_estimator *est = timer_container_of(est, t, timer);
struct gnet_stats_basic_sync b;
u64 b_bytes, b_packets;
u64 rate, brate;
@@ -177,7 +177,7 @@ int gen_new_estimator(struct gnet_stats_basic_sync *bstats,
spin_lock_bh(lock);
old = rcu_dereference_protected(*rate_est, 1);
if (old) {
- del_timer_sync(&old->timer);
+ timer_delete_sync(&old->timer);
est->avbps = old->avbps;
est->avpps = old->avpps;
}
@@ -206,7 +206,7 @@ void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
struct net_rate_estimator *est;
- est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+ est = unrcu_pointer(xchg(rate_est, NULL));
if (est) {
timer_shutdown_sync(&est->timer);
kfree_rcu(est, rcu);
diff --git a/net/core/gro.c b/net/core/gro.c
index ee30d4f0c038..b350e5b69549 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -3,12 +3,10 @@
#include <net/dst_metadata.h>
#include <net/busy_poll.h>
#include <trace/events/net.h>
+#include <linux/skbuff_ref.h>
#define MAX_GRO_SKBS 8
-/* This should be increased if a protocol with a bigger head is added. */
-#define GRO_MAX_HEAD (MAX_HEADER + 128)
-
static DEFINE_SPINLOCK(offload_lock);
/**
@@ -97,7 +95,6 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
unsigned int headlen = skb_headlen(skb);
unsigned int len = skb_gro_len(skb);
unsigned int delta_truesize;
- unsigned int gro_max_size;
unsigned int new_truesize;
struct sk_buff *lp;
int segs;
@@ -111,12 +108,8 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
if (p->pp_recycle != skb->pp_recycle)
return -ETOOMANYREFS;
- /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
- gro_max_size = p->protocol == htons(ETH_P_IPV6) ?
- READ_ONCE(p->dev->gro_max_size) :
- READ_ONCE(p->dev->gro_ipv4_max_size);
-
- if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
+ if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) ||
+ NAPI_GRO_CB(skb)->flush))
return -E2BIG;
if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
@@ -192,8 +185,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
}
merge:
- /* sk owenrship - if any - completely transferred to the aggregated packet */
+ /* sk ownership - if any - completely transferred to the aggregated packet */
skb->destructor = NULL;
+ skb->sk = NULL;
delta_truesize = skb->truesize;
if (offset > headlen) {
unsigned int eat = offset - headlen;
@@ -229,8 +223,34 @@ done:
return 0;
}
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{
+ if (unlikely(p->len + skb->len >= 65536))
+ return -E2BIG;
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+
+ skb_pull(skb, skb_gro_offset(skb));
+
+ NAPI_GRO_CB(p)->last = skb;
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += skb->len;
+
+ /* sk ownership - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ p->truesize += skb->truesize;
+ p->len += skb->len;
+
+ NAPI_GRO_CB(skb)->same_flow = 1;
+
+ return 0;
+}
-static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
+static void gro_complete(struct gro_node *gro, struct sk_buff *skb)
{
struct list_head *head = &net_hotdata.offload_base;
struct packet_offload *ptype;
@@ -263,43 +283,43 @@ static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
}
out:
- gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
+ gro_normal_one(gro, skb, NAPI_GRO_CB(skb)->count);
}
-static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
- bool flush_old)
+static void __gro_flush_chain(struct gro_node *gro, u32 index, bool flush_old)
{
- struct list_head *head = &napi->gro_hash[index].list;
+ struct list_head *head = &gro->hash[index].list;
struct sk_buff *skb, *p;
list_for_each_entry_safe_reverse(skb, p, head, list) {
if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
return;
skb_list_del_init(skb);
- napi_gro_complete(napi, skb);
- napi->gro_hash[index].count--;
+ gro_complete(gro, skb);
+ gro->hash[index].count--;
}
- if (!napi->gro_hash[index].count)
- __clear_bit(index, &napi->gro_bitmask);
+ if (!gro->hash[index].count)
+ __clear_bit(index, &gro->bitmask);
}
-/* napi->gro_hash[].list contains packets ordered by age.
+/*
+ * gro->hash[].list contains packets ordered by age.
* youngest packets at the head of it.
* Complete skbs in reverse order to reduce latencies.
*/
-void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+void __gro_flush(struct gro_node *gro, bool flush_old)
{
- unsigned long bitmask = napi->gro_bitmask;
+ unsigned long bitmask = gro->bitmask;
unsigned int i, base = ~0U;
while ((i = ffs(bitmask)) != 0) {
bitmask >>= i;
base += i;
- __napi_gro_flush_chain(napi, base, flush_old);
+ __gro_flush_chain(gro, base, flush_old);
}
}
-EXPORT_SYMBOL(napi_gro_flush);
+EXPORT_SYMBOL(__gro_flush);
static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb,
const struct sk_buff *p,
@@ -329,8 +349,6 @@ static void gro_list_prepare(const struct list_head *head,
list_for_each_entry(p, head, list) {
unsigned long diffs;
- NAPI_GRO_CB(p)->flush = 0;
-
if (hash != skb_get_hash_raw(p)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
@@ -347,7 +365,7 @@ static void gro_list_prepare(const struct list_head *head,
skb_mac_header(skb),
maclen);
- /* in most common scenarions 'slow_gro' is 0
+ /* in most common scenarios 'slow_gro' is 0
* otherwise we are already on some slower paths
* either skip all the infrequent tests altogether or
* avoid trying too hard to skip each of them individually
@@ -370,6 +388,7 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
const skb_frag_t *frag0;
unsigned int headlen;
+ NAPI_GRO_CB(skb)->network_offset = 0;
NAPI_GRO_CB(skb)->data_offset = 0;
headlen = skb_headlen(skb);
NAPI_GRO_CB(skb)->frag0 = skb->data;
@@ -380,7 +399,8 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
pinfo = skb_shinfo(skb);
frag0 = &pinfo->frags[0];
- if (pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) &&
+ if (pinfo->nr_frags && skb_frag_page(frag0) &&
+ !PageHighMem(skb_frag_page(frag0)) &&
(!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
@@ -418,7 +438,7 @@ static void gro_try_pull_from_frag0(struct sk_buff *skb)
gro_pull_from_frag0(skb, grow);
}
-static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
+static void gro_flush_oldest(struct gro_node *gro, struct list_head *head)
{
struct sk_buff *oldest;
@@ -434,14 +454,15 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
* SKB to the chain.
*/
skb_list_del_init(oldest);
- napi_gro_complete(napi, oldest);
+ gro_complete(gro, oldest);
}
-static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+static enum gro_result dev_gro_receive(struct gro_node *gro,
+ struct sk_buff *skb)
{
u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
- struct gro_list *gro_list = &napi->gro_hash[bucket];
struct list_head *head = &net_hotdata.offload_base;
+ struct gro_list *gro_list = &gro->hash[bucket];
struct packet_offload *ptype;
__be16 type = skb->protocol;
struct sk_buff *pp = NULL;
@@ -469,7 +490,6 @@ found_ptype:
sizeof(u32))); /* Avoid slow unaligned acc */
*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
- NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->count = 1;
if (unlikely(skb_is_gso(skb))) {
NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
@@ -506,7 +526,7 @@ found_ptype:
if (pp) {
skb_list_del_init(pp);
- napi_gro_complete(napi, pp);
+ gro_complete(gro, pp);
gro_list->count--;
}
@@ -517,7 +537,7 @@ found_ptype:
goto normal;
if (unlikely(gro_list->count >= MAX_GRO_SKBS))
- gro_flush_oldest(napi, &gro_list->list);
+ gro_flush_oldest(gro, &gro_list->list);
else
gro_list->count++;
@@ -531,10 +551,10 @@ found_ptype:
ret = GRO_HELD;
ok:
if (gro_list->count) {
- if (!test_bit(bucket, &napi->gro_bitmask))
- __set_bit(bucket, &napi->gro_bitmask);
- } else if (test_bit(bucket, &napi->gro_bitmask)) {
- __clear_bit(bucket, &napi->gro_bitmask);
+ if (!test_bit(bucket, &gro->bitmask))
+ __set_bit(bucket, &gro->bitmask);
+ } else if (test_bit(bucket, &gro->bitmask)) {
+ __clear_bit(bucket, &gro->bitmask);
}
return ret;
@@ -573,13 +593,12 @@ struct packet_offload *gro_find_complete_by_type(__be16 type)
}
EXPORT_SYMBOL(gro_find_complete_by_type);
-static gro_result_t napi_skb_finish(struct napi_struct *napi,
- struct sk_buff *skb,
- gro_result_t ret)
+static gro_result_t gro_skb_finish(struct gro_node *gro, struct sk_buff *skb,
+ gro_result_t ret)
{
switch (ret) {
case GRO_NORMAL:
- gro_normal_one(napi, skb, 1);
+ gro_normal_one(gro, skb, 1);
break;
case GRO_MERGED_FREE:
@@ -600,21 +619,21 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
return ret;
}
-gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb)
{
gro_result_t ret;
- skb_mark_napi_id(skb, napi);
+ __skb_mark_napi_id(skb, gro);
trace_napi_gro_receive_entry(skb);
skb_gro_reset_offset(skb, 0);
- ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
+ ret = gro_skb_finish(gro, skb, dev_gro_receive(gro, skb));
trace_napi_gro_receive_exit(ret);
return ret;
}
-EXPORT_SYMBOL(napi_gro_receive);
+EXPORT_SYMBOL(gro_receive_skb);
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
@@ -633,6 +652,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->pkt_type = PACKET_HOST;
skb->encapsulation = 0;
+ skb->ip_summed = CHECKSUM_NONE;
skb_shinfo(skb)->gso_type = 0;
skb_shinfo(skb)->gso_size = 0;
if (unlikely(skb->slow_gro)) {
@@ -670,7 +690,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
__skb_push(skb, ETH_HLEN);
skb->protocol = eth_type_trans(skb, skb->dev);
if (ret == GRO_NORMAL)
- gro_normal_one(napi, skb, 1);
+ gro_normal_one(&napi->gro, skb, 1);
break;
case GRO_MERGED_FREE:
@@ -739,7 +759,7 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
trace_napi_gro_frags_entry(skb);
- ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
+ ret = napi_frags_finish(napi, skb, dev_gro_receive(&napi->gro, skb));
trace_napi_gro_frags_exit(ret);
return ret;
@@ -771,3 +791,37 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
return sum;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
+
+void gro_init(struct gro_node *gro)
+{
+ for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) {
+ INIT_LIST_HEAD(&gro->hash[i].list);
+ gro->hash[i].count = 0;
+ }
+
+ gro->bitmask = 0;
+ gro->cached_napi_id = 0;
+
+ INIT_LIST_HEAD(&gro->rx_list);
+ gro->rx_count = 0;
+}
+
+void gro_cleanup(struct gro_node *gro)
+{
+ struct sk_buff *skb, *n;
+
+ for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) {
+ list_for_each_entry_safe(skb, n, &gro->hash[i].list, list)
+ kfree_skb(skb);
+
+ gro->hash[i].count = 0;
+ }
+
+ gro->bitmask = 0;
+ gro->cached_napi_id = 0;
+
+ list_for_each_entry_safe(skb, n, &gro->rx_list, list)
+ kfree_skb(skb);
+
+ gro->rx_count = 0;
+}
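
gro_init()/gro_cleanup() pair with the gro_receive_skb() and __gro_flush() calls above, so a polling context other than a napi_struct can own its GRO state. A hedged sketch with a hypothetical owner type (my_poller and its callbacks are illustrative, not kernel structures):

struct my_poller {
	struct gro_node gro;
};

static void my_poller_init(struct my_poller *p)
{
	gro_init(&p->gro);
}

static void my_poller_rx(struct my_poller *p, struct sk_buff *skb)
{
	gro_receive_skb(&p->gro, skb);
}

static void my_poller_poll_done(struct my_poller *p)
{
	/* Complete everything still held in the GRO hash buckets. */
	__gro_flush(&p->gro, false);
}

static void my_poller_destroy(struct my_poller *p)
{
	/* Frees any skbs still sitting in the hash or rx_list. */
	gro_cleanup(&p->gro);
}
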
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index c8a7a451c18a..0bc893d5f07b 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -1,13 +1,12 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-#include <net/hotdata.h>
#include <linux/cache.h>
#include <linux/jiffies.h>
#include <linux/list.h>
-
+#include <net/hotdata.h>
+#include <net/proto_memory.h>
struct net_hotdata net_hotdata __cacheline_aligned = {
.offload_base = LIST_HEAD_INIT(net_hotdata.offload_base),
- .ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all),
.gro_normal_batch = 8,
.netdev_budget = 300,
@@ -18,5 +17,8 @@ struct net_hotdata net_hotdata __cacheline_aligned = {
.max_backlog = 1000,
.dev_tx_weight = 64,
.dev_rx_weight = 64,
+ .sysctl_max_skb_frags = MAX_SKB_FRAGS,
+ .sysctl_skb_defer_max = 64,
+ .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE
};
EXPORT_SYMBOL(net_hotdata);
diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c
new file mode 100644
index 000000000000..759a9b9f3f89
--- /dev/null
+++ b/net/core/ieee8021q_helpers.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+
+#include <linux/array_size.h>
+#include <linux/printk.h>
+#include <linux/types.h>
+#include <net/dscp.h>
+#include <net/ieee8021q.h>
+
+/* The following arrays map Traffic Types (TT) to traffic classes (TC) for
+ * different numbers of queues, as shown in the example provided by
+ * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and
+ * Table I-1 "Traffic type to traffic class mapping".
+ */
+static const u8 ieee8021q_8queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0,
+ [IEEE8021Q_TT_BE] = 1,
+ [IEEE8021Q_TT_EE] = 2,
+ [IEEE8021Q_TT_CA] = 3,
+ [IEEE8021Q_TT_VI] = 4,
+ [IEEE8021Q_TT_VO] = 5,
+ [IEEE8021Q_TT_IC] = 6,
+ [IEEE8021Q_TT_NC] = 7,
+};
+
+static const u8 ieee8021q_7queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0,
+ [IEEE8021Q_TT_BE] = 1,
+ [IEEE8021Q_TT_EE] = 2,
+ [IEEE8021Q_TT_CA] = 3,
+ [IEEE8021Q_TT_VI] = 4, [IEEE8021Q_TT_VO] = 4,
+ [IEEE8021Q_TT_IC] = 5,
+ [IEEE8021Q_TT_NC] = 6,
+};
+
+static const u8 ieee8021q_6queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0,
+ [IEEE8021Q_TT_BE] = 1,
+ [IEEE8021Q_TT_EE] = 2, [IEEE8021Q_TT_CA] = 2,
+ [IEEE8021Q_TT_VI] = 3, [IEEE8021Q_TT_VO] = 3,
+ [IEEE8021Q_TT_IC] = 4,
+ [IEEE8021Q_TT_NC] = 5,
+};
+
+static const u8 ieee8021q_5queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1,
+ [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2,
+ [IEEE8021Q_TT_IC] = 3,
+ [IEEE8021Q_TT_NC] = 4,
+};
+
+static const u8 ieee8021q_4queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1,
+ [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2,
+ [IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3,
+};
+
+static const u8 ieee8021q_3queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+ [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1,
+ [IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2,
+};
+
+static const u8 ieee8021q_2queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+ [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1,
+ [IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1,
+};
+
+static const u8 ieee8021q_1queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+ [IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0,
+ [IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0,
+};
+
+/**
+ * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class
+ * @tt: IEEE 802.1Q Traffic Type
+ * @num_queues: Number of queues
+ *
+ * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based
+ * on the number of queues configured on the NIC. The mapping is based on the
+ * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic
+ * class mapping" and Table I-1 "Traffic type to traffic class mapping".
+ *
+ * Return: Traffic Class corresponding to the given Traffic Type or negative
+ * value in case of error.
+ */
+int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues)
+{
+ if (tt < 0 || tt >= IEEE8021Q_TT_MAX) {
+ pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt,
+ IEEE8021Q_TT_MAX);
+ return -EINVAL;
+ }
+
+ switch (num_queues) {
+ case 8:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_8queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_8queue_tt_tc_map != max - 1");
+ return ieee8021q_8queue_tt_tc_map[tt];
+ case 7:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_7queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_7queue_tt_tc_map != max - 1");
+
+ return ieee8021q_7queue_tt_tc_map[tt];
+ case 6:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_6queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_6queue_tt_tc_map != max - 1");
+
+ return ieee8021q_6queue_tt_tc_map[tt];
+ case 5:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_5queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_5queue_tt_tc_map != max - 1");
+
+ return ieee8021q_5queue_tt_tc_map[tt];
+ case 4:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_4queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_4queue_tt_tc_map != max - 1");
+
+ return ieee8021q_4queue_tt_tc_map[tt];
+ case 3:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_3queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_3queue_tt_tc_map != max - 1");
+
+ return ieee8021q_3queue_tt_tc_map[tt];
+ case 2:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_2queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_2queue_tt_tc_map != max - 1");
+
+ return ieee8021q_2queue_tt_tc_map[tt];
+ case 1:
+ compiletime_assert(ARRAY_SIZE(ieee8021q_1queue_tt_tc_map) !=
+ IEEE8021Q_TT_MAX - 1,
+ "ieee8021q_1queue_tt_tc_map != max - 1");
+
+ return ieee8021q_1queue_tt_tc_map[tt];
+ }
+
+ pr_err("Invalid number of queues %d\n", num_queues);
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc);
+
+/**
+ * ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type
+ * @dscp: IETF DSCP value
+ *
+ * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT).
+ * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic
+ * Type, this function is inspired by the RFC8325 documentation, which describes
+ * the mapping between DSCP and 802.11 User Priority (UP) values.
+ *
+ * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value
+ */
+int ietf_dscp_to_ieee8021q_tt(u8 dscp)
+{
+ switch (dscp) {
+ case DSCP_CS0:
+ /* Comment from RFC8325:
+ * [RFC4594], Section 4.8, recommends High-Throughput Data be marked
+ * AF1x (that is, AF11, AF12, and AF13, according to the rules defined
+ * in [RFC2475]).
+ *
+ * By default (as described in Section 2.3), High-Throughput Data will
+ * map to UP 1 and, thus, to the Background Access Category (AC_BK),
+ * which is contrary to the intent expressed in [RFC4594].
+		 *
+ * Unfortunately, there really is no corresponding fit for the High-
+ * Throughput Data service class within the constrained 4 Access
+ * Category [IEEE.802.11-2016] model. If the High-Throughput Data
+ * service class is assigned to the Best Effort Access Category (AC_BE),
+ * then it would contend with Low-Latency Data (while [RFC4594]
+ * recommends a distinction in servicing between these service classes)
+ * as well as with the default service class; alternatively, if it is
+ * assigned to the Background Access Category (AC_BK), then it would
+		 * receive a less-than-best-effort service and contend with Low-Priority
+ * Data (as discussed in Section 4.2.10).
+ *
+ * As such, since there is no directly corresponding fit for the High-
+		 * Throughput Data service class within the [IEEE.802.11-2016] model, it
+ * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby
+ * admitting it to the Best Effort Access Category (AC_BE).
+ *
+ * Note: The above text is from RFC8325 which is describing the mapping
+ * between DSCP and 802.11 User Priority (UP) values. The mapping
+ * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but
+ * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q
+ * Traffic Types BE and BK.
+ */
+ case DSCP_AF11:
+ case DSCP_AF12:
+ case DSCP_AF13:
+ return IEEE8021Q_TT_BE;
+ /* Comment from RFC8325:
+ * RFC3662 and RFC4594 both recommend Low-Priority Data be marked
+ * with DSCP CS1. The Low-Priority Data service class loosely
+ * corresponds to the [IEEE.802.11-2016] Background Access Category
+ */
+ case DSCP_CS1:
+ return IEEE8021Q_TT_BK;
+ case DSCP_CS2:
+ case DSCP_AF21:
+ case DSCP_AF22:
+ case DSCP_AF23:
+ return IEEE8021Q_TT_EE;
+ case DSCP_CS3:
+ case DSCP_AF31:
+ case DSCP_AF32:
+ case DSCP_AF33:
+ return IEEE8021Q_TT_CA;
+ case DSCP_CS4:
+ case DSCP_AF41:
+ case DSCP_AF42:
+ case DSCP_AF43:
+ return IEEE8021Q_TT_VI;
+ case DSCP_CS5:
+ case DSCP_EF:
+ case DSCP_VOICE_ADMIT:
+ return IEEE8021Q_TT_VO;
+ case DSCP_CS6:
+ return IEEE8021Q_TT_IC;
+ case DSCP_CS7:
+ return IEEE8021Q_TT_NC;
+ }
+
+ return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp);
+}
+EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt);
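
Taken together, the two helpers let a driver derive an egress traffic class from a packet's DSCP field. A hedged sketch; example_dscp_to_tc() is hypothetical:

static int example_dscp_to_tc(u8 dscp, unsigned int num_tx_queues)
{
	int tt = ietf_dscp_to_ieee8021q_tt(dscp);

	if (tt < 0)
		return tt;

	/* e.g. DSCP_EF -> IEEE8021Q_TT_VO -> TC 2 on a 4-queue NIC */
	return ieee8021q_tt_to_tc(tt, num_tx_queues);
}
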
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 8ec35194bfcb..864f3bbc3a4c 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -42,12 +42,21 @@ static unsigned int default_operstate(const struct net_device *dev)
* first check whether lower is indeed the source of its down state.
*/
if (!netif_carrier_ok(dev)) {
- int iflink = dev_get_iflink(dev);
struct net_device *peer;
+ int iflink;
+
+ /* If called from netdev_run_todo()/linkwatch_sync_dev(),
+ * dev_net(dev) can be already freed, and RTNL is not held.
+ */
+ if (dev->reg_state <= NETREG_REGISTERED)
+ iflink = dev_get_iflink(dev);
+ else
+ iflink = dev->ifindex;
if (iflink == dev->ifindex)
return IF_OPER_DOWN;
+ ASSERT_RTNL();
peer = __dev_get_by_index(dev_net(dev), iflink);
if (!peer)
return IF_OPER_DOWN;
@@ -148,9 +157,9 @@ static void linkwatch_schedule_work(int urgent)
* override the existing timer.
*/
if (test_bit(LW_URGENT, &linkwatch_flags))
- mod_delayed_work(system_wq, &linkwatch_work, 0);
+ mod_delayed_work(system_unbound_wq, &linkwatch_work, 0);
else
- schedule_delayed_work(&linkwatch_work, delay);
+ queue_delayed_work(system_unbound_wq, &linkwatch_work, delay);
}
@@ -174,7 +183,7 @@ static void linkwatch_do_dev(struct net_device *dev)
else
dev_deactivate(dev);
- netdev_state_change(dev);
+ netif_state_change(dev);
}
/* Note: our callers are responsible for calling netdev_tracker_free().
* This is the reason we use __dev_put() instead of dev_put().
@@ -231,7 +240,9 @@ static void __linkwatch_run_queue(int urgent_only)
*/
netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
spin_unlock_irq(&lweventlist_lock);
+ netdev_lock_ops(dev);
linkwatch_do_dev(dev);
+ netdev_unlock_ops(dev);
do_dev--;
spin_lock_irq(&lweventlist_lock);
}
@@ -244,25 +255,41 @@ static void __linkwatch_run_queue(int urgent_only)
spin_unlock_irq(&lweventlist_lock);
}
-void linkwatch_sync_dev(struct net_device *dev)
+static bool linkwatch_clean_dev(struct net_device *dev)
{
unsigned long flags;
- int clean = 0;
+ bool clean = false;
spin_lock_irqsave(&lweventlist_lock, flags);
if (!list_empty(&dev->link_watch_list)) {
list_del_init(&dev->link_watch_list);
- clean = 1;
+ clean = true;
/* We must release netdev tracker under
* the spinlock protection.
*/
netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
}
spin_unlock_irqrestore(&lweventlist_lock, flags);
- if (clean)
+
+ return clean;
+}
+
+void __linkwatch_sync_dev(struct net_device *dev)
+{
+ netdev_ops_assert_locked(dev);
+
+ if (linkwatch_clean_dev(dev))
linkwatch_do_dev(dev);
}
+void linkwatch_sync_dev(struct net_device *dev)
+{
+ if (linkwatch_clean_dev(dev)) {
+ netdev_lock_ops(dev);
+ linkwatch_do_dev(dev);
+ netdev_unlock_ops(dev);
+ }
+}
/* Must be called with the rtnl semaphore held */
void linkwatch_run_queue(void)
diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c
new file mode 100644
index 000000000000..9e9fb25314b9
--- /dev/null
+++ b/net/core/lock_debug.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright Amazon.com Inc. or its affiliates. */
+
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/netdev_lock.h>
+#include <net/netns/generic.h>
+
+int netdev_debug_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ enum netdev_cmd cmd = event;
+
+ /* Keep enum and don't add default to trigger -Werror=switch */
+ switch (cmd) {
+ case NETDEV_XDP_FEAT_CHANGE:
+ netdev_assert_locked(dev);
+ fallthrough;
+ case NETDEV_CHANGE:
+ case NETDEV_REGISTER:
+ case NETDEV_UP:
+ netdev_ops_assert_locked(dev);
+ fallthrough;
+ case NETDEV_DOWN:
+ case NETDEV_REBOOT:
+ case NETDEV_UNREGISTER:
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGEADDR:
+ case NETDEV_PRE_CHANGEADDR:
+ case NETDEV_GOING_DOWN:
+ case NETDEV_FEAT_CHANGE:
+ case NETDEV_BONDING_FAILOVER:
+ case NETDEV_PRE_UP:
+ case NETDEV_PRE_TYPE_CHANGE:
+ case NETDEV_POST_TYPE_CHANGE:
+ case NETDEV_POST_INIT:
+ case NETDEV_PRE_UNINIT:
+ case NETDEV_RELEASE:
+ case NETDEV_NOTIFY_PEERS:
+ case NETDEV_JOIN:
+ case NETDEV_CHANGEUPPER:
+ case NETDEV_RESEND_IGMP:
+ case NETDEV_PRECHANGEMTU:
+ case NETDEV_CHANGEINFODATA:
+ case NETDEV_BONDING_INFO:
+ case NETDEV_PRECHANGEUPPER:
+ case NETDEV_CHANGELOWERSTATE:
+ case NETDEV_UDP_TUNNEL_PUSH_INFO:
+ case NETDEV_UDP_TUNNEL_DROP_INFO:
+ case NETDEV_CHANGE_TX_QUEUE_LEN:
+ case NETDEV_CVLAN_FILTER_PUSH_INFO:
+ case NETDEV_CVLAN_FILTER_DROP_INFO:
+ case NETDEV_SVLAN_FILTER_PUSH_INFO:
+ case NETDEV_SVLAN_FILTER_DROP_INFO:
+ case NETDEV_OFFLOAD_XSTATS_ENABLE:
+ case NETDEV_OFFLOAD_XSTATS_DISABLE:
+ case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
+ case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
+ ASSERT_RTNL();
+ break;
+
+ case NETDEV_CHANGENAME:
+ ASSERT_RTNL_NET(net);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_NS_GPL(netdev_debug_event, "NETDEV_INTERNAL");
+
+static int rtnl_net_debug_net_id;
+
+static int __net_init rtnl_net_debug_net_init(struct net *net)
+{
+ struct notifier_block *nb;
+
+ nb = net_generic(net, rtnl_net_debug_net_id);
+ nb->notifier_call = netdev_debug_event;
+
+ return register_netdevice_notifier_net(net, nb);
+}
+
+static void __net_exit rtnl_net_debug_net_exit(struct net *net)
+{
+ struct notifier_block *nb;
+
+ nb = net_generic(net, rtnl_net_debug_net_id);
+ unregister_netdevice_notifier_net(net, nb);
+}
+
+static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = {
+ .init = rtnl_net_debug_net_init,
+ .exit = rtnl_net_debug_net_exit,
+ .id = &rtnl_net_debug_net_id,
+ .size = sizeof(struct notifier_block),
+};
+
+static struct notifier_block rtnl_net_debug_block = {
+ .notifier_call = netdev_debug_event,
+};
+
+static int __init rtnl_net_debug_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&rtnl_net_debug_net_ops);
+ if (ret)
+ return ret;
+
+ ret = register_netdevice_notifier(&rtnl_net_debug_block);
+ if (ret)
+ unregister_pernet_subsys(&rtnl_net_debug_net_ops);
+
+ return ret;
+}
+
+subsys_initcall(rtnl_net_debug_init);
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 4a0797f0a154..ae74634310a3 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -10,8 +10,10 @@
#include <linux/bpf.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
+#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/ipv6_stubs.h>
+#include <net/inet_dscp.h>
struct bpf_lwt_prog {
struct bpf_prog *prog;
@@ -38,13 +40,14 @@ static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
struct dst_entry *dst, bool can_redirect)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int ret;
- /* Migration disable and BH disable are needed to protect per-cpu
- * redirect_info between BPF prog and skb_do_redirect().
+ /* Disabling BH is needed to protect per-CPU bpf_redirect_info between
+ * BPF prog and skb_do_redirect().
*/
- migrate_disable();
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
bpf_compute_data_pointers(skb);
ret = bpf_prog_run_save_cb(lwt->prog, skb);
@@ -77,24 +80,26 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
break;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
- migrate_enable();
return ret;
}
static int bpf_lwt_input_reroute(struct sk_buff *skb)
{
+ enum skb_drop_reason reason;
int err = -EINVAL;
if (skb->protocol == htons(ETH_P_IP)) {
struct net_device *dev = skb_dst(skb)->dev;
- struct iphdr *iph = ip_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
dev_hold(dev);
skb_dst_drop(skb);
- err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, dev);
+ reason = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), dev);
+ err = reason ? -EINVAL : 0;
dev_put(dev);
} else if (skb->protocol == htons(ETH_P_IPV6)) {
skb_dst_drop(skb);
@@ -204,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
fl4.flowi4_oif = oif;
fl4.flowi4_mark = skb->mark;
fl4.flowi4_uid = sock_net_uid(net, sk);
- fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
fl4.flowi4_proto = iph->protocol;
fl4.daddr = iph->daddr;
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 711cd3b4347a..f9d76d85d04f 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -23,6 +23,8 @@
#include <net/ip6_fib.h>
#include <net/rtnh.h>
+#include "dev.h"
+
DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled);
@@ -158,21 +160,14 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack)
return ret;
}
- rcu_read_lock();
- ops = rcu_dereference(lwtun_encaps[encap_type]);
- rcu_read_unlock();
+ ops = rcu_access_pointer(lwtun_encaps[encap_type]);
#ifdef CONFIG_MODULES
if (!ops) {
const char *encap_type_str = lwtunnel_encap_str(encap_type);
if (encap_type_str) {
- __rtnl_unlock();
request_module("rtnl-lwt-%s", encap_type_str);
- rtnl_lock();
-
- rcu_read_lock();
- ops = rcu_dereference(lwtun_encaps[encap_type]);
- rcu_read_unlock();
+ ops = rcu_access_pointer(lwtun_encaps[encap_type]);
}
}
#endif
@@ -206,8 +201,7 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
}
encap_type = nla_get_u16(nla_entype);
- if (lwtunnel_valid_encap_type(encap_type,
- extack) != 0)
+ if (lwtunnel_valid_encap_type(encap_type, extack))
return -EOPNOTSUPP;
}
}
@@ -325,82 +319,132 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap);
int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
+
+ local_bh_disable();
- if (!dst)
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
- lwtstate->type > LWTUNNEL_ENCAP_MAX)
- return 0;
+ lwtstate->type > LWTUNNEL_ENCAP_MAX) {
+ ret = 0;
+ goto out;
+ }
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->output))
+ if (likely(ops && ops->output)) {
+ dev_xmit_recursion_inc();
ret = ops->output(net, sk, skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
goto drop;
- return ret;
+ goto out;
drop:
kfree_skb(skb);
+out:
+ local_bh_enable();
return ret;
}
EXPORT_SYMBOL_GPL(lwtunnel_output);
int lwtunnel_xmit(struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
- if (!dst)
+ local_bh_disable();
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
- lwtstate->type > LWTUNNEL_ENCAP_MAX)
- return 0;
+ lwtstate->type > LWTUNNEL_ENCAP_MAX) {
+ ret = 0;
+ goto out;
+ }
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->xmit))
+ if (likely(ops && ops->xmit)) {
+ dev_xmit_recursion_inc();
ret = ops->xmit(skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
goto drop;
- return ret;
+ goto out;
drop:
kfree_skb(skb);
+out:
+ local_bh_enable();
return ret;
}
EXPORT_SYMBOL_GPL(lwtunnel_xmit);
int lwtunnel_input(struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
- if (!dst)
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
@@ -410,8 +454,11 @@ int lwtunnel_input(struct sk_buff *skb)
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->input))
+ if (likely(ops && ops->input)) {
+ dev_xmit_recursion_inc();
ret = ops->input(skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
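
All three paths above now share the same recursion guard, so a tunnel configuration that loops back into itself bails out with -ENETDOWN instead of exhausting the stack. A hedged sketch of the same pattern in a hypothetical encapsulation output hook; example_encap_output() and its use of ip_local_out() are illustrative only:

static int example_encap_output(struct net *net, struct sock *sk,
				struct sk_buff *skb)
{
	int ret;

	local_bh_disable();
	if (dev_xmit_recursion()) {
		/* Already nested too deep on this CPU: drop instead of looping. */
		kfree_skb(skb);
		ret = -ENETDOWN;
		goto out;
	}

	dev_xmit_recursion_inc();
	ret = ip_local_out(net, sk, skb);	/* may re-enter lwtunnel_output() */
	dev_xmit_recursion_dec();
out:
	local_bh_enable();
	return ret;
}
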
diff --git a/net/core/mp_dmabuf_devmem.h b/net/core/mp_dmabuf_devmem.h
new file mode 100644
index 000000000000..67cd0dd7319c
--- /dev/null
+++ b/net/core/mp_dmabuf_devmem.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Dmabuf device memory provider.
+ *
+ * Authors: Mina Almasry <almasrymina@google.com>
+ *
+ */
+#ifndef _NET_MP_DMABUF_DEVMEM_H
+#define _NET_MP_DMABUF_DEVMEM_H
+
+#include <net/netmem.h>
+
+#if defined(CONFIG_NET_DEVMEM)
+int mp_dmabuf_devmem_init(struct page_pool *pool);
+
+netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp);
+
+void mp_dmabuf_devmem_destroy(struct page_pool *pool);
+
+bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem);
+#else
+static inline int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline netmem_ref
+mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp)
+{
+ return 0;
+}
+
+static inline void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{
+}
+
+static inline bool
+mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
+{
+ return false;
+}
+#endif
+
+#endif /* _NET_MP_DMABUF_DEVMEM_H */
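
The new mp_dmabuf_devmem.h follows the usual kernel pattern for optional features: real prototypes when CONFIG_NET_DEVMEM is enabled, static inline no-op stubs otherwise, so callers never need #ifdefs of their own. A stand-alone sketch of the same pattern follows; CONFIG_MY_FEATURE and the my_feature_*() names are invented for illustration and do not appear in the patch.

/* Sketch of the optional-feature header pattern; CONFIG_MY_FEATURE and the
 * my_feature_*() symbols are invented for illustration.
 */
#ifndef _MY_FEATURE_H
#define _MY_FEATURE_H

#include <linux/errno.h>

struct my_ctx;

#if defined(CONFIG_MY_FEATURE)
int my_feature_init(struct my_ctx *ctx);
void my_feature_exit(struct my_ctx *ctx);
#else
static inline int my_feature_init(struct my_ctx *ctx)
{
	return -EOPNOTSUPP;	/* callers get "not supported" without any #ifdef */
}

static inline void my_feature_exit(struct my_ctx *ctx)
{
}
#endif

#endif /* _MY_FEATURE_H */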
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 552719c3bbc3..49dce9a82295 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -14,7 +14,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/slab.h>
-#include <linux/kmemleak.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -61,6 +60,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
static const struct seq_operations neigh_stat_seq_ops;
#endif
+static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family)
+{
+ int i;
+
+ switch (family) {
+ default:
+ DEBUG_NET_WARN_ON_ONCE(1);
+ fallthrough; /* to avoid panic by null-ptr-deref */
+ case AF_INET:
+ i = NEIGH_ARP_TABLE;
+ break;
+ case AF_INET6:
+ i = NEIGH_ND_TABLE;
+ break;
+ }
+
+ return &dev->neighbours[i];
+}
+
/*
Neighbour hash table buckets are protected with rwlock tbl->lock.
@@ -205,18 +223,14 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify,
}
}
-static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
- struct neigh_table *tbl)
+bool neigh_remove_one(struct neighbour *n)
{
bool retval = false;
write_lock(&n->lock);
if (refcount_read(&n->refcnt) == 1) {
- struct neighbour *neigh;
-
- neigh = rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock));
- rcu_assign_pointer(*np, neigh);
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
retval = true;
}
@@ -226,29 +240,6 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
return retval;
}
-bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
-{
- struct neigh_hash_table *nht;
- void *pkey = ndel->primary_key;
- u32 hash_val;
- struct neighbour *n;
- struct neighbour __rcu **np;
-
- nht = rcu_dereference_protected(tbl->nht,
- lockdep_is_held(&tbl->lock));
- hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd);
- hash_val = hash_val >> (32 - nht->hash_shift);
-
- np = &nht->hash_buckets[hash_val];
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock)))) {
- if (n == ndel)
- return neigh_del(n, np, tbl);
- np = &n->next;
- }
- return false;
-}
-
static int neigh_forced_gc(struct neigh_table *tbl)
{
int max_clean = atomic_read(&tbl->gc_entries) -
@@ -276,7 +267,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
remove = true;
write_unlock(&n->lock);
- if (remove && neigh_remove_one(n, tbl))
+ if (remove && neigh_remove_one(n))
shrunk++;
if (shrunk >= max_clean)
break;
@@ -318,7 +309,7 @@ static void neigh_add_timer(struct neighbour *n, unsigned long when)
static int neigh_del_timer(struct neighbour *n)
{
if ((n->nud_state & NUD_IN_TIMER) &&
- del_timer(&n->timer)) {
+ timer_delete(&n->timer)) {
neigh_release(n);
return 1;
}
@@ -380,54 +371,42 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net,
static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
bool skip_perm)
{
- int i;
- struct neigh_hash_table *nht;
+ struct hlist_head *dev_head;
+ struct hlist_node *tmp;
+ struct neighbour *n;
- nht = rcu_dereference_protected(tbl->nht,
- lockdep_is_held(&tbl->lock));
+ dev_head = neigh_get_dev_table(dev, tbl->family);
- for (i = 0; i < (1 << nht->hash_shift); i++) {
- struct neighbour *n;
- struct neighbour __rcu **np = &nht->hash_buckets[i];
+ hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) {
+ if (skip_perm && n->nud_state & NUD_PERMANENT)
+ continue;
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
- if (dev && n->dev != dev) {
- np = &n->next;
- continue;
- }
- if (skip_perm && n->nud_state & NUD_PERMANENT) {
- np = &n->next;
- continue;
- }
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
- write_lock(&n->lock);
- neigh_del_timer(n);
- neigh_mark_dead(n);
- if (refcount_read(&n->refcnt) != 1) {
- /* The most unpleasant situation.
- We must destroy neighbour entry,
- but someone still uses it.
-
- The destroy will be delayed until
- the last user releases us, but
- we must kill timers etc. and move
- it to safe state.
- */
- __skb_queue_purge(&n->arp_queue);
- n->arp_queue_len_bytes = 0;
- WRITE_ONCE(n->output, neigh_blackhole);
- if (n->nud_state & NUD_VALID)
- n->nud_state = NUD_NOARP;
- else
- n->nud_state = NUD_NONE;
- neigh_dbg(2, "neigh %p is stray\n", n);
- }
- write_unlock(&n->lock);
- neigh_cleanup_and_release(n);
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
+ write_lock(&n->lock);
+ neigh_del_timer(n);
+ neigh_mark_dead(n);
+ if (refcount_read(&n->refcnt) != 1) {
+ /* The most unpleasant situation.
+ * We must destroy neighbour entry,
+ * but someone still uses it.
+ *
+ * The destroy will be delayed until
+ * the last user releases us, but
+ * we must kill timers etc. and move
+ * it to safe state.
+ */
+ __skb_queue_purge(&n->arp_queue);
+ n->arp_queue_len_bytes = 0;
+ WRITE_ONCE(n->output, neigh_blackhole);
+ if (n->nud_state & NUD_VALID)
+ n->nud_state = NUD_NOARP;
+ else
+ n->nud_state = NUD_NONE;
+ neigh_dbg(2, "neigh %p is stray\n", n);
}
+ write_unlock(&n->lock);
+ neigh_cleanup_and_release(n);
}
}
@@ -448,7 +427,7 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL,
tbl->family);
if (skb_queue_empty_lockless(&tbl->proxy_queue))
- del_timer_sync(&tbl->proxy_timer);
+ timer_delete_sync(&tbl->proxy_timer);
return 0;
}
@@ -530,27 +509,21 @@ static void neigh_get_hash_rnd(u32 *x)
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
- size_t size = (1 << shift) * sizeof(struct neighbour *);
+ size_t size = (1 << shift) * sizeof(struct hlist_head);
+ struct hlist_head *hash_heads;
struct neigh_hash_table *ret;
- struct neighbour __rcu **buckets;
int i;
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
return NULL;
- if (size <= PAGE_SIZE) {
- buckets = kzalloc(size, GFP_ATOMIC);
- } else {
- buckets = (struct neighbour __rcu **)
- __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
- get_order(size));
- kmemleak_alloc(buckets, size, 1, GFP_ATOMIC);
- }
- if (!buckets) {
+
+ hash_heads = kzalloc(size, GFP_ATOMIC);
+ if (!hash_heads) {
kfree(ret);
return NULL;
}
- ret->hash_buckets = buckets;
+ ret->hash_heads = hash_heads;
ret->hash_shift = shift;
for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
neigh_get_hash_rnd(&ret->hash_rnd[i]);
@@ -562,15 +535,8 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
struct neigh_hash_table *nht = container_of(head,
struct neigh_hash_table,
rcu);
- size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
- struct neighbour __rcu **buckets = nht->hash_buckets;
- if (size <= PAGE_SIZE) {
- kfree(buckets);
- } else {
- kmemleak_free(buckets);
- free_pages((unsigned long)buckets, get_order(size));
- }
+ kfree(nht->hash_heads);
kfree(nht);
}
@@ -589,24 +555,17 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
return old_nht;
for (i = 0; i < (1 << old_nht->hash_shift); i++) {
- struct neighbour *n, *next;
+ struct hlist_node *tmp;
+ struct neighbour *n;
- for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
- lockdep_is_held(&tbl->lock));
- n != NULL;
- n = next) {
+ neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) {
hash = tbl->hash(n->primary_key, n->dev,
new_nht->hash_rnd);
hash >>= (32 - new_nht->hash_shift);
- next = rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock));
- rcu_assign_pointer(n->next,
- rcu_dereference_protected(
- new_nht->hash_buckets[hash],
- lockdep_is_held(&tbl->lock)));
- rcu_assign_pointer(new_nht->hash_buckets[hash], n);
+ hlist_del_rcu(&n->hash);
+ hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]);
}
}
@@ -693,11 +652,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_tbl_unlock;
}
- for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
- lockdep_is_held(&tbl->lock));
- n1 != NULL;
- n1 = rcu_dereference_protected(n1->next,
- lockdep_is_held(&tbl->lock))) {
+ neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) {
if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
if (want_ref)
neigh_hold(n1);
@@ -713,10 +668,11 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
list_add_tail(&n->managed_list, &n->tbl->managed_list);
if (want_ref)
neigh_hold(n);
- rcu_assign_pointer(n->next,
- rcu_dereference_protected(nht->hash_buckets[hash_val],
- lockdep_is_held(&tbl->lock)));
- rcu_assign_pointer(nht->hash_buckets[hash_val], n);
+ hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]);
+
+ hlist_add_head_rcu(&n->dev_list,
+ neigh_get_dev_table(dev, tbl->family));
+
write_unlock_bh(&tbl->lock);
neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
@@ -734,7 +690,9 @@ out_neigh_release:
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, bool want_ref)
{
- return ___neigh_create(tbl, pkey, dev, 0, false, want_ref);
+ bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK);
+
+ return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref);
}
EXPORT_SYMBOL(__neigh_create);
@@ -874,12 +832,10 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
return -ENOENT;
}
-static void neigh_parms_destroy(struct neigh_parms *parms);
-
static inline void neigh_parms_put(struct neigh_parms *parms)
{
if (refcount_dec_and_test(&parms->refcnt))
- neigh_parms_destroy(parms);
+ kfree(parms);
}
/*
@@ -946,10 +902,10 @@ static void neigh_connect(struct neighbour *neigh)
static void neigh_periodic_work(struct work_struct *work)
{
struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
+ struct neigh_hash_table *nht;
+ struct hlist_node *tmp;
struct neighbour *n;
- struct neighbour __rcu **np;
unsigned int i;
- struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
@@ -974,10 +930,7 @@ static void neigh_periodic_work(struct work_struct *work)
goto out;
for (i = 0 ; i < (1 << nht->hash_shift); i++) {
- np = &nht->hash_buckets[i];
-
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
unsigned int state;
write_lock(&n->lock);
@@ -986,7 +939,7 @@ static void neigh_periodic_work(struct work_struct *work)
if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
(n->flags & NTF_EXT_LEARNED)) {
write_unlock(&n->lock);
- goto next_elt;
+ continue;
}
if (time_before(n->used, n->confirmed) &&
@@ -997,18 +950,14 @@ static void neigh_periodic_work(struct work_struct *work)
(state == NUD_FAILED ||
!time_in_range_open(jiffies, n->used,
n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
continue;
}
write_unlock(&n->lock);
-
-next_elt:
- np = &n->next;
}
/*
* It's fine to release lock here, even if hash table
@@ -1082,7 +1031,7 @@ static void neigh_probe(struct neighbour *neigh)
static void neigh_timer_handler(struct timer_list *t)
{
unsigned long now, next;
- struct neighbour *neigh = from_timer(neigh, t, timer);
+ struct neighbour *neigh = timer_container_of(neigh, t, timer);
unsigned int state;
int notify = 0;
@@ -1568,7 +1517,7 @@ out:
return rc;
out_kfree_skb:
rc = -EINVAL;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL);
goto out;
}
EXPORT_SYMBOL(neigh_resolve_output);
@@ -1592,7 +1541,7 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
err = dev_queue_xmit(skb);
else {
err = -EINVAL;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL);
}
return err;
}
@@ -1620,7 +1569,7 @@ static void neigh_managed_work(struct work_struct *work)
static void neigh_proxy_process(struct timer_list *t)
{
- struct neigh_table *tbl = from_timer(tbl, t, proxy_timer);
+ struct neigh_table *tbl = timer_container_of(tbl, t, proxy_timer);
long sched_next = 0;
unsigned long now = jiffies;
struct sk_buff *skb, *n;
@@ -1648,7 +1597,7 @@ static void neigh_proxy_process(struct timer_list *t)
} else if (!sched_next || tdif < sched_next)
sched_next = tdif;
}
- del_timer(&tbl->proxy_timer);
+ timer_delete(&tbl->proxy_timer);
if (sched_next)
mod_timer(&tbl->proxy_timer, jiffies + sched_next);
spin_unlock(&tbl->proxy_queue.lock);
@@ -1679,7 +1628,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
spin_lock(&tbl->proxy_queue.lock);
- if (del_timer(&tbl->proxy_timer)) {
+ if (timer_delete(&tbl->proxy_timer)) {
if (time_before(tbl->proxy_timer.expires, sched_next))
sched_next = tbl->proxy_timer.expires;
}
@@ -1762,14 +1711,9 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
}
EXPORT_SYMBOL(neigh_parms_release);
-static void neigh_parms_destroy(struct neigh_parms *parms)
-{
- kfree(parms);
-}
-
static struct lock_class_key neigh_table_proxy_queue_class;
-static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
+static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
void neigh_table_init(int index, struct neigh_table *tbl)
{
@@ -1826,17 +1770,23 @@ void neigh_table_init(int index, struct neigh_table *tbl)
tbl->last_flush = now;
tbl->last_rand = now + tbl->parms.reachable_time * 20;
- neigh_tables[index] = tbl;
+ rcu_assign_pointer(neigh_tables[index], tbl);
}
EXPORT_SYMBOL(neigh_table_init);
+/*
+ * Only called from ndisc_cleanup(), which means this is dead code
+ * because we can no longer unload the IPv6 module.
+ */
int neigh_table_clear(int index, struct neigh_table *tbl)
{
- neigh_tables[index] = NULL;
+ RCU_INIT_POINTER(neigh_tables[index], NULL);
+ synchronize_rcu();
+
/* It is not clean... Fix it to unload IPv6 module safely */
cancel_delayed_work_sync(&tbl->managed_work);
cancel_delayed_work_sync(&tbl->gc_work);
- del_timer_sync(&tbl->proxy_timer);
+ timer_delete_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family);
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
@@ -1864,10 +1814,10 @@ static struct neigh_table *neigh_find_table(int family)
switch (family) {
case AF_INET:
- tbl = neigh_tables[NEIGH_ARP_TABLE];
+ tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]);
break;
case AF_INET6:
- tbl = neigh_tables[NEIGH_ND_TABLE];
+ tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]);
break;
}
@@ -1949,7 +1899,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
NETLINK_CB(skb).portid, extack);
write_lock_bh(&tbl->lock);
neigh_release(neigh);
- neigh_remove_one(neigh, tbl);
+ neigh_remove_one(neigh);
write_unlock_bh(&tbl->lock);
out:
@@ -2293,6 +2243,7 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_IFINDEX] = { .type = NLA_U32 },
[NDTPA_QUEUE_LEN] = { .type = NLA_U32 },
+ [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 },
[NDTPA_PROXY_QLEN] = { .type = NLA_U32 },
[NDTPA_APP_PROBES] = { .type = NLA_U32 },
[NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
@@ -2331,7 +2282,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
ndtmsg = nlmsg_data(nlh);
for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
- tbl = neigh_tables[tidx];
+ tbl = rcu_dereference_rtnl(neigh_tables[tidx]);
if (!tbl)
continue;
if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
@@ -2479,12 +2430,12 @@ static int neightbl_valid_dump_info(const struct nlmsghdr *nlh,
{
struct ndtmsg *ndtm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) {
+ ndtm = nlmsg_payload(nlh, sizeof(*ndtm));
+ if (!ndtm) {
NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request");
return -EINVAL;
}
- ndtm = nlmsg_data(nlh);
if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) {
NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request");
return -EINVAL;
@@ -2519,7 +2470,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
struct neigh_parms *p;
- tbl = neigh_tables[tidx];
+ tbl = rcu_dereference_rtnl(neigh_tables[tidx]);
if (!tbl)
continue;
@@ -2674,7 +2625,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx)
if (!master_idx)
return false;
- master = dev ? netdev_master_upper_dev_get(dev) : NULL;
+ master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL;
/* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another
* invalid value for ifindex to denote "no master".
@@ -2707,7 +2658,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
{
struct net *net = sock_net(skb->sk);
struct neighbour *n;
- int rc, h, s_h = cb->args[1];
+ int err = 0, h, s_h = cb->args[1];
int idx, s_idx = idx = cb->args[2];
struct neigh_hash_table *nht;
unsigned int flags = NLM_F_MULTI;
@@ -2715,37 +2666,31 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
if (filter->dev_idx || filter->master_idx)
flags |= NLM_F_DUMP_FILTERED;
- rcu_read_lock();
nht = rcu_dereference(tbl->nht);
for (h = s_h; h < (1 << nht->hash_shift); h++) {
if (h > s_h)
s_idx = 0;
- for (n = rcu_dereference(nht->hash_buckets[h]), idx = 0;
- n != NULL;
- n = rcu_dereference(n->next)) {
+ idx = 0;
+ neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) {
if (idx < s_idx || !net_eq(dev_net(n->dev), net))
goto next;
if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
neigh_master_filtered(n->dev, filter->master_idx))
goto next;
- if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH,
- flags) < 0) {
- rc = -1;
+ err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH, flags);
+ if (err < 0)
goto out;
- }
next:
idx++;
}
}
- rc = skb->len;
out:
- rcu_read_unlock();
cb->args[1] = h;
cb->args[2] = idx;
- return rc;
+ return err;
}
static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
@@ -2754,7 +2699,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
{
struct pneigh_entry *n;
struct net *net = sock_net(skb->sk);
- int rc, h, s_h = cb->args[3];
+ int err = 0, h, s_h = cb->args[3];
int idx, s_idx = idx = cb->args[4];
unsigned int flags = NLM_F_MULTI;
@@ -2772,11 +2717,11 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
neigh_master_filtered(n->dev, filter->master_idx))
goto next;
- if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH, flags, tbl) < 0) {
+ err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH, flags, tbl);
+ if (err < 0) {
read_unlock_bh(&tbl->lock);
- rc = -1;
goto out;
}
next:
@@ -2785,12 +2730,10 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
}
read_unlock_bh(&tbl->lock);
- rc = skb->len;
out:
cb->args[3] = h;
cb->args[4] = idx;
- return rc;
-
+ return err;
}
static int neigh_valid_dump_req(const struct nlmsghdr *nlh,
@@ -2804,12 +2747,12 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh,
if (strict_check) {
struct ndmsg *ndm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request");
return -EINVAL;
}
- ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex ||
ndm->ndm_state || ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request");
@@ -2875,11 +2818,13 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack);
if (err < 0 && cb->strict_check)
return err;
+ err = 0;
s_t = cb->args[0];
+ rcu_read_lock();
for (t = 0; t < NEIGH_NR_TABLES; t++) {
- tbl = neigh_tables[t];
+ tbl = rcu_dereference(neigh_tables[t]);
if (!tbl)
continue;
@@ -2895,9 +2840,10 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
if (err < 0)
break;
}
+ rcu_read_unlock();
cb->args[0] = t;
- return skb->len;
+ return err;
}
static int neigh_valid_get_req(const struct nlmsghdr *nlh,
@@ -2909,12 +2855,12 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh,
struct ndmsg *ndm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request");
return -EINVAL;
}
- ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request");
@@ -3094,9 +3040,7 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
- for (n = rcu_dereference(nht->hash_buckets[chain]);
- n != NULL;
- n = rcu_dereference(n->next))
+ neigh_for_each_in_bucket(n, &nht->hash_heads[chain])
cb(n, cookie);
}
read_unlock_bh(&tbl->lock);
@@ -3108,29 +3052,25 @@ EXPORT_SYMBOL(neigh_for_each);
void __neigh_for_each_release(struct neigh_table *tbl,
int (*cb)(struct neighbour *))
{
- int chain;
struct neigh_hash_table *nht;
+ int chain;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
+ struct hlist_node *tmp;
struct neighbour *n;
- struct neighbour __rcu **np;
- np = &nht->hash_buckets[chain];
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) {
int release;
write_lock(&n->lock);
release = cb(n);
if (release) {
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
- } else
- np = &n->next;
+ }
write_unlock(&n->lock);
if (release)
neigh_cleanup_and_release(n);
@@ -3143,14 +3083,15 @@ int neigh_xmit(int index, struct net_device *dev,
const void *addr, struct sk_buff *skb)
{
int err = -EAFNOSUPPORT;
+
if (likely(index < NEIGH_NR_TABLES)) {
struct neigh_table *tbl;
struct neighbour *neigh;
- tbl = neigh_tables[index];
- if (!tbl)
- goto out;
rcu_read_lock();
+ tbl = rcu_dereference(neigh_tables[index]);
+ if (!tbl)
+ goto out_unlock;
if (index == NEIGH_ARP_TABLE) {
u32 key = *((u32 *)addr);
@@ -3166,6 +3107,7 @@ int neigh_xmit(int index, struct net_device *dev,
goto out_kfree_skb;
}
err = READ_ONCE(neigh->output)(neigh, skb);
+out_unlock:
rcu_read_unlock();
}
else if (index == NEIGH_LINK_TABLE) {
@@ -3185,43 +3127,53 @@ EXPORT_SYMBOL(neigh_xmit);
#ifdef CONFIG_PROC_FS
-static struct neighbour *neigh_get_first(struct seq_file *seq)
+static struct neighbour *neigh_get_valid(struct seq_file *seq,
+ struct neighbour *n,
+ loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
+
+ if (!net_eq(dev_net(n->dev), net))
+ return NULL;
+
+ if (state->neigh_sub_iter) {
+ loff_t fakep = 0;
+ void *v;
+
+ v = state->neigh_sub_iter(state, n, pos ? pos : &fakep);
+ if (!v)
+ return NULL;
+ if (pos)
+ return v;
+ }
+
+ if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+ return n;
+
+ if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
+ return n;
+
+ return NULL;
+}
+
+static struct neighbour *neigh_get_first(struct seq_file *seq)
+{
+ struct neigh_seq_state *state = seq->private;
struct neigh_hash_table *nht = state->nht;
- struct neighbour *n = NULL;
- int bucket;
+ struct neighbour *n, *tmp;
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
- for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
- n = rcu_dereference(nht->hash_buckets[bucket]);
- while (n) {
- if (!net_eq(dev_net(n->dev), net))
- goto next;
- if (state->neigh_sub_iter) {
- loff_t fakep = 0;
- void *v;
-
- v = state->neigh_sub_iter(state, n, &fakep);
- if (!v)
- goto next;
- }
- if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
- break;
- if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
- break;
-next:
- n = rcu_dereference(n->next);
+ while (++state->bucket < (1 << nht->hash_shift)) {
+ neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) {
+ tmp = neigh_get_valid(seq, n, NULL);
+ if (tmp)
+ return tmp;
}
-
- if (n)
- break;
}
- state->bucket = bucket;
- return n;
+ return NULL;
}
static struct neighbour *neigh_get_next(struct seq_file *seq,
@@ -3229,46 +3181,28 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
- struct net *net = seq_file_net(seq);
- struct neigh_hash_table *nht = state->nht;
+ struct neighbour *tmp;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
+
if (v)
return n;
}
- n = rcu_dereference(n->next);
- while (1) {
- while (n) {
- if (!net_eq(dev_net(n->dev), net))
- goto next;
- if (state->neigh_sub_iter) {
- void *v = state->neigh_sub_iter(state, n, pos);
- if (v)
- return n;
- goto next;
- }
- if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
- break;
-
- if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
- break;
-next:
- n = rcu_dereference(n->next);
+ hlist_for_each_entry_continue(n, hash) {
+ tmp = neigh_get_valid(seq, n, pos);
+ if (tmp) {
+ n = tmp;
+ goto out;
}
-
- if (n)
- break;
-
- if (++state->bucket >= (1 << nht->hash_shift))
- break;
-
- n = rcu_dereference(nht->hash_buckets[state->bucket]);
}
+ n = neigh_get_first(seq);
+out:
if (n && pos)
--(*pos);
+
return n;
}
@@ -3371,7 +3305,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
struct neigh_seq_state *state = seq->private;
state->tbl = tbl;
- state->bucket = 0;
+ state->bucket = -1;
state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
rcu_read_lock();
@@ -3507,10 +3441,12 @@ static const struct seq_operations neigh_stat_seq_ops = {
static void __neigh_notify(struct neighbour *n, int type, int flags,
u32 pid)
{
- struct net *net = dev_net(n->dev);
struct sk_buff *skb;
int err = -ENOBUFS;
+ struct net *net;
+ rcu_read_lock();
+ net = dev_net_rcu(n->dev);
skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
goto errout;
@@ -3523,10 +3459,11 @@ static void __neigh_notify(struct neighbour *n, int type, int flags,
goto errout;
}
rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
- return;
+ goto out;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+out:
+ rcu_read_unlock();
}
void neigh_app_ns(struct neighbour *n)
@@ -3538,7 +3475,7 @@ EXPORT_SYMBOL(neigh_app_ns);
#ifdef CONFIG_SYSCTL
static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
-static int proc_unres_qlen(struct ctl_table *ctl, int write,
+static int proc_unres_qlen(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int size, ret;
@@ -3573,7 +3510,7 @@ static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
rcu_read_unlock();
}
-static void neigh_proc_update(struct ctl_table *ctl, int write)
+static void neigh_proc_update(const struct ctl_table *ctl, int write)
{
struct net_device *dev = ctl->extra1;
struct neigh_parms *p = ctl->extra2;
@@ -3590,7 +3527,7 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
neigh_copy_dflt_parms(net, p, index);
}
-static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
+static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -3605,7 +3542,7 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
return ret;
}
-static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write,
+static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table tmp = *ctl;
@@ -3621,7 +3558,7 @@ static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int wr
return ret;
}
-int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer,
+int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
@@ -3631,7 +3568,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer,
}
EXPORT_SYMBOL(neigh_proc_dointvec);
-int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer,
+int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3641,7 +3578,7 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer,
}
EXPORT_SYMBOL(neigh_proc_dointvec_jiffies);
-static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
+static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -3651,7 +3588,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
return ret;
}
-int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
+int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3661,7 +3598,7 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
}
EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
-static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
+static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -3671,7 +3608,7 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
return ret;
}
-static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
+static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -3728,7 +3665,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
static struct neigh_sysctl_table {
struct ctl_table_header *sysctl_header;
- struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
+ struct ctl_table neigh_vars[NEIGH_VAR_MAX];
} neigh_sysctl_template __read_mostly = {
.neigh_vars = {
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"),
@@ -3779,7 +3716,6 @@ static struct neigh_sysctl_table {
.extra2 = SYSCTL_INT_MAX,
.proc_handler = proc_dointvec_minmax,
},
- {},
},
};
@@ -3807,8 +3743,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
if (dev) {
dev_name_source = dev->name;
/* Terminate the table early */
- memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
- sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1;
} else {
struct neigh_table *tbl = p->tbl;
@@ -3885,16 +3819,18 @@ EXPORT_SYMBOL(neigh_sysctl_unregister);
#endif /* CONFIG_SYSCTL */
+static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWNEIGH, .doit = neigh_add},
+ {.msgtype = RTM_DELNEIGH, .doit = neigh_delete},
+ {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+ {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info},
+ {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set},
+};
+
static int __init neigh_init(void)
{
- rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0);
-
- rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,
- 0);
- rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0);
-
+ rtnl_register_many(neigh_rtnl_msg_handlers);
return 0;
}
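
Most of the neighbour.c churn above is mechanical: the open-coded singly linked RCU chains (->next plus a __rcu **np walker) become hlist_node/hlist_head pairs, and every entry is additionally linked on a per-device list so neigh_flush_dev() no longer scans the whole hash table. A rough sketch of that pattern is below; struct item and its functions are hypothetical, while the hlist RCU helpers are the real kernel API.

/* Sketch of the hlist pattern neighbour.c switches to; "struct item" is
 * invented, the hlist helpers are the real <linux/rculist.h> API.
 */
#include <linux/rculist.h>

struct item {
	struct hlist_node hash;		/* linkage in the hash bucket */
	struct hlist_node dev_list;	/* linkage in the per-device list */
};

static void item_unlink(struct item *it)
{
	/* O(1) removal: no need to walk the bucket to find the predecessor,
	 * unlike the old open-coded ->next chains.
	 */
	hlist_del_rcu(&it->hash);
	hlist_del_rcu(&it->dev_list);
}

static void item_link(struct item *it, struct hlist_head *bucket,
		      struct hlist_head *dev_head)
{
	hlist_add_head_rcu(&it->hash, bucket);
	hlist_add_head_rcu(&it->dev_list, dev_head);
}

static void dev_list_flush(struct hlist_head *dev_head)
{
	struct hlist_node *tmp;
	struct item *it;

	/* _safe variant because entries are deleted while iterating */
	hlist_for_each_entry_safe(it, tmp, dev_head, dev_list)
		item_unlink(it);
}

Deletion becomes O(1) because hlist nodes carry a pprev back-pointer, which is what lets neigh_remove_one() drop its table argument in the hunks above.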
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index a97eceb84e61..4f0f0709a1cb 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -132,8 +132,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
+ /* Pairs with WRITE_ONCE() in skb_flow_limit() */
if (fl)
- flow_limit_count = fl->count;
+ flow_limit_count = READ_ONCE(fl->count);
rcu_read_unlock();
#endif
@@ -144,10 +145,11 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x\n",
- sd->processed, sd->dropped, sd->time_squeeze, 0,
+ READ_ONCE(sd->processed), atomic_read(&sd->dropped),
+ READ_ONCE(sd->time_squeeze), 0,
0, 0, 0, 0, /* was fastroute */
0, /* was cpu_collision */
- sd->received_rps, flow_limit_count,
+ READ_ONCE(sd->received_rps), flow_limit_count,
input_qlen + process_qlen, (int)seq->index,
input_qlen, process_qlen);
return 0;
@@ -184,7 +186,13 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
}
}
- list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) {
+ list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+
+ list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) {
if (i == pos)
return pt;
++i;
@@ -209,6 +217,7 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ struct net *net = seq_file_net(seq);
struct net_device *dev;
struct packet_type *pt;
struct list_head *nxt;
@@ -231,15 +240,22 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
goto found;
}
}
-
- nxt = net_hotdata.ptype_all.next;
- goto ptype_all;
+ nxt = net->ptype_all.next;
+ goto net_ptype_all;
}
- if (pt->type == htons(ETH_P_ALL)) {
-ptype_all:
- if (nxt != &net_hotdata.ptype_all)
+ if (pt->af_packet_net) {
+net_ptype_all:
+ if (nxt != &net->ptype_all && nxt != &net->ptype_specific)
goto found;
+
+ if (nxt == &net->ptype_all) {
+ /* continue with ->ptype_specific if it's not empty */
+ nxt = net->ptype_specific.next;
+ if (nxt != &net->ptype_specific)
+ goto found;
+ }
+
hash = 0;
nxt = ptype_base[0].next;
} else
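
The softnet_seq_show() hunk above annotates fields that the datapath updates while /proc readers sample them without any lock: READ_ONCE() on the reader side pairs with WRITE_ONCE() on the writer side so the compiler cannot tear, fuse, or refetch the accesses. A minimal illustration follows; the struct and functions are invented, only the macros are the real API.

/* Sketch: one counter updated in the datapath and sampled locklessly,
 * e.g. from a seq_file handler. The my_stats/my_* names are hypothetical.
 */
#include <linux/compiler.h>

struct my_stats {
	unsigned int processed;
};

static void my_stats_inc(struct my_stats *st)			/* writer side */
{
	WRITE_ONCE(st->processed, st->processed + 1);
}

static unsigned int my_stats_read(const struct my_stats *st)	/* lockless reader */
{
	return READ_ONCE(st->processed);	/* pairs with WRITE_ONCE() above */
}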
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e3d7a8cfa20b..1ace0cd01adc 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -23,6 +23,7 @@
#include <linux/of.h>
#include <linux/of_net.h>
#include <linux/cpu.h>
+#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/rps.h>
@@ -32,15 +33,97 @@
#ifdef CONFIG_SYSFS
static const char fmt_hex[] = "%#x\n";
static const char fmt_dec[] = "%d\n";
+static const char fmt_uint[] = "%u\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
-/* Caller holds RTNL or RCU */
+/* Caller holds RTNL, netdev->lock or RCU */
static inline int dev_isalive(const struct net_device *dev)
{
return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED;
}
+/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active,
+ * when unregistering a net device and accessing associated sysfs files. The
+ * potential deadlock is as follows:
+ *
+ * CPU 0                                        CPU 1
+ *
+ * rtnl_lock                                    vfs_read
+ * unregister_netdevice_many                    kernfs_seq_start
+ * device_del / kobject_put                     kernfs_get_active (kn->active++)
+ * kernfs_drain                                 sysfs_kf_seq_show
+ * wait_event(                                  rtnl_lock
+ *   kn->active == KN_DEACTIVATED_BIAS)         -> waits on CPU 0 to release
+ * -> waits on CPU 1 to decrease kn->active     the rtnl lock.
+ *
+ * The historical fix was to use rtnl_trylock with restart_syscall to bail out
+ * of sysfs operations when the lock couldn't be taken. This fixed the above
+ * issue as it allowed CPU 1 to bail out of the ABBA situation.
+ *
+ * But it came with performance issues, as syscalls were being restarted in
+ * loops when there was contention on the rtnl lock, with huge slowdowns in
+ * specific scenarios (e.g. lots of virtual interfaces created and userspace
+ * daemons querying their attributes).
+ *
+ * The idea below is to bail out of the active kernfs_node protection
+ * (kn->active) while trying to take the rtnl lock.
+ *
+ * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The
+ * net device is guaranteed to be alive if this returns successfully.
+ */
+static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr,
+ struct net_device *ndev)
+{
+ struct kernfs_node *kn;
+ int ret = 0;
+
+ /* First, we hold a reference to the net device as the unregistration
+ * path might run in parallel. This will ensure the net device and the
+ * associated sysfs objects won't be freed while we try to take the rtnl
+ * lock.
+ */
+ dev_hold(ndev);
+ /* sysfs_break_active_protection was introduced to allow self-removal of
+ * devices and their associated sysfs files by bailing out of the
+ * sysfs/kernfs protection. We do this here to allow the unregistration
+ * path to complete in parallel. The following takes a reference on the
+ * kobject and the kernfs_node being accessed.
+ *
+ * This works because we hold a reference onto the net device and the
+ * unregistration path will wait for us eventually in netdev_run_todo
+ * (outside an rtnl lock section).
+ */
+ kn = sysfs_break_active_protection(kobj, attr);
+ /* We can now try to take the rtnl lock. This can't deadlock us as the
+ * unregistration path is able to drain sysfs files (kernfs_node) thanks
+ * to the above dance.
+ */
+ if (rtnl_lock_interruptible()) {
+ ret = -ERESTARTSYS;
+ goto unbreak;
+ }
+ /* Check that the device dismantle hasn't started, otherwise deny the
+ * operation.
+ */
+ if (!dev_isalive(ndev)) {
+ rtnl_unlock();
+ ret = -ENODEV;
+ goto unbreak;
+ }
+ /* We are now sure the device dismantle hasn't started and cannot start
+ * before we exit the locking section, as we hold the rtnl lock. There's
+ * no need to keep the sysfs protection broken nor to hold a net device
+ * reference from this point on; both were only needed to take the rtnl
+ * lock.
+ */
+unbreak:
+ sysfs_unbreak_active_protection(kn);
+ dev_put(ndev);
+
+ return ret;
+}
+
/* use same locking rules as GIF* ioctl's */
static ssize_t netdev_show(const struct device *dev,
struct device_attribute *attr, char *buf,
@@ -94,16 +177,46 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
if (ret)
goto err;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ goto err;
+
+ ret = (*set)(netdev, new);
+ if (ret == 0)
+ ret = len;
+
+ rtnl_unlock();
+ err:
+ return ret;
+}
+
+/* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */
+static ssize_t
+netdev_lock_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len,
+ int (*set)(struct net_device *, unsigned long))
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
+ unsigned long new;
+ int ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 0, &new);
+ if (ret)
+ return ret;
+
+ netdev_lock(netdev);
if (dev_isalive(netdev)) {
ret = (*set)(netdev, new);
if (ret == 0)
ret = len;
}
- rtnl_unlock();
- err:
+ netdev_unlock(netdev);
+
return ret;
}
@@ -189,7 +302,7 @@ static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
struct net_device *netdev = to_net_dev(dev);
/* The check is also done in change_carrier; this helps returning early
- * without hitting the trylock/restart in netdev_store.
+ * without hitting the locking section in netdev_store.
*/
if (!netdev->netdev_ops->ndo_change_carrier)
return -EOPNOTSUPP;
@@ -201,11 +314,13 @@ static ssize_t carrier_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
- int ret = -EINVAL;
+ int ret;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+ ret = -EINVAL;
if (netif_running(netdev)) {
/* Synchronize carrier state with link watch,
* see also rtnl_getlink().
@@ -214,8 +329,8 @@ static ssize_t carrier_show(struct device *dev,
ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev));
}
- rtnl_unlock();
+ rtnl_unlock();
return ret;
}
static DEVICE_ATTR_RW(carrier);
@@ -227,15 +342,17 @@ static ssize_t speed_show(struct device *dev,
int ret = -EINVAL;
/* The check is also done in __ethtool_get_link_ksettings; this helps
- * returning early without hitting the trylock/restart below.
+ * returning early without hitting the locking section below.
*/
if (!netdev->ethtool_ops->get_link_ksettings)
return ret;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (netif_running(netdev) && netif_device_present(netdev)) {
+ ret = -EINVAL;
+ if (netif_running(netdev)) {
struct ethtool_link_ksettings cmd;
if (!__ethtool_get_link_ksettings(netdev, &cmd))
@@ -253,14 +370,16 @@ static ssize_t duplex_show(struct device *dev,
int ret = -EINVAL;
/* The check is also done in __ethtool_get_link_ksettings; this helps
- * returning early without hitting the trylock/restart below.
+ * returning early without hitting the locking section below.
*/
if (!netdev->ethtool_ops->get_link_ksettings)
return ret;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+ ret = -EINVAL;
if (netif_running(netdev)) {
struct ethtool_link_ksettings cmd;
@@ -408,7 +527,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
{
- WRITE_ONCE(dev->gro_flush_timeout, val);
+ netdev_set_gro_flush_timeout(dev, val);
return 0;
}
@@ -419,13 +538,16 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return netdev_store(dev, attr, buf, len, change_gro_flush_timeout);
+ return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout);
}
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val)
{
- WRITE_ONCE(dev->napi_defer_hard_irqs, val);
+ if (val > S32_MAX)
+ return -ERANGE;
+
+ netdev_set_defer_hard_irqs(dev, (u32)val);
return 0;
}
@@ -436,9 +558,10 @@ static ssize_t napi_defer_hard_irqs_store(struct device *dev,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs);
+ return netdev_lock_store(dev, attr, buf, len,
+ change_napi_defer_hard_irqs);
}
-NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec);
+NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint);
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
@@ -446,7 +569,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
struct net_device *netdev = to_net_dev(dev);
struct net *net = dev_net(netdev);
size_t count = len;
- ssize_t ret = 0;
+ ssize_t ret;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -455,16 +578,15 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
if (len > 0 && buf[len - 1] == '\n')
--count;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (dev_isalive(netdev)) {
- ret = dev_set_alias(netdev, buf, count);
- if (ret < 0)
- goto err;
- ret = len;
- netdev_state_change(netdev);
- }
+ ret = dev_set_alias(netdev, buf, count);
+ if (ret < 0)
+ goto err;
+ ret = len;
+ netdev_state_change(netdev);
err:
rtnl_unlock();
@@ -476,7 +598,7 @@ static ssize_t ifalias_show(struct device *dev,
{
const struct net_device *netdev = to_net_dev(dev);
char tmp[IFALIASZ];
- ssize_t ret = 0;
+ ssize_t ret;
ret = dev_get_alias(netdev, tmp, sizeof(tmp));
if (ret > 0)
@@ -516,24 +638,23 @@ static ssize_t phys_port_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
- ssize_t ret = -EINVAL;
+ struct netdev_phys_item_id ppid;
+ ssize_t ret;
/* The check is also done in dev_get_phys_port_id; this helps returning
- * early without hitting the trylock/restart below.
+ * early without hitting the locking section below.
*/
if (!netdev->netdev_ops->ndo_get_phys_port_id)
return -EOPNOTSUPP;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (dev_isalive(netdev)) {
- struct netdev_phys_item_id ppid;
+ ret = dev_get_phys_port_id(netdev, &ppid);
+ if (!ret)
+ ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
- ret = dev_get_phys_port_id(netdev, &ppid);
- if (!ret)
- ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
- }
rtnl_unlock();
return ret;
@@ -544,25 +665,24 @@ static ssize_t phys_port_name_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
- ssize_t ret = -EINVAL;
+ char name[IFNAMSIZ];
+ ssize_t ret;
/* The checks are also done in dev_get_phys_port_name; this helps
- * returning early without hitting the trylock/restart below.
+ * returning early without hitting the locking section below.
*/
if (!netdev->netdev_ops->ndo_get_phys_port_name &&
!netdev->devlink_port)
return -EOPNOTSUPP;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (dev_isalive(netdev)) {
- char name[IFNAMSIZ];
+ ret = dev_get_phys_port_name(netdev, name, sizeof(name));
+ if (!ret)
+ ret = sysfs_emit(buf, "%s\n", name);
- ret = dev_get_phys_port_name(netdev, name, sizeof(name));
- if (!ret)
- ret = sysfs_emit(buf, "%s\n", name);
- }
rtnl_unlock();
return ret;
@@ -573,26 +693,25 @@ static ssize_t phys_switch_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
- ssize_t ret = -EINVAL;
+ struct netdev_phys_item_id ppid = { };
+ ssize_t ret;
/* The checks are also done in dev_get_phys_port_name; this helps
- * returning early without hitting the trylock/restart below. This works
+ * returning early without hitting the locking section below. This works
* because recurse is false when calling dev_get_port_parent_id.
*/
if (!netdev->netdev_ops->ndo_get_port_parent_id &&
!netdev->devlink_port)
return -EOPNOTSUPP;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (dev_isalive(netdev)) {
- struct netdev_phys_item_id ppid = { };
+ ret = dev_get_port_parent_id(netdev, &ppid, false);
+ if (!ret)
+ ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
- ret = dev_get_port_parent_id(netdev, &ppid, false);
- if (!ret)
- ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
- }
rtnl_unlock();
return ret;
@@ -605,13 +724,13 @@ static ssize_t threaded_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
ssize_t ret = -EINVAL;
- if (!rtnl_trylock())
- return restart_syscall();
+ rcu_read_lock();
if (dev_isalive(netdev))
- ret = sysfs_emit(buf, fmt_dec, netdev->threaded);
+ ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded));
+
+ rcu_read_unlock();
- rtnl_unlock();
return ret;
}
@@ -634,7 +753,7 @@ static ssize_t threaded_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return netdev_store(dev, attr, buf, len, modify_napi_threaded);
+ return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded);
}
static DEVICE_ATTR_RW(threaded);
@@ -937,7 +1056,7 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
rcu_read_lock();
flow_table = rcu_dereference(queue->rps_flow_table);
if (flow_table)
- val = (unsigned long)flow_table->mask + 1;
+ val = 1UL << flow_table->log;
rcu_read_unlock();
return sysfs_emit(buf, "%lu\n", val);
@@ -990,7 +1109,7 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
if (!table)
return -ENOMEM;
- table->mask = mask;
+ table->log = ilog2(mask) + 1;
for (count = 0; count <= mask; count++)
table->flows[count].cpu = RPS_NO_CPU;
} else {
@@ -1056,7 +1175,7 @@ static const void *rx_queue_namespace(const struct kobject *kobj)
struct device *dev = &queue->dev->dev;
const void *ns = NULL;
- if (dev->class && dev->class->ns_type)
+ if (dev->class && dev->class->namespace)
ns = dev->class->namespace(dev);
return ns;
@@ -1073,7 +1192,6 @@ static void rx_queue_get_ownership(const struct kobject *kobj,
static const struct kobj_type rx_queue_ktype = {
.sysfs_ops = &rx_queue_sysfs_ops,
.release = rx_queue_release,
- .default_groups = rx_queue_default_groups,
.namespace = rx_queue_namespace,
.get_ownership = rx_queue_get_ownership,
};
@@ -1096,6 +1214,22 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
struct kobject *kobj = &queue->kobj;
int error = 0;
+ /* Rx queues are cleared in rx_queue_release to allow later
+ * re-registration. This is triggered when their kobj refcount is
+ * dropped.
+ *
+ * If a queue is removed while both a read (or write) operation and the
+ * re-addition of the same queue are pending (waiting on rtnl_lock), it
+ * might happen that the re-addition executes before the read, making
+ * the initial removal never happen (the queue's kobj refcount won't
+ * drop enough because of the pending read). In such a rare case, return
+ * to allow the removal operation to complete.
+ */
+ if (unlikely(kobj->state_initialized)) {
+ netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed");
+ return -EAGAIN;
+ }
+
/* Kobject_put later will trigger rx_queue_release call which
* decreases dev refcount: Take that reference here
*/
@@ -1107,20 +1241,27 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
if (error)
goto err;
+ queue->groups = rx_queue_default_groups;
+ error = sysfs_create_groups(kobj, queue->groups);
+ if (error)
+ goto err;
+
if (dev->sysfs_rx_queue_group) {
error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
if (error)
- goto err;
+ goto err_default_groups;
}
error = rx_queue_default_mask(dev, queue);
if (error)
- goto err;
+ goto err_default_groups;
kobject_uevent(kobj, KOBJ_ADD);
return error;
+err_default_groups:
+ sysfs_remove_groups(kobj, queue->groups);
err:
kobject_put(kobj);
return error;
@@ -1165,12 +1306,14 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
}
while (--i >= new_num) {
- struct kobject *kobj = &dev->_rx[i].kobj;
+ struct netdev_rx_queue *queue = &dev->_rx[i];
+ struct kobject *kobj = &queue->kobj;
if (!refcount_read(&dev_net(dev)->ns.count))
kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
+ sysfs_remove_groups(kobj, queue->groups);
kobject_put(kobj);
}
@@ -1209,9 +1352,11 @@ static int net_rx_queue_change_owner(struct net_device *dev, int num,
*/
struct netdev_queue_attribute {
struct attribute attr;
- ssize_t (*show)(struct netdev_queue *queue, char *buf);
- ssize_t (*store)(struct netdev_queue *queue,
- const char *buf, size_t len);
+ ssize_t (*show)(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf);
+ ssize_t (*store)(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len);
};
#define to_netdev_queue_attr(_attr) \
container_of(_attr, struct netdev_queue_attribute, attr)
@@ -1228,7 +1373,7 @@ static ssize_t netdev_queue_attr_show(struct kobject *kobj,
if (!attribute->show)
return -EIO;
- return attribute->show(queue, buf);
+ return attribute->show(kobj, attr, queue, buf);
}
static ssize_t netdev_queue_attr_store(struct kobject *kobj,
@@ -1242,7 +1387,7 @@ static ssize_t netdev_queue_attr_store(struct kobject *kobj,
if (!attribute->store)
return -EIO;
- return attribute->store(queue, buf, count);
+ return attribute->store(kobj, attr, queue, buf, count);
}
static const struct sysfs_ops netdev_queue_sysfs_ops = {
@@ -1250,7 +1395,8 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
.store = netdev_queue_attr_store,
};
-static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf)
+static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout);
@@ -1268,18 +1414,18 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
return i;
}
-static ssize_t traffic_class_show(struct netdev_queue *queue,
- char *buf)
+static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
- int num_tc, tc;
- int index;
+ int num_tc, tc, index, ret;
if (!netif_is_multiqueue(dev))
return -ENOENT;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
+ if (ret)
+ return ret;
index = get_netdev_queue_index(queue);
@@ -1306,24 +1452,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
}
#ifdef CONFIG_XPS
-static ssize_t tx_maxrate_show(struct netdev_queue *queue,
- char *buf)
+static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
return sysfs_emit(buf, "%lu\n", queue->tx_maxrate);
}
-static ssize_t tx_maxrate_store(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
- struct net_device *dev = queue->dev;
int err, index = get_netdev_queue_index(queue);
+ struct net_device *dev = queue->dev;
u32 rate = 0;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
/* The check is also done later; this helps returning early without
- * hitting the trylock/restart below.
+ * hitting the locking section below.
*/
if (!dev->netdev_ops->ndo_set_tx_maxrate)
return -EOPNOTSUPP;
@@ -1332,18 +1479,23 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue,
if (err < 0)
return err;
- if (!rtnl_trylock())
- return restart_syscall();
+ err = sysfs_rtnl_lock(kobj, attr, dev);
+ if (err)
+ return err;
err = -EOPNOTSUPP;
+ netdev_lock_ops(dev);
if (dev->netdev_ops->ndo_set_tx_maxrate)
err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate);
+ netdev_unlock_ops(dev);
- rtnl_unlock();
if (!err) {
queue->tx_maxrate = rate;
+ rtnl_unlock();
return len;
}
+
+ rtnl_unlock();
return err;
}
@@ -1387,16 +1539,17 @@ static ssize_t bql_set(const char *buf, const size_t count,
return count;
}
-static ssize_t bql_show_hold_time(struct netdev_queue *queue,
- char *buf)
+static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
}
-static ssize_t bql_set_hold_time(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
struct dql *dql = &queue->dql;
unsigned int value;
@@ -1415,15 +1568,17 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
= __ATTR(hold_time, 0644,
bql_show_hold_time, bql_set_hold_time);
-static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf)
+static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
- return sprintf(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs));
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs));
}
-static ssize_t bql_set_stall_thrs(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
struct dql *dql = &queue->dql;
unsigned int value;
@@ -1449,13 +1604,15 @@ static ssize_t bql_set_stall_thrs(struct netdev_queue *queue,
static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init =
__ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs);
-static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf)
+static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
- return sprintf(buf, "%u\n", READ_ONCE(queue->dql.stall_max));
+ return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max));
}
-static ssize_t bql_set_stall_max(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
WRITE_ONCE(queue->dql.stall_max, 0);
return len;
@@ -1464,18 +1621,19 @@ static ssize_t bql_set_stall_max(struct netdev_queue *queue,
static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init =
__ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max);
-static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf)
+static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
- return sprintf(buf, "%lu\n", dql->stall_cnt);
+ return sysfs_emit(buf, "%lu\n", dql->stall_cnt);
}
static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init =
__ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL);
-static ssize_t bql_show_inflight(struct netdev_queue *queue,
- char *buf)
+static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
@@ -1486,13 +1644,16 @@ static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init =
__ATTR(inflight, 0444, bql_show_inflight, NULL);
#define BQL_ATTR(NAME, FIELD) \
-static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
- char *buf) \
+static ssize_t bql_show_ ## NAME(struct kobject *kobj, \
+ struct attribute *attr, \
+ struct netdev_queue *queue, char *buf) \
{ \
return bql_show(buf, queue->dql.FIELD); \
} \
\
-static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
+static ssize_t bql_set_ ## NAME(struct kobject *kobj, \
+ struct attribute *attr, \
+ struct netdev_queue *queue, \
const char *buf, size_t len) \
{ \
return bql_set(buf, len, &queue->dql.FIELD); \
@@ -1524,7 +1685,7 @@ static const struct attribute_group dql_group = {
};
#else
/* Fake declaration, all the code using it should be dead */
-extern const struct attribute_group dql_group;
+static const struct attribute_group dql_group = {};
#endif /* CONFIG_BQL */
#ifdef CONFIG_XPS
@@ -1578,19 +1739,21 @@ out_no_maps:
return len < PAGE_SIZE ? len : -EINVAL;
}
-static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf)
+static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
unsigned int index;
- int len, tc;
+ int len, tc, ret;
if (!netif_is_multiqueue(dev))
return -ENOENT;
index = get_netdev_queue_index(queue);
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
+ if (ret)
+ return ret;
/* If queue belongs to subordinate dev use its map */
dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
@@ -1601,18 +1764,21 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf)
return -EINVAL;
}
- /* Make sure the subordinate device can't be freed */
- get_device(&dev->dev);
+ /* Increase the net device refcnt to make sure it won't be freed while
+ * xps_queue_show is running.
+ */
+ dev_hold(dev);
rtnl_unlock();
len = xps_queue_show(dev, index, tc, buf, XPS_CPUS);
- put_device(&dev->dev);
+ dev_put(dev);
return len;
}
-static ssize_t xps_cpus_store(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
struct net_device *dev = queue->dev;
unsigned int index;
@@ -1636,9 +1802,10 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
return err;
}
- if (!rtnl_trylock()) {
+ err = sysfs_rtnl_lock(kobj, attr, dev);
+ if (err) {
free_cpumask_var(mask);
- return restart_syscall();
+ return err;
}
err = netif_set_xps_queue(dev, mask, index);
@@ -1652,26 +1819,34 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
-static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
unsigned int index;
- int tc;
+ int tc, ret;
index = get_netdev_queue_index(queue);
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(kobj, attr, dev);
+ if (ret)
+ return ret;
tc = netdev_txq_to_tc(dev, index);
+
+ /* Increase the net device refcnt to make sure it won't be freed while
+ * xps_queue_show is running.
+ */
+ dev_hold(dev);
rtnl_unlock();
- if (tc < 0)
- return -EINVAL;
- return xps_queue_show(dev, index, tc, buf, XPS_RXQS);
+ ret = tc >= 0 ? xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL;
+ dev_put(dev);
+ return ret;
}
-static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
size_t len)
{
struct net_device *dev = queue->dev;
@@ -1695,9 +1870,10 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
return err;
}
- if (!rtnl_trylock()) {
+ err = sysfs_rtnl_lock(kobj, attr, dev);
+ if (err) {
bitmap_free(mask);
- return restart_syscall();
+ return err;
}
cpus_read_lock();
@@ -1740,7 +1916,7 @@ static const void *netdev_queue_namespace(const struct kobject *kobj)
struct device *dev = &queue->dev->dev;
const void *ns = NULL;
- if (dev->class && dev->class->ns_type)
+ if (dev->class && dev->class->namespace)
ns = dev->class->namespace(dev);
return ns;
@@ -1757,15 +1933,13 @@ static void netdev_queue_get_ownership(const struct kobject *kobj,
static const struct kobj_type netdev_queue_ktype = {
.sysfs_ops = &netdev_queue_sysfs_ops,
.release = netdev_queue_release,
- .default_groups = netdev_queue_default_groups,
.namespace = netdev_queue_namespace,
.get_ownership = netdev_queue_get_ownership,
};
static bool netdev_uses_bql(const struct net_device *dev)
{
- if (dev->features & NETIF_F_LLTX ||
- dev->priv_flags & IFF_NO_QUEUE)
+ if (dev->lltx || (dev->priv_flags & IFF_NO_QUEUE))
return false;
return IS_ENABLED(CONFIG_BQL);
@@ -1777,6 +1951,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
struct kobject *kobj = &queue->kobj;
int error = 0;
+ /* Tx queues are cleared in netdev_queue_release to allow later
+ * re-registration. This is triggered when their kobj refcount is
+ * dropped.
+ *
+	 * If a queue is removed while both a read (or write) operation and the
+	 * re-addition of the same queue are pending (waiting on rtnl_lock),
+	 * it might happen that the re-addition will execute before the read,
+	 * making the initial removal never happen (the queue's kobj refcount
+	 * won't drop enough because of the pending read). In such a rare case,
+ * return to allow the removal operation to complete.
+ */
+ if (unlikely(kobj->state_initialized)) {
+ netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed");
+ return -EAGAIN;
+ }
+
/* Kobject_put later will trigger netdev_queue_release call
* which decreases dev refcount: Take that reference here
*/
@@ -1788,15 +1978,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
if (error)
goto err;
+ queue->groups = netdev_queue_default_groups;
+ error = sysfs_create_groups(kobj, queue->groups);
+ if (error)
+ goto err;
+
if (netdev_uses_bql(dev)) {
error = sysfs_create_group(kobj, &dql_group);
if (error)
- goto err;
+ goto err_default_groups;
}
kobject_uevent(kobj, KOBJ_ADD);
return 0;
+err_default_groups:
+ sysfs_remove_groups(kobj, queue->groups);
err:
kobject_put(kobj);
return error;
@@ -1851,6 +2048,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
if (netdev_uses_bql(dev))
sysfs_remove_group(&queue->kobj, &dql_group);
+ sysfs_remove_groups(&queue->kobj, queue->groups);
kobject_put(&queue->kobj);
}
@@ -1950,8 +2148,10 @@ static void remove_queue_kobjects(struct net_device *dev)
net_rx_queue_update_kobjects(dev, real_rx, 0);
netdev_queue_update_kobjects(dev, real_tx, 0);
+ netdev_lock_ops(dev);
dev->real_num_rx_queues = 0;
dev->real_num_tx_queues = 0;
+ netdev_unlock_ops(dev);
#ifdef CONFIG_SYSFS
kset_unregister(dev->queues_kset);
#endif
@@ -2028,7 +2228,7 @@ static void netdev_release(struct device *d)
* device is dead and about to be freed.
*/
kfree(rcu_access_pointer(dev->ifalias));
- netdev_freemem(dev);
+ kvfree(dev);
}
static const void *net_namespace(const struct device *d)
@@ -2046,7 +2246,7 @@ static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid)
net_ns_get_ownership(net, uid, gid);
}
-static struct class net_class __ro_after_init = {
+static const struct class net_class = {
.name = "net",
.dev_release = netdev_release,
.dev_groups = net_class_groups,
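
The net-sysfs.c hunks above replace the rtnl_trylock()/restart_syscall() dance in the per-queue handlers with sysfs_rtnl_lock() plus the per-device ops lock. Below is a minimal sketch of the resulting store-handler shape, assuming the helpers behave exactly as they are used in the hunks above; the handler name and the placeholder ndo call are illustrative, not kernel API.

/* Sketch only: mirrors tx_maxrate_store() above. example_store() and the
 * placeholder ndo invocation are hypothetical; sysfs_rtnl_lock(),
 * netdev_lock_ops() and netdev_unlock_ops() are used as in the diff.
 */
static ssize_t example_store(struct kobject *kobj, struct attribute *attr,
			     struct netdev_queue *queue, const char *buf,
			     size_t len)
{
	struct net_device *dev = queue->dev;
	int err;

	/* Interruptible RTNL acquisition tied to this sysfs file, replacing
	 * the old trylock/restart_syscall() loop.
	 */
	err = sysfs_rtnl_lock(kobj, attr, dev);
	if (err)
		return err;

	netdev_lock_ops(dev);		/* instance lock around ndo_* calls */
	err = 0;			/* placeholder: invoke the ndo here */
	netdev_unlock_ops(dev);

	rtnl_unlock();
	return err ? err : len;
}
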
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 6aef976bc1da..f2fa34b1d78d 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -23,7 +23,7 @@
#include <linux/net_dropmon.h>
#include <linux/slab.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <asm/bitops.h>
#define CREATE_TRACE_POINTS
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index f0540c557515..ae54f26709ca 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -56,7 +56,6 @@ static bool init_net_initialized;
* outside.
*/
DECLARE_RWSEM(pernet_ops_rwsem);
-EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
#define MIN_PERNET_OPS_ID \
((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
@@ -69,12 +68,15 @@ DEFINE_COOKIE(net_cookie);
static struct net_generic *net_alloc_generic(void)
{
+ unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs);
+ unsigned int generic_size;
struct net_generic *ng;
- unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+
+ generic_size = offsetof(struct net_generic, ptr[gen_ptrs]);
ng = kzalloc(generic_size, GFP_KERNEL);
if (ng)
- ng->s.len = max_gen_ptrs;
+ ng->s.len = gen_ptrs;
return ng;
}
@@ -122,7 +124,7 @@ static int ops_init(const struct pernet_operations *ops, struct net *net)
int err = -ENOMEM;
void *data = NULL;
- if (ops->id && ops->size) {
+ if (ops->id) {
data = kzalloc(ops->size, GFP_KERNEL);
if (!data)
goto out;
@@ -137,7 +139,7 @@ static int ops_init(const struct pernet_operations *ops, struct net *net)
if (!err)
return 0;
- if (ops->id && ops->size) {
+ if (ops->id) {
ng = rcu_dereference_protected(net->gen,
lockdep_is_held(&pernet_ops_rwsem));
ng->ptr[*ops->id] = NULL;
@@ -161,16 +163,45 @@ static void ops_pre_exit_list(const struct pernet_operations *ops,
}
}
+static void ops_exit_rtnl_list(const struct list_head *ops_list,
+ const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ const struct pernet_operations *saved_ops = ops;
+ LIST_HEAD(dev_kill_list);
+ struct net *net;
+
+ rtnl_lock();
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ __rtnl_net_lock(net);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list) {
+ if (ops->exit_rtnl)
+ ops->exit_rtnl(net, &dev_kill_list);
+ }
+
+ __rtnl_net_unlock(net);
+ }
+
+ unregister_netdevice_many(&dev_kill_list);
+
+ rtnl_unlock();
+}
+
static void ops_exit_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
- struct net *net;
if (ops->exit) {
+ struct net *net;
+
list_for_each_entry(net, net_exit_list, exit_list) {
ops->exit(net);
cond_resched();
}
}
+
if (ops->exit_batch)
ops->exit_batch(net_exit_list);
}
@@ -179,12 +210,63 @@ static void ops_free_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
struct net *net;
- if (ops->size && ops->id) {
+
+ if (ops->id) {
list_for_each_entry(net, net_exit_list, exit_list)
kfree(net_generic(net, *ops->id));
}
}
+static void ops_undo_list(const struct list_head *ops_list,
+ const struct pernet_operations *ops,
+ struct list_head *net_exit_list,
+ bool expedite_rcu)
+{
+ const struct pernet_operations *saved_ops;
+ bool hold_rtnl = false;
+
+ if (!ops)
+ ops = list_entry(ops_list, typeof(*ops), list);
+
+ saved_ops = ops;
+
+ list_for_each_entry_continue_reverse(ops, ops_list, list) {
+ hold_rtnl |= !!ops->exit_rtnl;
+ ops_pre_exit_list(ops, net_exit_list);
+ }
+
+ /* Another CPU might be rcu-iterating the list, wait for it.
+ * This needs to be before calling the exit() notifiers, so the
+ * rcu_barrier() after ops_undo_list() isn't sufficient alone.
+ * Also the pre_exit() and exit() methods need this barrier.
+ */
+ if (expedite_rcu)
+ synchronize_rcu_expedited();
+ else
+ synchronize_rcu();
+
+ if (hold_rtnl)
+ ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list)
+ ops_exit_list(ops, net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list)
+ ops_free_list(ops, net_exit_list);
+}
+
+static void ops_undo_single(struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ LIST_HEAD(ops_list);
+
+ list_add(&ops->list, &ops_list);
+ ops_undo_list(&ops_list, NULL, net_exit_list, false);
+ list_del(&ops->list);
+}
+
/* should be called with nsid_lock held */
static int alloc_netid(struct net *net, struct net *peer, int reqid)
{
@@ -305,36 +387,56 @@ struct net *get_net_ns_by_id(const struct net *net, int id)
}
EXPORT_SYMBOL_GPL(get_net_ns_by_id);
+static __net_init void preinit_net_sysctl(struct net *net)
+{
+ net->core.sysctl_somaxconn = SOMAXCONN;
+ /* Limits per socket sk_omem_alloc usage.
+ * TCP zerocopy regular usage needs 128 KB.
+ */
+ net->core.sysctl_optmem_max = 128 * 1024;
+ net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
+ net->core.sysctl_tstamp_allow_data = 1;
+}
+
/* init code that must occur even if setup_net() is not called. */
-static __net_init void preinit_net(struct net *net)
+static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
{
+ refcount_set(&net->passive, 1);
+ refcount_set(&net->ns.count, 1);
+ ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt");
+
+ get_random_bytes(&net->hash_mix, sizeof(u32));
+ net->dev_base_seq = 1;
+ net->user_ns = user_ns;
+
+ idr_init(&net->netns_ids);
+ spin_lock_init(&net->nsid_lock);
+ mutex_init(&net->ipv4.ra_mutex);
+
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+ mutex_init(&net->rtnl_mutex);
+ lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
+#endif
+
+ INIT_LIST_HEAD(&net->ptype_all);
+ INIT_LIST_HEAD(&net->ptype_specific);
+ preinit_net_sysctl(net);
}
/*
* setup_net runs the initializers for the network namespace object.
*/
-static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
+static __net_init int setup_net(struct net *net)
{
/* Must be called with pernet_ops_rwsem held */
- const struct pernet_operations *ops, *saved_ops;
+ const struct pernet_operations *ops;
LIST_HEAD(net_exit_list);
- LIST_HEAD(dev_kill_list);
int error = 0;
- refcount_set(&net->ns.count, 1);
- ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
-
- refcount_set(&net->passive, 1);
- get_random_bytes(&net->hash_mix, sizeof(u32));
preempt_disable();
net->net_cookie = gen_cookie_next(&net_cookie);
preempt_enable();
- net->dev_base_seq = 1;
- net->user_ns = user_ns;
- idr_init(&net->netns_ids);
- spin_lock_init(&net->nsid_lock);
- mutex_init(&net->ipv4.ra_mutex);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
@@ -352,59 +454,11 @@ out_undo:
* for the pernet modules whose init functions did not fail.
*/
list_add(&net->exit_list, &net_exit_list);
- saved_ops = ops;
- list_for_each_entry_continue_reverse(ops, &pernet_list, list)
- ops_pre_exit_list(ops, &net_exit_list);
-
- synchronize_rcu();
-
- ops = saved_ops;
- rtnl_lock();
- list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
- if (ops->exit_batch_rtnl)
- ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
- }
- unregister_netdevice_many(&dev_kill_list);
- rtnl_unlock();
-
- ops = saved_ops;
- list_for_each_entry_continue_reverse(ops, &pernet_list, list)
- ops_exit_list(ops, &net_exit_list);
-
- ops = saved_ops;
- list_for_each_entry_continue_reverse(ops, &pernet_list, list)
- ops_free_list(ops, &net_exit_list);
-
+ ops_undo_list(&pernet_list, ops, &net_exit_list, false);
rcu_barrier();
goto out;
}
-static int __net_init net_defaults_init_net(struct net *net)
-{
- net->core.sysctl_somaxconn = SOMAXCONN;
- /* Limits per socket sk_omem_alloc usage.
- * TCP zerocopy regular usage needs 128 KB.
- */
- net->core.sysctl_optmem_max = 128 * 1024;
- net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
-
- return 0;
-}
-
-static struct pernet_operations net_defaults_ops = {
- .init = net_defaults_init_net,
-};
-
-static __init int net_defaults_init(void)
-{
- if (register_pernet_subsys(&net_defaults_ops))
- panic("Cannot initialize net default settings");
-
- return 0;
-}
-
-core_initcall(net_defaults_init);
-
#ifdef CONFIG_NET_NS
static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
{
@@ -453,7 +507,22 @@ out_free:
goto out;
}
-static void net_free(struct net *net)
+static LLIST_HEAD(defer_free_list);
+
+static void net_complete_free(void)
+{
+ struct llist_node *kill_list;
+ struct net *net, *next;
+
+ /* Get the list of namespaces to free from last round. */
+ kill_list = llist_del_all(&defer_free_list);
+
+ llist_for_each_entry_safe(net, next, kill_list, defer_free_list)
+ kmem_cache_free(net_cachep, net);
+
+}
+
+void net_passive_dec(struct net *net)
{
if (refcount_dec_and_test(&net->passive)) {
kfree(rcu_access_pointer(net->gen));
@@ -461,7 +530,8 @@ static void net_free(struct net *net)
/* There should not be any trackers left there. */
ref_tracker_dir_exit(&net->notrefcnt_tracker);
- kmem_cache_free(net_cachep, net);
+ /* Wait for an extra rcu_barrier() before final free. */
+ llist_add(&net->defer_free_list, &defer_free_list);
}
}
@@ -470,7 +540,7 @@ void net_drop_ns(void *p)
struct net *net = (struct net *)p;
if (net)
- net_free(net);
+ net_passive_dec(net);
}
struct net *copy_net_ns(unsigned long flags,
@@ -493,8 +563,7 @@ struct net *copy_net_ns(unsigned long flags,
goto dec_ucounts;
}
- preinit_net(net);
- refcount_set(&net->passive, 1);
+ preinit_net(net, user_ns);
net->ucounts = ucounts;
get_user_ns(user_ns);
@@ -502,7 +571,7 @@ struct net *copy_net_ns(unsigned long flags,
if (rv < 0)
goto put_userns;
- rv = setup_net(net, user_ns);
+ rv = setup_net(net);
up_read(&pernet_ops_rwsem);
@@ -512,7 +581,7 @@ put_userns:
key_remove_domain(net->key_domain);
#endif
put_user_ns(user_ns);
- net_free(net);
+ net_passive_dec(net);
dec_ucounts:
dec_net_namespaces(ucounts);
return ERR_PTR(rv);
@@ -577,13 +646,15 @@ static void unhash_nsid(struct net *net, struct net *last)
static LLIST_HEAD(cleanup_list);
+struct task_struct *cleanup_net_task;
+
static void cleanup_net(struct work_struct *work)
{
- const struct pernet_operations *ops;
- struct net *net, *tmp, *last;
struct llist_node *net_kill_list;
+ struct net *net, *tmp, *last;
LIST_HEAD(net_exit_list);
- LIST_HEAD(dev_kill_list);
+
+ WRITE_ONCE(cleanup_net_task, current);
/* Atomically snapshot the list of namespaces to cleanup */
net_kill_list = llist_del_all(&cleanup_list);
@@ -612,33 +683,7 @@ static void cleanup_net(struct work_struct *work)
list_add_tail(&net->exit_list, &net_exit_list);
}
- /* Run all of the network namespace pre_exit methods */
- list_for_each_entry_reverse(ops, &pernet_list, list)
- ops_pre_exit_list(ops, &net_exit_list);
-
- /*
- * Another CPU might be rcu-iterating the list, wait for it.
- * This needs to be before calling the exit() notifiers, so
- * the rcu_barrier() below isn't sufficient alone.
- * Also the pre_exit() and exit() methods need this barrier.
- */
- synchronize_rcu_expedited();
-
- rtnl_lock();
- list_for_each_entry_reverse(ops, &pernet_list, list) {
- if (ops->exit_batch_rtnl)
- ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
- }
- unregister_netdevice_many(&dev_kill_list);
- rtnl_unlock();
-
- /* Run all of the network namespace exit methods */
- list_for_each_entry_reverse(ops, &pernet_list, list)
- ops_exit_list(ops, &net_exit_list);
-
- /* Free the net generic variables */
- list_for_each_entry_reverse(ops, &pernet_list, list)
- ops_free_list(ops, &net_exit_list);
+ ops_undo_list(&pernet_list, NULL, &net_exit_list, true);
up_read(&pernet_ops_rwsem);
@@ -647,6 +692,8 @@ static void cleanup_net(struct work_struct *work)
*/
rcu_barrier();
+ net_complete_free();
+
/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
@@ -655,8 +702,9 @@ static void cleanup_net(struct work_struct *work)
key_remove_domain(net->key_domain);
#endif
put_user_ns(net->user_ns);
- net_free(net);
+ net_passive_dec(net);
}
+ WRITE_ONCE(cleanup_net_task, NULL);
}
/**
@@ -690,30 +738,33 @@ EXPORT_SYMBOL_GPL(__put_net);
* get_net_ns - increment the refcount of the network namespace
* @ns: common namespace (net)
*
- * Returns the net's common namespace.
+ * Returns the net's common namespace or ERR_PTR() if ref is zero.
*/
struct ns_common *get_net_ns(struct ns_common *ns)
{
- return &get_net(container_of(ns, struct net, ns))->ns;
+ struct net *net;
+
+ net = maybe_get_net(container_of(ns, struct net, ns));
+ if (net)
+ return &net->ns;
+ return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(get_net_ns);
struct net *get_net_ns_by_fd(int fd)
{
- struct fd f = fdget(fd);
- struct net *net = ERR_PTR(-EINVAL);
+ CLASS(fd, f)(fd);
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- if (proc_ns_file(f.file)) {
- struct ns_common *ns = get_proc_ns(file_inode(f.file));
+ if (proc_ns_file(fd_file(f))) {
+ struct ns_common *ns = get_proc_ns(file_inode(fd_file(f)));
if (ns->ops == &netns_operations)
- net = get_net(container_of(ns, struct net, ns));
+ return get_net(container_of(ns, struct net, ns));
}
- fdput(f);
- return net;
+ return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
#endif
@@ -1090,7 +1141,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
end:
if (net_cb.fillargs.add_ref)
put_net(net_cb.tgt_net);
- return err < 0 ? err : skb->len;
+ return err;
}
static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
@@ -1159,13 +1210,23 @@ static void __init netns_ipv4_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_early_demux);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_l3mdev_accept);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_reordering);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_rmem);
- CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18);
+ CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22);
}
#endif
+static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid,
+ .dumpit = rtnl_net_dumpid,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+};
+
void __init net_ns_init(void)
{
struct net_generic *ng;
@@ -1191,9 +1252,10 @@ void __init net_ns_init(void)
#ifdef CONFIG_KEYS
init_net.key_domain = &init_net_key_domain;
#endif
+ preinit_net(&init_net, &init_user_ns);
+
down_write(&pernet_ops_rwsem);
- preinit_net(&init_net);
- if (setup_net(&init_net, &init_user_ns))
+ if (setup_net(&init_net))
panic("Could not setup the initial network namespace");
init_net_initialized = true;
@@ -1202,40 +1264,19 @@ void __init net_ns_init(void)
if (register_pernet_subsys(&net_ns_ops))
panic("Could not register network namespace subsystems");
- rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL,
- RTNL_FLAG_DOIT_UNLOCKED);
- rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
- RTNL_FLAG_DOIT_UNLOCKED);
-}
-
-static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list)
-{
- ops_pre_exit_list(ops, net_exit_list);
- synchronize_rcu();
-
- if (ops->exit_batch_rtnl) {
- LIST_HEAD(dev_kill_list);
-
- rtnl_lock();
- ops->exit_batch_rtnl(net_exit_list, &dev_kill_list);
- unregister_netdevice_many(&dev_kill_list);
- rtnl_unlock();
- }
- ops_exit_list(ops, net_exit_list);
-
- ops_free_list(ops, net_exit_list);
+ rtnl_register_many(net_ns_rtnl_msg_handlers);
}
#ifdef CONFIG_NET_NS
static int __register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
+ LIST_HEAD(net_exit_list);
struct net *net;
int error;
- LIST_HEAD(net_exit_list);
list_add_tail(&ops->list, list);
- if (ops->init || (ops->id && ops->size)) {
+ if (ops->init || ops->id) {
/* We held write locked pernet_ops_rwsem, and parallel
* setup_net() and cleanup_net() are not possible.
*/
@@ -1251,21 +1292,21 @@ static int __register_pernet_operations(struct list_head *list,
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
- free_exit_list(ops, &net_exit_list);
+ ops_undo_single(ops, &net_exit_list);
return error;
}
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
- struct net *net;
LIST_HEAD(net_exit_list);
+ struct net *net;
- list_del(&ops->list);
/* See comment in __register_pernet_operations() */
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
- free_exit_list(ops, &net_exit_list);
+ list_del(&ops->list);
+ ops_undo_single(ops, &net_exit_list);
}
#else
@@ -1287,8 +1328,9 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
list_del(&ops->list);
} else {
LIST_HEAD(net_exit_list);
+
list_add(&init_net.exit_list, &net_exit_list);
- free_exit_list(ops, &net_exit_list);
+ ops_undo_single(ops, &net_exit_list);
}
}
@@ -1301,13 +1343,20 @@ static int register_pernet_operations(struct list_head *list,
{
int error;
+ if (WARN_ON(!!ops->id ^ !!ops->size))
+ return -EINVAL;
+
if (ops->id) {
error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
GFP_KERNEL);
if (error < 0)
return error;
*ops->id = error;
- max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
+ /* This does not require READ_ONCE as writers already hold
+ * pernet_ops_rwsem. But WRITE_ONCE is needed to protect
+ * net_alloc_generic.
+ */
+ WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1));
}
error = __register_pernet_operations(list, ops);
if (error) {
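
The net_namespace.c changes above fold the pre_exit/exit_rtnl/exit/free teardown into ops_undo_list() and make register_pernet_operations() reject ops that set only one of .id and .size. Here is a hedged sketch of a pernet user under those rules; every "foo_"-prefixed name is made up for illustration, while the ops fields and helpers are the ones visible in the diff.

/* Hypothetical pernet subsystem. .id and .size must be set together
 * (enforced by the WARN_ON in register_pernet_operations() above), and
 * .exit_rtnl runs under RTNL plus the per-netns lock, collecting devices
 * on the dev_kill_list handed to unregister_netdevice_many().
 */
struct foo_pernet {
	int counter;
};

static unsigned int foo_pernet_id;

static int __net_init foo_init_net(struct net *net)
{
	struct foo_pernet *fp = net_generic(net, foo_pernet_id);

	fp->counter = 0;	/* storage is kzalloc'ed by ops_init() */
	return 0;
}

static void __net_exit foo_exit_rtnl(struct net *net,
				     struct list_head *dev_kill_list)
{
	/* queue any devices this subsystem created in @net onto
	 * @dev_kill_list, e.g. via unregister_netdevice_queue()
	 */
}

static struct pernet_operations foo_net_ops = {
	.init      = foo_init_net,
	.exit_rtnl = foo_exit_rtnl,
	.id        = &foo_pernet_id,
	.size      = sizeof(struct foo_pernet),
};

/* registered with register_pernet_subsys(&foo_net_ops) and torn down
 * with unregister_pernet_subsys(&foo_net_ops)
 */
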
diff --git a/net/core/gso_test.c b/net/core/net_test.c
index 358c44680d91..9c3a590865d2 100644
--- a/net/core/gso_test.c
+++ b/net/core/net_test.c
@@ -1,6 +1,9 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <kunit/test.h>
+
+/* GSO */
+
#include <linux/skbuff.h>
static const char hdr[] = "abcdefgh";
@@ -258,17 +261,127 @@ free_gso_skb:
consume_skb(skb);
}
-static struct kunit_case gso_test_cases[] = {
- KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params),
- {}
+/* IP tunnel flags */
+
+#include <net/ip_tunnels.h>
+
+struct ip_tunnel_flags_test {
+ const char *name;
+
+ const u16 *src_bits;
+ const u16 *exp_bits;
+ u8 src_num;
+ u8 exp_num;
+
+ __be16 exp_val;
+ bool exp_comp;
+};
+
+#define IP_TUNNEL_FLAGS_TEST(n, src, comp, eval, exp) { \
+ .name = (n), \
+ .src_bits = (src), \
+ .src_num = ARRAY_SIZE(src), \
+ .exp_comp = (comp), \
+ .exp_val = (eval), \
+ .exp_bits = (exp), \
+ .exp_num = ARRAY_SIZE(exp), \
+}
+
+/* These are __be16-compatible and can be compared as is */
+static const u16 ip_tunnel_flags_1[] = {
+ IP_TUNNEL_KEY_BIT,
+ IP_TUNNEL_STRICT_BIT,
+ IP_TUNNEL_ERSPAN_OPT_BIT,
+};
+
+/* Due to the previous flags design limitation, setting either
+ * ``IP_TUNNEL_CSUM_BIT`` (on Big Endian) or ``IP_TUNNEL_DONT_FRAGMENT_BIT``
+ * (on Little) also sets VTI/ISATAP bit. In the bitmap implementation, they
+ * correspond to ``BIT(16)``, which is bigger than ``U16_MAX``, but still is
+ * backward-compatible.
+ */
+#ifdef __LITTLE_ENDIAN
+#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_DONT_FRAGMENT_BIT
+#else
+#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_CSUM_BIT
+#endif
+
+static const u16 ip_tunnel_flags_2_src[] = {
+ IP_TUNNEL_CONFLICT_BIT,
+};
+
+static const u16 ip_tunnel_flags_2_exp[] = {
+ IP_TUNNEL_CONFLICT_BIT,
+ IP_TUNNEL_SIT_ISATAP_BIT,
};
-static struct kunit_suite gso_test_suite = {
- .name = "net_core_gso",
- .test_cases = gso_test_cases,
+/* Bits 17 and higher are not compatible with __be16 flags */
+static const u16 ip_tunnel_flags_3_src[] = {
+ IP_TUNNEL_VXLAN_OPT_BIT,
+ 17,
+ 18,
+ 20,
};
-kunit_test_suite(gso_test_suite);
+static const u16 ip_tunnel_flags_3_exp[] = {
+ IP_TUNNEL_VXLAN_OPT_BIT,
+};
+
+static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = {
+ IP_TUNNEL_FLAGS_TEST("compat", ip_tunnel_flags_1, true,
+ cpu_to_be16(BIT(IP_TUNNEL_KEY_BIT) |
+ BIT(IP_TUNNEL_STRICT_BIT) |
+ BIT(IP_TUNNEL_ERSPAN_OPT_BIT)),
+ ip_tunnel_flags_1),
+ IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true,
+ VTI_ISVTI, ip_tunnel_flags_2_exp),
+ IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false,
+ cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)),
+ ip_tunnel_flags_3_exp),
+};
+
+static void
+ip_tunnel_flags_test_case_to_desc(const struct ip_tunnel_flags_test *t,
+ char *desc)
+{
+ strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE);
+}
+KUNIT_ARRAY_PARAM(ip_tunnel_flags_test, ip_tunnel_flags_test,
+ ip_tunnel_flags_test_case_to_desc);
+
+static void ip_tunnel_flags_test_run(struct kunit *test)
+{
+ const struct ip_tunnel_flags_test *t = test->param_value;
+ IP_TUNNEL_DECLARE_FLAGS(src) = { };
+ IP_TUNNEL_DECLARE_FLAGS(exp) = { };
+ IP_TUNNEL_DECLARE_FLAGS(out);
+
+ for (u32 j = 0; j < t->src_num; j++)
+ __set_bit(t->src_bits[j], src);
+ for (u32 j = 0; j < t->exp_num; j++)
+ __set_bit(t->exp_bits[j], exp);
+
+ KUNIT_ASSERT_EQ(test, t->exp_comp,
+ ip_tunnel_flags_is_be16_compat(src));
+ KUNIT_ASSERT_EQ(test, (__force u16)t->exp_val,
+ (__force u16)ip_tunnel_flags_to_be16(src));
+
+ ip_tunnel_flags_from_be16(out, t->exp_val);
+ KUNIT_ASSERT_TRUE(test, __ipt_flag_op(bitmap_equal, exp, out));
+}
+
+static struct kunit_case net_test_cases[] = {
+ KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params),
+ KUNIT_CASE_PARAM(ip_tunnel_flags_test_run,
+ ip_tunnel_flags_test_gen_params),
+ { },
+};
+
+static struct kunit_suite net_test_suite = {
+ .name = "net_core",
+ .test_cases = net_test_cases,
+};
+kunit_test_suite(net_test_suite);
+MODULE_DESCRIPTION("KUnit tests for networking core");
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("KUnit tests for segmentation offload");
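
The renamed net_test.c now bundles the GSO case and the parameterized IP tunnel flags cases into a single "net_core" KUnit suite. A short sketch of how the bitmap flags API those cases exercise is used follows; foo_flags_to_legacy() is a made-up name, and the helpers are the ones from <net/ip_tunnels.h> that appear in the tests above.

#include <net/ip_tunnels.h>

/* Illustrative only: convert a bitmap-based flag set to the legacy
 * __be16 representation, i.e. the round trip the KUnit cases verify.
 */
static __be16 foo_flags_to_legacy(void)
{
	IP_TUNNEL_DECLARE_FLAGS(flags) = { };

	__set_bit(IP_TUNNEL_KEY_BIT, flags);
	__set_bit(IP_TUNNEL_CSUM_BIT, flags);

	/* only bits representable in 16 bits survive the conversion */
	if (!ip_tunnel_flags_is_be16_compat(flags))
		return 0;

	return ip_tunnel_flags_to_be16(flags);
}
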
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index 8d8ace9ef87f..4fc44587f493 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -9,16 +9,21 @@
#include "netdev-genl-gen.h"
#include <uapi/linux/netdev.h>
+#include <net/netdev_netlink.h>
/* Integer value ranges */
static const struct netlink_range_validation netdev_a_page_pool_id_range = {
.min = 1ULL,
- .max = 4294967295ULL,
+ .max = U32_MAX,
};
static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = {
.min = 1ULL,
- .max = 2147483647ULL,
+ .max = S32_MAX,
+};
+
+static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = {
+ .max = S32_MAX,
};
/* Common nested types */
@@ -27,6 +32,11 @@ const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFIND
[NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range),
};
+const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = {
+ [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, },
+ [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
+};
+
/* NETDEV_CMD_DEV_GET - do */
static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = {
[NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
@@ -70,9 +80,31 @@ static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFIN
/* NETDEV_CMD_QSTATS_GET - dump */
static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = {
+ [NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
[NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1),
};
+/* NETDEV_CMD_BIND_RX - do */
+static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] = {
+ [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
+ [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
+};
+
+/* NETDEV_CMD_NAPI_SET - do */
+static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = {
+ [NETDEV_A_NAPI_ID] = { .type = NLA_U32, },
+ [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
+ [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
+ [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
+};
+
+/* NETDEV_CMD_BIND_TX - do */
+static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = {
+ [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
+};
+
/* Ops table for netdev */
static const struct genl_split_ops netdev_nl_ops[] = {
{
@@ -150,6 +182,27 @@ static const struct genl_split_ops netdev_nl_ops[] = {
.maxattr = NETDEV_A_QSTATS_SCOPE,
.flags = GENL_CMD_CAP_DUMP,
},
+ {
+ .cmd = NETDEV_CMD_BIND_RX,
+ .doit = netdev_nl_bind_rx_doit,
+ .policy = netdev_bind_rx_nl_policy,
+ .maxattr = NETDEV_A_DMABUF_FD,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_NAPI_SET,
+ .doit = netdev_nl_napi_set_doit,
+ .policy = netdev_napi_set_nl_policy,
+ .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_BIND_TX,
+ .doit = netdev_nl_bind_tx_doit,
+ .policy = netdev_bind_tx_nl_policy,
+ .maxattr = NETDEV_A_DMABUF_FD,
+ .flags = GENL_CMD_CAP_DO,
+ },
};
static const struct genl_multicast_group netdev_nl_mcgrps[] = {
@@ -157,6 +210,16 @@ static const struct genl_multicast_group netdev_nl_mcgrps[] = {
[NETDEV_NLGRP_PAGE_POOL] = { "page-pool", },
};
+static void __netdev_nl_sock_priv_init(void *priv)
+{
+ netdev_nl_sock_priv_init(priv);
+}
+
+static void __netdev_nl_sock_priv_destroy(void *priv)
+{
+ netdev_nl_sock_priv_destroy(priv);
+}
+
struct genl_family netdev_nl_family __ro_after_init = {
.name = NETDEV_FAMILY_NAME,
.version = NETDEV_FAMILY_VERSION,
@@ -167,4 +230,7 @@ struct genl_family netdev_nl_family __ro_after_init = {
.n_split_ops = ARRAY_SIZE(netdev_nl_ops),
.mcgrps = netdev_nl_mcgrps,
.n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps),
+ .sock_priv_size = sizeof(struct netdev_nl_sock),
+ .sock_priv_init = __netdev_nl_sock_priv_init,
+ .sock_priv_destroy = __netdev_nl_sock_priv_destroy,
};
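
The generated family above now declares per-socket private state (sock_priv_size and the init/destroy shims); the dmabuf bind handlers later fetch it with genl_sk_priv_get() and serialize on priv->lock. Below is a hedged guess at what the non-generated hooks look like; the real bodies live in net/core/netdev-genl.c and may differ, and the only fact taken from the diff is that struct netdev_nl_sock carries a mutex named lock.

/* Assumed implementation sketch, not the actual netdev-genl.c code. */
void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
{
	mutex_init(&priv->lock);
	/* presumably also initializes bookkeeping for dmabuf bindings
	 * owned by this socket
	 */
}

void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv)
{
	/* presumably releases any dmabuf bindings still attached to the
	 * socket before the mutex goes away
	 */
	mutex_destroy(&priv->lock);
}
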
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index 4db40fd5b4a9..cf3fad74511f 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -10,9 +10,11 @@
#include <net/genetlink.h>
#include <uapi/linux/netdev.h>
+#include <net/netdev_netlink.h>
/* Common nested types */
extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1];
+extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1];
int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
@@ -30,6 +32,9 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb);
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info);
enum {
NETDEV_NLGRP_MGMT,
@@ -38,4 +43,7 @@ enum {
extern struct genl_family netdev_nl_family;
+void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv);
+void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv);
+
#endif /* _LINUX_NETDEV_GEN_H */
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 7004b3399c2b..2afa7b2141aa 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -3,16 +3,18 @@
#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/rtnetlink.h>
+#include <net/busy_poll.h>
#include <net/net_namespace.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
#include <net/sock.h>
#include <net/xdp.h>
#include <net/xdp_sock.h>
-#include <net/netdev_rx_queue.h>
-#include <net/netdev_queues.h>
-#include <net/busy_poll.h>
+#include <net/page_pool/memory_provider.h>
-#include "netdev-genl-gen.h"
#include "dev.h"
+#include "devmem.h"
+#include "netdev-genl-gen.h"
struct netdev_nl_dump_ctx {
unsigned long ifindex;
@@ -23,7 +25,7 @@ struct netdev_nl_dump_ctx {
static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb)
{
- NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx);
+ NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx);
return (struct netdev_nl_dump_ctx *)cb->ctx;
}
@@ -36,6 +38,8 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
u64 xdp_rx_meta = 0;
void *hdr;
+ netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! */
+
hdr = genlmsg_iput(rsp, info);
if (!hdr)
return -EMSGSIZE;
@@ -51,6 +55,8 @@ XDP_METADATA_KFUNC_xxx
xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP;
if (netdev->xsk_tx_metadata_ops->tmo_request_checksum)
xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM;
+ if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO;
}
if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
@@ -59,22 +65,22 @@ XDP_METADATA_KFUNC_xxx
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
xdp_rx_meta, NETDEV_A_DEV_PAD) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES,
- xsk_features, NETDEV_A_DEV_PAD)) {
- genlmsg_cancel(rsp, hdr);
- return -EINVAL;
- }
+ xsk_features, NETDEV_A_DEV_PAD))
+ goto err_cancel_msg;
if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) {
if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
- netdev->xdp_zc_max_segs)) {
- genlmsg_cancel(rsp, hdr);
- return -EINVAL;
- }
+ netdev->xdp_zc_max_segs))
+ goto err_cancel_msg;
}
genlmsg_end(rsp, hdr);
return 0;
+
+err_cancel_msg:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
}
static void
@@ -118,15 +124,14 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info)
if (!rsp)
return -ENOMEM;
- rtnl_lock();
-
- netdev = __dev_get_by_index(genl_info_net(info), ifindex);
- if (netdev)
- err = netdev_nl_dev_fill(netdev, rsp, info);
- else
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
err = -ENODEV;
+ goto err_free_msg;
+ }
- rtnl_unlock();
+ err = netdev_nl_dev_fill(netdev, rsp, info);
+ netdev_unlock(netdev);
if (err)
goto err_free_msg;
@@ -142,38 +147,35 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
struct net *net = sock_net(skb->sk);
- struct net_device *netdev;
- int err = 0;
+ int err;
- rtnl_lock();
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb));
if (err < 0)
- break;
+ return err;
}
- rtnl_unlock();
- return err;
+ return 0;
}
static int
netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
const struct genl_info *info)
{
+ unsigned long irq_suspend_timeout;
+ unsigned long gro_flush_timeout;
+ u32 napi_defer_hard_irqs;
void *hdr;
pid_t pid;
- if (WARN_ON_ONCE(!napi->dev))
- return -EINVAL;
- if (!(napi->dev->flags & IFF_UP))
+ if (!napi->dev->up)
return 0;
hdr = genlmsg_iput(rsp, info);
if (!hdr)
return -EMSGSIZE;
- if (napi->napi_id >= MIN_NAPI_ID &&
- nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id))
+ if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id))
goto nla_put_failure;
if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex))
@@ -188,6 +190,21 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
goto nla_put_failure;
}
+ napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi);
+ if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS,
+ napi_defer_hard_irqs))
+ goto nla_put_failure;
+
+ irq_suspend_timeout = napi_get_irq_suspend_timeout(napi);
+ if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+ irq_suspend_timeout))
+ goto nla_put_failure;
+
+ gro_flush_timeout = napi_get_gro_flush_timeout(napi);
+ if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+ gro_flush_timeout))
+ goto nla_put_failure;
+
genlmsg_end(rsp, hdr);
return 0;
@@ -213,18 +230,21 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info)
if (!rsp)
return -ENOMEM;
- rtnl_lock();
-
- napi = napi_by_id(napi_id);
- if (napi)
+ napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id);
+ if (napi) {
err = netdev_nl_napi_fill_one(rsp, napi, info);
- else
- err = -EINVAL;
-
- rtnl_unlock();
+ netdev_unlock(napi->dev);
+ } else {
+ NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
+ err = -ENOENT;
+ }
- if (err)
+ if (err) {
+ goto err_free_msg;
+ } else if (!rsp->len) {
+ err = -ENOENT;
goto err_free_msg;
+ }
return genlmsg_reply(rsp, info);
@@ -239,12 +259,21 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp,
struct netdev_nl_dump_ctx *ctx)
{
struct napi_struct *napi;
+ unsigned int prev_id;
int err = 0;
- if (!(netdev->flags & IFF_UP))
+ if (!netdev->up)
return err;
+ prev_id = UINT_MAX;
list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+ if (!napi_id_valid(napi->napi_id))
+ continue;
+
+ /* Dump continuation below depends on the list being sorted */
+ WARN_ON_ONCE(napi->napi_id >= prev_id);
+ prev_id = napi->napi_id;
+
if (ctx->napi_id && napi->napi_id >= ctx->napi_id)
continue;
@@ -268,30 +297,86 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
if (info->attrs[NETDEV_A_NAPI_IFINDEX])
ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]);
- rtnl_lock();
if (ifindex) {
- netdev = __dev_get_by_index(net, ifindex);
- if (netdev)
+ netdev = netdev_get_by_index_lock(net, ifindex);
+ if (netdev) {
err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
- else
+ netdev_unlock(netdev);
+ } else {
err = -ENODEV;
+ }
} else {
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
if (err < 0)
break;
ctx->napi_id = 0;
}
}
- rtnl_unlock();
return err;
}
static int
+netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
+{
+ u64 irq_suspend_timeout = 0;
+ u64 gro_flush_timeout = 0;
+ u32 defer = 0;
+
+ if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) {
+ defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]);
+ napi_set_defer_hard_irqs(napi, defer);
+ }
+
+ if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) {
+ irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]);
+ napi_set_irq_suspend_timeout(napi, irq_suspend_timeout);
+ }
+
+ if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) {
+ gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]);
+ napi_set_gro_flush_timeout(napi, gro_flush_timeout);
+ }
+
+ return 0;
+}
+
+int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct napi_struct *napi;
+ unsigned int napi_id;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
+ return -EINVAL;
+
+ napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
+
+ napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id);
+ if (napi) {
+ err = netdev_nl_napi_set_config(napi, info);
+ netdev_unlock(napi->dev);
+ } else {
+ NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
+ err = -ENOENT;
+ }
+
+ return err;
+}
+
+static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi)
+{
+ if (napi && napi_id_valid(napi->napi_id))
+ return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id);
+ return 0;
+}
+
+static int
netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
u32 q_idx, u32 q_type, const struct genl_info *info)
{
+ struct pp_memory_provider_params *params;
struct netdev_rx_queue *rxq;
struct netdev_queue *txq;
void *hdr;
@@ -308,15 +393,30 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
switch (q_type) {
case NETDEV_QUEUE_TYPE_RX:
rxq = __netif_get_rx_queue(netdev, q_idx);
- if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID,
- rxq->napi->napi_id))
+ if (nla_put_napi_id(rsp, rxq->napi))
goto nla_put_failure;
+
+ params = &rxq->mp_params;
+ if (params->mp_ops &&
+ params->mp_ops->nl_fill(params->mp_priv, rsp, rxq))
+ goto nla_put_failure;
+#ifdef CONFIG_XDP_SOCKETS
+ if (rxq->pool)
+ if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK))
+ goto nla_put_failure;
+#endif
+
break;
case NETDEV_QUEUE_TYPE_TX:
txq = netdev_get_tx_queue(netdev, q_idx);
- if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID,
- txq->napi->napi_id))
+ if (nla_put_napi_id(rsp, txq->napi))
goto nla_put_failure;
+#ifdef CONFIG_XDP_SOCKETS
+ if (txq->pool)
+ if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK))
+ goto nla_put_failure;
+#endif
+ break;
}
genlmsg_end(rsp, hdr);
@@ -347,10 +447,10 @@ static int
netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx,
u32 q_type, const struct genl_info *info)
{
- int err = 0;
+ int err;
- if (!(netdev->flags & IFF_UP))
- return err;
+ if (!netdev->up)
+ return -ENOENT;
err = netdev_nl_queue_validate(netdev, q_idx, q_type);
if (err)
@@ -379,15 +479,14 @@ int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info)
if (!rsp)
return -ENOMEM;
- rtnl_lock();
-
- netdev = __dev_get_by_index(genl_info_net(info), ifindex);
- if (netdev)
+ netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info),
+ ifindex);
+ if (netdev) {
err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info);
- else
+ netdev_unlock_ops_compat(netdev);
+ } else {
err = -ENODEV;
-
- rtnl_unlock();
+ }
if (err)
goto err_free_msg;
@@ -405,24 +504,21 @@ netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp,
struct netdev_nl_dump_ctx *ctx)
{
int err = 0;
- int i;
- if (!(netdev->flags & IFF_UP))
+ if (!netdev->up)
return err;
- for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) {
- err = netdev_nl_queue_fill_one(rsp, netdev, i,
+ for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) {
+ err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx,
NETDEV_QUEUE_TYPE_RX, info);
if (err)
return err;
- ctx->rxq_idx = i++;
}
- for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) {
- err = netdev_nl_queue_fill_one(rsp, netdev, i,
+ for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) {
+ err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx,
NETDEV_QUEUE_TYPE_TX, info);
if (err)
return err;
- ctx->txq_idx = i++;
}
return err;
@@ -440,15 +536,17 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
if (info->attrs[NETDEV_A_QUEUE_IFINDEX])
ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
- rtnl_lock();
if (ifindex) {
- netdev = __dev_get_by_index(net, ifindex);
- if (netdev)
+ netdev = netdev_get_by_index_lock_ops_compat(net, ifindex);
+ if (netdev) {
err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
- else
+ netdev_unlock_ops_compat(netdev);
+ } else {
err = -ENODEV;
+ }
} else {
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ for_each_netdev_lock_ops_compat_scoped(net, netdev,
+ ctx->ifindex) {
err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
if (err < 0)
break;
@@ -456,7 +554,6 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
ctx->txq_idx = 0;
}
}
- rtnl_unlock();
return err;
}
@@ -489,7 +586,18 @@ netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx)
{
if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) ||
netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) ||
- netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail))
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits))
return -EMSGSIZE;
return 0;
}
@@ -498,7 +606,18 @@ static int
netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx)
{
if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) ||
- netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes))
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake))
return -EMSGSIZE;
return 0;
}
@@ -567,7 +686,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp,
i, info);
if (err)
return err;
- ctx->rxq_idx = i++;
+ ctx->rxq_idx = ++i;
}
i = ctx->txq_idx;
while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) {
@@ -575,7 +694,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp,
i, info);
if (err)
return err;
- ctx->txq_idx = i++;
+ ctx->txq_idx = ++i;
}
ctx->rxq_idx = 0;
@@ -583,25 +702,66 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp,
return 0;
}
+/**
+ * netdev_stat_queue_sum() - add up queue stats from range of queues
+ * @netdev: net_device
+ * @rx_start: index of the first Rx queue to query
+ * @rx_end: index after the last Rx queue (first *not* to query)
+ * @rx_sum: output Rx stats, should be already initialized
+ * @tx_start: index of the first Tx queue to query
+ * @tx_end: index after the last Tx queue (first *not* to query)
+ * @tx_sum: output Tx stats, should be already initialized
+ *
+ * Add stats from [start, end) range of queue IDs to *x_sum structs.
+ * The sum structs must be already initialized. Usually this
+ * helper is invoked from the .get_base_stats callbacks of drivers
+ * to account for stats of disabled queues. In that case the ranges
+ * are usually [netdev->real_num_*x_queues, netdev->num_*x_queues).
+ */
+void netdev_stat_queue_sum(struct net_device *netdev,
+ int rx_start, int rx_end,
+ struct netdev_queue_stats_rx *rx_sum,
+ int tx_start, int tx_end,
+ struct netdev_queue_stats_tx *tx_sum)
+{
+ const struct netdev_stat_ops *ops;
+ struct netdev_queue_stats_rx rx;
+ struct netdev_queue_stats_tx tx;
+ int i;
+
+ ops = netdev->stat_ops;
+
+ for (i = rx_start; i < rx_end; i++) {
+ memset(&rx, 0xff, sizeof(rx));
+ if (ops->get_queue_stats_rx)
+ ops->get_queue_stats_rx(netdev, i, &rx);
+ netdev_nl_stats_add(rx_sum, &rx, sizeof(rx));
+ }
+ for (i = tx_start; i < tx_end; i++) {
+ memset(&tx, 0xff, sizeof(tx));
+ if (ops->get_queue_stats_tx)
+ ops->get_queue_stats_tx(netdev, i, &tx);
+ netdev_nl_stats_add(tx_sum, &tx, sizeof(tx));
+ }
+}
+EXPORT_SYMBOL(netdev_stat_queue_sum);
+
static int
netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp,
const struct genl_info *info)
{
- struct netdev_queue_stats_rx rx_sum, rx;
- struct netdev_queue_stats_tx tx_sum, tx;
- const struct netdev_stat_ops *ops;
+ struct netdev_queue_stats_rx rx_sum;
+ struct netdev_queue_stats_tx tx_sum;
void *hdr;
- int i;
- ops = netdev->stat_ops;
/* Netdev can't guarantee any complete counters */
- if (!ops->get_base_stats)
+ if (!netdev->stat_ops->get_base_stats)
return 0;
memset(&rx_sum, 0xff, sizeof(rx_sum));
memset(&tx_sum, 0xff, sizeof(tx_sum));
- ops->get_base_stats(netdev, &rx_sum, &tx_sum);
+ netdev->stat_ops->get_base_stats(netdev, &rx_sum, &tx_sum);
/* The op was there, but nothing reported, don't bother */
if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) &&
@@ -614,18 +774,8 @@ netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp,
if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex))
goto nla_put_failure;
- for (i = 0; i < netdev->real_num_rx_queues; i++) {
- memset(&rx, 0xff, sizeof(rx));
- if (ops->get_queue_stats_rx)
- ops->get_queue_stats_rx(netdev, i, &rx);
- netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx));
- }
- for (i = 0; i < netdev->real_num_tx_queues; i++) {
- memset(&tx, 0xff, sizeof(tx));
- if (ops->get_queue_stats_tx)
- ops->get_queue_stats_tx(netdev, i, &tx);
- netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx));
- }
+ netdev_stat_queue_sum(netdev, 0, netdev->real_num_rx_queues, &rx_sum,
+ 0, netdev->real_num_tx_queues, &tx_sum);
if (netdev_nl_stats_write_rx(rsp, &rx_sum) ||
netdev_nl_stats_write_tx(rsp, &tx_sum))
@@ -639,6 +789,24 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int
+netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope,
+ struct sk_buff *skb, const struct genl_info *info,
+ struct netdev_nl_dump_ctx *ctx)
+{
+ if (!netdev->stat_ops)
+ return 0;
+
+ switch (scope) {
+ case 0:
+ return netdev_nl_stats_by_netdev(netdev, skb, info);
+ case NETDEV_QSTATS_SCOPE_QUEUE:
+ return netdev_nl_stats_by_queue(netdev, skb, info, ctx);
+ }
+
+ return -EINVAL; /* Should not happen, per netlink policy */
+}
+
int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb)
{
@@ -646,6 +814,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
const struct genl_info *info = genl_info_dump(cb);
struct net *net = sock_net(skb->sk);
struct net_device *netdev;
+ unsigned int ifindex;
unsigned int scope;
int err = 0;
@@ -653,27 +822,259 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
if (info->attrs[NETDEV_A_QSTATS_SCOPE])
scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]);
- rtnl_lock();
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
- if (!netdev->stat_ops)
- continue;
+ ifindex = 0;
+ if (info->attrs[NETDEV_A_QSTATS_IFINDEX])
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]);
- switch (scope) {
- case 0:
- err = netdev_nl_stats_by_netdev(netdev, skb, info);
- break;
- case NETDEV_QSTATS_SCOPE_QUEUE:
- err = netdev_nl_stats_by_queue(netdev, skb, info, ctx);
- break;
+ if (ifindex) {
+ netdev = netdev_get_by_index_lock_ops_compat(net, ifindex);
+ if (!netdev) {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[NETDEV_A_QSTATS_IFINDEX]);
+ return -ENODEV;
+ }
+ if (netdev->stat_ops) {
+ err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
+ info, ctx);
+ } else {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[NETDEV_A_QSTATS_IFINDEX]);
+ err = -EOPNOTSUPP;
}
+ netdev_unlock_ops_compat(netdev);
+ return err;
+ }
+
+ for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) {
+ err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
+ info, ctx);
if (err < 0)
break;
}
- rtnl_unlock();
return err;
}
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
+ struct net_devmem_dmabuf_binding *binding;
+ u32 ifindex, dmabuf_fd, rxq_idx;
+ struct netdev_nl_sock *priv;
+ struct net_device *netdev;
+ struct sk_buff *rsp;
+ struct nlattr *attr;
+ int rem, err = 0;
+ void *hdr;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES))
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+ dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+
+ priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_genlmsg_free;
+ }
+
+ mutex_lock(&priv->lock);
+
+ err = 0;
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
+ err = -ENODEV;
+ goto err_unlock_sock;
+ }
+ if (!netif_device_present(netdev))
+ err = -ENODEV;
+ else if (!netdev_need_ops_lock(netdev))
+ err = -EOPNOTSUPP;
+ if (err) {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[NETDEV_A_DEV_IFINDEX]);
+ goto err_unlock;
+ }
+
+ binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd,
+ priv, info->extack);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ goto err_unlock;
+ }
+
+ nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
+ genlmsg_data(info->genlhdr),
+ genlmsg_len(info->genlhdr), rem) {
+ err = nla_parse_nested(
+ tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr,
+ netdev_queue_id_nl_policy, info->extack);
+ if (err < 0)
+ goto err_unbind;
+
+ if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
+ NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) {
+ err = -EINVAL;
+ goto err_unbind;
+ }
+
+ if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
+ NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
+ err = -EINVAL;
+ goto err_unbind;
+ }
+
+ rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
+
+ err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding,
+ info->extack);
+ if (err)
+ goto err_unbind;
+ }
+
+ nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
+ genlmsg_end(rsp, hdr);
+
+ err = genlmsg_reply(rsp, info);
+ if (err)
+ goto err_unbind;
+
+ netdev_unlock(netdev);
+
+ mutex_unlock(&priv->lock);
+
+ return 0;
+
+err_unbind:
+ net_devmem_unbind_dmabuf(binding);
+err_unlock:
+ netdev_unlock(netdev);
+err_unlock_sock:
+ mutex_unlock(&priv->lock);
+err_genlmsg_free:
+ nlmsg_free(rsp);
+ return err;
+}
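netdev_nl_bind_rx_doit() above acquires its resources top-down (reply skb, socket mutex, device lock, dmabuf binding) and unwinds them in reverse through the goto ladder. A self-contained sketch of that unwind pattern, with malloc() and a pthread mutex standing in for the kernel objects; all names here are illustrative, not kernel APIs.

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

static pthread_mutex_t sock_lock = PTHREAD_MUTEX_INITIALIZER;

static int do_bind(int fail_queue_bind)
{
	char *msg, *binding;
	int err;

	msg = malloc(64);			/* genlmsg_new() analogue */
	if (!msg)
		return -1;

	pthread_mutex_lock(&sock_lock);		/* mutex_lock(&priv->lock) */

	binding = malloc(32);			/* net_devmem_bind_dmabuf() */
	if (!binding) {
		err = -1;
		goto err_unlock_sock;
	}

	if (fail_queue_bind) {			/* a per-queue bind failed */
		err = -1;
		goto err_unbind;
	}

	pthread_mutex_unlock(&sock_lock);
	printf("bound ok\n");
	free(binding);				/* demo only: the kernel keeps it */
	free(msg);
	return 0;

err_unbind:
	free(binding);				/* net_devmem_unbind_dmabuf() */
err_unlock_sock:
	pthread_mutex_unlock(&sock_lock);
	free(msg);				/* nlmsg_free() */
	return err;
}

int main(void)
{
	printf("success path: %d, error path: %d\n", do_bind(0), do_bind(1));
	return 0;
}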
+
+int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct netdev_nl_sock *priv;
+ struct net_device *netdev;
+ u32 ifindex, dmabuf_fd;
+ struct sk_buff *rsp;
+ int err = 0;
+ void *hdr;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD))
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+ dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+
+ priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_genlmsg_free;
+ }
+
+ mutex_lock(&priv->lock);
+
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
+ err = -ENODEV;
+ goto err_unlock_sock;
+ }
+
+ if (!netif_device_present(netdev)) {
+ err = -ENODEV;
+ goto err_unlock_netdev;
+ }
+
+ if (!netdev->netmem_tx) {
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(info->extack,
+ "Driver does not support netmem TX");
+ goto err_unlock_netdev;
+ }
+
+ binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv,
+ info->extack);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ goto err_unlock_netdev;
+ }
+
+ nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
+ genlmsg_end(rsp, hdr);
+
+ netdev_unlock(netdev);
+ mutex_unlock(&priv->lock);
+
+ return genlmsg_reply(rsp, info);
+
+err_unlock_netdev:
+ netdev_unlock(netdev);
+err_unlock_sock:
+ mutex_unlock(&priv->lock);
+err_genlmsg_free:
+ nlmsg_free(rsp);
+ return err;
+}
+
+void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
+{
+ INIT_LIST_HEAD(&priv->bindings);
+ mutex_init(&priv->lock);
+}
+
+void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct net_devmem_dmabuf_binding *temp;
+ netdevice_tracker dev_tracker;
+ struct net_device *dev;
+
+ mutex_lock(&priv->lock);
+ list_for_each_entry_safe(binding, temp, &priv->bindings, list) {
+ mutex_lock(&binding->lock);
+ dev = binding->dev;
+ if (!dev) {
+ mutex_unlock(&binding->lock);
+ net_devmem_unbind_dmabuf(binding);
+ continue;
+ }
+ netdev_hold(dev, &dev_tracker, GFP_KERNEL);
+ mutex_unlock(&binding->lock);
+
+ netdev_lock(dev);
+ net_devmem_unbind_dmabuf(binding);
+ netdev_unlock(dev);
+ netdev_put(dev, &dev_tracker);
+ }
+ mutex_unlock(&priv->lock);
+}
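netdev_nl_sock_priv_destroy() walks the binding list with list_for_each_entry_safe() because the current entry may be unbound and freed during the walk. The same save-next-before-freeing idea in plain userspace C; the list and names are illustrative, not the kernel list API.

#include <stdio.h>
#include <stdlib.h>

struct binding {
	int id;
	struct binding *next;
};

static void destroy_all(struct binding **head)
{
	struct binding *b = *head, *tmp;

	while (b) {
		tmp = b->next;		/* "safe": grab next before freeing */
		printf("unbinding %d\n", b->id);
		free(b);		/* net_devmem_unbind_dmabuf() analogue */
		b = tmp;
	}
	*head = NULL;
}

int main(void)
{
	struct binding *head = NULL;

	for (int i = 0; i < 3; i++) {
		struct binding *b = malloc(sizeof(*b));

		if (!b)
			return 1;
		b->id = i;
		b->next = head;
		head = b;
	}
	destroy_all(&head);
	return 0;
}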
+
static int netdev_genl_netdevice_event(struct notifier_block *nb,
unsigned long event, void *ptr)
{
@@ -681,10 +1082,14 @@ static int netdev_genl_netdevice_event(struct notifier_block *nb,
switch (event) {
case NETDEV_REGISTER:
+ netdev_lock_ops_to_full(netdev);
netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF);
+ netdev_unlock_full_to_ops(netdev);
break;
case NETDEV_UNREGISTER:
+ netdev_lock(netdev);
netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF);
+ netdev_unlock(netdev);
break;
case NETDEV_XDP_FEAT_CHANGE:
netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF);
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
new file mode 100644
index 000000000000..d126f10197bf
--- /dev/null
+++ b/net/core/netdev_rx_queue.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/ethtool_netlink.h>
+#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/memory_provider.h>
+
+#include "page_pool_priv.h"
+
+int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
+{
+ struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx);
+ const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
+ void *new_mem, *old_mem;
+ int err;
+
+ if (!qops || !qops->ndo_queue_stop || !qops->ndo_queue_mem_free ||
+ !qops->ndo_queue_mem_alloc || !qops->ndo_queue_start)
+ return -EOPNOTSUPP;
+
+ netdev_assert_locked(dev);
+
+ new_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL);
+ if (!new_mem)
+ return -ENOMEM;
+
+ old_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL);
+ if (!old_mem) {
+ err = -ENOMEM;
+ goto err_free_new_mem;
+ }
+
+ err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx);
+ if (err)
+ goto err_free_old_mem;
+
+ err = page_pool_check_memory_provider(dev, rxq);
+ if (err)
+ goto err_free_new_queue_mem;
+
+ if (netif_running(dev)) {
+ err = qops->ndo_queue_stop(dev, old_mem, rxq_idx);
+ if (err)
+ goto err_free_new_queue_mem;
+
+ err = qops->ndo_queue_start(dev, new_mem, rxq_idx);
+ if (err)
+ goto err_start_queue;
+ } else {
+ swap(new_mem, old_mem);
+ }
+
+ qops->ndo_queue_mem_free(dev, old_mem);
+
+ kvfree(old_mem);
+ kvfree(new_mem);
+
+ return 0;
+
+err_start_queue:
+ /* Restarting the queue with old_mem should be successful as we haven't
+ * changed any of the queue configuration, and there is not much we can
+ * do to recover from a failure here.
+ *
+ * WARN if we fail to recover the old rx queue, and at least free
+ * old_mem so we don't also leak that.
+ */
+ if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) {
+ WARN(1,
+ "Failed to restart old queue in error path. RX queue %d may be unhealthy.",
+ rxq_idx);
+ qops->ndo_queue_mem_free(dev, old_mem);
+ }
+
+err_free_new_queue_mem:
+ qops->ndo_queue_mem_free(dev, new_mem);
+
+err_free_old_mem:
+ kvfree(old_mem);
+
+err_free_new_mem:
+ kvfree(new_mem);
+
+ return err;
+}
+EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
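netdev_rx_queue_restart() allocates the new queue memory up front, stops the queue, and only commits once the restart with the new memory succeeds; on failure it tries to bring the queue back up with the old memory and WARNs if even that fails. A minimal sketch of that rollback shape, with malloc() and queue_start() standing in for the ndo_queue_* hooks (assumptions, not kernel code).

#include <stdio.h>
#include <stdlib.h>

/* a "queue start" that rejects memory marked invalid (negative) */
static int queue_start(const int *mem)
{
	return *mem < 0 ? -1 : 0;
}

static int restart(int **active, int *new_mem)
{
	int *old_mem = *active;			/* ndo_queue_stop() analogue */

	if (!queue_start(new_mem)) {
		*active = new_mem;
		free(old_mem);			/* old resources retired */
		return 0;
	}

	/* Roll back: restarting with old_mem should succeed because the
	 * queue configuration has not changed; warn loudly if it does not.
	 */
	if (!queue_start(old_mem)) {
		*active = old_mem;
	} else {
		fprintf(stderr, "WARN: queue left unhealthy\n");
		free(old_mem);
		*active = NULL;
	}
	free(new_mem);
	return -1;
}

int main(void)
{
	int *active = malloc(sizeof(*active));
	int *bad = malloc(sizeof(*bad));

	*active = 1;
	*bad = -1;				/* queue_start() will reject this */
	printf("restart -> %d\n", restart(&active, bad));
	free(active);
	return 0;
}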
+
+int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ const struct pp_memory_provider_params *p,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_rx_queue *rxq;
+ int ret;
+
+ if (!netdev_need_ops_lock(dev))
+ return -EOPNOTSUPP;
+
+ if (rxq_idx >= dev->real_num_rx_queues) {
+  NL_SET_ERR_MSG(extack, "rx queue index out of range");
+  return -ERANGE;
+ }
+ rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues);
+
+ if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
+ NL_SET_ERR_MSG(extack, "tcp-data-split is disabled");
+ return -EINVAL;
+ }
+ if (dev->cfg->hds_thresh) {
+ NL_SET_ERR_MSG(extack, "hds-thresh is not zero");
+ return -EINVAL;
+ }
+ if (dev_xdp_prog_count(dev)) {
+  NL_SET_ERR_MSG(extack, "unable to attach custom memory provider to device with XDP program attached");
+ return -EEXIST;
+ }
+
+ rxq = __netif_get_rx_queue(dev, rxq_idx);
+ if (rxq->mp_params.mp_ops) {
+ NL_SET_ERR_MSG(extack, "designated queue already memory provider bound");
+ return -EEXIST;
+ }
+#ifdef CONFIG_XDP_SOCKETS
+ if (rxq->pool) {
+ NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP");
+ return -EBUSY;
+ }
+#endif
+
+ rxq->mp_params = *p;
+ ret = netdev_rx_queue_restart(dev, rxq_idx);
+ if (ret) {
+ rxq->mp_params.mp_ops = NULL;
+ rxq->mp_params.mp_priv = NULL;
+ }
+ return ret;
+}
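__net_mp_open_rxq() pairs the range check with array_index_nospec() so a mispredicted bounds check cannot be used to speculatively index past real_num_rx_queues. The sketch below only conveys the masking idea; the kernel helper computes the mask without a data-dependent branch the compiler could reintroduce, so treat this as an illustration rather than an equivalent.

#include <stdio.h>
#include <stddef.h>

static size_t clamp_index(size_t idx, size_t size)
{
	/* all-ones when idx < size, zero otherwise */
	size_t mask = (size_t)0 - (size_t)(idx < size);

	return idx & mask;
}

int main(void)
{
	int rxq[4] = { 10, 20, 30, 40 };
	size_t good = 2, bad = 7;

	/* an in-range index passes through, an out-of-range one becomes 0 */
	printf("%d %d\n", rxq[clamp_index(good, 4)], rxq[clamp_index(bad, 4)]);
	return 0;
}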
+
+int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ struct pp_memory_provider_params *p)
+{
+ int ret;
+
+ netdev_lock(dev);
+ ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL);
+ netdev_unlock(dev);
+ return ret;
+}
+
+void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
+ const struct pp_memory_provider_params *old_p)
+{
+ struct netdev_rx_queue *rxq;
+ int err;
+
+ if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues))
+ return;
+
+ rxq = __netif_get_rx_queue(dev, ifq_idx);
+
+ /* Callers holding a netdev ref may get here after we already
+ * went thru shutdown via dev_memory_provider_uninstall().
+ */
+ if (dev->reg_state > NETREG_REGISTERED &&
+ !rxq->mp_params.mp_ops)
+ return;
+
+ if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops ||
+ rxq->mp_params.mp_priv != old_p->mp_priv))
+ return;
+
+ rxq->mp_params.mp_ops = NULL;
+ rxq->mp_params.mp_priv = NULL;
+ err = netdev_rx_queue_restart(dev, ifq_idx);
+ WARN_ON(err && err != -ENETDOWN);
+}
+
+void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
+ struct pp_memory_provider_params *old_p)
+{
+ netdev_lock(dev);
+ __net_mp_close_rxq(dev, ifq_idx, old_p);
+ netdev_unlock(dev);
+}
diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
new file mode 100644
index 000000000000..cd95394399b4
--- /dev/null
+++ b/net/core/netmem_priv.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __NETMEM_PRIV_H
+#define __NETMEM_PRIV_H
+
+static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
+{
+ return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
+}
+
+static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
+{
+ __netmem_clear_lsb(netmem)->pp_magic |= pp_magic;
+}
+
+static inline void netmem_clear_pp_magic(netmem_ref netmem)
+{
+ WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK);
+
+ __netmem_clear_lsb(netmem)->pp_magic = 0;
+}
+
+static inline bool netmem_is_pp(netmem_ref netmem)
+{
+ return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE;
+}
+
+static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
+{
+ __netmem_clear_lsb(netmem)->pp = pool;
+}
+
+static inline void netmem_set_dma_addr(netmem_ref netmem,
+ unsigned long dma_addr)
+{
+ __netmem_clear_lsb(netmem)->dma_addr = dma_addr;
+}
+
+static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
+{
+ unsigned long magic;
+
+ if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
+ return 0;
+
+ magic = __netmem_clear_lsb(netmem)->pp_magic;
+
+ return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT;
+}
+
+static inline void netmem_set_dma_index(netmem_ref netmem,
+ unsigned long id)
+{
+ unsigned long magic;
+
+ if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
+ return;
+
+ magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT);
+ __netmem_clear_lsb(netmem)->pp_magic = magic;
+}
+#endif
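netmem_priv.h packs two things into one word: the page-pool signature in the low bits and the DMA-map index above PP_DMA_INDEX_SHIFT, which is why netmem_get_pp_magic() masks the index off before any comparison. A standalone sketch of that packing; SIG, INDEX_SHIFT and INDEX_MASK are illustrative values, not the kernel's PP_* constants.

#include <stdio.h>

#define SIG		0x40UL			/* PP_SIGNATURE stand-in */
#define INDEX_SHIFT	8			/* PP_DMA_INDEX_SHIFT stand-in */
#define INDEX_MASK	(~0UL << INDEX_SHIFT)	/* PP_DMA_INDEX_MASK stand-in */

int main(void)
{
	unsigned long magic = SIG;	/* set when the page joins the pool */

	magic |= 123UL << INDEX_SHIFT;	/* netmem_set_dma_index() analogue */

	/* netmem_is_pp(): mask the index off before checking the signature */
	printf("is_pp: %d\n", (magic & ~INDEX_MASK) == SIG);
	/* netmem_get_dma_index() analogue */
	printf("dma index: %lu\n", (magic & INDEX_MASK) >> INDEX_SHIFT);
	return 0;
}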
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 543007f159f9..6ad84d4a2b46 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -34,7 +34,7 @@
#include <net/addrconf.h>
#include <net/ndisc.h>
#include <net/ip6_checksum.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <trace/events/napi.h>
#include <linux/kconfig.h>
@@ -45,11 +45,6 @@
#define MAX_UDP_CHUNK 1460
#define MAX_SKBS 32
-
-static struct sk_buff_head skb_pool;
-
-DEFINE_STATIC_SRCU(netpoll_srcu);
-
#define USEC_PER_POLL 50
#define MAX_SKB_SIZE \
@@ -162,7 +157,7 @@ static void poll_one_napi(struct napi_struct *napi)
if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state))
return;
- /* We explicilty pass the polling call a budget of 0 to
+ /* We explicitly pass the polling call a budget of 0 to
* indicate that we are clearing the Tx path only.
*/
work = napi->poll(napi, 0);
@@ -220,41 +215,39 @@ EXPORT_SYMBOL(netpoll_poll_dev);
void netpoll_poll_disable(struct net_device *dev)
{
struct netpoll_info *ni;
- int idx;
+
might_sleep();
- idx = srcu_read_lock(&netpoll_srcu);
- ni = srcu_dereference(dev->npinfo, &netpoll_srcu);
+ ni = rtnl_dereference(dev->npinfo);
if (ni)
down(&ni->dev_lock);
- srcu_read_unlock(&netpoll_srcu, idx);
}
-EXPORT_SYMBOL(netpoll_poll_disable);
void netpoll_poll_enable(struct net_device *dev)
{
struct netpoll_info *ni;
- rcu_read_lock();
- ni = rcu_dereference(dev->npinfo);
+
+ ni = rtnl_dereference(dev->npinfo);
if (ni)
up(&ni->dev_lock);
- rcu_read_unlock();
}
-EXPORT_SYMBOL(netpoll_poll_enable);
-static void refill_skbs(void)
+static void refill_skbs(struct netpoll *np)
{
+ struct sk_buff_head *skb_pool;
struct sk_buff *skb;
unsigned long flags;
- spin_lock_irqsave(&skb_pool.lock, flags);
- while (skb_pool.qlen < MAX_SKBS) {
+ skb_pool = &np->skb_pool;
+
+ spin_lock_irqsave(&skb_pool->lock, flags);
+ while (skb_pool->qlen < MAX_SKBS) {
skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
if (!skb)
break;
- __skb_queue_tail(&skb_pool, skb);
+ __skb_queue_tail(skb_pool, skb);
}
- spin_unlock_irqrestore(&skb_pool.lock, flags);
+ spin_unlock_irqrestore(&skb_pool->lock, flags);
}
static void zap_completion_queue(void)
@@ -291,12 +284,13 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
struct sk_buff *skb;
zap_completion_queue();
- refill_skbs();
repeat:
skb = alloc_skb(len, GFP_ATOMIC);
- if (!skb)
- skb = skb_dequeue(&skb_pool);
+ if (!skb) {
+ skb = skb_dequeue(&np->skb_pool);
+ schedule_work(&np->refill_wq);
+ }
if (!skb) {
if (++count < 10) {
@@ -316,7 +310,7 @@ static int netpoll_owner_active(struct net_device *dev)
struct napi_struct *napi;
list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
- if (napi->poll_owner == smp_processor_id())
+ if (READ_ONCE(napi->poll_owner) == smp_processor_id())
return 1;
}
return 0;
@@ -326,6 +320,7 @@ static int netpoll_owner_active(struct net_device *dev)
static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
{
netdev_tx_t status = NETDEV_TX_BUSY;
+ netdev_tx_t ret = NET_XMIT_DROP;
struct net_device *dev;
unsigned long tries;
/* It is up to the caller to keep npinfo alive. */
@@ -334,11 +329,12 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
lockdep_assert_irqs_disabled();
dev = np->dev;
+ rcu_read_lock();
npinfo = rcu_dereference_bh(dev->npinfo);
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
dev_kfree_skb_irq(skb);
- return NET_XMIT_DROP;
+ goto out;
}
/* don't get messages out of order, and no recursion */
@@ -377,7 +373,10 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
skb_queue_tail(&npinfo->txq, skb);
schedule_delayed_work(&npinfo->tx_work,0);
}
- return NETDEV_TX_OK;
+ ret = NETDEV_TX_OK;
+out:
+ rcu_read_unlock();
+ return ret;
}
netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
@@ -397,7 +396,7 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
}
EXPORT_SYMBOL(netpoll_send_skb);
-void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
+int netpoll_send_udp(struct netpoll *np, const char *msg, int len)
{
int total_len, ip_len, udp_len;
struct sk_buff *skb;
@@ -421,7 +420,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
skb = find_skb(np, total_len + np->dev->needed_tailroom,
total_len - len);
if (!skb)
- return;
+ return -ENOMEM;
skb_copy_to_linear_data(skb, msg, len);
skb_put(skb, len);
@@ -433,8 +432,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
udph->dest = htons(np->remote_port);
udph->len = htons(udp_len);
+ udph->check = 0;
if (np->ipv6) {
- udph->check = 0;
udph->check = csum_ipv6_magic(&np->local_ip.in6,
&np->remote_ip.in6,
udp_len, IPPROTO_UDP,
@@ -462,7 +461,6 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
skb_reset_mac_header(skb);
skb->protocol = eth->h_proto = htons(ETH_P_IPV6);
} else {
- udph->check = 0;
udph->check = csum_tcpudp_magic(np->local_ip.ip,
np->remote_ip.ip,
udp_len, IPPROTO_UDP,
@@ -497,7 +495,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
skb->dev = np->dev;
- netpoll_send_skb(np, skb);
+ return (int)netpoll_send_skb(np, skb);
}
EXPORT_SYMBOL(netpoll_send_udp);
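netpoll_send_udp() now zeroes udph->check once and lets csum_ipv6_magic()/csum_tcpudp_magic() fill it in. For reference, a userspace sketch of what the IPv4 variant computes: the ones'-complement sum over a pseudo-header plus the UDP header and payload, with an odd trailing byte padded with zero. This is a simplified illustration, not the kernel helpers.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

static uint32_t csum_add(uint32_t sum, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;	/* pad odd byte with zero */
	return sum;
}

static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum ? (uint16_t)~sum : 0xffff;
}

int main(void)
{
	uint8_t payload[] = "netconsole";
	uint16_t udp_len = 8 + sizeof(payload);
	uint8_t udph[8] = { 0 };
	uint8_t pseudo[12] = { 0 };
	uint32_t sum = 0;

	/* pseudo-header: saddr, daddr, zero, IPPROTO_UDP, UDP length */
	inet_pton(AF_INET, "192.0.2.1", pseudo);
	inet_pton(AF_INET, "192.0.2.2", pseudo + 4);
	pseudo[9] = 17;				/* IPPROTO_UDP */
	pseudo[10] = udp_len >> 8;
	pseudo[11] = udp_len & 0xff;

	/* UDP header: sport 6665, dport 6666, length, checksum = 0 */
	udph[0] = 6665 >> 8;  udph[1] = 6665 & 0xff;
	udph[2] = 6666 >> 8;  udph[3] = 6666 & 0xff;
	udph[4] = udp_len >> 8;  udph[5] = udp_len & 0xff;

	sum = csum_add(sum, pseudo, sizeof(pseudo));
	sum = csum_add(sum, udph, sizeof(udph));
	sum = csum_add(sum, payload, sizeof(payload));
	printf("udp checksum: 0x%04x\n", csum_fold(sum));
	return 0;
}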
@@ -508,7 +506,8 @@ void netpoll_print_options(struct netpoll *np)
np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6);
else
np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip);
- np_info(np, "interface '%s'\n", np->dev_name);
+ np_info(np, "interface name '%s'\n", np->dev_name);
+ np_info(np, "local ethernet address '%pM'\n", np->dev_mac);
np_info(np, "remote port %d\n", np->remote_port);
if (np->ipv6)
np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6);
@@ -538,6 +537,15 @@ static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr)
return -1;
}
+static void skb_pool_flush(struct netpoll *np)
+{
+ struct sk_buff_head *skb_pool;
+
+ cancel_work_sync(&np->refill_wq);
+ skb_pool = &np->skb_pool;
+ skb_queue_purge_reason(skb_pool, SKB_CONSUMED);
+}
+
int netpoll_parse_options(struct netpoll *np, char *opt)
{
char *cur=opt, *delim;
@@ -569,11 +577,18 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
cur++;
if (*cur != ',') {
- /* parse out dev name */
+ /* parse out dev_name or dev_mac */
if ((delim = strchr(cur, ',')) == NULL)
goto parse_failed;
*delim = 0;
- strscpy(np->dev_name, cur, sizeof(np->dev_name));
+
+ np->dev_name[0] = '\0';
+ eth_broadcast_addr(np->dev_mac);
+ if (!strchr(cur, ':'))
+ strscpy(np->dev_name, cur, sizeof(np->dev_name));
+ else if (!mac_pton(cur, np->dev_mac))
+ goto parse_failed;
+
cur = delim;
}
cur++;
@@ -620,23 +635,31 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
}
EXPORT_SYMBOL(netpoll_parse_options);
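The parser above treats a token containing ':' as a device MAC and anything else as an interface name, falling back to mac_pton() for the conversion. A small standalone sketch of that dispatch; parse_mac() is a stand-in for the kernel helper, not its implementation.

#include <stdio.h>
#include <string.h>

static int parse_mac(const char *s, unsigned char mac[6])
{
	return sscanf(s, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
		      &mac[0], &mac[1], &mac[2],
		      &mac[3], &mac[4], &mac[5]) == 6;
}

static void classify(const char *tok)
{
	unsigned char mac[6];

	if (!strchr(tok, ':'))
		printf("interface name: %s\n", tok);
	else if (parse_mac(tok, mac))
		printf("device MAC starting %02x:%02x:...\n", mac[0], mac[1]);
	else
		printf("parse failed: %s\n", tok);
}

int main(void)
{
	classify("eth0");
	classify("00:11:22:33:44:55");
	classify("not:a:mac");
	return 0;
}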
+static void refill_skbs_work_handler(struct work_struct *work)
+{
+ struct netpoll *np =
+ container_of(work, struct netpoll, refill_wq);
+
+ refill_skbs(np);
+}
+
int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
{
struct netpoll_info *npinfo;
const struct net_device_ops *ops;
int err;
- np->dev = ndev;
- strscpy(np->dev_name, ndev->name, IFNAMSIZ);
+ skb_queue_head_init(&np->skb_pool);
if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
np_err(np, "%s doesn't support polling, aborting\n",
- np->dev_name);
+ ndev->name);
err = -ENOTSUPP;
goto out;
}
- if (!ndev->npinfo) {
+ npinfo = rtnl_dereference(ndev->npinfo);
+ if (!npinfo) {
npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
if (!npinfo) {
err = -ENOMEM;
@@ -649,19 +672,24 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
refcount_set(&npinfo->refcnt, 1);
- ops = np->dev->netdev_ops;
+ ops = ndev->netdev_ops;
if (ops->ndo_netpoll_setup) {
- err = ops->ndo_netpoll_setup(ndev, npinfo);
+ err = ops->ndo_netpoll_setup(ndev);
if (err)
goto free_npinfo;
}
} else {
- npinfo = rtnl_dereference(ndev->npinfo);
refcount_inc(&npinfo->refcnt);
}
+ np->dev = ndev;
+ strscpy(np->dev_name, ndev->name, IFNAMSIZ);
npinfo->netpoll = np;
+ /* fill up the skb queue */
+ refill_skbs(np);
+ INIT_WORK(&np->refill_wq, refill_skbs_work_handler);
+
/* last thing to do is link it to the net device structure */
rcu_assign_pointer(ndev->npinfo, npinfo);
@@ -674,26 +702,45 @@ out:
}
EXPORT_SYMBOL_GPL(__netpoll_setup);
+/*
+ * Returns a pointer to a string representation of the identifier used
+ * to select the egress interface for the given netpoll instance. buf
+ * must be a buffer of length at least MAC_ADDR_STR_LEN + 1.
+ */
+static char *egress_dev(struct netpoll *np, char *buf)
+{
+ if (np->dev_name[0])
+ return np->dev_name;
+
+ snprintf(buf, MAC_ADDR_STR_LEN + 1, "%pM", np->dev_mac);
+ return buf;
+}
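egress_dev() only formats the MAC when no interface name was configured, and its caller supplies a MAC_ADDR_STR_LEN + 1 byte buffer because the colon-separated form is 17 characters plus the terminating NUL. A quick standalone check of that sizing; MAC_STR_LEN mirrors the kernel constant for the demo only.

#include <stdio.h>

#define MAC_STR_LEN 17	/* "xx:xx:xx:xx:xx:xx" without the NUL */

int main(void)
{
	const unsigned char mac[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	char buf[MAC_STR_LEN + 1];

	snprintf(buf, sizeof(buf), "%02x:%02x:%02x:%02x:%02x:%02x",
		 mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	printf("'%s' uses %d characters plus the terminator\n", buf, MAC_STR_LEN);
	return 0;
}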
+
int netpoll_setup(struct netpoll *np)
{
+ struct net *net = current->nsproxy->net_ns;
+ char buf[MAC_ADDR_STR_LEN + 1];
struct net_device *ndev = NULL;
+ bool ip_overwritten = false;
struct in_device *in_dev;
int err;
rtnl_lock();
- if (np->dev_name[0]) {
- struct net *net = current->nsproxy->net_ns;
+ if (np->dev_name[0])
ndev = __dev_get_by_name(net, np->dev_name);
- }
+ else if (is_valid_ether_addr(np->dev_mac))
+ ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac);
+
if (!ndev) {
- np_err(np, "%s doesn't exist, aborting\n", np->dev_name);
+ np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf));
err = -ENODEV;
goto unlock;
}
netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL);
if (netdev_master_upper_dev_get(ndev)) {
- np_err(np, "%s is a slave device, aborting\n", np->dev_name);
+ np_err(np, "%s is a slave device, aborting\n",
+ egress_dev(np, buf));
err = -EBUSY;
goto put;
}
@@ -701,7 +748,8 @@ int netpoll_setup(struct netpoll *np)
if (!netif_running(ndev)) {
unsigned long atmost;
- np_info(np, "device %s not up yet, forcing it\n", np->dev_name);
+ np_info(np, "device %s not up yet, forcing it\n",
+ egress_dev(np, buf));
err = dev_open(ndev, NULL);
@@ -735,12 +783,13 @@ int netpoll_setup(struct netpoll *np)
if (!ifa) {
put_noaddr:
np_err(np, "no IP address for %s, aborting\n",
- np->dev_name);
+ egress_dev(np, buf));
err = -EDESTADDRREQ;
goto put;
}
np->local_ip.ip = ifa->ifa_local;
+ ip_overwritten = true;
np_info(np, "local IP %pI4\n", &np->local_ip.ip);
} else {
#if IS_ENABLED(CONFIG_IPV6)
@@ -757,6 +806,7 @@ put_noaddr:
!!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL))
continue;
np->local_ip.in6 = ifp->addr;
+ ip_overwritten = true;
err = 0;
break;
}
@@ -764,29 +814,31 @@ put_noaddr:
}
if (err) {
np_err(np, "no IPv6 address for %s, aborting\n",
- np->dev_name);
+ egress_dev(np, buf));
goto put;
} else
np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
#else
np_err(np, "IPv6 is not supported %s, aborting\n",
- np->dev_name);
+ egress_dev(np, buf));
err = -EINVAL;
goto put;
#endif
}
}
- /* fill up the skb queue */
- refill_skbs();
-
err = __netpoll_setup(np, ndev);
if (err)
- goto put;
+ goto flush;
rtnl_unlock();
return 0;
+flush:
+ skb_pool_flush(np);
put:
+ DEBUG_NET_WARN_ON_ONCE(np->dev);
+ if (ip_overwritten)
+ memset(&np->local_ip, 0, sizeof(np->local_ip));
netdev_put(ndev, &np->dev_tracker);
unlock:
rtnl_unlock();
@@ -794,13 +846,6 @@ unlock:
}
EXPORT_SYMBOL(netpoll_setup);
-static int __init netpoll_init(void)
-{
- skb_queue_head_init(&skb_pool);
- return 0;
-}
-core_initcall(netpoll_init);
-
static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
{
struct netpoll_info *npinfo =
@@ -826,8 +871,6 @@ void __netpoll_cleanup(struct netpoll *np)
if (!npinfo)
return;
- synchronize_srcu(&netpoll_srcu);
-
if (refcount_dec_and_test(&npinfo->refcnt)) {
const struct net_device_ops *ops;
@@ -839,6 +882,8 @@ void __netpoll_cleanup(struct netpoll *np)
call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info);
} else
RCU_INIT_POINTER(np->dev->npinfo, NULL);
+
+ skb_pool_flush(np);
}
EXPORT_SYMBOL_GPL(__netpoll_cleanup);
@@ -853,14 +898,20 @@ void __netpoll_free(struct netpoll *np)
}
EXPORT_SYMBOL_GPL(__netpoll_free);
+void do_netpoll_cleanup(struct netpoll *np)
+{
+ __netpoll_cleanup(np);
+ netdev_put(np->dev, &np->dev_tracker);
+ np->dev = NULL;
+}
+EXPORT_SYMBOL(do_netpoll_cleanup);
+
void netpoll_cleanup(struct netpoll *np)
{
rtnl_lock();
if (!np->dev)
goto out;
- __netpoll_cleanup(np);
- netdev_put(np->dev, &np->dev_tracker);
- np->dev = NULL;
+ do_netpoll_cleanup(np);
out:
rtnl_unlock();
}
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index dd364d738c00..ba7cf3e3c32f 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -5,12 +5,16 @@
* Copyright (C) 2016 Red Hat, Inc.
*/
+#include <linux/error-injection.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>
+#include <net/netdev_lock.h>
+#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
#include <net/xdp.h>
#include <linux/dma-direction.h>
@@ -23,8 +27,13 @@
#include <trace/events/page_pool.h>
+#include "dev.h"
+#include "mp_dmabuf_devmem.h"
+#include "netmem_priv.h"
#include "page_pool_priv.h"
+DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
+
#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)
@@ -123,9 +132,9 @@ int page_pool_ethtool_stats_get_count(void)
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
-u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
{
- struct page_pool_stats *pool_stats = stats;
+ const struct page_pool_stats *pool_stats = stats;
*data++ = pool_stats->alloc_stats.fast;
*data++ = pool_stats->alloc_stats.slow;
@@ -144,9 +153,9 @@ u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
EXPORT_SYMBOL(page_pool_ethtool_stats_get);
#else
-#define alloc_stat_inc(pool, __stat)
-#define recycle_stat_inc(pool, __stat)
-#define recycle_stat_add(pool, __stat, val)
+#define alloc_stat_inc(...) do { } while (0)
+#define recycle_stat_inc(...) do { } while (0)
+#define recycle_stat_add(...) do { } while (0)
#endif
static bool page_pool_producer_lock(struct page_pool *pool)
@@ -172,19 +181,33 @@ static void page_pool_producer_unlock(struct page_pool *pool,
spin_unlock_bh(&pool->ring.producer_lock);
}
+static void page_pool_struct_check(void)
+{
+ CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
+ CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
+ PAGE_POOL_FRAG_GROUP_ALIGN);
+}
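page_pool_struct_check() turns the intended cache-line grouping of the frag fields into compile-time assertions. A plain-C equivalent built from offsetof() and _Static_assert(); the struct and the 64-byte budget are illustrative, not the real page_pool layout.

#include <stdio.h>
#include <stddef.h>

struct frag_group {
	long frag_users;
	void *frag_page;
	unsigned int frag_offset;
};

/* fail the build if the whole group no longer fits one 64-byte cache line */
_Static_assert(offsetof(struct frag_group, frag_offset) +
	       sizeof(unsigned int) <= 64,
	       "frag group spills out of its cache line");

int main(void)
{
	printf("frag group ends at byte %zu of 64\n",
	       offsetof(struct frag_group, frag_offset) + sizeof(unsigned int));
	return 0;
}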
+
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params,
int cpuid)
{
unsigned int ring_qsize = 1024; /* Default */
+ struct netdev_rx_queue *rxq;
+ int err;
+
+ page_pool_struct_check();
memcpy(&pool->p, &params->fast, sizeof(pool->p));
memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
pool->cpuid = cpuid;
+ pool->dma_sync_for_cpu = true;
/* Validate only known flags were used */
- if (pool->p.flags & ~(PP_FLAG_ALL))
+ if (pool->slow.flags & ~PP_FLAG_ALL)
return -EINVAL;
if (pool->p.pool_size)
@@ -198,22 +221,26 @@ static int page_pool_init(struct page_pool *pool,
* DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
* which is the XDP_TX use-case.
*/
- if (pool->p.flags & PP_FLAG_DMA_MAP) {
+ if (pool->slow.flags & PP_FLAG_DMA_MAP) {
if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
(pool->p.dma_dir != DMA_BIDIRECTIONAL))
return -EINVAL;
+
+ pool->dma_map = true;
}
- if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
+ if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
/* In order to request DMA-sync-for-device the page
* needs to be mapped
*/
- if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
return -EINVAL;
if (!pool->p.max_len)
return -EINVAL;
+ pool->dma_sync = true;
+
/* pool->p.offset has to be set according to the address
* offset used by the DMA engine to start copying rx data
*/
@@ -222,7 +249,7 @@ static int page_pool_init(struct page_pool *pool,
pool->has_init_callback = !!pool->slow.init_callback;
#ifdef CONFIG_PAGE_POOL_STATS
- if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) {
+ if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
if (!pool->recycle_stats)
return -ENOMEM;
@@ -232,12 +259,13 @@ static int page_pool_init(struct page_pool *pool,
* (also percpu) page pool instance.
*/
pool->recycle_stats = &pp_system_recycle_stats;
+ pool->system = true;
}
#endif
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
- if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
+ if (!pool->system)
free_percpu(pool->recycle_stats);
#endif
return -ENOMEM;
@@ -248,21 +276,53 @@ static int page_pool_init(struct page_pool *pool,
/* Driver calling page_pool_create() also call page_pool_destroy() */
refcount_set(&pool->user_cnt, 1);
- if (pool->p.flags & PP_FLAG_DMA_MAP)
- get_device(pool->p.dev);
+ xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1);
+
+ if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
+ netdev_assert_locked(pool->slow.netdev);
+ rxq = __netif_get_rx_queue(pool->slow.netdev,
+ pool->slow.queue_idx);
+ pool->mp_priv = rxq->mp_params.mp_priv;
+ pool->mp_ops = rxq->mp_params.mp_ops;
+ }
+
+ if (pool->mp_ops) {
+ if (!pool->dma_map || !pool->dma_sync)
+ return -EOPNOTSUPP;
+
+ if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) {
+ err = -EFAULT;
+ goto free_ptr_ring;
+ }
+
+ err = pool->mp_ops->init(pool);
+ if (err) {
+ pr_warn("%s() mem-provider init failed %d\n", __func__,
+ err);
+ goto free_ptr_ring;
+ }
+
+ static_branch_inc(&page_pool_mem_providers);
+ }
return 0;
+
+free_ptr_ring:
+ ptr_ring_cleanup(&pool->ring, NULL);
+#ifdef CONFIG_PAGE_POOL_STATS
+ if (!pool->system)
+ free_percpu(pool->recycle_stats);
+#endif
+ return err;
}
static void page_pool_uninit(struct page_pool *pool)
{
ptr_ring_cleanup(&pool->ring, NULL);
-
- if (pool->p.flags & PP_FLAG_DMA_MAP)
- put_device(pool->p.dev);
+ xa_destroy(&pool->dma_mapped);
#ifdef CONFIG_PAGE_POOL_STATS
- if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
+ if (!pool->system)
free_percpu(pool->recycle_stats);
#endif
}
@@ -311,19 +371,18 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
}
EXPORT_SYMBOL(page_pool_create);
-static void page_pool_return_page(struct page_pool *pool, struct page *page);
+static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
-noinline
-static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
+static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
{
struct ptr_ring *r = &pool->ring;
- struct page *page;
+ netmem_ref netmem;
int pref_nid; /* preferred NUMA node */
/* Quicker fallback, avoid locks when ring is empty */
if (__ptr_ring_empty(r)) {
alloc_stat_inc(pool, empty);
- return NULL;
+ return 0;
}
/* Softirq guarantee CPU and thus NUMA node is stable. This,
@@ -338,118 +397,127 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
/* Refill alloc array, but only if NUMA match */
do {
- page = __ptr_ring_consume(r);
- if (unlikely(!page))
+ netmem = (__force netmem_ref)__ptr_ring_consume(r);
+ if (unlikely(!netmem))
break;
- if (likely(page_to_nid(page) == pref_nid)) {
- pool->alloc.cache[pool->alloc.count++] = page;
+ if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
+ pool->alloc.cache[pool->alloc.count++] = netmem;
} else {
/* NUMA mismatch;
* (1) release 1 page to page-allocator and
* (2) break out to fallthrough to alloc_pages_node.
* This limits stress on the page buddy allocator.
*/
- page_pool_return_page(pool, page);
+ page_pool_return_page(pool, netmem);
alloc_stat_inc(pool, waive);
- page = NULL;
+ netmem = 0;
break;
}
} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
/* Return last page */
if (likely(pool->alloc.count > 0)) {
- page = pool->alloc.cache[--pool->alloc.count];
+ netmem = pool->alloc.cache[--pool->alloc.count];
alloc_stat_inc(pool, refill);
}
- return page;
+ return netmem;
}
/* fast path */
-static struct page *__page_pool_get_cached(struct page_pool *pool)
+static netmem_ref __page_pool_get_cached(struct page_pool *pool)
{
- struct page *page;
+ netmem_ref netmem;
/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
if (likely(pool->alloc.count)) {
/* Fast-path */
- page = pool->alloc.cache[--pool->alloc.count];
+ netmem = pool->alloc.cache[--pool->alloc.count];
alloc_stat_inc(pool, fast);
} else {
- page = page_pool_refill_alloc_cache(pool);
+ netmem = page_pool_refill_alloc_cache(pool);
}
- return page;
+ return netmem;
}
-static void page_pool_dma_sync_for_device(struct page_pool *pool,
- struct page *page,
- unsigned int dma_sync_size)
+static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
+ netmem_ref netmem,
+ u32 dma_sync_size)
{
- dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
+ dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
dma_sync_size = min(dma_sync_size, pool->p.max_len);
- dma_sync_single_range_for_device(pool->p.dev, dma_addr,
- pool->p.offset, dma_sync_size,
- pool->p.dma_dir);
+ __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
+ dma_sync_size, pool->p.dma_dir);
+#endif
+}
+
+static __always_inline void
+page_pool_dma_sync_for_device(const struct page_pool *pool,
+ netmem_ref netmem,
+ u32 dma_sync_size)
+{
+ if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) {
+ rcu_read_lock();
+ /* re-check under rcu_read_lock() to sync with page_pool_scrub() */
+ if (pool->dma_sync)
+ __page_pool_dma_sync_for_device(pool, netmem,
+ dma_sync_size);
+ rcu_read_unlock();
+ }
}
-static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
+static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp)
{
dma_addr_t dma;
+ int err;
+ u32 id;
/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
* since dma_addr_t can be either 32 or 64 bits and does not always fit
* into page private data (i.e 32bit cpu with 64bit DMA caps)
* This mapping is kept for lifetime of page, until leaving pool.
*/
- dma = dma_map_page_attrs(pool->p.dev, page, 0,
- (PAGE_SIZE << pool->p.order),
- pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
- DMA_ATTR_WEAK_ORDERING);
+ dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
+ (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
+ DMA_ATTR_SKIP_CPU_SYNC |
+ DMA_ATTR_WEAK_ORDERING);
if (dma_mapping_error(pool->p.dev, dma))
return false;
- if (page_pool_set_dma_addr(page, dma))
+ if (page_pool_set_dma_addr_netmem(netmem, dma)) {
+ WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
goto unmap_failed;
+ }
+
+ if (in_softirq())
+ err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem),
+ PP_DMA_INDEX_LIMIT, gfp);
+ else
+ err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem),
+ PP_DMA_INDEX_LIMIT, gfp);
+ if (err) {
+ WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@");
+ goto unset_failed;
+ }
- if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
- page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
+ netmem_set_dma_index(netmem, id);
+ page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
return true;
+unset_failed:
+ page_pool_set_dma_addr_netmem(netmem, 0);
unmap_failed:
- WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
return false;
}
-static void page_pool_set_pp_info(struct page_pool *pool,
- struct page *page)
-{
- page->pp = pool;
- page->pp_magic |= PP_SIGNATURE;
-
- /* Ensuring all pages have been split into one fragment initially:
- * page_pool_set_pp_info() is only called once for every page when it
- * is allocated from the page allocator and page_pool_fragment_page()
- * is dirtying the same cache line as the page->pp_magic above, so
- * the overhead is negligible.
- */
- page_pool_fragment_page(page, 1);
- if (pool->has_init_callback)
- pool->slow.init_callback(page, pool->slow.init_arg);
-}
-
-static void page_pool_clear_pp_info(struct page *page)
-{
- page->pp_magic = 0;
- page->pp = NULL;
-}
-
static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
gfp_t gfp)
{
@@ -460,94 +528,102 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
if (unlikely(!page))
return NULL;
- if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
- unlikely(!page_pool_dma_map(pool, page))) {
+ if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) {
put_page(page);
return NULL;
}
alloc_stat_inc(pool, slow_high_order);
- page_pool_set_pp_info(pool, page);
+ page_pool_set_pp_info(pool, page_to_netmem(page));
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
- trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+ trace_page_pool_state_hold(pool, page_to_netmem(page),
+ pool->pages_state_hold_cnt);
return page;
}
/* slow path */
-noinline
-static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
- gfp_t gfp)
+static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
+ gfp_t gfp)
{
const int bulk = PP_ALLOC_CACHE_REFILL;
- unsigned int pp_flags = pool->p.flags;
unsigned int pp_order = pool->p.order;
- struct page *page;
+ bool dma_map = pool->dma_map;
+ netmem_ref netmem;
int i, nr_pages;
/* Don't support bulk alloc for high-order pages */
if (unlikely(pp_order))
- return __page_pool_alloc_page_order(pool, gfp);
+ return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
/* Unnecessary as alloc cache is empty, but guarantees zero count */
if (unlikely(pool->alloc.count > 0))
return pool->alloc.cache[--pool->alloc.count];
- /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
+ /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */
memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
- nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
- pool->alloc.cache);
+ nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk,
+ (struct page **)pool->alloc.cache);
if (unlikely(!nr_pages))
- return NULL;
+ return 0;
/* Pages have been filled into alloc.cache array, but count is zero and
* page element have not been (possibly) DMA mapped.
*/
for (i = 0; i < nr_pages; i++) {
- page = pool->alloc.cache[i];
- if ((pp_flags & PP_FLAG_DMA_MAP) &&
- unlikely(!page_pool_dma_map(pool, page))) {
- put_page(page);
+ netmem = pool->alloc.cache[i];
+ if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) {
+ put_page(netmem_to_page(netmem));
continue;
}
- page_pool_set_pp_info(pool, page);
- pool->alloc.cache[pool->alloc.count++] = page;
+ page_pool_set_pp_info(pool, netmem);
+ pool->alloc.cache[pool->alloc.count++] = netmem;
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
- trace_page_pool_state_hold(pool, page,
+ trace_page_pool_state_hold(pool, netmem,
pool->pages_state_hold_cnt);
}
/* Return last page */
if (likely(pool->alloc.count > 0)) {
- page = pool->alloc.cache[--pool->alloc.count];
+ netmem = pool->alloc.cache[--pool->alloc.count];
alloc_stat_inc(pool, slow);
} else {
- page = NULL;
+ netmem = 0;
}
/* A page just alloc'ed should/must have refcnt 1. */
- return page;
+ return netmem;
}
/* For using page_pool replace: alloc_pages() API calls, but provide
* synchronization guarantee for allocation side.
*/
-struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp)
{
- struct page *page;
+ netmem_ref netmem;
/* Fast-path: Get a page from cache */
- page = __page_pool_get_cached(pool);
- if (page)
- return page;
+ netmem = __page_pool_get_cached(pool);
+ if (netmem)
+ return netmem;
/* Slow-path: cache empty, do real allocation */
- page = __page_pool_alloc_pages_slow(pool, gfp);
- return page;
+ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
+ netmem = pool->mp_ops->alloc_netmems(pool, gfp);
+ else
+ netmem = __page_pool_alloc_pages_slow(pool, gfp);
+ return netmem;
+}
+EXPORT_SYMBOL(page_pool_alloc_netmems);
+ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL);
+
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+{
+ return netmem_to_page(page_pool_alloc_netmems(pool, gfp));
}
EXPORT_SYMBOL(page_pool_alloc_pages);
@@ -575,24 +651,60 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict)
return inflight;
}
-static __always_inline
-void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
{
+ netmem_set_pp(netmem, pool);
+ netmem_or_pp_magic(netmem, PP_SIGNATURE);
+
+ /* Ensuring all pages have been split into one fragment initially:
+ * page_pool_set_pp_info() is only called once for every page when it
+ * is allocated from the page allocator and page_pool_fragment_page()
+ * is dirtying the same cache line as the page->pp_magic above, so
+ * the overhead is negligible.
+ */
+ page_pool_fragment_netmem(netmem, 1);
+ if (pool->has_init_callback)
+ pool->slow.init_callback(netmem, pool->slow.init_arg);
+}
+
+void page_pool_clear_pp_info(netmem_ref netmem)
+{
+ netmem_clear_pp_magic(netmem);
+ netmem_set_pp(netmem, NULL);
+}
+
+static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
+ netmem_ref netmem)
+{
+ struct page *old, *page = netmem_to_page(netmem);
+ unsigned long id;
dma_addr_t dma;
- if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ if (!pool->dma_map)
/* Always account for inflight pages, even if we didn't
* map them
*/
return;
- dma = page_pool_get_dma_addr(page);
+ id = netmem_get_dma_index(netmem);
+ if (!id)
+ return;
+
+ if (in_softirq())
+ old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0);
+ else
+ old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0);
+ if (old != page)
+ return;
+
+ dma = page_pool_get_dma_addr_netmem(netmem);
/* When page is unmapped, it cannot be returned to our pool */
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
- page_pool_set_dma_addr(page, 0);
+ page_pool_set_dma_addr_netmem(netmem, 0);
+ netmem_set_dma_index(netmem, 0);
}
/* Disconnects a page (from a page_pool). API users can have a need
@@ -600,42 +712,45 @@ void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
* a regular page (that will eventually be returned to the normal
* page-allocator via put_page).
*/
-void page_pool_return_page(struct page_pool *pool, struct page *page)
+void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
{
int count;
+ bool put;
- __page_pool_release_page_dma(pool, page);
-
- page_pool_clear_pp_info(page);
+ put = true;
+ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
+ put = pool->mp_ops->release_netmem(pool, netmem);
+ else
+ __page_pool_release_page_dma(pool, netmem);
/* This may be the last page returned, releasing the pool, so
* it is not safe to reference pool afterwards.
*/
count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
- trace_page_pool_state_release(pool, page, count);
+ trace_page_pool_state_release(pool, netmem, count);
- put_page(page);
+ if (put) {
+ page_pool_clear_pp_info(netmem);
+ put_page(netmem_to_page(netmem));
+ }
/* An optimization would be to call __free_pages(page, pool->p.order)
* knowing page is not part of page-cache (thus avoiding a
* __page_cache_release() call).
*/
}
-static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
+static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
{
- int ret;
- /* BH protection not needed if current is softirq */
- if (in_softirq())
- ret = ptr_ring_produce(&pool->ring, page);
- else
- ret = ptr_ring_produce_bh(&pool->ring, page);
+ bool in_softirq, ret;
- if (!ret) {
+ /* BH protection not needed if current is softirq */
+ in_softirq = page_pool_producer_lock(pool);
+ ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem);
+ if (ret)
recycle_stat_inc(pool, ring);
- return true;
- }
+ page_pool_producer_unlock(pool, in_softirq);
- return false;
+ return ret;
}
/* Only allow direct recycling in special circumstances, into the
@@ -643,7 +758,7 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
*
* Caller must provide appropriate safe context.
*/
-static bool page_pool_recycle_in_cache(struct page *page,
+static bool page_pool_recycle_in_cache(netmem_ref netmem,
struct page_pool *pool)
{
if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
@@ -652,24 +767,26 @@ static bool page_pool_recycle_in_cache(struct page *page,
}
/* Caller MUST have verified/know (page_ref_count(page) == 1) */
- pool->alloc.cache[pool->alloc.count++] = page;
+ pool->alloc.cache[pool->alloc.count++] = netmem;
recycle_stat_inc(pool, cached);
return true;
}
-static bool __page_pool_page_can_be_recycled(const struct page *page)
+static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
{
- return page_ref_count(page) == 1 && !page_is_pfmemalloc(page);
+ return netmem_is_net_iov(netmem) ||
+ (page_ref_count(netmem_to_page(netmem)) == 1 &&
+ !page_is_pfmemalloc(netmem_to_page(netmem)));
}
/* If the page refcnt == 1, this will try to recycle the page.
- * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
+ * If pool->dma_sync is set, we'll try to sync the DMA area for
* the configured size min(dma_sync_size, pool->max_len).
* If the page refcnt != 1, then the page will be returned to memory
* subsystem.
*/
-static __always_inline struct page *
-__page_pool_put_page(struct page_pool *pool, struct page *page,
+static __always_inline netmem_ref
+__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
unsigned int dma_sync_size, bool allow_direct)
{
lockdep_assert_no_hardirq();
@@ -683,20 +800,18 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
* page is NOT reusable when allocated while the system is under
* some pressure. (page_is_pfmemalloc)
*/
- if (likely(__page_pool_page_can_be_recycled(page))) {
+ if (likely(__page_pool_page_can_be_recycled(netmem))) {
/* Read barrier done in page_ref_count / READ_ONCE */
- if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
- page_pool_dma_sync_for_device(pool, page,
- dma_sync_size);
+ page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
- if (allow_direct && in_softirq() &&
- page_pool_recycle_in_cache(page, pool))
- return NULL;
+ if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
+ return 0;
/* Page found as candidate for recycling */
- return page;
+ return netmem;
}
+
/* Fallback/non-XDP mode: API users have elevated refcnt.
*
* Many drivers split up the page into fragments, and some
@@ -711,174 +826,257 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
* will be invoking put_page.
*/
recycle_stat_inc(pool, released_refcnt);
- page_pool_return_page(pool, page);
+ page_pool_return_page(pool, netmem);
- return NULL;
+ return 0;
}
-void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
- unsigned int dma_sync_size, bool allow_direct)
+static bool page_pool_napi_local(const struct page_pool *pool)
{
- page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
- if (page && !page_pool_recycle_in_ring(pool, page)) {
+ const struct napi_struct *napi;
+ u32 cpuid;
+
+ /* On PREEMPT_RT the softirq can be preempted by the consumer */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return false;
+
+ if (unlikely(!in_softirq()))
+ return false;
+
+ /* Allow direct recycle if we have reasons to believe that we are
+ * in the same context as the consumer would run, so there's
+ * no possible race.
+ * __page_pool_put_page() makes sure we're not in hardirq context
+ * and interrupts are enabled prior to accessing the cache.
+ */
+ cpuid = smp_processor_id();
+ if (READ_ONCE(pool->cpuid) == cpuid)
+ return true;
+
+ napi = READ_ONCE(pool->p.napi);
+
+ return napi && READ_ONCE(napi->list_owner) == cpuid;
+}
+
+void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
+ unsigned int dma_sync_size, bool allow_direct)
+{
+ if (!allow_direct)
+ allow_direct = page_pool_napi_local(pool);
+
+ netmem = __page_pool_put_page(pool, netmem, dma_sync_size,
+ allow_direct);
+ if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
/* Cache full, fallback to free pages */
recycle_stat_inc(pool, ring_full);
- page_pool_return_page(pool, page);
+ page_pool_return_page(pool, netmem);
}
}
+EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
+
+void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
+{
+ page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
+ allow_direct);
+}
EXPORT_SYMBOL(page_pool_put_unrefed_page);
-/**
- * page_pool_put_page_bulk() - release references on multiple pages
- * @pool: pool from which pages were allocated
- * @data: array holding page pointers
- * @count: number of pages in @data
- *
- * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
- * producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
- * will release leftover pages to the page allocator.
- * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
- * completion loop for the XDP_REDIRECT use case.
- *
- * Please note the caller must not use data area after running
- * page_pool_put_page_bulk(), as this function overwrites it.
- */
-void page_pool_put_page_bulk(struct page_pool *pool, void **data,
- int count)
+static void page_pool_recycle_ring_bulk(struct page_pool *pool,
+ netmem_ref *bulk,
+ u32 bulk_len)
{
- int i, bulk_len = 0;
bool in_softirq;
+ u32 i;
- for (i = 0; i < count; i++) {
- struct page *page = virt_to_head_page(data[i]);
-
- /* It is not the last user for the page frag case */
- if (!page_pool_is_last_ref(page))
- continue;
-
- page = __page_pool_put_page(pool, page, -1, false);
- /* Approved for bulk recycling in ptr_ring cache */
- if (page)
- data[bulk_len++] = page;
- }
-
- if (unlikely(!bulk_len))
- return;
-
- /* Bulk producer into ptr_ring page_pool cache */
+ /* Bulk produce into ptr_ring page_pool cache */
in_softirq = page_pool_producer_lock(pool);
+
for (i = 0; i < bulk_len; i++) {
- if (__ptr_ring_produce(&pool->ring, data[i])) {
+ if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) {
/* ring full */
recycle_stat_inc(pool, ring_full);
break;
}
}
- recycle_stat_add(pool, ring, i);
+
page_pool_producer_unlock(pool, in_softirq);
+ recycle_stat_add(pool, ring, i);
- /* Hopefully all pages was return into ptr_ring */
+ /* Hopefully all pages were returned into ptr_ring */
if (likely(i == bulk_len))
return;
- /* ptr_ring cache full, free remaining pages outside producer lock
- * since put_page() with refcnt == 1 can be an expensive operation
+ /*
+ * ptr_ring cache is full, free remaining pages outside producer lock
+ * since put_page() with refcnt == 1 can be an expensive operation.
*/
for (; i < bulk_len; i++)
- page_pool_return_page(pool, data[i]);
+ page_pool_return_page(pool, bulk[i]);
}
-EXPORT_SYMBOL(page_pool_put_page_bulk);
-static struct page *page_pool_drain_frag(struct page_pool *pool,
- struct page *page)
+/**
+ * page_pool_put_netmem_bulk() - release references on multiple netmems
+ * @data: array holding netmem references
+ * @count: number of entries in @data
+ *
+ * Tries to refill a number of netmems into the ptr_ring cache while holding
+ * the ptr_ring producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk()
+ * will release leftover netmems to the memory provider.
+ * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx
+ * completion loop for the XDP_REDIRECT use case.
+ *
+ * Please note the caller must not use data area after running
+ * page_pool_put_netmem_bulk(), as this function overwrites it.
+ */
+void page_pool_put_netmem_bulk(netmem_ref *data, u32 count)
+{
+ u32 bulk_len = 0;
+
+ for (u32 i = 0; i < count; i++) {
+ netmem_ref netmem = netmem_compound_head(data[i]);
+
+ if (page_pool_unref_and_test(netmem))
+ data[bulk_len++] = netmem;
+ }
+
+ count = bulk_len;
+ while (count) {
+ netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
+ struct page_pool *pool = NULL;
+ bool allow_direct;
+ u32 foreign = 0;
+
+ bulk_len = 0;
+
+ for (u32 i = 0; i < count; i++) {
+ struct page_pool *netmem_pp;
+ netmem_ref netmem = data[i];
+
+ netmem_pp = netmem_get_pp(netmem);
+ if (unlikely(!pool)) {
+ pool = netmem_pp;
+ allow_direct = page_pool_napi_local(pool);
+ } else if (netmem_pp != pool) {
+ /*
+ * If the netmem belongs to a different
+ * page_pool, save it for another round.
+ */
+ data[foreign++] = netmem;
+ continue;
+ }
+
+ netmem = __page_pool_put_page(pool, netmem, -1,
+ allow_direct);
+ /* Approved for bulk recycling in ptr_ring cache */
+ if (netmem)
+ bulk[bulk_len++] = netmem;
+ }
+
+ if (bulk_len)
+ page_pool_recycle_ring_bulk(pool, bulk, bulk_len);
+
+ count = foreign;
+ }
+}
+EXPORT_SYMBOL(page_pool_put_netmem_bulk);
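page_pool_put_netmem_bulk() cannot assume every entry belongs to the same pool, so each pass recycles the entries owned by the first pool it sees and compacts the "foreign" ones to the front of the array for the next pass. A minimal sketch of that grouping loop with pools reduced to small integers; names are illustrative only.

#include <stdio.h>

struct item {
	int pool;	/* owning page_pool, netmem_get_pp() analogue */
	int val;
};

static void put_bulk(struct item *data, unsigned int count)
{
	while (count) {
		int pool = data[0].pool;
		unsigned int foreign = 0;

		for (unsigned int i = 0; i < count; i++) {
			if (data[i].pool != pool) {
				data[foreign++] = data[i];	/* next round */
				continue;
			}
			printf("pool %d <- item %d\n", pool, data[i].val);
		}
		count = foreign;
	}
}

int main(void)
{
	struct item items[] = {
		{ 1, 10 }, { 2, 20 }, { 1, 11 }, { 3, 30 }, { 2, 21 },
	};

	put_bulk(items, 5);
	return 0;
}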
+
+static netmem_ref page_pool_drain_frag(struct page_pool *pool,
+ netmem_ref netmem)
{
long drain_count = BIAS_MAX - pool->frag_users;
/* Some user is still using the page frag */
- if (likely(page_pool_unref_page(page, drain_count)))
- return NULL;
-
- if (__page_pool_page_can_be_recycled(page)) {
- if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
- page_pool_dma_sync_for_device(pool, page, -1);
+ if (likely(page_pool_unref_netmem(netmem, drain_count)))
+ return 0;
- return page;
+ if (__page_pool_page_can_be_recycled(netmem)) {
+ page_pool_dma_sync_for_device(pool, netmem, -1);
+ return netmem;
}
- page_pool_return_page(pool, page);
- return NULL;
+ page_pool_return_page(pool, netmem);
+ return 0;
}
static void page_pool_free_frag(struct page_pool *pool)
{
long drain_count = BIAS_MAX - pool->frag_users;
- struct page *page = pool->frag_page;
+ netmem_ref netmem = pool->frag_page;
- pool->frag_page = NULL;
+ pool->frag_page = 0;
- if (!page || page_pool_unref_page(page, drain_count))
+ if (!netmem || page_pool_unref_netmem(netmem, drain_count))
return;
- page_pool_return_page(pool, page);
+ page_pool_return_page(pool, netmem);
}
-struct page *page_pool_alloc_frag(struct page_pool *pool,
- unsigned int *offset,
- unsigned int size, gfp_t gfp)
+netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
+ unsigned int *offset, unsigned int size,
+ gfp_t gfp)
{
unsigned int max_size = PAGE_SIZE << pool->p.order;
- struct page *page = pool->frag_page;
+ netmem_ref netmem = pool->frag_page;
if (WARN_ON(size > max_size))
- return NULL;
+ return 0;
size = ALIGN(size, dma_get_cache_alignment());
*offset = pool->frag_offset;
- if (page && *offset + size > max_size) {
- page = page_pool_drain_frag(pool, page);
- if (page) {
+ if (netmem && *offset + size > max_size) {
+ netmem = page_pool_drain_frag(pool, netmem);
+ if (netmem) {
+ recycle_stat_inc(pool, cached);
alloc_stat_inc(pool, fast);
goto frag_reset;
}
}
- if (!page) {
- page = page_pool_alloc_pages(pool, gfp);
- if (unlikely(!page)) {
- pool->frag_page = NULL;
- return NULL;
+ if (!netmem) {
+ netmem = page_pool_alloc_netmems(pool, gfp);
+ if (unlikely(!netmem)) {
+ pool->frag_page = 0;
+ return 0;
}
- pool->frag_page = page;
+ pool->frag_page = netmem;
frag_reset:
pool->frag_users = 1;
*offset = 0;
pool->frag_offset = size;
- page_pool_fragment_page(page, BIAS_MAX);
- return page;
+ page_pool_fragment_netmem(netmem, BIAS_MAX);
+ return netmem;
}
pool->frag_users++;
pool->frag_offset = *offset + size;
- alloc_stat_inc(pool, fast);
- return page;
+ return netmem;
+}
+EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
+
+struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
+ unsigned int size, gfp_t gfp)
+{
+ return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
+ gfp));
}
EXPORT_SYMBOL(page_pool_alloc_frag);
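The frag allocator above carves aligned sub-allocations out of one page, tracking the running offset and a user count, and moves to a fresh page once the next request would overflow. A userspace sketch of that shape; malloc() stands in for the page allocator, the sizes and alignment are illustrative, and the real code drains remaining users before dropping a page.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SZ		4096
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

static struct {
	char *page;
	size_t offset;
	unsigned int users;
} frag;

static void *frag_alloc(size_t size, size_t *off)
{
	size = ALIGN_UP(size, 64);		/* dma_get_cache_alignment() */

	if (size > PAGE_SZ)
		return NULL;

	/* current page exhausted (or none yet): start a fresh one */
	if (!frag.page || frag.offset + size > PAGE_SZ) {
		free(frag.page);		/* demo only, see lead-in note */
		frag.page = malloc(PAGE_SZ);
		if (!frag.page)
			return NULL;
		frag.offset = 0;
		frag.users = 0;
	}

	*off = frag.offset;
	frag.offset += size;
	frag.users++;
	return frag.page + *off;
}

int main(void)
{
	size_t off;

	for (int i = 0; i < 3; i++) {
		frag_alloc(1500, &off);
		printf("frag %d at offset %zu (users=%u)\n", i, off, frag.users);
	}
	free(frag.page);
	return 0;
}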
static void page_pool_empty_ring(struct page_pool *pool)
{
- struct page *page;
+ netmem_ref netmem;
/* Empty recycle ring */
- while ((page = ptr_ring_consume_bh(&pool->ring))) {
+ while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
/* Verify the refcnt invariant of cached pages */
- if (!(page_ref_count(page) == 1))
+ if (!(netmem_ref_count(netmem) == 1))
pr_crit("%s() page_pool refcnt %d violation\n",
- __func__, page_ref_count(page));
+ __func__, netmem_ref_count(netmem));
- page_pool_return_page(pool, page);
+ page_pool_return_page(pool, netmem);
}
}
@@ -889,12 +1087,18 @@ static void __page_pool_destroy(struct page_pool *pool)
page_pool_unlist(pool);
page_pool_uninit(pool);
+
+ if (pool->mp_ops) {
+ pool->mp_ops->destroy(pool);
+ static_branch_dec(&page_pool_mem_providers);
+ }
+
kfree(pool);
}
static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
- struct page *page;
+ netmem_ref netmem;
if (pool->destroy_cnt)
return;
@@ -904,15 +1108,36 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
* call concurrently.
*/
while (pool->alloc.count) {
- page = pool->alloc.cache[--pool->alloc.count];
- page_pool_return_page(pool, page);
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ page_pool_return_page(pool, netmem);
}
}
static void page_pool_scrub(struct page_pool *pool)
{
+ unsigned long id;
+ void *ptr;
+
page_pool_empty_alloc_cache_once(pool);
- pool->destroy_cnt++;
+ if (!pool->destroy_cnt++ && pool->dma_map) {
+ if (pool->dma_sync) {
+ /* Disable page_pool_dma_sync_for_device() */
+ pool->dma_sync = false;
+
+ /* Make sure all concurrent returns that may see the old
+ * value of dma_sync (and thus perform a sync) have
+ * finished before doing the unmapping below. Skip the
+ * wait if the device doesn't actually need syncing, or
+ * if there are no outstanding mapped pages.
+ */
+ if (dma_dev_need_sync(pool->p.dev) &&
+ !xa_empty(&pool->dma_mapped))
+ synchronize_net();
+ }
+
+ xa_for_each(&pool->dma_mapped, id, ptr)
+ __page_pool_release_page_dma(pool, page_to_netmem(ptr));
+ }
/* No more consumers should exist, but producers could still
* be in-flight.
@@ -922,10 +1147,14 @@ static void page_pool_scrub(struct page_pool *pool)
static int page_pool_release(struct page_pool *pool)
{
+ bool in_softirq;
int inflight;
page_pool_scrub(pool);
inflight = page_pool_inflight(pool, true);
+ /* Acquire producer lock to make sure producers have exited. */
+ in_softirq = page_pool_producer_lock(pool);
+ page_pool_producer_unlock(pool, in_softirq);
if (!inflight)
__page_pool_destroy(pool);
@@ -940,7 +1169,13 @@ static void page_pool_release_retry(struct work_struct *wq)
int inflight;
inflight = page_pool_release(pool);
- if (!inflight)
+ /* In rare cases, a driver bug may cause inflight to go negative.
+ * Don't reschedule release if inflight is 0 or negative.
+ * - If 0, the page_pool has been destroyed.
+ * - If negative, we will never recover.
+ * In both cases no reschedule is necessary.
+ */
+ if (inflight <= 0)
return;
/* Periodic warning for page pools the user can't see */
@@ -959,31 +1194,30 @@ static void page_pool_release_retry(struct work_struct *wq)
}
void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
- struct xdp_mem_info *mem)
+ const struct xdp_mem_info *mem)
{
refcount_inc(&pool->user_cnt);
pool->disconnect = disconnect;
pool->xdp_mem_id = mem->id;
}
-static void page_pool_disable_direct_recycling(struct page_pool *pool)
+void page_pool_disable_direct_recycling(struct page_pool *pool)
{
/* Disable direct recycling based on pool->cpuid.
- * Paired with READ_ONCE() in napi_pp_put_page().
+ * Paired with READ_ONCE() in page_pool_napi_local().
*/
WRITE_ONCE(pool->cpuid, -1);
if (!pool->p.napi)
return;
- /* To avoid races with recycling and additional barriers make sure
- * pool and NAPI are unlinked when NAPI is disabled.
- */
- WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
- READ_ONCE(pool->p.napi->list_owner) != -1);
+ napi_assert_will_not_race(pool->p.napi);
+ mutex_lock(&page_pools_lock);
WRITE_ONCE(pool->p.napi, NULL);
+ mutex_unlock(&page_pools_lock);
}
+EXPORT_SYMBOL(page_pool_disable_direct_recycling);
void page_pool_destroy(struct page_pool *pool)
{
@@ -1011,15 +1245,43 @@ EXPORT_SYMBOL(page_pool_destroy);
/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
- struct page *page;
+ netmem_ref netmem;
trace_page_pool_update_nid(pool, new_nid);
pool->p.nid = new_nid;
/* Flush pool alloc cache, as refill will check NUMA node */
while (pool->alloc.count) {
- page = pool->alloc.cache[--pool->alloc.count];
- page_pool_return_page(pool, page);
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ page_pool_return_page(pool, netmem);
}
}
EXPORT_SYMBOL(page_pool_update_nid);
+
+bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr)
+{
+ return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr);
+}
+
+/* Associate a niov with a page pool. Should follow with a matching
+ * net_mp_niov_clear_page_pool()
+ */
+void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov)
+{
+ netmem_ref netmem = net_iov_to_netmem(niov);
+
+ page_pool_set_pp_info(pool, netmem);
+
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
+}
+
+/* Disassociate a niov from a page pool. Should only be used in the
+ * ->release_netmem() path.
+ */
+void net_mp_niov_clear_page_pool(struct net_iov *niov)
+{
+ netmem_ref netmem = net_iov_to_netmem(niov);
+
+ page_pool_clear_pp_info(netmem);
+}
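The frag allocator above is now netmem-based: page_pool_alloc_frag_netmem() hands out a netmem_ref and the legacy page_pool_alloc_frag() is reduced to a netmem_to_page() wrapper. A minimal driver-side sketch of the page-based wrapper follows; rxq_refill_one() and frag_len are hypothetical names, and it assumes a pool created with PP_FLAG_DMA_MAP and the usual helpers from <net/page_pool/helpers.h>.

#include <net/page_pool/helpers.h>

/* Hypothetical RX refill helper, not part of this patch. */
static int rxq_refill_one(struct page_pool *pool, dma_addr_t *dma,
                          unsigned int frag_len)
{
        unsigned int offset;
        struct page *page;

        /* Thin wrapper around page_pool_alloc_frag_netmem() shown above. */
        page = page_pool_alloc_frag(pool, &offset, frag_len, GFP_ATOMIC);
        if (!page)
                return -ENOMEM;

        /* DMA mapping was done by the pool (PP_FLAG_DMA_MAP assumed). */
        *dma = page_pool_get_dma_addr(page) + offset;
        return 0;
}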
diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h
index 90665d40f1eb..2fb06d5f6d55 100644
--- a/net/core/page_pool_priv.h
+++ b/net/core/page_pool_priv.h
@@ -3,10 +3,58 @@
#ifndef __PAGE_POOL_PRIV_H
#define __PAGE_POOL_PRIV_H
+#include <net/page_pool/helpers.h>
+
+#include "netmem_priv.h"
+
+extern struct mutex page_pools_lock;
+
s32 page_pool_inflight(const struct page_pool *pool, bool strict);
int page_pool_list(struct page_pool *pool);
void page_pool_detached(struct page_pool *pool);
void page_pool_unlist(struct page_pool *pool);
+static inline bool
+page_pool_set_dma_addr_netmem(netmem_ref netmem, dma_addr_t addr)
+{
+ if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
+ netmem_set_dma_addr(netmem, addr >> PAGE_SHIFT);
+
+ /* We assume page alignment to shave off bottom bits,
+ * if this "compression" doesn't work we need to drop.
+ */
+ return addr != (dma_addr_t)netmem_get_dma_addr(netmem)
+ << PAGE_SHIFT;
+ }
+
+ netmem_set_dma_addr(netmem, addr);
+ return false;
+}
+
+static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
+{
+ return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr);
+}
+
+#if defined(CONFIG_PAGE_POOL)
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem);
+void page_pool_clear_pp_info(netmem_ref netmem);
+int page_pool_check_memory_provider(struct net_device *dev,
+ struct netdev_rx_queue *rxq);
+#else
+static inline void page_pool_set_pp_info(struct page_pool *pool,
+ netmem_ref netmem)
+{
+}
+static inline void page_pool_clear_pp_info(netmem_ref netmem)
+{
+}
+static inline int page_pool_check_memory_provider(struct net_device *dev,
+ struct netdev_rx_queue *rxq)
+{
+ return 0;
+}
+#endif
+
#endif
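A worked illustration (my own, not from the patch) of the 32-bit-arch/64-bit-DMA "compression" that page_pool_set_dma_addr_netmem() guards against:

/*
 * With PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA the DMA address is stored
 * right-shifted by PAGE_SHIFT, so only page-aligned addresses survive
 * the round trip checked above (PAGE_SHIFT == 12 here):
 *
 *   addr = 0x123456000: stored 0x123456, read back 0x123456000 -> ok
 *   addr = 0x123456004: read back 0x123456000 != addr -> helper
 *     returns true and the caller must drop the page
 */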
diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
index 3a3277ba167b..c82a95beceff 100644
--- a/net/core/page_pool_user.c
+++ b/net/core/page_pool_user.c
@@ -3,19 +3,23 @@
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/xarray.h>
+#include <net/busy_poll.h>
#include <net/net_debug.h>
-#include <net/page_pool/types.h>
+#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/memory_provider.h>
#include <net/sock.h>
#include "page_pool_priv.h"
#include "netdev-genl-gen.h"
static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1);
-/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user.
+/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev,
+ * pool->user.
* Ordering: inside rtnl_lock
*/
-static DEFINE_MUTEX(page_pools_lock);
+DEFINE_MUTEX(page_pools_lock);
/* Page pools are only reachable from user space (via netlink) if they are
* linked to a netdev at creation time. Following page pool "visibility"
@@ -213,6 +217,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
const struct genl_info *info)
{
size_t inflight, refsz;
+ unsigned int napi_id;
void *hdr;
hdr = genlmsg_iput(rsp, info);
@@ -226,8 +231,10 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX,
pool->slow.netdev->ifindex))
goto err_cancel;
- if (pool->user.napi_id &&
- nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id))
+
+ napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0;
+ if (napi_id_valid(napi_id) &&
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id))
goto err_cancel;
inflight = page_pool_inflight(pool, false);
@@ -241,6 +248,9 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
pool->user.detach_time))
goto err_cancel;
+ if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL))
+ goto err_cancel;
+
genlmsg_end(rsp, hdr);
return 0;
@@ -313,8 +323,6 @@ int page_pool_list(struct page_pool *pool)
if (pool->slow.netdev) {
hlist_add_head(&pool->user.list,
&pool->slow.netdev->page_pools);
- pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0;
-
netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF);
}
@@ -344,6 +352,30 @@ void page_pool_unlist(struct page_pool *pool)
mutex_unlock(&page_pools_lock);
}
+int page_pool_check_memory_provider(struct net_device *dev,
+ struct netdev_rx_queue *rxq)
+{
+ void *binding = rxq->mp_params.mp_priv;
+ struct page_pool *pool;
+ struct hlist_node *n;
+
+ if (!binding)
+ return 0;
+
+ mutex_lock(&page_pools_lock);
+ hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) {
+ if (pool->mp_priv != binding)
+ continue;
+
+ if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) {
+ mutex_unlock(&page_pools_lock);
+ return 0;
+ }
+ }
+ mutex_unlock(&page_pools_lock);
+ return -ENODATA;
+}
+
static void page_pool_unreg_netdev_wipe(struct net_device *netdev)
{
struct page_pool *pool;
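page_pool_check_memory_provider() above succeeds only if some page pool on the device carries the queue's memory-provider binding at the matching queue index. A hedged sketch of a caller; the real caller lives outside this hunk and rxq_verify_binding() is a made-up name:

#include <net/netdev_rx_queue.h>
#include "page_pool_priv.h"

static int rxq_verify_binding(struct net_device *dev,
                              struct netdev_rx_queue *rxq)
{
        int err = page_pool_check_memory_provider(dev, rxq);

        /* -ENODATA means the driver restarted the queue without creating
         * a page pool backed by the requested memory provider.
         */
        if (err)
                netdev_warn(dev, "queue ignored its memory provider\n");
        return err;
}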
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index ea55a758a475..0ebe5461d4d9 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -69,7 +69,7 @@
*
* By design there should only be *one* "controlling" process. In practice
* multiple write accesses gives unpredictable result. Understood by "write"
- * to /proc gives result code thats should be read be the "writer".
+ * to /proc gives result code that should be read by the "writer".
* For practical use this should be no problem.
*
* Note when adding devices to a specific CPU there good idea to also assign
@@ -158,9 +158,7 @@
#include <net/udp.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
-#ifdef CONFIG_XFRM
#include <net/xfrm.h>
-#endif
#include <net/netns/generic.h>
#include <asm/byteorder.h>
#include <linux/rcupdate.h>
@@ -179,7 +177,7 @@
#define MAX_IMIX_ENTRIES 20
#define IMIX_PRECISION 100 /* Precision of IMIX distribution */
-#define func_enter() pr_debug("entering %s\n", __func__);
+#define func_enter() pr_debug("entering %s\n", __func__)
#define PKT_FLAGS \
pf(IPV6) /* Interface in IPV6 Mode */ \
@@ -229,12 +227,12 @@ static char *pkt_flag_names[] = {
/* Xmit modes */
#define M_START_XMIT 0 /* Default normal TX */
-#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
+#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */
/* If lock -- protects updating of if_list */
-#define if_lock(t) mutex_lock(&(t->if_lock));
-#define if_unlock(t) mutex_unlock(&(t->if_lock));
+#define if_lock(t) mutex_lock(&(t->if_lock))
+#define if_unlock(t) mutex_unlock(&(t->if_lock))
/* Used to help with determining the pkts on receive */
#define PKTGEN_MAGIC 0xbe9be955
@@ -285,7 +283,8 @@ struct pktgen_dev {
int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
int nfrags;
int removal_mark; /* non-zero => the device is marked for
- * removal by worker thread */
+ * removal by worker thread
+ */
struct page *page;
u64 delay; /* nano-seconds */
@@ -348,10 +347,12 @@ struct pktgen_dev {
__u16 udp_dst_max; /* exclusive, dest UDP port */
/* DSCP + ECN */
- __u8 tos; /* six MSB of (former) IPv4 TOS
- are for dscp codepoint */
- __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
- (see RFC 3260, sec. 4) */
+ __u8 tos; /* six MSB of (former) IPv4 TOS
+ * are for dscp codepoint
+ */
+ __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
+ * (see RFC 3260, sec. 4)
+ */
/* IMIX */
unsigned int n_imix_entries;
@@ -391,12 +392,12 @@ struct pktgen_dev {
__u8 hh[14];
/* = {
- 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
-
- We fill in SRC address later
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x08, 0x00
- };
+ * 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
+ *
+ * We fill in SRC address later
+ * 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ * 0x08, 0x00
+ * };
*/
__u16 pad; /* pad out the hh struct to an even 16 bytes */
@@ -460,7 +461,8 @@ struct pktgen_thread {
char result[512];
/* Field for thread to receive "posted" events terminate,
- stop ifs etc. */
+ * stop ifs etc.
+ */
u32 control;
int cpu;
@@ -474,8 +476,7 @@ struct pktgen_thread {
#define FIND 0
static const char version[] =
- "Packet Generator for packet performance testing. "
- "Version: " VERSION "\n";
+ "Packet Generator for packet performance testing. Version: " VERSION "\n";
static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i);
static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);
@@ -517,21 +518,23 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
char data[128];
+ size_t max;
struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id);
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- if (count == 0)
+ if (count < 1)
return -EINVAL;
- if (count > sizeof(data))
- count = sizeof(data);
-
- if (copy_from_user(data, buf, count))
+ max = min(count, sizeof(data) - 1);
+ if (copy_from_user(data, buf, max))
return -EFAULT;
- data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
+ if (data[max - 1] == '\n')
+ data[max - 1] = 0; /* strip trailing '\n', terminate string */
+ else
+ data[max] = 0; /* terminate string */
if (!strcmp(data, "stop"))
pktgen_stop_all_threads(pn);
@@ -624,8 +627,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
seq_printf(seq, "%pM\n", pkt_dev->dst_mac);
seq_printf(seq,
- " udp_src_min: %d udp_src_max: %d"
- " udp_dst_min: %d udp_dst_max: %d\n",
+ " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n",
pkt_dev->udp_src_min, pkt_dev->udp_src_max,
pkt_dev->udp_dst_min, pkt_dev->udp_dst_max);
@@ -744,34 +746,37 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
}
-static int hex32_arg(const char __user *user_buffer, unsigned long maxlen,
- __u32 *num)
+static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen,
+ __u32 *num)
{
- int i = 0;
+ size_t i = 0;
+
*num = 0;
for (; i < maxlen; i++) {
int value;
char c;
- *num <<= 4;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
value = hex_to_bin(c);
- if (value >= 0)
+ if (value >= 0) {
+ *num <<= 4;
*num |= value;
- else
+ } else {
break;
+ }
}
return i;
}
-static int count_trail_chars(const char __user * user_buffer,
- unsigned int maxlen)
+static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen)
{
- int i;
+ size_t i;
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
switch (c) {
@@ -790,14 +795,15 @@ done:
return i;
}
-static long num_arg(const char __user *user_buffer, unsigned long maxlen,
- unsigned long *num)
+static ssize_t num_arg(const char __user *user_buffer, size_t maxlen,
+ unsigned long *num)
{
- int i;
+ size_t i;
*num = 0;
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
if ((c >= '0') && (c <= '9')) {
@@ -809,12 +815,13 @@ static long num_arg(const char __user *user_buffer, unsigned long maxlen,
return i;
}
-static int strn_len(const char __user * user_buffer, unsigned int maxlen)
+static ssize_t strn_len(const char __user *user_buffer, size_t maxlen)
{
- int i;
+ size_t i;
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
switch (c) {
@@ -823,6 +830,7 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen)
case '\r':
case '\t':
case ' ':
+ case '=':
goto done_str;
default:
break;
@@ -838,11 +846,11 @@ done_str:
* "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example.
*/
static ssize_t get_imix_entries(const char __user *buffer,
+ size_t maxlen,
struct pktgen_dev *pkt_dev)
{
- const int max_digits = 10;
- int i = 0;
- long len;
+ size_t i = 0, max;
+ ssize_t len;
char c;
pkt_dev->n_imix_entries = 0;
@@ -851,21 +859,33 @@ static ssize_t get_imix_entries(const char __user *buffer,
unsigned long weight;
unsigned long size;
- len = num_arg(&buffer[i], max_digits, &size);
+ if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES)
+ return -E2BIG;
+
+ if (i >= maxlen)
+ return -EINVAL;
+
+ max = min(10, maxlen - i);
+ len = num_arg(&buffer[i], max, &size);
if (len < 0)
return len;
i += len;
+ if (i >= maxlen)
+ return -EINVAL;
if (get_user(c, &buffer[i]))
return -EFAULT;
/* Check for comma between size_i and weight_i */
if (c != ',')
return -EINVAL;
i++;
+ if (i >= maxlen)
+ return -EINVAL;
if (size < 14 + 20 + 8)
size = 14 + 20 + 8;
- len = num_arg(&buffer[i], max_digits, &weight);
+ max = min(10, maxlen - i);
+ len = num_arg(&buffer[i], max, &weight);
if (len < 0)
return len;
if (weight <= 0)
@@ -875,42 +895,55 @@ static ssize_t get_imix_entries(const char __user *buffer,
pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight;
i += len;
+ pkt_dev->n_imix_entries++;
+
+ if (i >= maxlen)
+ break;
if (get_user(c, &buffer[i]))
return -EFAULT;
-
i++;
- pkt_dev->n_imix_entries++;
-
- if (pkt_dev->n_imix_entries > MAX_IMIX_ENTRIES)
- return -E2BIG;
} while (c == ' ');
return i;
}
-static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
+static ssize_t get_labels(const char __user *buffer,
+ size_t maxlen, struct pktgen_dev *pkt_dev)
{
unsigned int n = 0;
+ size_t i = 0, max;
+ ssize_t len;
char c;
- ssize_t i = 0;
- int len;
pkt_dev->nr_labels = 0;
do {
__u32 tmp;
- len = hex32_arg(&buffer[i], 8, &tmp);
- if (len <= 0)
+
+ if (n >= MAX_MPLS_LABELS)
+ return -E2BIG;
+
+ if (i >= maxlen)
+ return -EINVAL;
+
+ max = min(8, maxlen - i);
+ len = hex32_arg(&buffer[i], max, &tmp);
+ if (len < 0)
return len;
+
+ /* return empty list in case of invalid input or zero value */
+ if (len == 0 || tmp == 0)
+ return maxlen;
+
pkt_dev->labels[n] = htonl(tmp);
if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM)
pkt_dev->flags |= F_MPLS_RND;
i += len;
+ n++;
+ if (i >= maxlen)
+ break;
if (get_user(c, &buffer[i]))
return -EFAULT;
i++;
- n++;
- if (n >= MAX_MPLS_LABELS)
- return -E2BIG;
} while (c == ',');
pkt_dev->nr_labels = n;
@@ -947,16 +980,16 @@ static __u32 pktgen_read_flag(const char *f, bool *disable)
}
static ssize_t pktgen_if_write(struct file *file,
- const char __user * user_buffer, size_t count,
- loff_t * offset)
+ const char __user *user_buffer, size_t count,
+ loff_t *offset)
{
struct seq_file *seq = file->private_data;
struct pktgen_dev *pkt_dev = seq->private;
- int i, max, len;
+ size_t i, max;
+ ssize_t len;
char name[16], valstr[32];
unsigned long value = 0;
char *pg_result = NULL;
- int tmp = 0;
char buf[128];
pg_result = &(pkt_dev->result[0]);
@@ -967,16 +1000,16 @@ static ssize_t pktgen_if_write(struct file *file,
}
max = count;
- tmp = count_trail_chars(user_buffer, max);
- if (tmp < 0) {
+ len = count_trail_chars(user_buffer, max);
+ if (len < 0) {
pr_warn("illegal format\n");
- return tmp;
+ return len;
}
- i = tmp;
+ i = len;
/* Read variable name */
-
- len = strn_len(&user_buffer[i], sizeof(name) - 1);
+ max = min(sizeof(name) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1004,11 +1037,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "min_pkt_size")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value < 14 + 20 + 8)
value = 14 + 20 + 8;
if (value != pkt_dev->min_pkt_size) {
@@ -1021,11 +1054,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "max_pkt_size")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value < 14 + 20 + 8)
value = 14 + 20 + 8;
if (value != pkt_dev->max_pkt_size) {
@@ -1040,11 +1073,11 @@ static ssize_t pktgen_if_write(struct file *file,
/* Shortcut for min = max */
if (!strcmp(name, "pkt_size")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value < 14 + 20 + 8)
value = 14 + 20 + 8;
if (value != pkt_dev->min_pkt_size) {
@@ -1060,43 +1093,43 @@ static ssize_t pktgen_if_write(struct file *file,
if (pkt_dev->clone_skb > 0)
return -EINVAL;
- len = get_imix_entries(&user_buffer[i], pkt_dev);
+ max = count - i;
+ len = get_imix_entries(&user_buffer[i], max, pkt_dev);
if (len < 0)
return len;
fill_imix_distribution(pkt_dev);
- i += len;
return count;
}
if (!strcmp(name, "debug")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
debug = value;
sprintf(pg_result, "OK: debug=%u", debug);
return count;
}
if (!strcmp(name, "frags")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->nfrags = value;
sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags);
return count;
}
if (!strcmp(name, "delay")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value == 0x7FFFFFFF)
pkt_dev->delay = ULLONG_MAX;
else
@@ -1107,13 +1140,13 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "rate")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (!value)
- return len;
+ return -EINVAL;
pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value;
if (debug)
pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
@@ -1122,13 +1155,13 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "ratep")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (!value)
- return len;
+ return -EINVAL;
pkt_dev->delay = NSEC_PER_SEC/value;
if (debug)
pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
@@ -1137,11 +1170,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_src_min")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value != pkt_dev->udp_src_min) {
pkt_dev->udp_src_min = value;
pkt_dev->cur_udp_src = value;
@@ -1150,11 +1183,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_dst_min")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value != pkt_dev->udp_dst_min) {
pkt_dev->udp_dst_min = value;
pkt_dev->cur_udp_dst = value;
@@ -1163,11 +1196,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_src_max")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value != pkt_dev->udp_src_max) {
pkt_dev->udp_src_max = value;
pkt_dev->cur_udp_src = value;
@@ -1176,11 +1209,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_dst_max")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value != pkt_dev->udp_dst_max) {
pkt_dev->udp_dst_max = value;
pkt_dev->cur_udp_dst = value;
@@ -1189,7 +1222,8 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "clone_skb")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
/* clone_skb is not supported for netif_receive xmit_mode and
@@ -1198,34 +1232,33 @@ static ssize_t pktgen_if_write(struct file *file,
if ((value > 0) &&
((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (value > 0 && (pkt_dev->n_imix_entries > 0 ||
!(pkt_dev->flags & F_SHARED)))
return -EINVAL;
- i += len;
pkt_dev->clone_skb = value;
sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb);
return count;
}
if (!strcmp(name, "count")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->count = value;
sprintf(pg_result, "OK: count=%llu",
(unsigned long long)pkt_dev->count);
return count;
}
if (!strcmp(name, "src_mac_count")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (pkt_dev->src_mac_count != value) {
pkt_dev->src_mac_count = value;
pkt_dev->cur_src_mac_offset = 0;
@@ -1235,11 +1268,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "dst_mac_count")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (pkt_dev->dst_mac_count != value) {
pkt_dev->dst_mac_count = value;
pkt_dev->cur_dst_mac_offset = 0;
@@ -1249,16 +1282,16 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "burst")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value > 1) &&
((pkt_dev->xmit_mode == M_QUEUE_XMIT) ||
((pkt_dev->xmit_mode == M_START_XMIT) &&
(!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (value > 1 && !(pkt_dev->flags & F_SHARED))
return -EINVAL;
@@ -1268,12 +1301,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "node")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
-
if (node_possible(value)) {
pkt_dev->node = value;
sprintf(pg_result, "OK: node=%d", pkt_dev->node);
@@ -1281,29 +1313,29 @@ static ssize_t pktgen_if_write(struct file *file,
put_page(pkt_dev->page);
pkt_dev->page = NULL;
}
- }
- else
+ } else {
sprintf(pg_result, "ERROR: node not possible");
+ }
return count;
}
if (!strcmp(name, "xmit_mode")) {
char f[32];
- memset(f, 0, 32);
- len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ max = min(sizeof(f) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
+ memset(f, 0, sizeof(f));
if (copy_from_user(f, &user_buffer[i], len))
return -EFAULT;
- i += len;
if (strcmp(f, "start_xmit") == 0) {
pkt_dev->xmit_mode = M_START_XMIT;
} else if (strcmp(f, "netif_receive") == 0) {
/* clone_skb set earlier, not supported in this mode */
if (pkt_dev->clone_skb > 0)
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
pkt_dev->xmit_mode = M_NETIF_RECEIVE;
@@ -1329,14 +1361,14 @@ static ssize_t pktgen_if_write(struct file *file,
char f[32];
char *end;
- memset(f, 0, 32);
- len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ max = min(sizeof(f) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
+ memset(f, 0, 32);
if (copy_from_user(f, &user_buffer[i], len))
return -EFAULT;
- i += len;
flag = pktgen_read_flag(f, &disable);
if (flag) {
@@ -1378,7 +1410,8 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1);
+ max = min(sizeof(pkt_dev->dst_min) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1386,19 +1419,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_min) != 0) {
- memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
- strcpy(pkt_dev->dst_min, buf);
+ strscpy_pad(pkt_dev->dst_min, buf);
pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
pkt_dev->cur_daddr = pkt_dev->daddr_min;
}
if (debug)
pr_debug("dst_min set to: %s\n", pkt_dev->dst_min);
- i += len;
+
sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min);
return count;
}
if (!strcmp(name, "dst_max")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1);
+ max = min(sizeof(pkt_dev->dst_max) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1406,19 +1439,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_max) != 0) {
- memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
- strcpy(pkt_dev->dst_max, buf);
+ strscpy_pad(pkt_dev->dst_max, buf);
pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
pkt_dev->cur_daddr = pkt_dev->daddr_max;
}
if (debug)
pr_debug("dst_max set to: %s\n", pkt_dev->dst_max);
- i += len;
+
sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max);
return count;
}
if (!strcmp(name, "dst6")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1436,12 +1469,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (debug)
pr_debug("dst6 set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: dst6=%s", buf);
return count;
}
if (!strcmp(name, "dst6_min")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1458,12 +1491,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (debug)
pr_debug("dst6_min set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: dst6_min=%s", buf);
return count;
}
if (!strcmp(name, "dst6_max")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1479,12 +1512,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (debug)
pr_debug("dst6_max set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: dst6_max=%s", buf);
return count;
}
if (!strcmp(name, "src6")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1502,12 +1535,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (debug)
pr_debug("src6 set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: src6=%s", buf);
return count;
}
if (!strcmp(name, "src_min")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1);
+ max = min(sizeof(pkt_dev->src_min) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1515,19 +1548,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_min) != 0) {
- memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
- strcpy(pkt_dev->src_min, buf);
+ strscpy_pad(pkt_dev->src_min, buf);
pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
pkt_dev->cur_saddr = pkt_dev->saddr_min;
}
if (debug)
pr_debug("src_min set to: %s\n", pkt_dev->src_min);
- i += len;
+
sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min);
return count;
}
if (!strcmp(name, "src_max")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1);
+ max = min(sizeof(pkt_dev->src_max) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1535,19 +1568,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_max) != 0) {
- memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
- strcpy(pkt_dev->src_max, buf);
+ strscpy_pad(pkt_dev->src_max, buf);
pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
pkt_dev->cur_saddr = pkt_dev->saddr_max;
}
if (debug)
pr_debug("src_max set to: %s\n", pkt_dev->src_max);
- i += len;
+
sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max);
return count;
}
if (!strcmp(name, "dst_mac")) {
- len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+ max = min(sizeof(valstr) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1564,7 +1597,8 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "src_mac")) {
- len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+ max = min(sizeof(valstr) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1588,11 +1622,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "flows")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value > MAX_CFLOWS)
value = MAX_CFLOWS;
@@ -1602,44 +1636,44 @@ static ssize_t pktgen_if_write(struct file *file,
}
#ifdef CONFIG_XFRM
if (!strcmp(name, "spi")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->spi = value;
sprintf(pg_result, "OK: spi=%u", pkt_dev->spi);
return count;
}
#endif
if (!strcmp(name, "flowlen")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->lflow = value;
sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow);
return count;
}
if (!strcmp(name, "queue_map_min")) {
- len = num_arg(&user_buffer[i], 5, &value);
+ max = min(5, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->queue_map_min = value;
sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min);
return count;
}
if (!strcmp(name, "queue_map_max")) {
- len = num_arg(&user_buffer[i], 5, &value);
+ max = min(5, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->queue_map_max = value;
sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max);
return count;
@@ -1648,10 +1682,11 @@ static ssize_t pktgen_if_write(struct file *file,
if (!strcmp(name, "mpls")) {
unsigned int n, cnt;
- len = get_labels(&user_buffer[i], pkt_dev);
+ max = count - i;
+ len = get_labels(&user_buffer[i], max, pkt_dev);
if (len < 0)
return len;
- i += len;
+
cnt = sprintf(pg_result, "OK: mpls=");
for (n = 0; n < pkt_dev->nr_labels; n++)
cnt += sprintf(pg_result + cnt,
@@ -1669,11 +1704,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "vlan_id")) {
- len = num_arg(&user_buffer[i], 4, &value);
+ max = min(4, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value <= 4095) {
pkt_dev->vlan_id = value; /* turn on VLAN */
@@ -1696,11 +1731,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "vlan_p")) {
- len = num_arg(&user_buffer[i], 1, &value);
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) {
pkt_dev->vlan_p = value;
sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p);
@@ -1711,11 +1746,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "vlan_cfi")) {
- len = num_arg(&user_buffer[i], 1, &value);
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) {
pkt_dev->vlan_cfi = value;
sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi);
@@ -1726,11 +1761,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "svlan_id")) {
- len = num_arg(&user_buffer[i], 4, &value);
+ max = min(4, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) {
pkt_dev->svlan_id = value; /* turn on SVLAN */
@@ -1753,11 +1788,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "svlan_p")) {
- len = num_arg(&user_buffer[i], 1, &value);
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) {
pkt_dev->svlan_p = value;
sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p);
@@ -1768,11 +1803,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "svlan_cfi")) {
- len = num_arg(&user_buffer[i], 1, &value);
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) {
pkt_dev->svlan_cfi = value;
sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi);
@@ -1783,12 +1818,13 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "tos")) {
- __u32 tmp_value = 0;
- len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ __u32 tmp_value;
+
+ max = min(2, count - i);
+ len = hex32_arg(&user_buffer[i], max, &tmp_value);
if (len < 0)
return len;
- i += len;
if (len == 2) {
pkt_dev->tos = tmp_value;
sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos);
@@ -1799,12 +1835,13 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "traffic_class")) {
- __u32 tmp_value = 0;
- len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ __u32 tmp_value;
+
+ max = min(2, count - i);
+ len = hex32_arg(&user_buffer[i], max, &tmp_value);
if (len < 0)
return len;
- i += len;
if (len == 2) {
pkt_dev->traffic_class = tmp_value;
sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class);
@@ -1815,11 +1852,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "skb_priority")) {
- len = num_arg(&user_buffer[i], 9, &value);
+ max = min(9, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->skb_priority = value;
sprintf(pg_result, "OK: skb_priority=%i",
pkt_dev->skb_priority);
@@ -1874,12 +1911,13 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)
}
static ssize_t pktgen_thread_write(struct file *file,
- const char __user * user_buffer,
- size_t count, loff_t * offset)
+ const char __user *user_buffer,
+ size_t count, loff_t *offset)
{
struct seq_file *seq = file->private_data;
struct pktgen_thread *t = seq->private;
- int i, max, len, ret;
+ size_t i, max;
+ ssize_t len, ret;
char name[40];
char *pg_result;
@@ -1896,8 +1934,8 @@ static ssize_t pktgen_thread_write(struct file *file,
i = len;
/* Read variable name */
-
- len = strn_len(&user_buffer[i], sizeof(name) - 1);
+ max = min(sizeof(name) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1926,15 +1964,17 @@ static ssize_t pktgen_thread_write(struct file *file,
if (!strcmp(name, "add_device")) {
char f[32];
+
memset(f, 0, 32);
- len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ max = min(sizeof(f) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0) {
ret = len;
goto out;
}
if (copy_from_user(f, &user_buffer[i], len))
return -EFAULT;
- i += len;
+
mutex_lock(&pktgen_thread_lock);
ret = pktgen_add_device(t, f);
mutex_unlock(&pktgen_thread_lock);
@@ -2285,7 +2325,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
s64 remaining;
struct hrtimer_sleeper t;
- hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
hrtimer_set_expires(&t.timer, spin_until);
remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
@@ -2358,24 +2398,25 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)
}
-#ifdef CONFIG_XFRM
/* If there was already an IPSEC SA, we keep it as is, else
* we go look for it ...
-*/
+ */
#define DUMMY_MARK 0
static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
{
+#ifdef CONFIG_XFRM
struct xfrm_state *x = pkt_dev->flows[flow].x;
struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id);
+
if (!x) {
if (pkt_dev->spi) {
/* We need as quick as possible to find the right SA
- * Searching with minimum criteria to archieve this.
+ * Searching with minimum criteria to achieve this.
*/
x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
} else {
- /* slow path: we dont already have xfrm_state */
+ /* slow path: we don't already have xfrm_state */
x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0,
(xfrm_address_t *)&pkt_dev->cur_daddr,
(xfrm_address_t *)&pkt_dev->cur_saddr,
@@ -2390,16 +2431,16 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
}
}
-}
#endif
+}
static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
{
-
if (pkt_dev->flags & F_QUEUE_MAP_CPU)
pkt_dev->cur_queue_map = smp_processor_id();
else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
__u16 t;
+
if (pkt_dev->flags & F_QUEUE_MAP_RND) {
t = get_random_u32_inclusive(pkt_dev->queue_map_min,
pkt_dev->queue_map_max);
@@ -2481,6 +2522,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->flags & F_MPLS_RND) {
unsigned int i;
+
for (i = 0; i < pkt_dev->nr_labels; i++)
if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
@@ -2525,6 +2567,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
imx = ntohl(pkt_dev->saddr_max);
if (imn < imx) {
__u32 t;
+
if (pkt_dev->flags & F_IPSRC_RND)
t = get_random_u32_inclusive(imn, imx - 1);
else {
@@ -2545,6 +2588,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (imn < imx) {
__u32 t;
__be32 s;
+
if (pkt_dev->flags & F_IPDST_RND) {
do {
@@ -2569,10 +2613,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
pkt_dev->flows[flow].flags |= F_INIT;
pkt_dev->flows[flow].cur_daddr =
pkt_dev->cur_daddr;
-#ifdef CONFIG_XFRM
if (pkt_dev->flags & F_IPSEC)
get_ipsec_sa(pkt_dev, flow);
-#endif
pkt_dev->nflows++;
}
}
@@ -2594,6 +2636,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
__u32 t;
+
if (pkt_dev->flags & F_TXSIZE_RND) {
t = get_random_u32_inclusive(pkt_dev->min_pkt_size,
pkt_dev->max_pkt_size - 1);
@@ -2660,7 +2703,8 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
if (!x)
return 0;
/* XXX: we dont support tunnel mode for now until
- * we resolve the dst issue */
+ * we resolve the dst issue
+ */
if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0))
return 0;
@@ -2695,8 +2739,10 @@ static void free_SAs(struct pktgen_dev *pkt_dev)
if (pkt_dev->cflows) {
/* let go of the SAs if we have them */
int i;
+
for (i = 0; i < pkt_dev->cflows; i++) {
struct xfrm_state *x = pkt_dev->flows[i].x;
+
if (x) {
xfrm_state_put(x);
pkt_dev->flows[i].x = NULL;
@@ -2711,6 +2757,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
if (pkt_dev->flags & F_IPSEC) {
struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
int nhead = 0;
+
if (x) {
struct ethhdr *eth;
struct iphdr *iph;
@@ -2754,6 +2801,7 @@ err:
static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev)
{
unsigned int i;
+
for (i = 0; i < pkt_dev->nr_labels; i++)
*mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM;
@@ -2866,7 +2914,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
skb->dev = dev;
}
} else {
- skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
+ skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
}
/* the caller pre-fetches from skb->data and reserves for the mac hdr */
@@ -2947,7 +2995,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
skb->priority = pkt_dev->skb_priority;
memcpy(eth, pkt_dev->hh, 12);
- *(__be16 *) & eth[12] = protocol;
+ *(__be16 *)&eth[12] = protocol;
/* Eth + IPh + UDPh + mpls */
datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -
@@ -3176,11 +3224,11 @@ static void pktgen_run(struct pktgen_thread *t)
set_pkt_overhead(pkt_dev);
- strcpy(pkt_dev->result, "Starting");
+ strscpy(pkt_dev->result, "Starting");
pkt_dev->running = 1; /* Cranke yeself! */
started++;
} else
- strcpy(pkt_dev->result, "Error starting");
+ strscpy(pkt_dev->result, "Error starting");
}
rcu_read_unlock();
if (started)
@@ -3439,6 +3487,7 @@ static void pktgen_rem_thread(struct pktgen_thread *t)
static void pktgen_resched(struct pktgen_dev *pkt_dev)
{
ktime_t idle_start = ktime_get();
+
schedule();
pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
}
@@ -3654,7 +3703,7 @@ static int pktgen_thread_worker(void *arg)
struct pktgen_dev *pkt_dev = NULL;
int cpu = t->cpu;
- WARN_ON(smp_processor_id() != cpu);
+ WARN_ON_ONCE(smp_processor_id() != cpu);
init_waitqueue_head(&t->queue);
complete(&t->start_done);
@@ -3754,7 +3803,8 @@ static int add_dev_to_thread(struct pktgen_thread *t,
* userspace on another CPU than the kthread. The if_lock()
* is used here to sync with concurrent instances of
* _rem_dev_from_if_list() invoked via kthread, which is also
- * updating the if_list */
+ * updating the if_list
+ */
if_lock(t);
if (pkt_dev->pg_thread) {
@@ -3792,7 +3842,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
if (!pkt_dev)
return -ENOMEM;
- strcpy(pkt_dev->odevname, ifname);
+ strscpy(pkt_dev->odevname, ifname);
pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS,
sizeof(struct flow_state)),
node);
@@ -3838,8 +3888,8 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;
pkt_dev->ipsproto = IPPROTO_ESP;
- /* xfrm tunnel mode needs additional dst to extract outter
- * ip header protocol/ttl/id field, here creat a phony one.
+ /* xfrm tunnel mode needs additional dst to extract outer
+ * ip header protocol/ttl/id field, here create a phony one.
* instead of looking for a valid rt, which definitely hurting
* performance under such circumstance.
*/
@@ -3883,17 +3933,14 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
list_add_tail(&t->th_list, &pn->pktgen_threads);
init_completion(&t->start_done);
- p = kthread_create_on_node(pktgen_thread_worker,
- t,
- cpu_to_node(cpu),
- "kpktgend_%d", cpu);
+ p = kthread_create_on_cpu(pktgen_thread_worker, t, cpu, "kpktgend_%d");
if (IS_ERR(p)) {
pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu);
list_del(&t->th_list);
kfree(t);
return PTR_ERR(p);
}
- kthread_bind(p, cpu);
+
t->tsk = p;
pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,
@@ -3952,7 +3999,8 @@ static int pktgen_remove_device(struct pktgen_thread *t,
/* Remove proc before if_list entry, because add_device uses
* list to determine if interface already exist, avoid race
- * with proc_create_data() */
+ * with proc_create_data()
+ */
proc_remove(pkt_dev->entry);
/* And update the thread if_list */
@@ -3989,6 +4037,7 @@ static int __net_init pg_net_init(struct net *net)
goto remove;
}
+ cpus_read_lock();
for_each_online_cpu(cpu) {
int err;
@@ -3997,6 +4046,7 @@ static int __net_init pg_net_init(struct net *net)
pr_warn("Cannot create thread for cpu %d (%d)\n",
cpu, err);
}
+ cpus_read_unlock();
if (list_empty(&pn->pktgen_threads)) {
pr_err("Initialization failed for all threads\n");
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a3d7847ce69d..c57692eb8da9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -53,6 +53,7 @@
#include <net/fib_rules.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
+#include <net/netdev_lock.h>
#include <net/devlink.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/addrconf.h>
@@ -80,11 +81,15 @@ void rtnl_lock(void)
}
EXPORT_SYMBOL(rtnl_lock);
+int rtnl_lock_interruptible(void)
+{
+ return mutex_lock_interruptible(&rtnl_mutex);
+}
+
int rtnl_lock_killable(void)
{
return mutex_lock_killable(&rtnl_mutex);
}
-EXPORT_SYMBOL(rtnl_lock_killable);
static struct sk_buff *defer_kfree_skb_list;
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
@@ -179,6 +184,176 @@ bool lockdep_rtnl_is_held(void)
EXPORT_SYMBOL(lockdep_rtnl_is_held);
#endif /* #ifdef CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+void __rtnl_net_lock(struct net *net)
+{
+ ASSERT_RTNL();
+
+ mutex_lock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_lock);
+
+void __rtnl_net_unlock(struct net *net)
+{
+ ASSERT_RTNL();
+
+ mutex_unlock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_unlock);
+
+void rtnl_net_lock(struct net *net)
+{
+ rtnl_lock();
+ __rtnl_net_lock(net);
+}
+EXPORT_SYMBOL(rtnl_net_lock);
+
+void rtnl_net_unlock(struct net *net)
+{
+ __rtnl_net_unlock(net);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(rtnl_net_unlock);
+
+int rtnl_net_trylock(struct net *net)
+{
+ int ret = rtnl_trylock();
+
+ if (ret)
+ __rtnl_net_lock(net);
+
+ return ret;
+}
+EXPORT_SYMBOL(rtnl_net_trylock);
+
+int rtnl_net_lock_killable(struct net *net)
+{
+ int ret = rtnl_lock_killable();
+
+ if (!ret)
+ __rtnl_net_lock(net);
+
+ return ret;
+}
+
+static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
+{
+ if (net_eq(net_a, net_b))
+ return 0;
+
+ /* always init_net first */
+ if (net_eq(net_a, &init_net))
+ return -1;
+
+ if (net_eq(net_b, &init_net))
+ return 1;
+
+ /* otherwise lock in ascending order */
+ return net_a < net_b ? -1 : 1;
+}
+
+int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b)
+{
+ const struct net *net_a, *net_b;
+
+ net_a = container_of(a, struct net, rtnl_mutex.dep_map);
+ net_b = container_of(b, struct net, rtnl_mutex.dep_map);
+
+ return rtnl_net_cmp_locks(net_a, net_b);
+}
+
+bool rtnl_net_is_locked(struct net *net)
+{
+ return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_net_is_locked);
+
+bool lockdep_rtnl_net_is_held(struct net *net)
+{
+ return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(lockdep_rtnl_net_is_held);
+#else
+static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
+{
+ /* No need to swap */
+ return -1;
+}
+#endif
+
+struct rtnl_nets {
+ /* ->newlink() needs to freeze 3 netns at most;
+ * 2 for the new device, 1 for its peer.
+ */
+ struct net *net[3];
+ unsigned char len;
+};
+
+static void rtnl_nets_init(struct rtnl_nets *rtnl_nets)
+{
+ memset(rtnl_nets, 0, sizeof(*rtnl_nets));
+}
+
+static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets)
+{
+ int i;
+
+ for (i = 0; i < rtnl_nets->len; i++) {
+ put_net(rtnl_nets->net[i]);
+ rtnl_nets->net[i] = NULL;
+ }
+
+ rtnl_nets->len = 0;
+}
+
+/**
+ * rtnl_nets_add - Add netns to be locked before ->newlink().
+ *
+ * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net().
+ * @net: netns pointer with an extra refcnt held.
+ *
+ * The extra refcnt is released in rtnl_nets_destroy().
+ */
+static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net)
+{
+ int i;
+
+ DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net));
+
+ for (i = 0; i < rtnl_nets->len; i++) {
+ switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) {
+ case 0:
+ put_net(net);
+ return;
+ case 1:
+ swap(rtnl_nets->net[i], net);
+ }
+ }
+
+ rtnl_nets->net[i] = net;
+ rtnl_nets->len++;
+}
+
+static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets)
+{
+ int i;
+
+ rtnl_lock();
+
+ for (i = 0; i < rtnl_nets->len; i++)
+ __rtnl_net_lock(rtnl_nets->net[i]);
+}
+
+static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets)
+{
+ int i;
+
+ for (i = 0; i < rtnl_nets->len; i++)
+ __rtnl_net_unlock(rtnl_nets->net[i]);
+
+ rtnl_unlock();
+}
+
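The rtnl_nets helpers keep the per-netns mutexes in a fixed order so ->newlink() can lock up to three netns without deadlocking. An illustration of the ordering (my own, assuming two non-init netns with net_a < net_b by pointer value):

/*
 *   rtnl_nets_add(&nets, get_net(net_b));
 *   rtnl_nets_add(&nets, get_net(net_a));
 *   rtnl_nets_add(&nets, get_net(net_a));       duplicate: put_net(), dropped
 *   rtnl_nets_add(&nets, get_net(&init_net));
 *
 * rtnl_nets_lock(&nets) then takes rtnl_mutex, init_net.rtnl_mutex,
 * net_a->rtnl_mutex and net_b->rtnl_mutex, i.e. init_net first and the
 * rest in ascending pointer order, matching rtnl_net_lock_cmp_fn().
 */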
static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
static inline int rtm_msgindex(int msgtype)
@@ -269,64 +444,13 @@ unlock:
}
/**
- * rtnl_register_module - Register a rtnetlink message type
- *
- * @owner: module registering the hook (THIS_MODULE)
- * @protocol: Protocol family or PF_UNSPEC
- * @msgtype: rtnetlink message type
- * @doit: Function pointer called for each request message
- * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
- *
- * Like rtnl_register, but for use by removable modules.
- */
-int rtnl_register_module(struct module *owner,
- int protocol, int msgtype,
- rtnl_doit_func doit, rtnl_dumpit_func dumpit,
- unsigned int flags)
-{
- return rtnl_register_internal(owner, protocol, msgtype,
- doit, dumpit, flags);
-}
-EXPORT_SYMBOL_GPL(rtnl_register_module);
-
-/**
- * rtnl_register - Register a rtnetlink message type
- * @protocol: Protocol family or PF_UNSPEC
- * @msgtype: rtnetlink message type
- * @doit: Function pointer called for each request message
- * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
- *
- * Registers the specified function pointers (at least one of them has
- * to be non-NULL) to be called whenever a request message for the
- * specified protocol family and message type is received.
- *
- * The special protocol family PF_UNSPEC may be used to define fallback
- * function pointers for the case when no entry for the specific protocol
- * family exists.
- */
-void rtnl_register(int protocol, int msgtype,
- rtnl_doit_func doit, rtnl_dumpit_func dumpit,
- unsigned int flags)
-{
- int err;
-
- err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit,
- flags);
- if (err)
- pr_err("Unable to register rtnetlink message handler, "
- "protocol = %d, message type = %d\n", protocol, msgtype);
-}
-
-/**
* rtnl_unregister - Unregister a rtnetlink message type
* @protocol: Protocol family or PF_UNSPEC
* @msgtype: rtnetlink message type
*
* Returns 0 on success or a negative error code.
*/
-int rtnl_unregister(int protocol, int msgtype)
+static int rtnl_unregister(int protocol, int msgtype)
{
struct rtnl_link __rcu **tab;
struct rtnl_link *link;
@@ -349,7 +473,6 @@ int rtnl_unregister(int protocol, int msgtype)
return 0;
}
-EXPORT_SYMBOL_GPL(rtnl_unregister);
/**
* rtnl_unregister_all - Unregister all rtnetlink message type of a protocol
@@ -384,46 +507,86 @@ void rtnl_unregister_all(int protocol)
}
EXPORT_SYMBOL_GPL(rtnl_unregister_all);
-static LIST_HEAD(link_ops);
-
-static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+/**
+ * __rtnl_register_many - Register rtnetlink message types
+ * @handlers: Array of struct rtnl_msg_handlers
+ * @n: The length of @handlers
+ *
+ * Registers the specified function pointers (at least one of them has
+ * to be non-NULL) to be called whenever a request message for the
+ * specified protocol family and message type is received.
+ *
+ * The special protocol family PF_UNSPEC may be used to define fallback
+ * function pointers for the case when no entry for the specific protocol
+ * family exists.
+ *
+ * When one element of @handlers fails to register,
+ * 1) built-in: panics.
+ * 2) modules : the previous successful registrations are unwound
+ * and an error is returned.
+ *
+ * Use rtnl_register_many().
+ */
+int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n)
{
- const struct rtnl_link_ops *ops;
+ const struct rtnl_msg_handler *handler;
+ int i, err;
+
+ for (i = 0, handler = handlers; i < n; i++, handler++) {
+ err = rtnl_register_internal(handler->owner, handler->protocol,
+ handler->msgtype, handler->doit,
+ handler->dumpit, handler->flags);
+ if (err) {
+ if (!handler->owner)
+ panic("Unable to register rtnetlink message "
+ "handlers, %pS\n", handlers);
- list_for_each_entry(ops, &link_ops, list) {
- if (!strcmp(ops->kind, kind))
- return ops;
+ __rtnl_unregister_many(handlers, i);
+ break;
+ }
}
- return NULL;
+
+ return err;
}
+EXPORT_SYMBOL_GPL(__rtnl_register_many);
-/**
- * __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
- * @ops: struct rtnl_link_ops * to register
- *
- * The caller must hold the rtnl_mutex. This function should be used
- * by drivers that create devices during module initialization. It
- * must be called before registering the devices.
- *
- * Returns 0 on success or a negative error code.
- */
-int __rtnl_link_register(struct rtnl_link_ops *ops)
+void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n)
{
- if (rtnl_link_ops_get(ops->kind))
- return -EEXIST;
+ const struct rtnl_msg_handler *handler;
+ int i;
- /* The check for alloc/setup is here because if ops
- * does not have that filled up, it is not possible
- * to use the ops for creating device. So do not
- * fill up dellink as well. That disables rtnl_dellink.
- */
- if ((ops->alloc || ops->setup) && !ops->dellink)
- ops->dellink = unregister_netdevice_queue;
+ for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--)
+ rtnl_unregister(handler->protocol, handler->msgtype);
+}
+EXPORT_SYMBOL_GPL(__rtnl_unregister_many);
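/*
 * Usage sketch (not part of this diff) for the batched registration API
 * above, modeled on the rtnetlink_rtnl_msg_handlers[] table added later in
 * this patch. The rtnl_register_many()/rtnl_unregister_many() wrappers are
 * assumed to expand to the __rtnl_*_many() helpers with ARRAY_SIZE(), and
 * the example_* names are placeholders.
 */
static int example_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
			struct netlink_ext_ack *extack);
static int example_dumpit(struct sk_buff *skb, struct netlink_callback *cb);

static const struct rtnl_msg_handler example_rtnl_msg_handlers[] = {
	{.owner = THIS_MODULE, .protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB,
	 .doit = example_doit},
	{.owner = THIS_MODULE, .protocol = PF_BRIDGE, .msgtype = RTM_GETMDB,
	 .dumpit = example_dumpit},
};

static int __init example_module_init(void)
{
	/* Unwinds any earlier successful registrations on failure. */
	return rtnl_register_many(example_rtnl_msg_handlers);
}

static void __exit example_module_exit(void)
{
	rtnl_unregister_many(example_rtnl_msg_handlers);
}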
- list_add_tail(&ops->list, &link_ops);
- return 0;
+static DEFINE_MUTEX(link_ops_mutex);
+static LIST_HEAD(link_ops);
+
+static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index)
+{
+ struct rtnl_link_ops *ops;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(ops, &link_ops, list) {
+ if (!strcmp(ops->kind, kind)) {
+ *srcu_index = srcu_read_lock(&ops->srcu);
+ goto unlock;
+ }
+ }
+
+ ops = NULL;
+unlock:
+ rcu_read_unlock();
+
+ return ops;
+}
+
+static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index)
+{
+ srcu_read_unlock(&ops->srcu, srcu_index);
}
-EXPORT_SYMBOL_GPL(__rtnl_link_register);
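/*
 * Caller pattern (illustrative only, not part of this diff) for the
 * SRCU-protected lookup above: rtnl_link_ops_get() pins the ops with a
 * per-ops SRCU read lock, so rtnl_link_unregister() can wait for readers
 * without holding RTNL. The same pairing is used later in this patch by
 * linkinfo_to_kind_ops() and rtnl_newlink().
 */
static int example_lookup_kind(const char *kind)
{
	struct rtnl_link_ops *ops;
	int srcu_index;

	ops = rtnl_link_ops_get(kind, &srcu_index);
	if (!ops)
		return -EOPNOTSUPP;

	/* ops stays valid until the matching put below. */
	pr_debug("found rtnl_link_ops for kind %s\n", ops->kind);

	rtnl_link_ops_put(ops, srcu_index);
	return 0;
}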
/**
* rtnl_link_register - Register rtnl_link_ops with rtnetlink.
@@ -433,6 +596,7 @@ EXPORT_SYMBOL_GPL(__rtnl_link_register);
*/
int rtnl_link_register(struct rtnl_link_ops *ops)
{
+ struct rtnl_link_ops *tmp;
int err;
/* Sanity-check max sizes to avoid stack buffer overflow. */
@@ -440,9 +604,31 @@ int rtnl_link_register(struct rtnl_link_ops *ops)
ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
return -EINVAL;
- rtnl_lock();
- err = __rtnl_link_register(ops);
- rtnl_unlock();
+ /* The check for alloc/setup is here because if ops
+ * does not have them filled in, the ops cannot be used
+ * for creating a device. In that case do not fill in
+ * dellink either; that disables rtnl_dellink.
+ */
+ if ((ops->alloc || ops->setup) && !ops->dellink)
+ ops->dellink = unregister_netdevice_queue;
+
+ err = init_srcu_struct(&ops->srcu);
+ if (err)
+ return err;
+
+ mutex_lock(&link_ops_mutex);
+
+ list_for_each_entry(tmp, &link_ops, list) {
+ if (!strcmp(ops->kind, tmp->kind)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+ }
+
+ list_add_tail_rcu(&ops->list, &link_ops);
+unlock:
+ mutex_unlock(&link_ops_mutex);
+
return err;
}
EXPORT_SYMBOL_GPL(rtnl_link_register);
@@ -459,25 +645,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
unregister_netdevice_many(&list_kill);
}
-/**
- * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
- * @ops: struct rtnl_link_ops * to unregister
- *
- * The caller must hold the rtnl_mutex and guarantee net_namespace_list
- * integrity (hold pernet_ops_rwsem for writing to close the race
- * with setup_net() and cleanup_net()).
- */
-void __rtnl_link_unregister(struct rtnl_link_ops *ops)
-{
- struct net *net;
-
- for_each_net(net) {
- __rtnl_kill_links(net, ops);
- }
- list_del(&ops->list);
-}
-EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
-
/* Return with the rtnl_lock held when there are no network
* devices unregistering in any network namespace.
*/
@@ -506,10 +673,22 @@ static void rtnl_lock_unregistering_all(void)
*/
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
+ struct net *net;
+
+ mutex_lock(&link_ops_mutex);
+ list_del_rcu(&ops->list);
+ mutex_unlock(&link_ops_mutex);
+
+ synchronize_srcu(&ops->srcu);
+ cleanup_srcu_struct(&ops->srcu);
+
/* Close the race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
rtnl_lock_unregistering_all();
- __rtnl_link_unregister(ops);
+
+ for_each_net(net)
+ __rtnl_kill_links(net, ops);
+
rtnl_unlock();
up_write(&pernet_ops_rwsem);
}
@@ -566,31 +745,51 @@ static size_t rtnl_link_get_size(const struct net_device *dev)
static LIST_HEAD(rtnl_af_ops);
-static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
+static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index)
{
- const struct rtnl_af_ops *ops;
+ struct rtnl_af_ops *ops;
ASSERT_RTNL();
- list_for_each_entry(ops, &rtnl_af_ops, list) {
- if (ops->family == family)
- return ops;
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(ops, &rtnl_af_ops, list) {
+ if (ops->family == family) {
+ *srcu_index = srcu_read_lock(&ops->srcu);
+ goto unlock;
+ }
}
- return NULL;
+ ops = NULL;
+unlock:
+ rcu_read_unlock();
+
+ return ops;
+}
+
+static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index)
+{
+ srcu_read_unlock(&ops->srcu, srcu_index);
}
/**
* rtnl_af_register - Register rtnl_af_ops with rtnetlink.
* @ops: struct rtnl_af_ops * to register
*
- * Returns 0 on success or a negative error code.
+ * Return: 0 on success or a negative error code.
*/
-void rtnl_af_register(struct rtnl_af_ops *ops)
+int rtnl_af_register(struct rtnl_af_ops *ops)
{
+ int err = init_srcu_struct(&ops->srcu);
+
+ if (err)
+ return err;
+
rtnl_lock();
list_add_tail_rcu(&ops->list, &rtnl_af_ops);
rtnl_unlock();
+
+ return 0;
}
EXPORT_SYMBOL_GPL(rtnl_af_register);
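/*
 * rtnl_af_register() can now fail (init_srcu_struct() may be unable to
 * allocate the SRCU state), so address-family code is expected to check the
 * return value. Illustrative sketch only; example_af_ops is a placeholder,
 * not an existing symbol.
 */
static struct rtnl_af_ops example_af_ops = {
	.family = AF_INET,
};

static int __init example_af_init(void)
{
	return rtnl_af_register(&example_af_ops);
}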
@@ -605,6 +804,8 @@ void rtnl_af_unregister(struct rtnl_af_ops *ops)
rtnl_unlock();
synchronize_rcu();
+ synchronize_srcu(&ops->srcu);
+ cleanup_srcu_struct(&ops->srcu);
}
EXPORT_SYMBOL_GPL(rtnl_af_unregister);
@@ -842,7 +1043,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
}
EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
-void netdev_set_operstate(struct net_device *dev, int newstate)
+void netif_set_operstate(struct net_device *dev, int newstate)
{
unsigned int old = READ_ONCE(dev->operstate);
@@ -851,9 +1052,9 @@ void netdev_set_operstate(struct net_device *dev, int newstate)
return;
} while (!try_cmpxchg(&dev->operstate, &old, newstate));
- netdev_state_change(dev);
+ netif_state_change(dev);
}
-EXPORT_SYMBOL(netdev_set_operstate);
+EXPORT_SYMBOL(netif_set_operstate);
static void set_operstate(struct net_device *dev, unsigned char transition)
{
@@ -879,7 +1080,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition)
break;
}
- netdev_set_operstate(dev, operstate);
+ netif_set_operstate(dev, operstate);
}
static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
@@ -976,6 +1177,9 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
/* IFLA_VF_STATS_TX_DROPPED */
nla_total_size_64bit(sizeof(__u64)));
}
+ if (dev->netdev_ops->ndo_get_vf_guid)
+ size += num_vfs * 2 *
+ nla_total_size(sizeof(struct ifla_vf_guid));
return size;
} else
return 0;
@@ -1036,8 +1240,8 @@ static size_t rtnl_proto_down_size(const struct net_device *dev)
{
size_t size = nla_total_size(1);
- if (dev->proto_down_reason)
- size += nla_total_size(0) + nla_total_size(4);
+ /* Assume dev->proto_down_reason is not zero. */
+ size += nla_total_size(0) + nla_total_size(4);
return size;
}
@@ -1092,6 +1296,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_TSO_MAX_SEGS */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ + nla_total_size(1) /* IFLA_NETNS_IMMUTABLE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ nla_total_size(4) /* IFLA_LINK_NETNSID */
+ nla_total_size(4) /* IFLA_GROUP */
@@ -1118,6 +1323,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
+ rtnl_devlink_port_size(dev)
+ rtnl_dpll_pin_size(dev)
+ + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */
+ 0;
}
@@ -1477,13 +1683,15 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb,
static u32 rtnl_xdp_prog_skb(struct net_device *dev)
{
const struct bpf_prog *generic_xdp_prog;
+ u32 res = 0;
- ASSERT_RTNL();
+ rcu_read_lock();
+ generic_xdp_prog = rcu_dereference(dev->xdp_prog);
+ if (generic_xdp_prog)
+ res = generic_xdp_prog->aux->id;
+ rcu_read_unlock();
- generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
- if (!generic_xdp_prog)
- return 0;
- return generic_xdp_prog->aux->id;
+ return res;
}
static u32 rtnl_xdp_prog_drv(struct net_device *dev)
@@ -1603,7 +1811,8 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev)
upper_dev = netdev_master_upper_dev_get_rcu(dev);
if (upper_dev)
- ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex);
+ ret = nla_put_u32(skb, IFLA_MASTER,
+ READ_ONCE(upper_dev->ifindex));
rcu_read_unlock();
return ret;
@@ -1736,10 +1945,10 @@ static int rtnl_fill_proto_down(struct sk_buff *skb,
struct nlattr *pr;
u32 preason;
- if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
+ if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down)))
goto nla_put_failure;
- preason = dev->proto_down_reason;
+ preason = READ_ONCE(dev->proto_down_reason);
if (!preason)
return 0;
@@ -1812,6 +2021,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
u32 event, int *new_nsid, int new_ifindex,
int tgt_netnsid, gfp_t gfp)
{
+ char devname[IFNAMSIZ];
struct ifinfomsg *ifm;
struct nlmsghdr *nlh;
struct Qdisc *qdisc;
@@ -1824,41 +2034,54 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
ifm = nlmsg_data(nlh);
ifm->ifi_family = AF_UNSPEC;
ifm->__ifi_pad = 0;
- ifm->ifi_type = dev->type;
- ifm->ifi_index = dev->ifindex;
+ ifm->ifi_type = READ_ONCE(dev->type);
+ ifm->ifi_index = READ_ONCE(dev->ifindex);
ifm->ifi_flags = dev_get_flags(dev);
ifm->ifi_change = change;
if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))
goto nla_put_failure;
- qdisc = rtnl_dereference(dev->qdisc);
- if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
- nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) ||
+ netdev_copy_name(dev, devname);
+ if (nla_put_string(skb, IFLA_IFNAME, devname))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) ||
nla_put_u8(skb, IFLA_OPERSTATE,
- netif_running(dev) ? dev->operstate : IF_OPER_DOWN) ||
- nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) ||
- nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
- nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) ||
- nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) ||
- nla_put_u32(skb, IFLA_GROUP, dev->group) ||
- nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
- nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) ||
- nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
- nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) ||
- nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
- nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) ||
- nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) ||
- nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) ||
- nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) ||
- nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) ||
+ netif_running(dev) ? READ_ONCE(dev->operstate) :
+ IF_OPER_DOWN) ||
+ nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) ||
+ nla_put_u8(skb, IFLA_NETNS_IMMUTABLE, dev->netns_immutable) ||
+ nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) ||
+ nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) ||
+ nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) ||
+ nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) ||
+ nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) ||
+ nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) ||
+ nla_put_u32(skb, IFLA_NUM_TX_QUEUES,
+ READ_ONCE(dev->num_tx_queues)) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SEGS,
+ READ_ONCE(dev->gso_max_segs)) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SIZE,
+ READ_ONCE(dev->gso_max_size)) ||
+ nla_put_u32(skb, IFLA_GRO_MAX_SIZE,
+ READ_ONCE(dev->gro_max_size)) ||
+ nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE,
+ READ_ONCE(dev->gso_ipv4_max_size)) ||
+ nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE,
+ READ_ONCE(dev->gro_ipv4_max_size)) ||
+ nla_put_u32(skb, IFLA_TSO_MAX_SIZE,
+ READ_ONCE(dev->tso_max_size)) ||
+ nla_put_u32(skb, IFLA_TSO_MAX_SEGS,
+ READ_ONCE(dev->tso_max_segs)) ||
+ nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON,
+ READ_ONCE(dev->max_pacing_offload_horizon)) ||
#ifdef CONFIG_RPS
- nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
+ nla_put_u32(skb, IFLA_NUM_RX_QUEUES,
+ READ_ONCE(dev->num_rx_queues)) ||
#endif
put_master_ifindex(skb, dev) ||
nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
- (qdisc &&
- nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) ||
nla_put_ifalias(skb, dev) ||
nla_put_u32(skb, IFLA_CARRIER_CHANGES,
atomic_read(&dev->carrier_up_count) +
@@ -1909,9 +2132,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure;
}
- if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp))
- goto nla_put_failure;
-
if (new_nsid &&
nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)
goto nla_put_failure;
@@ -1924,6 +2144,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure;
rcu_read_lock();
+ if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC))
+ goto nla_put_failure_rcu;
+ qdisc = rcu_dereference(dev->qdisc);
+ if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id))
+ goto nla_put_failure_rcu;
if (rtnl_fill_link_af(skb, dev, ext_filter_mask))
goto nla_put_failure_rcu;
if (rtnl_fill_link_ifmap(skb, dev))
@@ -1959,6 +2184,7 @@ nla_put_failure:
}
static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+ [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN },
[IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
[IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
@@ -1987,7 +2213,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
[IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
[IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 },
- [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_GSO_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
[IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
[IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
@@ -2012,8 +2238,9 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT },
[IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT },
[IFLA_ALLMULTI] = { .type = NLA_REJECT },
- [IFLA_GSO_IPV4_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
[IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2067,10 +2294,11 @@ static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
[IFLA_XDP_PROG_ID] = { .type = NLA_U32 },
};
-static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
+static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla,
+ int *ops_srcu_index)
{
- const struct rtnl_link_ops *ops = NULL;
struct nlattr *linfo[IFLA_INFO_MAX + 1];
+ struct rtnl_link_ops *ops = NULL;
if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0)
return NULL;
@@ -2079,7 +2307,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla
char kind[MODULE_NAME_LEN];
nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
- ops = rtnl_link_ops_get(kind);
+ ops = rtnl_link_ops_get(kind, ops_srcu_index);
}
return ops;
@@ -2162,12 +2390,12 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
if (strict_check) {
struct ifinfomsg *ifm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "Invalid header for link dump");
return -EINVAL;
}
- ifm = nlmsg_data(nlh);
if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
ifm->ifi_change) {
NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request");
@@ -2199,8 +2427,8 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
- const struct rtnl_link_ops *kind_ops = NULL;
struct netlink_ext_ack *extack = cb->extack;
+ struct rtnl_link_ops *kind_ops = NULL;
const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
unsigned int flags = NLM_F_MULTI;
@@ -2211,6 +2439,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
struct net *tgt_net = net;
u32 ext_filter_mask = 0;
struct net_device *dev;
+ int ops_srcu_index;
int master_idx = 0;
int netnsid = -1;
int err, i;
@@ -2234,7 +2463,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
if (IS_ERR(tgt_net)) {
NL_SET_ERR_MSG(extack, "Invalid target network namespace id");
- return PTR_ERR(tgt_net);
+ err = PTR_ERR(tgt_net);
+ netnsid = -1;
+ goto out;
}
break;
case IFLA_EXT_MASK:
@@ -2244,12 +2475,13 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
master_idx = nla_get_u32(tb[i]);
break;
case IFLA_LINKINFO:
- kind_ops = linkinfo_to_kind_ops(tb[i]);
+ kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index);
break;
default:
if (cb->strict_check) {
NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request");
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
}
}
@@ -2270,8 +2502,15 @@ walk_entries:
if (err < 0)
break;
}
+
cb->seq = tgt_net->dev_base_seq;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+
+out:
+
+ if (kind_ops)
+ rtnl_link_ops_put(kind_ops, ops_srcu_index);
if (netnsid >= 0)
put_net(tgt_net);
@@ -2300,9 +2539,10 @@ int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer,
}
EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg);
-struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[])
{
- struct net *net;
+ struct net *net = NULL;
+
/* Examine the link attributes and figure out which
* network namespace we are talking about.
*/
@@ -2310,8 +2550,17 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
else if (tb[IFLA_NET_NS_FD])
net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
- else
+
+ return net;
+}
+
+struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+{
+ struct net *net = rtnl_link_get_net_ifla(tb);
+
+ if (!net)
net = get_net(src_net);
+
return net;
}
EXPORT_SYMBOL(rtnl_link_get_net);
@@ -2451,20 +2700,24 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
int rem, err;
nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
- const struct rtnl_af_ops *af_ops;
+ struct rtnl_af_ops *af_ops;
+ int af_ops_srcu_index;
- af_ops = rtnl_af_lookup(nla_type(af));
+ af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index);
if (!af_ops)
return -EAFNOSUPPORT;
if (!af_ops->set_link_af)
- return -EOPNOTSUPP;
-
- if (af_ops->validate_link_af) {
+ err = -EOPNOTSUPP;
+ else if (af_ops->validate_link_af)
err = af_ops->validate_link_af(dev, af, extack);
- if (err < 0)
- return err;
- }
+ else
+ err = 0;
+
+ rtnl_af_put(af_ops, af_ops_srcu_index);
+
+ if (err < 0)
+ return err;
}
}
@@ -2530,7 +2783,7 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
- nla_len(attr) < NLA_HDRLEN) {
+ nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) {
return -EINVAL;
}
if (len >= MAX_VLAN_LIST_LEN)
@@ -2663,12 +2916,19 @@ static int do_set_master(struct net_device *dev, int ifindex,
const struct net_device_ops *ops;
int err;
+ /* Release the lower device's lock; the upper device is responsible
+ * for locking the lower one if needed. None of the existing upper
+ * devices use the netdev instance lock, so don't grab it.
+ */
+
if (upper_dev) {
if (upper_dev->ifindex == ifindex)
return 0;
ops = upper_dev->netdev_ops;
if (ops->ndo_del_slave) {
+ netdev_unlock_ops(dev);
err = ops->ndo_del_slave(upper_dev, dev);
+ netdev_lock_ops(dev);
if (err)
return err;
} else {
@@ -2682,7 +2942,9 @@ static int do_set_master(struct net_device *dev, int ifindex,
return -EINVAL;
ops = upper_dev->netdev_ops;
if (ops->ndo_add_slave) {
+ netdev_unlock_ops(dev);
err = ops->ndo_add_slave(upper_dev, dev, extack);
+ netdev_lock_ops(dev);
if (err)
return err;
} else {
@@ -2708,7 +2970,7 @@ static int do_set_proto_down(struct net_device *dev,
bool proto_down;
int err;
- if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) {
+ if (!dev->change_proto_down) {
NL_SET_ERR_MSG(extack, "Protodown not supported by device");
return -EOPNOTSUPP;
}
@@ -2732,7 +2994,7 @@ static int do_set_proto_down(struct net_device *dev,
if (pdreason[IFLA_PROTO_DOWN_REASON_MASK])
mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]);
- dev_change_proto_down_reason(dev, mask, value);
+ netdev_change_proto_down_reason_locked(dev, mask, value);
}
if (nl_proto_down) {
@@ -2743,8 +3005,7 @@ static int do_set_proto_down(struct net_device *dev,
NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons");
return -EBUSY;
}
- err = dev_change_proto_down(dev,
- proto_down);
+ err = netif_change_proto_down(dev, proto_down);
if (err)
return err;
}
@@ -2755,8 +3016,8 @@ static int do_set_proto_down(struct net_device *dev,
#define DO_SETLINK_MODIFIED 0x01
/* notify flag means notify + modified. */
#define DO_SETLINK_NOTIFY 0x03
-static int do_setlink(const struct sk_buff *skb,
- struct net_device *dev, struct ifinfomsg *ifm,
+static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
+ struct net *tgt_net, struct ifinfomsg *ifm,
struct netlink_ext_ack *extack,
struct nlattr **tb, int status)
{
@@ -2764,35 +3025,31 @@ static int do_setlink(const struct sk_buff *skb,
char ifname[IFNAMSIZ];
int err;
+ err = validate_linkmsg(dev, tb, extack);
+ if (err < 0)
+ return err;
+
if (tb[IFLA_IFNAME])
nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
else
ifname[0] = '\0';
- if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) {
+ if (!net_eq(tgt_net, dev_net(dev))) {
const char *pat = ifname[0] ? ifname : NULL;
- struct net *net;
int new_ifindex;
- net = rtnl_link_get_net_capable(skb, dev_net(dev),
- tb, CAP_NET_ADMIN);
- if (IS_ERR(net)) {
- err = PTR_ERR(net);
- goto errout;
- }
-
- if (tb[IFLA_NEW_IFINDEX])
- new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]);
- else
- new_ifindex = 0;
+ new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0);
- err = __dev_change_net_namespace(dev, net, pat, new_ifindex);
- put_net(net);
+ err = __dev_change_net_namespace(dev, tgt_net, pat,
+ new_ifindex, extack);
if (err)
- goto errout;
+ return err;
+
status |= DO_SETLINK_MODIFIED;
}
+ netdev_lock_ops(dev);
+
if (tb[IFLA_MAP]) {
struct rtnl_link_ifmap *u_map;
struct ifmap k_map;
@@ -2823,35 +3080,35 @@ static int do_setlink(const struct sk_buff *skb,
}
if (tb[IFLA_ADDRESS]) {
- struct sockaddr *sa;
- int len;
-
- len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len,
- sizeof(*sa));
- sa = kmalloc(len, GFP_KERNEL);
- if (!sa) {
- err = -ENOMEM;
+ struct sockaddr_storage ss = { };
+
+ netdev_unlock_ops(dev);
+
+ /* dev_addr_sem is an outer lock; enforce proper lock ordering */
+ down_write(&dev_addr_sem);
+ netdev_lock_ops(dev);
+
+ ss.ss_family = dev->type;
+ memcpy(ss.__data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len);
+ err = netif_set_mac_address(dev, &ss, extack);
+ if (err) {
+ up_write(&dev_addr_sem);
goto errout;
}
- sa->sa_family = dev->type;
- memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
- dev->addr_len);
- err = dev_set_mac_address_user(dev, sa, extack);
- kfree(sa);
- if (err)
- goto errout;
status |= DO_SETLINK_MODIFIED;
+
+ up_write(&dev_addr_sem);
}
if (tb[IFLA_MTU]) {
- err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
+ err = netif_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
if (err < 0)
goto errout;
status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_GROUP]) {
- dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
status |= DO_SETLINK_NOTIFY;
}
@@ -2861,15 +3118,15 @@ static int do_setlink(const struct sk_buff *skb,
* requested.
*/
if (ifm->ifi_index > 0 && ifname[0]) {
- err = dev_change_name(dev, ifname);
+ err = netif_change_name(dev, ifname);
if (err < 0)
goto errout;
status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_IFALIAS]) {
- err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
- nla_len(tb[IFLA_IFALIAS]));
+ err = netif_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
+ nla_len(tb[IFLA_IFALIAS]));
if (err < 0)
goto errout;
status |= DO_SETLINK_NOTIFY;
@@ -2881,8 +3138,8 @@ static int do_setlink(const struct sk_buff *skb,
}
if (ifm->ifi_flags || ifm->ifi_change) {
- err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
- extack);
+ err = netif_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
+ extack);
if (err < 0)
goto errout;
}
@@ -2895,7 +3152,7 @@ static int do_setlink(const struct sk_buff *skb,
}
if (tb[IFLA_CARRIER]) {
- err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
+ err = netif_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
if (err)
goto errout;
status |= DO_SETLINK_MODIFIED;
@@ -2904,7 +3161,7 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_TXQLEN]) {
unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]);
- err = dev_change_tx_queue_len(dev, value);
+ err = netif_change_tx_queue_len(dev, value);
if (err)
goto errout;
status |= DO_SETLINK_MODIFIED;
@@ -3048,11 +3305,18 @@ static int do_setlink(const struct sk_buff *skb,
int rem;
nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
- const struct rtnl_af_ops *af_ops;
+ struct rtnl_af_ops *af_ops;
+ int af_ops_srcu_index;
- BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af))));
+ af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index);
+ if (!af_ops) {
+ err = -EAFNOSUPPORT;
+ goto errout;
+ }
err = af_ops->set_link_af(dev, af, extack);
+ rtnl_af_put(af_ops, af_ops_srcu_index);
+
if (err < 0)
goto errout;
@@ -3121,13 +3385,15 @@ static int do_setlink(const struct sk_buff *skb,
errout:
if (status & DO_SETLINK_MODIFIED) {
if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY)
- netdev_state_change(dev);
+ netif_state_change(dev);
if (err < 0)
net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
dev->name);
}
+ netdev_unlock_ops(dev);
+
return err;
}
@@ -3149,11 +3415,13 @@ static struct net_device *rtnl_dev_get(struct net *net,
static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ struct ifinfomsg *ifm = nlmsg_data(nlh);
struct net *net = sock_net(skb->sk);
- struct ifinfomsg *ifm;
- struct net_device *dev;
- int err;
struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *dev = NULL;
+ struct rtnl_nets rtnl_nets;
+ struct net *tgt_net;
+ int err;
err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
ifla_policy, extack);
@@ -3164,25 +3432,32 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout;
- err = -EINVAL;
- ifm = nlmsg_data(nlh);
+ tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
+ if (IS_ERR(tgt_net)) {
+ err = PTR_ERR(tgt_net);
+ goto errout;
+ }
+
+ rtnl_nets_init(&rtnl_nets);
+ rtnl_nets_add(&rtnl_nets, get_net(net));
+ rtnl_nets_add(&rtnl_nets, tgt_net);
+
+ rtnl_nets_lock(&rtnl_nets);
+
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(net, ifm->ifi_index);
else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
dev = rtnl_dev_get(net, tb);
else
- goto errout;
+ err = -EINVAL;
- if (dev == NULL) {
+ if (dev)
+ err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0);
+ else if (!err)
err = -ENODEV;
- goto errout;
- }
-
- err = validate_linkmsg(dev, tb, extack);
- if (err < 0)
- goto errout;
- err = do_setlink(skb, dev, ifm, extack, tb, 0);
+ rtnl_nets_unlock(&rtnl_nets);
+ rtnl_nets_destroy(&rtnl_nets);
errout:
return err;
}
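/*
 * Condensed sketch of the rtnl_nets lifecycle used by rtnl_setlink() above.
 * The helpers themselves are introduced elsewhere in this series (not in
 * this hunk); the point is that every netns the request may touch is
 * collected first and then locked together, keeping the per-netns RTNL
 * ordering consistent.
 */
static int example_cross_netns_op(struct net *net, struct net *tgt_net)
{
	struct rtnl_nets rtnl_nets;

	rtnl_nets_init(&rtnl_nets);
	rtnl_nets_add(&rtnl_nets, get_net(net));	/* add consumes the ref */
	rtnl_nets_add(&rtnl_nets, tgt_net);		/* ref already held */

	rtnl_nets_lock(&rtnl_nets);
	/* ... work that needs both namespaces locked ... */
	rtnl_nets_unlock(&rtnl_nets);

	rtnl_nets_destroy(&rtnl_nets);			/* drops the references */
	return 0;
}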
@@ -3242,14 +3517,14 @@ EXPORT_SYMBOL_GPL(rtnl_delete_link);
static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ struct ifinfomsg *ifm = nlmsg_data(nlh);
struct net *net = sock_net(skb->sk);
u32 portid = NETLINK_CB(skb).portid;
- struct net *tgt_net = net;
- struct net_device *dev = NULL;
- struct ifinfomsg *ifm;
struct nlattr *tb[IFLA_MAX+1];
- int err;
+ struct net_device *dev = NULL;
+ struct net *tgt_net = net;
int netnsid = -1;
+ int err;
err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
ifla_policy, extack);
@@ -3267,27 +3542,24 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
return PTR_ERR(tgt_net);
}
- err = -EINVAL;
- ifm = nlmsg_data(nlh);
+ rtnl_net_lock(tgt_net);
+
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
- dev = rtnl_dev_get(net, tb);
+ dev = rtnl_dev_get(tgt_net, tb);
+
+ if (dev)
+ err = rtnl_delete_link(dev, portid, nlh);
+ else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ err = -ENODEV;
else if (tb[IFLA_GROUP])
err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));
else
- goto out;
-
- if (!dev) {
- if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME] || ifm->ifi_index > 0)
- err = -ENODEV;
-
- goto out;
- }
+ err = -EINVAL;
- err = rtnl_delete_link(dev, portid, nlh);
+ rtnl_net_unlock(tgt_net);
-out:
if (netnsid >= 0)
put_net(tgt_net);
@@ -3297,7 +3569,7 @@ out:
int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
u32 portid, const struct nlmsghdr *nlh)
{
- unsigned int old_flags;
+ unsigned int old_flags, changed;
int err;
old_flags = dev->flags;
@@ -3308,12 +3580,13 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
return err;
}
- if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
- __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh);
- } else {
- dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
- __dev_notify_flags(dev, old_flags, ~0U, portid, nlh);
+ changed = old_flags ^ dev->flags;
+ if (dev->rtnl_link_initializing) {
+ dev->rtnl_link_initializing = false;
+ changed = ~0U;
}
+
+ __dev_notify_flags(dev, old_flags, changed, portid, nlh);
return 0;
}
EXPORT_SYMBOL(rtnl_configure_link);
@@ -3371,7 +3644,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
dev_net_set(dev, net);
dev->rtnl_link_ops = ops;
- dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
+ dev->rtnl_link_initializing = true;
if (tb[IFLA_MTU]) {
u32 mtu = nla_get_u32(tb[IFLA_MTU]);
@@ -3398,7 +3671,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
if (tb[IFLA_LINKMODE])
dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
if (tb[IFLA_GROUP])
- dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
if (tb[IFLA_GSO_MAX_SIZE])
netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
if (tb[IFLA_GSO_MAX_SEGS])
@@ -3414,21 +3687,90 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
}
EXPORT_SYMBOL(rtnl_create_link);
+struct rtnl_newlink_tbs {
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct nlattr *linkinfo[IFLA_INFO_MAX + 1];
+ struct nlattr *attr[RTNL_MAX_TYPE + 1];
+ struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
+};
+
+static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh,
+ const struct rtnl_link_ops *ops,
+ struct net_device *dev, struct net *tgt_net,
+ struct rtnl_newlink_tbs *tbs,
+ struct nlattr **data,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr ** const linkinfo = tbs->linkinfo;
+ struct nlattr ** const tb = tbs->tb;
+ int status = 0;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ if (linkinfo[IFLA_INFO_DATA]) {
+ if (!ops || ops != dev->rtnl_link_ops || !ops->changelink)
+ return -EOPNOTSUPP;
+
+ err = ops->changelink(dev, tb, data, extack);
+ if (err < 0)
+ return err;
+
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ const struct rtnl_link_ops *m_ops = NULL;
+ struct nlattr **slave_data = NULL;
+ struct net_device *master_dev;
+
+ master_dev = netdev_master_upper_dev_get(dev);
+ if (master_dev)
+ m_ops = master_dev->rtnl_link_ops;
+
+ if (!m_ops || !m_ops->slave_changelink)
+ return -EOPNOTSUPP;
+
+ if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
+ return -EINVAL;
+
+ if (m_ops->slave_maxtype) {
+ err = nla_parse_nested_deprecated(tbs->slave_attr,
+ m_ops->slave_maxtype,
+ linkinfo[IFLA_INFO_SLAVE_DATA],
+ m_ops->slave_policy, extack);
+ if (err < 0)
+ return err;
+
+ slave_data = tbs->slave_attr;
+ }
+
+ err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack);
+ if (err < 0)
+ return err;
+
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status);
+}
+
static int rtnl_group_changelink(const struct sk_buff *skb,
- struct net *net, int group,
- struct ifinfomsg *ifm,
- struct netlink_ext_ack *extack,
- struct nlattr **tb)
+ struct net *net, struct net *tgt_net,
+ int group, struct ifinfomsg *ifm,
+ struct netlink_ext_ack *extack,
+ struct nlattr **tb)
{
struct net_device *dev, *aux;
int err;
for_each_netdev_safe(net, dev, aux) {
if (dev->group == group) {
- err = validate_linkmsg(dev, tb, extack);
- if (err < 0)
- return err;
- err = do_setlink(skb, dev, ifm, extack, tb, 0);
+ err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0);
if (err < 0)
return err;
}
@@ -3439,14 +3781,21 @@ static int rtnl_group_changelink(const struct sk_buff *skb,
static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
const struct rtnl_link_ops *ops,
+ struct net *tgt_net, struct net *link_net,
+ struct net *peer_net,
const struct nlmsghdr *nlh,
struct nlattr **tb, struct nlattr **data,
struct netlink_ext_ack *extack)
{
unsigned char name_assign_type = NET_NAME_USER;
- struct net *net = sock_net(skb->sk);
+ struct rtnl_newlink_params params = {
+ .src_net = sock_net(skb->sk),
+ .link_net = link_net,
+ .peer_net = peer_net,
+ .tb = tb,
+ .data = data,
+ };
u32 portid = NETLINK_CB(skb).portid;
- struct net *dest_net, *link_net;
struct net_device *dev;
char ifname[IFNAMSIZ];
int err;
@@ -3461,28 +3810,8 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
name_assign_type = NET_NAME_ENUM;
}
- dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
- if (IS_ERR(dest_net))
- return PTR_ERR(dest_net);
-
- if (tb[IFLA_LINK_NETNSID]) {
- int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
-
- link_net = get_net_ns_by_id(dest_net, id);
- if (!link_net) {
- NL_SET_ERR_MSG(extack, "Unknown network namespace id");
- err = -EINVAL;
- goto out;
- }
- err = -EPERM;
- if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN))
- goto out;
- } else {
- link_net = NULL;
- }
-
- dev = rtnl_create_link(link_net ? : dest_net, ifname,
- name_assign_type, ops, tb, extack);
+ dev = rtnl_create_link(tgt_net, ifname, name_assign_type, ops, tb,
+ extack);
if (IS_ERR(dev)) {
err = PTR_ERR(dev);
goto out;
@@ -3491,7 +3820,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
dev->ifindex = ifm->ifi_index;
if (ops->newlink)
- err = ops->newlink(link_net ? : net, dev, tb, data, extack);
+ err = ops->newlink(dev, &params, extack);
else
err = register_netdevice(dev);
if (err < 0) {
@@ -3499,25 +3828,22 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
goto out;
}
+ netdev_lock_ops(dev);
+
err = rtnl_configure_link(dev, ifm, portid, nlh);
if (err < 0)
goto out_unregister;
- if (link_net) {
- err = dev_change_net_namespace(dev, dest_net, ifname);
- if (err < 0)
- goto out_unregister;
- }
if (tb[IFLA_MASTER]) {
err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
if (err)
goto out_unregister;
}
+
+ netdev_unlock_ops(dev);
out:
- if (link_net)
- put_net(link_net);
- put_net(dest_net);
return err;
out_unregister:
+ netdev_unlock_ops(dev);
if (ops->newlink) {
LIST_HEAD(list_kill);
@@ -3529,202 +3855,212 @@ out_unregister:
goto out;
}
-struct rtnl_newlink_tbs {
+static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops,
+ struct nlattr *tbp[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
struct nlattr *tb[IFLA_MAX + 1];
- struct nlattr *attr[RTNL_MAX_TYPE + 1];
- struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
-};
+ int err;
+
+ if (!data || !data[ops->peer_type])
+ return rtnl_link_get_net_ifla(tbp);
+
+ err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (ops->validate) {
+ err = ops->validate(tb, NULL, extack);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ return rtnl_link_get_net_ifla(tb);
+}
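/*
 * Sketch of how a driver with a peer device opts in to the peer-netns
 * handling above: ->peer_type names the IFLA_INFO_DATA attribute that wraps
 * the peer's ifinfomsg, and ->newlink() then receives the resolved namespace
 * via the params argument (see the ops->newlink(dev, &params, extack) call
 * earlier in this patch). The ops below are illustrative; only the veth
 * attribute name is real.
 */
static int example_newlink(struct net_device *dev,
			   struct rtnl_newlink_params *params,
			   struct netlink_ext_ack *extack);

static struct rtnl_link_ops example_peer_link_ops = {
	.kind		= "example-peer",
	.peer_type	= VETH_INFO_PEER,
	.newlink	= example_newlink,
};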
static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ const struct rtnl_link_ops *ops,
+ struct net *tgt_net, struct net *link_net,
+ struct net *peer_net,
struct rtnl_newlink_tbs *tbs,
+ struct nlattr **data,
struct netlink_ext_ack *extack)
{
- struct nlattr *linkinfo[IFLA_INFO_MAX + 1];
struct nlattr ** const tb = tbs->tb;
- const struct rtnl_link_ops *m_ops;
- struct net_device *master_dev;
struct net *net = sock_net(skb->sk);
- const struct rtnl_link_ops *ops;
- struct nlattr **slave_data;
- char kind[MODULE_NAME_LEN];
+ struct net *device_net;
struct net_device *dev;
struct ifinfomsg *ifm;
- struct nlattr **data;
bool link_specified;
- int err;
-#ifdef CONFIG_MODULES
-replay:
-#endif
- err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
- ifla_policy, extack);
- if (err < 0)
- return err;
-
- err = rtnl_ensure_unique_netns(tb, extack, false);
- if (err < 0)
- return err;
+ /* When creating, look up any existing device in the target net namespace */
+ device_net = (nlh->nlmsg_flags & NLM_F_CREATE) &&
+ (nlh->nlmsg_flags & NLM_F_EXCL) ?
+ tgt_net : net;
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0) {
link_specified = true;
- dev = __dev_get_by_index(net, ifm->ifi_index);
+ dev = __dev_get_by_index(device_net, ifm->ifi_index);
} else if (ifm->ifi_index < 0) {
NL_SET_ERR_MSG(extack, "ifindex can't be negative");
return -EINVAL;
} else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) {
link_specified = true;
- dev = rtnl_dev_get(net, tb);
+ dev = rtnl_dev_get(device_net, tb);
} else {
link_specified = false;
dev = NULL;
}
- master_dev = NULL;
- m_ops = NULL;
- if (dev) {
- master_dev = netdev_master_upper_dev_get(dev);
- if (master_dev)
- m_ops = master_dev->rtnl_link_ops;
+ if (dev)
+ return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack);
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ /* No dev found and NLM_F_CREATE not set. Requested dev does not exist,
+ * or it's for a group
+ */
+ if (link_specified || !tb[IFLA_GROUP])
+ return -ENODEV;
+
+ return rtnl_group_changelink(skb, net, tgt_net,
+ nla_get_u32(tb[IFLA_GROUP]),
+ ifm, extack, tb);
}
+ if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
+ return -EOPNOTSUPP;
+
+ if (!ops) {
+ NL_SET_ERR_MSG(extack, "Unknown device type");
+ return -EOPNOTSUPP;
+ }
+
+ return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh,
+ tb, data, extack);
+}
+
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *tgt_net, *link_net = NULL, *peer_net = NULL;
+ struct nlattr **tb, **linkinfo, **data = NULL;
+ struct rtnl_link_ops *ops = NULL;
+ struct rtnl_newlink_tbs *tbs;
+ struct rtnl_nets rtnl_nets;
+ int ops_srcu_index;
+ int ret;
+
+ tbs = kmalloc(sizeof(*tbs), GFP_KERNEL);
+ if (!tbs)
+ return -ENOMEM;
+
+ tb = tbs->tb;
+ ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb,
+ IFLA_MAX, ifla_policy, extack);
+ if (ret < 0)
+ goto free;
+
+ ret = rtnl_ensure_unique_netns(tb, extack, false);
+ if (ret < 0)
+ goto free;
+
+ linkinfo = tbs->linkinfo;
if (tb[IFLA_LINKINFO]) {
- err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX,
+ ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX,
tb[IFLA_LINKINFO],
ifla_info_policy, NULL);
- if (err < 0)
- return err;
- } else
- memset(linkinfo, 0, sizeof(linkinfo));
+ if (ret < 0)
+ goto free;
+ } else {
+ memset(linkinfo, 0, sizeof(tbs->linkinfo));
+ }
if (linkinfo[IFLA_INFO_KIND]) {
+ char kind[MODULE_NAME_LEN];
+
nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
- ops = rtnl_link_ops_get(kind);
- } else {
- kind[0] = '\0';
- ops = NULL;
+ ops = rtnl_link_ops_get(kind, &ops_srcu_index);
+#ifdef CONFIG_MODULES
+ if (!ops) {
+ request_module("rtnl-link-%s", kind);
+ ops = rtnl_link_ops_get(kind, &ops_srcu_index);
+ }
+#endif
}
- data = NULL;
+ rtnl_nets_init(&rtnl_nets);
+
if (ops) {
- if (ops->maxtype > RTNL_MAX_TYPE)
- return -EINVAL;
+ if (ops->maxtype > RTNL_MAX_TYPE) {
+ ret = -EINVAL;
+ goto put_ops;
+ }
if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
- err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype,
+ ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype,
linkinfo[IFLA_INFO_DATA],
ops->policy, extack);
- if (err < 0)
- return err;
+ if (ret < 0)
+ goto put_ops;
+
data = tbs->attr;
}
+
if (ops->validate) {
- err = ops->validate(tb, data, extack);
- if (err < 0)
- return err;
+ ret = ops->validate(tb, data, extack);
+ if (ret < 0)
+ goto put_ops;
}
- }
- slave_data = NULL;
- if (m_ops) {
- if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
- return -EINVAL;
-
- if (m_ops->slave_maxtype &&
- linkinfo[IFLA_INFO_SLAVE_DATA]) {
- err = nla_parse_nested_deprecated(tbs->slave_attr,
- m_ops->slave_maxtype,
- linkinfo[IFLA_INFO_SLAVE_DATA],
- m_ops->slave_policy,
- extack);
- if (err < 0)
- return err;
- slave_data = tbs->slave_attr;
+ if (ops->peer_type) {
+ peer_net = rtnl_get_peer_net(ops, tb, data, extack);
+ if (IS_ERR(peer_net)) {
+ ret = PTR_ERR(peer_net);
+ goto put_ops;
+ }
+ if (peer_net)
+ rtnl_nets_add(&rtnl_nets, peer_net);
}
}
- if (dev) {
- int status = 0;
-
- if (nlh->nlmsg_flags & NLM_F_EXCL)
- return -EEXIST;
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
- return -EOPNOTSUPP;
+ tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN);
+ if (IS_ERR(tgt_net)) {
+ ret = PTR_ERR(tgt_net);
+ goto put_net;
+ }
- err = validate_linkmsg(dev, tb, extack);
- if (err < 0)
- return err;
+ rtnl_nets_add(&rtnl_nets, tgt_net);
- if (linkinfo[IFLA_INFO_DATA]) {
- if (!ops || ops != dev->rtnl_link_ops ||
- !ops->changelink)
- return -EOPNOTSUPP;
+ if (tb[IFLA_LINK_NETNSID]) {
+ int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
- err = ops->changelink(dev, tb, data, extack);
- if (err < 0)
- return err;
- status |= DO_SETLINK_NOTIFY;
+ link_net = get_net_ns_by_id(tgt_net, id);
+ if (!link_net) {
+ NL_SET_ERR_MSG(extack, "Unknown network namespace id");
+ ret = -EINVAL;
+ goto put_net;
}
- if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
- if (!m_ops || !m_ops->slave_changelink)
- return -EOPNOTSUPP;
+ rtnl_nets_add(&rtnl_nets, link_net);
- err = m_ops->slave_changelink(master_dev, dev, tb,
- slave_data, extack);
- if (err < 0)
- return err;
- status |= DO_SETLINK_NOTIFY;
+ if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ goto put_net;
}
-
- return do_setlink(skb, dev, ifm, extack, tb, status);
}
- if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
- /* No dev found and NLM_F_CREATE not set. Requested dev does not exist,
- * or it's for a group
- */
- if (link_specified)
- return -ENODEV;
- if (tb[IFLA_GROUP])
- return rtnl_group_changelink(skb, net,
- nla_get_u32(tb[IFLA_GROUP]),
- ifm, extack, tb);
- return -ENODEV;
- }
+ rtnl_nets_lock(&rtnl_nets);
+ ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack);
+ rtnl_nets_unlock(&rtnl_nets);
- if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
- return -EOPNOTSUPP;
-
- if (!ops) {
-#ifdef CONFIG_MODULES
- if (kind[0]) {
- __rtnl_unlock();
- request_module("rtnl-link-%s", kind);
- rtnl_lock();
- ops = rtnl_link_ops_get(kind);
- if (ops)
- goto replay;
- }
-#endif
- NL_SET_ERR_MSG(extack, "Unknown device type");
- return -EOPNOTSUPP;
- }
-
- return rtnl_newlink_create(skb, ifm, ops, nlh, tb, data, extack);
-}
-
-static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
-{
- struct rtnl_newlink_tbs *tbs;
- int ret;
-
- tbs = kmalloc(sizeof(*tbs), GFP_KERNEL);
- if (!tbs)
- return -ENOMEM;
-
- ret = __rtnl_newlink(skb, nlh, tbs, extack);
+put_net:
+ rtnl_nets_destroy(&rtnl_nets);
+put_ops:
+ if (ops)
+ rtnl_link_ops_put(ops, ops_srcu_index);
+free:
kfree(tbs);
return ret;
}
@@ -3737,7 +4073,8 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb,
struct ifinfomsg *ifm;
int i, err;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "Invalid header for get link");
return -EINVAL;
}
@@ -3746,7 +4083,6 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb,
return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
ifla_policy, extack);
- ifm = nlmsg_data(nlh);
if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
ifm->ifi_change) {
NL_SET_ERR_MSG(extack, "Invalid values in header for get link request");
@@ -3953,22 +4289,28 @@ static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack);
}
-static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
+static noinline_for_stack u32 rtnl_calcit(struct sk_buff *skb,
+ struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
size_t min_ifinfo_dump_size = 0;
- struct nlattr *tb[IFLA_MAX+1];
u32 ext_filter_mask = 0;
struct net_device *dev;
- int hdrlen;
+ struct nlattr *nla;
+ int hdrlen, rem;
/* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
- if (nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) {
- if (tb[IFLA_EXT_MASK])
- ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+ if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
+ return NLMSG_GOODSIZE;
+
+ nla_for_each_attr_type(nla, IFLA_EXT_MASK,
+ nlmsg_attrdata(nlh, hdrlen),
+ nlmsg_attrlen(nlh, hdrlen), rem) {
+ if (nla_len(nla) == sizeof(u32))
+ ext_filter_mask = nla_get_u32(nla);
}
if (!ext_filter_mask)
@@ -4065,8 +4407,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
}
return skb;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
return NULL;
}
@@ -4291,9 +4632,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
const struct net_device_ops *ops = br_dev->netdev_ops;
+ bool notified = false;
err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
- nlh->nlmsg_flags, extack);
+ nlh->nlmsg_flags, &notified, extack);
if (err)
goto out;
else
@@ -4302,16 +4644,18 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Embedded bridge, macvlan, and any other device support */
if ((ndm->ndm_flags & NTF_SELF)) {
+ bool notified = false;
+
if (dev->netdev_ops->ndo_fdb_add)
err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
vid,
nlh->nlmsg_flags,
- extack);
+ &notified, extack);
else
err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
nlh->nlmsg_flags);
- if (!err) {
+ if (!err && !notified) {
rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH,
ndm->ndm_state);
ndm->ndm_flags &= ~NTF_SELF;
@@ -4411,11 +4755,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ bool notified = false;
ops = br_dev->netdev_ops;
if (!del_bulk) {
if (ops->ndo_fdb_del)
- err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid,
+ &notified, extack);
} else {
if (ops->ndo_fdb_del_bulk)
err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
@@ -4429,10 +4775,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Embedded bridge, macvlan, and any other device support */
if (ndm->ndm_flags & NTF_SELF) {
+ bool notified = false;
+
ops = dev->netdev_ops;
if (!del_bulk) {
if (ops->ndo_fdb_del)
- err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid,
+ &notified, extack);
else
err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
} else {
@@ -4443,7 +4792,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
}
if (!err) {
- if (!del_bulk)
+ if (!del_bulk && !notified)
rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
ndm->ndm_state);
ndm->ndm_flags &= ~NTF_SELF;
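/*
 * Driver-side sketch of the new "notified" contract visible in the calls
 * above: an ndo_fdb_add()/ndo_fdb_del() implementation that sends its own
 * RTM_NEWNEIGH/RTM_DELNEIGH sets *notified, and rtnl_fdb_add()/rtnl_fdb_del()
 * then skip the duplicate rtnl_fdb_notify(). The example_* helpers are
 * placeholders, not real driver code.
 */
static int example_hw_fdb_install(struct net_device *dev,
				  const unsigned char *addr, u16 vid);
static void example_own_fdb_notify(struct net_device *dev,
				   const unsigned char *addr, u16 vid);

static int example_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			       struct net_device *dev,
			       const unsigned char *addr, u16 vid,
			       u16 flags, bool *notified,
			       struct netlink_ext_ack *extack)
{
	int err;

	err = example_hw_fdb_install(dev, addr, vid);
	if (err)
		return err;

	/* This driver emits its own notification, so suppress rtnetlink's. */
	example_own_fdb_notify(dev, addr, vid);
	*notified = true;

	return 0;
}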
@@ -4459,15 +4808,16 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
int *idx,
struct netdev_hw_addr_list *list)
{
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
struct netdev_hw_addr *ha;
- int err;
u32 portid, seq;
+ int err;
portid = NETLINK_CB(cb->skb).portid;
seq = cb->nlh->nlmsg_seq;
list_for_each_entry(ha, &list->list, list) {
- if (*idx < cb->args[2])
+ if (*idx < ctx->fdb_idx)
goto skip;
err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
@@ -4523,12 +4873,12 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh,
struct ndmsg *ndm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request");
return -EINVAL;
}
- ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
ndm->ndm_flags || ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request");
@@ -4606,18 +4956,16 @@ static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh,
static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct net_device *dev;
- struct net_device *br_dev = NULL;
- const struct net_device_ops *ops = NULL;
- const struct net_device_ops *cops = NULL;
+ const struct net_device_ops *ops = NULL, *cops = NULL;
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
+ struct net_device *dev, *br_dev = NULL;
struct net *net = sock_net(skb->sk);
- struct hlist_head *head;
int brport_idx = 0;
int br_idx = 0;
- int h, s_h;
- int idx = 0, s_idx;
- int err = 0;
int fidx = 0;
+ int err;
+
+ NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context);
if (cb->strict_check)
err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx,
@@ -4636,70 +4984,51 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
ops = br_dev->netdev_ops;
}
- s_h = cb->args[0];
- s_idx = cb->args[1];
-
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry(dev, head, index_hlist) {
-
- if (brport_idx && (dev->ifindex != brport_idx))
- continue;
-
- if (!br_idx) { /* user did not specify a specific bridge */
- if (netif_is_bridge_port(dev)) {
- br_dev = netdev_master_upper_dev_get(dev);
- cops = br_dev->netdev_ops;
- }
- } else {
- if (dev != br_dev &&
- !netif_is_bridge_port(dev))
- continue;
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ if (brport_idx && (dev->ifindex != brport_idx))
+ continue;
- if (br_dev != netdev_master_upper_dev_get(dev) &&
- !netif_is_bridge_master(dev))
- continue;
- cops = ops;
+ if (!br_idx) { /* user did not specify a specific bridge */
+ if (netif_is_bridge_port(dev)) {
+ br_dev = netdev_master_upper_dev_get(dev);
+ cops = br_dev->netdev_ops;
}
+ } else {
+ if (dev != br_dev &&
+ !netif_is_bridge_port(dev))
+ continue;
- if (idx < s_idx)
- goto cont;
+ if (br_dev != netdev_master_upper_dev_get(dev) &&
+ !netif_is_bridge_master(dev))
+ continue;
+ cops = ops;
+ }
- if (netif_is_bridge_port(dev)) {
- if (cops && cops->ndo_fdb_dump) {
- err = cops->ndo_fdb_dump(skb, cb,
- br_dev, dev,
- &fidx);
- if (err == -EMSGSIZE)
- goto out;
- }
+ if (netif_is_bridge_port(dev)) {
+ if (cops && cops->ndo_fdb_dump) {
+ err = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
+ &fidx);
+ if (err == -EMSGSIZE)
+ break;
}
+ }
- if (dev->netdev_ops->ndo_fdb_dump)
- err = dev->netdev_ops->ndo_fdb_dump(skb, cb,
- dev, NULL,
- &fidx);
- else
- err = ndo_dflt_fdb_dump(skb, cb, dev, NULL,
- &fidx);
- if (err == -EMSGSIZE)
- goto out;
+ if (dev->netdev_ops->ndo_fdb_dump)
+ err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
+ &fidx);
+ else
+ err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx);
+ if (err == -EMSGSIZE)
+ break;
- cops = NULL;
+ cops = NULL;
- /* reset fdb offset to 0 for rest of the interfaces */
- cb->args[2] = 0;
- fidx = 0;
-cont:
- idx++;
- }
+ /* reset fdb offset to 0 for rest of the interfaces */
+ ctx->fdb_idx = 0;
+ fidx = 0;
}
-out:
- cb->args[0] = h;
- cb->args[1] = idx;
- cb->args[2] = fidx;
+ ctx->fdb_idx = fidx;
return skb->len;
}
@@ -4712,12 +5041,12 @@ static int valid_fdb_get_strict(const struct nlmsghdr *nlh,
struct ndmsg *ndm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
NL_SET_ERR_MSG(extack, "Invalid header for fdb get request");
return -EINVAL;
}
- ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request");
@@ -4984,12 +5313,12 @@ static int valid_bridge_getlink_req(const struct nlmsghdr *nlh,
if (strict_check) {
struct ifinfomsg *ifm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump");
return -EINVAL;
}
- ifm = nlmsg_data(nlh);
if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
ifm->ifi_change || ifm->ifi_index) {
NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request");
@@ -5245,15 +5574,14 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
if (br_spec) {
- nla_for_each_nested(attr, br_spec, rem) {
- if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
- if (nla_len(attr) < sizeof(flags))
- return -EINVAL;
+ nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec,
+ rem) {
+ if (nla_len(attr) < sizeof(flags))
+ return -EINVAL;
- have_flags = true;
- flags = nla_get_u16(attr);
- break;
- }
+ have_flags = true;
+ flags = nla_get_u16(attr);
+ break;
}
}
@@ -5882,7 +6210,8 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
{
struct if_stats_msg *ifsm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) {
+ ifsm = nlmsg_payload(nlh, sizeof(*ifsm));
+ if (!ifsm) {
NL_SET_ERR_MSG(extack, "Invalid header for stats dump");
return -EINVAL;
}
@@ -5890,8 +6219,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
if (!strict_check)
return 0;
- ifsm = nlmsg_data(nlh);
-
/* only requests using strict checks can pass data to influence
* the dump. The legacy exception is filter_mask.
*/
@@ -5962,19 +6289,17 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct netlink_ext_ack *extack = cb->extack;
- int h, s_h, err, s_idx, s_idxattr, s_prividx;
struct rtnl_stats_dump_filters filters;
struct net *net = sock_net(skb->sk);
unsigned int flags = NLM_F_MULTI;
struct if_stats_msg *ifsm;
- struct hlist_head *head;
+ struct {
+ unsigned long ifindex;
+ int idxattr;
+ int prividx;
+ } *ctx = (void *)cb->ctx;
struct net_device *dev;
- int idx = 0;
-
- s_h = cb->args[0];
- s_idx = cb->args[1];
- s_idxattr = cb->args[2];
- s_prividx = cb->args[3];
+ int err;
cb->seq = net->dev_base_seq;
@@ -5993,39 +6318,26 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (err)
return err;
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry(dev, head, index_hlist) {
- if (idx < s_idx)
- goto cont;
- err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, 0,
- flags, &filters,
- &s_idxattr, &s_prividx,
- extack);
- /* If we ran out of room on the first message,
- * we're in trouble
- */
- WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ flags, &filters,
+ &ctx->idxattr, &ctx->prividx,
+ extack);
+ /* If we ran out of room on the first message,
+ * we're in trouble.
+ */
+ WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
- if (err < 0)
- goto out;
- s_prividx = 0;
- s_idxattr = 0;
- nl_dump_check_consistent(cb, nlmsg_hdr(skb));
-cont:
- idx++;
- }
+ if (err < 0)
+ break;
+ ctx->prividx = 0;
+ ctx->idxattr = 0;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
}
-out:
- cb->args[3] = s_prividx;
- cb->args[2] = s_idxattr;
- cb->args[1] = idx;
- cb->args[0] = h;
- return skb->len;
+ return err;
}
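/*
 * Condensed sketch of the dump pattern rtnl_stats_dump() and rtnl_fdb_dump()
 * switch to in this patch: resume state lives in a small struct overlaid on
 * cb->ctx (size-checked with NL_ASSERT_CTX_FITS elsewhere in the patch), and
 * the device walk uses for_each_netdev_dump(), which restarts from
 * ctx->ifindex on the next invocation. example_fill_one() is a placeholder.
 */
static int example_fill_one(struct sk_buff *skb, struct net_device *dev,
			    struct netlink_callback *cb);

static int example_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct {
		unsigned long ifindex;
	} *ctx = (void *)cb->ctx;
	struct net *net = sock_net(skb->sk);
	struct net_device *dev;
	int err;

	for_each_netdev_dump(net, dev, ctx->ifindex) {
		err = example_fill_one(skb, dev, cb);
		if (err == -EMSGSIZE)
			break;	/* skb full: resume from ctx->ifindex next call */
		if (err < 0)
			return err;
	}

	return skb->len;
}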
void rtnl_offload_xstats_notify(struct net_device *dev)
@@ -6134,12 +6446,12 @@ static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh,
{
struct br_port_msg *bpm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) {
+ bpm = nlmsg_payload(nlh, sizeof(*bpm));
+ if (!bpm) {
NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request");
return -EINVAL;
}
- bpm = nlmsg_data(nlh);
if (bpm->ifindex) {
NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request");
return -EINVAL;
@@ -6164,7 +6476,7 @@ static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
int idx, s_idx;
int err;
- NL_ASSERT_DUMP_CTX_FITS(struct rtnl_mdb_dump_ctx);
+ NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx);
if (cb->strict_check) {
err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack);
@@ -6484,6 +6796,52 @@ static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Process one rtnetlink message. */
+static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED);
+ rtnl_dumpit_func dumpit = cb->data;
+ int err;
+
+ /* Previous iteration has already finished; avoid calling ->dumpit()
+ * again, as it may not expect to be called after it reached the end.
+ */
+ if (!dumpit)
+ return 0;
+
+ if (needs_lock)
+ rtnl_lock();
+ err = dumpit(skb, cb);
+ if (needs_lock)
+ rtnl_unlock();
+
+ /* Old dump handlers used to send NLM_DONE in a separate recvmsg().
+ * Some applications that parse netlink manually depend on this.
+ */
+ if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) {
+ if (err < 0 && err != -EMSGSIZE)
+ return err;
+ if (!err)
+ cb->data = NULL;
+
+ return skb->len;
+ }
+ return err;
+}
+
+static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct netlink_dump_control *control)
+{
+ if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE ||
+ !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) {
+ WARN_ON(control->data);
+ control->data = control->dump;
+ control->dump = rtnl_dumpit;
+ }
+
+ return netlink_dump_start(ssk, skb, nlh, control);
+}
+
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -6548,7 +6906,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
.module = owner,
.flags = flags,
};
- err = netlink_dump_start(rtnl, skb, nlh, &c);
+ err = rtnetlink_dump_start(rtnl, skb, nlh, &c);
/* netlink_dump_start() will keep a reference on
* module if dump is still in progress.
*/
@@ -6663,7 +7021,6 @@ static int __net_init rtnetlink_net_init(struct net *net)
struct netlink_kernel_cfg cfg = {
.groups = RTNLGRP_MAX,
.input = rtnetlink_rcv,
- .cb_mutex = &rtnl_mutex,
.flags = NL_CFG_F_NONROOT_RECV,
.bind = rtnetlink_bind,
};
@@ -6686,6 +7043,41 @@ static struct pernet_operations rtnetlink_net_ops = {
.exit = rtnetlink_net_exit,
};
+static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_DELLINK, .doit = rtnl_dellink,
+ .flags = RTNL_FLAG_DOIT_PERNET_WIP},
+ {.msgtype = RTM_GETLINK, .doit = rtnl_getlink,
+ .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
+ {.msgtype = RTM_SETLINK, .doit = rtnl_setlink,
+ .flags = RTNL_FLAG_DOIT_PERNET_WIP},
+ {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all},
+ {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all},
+ {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all},
+ {.msgtype = RTM_GETSTATS, .doit = rtnl_stats_get,
+ .dumpit = rtnl_stats_dump},
+ {.msgtype = RTM_SETSTATS, .doit = rtnl_stats_set},
+ {.msgtype = RTM_NEWLINKPROP, .doit = rtnl_newlinkprop},
+ {.msgtype = RTM_DELLINKPROP, .doit = rtnl_dellinkprop},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_GETLINK,
+ .dumpit = rtnl_bridge_getlink},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_DELLINK,
+ .doit = rtnl_bridge_dellink},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_SETLINK,
+ .doit = rtnl_bridge_setlink},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_NEWNEIGH, .doit = rtnl_fdb_add},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_DELNEIGH, .doit = rtnl_fdb_del,
+ .flags = RTNL_FLAG_BULK_DEL_SUPPORTED},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_GETNEIGH, .doit = rtnl_fdb_get,
+ .dumpit = rtnl_fdb_dump},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB, .doit = rtnl_mdb_add},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_DELMDB, .doit = rtnl_mdb_del,
+ .flags = RTNL_FLAG_BULK_DEL_SUPPORTED},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_GETMDB, .doit = rtnl_mdb_get,
+ .dumpit = rtnl_mdb_dump},
+};
+
void __init rtnetlink_init(void)
{
if (register_pernet_subsys(&rtnetlink_net_ops))
@@ -6693,34 +7085,5 @@ void __init rtnetlink_init(void)
register_netdevice_notifier(&rtnetlink_dev_notifier);
- rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
- rtnl_dump_ifinfo, 0);
- rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0);
-
- rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0);
- rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0);
- rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0);
-
- rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0);
-
- rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL,
- RTNL_FLAG_BULK_DEL_SUPPORTED);
- rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0);
-
- rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0);
- rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0);
-
- rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump,
- 0);
- rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0);
-
- rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0);
- rtnl_register(PF_BRIDGE, RTM_NEWMDB, rtnl_mdb_add, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL,
- RTNL_FLAG_BULK_DEL_SUPPORTED);
+ rtnl_register_many(rtnetlink_rtnl_msg_handlers);
}
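
rtnl_register_many() registers every entry of a const rtnl_msg_handler array in one call, unwinding the already-registered entries if any registration fails, which is what replaces the long run of rtnl_register() calls removed above. A hedged sketch of how another subsystem could use the same helper; the foo_* handlers and the chosen message types are illustrative only:

static const struct rtnl_msg_handler foo_rtnl_msg_handlers[] __initconst = {
        {.protocol = PF_BRIDGE, .msgtype = RTM_NEWVLAN, .doit = foo_vlan_add},
        {.protocol = PF_BRIDGE, .msgtype = RTM_DELVLAN, .doit = foo_vlan_del,
         .flags = RTNL_FLAG_BULK_DEL_SUPPORTED},
        {.protocol = PF_BRIDGE, .msgtype = RTM_GETVLAN, .dumpit = foo_vlan_dump},
};

static int __init foo_init(void)
{
        /* Registers all entries; rolls back and returns an error if one fails. */
        return rtnl_register_many(foo_rtnl_msg_handlers);
}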
diff --git a/net/core/scm.c b/net/core/scm.c
index 9cd4b0a01cd6..0225bd94170f 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -89,6 +89,12 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
fpl->count_unix = 0;
fpl->max = SCM_MAX_FD;
fpl->user = NULL;
+#if IS_ENABLED(CONFIG_UNIX)
+ fpl->inflight = false;
+ fpl->dead = false;
+ fpl->edges = NULL;
+ INIT_LIST_HEAD(&fpl->vertices);
+#endif
}
fpp = &fpl->fp[fpl->count];
@@ -276,6 +282,16 @@ efault:
}
EXPORT_SYMBOL(put_cmsg);
+int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len,
+ void *data)
+{
+ /* Don't produce truncated CMSGs */
+ if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len))
+ return -ETOOSMALL;
+
+ return put_cmsg(msg, level, type, len, data);
+}
+
void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal)
{
struct scm_timestamping64 tss;
@@ -376,9 +392,137 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
+
new_fpl->max = new_fpl->count;
new_fpl->user = get_uid(fpl->user);
+#if IS_ENABLED(CONFIG_UNIX)
+ new_fpl->inflight = false;
+ new_fpl->edges = NULL;
+ INIT_LIST_HEAD(&new_fpl->vertices);
+#endif
}
return new_fpl;
}
EXPORT_SYMBOL(scm_fp_dup);
+
+#ifdef CONFIG_SECURITY_NETWORK
+static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct lsm_context ctx;
+ int err;
+
+ if (sk->sk_scm_security) {
+ err = security_secid_to_secctx(scm->secid, &ctx);
+
+ if (err >= 0) {
+ put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len,
+ ctx.context);
+
+ security_release_secctx(&ctx);
+ }
+ }
+}
+
+static bool scm_has_secdata(struct sock *sk)
+{
+ return sk->sk_scm_security;
+}
+#else
+static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm)
+{
+}
+
+static bool scm_has_secdata(struct sock *sk)
+{
+ return false;
+}
+#endif
+
+static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct file *pidfd_file = NULL;
+ int len, pidfd;
+
+ /* put_cmsg() doesn't return an error if the CMSG is truncated,
+ * which is why we need to open-code these checks here.
+ */
+ if (msg->msg_flags & MSG_CMSG_COMPAT)
+ len = sizeof(struct compat_cmsghdr) + sizeof(int);
+ else
+ len = sizeof(struct cmsghdr) + sizeof(int);
+
+ if (msg->msg_controllen < len) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return;
+ }
+
+ if (!scm->pid)
+ return;
+
+ pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file);
+
+ if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
+ if (pidfd_file) {
+ put_unused_fd(pidfd);
+ fput(pidfd_file);
+ }
+
+ return;
+ }
+
+ if (pidfd_file)
+ fd_install(pidfd, pidfd_file);
+}
+
+static bool __scm_recv_common(struct sock *sk, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!msg->msg_control) {
+ if (sk->sk_scm_credentials || sk->sk_scm_pidfd ||
+ scm->fp || scm_has_secdata(sk))
+ msg->msg_flags |= MSG_CTRUNC;
+
+ scm_destroy(scm);
+ return false;
+ }
+
+ if (sk->sk_scm_credentials) {
+ struct user_namespace *current_ns = current_user_ns();
+ struct ucred ucreds = {
+ .pid = scm->creds.pid,
+ .uid = from_kuid_munged(current_ns, scm->creds.uid),
+ .gid = from_kgid_munged(current_ns, scm->creds.gid),
+ };
+
+ put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
+ }
+
+ scm_passec(sk, msg, scm);
+
+ if (scm->fp)
+ scm_detach_fds(msg, scm);
+
+ return true;
+}
+
+void scm_recv(struct socket *sock, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!__scm_recv_common(sock->sk, msg, scm, flags))
+ return;
+
+ scm_destroy_cred(scm);
+}
+EXPORT_SYMBOL(scm_recv);
+
+void scm_recv_unix(struct socket *sock, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!__scm_recv_common(sock->sk, msg, scm, flags))
+ return;
+
+ if (sock->sk->sk_scm_pidfd)
+ scm_pidfd_recv(msg, scm);
+
+ scm_destroy_cred(scm);
+}
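
On the kernel side, scm_pidfd_recv() only attaches an SCM_PIDFD control message when the receiving socket opted in (sk->sk_scm_pidfd, toggled by the SO_PASSPIDFD socket option). A minimal userspace sketch of the receive side, assuming a libc whose headers expose SO_PASSPIDFD and SCM_PIDFD; error handling is trimmed:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Returns bytes received; *pidfd is the sender's pidfd, or -1 if absent. */
static ssize_t recv_with_pidfd(int fd, void *buf, size_t len, int *pidfd)
{
        char cbuf[CMSG_SPACE(sizeof(int))];
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
        };
        struct cmsghdr *cmsg;
        int one = 1;
        ssize_t ret;

        setsockopt(fd, SOL_SOCKET, SO_PASSPIDFD, &one, sizeof(one));

        ret = recvmsg(fd, &msg, 0);
        if (ret < 0)
                return ret;

        *pidfd = -1;
        for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
                if (cmsg->cmsg_level == SOL_SOCKET &&
                    cmsg->cmsg_type == SCM_PIDFD)
                        memcpy(pidfd, CMSG_DATA(cmsg), sizeof(*pidfd));

        return ret;
}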
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index b0ff6153be62..9a3965680451 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -71,7 +71,7 @@ u32 secure_tcpv6_ts_off(const struct net *net,
return siphash(&combined, offsetofend(typeof(combined), daddr),
&ts_secret);
}
-EXPORT_SYMBOL(secure_tcpv6_ts_off);
+EXPORT_IPV6_MOD(secure_tcpv6_ts_off);
u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
__be16 sport, __be16 dport)
@@ -156,45 +156,3 @@ u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
}
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
#endif
-
-#if IS_ENABLED(CONFIG_IP_DCCP)
-u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport)
-{
- u64 seq;
- net_secret_init();
- seq = siphash_3u32((__force u32)saddr, (__force u32)daddr,
- (__force u32)sport << 16 | (__force u32)dport,
- &net_secret);
- seq += ktime_get_real_ns();
- seq &= (1ull << 48) - 1;
- return seq;
-}
-EXPORT_SYMBOL(secure_dccp_sequence_number);
-
-#if IS_ENABLED(CONFIG_IPV6)
-u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
- __be16 sport, __be16 dport)
-{
- const struct {
- struct in6_addr saddr;
- struct in6_addr daddr;
- __be16 sport;
- __be16 dport;
- } __aligned(SIPHASH_ALIGNMENT) combined = {
- .saddr = *(struct in6_addr *)saddr,
- .daddr = *(struct in6_addr *)daddr,
- .sport = sport,
- .dport = dport
- };
- u64 seq;
- net_secret_init();
- seq = siphash(&combined, offsetofend(typeof(combined), dport),
- &net_secret);
- seq += ktime_get_real_ns();
- seq &= (1ull << 48) - 1;
- return seq;
-}
-EXPORT_SYMBOL(secure_dccpv6_sequence_number);
-#endif
-#endif
diff --git a/net/core/selftests.c b/net/core/selftests.c
index 8f801e6e3b91..406faf8e5f3f 100644
--- a/net/core/selftests.c
+++ b/net/core/selftests.c
@@ -100,10 +100,10 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev,
ehdr->h_proto = htons(ETH_P_IP);
if (attr->tcp) {
+ memset(thdr, 0, sizeof(*thdr));
thdr->source = htons(attr->sport);
thdr->dest = htons(attr->dport);
thdr->doff = sizeof(struct tcphdr) / 4;
- thdr->check = 0;
} else {
uhdr->source = htons(attr->sport);
uhdr->dest = htons(attr->dport);
@@ -144,16 +144,25 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev,
attr->id = net_test_next_id;
shdr->id = net_test_next_id++;
- if (attr->size)
- skb_put(skb, attr->size);
- if (attr->max_size && attr->max_size > skb->len)
- skb_put(skb, attr->max_size - skb->len);
+ if (attr->size) {
+ void *payload = skb_put(skb, attr->size);
+
+ memset(payload, 0, attr->size);
+ }
+
+ if (attr->max_size && attr->max_size > skb->len) {
+ size_t pad_len = attr->max_size - skb->len;
+ void *pad = skb_put(skb, pad_len);
+
+ memset(pad, 0, pad_len);
+ }
skb->csum = 0;
skb->ip_summed = CHECKSUM_PARTIAL;
if (attr->tcp) {
- thdr->check = ~tcp_v4_check(skb->len, ihdr->saddr,
- ihdr->daddr, 0);
+ int l4len = skb->len - skb_transport_offset(skb);
+
+ thdr->check = ~tcp_v4_check(l4len, ihdr->saddr, ihdr->daddr, 0);
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct tcphdr, check);
} else {
@@ -299,7 +308,7 @@ static int net_test_phy_loopback_enable(struct net_device *ndev)
if (!ndev->phydev)
return -EOPNOTSUPP;
- return phy_loopback(ndev->phydev, true);
+ return phy_loopback(ndev->phydev, true, 0);
}
static int net_test_phy_loopback_disable(struct net_device *ndev)
@@ -307,7 +316,7 @@ static int net_test_phy_loopback_disable(struct net_device *ndev)
if (!ndev->phydev)
return -EOPNOTSUPP;
- return phy_loopback(ndev->phydev, false);
+ return phy_loopback(ndev->phydev, false, 0);
}
static int net_test_phy_loopback_udp(struct net_device *ndev)
diff --git a/net/core/skb_fault_injection.c b/net/core/skb_fault_injection.c
new file mode 100644
index 000000000000..4235db6bdfad
--- /dev/null
+++ b/net/core/skb_fault_injection.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/debugfs.h>
+#include <linux/fault-inject.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+static struct {
+ struct fault_attr attr;
+ char devname[IFNAMSIZ];
+ bool filtered;
+} skb_realloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .filtered = false,
+};
+
+static bool should_fail_net_realloc_skb(struct sk_buff *skb)
+{
+ struct net_device *net = skb->dev;
+
+ if (skb_realloc.filtered &&
+ strncmp(net->name, skb_realloc.devname, IFNAMSIZ))
+ /* device name filter set, but names do not match */
+ return false;
+
+ if (!should_fail(&skb_realloc.attr, 1))
+ return false;
+
+ return true;
+}
+ALLOW_ERROR_INJECTION(should_fail_net_realloc_skb, TRUE);
+
+void skb_might_realloc(struct sk_buff *skb)
+{
+ if (!should_fail_net_realloc_skb(skb))
+ return;
+
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_might_realloc);
+
+static int __init fail_skb_realloc_setup(char *str)
+{
+ return setup_fault_attr(&skb_realloc.attr, str);
+}
+__setup("fail_skb_realloc=", fail_skb_realloc_setup);
+
+static void reset_settings(void)
+{
+ skb_realloc.filtered = false;
+ memset(&skb_realloc.devname, 0, IFNAMSIZ);
+}
+
+static ssize_t devname_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ ssize_t ret;
+
+ reset_settings();
+ ret = simple_write_to_buffer(&skb_realloc.devname, IFNAMSIZ,
+ ppos, buffer, count);
+ if (ret < 0)
+ return ret;
+
+ skb_realloc.devname[IFNAMSIZ - 1] = '\0';
+ /* Remove a possible \n at the end of devname */
+ strim(skb_realloc.devname);
+
+ if (strnlen(skb_realloc.devname, IFNAMSIZ))
+ skb_realloc.filtered = true;
+
+ return count;
+}
+
+static ssize_t devname_read(struct file *file,
+ char __user *buffer,
+ size_t size, loff_t *ppos)
+{
+ if (!skb_realloc.filtered)
+ return 0;
+
+ return simple_read_from_buffer(buffer, size, ppos, &skb_realloc.devname,
+ strlen(skb_realloc.devname));
+}
+
+static const struct file_operations devname_ops = {
+ .write = devname_write,
+ .read = devname_read,
+};
+
+static int __init fail_skb_realloc_debugfs(void)
+{
+ umode_t mode = S_IFREG | 0600;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_skb_realloc", NULL,
+ &skb_realloc.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ debugfs_create_file("devname", mode, dir, NULL, &devname_ops);
+
+ return 0;
+}
+
+late_initcall(fail_skb_realloc_debugfs);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b99127712e67..d6420b74ea9c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -51,6 +51,7 @@
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
@@ -63,11 +64,13 @@
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>
+#include <linux/crc32.h>
#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/gro.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
@@ -87,13 +90,17 @@
#include <linux/textsearch.h>
#include "dev.h"
+#include "devmem.h"
+#include "netmem_priv.h"
#include "sock_destructor.h"
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
-#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
+#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
+ GRO_MAX_HEAD_PAD))
/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
* This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
@@ -108,9 +115,6 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#define SKB_SMALL_HEAD_HEADROOM \
SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
-int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
-EXPORT_SYMBOL(sysctl_max_skb_frags);
-
/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
 * iov_iter_bvec(). These static asserts ensure the cast is valid as long as the
* netmem is a page.
@@ -221,98 +225,31 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
#define NAPI_SKB_CACHE_BULK 16
#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
-#if PAGE_SIZE == SZ_4K
-
-#define NAPI_HAS_SMALL_PAGE_FRAG 1
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc)
-
-/* specialized page frag allocator using a single order 0 page
- * and slicing it into 1K sized fragment. Constrained to systems
- * with a very limited amount of 1K fragments fitting a single
- * page - to avoid excessive truesize underestimation
- */
-
-struct page_frag_1k {
- void *va;
- u16 offset;
- bool pfmemalloc;
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
-{
- struct page *page;
- int offset;
-
- offset = nc->offset - SZ_1K;
- if (likely(offset >= 0))
- goto use_frag;
-
- page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
- if (!page)
- return NULL;
-
- nc->va = page_address(page);
- nc->pfmemalloc = page_is_pfmemalloc(page);
- offset = PAGE_SIZE - SZ_1K;
- page_ref_add(page, offset / SZ_1K);
-
-use_frag:
- nc->offset = offset;
- return nc->va + offset;
-}
-#else
-
-/* the small page is actually unused in this build; add dummy helpers
- * to please the compiler and avoid later preprocessor's conditionals
- */
-#define NAPI_HAS_SMALL_PAGE_FRAG 0
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false
-
-struct page_frag_1k {
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
-{
- return NULL;
-}
-
-#endif
-
struct napi_alloc_cache {
+ local_lock_t bh_lock;
struct page_frag_cache page;
- struct page_frag_1k page_small;
unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
-
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-void napi_get_frags_check(struct napi_struct *napi)
-{
- struct sk_buff *skb;
-
- local_bh_disable();
- skb = napi_get_frags(napi);
- WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
- napi_free_frags(napi);
- local_bh_enable();
-}
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ void *data;
fragsz = SKB_DATA_ALIGN(fragsz);
- return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
- align_mask);
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ data = __page_frag_alloc_align(&nc->page, fragsz,
+ GFP_ATOMIC | __GFP_NOWARN, align_mask);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ return data;
+
}
EXPORT_SYMBOL(__napi_alloc_frag_align);
@@ -320,19 +257,16 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
void *data;
- fragsz = SKB_DATA_ALIGN(fragsz);
if (in_hardirq() || irqs_disabled()) {
struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
- data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
+ fragsz = SKB_DATA_ALIGN(fragsz);
+ data = __page_frag_alloc_align(nc, fragsz,
+ GFP_ATOMIC | __GFP_NOWARN,
align_mask);
} else {
- struct napi_alloc_cache *nc;
-
local_bh_disable();
- nc = this_cpu_ptr(&napi_alloc_cache);
- data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
- align_mask);
+ data = __napi_alloc_frag_align(fragsz, align_mask);
local_bh_enable();
}
return data;
@@ -344,21 +278,87 @@ static struct sk_buff *napi_skb_cache_get(void)
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!nc->skb_count)) {
nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
- GFP_ATOMIC,
+ GFP_ATOMIC | __GFP_NOWARN,
NAPI_SKB_CACHE_BULK,
nc->skb_cache);
- if (unlikely(!nc->skb_count))
+ if (unlikely(!nc->skb_count)) {
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
return NULL;
+ }
}
skb = nc->skb_cache[--nc->skb_count];
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));
return skb;
}
+/**
+ * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
+ * @skbs: pointer to an at least @n-sized array to fill with skb pointers
+ * @n: number of entries to provide
+ *
+ * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
+ * the pointers into the provided array @skbs. If fewer entries are available,
+ * it tries to replenish the cache and bulk-allocates the difference from
+ * the MM layer if needed.
+ * The heads are zeroed with either memset() or %__GFP_ZERO, so they are
+ * ready for {,__}build_skb_around() and don't have any data buffers attached.
+ * Must be called *only* from the BH context.
+ *
+ * Return: number of successfully allocated skbs (@n if no actual allocation
+ * needed or kmem_cache_alloc_bulk() didn't fail).
+ */
+u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ u32 bulk, total = n;
+
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
+ if (nc->skb_count >= n)
+ goto get;
+
+ /* Not enough cached skbs. Try refilling the cache first */
+ bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
+ nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN, bulk,
+ &nc->skb_cache[nc->skb_count]);
+ if (likely(nc->skb_count >= n))
+ goto get;
+
+ /* Still not enough. Bulk-allocate the missing part directly, zeroed */
+ n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
+ n - nc->skb_count, &skbs[nc->skb_count]);
+ if (likely(nc->skb_count >= n))
+ goto get;
+
+ /* kmem_cache didn't allocate the number we need, limit the output */
+ total -= n - nc->skb_count;
+ n = nc->skb_count;
+
+get:
+ for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
+ u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache);
+
+ skbs[i] = nc->skb_cache[base + i];
+
+ kasan_mempool_unpoison_object(skbs[i], cache_size);
+ memset(skbs[i], 0, offsetof(struct sk_buff, tail));
+ }
+
+ nc->skb_count -= n;
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+
+ return total;
+}
+EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk);
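
A hedged sketch of how a NAPI poll loop might consume napi_skb_cache_get_bulk(): the zeroed heads are attached to already-filled receive buffers with build_skb_around(), as the kernel-doc above suggests. The foo_* names, buffer layout and batch size are illustrative assumptions, not an existing driver API:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

#define FOO_RX_BATCH    32

/* bufs[i] points at a filled RX buffer; buf_truesize must include tailroom
 * for struct skb_shared_info; frame_len[i] is the received frame length.
 */
static int foo_rx_bulk(struct napi_struct *napi, void **bufs,
                       unsigned int buf_truesize,
                       const unsigned int *frame_len, u32 budget)
{
        void *skbs[FOO_RX_BATCH];
        u32 i, n;

        /* BH (NAPI) context only; may return fewer heads than requested. */
        n = napi_skb_cache_get_bulk(skbs, min_t(u32, budget, FOO_RX_BATCH));

        for (i = 0; i < n; i++) {
                struct sk_buff *skb;

                skb = build_skb_around(skbs[i], bufs[i], buf_truesize);
                skb_put(skb, frame_len[i]);
                skb->protocol = eth_type_trans(skb, napi->dev);
                napi_gro_receive(napi, skb);
        }

        return i;
}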
+
static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
unsigned int size)
{
@@ -412,7 +412,8 @@ struct sk_buff *slab_build_skb(void *data)
struct sk_buff *skb;
unsigned int size;
- skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!skb))
return NULL;
@@ -463,7 +464,8 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb;
- skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!skb))
return NULL;
@@ -726,7 +728,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
*/
- if (len <= SKB_WITH_OVERHEAD(1024) ||
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
@@ -743,12 +745,16 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
+ pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
+ pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
+
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
local_bh_enable();
}
@@ -775,10 +781,9 @@ skb_fail:
EXPORT_SYMBOL(__netdev_alloc_skb);
/**
- * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
* @napi: napi instance this buffer was allocated for
* @len: length to allocate
- * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
*
* Allocate a new sk_buff for use in NAPI receive. This buffer will
* attempt to allocate the head from a special reserved region used
@@ -787,9 +792,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
*
* %NULL is returned if there is no free memory.
*/
-struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
- gfp_t gfp_mask)
+struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
{
+ gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
struct napi_alloc_cache *nc;
struct sk_buff *skb;
bool pfmemalloc;
@@ -800,10 +805,8 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
- * When the small frag allocator is available, prefer it over kmalloc
- * for small fragments
*/
- if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -813,32 +816,17 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
goto skb_success;
}
- nc = this_cpu_ptr(&napi_alloc_cache);
+ len = SKB_HEAD_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
- /* we are artificially inflating the allocation size, but
- * that is not as bad as it may look like, as:
- * - 'len' less than GRO_MAX_HEAD makes little sense
- * - On most systems, larger 'len' values lead to fragment
- * size above 512 bytes
- * - kmalloc would use the kmalloc-1k slab for such values
- * - Builds with smaller GRO_MAX_HEAD will very likely do
- * little networking, as that implies no WiFi and no
- * tunnels support, and 32 bits arches.
- */
- len = SZ_1K;
-
- data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
- pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
- } else {
- len = SKB_HEAD_ALIGN(len);
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ nc = this_cpu_ptr(&napi_alloc_cache);
- data = page_frag_alloc(&nc->page, len, gfp_mask);
- pfmemalloc = nc->page.pfmemalloc;
- }
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
+ pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!data))
return NULL;
@@ -860,7 +848,7 @@ skb_success:
skb_fail:
return skb;
}
-EXPORT_SYMBOL(__napi_alloc_skb);
+EXPORT_SYMBOL(napi_alloc_skb);
void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
int off, int size, unsigned int truesize)
@@ -907,11 +895,6 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
-static bool is_pp_page(struct page *page)
-{
- return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
-}
-
int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
unsigned int headroom)
{
@@ -995,7 +978,7 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
EXPORT_SYMBOL(skb_pp_cow_data);
int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
- struct bpf_prog *prog)
+ const struct bpf_prog *prog)
{
if (!prog->aux->xdp_has_frags)
return -EINVAL;
@@ -1005,56 +988,25 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
EXPORT_SYMBOL(skb_cow_data_for_xdp);
#if IS_ENABLED(CONFIG_PAGE_POOL)
-bool napi_pp_put_page(struct page *page, bool napi_safe)
+bool napi_pp_put_page(netmem_ref netmem)
{
- bool allow_direct = false;
- struct page_pool *pp;
-
- page = compound_head(page);
+ netmem = netmem_compound_head(netmem);
- /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
- * in order to preserve any existing bits, such as bit 0 for the
- * head page of compound page and bit 1 for pfmemalloc page, so
- * mask those bits for freeing side when doing below checking,
- * and page_is_pfmemalloc() is checked in __page_pool_put_page()
- * to avoid recycling the pfmemalloc page.
- */
- if (unlikely(!is_pp_page(page)))
+ if (unlikely(!netmem_is_pp(netmem)))
return false;
- pp = page->pp;
-
- /* Allow direct recycle if we have reasons to believe that we are
- * in the same context as the consumer would run, so there's
- * no possible race.
- * __page_pool_put_page() makes sure we're not in hardirq context
- * and interrupts are enabled prior to accessing the cache.
- */
- if (napi_safe || in_softirq()) {
- const struct napi_struct *napi = READ_ONCE(pp->p.napi);
- unsigned int cpuid = smp_processor_id();
-
- allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
- allow_direct |= READ_ONCE(pp->cpuid) == cpuid;
- }
-
- /* Driver set this to memory recycling info. Reset it on recycle.
- * This will *not* work for NIC using a split-page memory model.
- * The page will be returned to the pool here regardless of the
- * 'flipped' fragment being in use or not.
- */
- page_pool_put_full_page(pp, page, allow_direct);
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
return true;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif
-static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
return false;
- return napi_pp_put_page(virt_to_page(data), napi_safe);
+ return napi_pp_put_page(page_to_netmem(virt_to_page(data)));
}
/**
@@ -1070,7 +1022,7 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
static int skb_pp_frag_ref(struct sk_buff *skb)
{
struct skb_shared_info *shinfo;
- struct page *head_page;
+ netmem_ref head_netmem;
int i;
if (!skb->pp_recycle)
@@ -1079,11 +1031,11 @@ static int skb_pp_frag_ref(struct sk_buff *skb)
shinfo = skb_shinfo(skb);
for (i = 0; i < shinfo->nr_frags; i++) {
- head_page = compound_head(skb_frag_page(&shinfo->frags[i]));
- if (likely(is_pp_page(head_page)))
- page_pool_ref_page(head_page);
+ head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
+ if (likely(netmem_is_pp(head_netmem)))
+ page_pool_ref_netmem(head_netmem);
else
- page_ref_inc(head_page);
+ page_ref_inc(netmem_to_page(head_netmem));
}
return 0;
}
@@ -1096,12 +1048,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
kfree(head);
}
-static void skb_free_head(struct sk_buff *skb, bool napi_safe)
+static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;
if (skb->head_frag) {
- if (skb_pp_recycle(skb, head, napi_safe))
+ if (skb_pp_recycle(skb, head))
return;
skb_free_frag(head);
} else {
@@ -1109,8 +1061,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe)
}
}
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
- bool napi_safe)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int i;
@@ -1127,13 +1078,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
}
for (i = 0; i < shinfo->nr_frags; i++)
- napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
+ __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
free_head:
if (shinfo->frag_list)
kfree_skb_list_reason(shinfo->frag_list, reason);
- skb_free_head(skb, napi_safe);
+ skb_free_head(skb);
exit:
 /* When we clone an SKB we copy the recycling bit. The pp_recycle
* bit is only set on the head though, so in order to avoid races
@@ -1194,12 +1145,11 @@ void skb_release_head_state(struct sk_buff *skb)
}
/* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
- bool napi_safe)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
skb_release_head_state(skb);
if (likely(skb->head))
- skb_release_data(skb, reason, napi_safe);
+ skb_release_data(skb, reason);
}
/**
@@ -1213,13 +1163,14 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
void __kfree_skb(struct sk_buff *skb)
{
- skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
+ skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
static __always_inline
-bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason reason)
{
if (unlikely(!skb_unref(skb)))
return false;
@@ -1232,26 +1183,27 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
if (reason == SKB_CONSUMED)
trace_consume_skb(skb, __builtin_return_address(0));
else
- trace_kfree_skb(skb, __builtin_return_address(0), reason);
+ trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
return true;
}
/**
- * kfree_skb_reason - free an sk_buff with special reason
+ * sk_skb_reason_drop - free an sk_buff with special reason
+ * @sk: the socket to receive @skb, or NULL if not applicable
* @skb: buffer to free
* @reason: reason why this skb is dropped
*
- * Drop a reference to the buffer and free it if the usage count has
- * hit zero. Meanwhile, pass the drop reason to 'kfree_skb'
- * tracepoint.
+ * Drop a reference to the buffer and free it if the usage count has hit
+ * zero. Meanwhile, pass the receiving socket and drop reason to
+ * 'kfree_skb' tracepoint.
*/
void __fix_address
-kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
- if (__kfree_skb_reason(skb, reason))
+ if (__sk_skb_reason_drop(sk, skb, reason))
__kfree_skb(skb);
}
-EXPORT_SYMBOL(kfree_skb_reason);
+EXPORT_SYMBOL(sk_skb_reason_drop);
#define KFREE_SKB_BULK_SIZE 16
@@ -1270,7 +1222,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
return;
}
- skb_release_all(skb, reason, false);
+ skb_release_all(skb, reason);
sa->skb_array[sa->skb_count++] = skb;
if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1290,7 +1242,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
while (segs) {
struct sk_buff *next = segs->next;
- if (__kfree_skb_reason(segs, reason)) {
+ if (__sk_skb_reason_drop(NULL, segs, reason)) {
skb_poison_list(segs);
kfree_skb_add_bulk(segs, &sa, reason);
}
@@ -1331,22 +1283,28 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
has_trans = skb_transport_header_was_set(skb);
printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
- "mac=(%d,%d) net=(%d,%d) trans=%d\n"
+ "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
"shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
- "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
- "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
+ "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
+ "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
+ "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
+ "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
level, skb->len, headroom, skb_headlen(skb), tailroom,
has_mac ? skb->mac_header : -1,
has_mac ? skb_mac_header_len(skb) : -1,
+ skb->mac_len,
skb->network_header,
has_trans ? skb_network_header_len(skb) : -1,
has_trans ? skb->transport_header : -1,
sh->tx_flags, sh->nr_frags,
sh->gso_size, sh->gso_type, sh->gso_segs,
- skb->csum, skb->ip_summed, skb->csum_complete_sw,
- skb->csum_valid, skb->csum_level,
+ skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
+ skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
skb->hash, skb->sw_hash, skb->l4_hash,
- ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
+ ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
+ skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
+ skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
+ skb->inner_network_header, skb->inner_transport_header);
if (dev)
printk("%sdev name=%s feat=%pNF\n",
@@ -1375,6 +1333,14 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
struct page *p;
u8 *vaddr;
+ if (skb_frag_is_net_iov(frag)) {
+ printk("%sskb frag %d: not readable\n", level, i);
+ len -= skb_frag_size(frag);
+ if (!len)
+ break;
+ continue;
+ }
+
skb_frag_foreach_page(frag, skb_frag_off(frag),
skb_frag_size(frag), p, p_off, p_len,
copied) {
@@ -1444,7 +1410,7 @@ EXPORT_SYMBOL(consume_skb);
void __consume_stateless_skb(struct sk_buff *skb)
{
trace_consume_skb(skb, __builtin_return_address(0));
- skb_release_data(skb, SKB_CONSUMED, false);
+ skb_release_data(skb, SKB_CONSUMED);
kfree_skbmem(skb);
}
@@ -1456,6 +1422,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
if (!kasan_mempool_poison_object(skb))
return;
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
nc->skb_cache[nc->skb_count++] = skb;
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
@@ -1467,11 +1434,12 @@ static void napi_skb_cache_put(struct sk_buff *skb)
nc->skb_cache + NAPI_SKB_CACHE_HALF);
nc->skb_count = NAPI_SKB_CACHE_HALF;
}
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}
void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
{
- skb_release_all(skb, reason, true);
+ skb_release_all(skb, reason);
napi_skb_cache_put(skb);
}
@@ -1509,7 +1477,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- skb_release_all(skb, SKB_CONSUMED, !!budget);
+ skb_release_all(skb, SKB_CONSUMED);
napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
@@ -1640,7 +1608,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
*/
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
- skb_release_all(dst, SKB_CONSUMED, false);
+ skb_release_all(dst, SKB_CONSUMED);
return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
@@ -1688,7 +1656,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp)
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
-static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
+static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
+ bool devmem)
{
struct ubuf_info_msgzc *uarg;
struct sk_buff *skb;
@@ -1703,12 +1672,12 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
uarg = (void *)skb->cb;
uarg->mmp.user = NULL;
- if (mm_account_pinned_pages(&uarg->mmp, size)) {
+ if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) {
kfree_skb(skb);
return NULL;
}
- uarg->ubuf.callback = msg_zerocopy_callback;
+ uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;
uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
uarg->len = 1;
uarg->bytelen = size;
@@ -1726,7 +1695,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
}
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg, bool devmem)
{
if (uarg) {
struct ubuf_info_msgzc *uarg_zc;
@@ -1734,7 +1703,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
u32 bytelen, next;
/* there might be non MSG_ZEROCOPY users */
- if (uarg->callback != msg_zerocopy_callback)
+ if (uarg->ops != &msg_zerocopy_ubuf_ops)
return NULL;
/* realloc only when socket is locked (TCP, UDP cork),
@@ -1756,7 +1725,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
next = (u32)atomic_read(&sk->sk_zckey);
if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
- if (mm_account_pinned_pages(&uarg_zc->mmp, size))
+ if (likely(!devmem) &&
+ mm_account_pinned_pages(&uarg_zc->mmp, size))
return NULL;
uarg_zc->len++;
uarg_zc->bytelen = bytelen;
@@ -1771,7 +1741,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
}
new_alloc:
- return msg_zerocopy_alloc(sk, size);
+ return msg_zerocopy_alloc(sk, size, devmem);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
@@ -1845,8 +1815,8 @@ release:
sock_put(sk);
}
-void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
- bool success)
+static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
+ bool success)
{
struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
@@ -1855,7 +1825,6 @@ void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
if (refcount_dec_and_test(&uarg->refcnt))
__msg_zerocopy_callback(uarg_zc);
}
-EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
@@ -1865,24 +1834,39 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
uarg_to_msgzc(uarg)->len--;
if (have_uref)
- msg_zerocopy_callback(NULL, uarg, true);
+ msg_zerocopy_complete(NULL, uarg, true);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
+const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
+ .complete = msg_zerocopy_complete,
+};
+EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
+
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg,
+ struct net_devmem_dmabuf_binding *binding)
{
- struct ubuf_info *orig_uarg = skb_zcopy(skb);
int err, orig_len = skb->len;
- /* An skb can only point to one uarg. This edge case happens when
- * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
- */
- if (orig_uarg && uarg != orig_uarg)
- return -EEXIST;
+ if (uarg->ops->link_skb) {
+ err = uarg->ops->link_skb(skb, uarg);
+ if (err)
+ return err;
+ } else {
+ struct ubuf_info *orig_uarg = skb_zcopy(skb);
- err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
+ /* An skb can only point to one uarg. This edge case happens
+ * when TCP appends to an skb, but zerocopy_realloc triggered
+ * a new alloc.
+ */
+ if (orig_uarg && uarg != orig_uarg)
+ return -EEXIST;
+ }
+
+ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
+ binding);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
struct sock *save_sk = skb->sk;
@@ -1954,6 +1938,9 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
return -EINVAL;
+ if (!skb_frags_readable(skb))
+ return -EFAULT;
+
if (!num_frags)
goto release;
@@ -2123,11 +2110,20 @@ static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
- int headerlen = skb_headroom(skb);
- unsigned int size = skb_end_offset(skb) + skb->data_len;
- struct sk_buff *n = __alloc_skb(size, gfp_mask,
- skb_alloc_rx_flag(skb), NUMA_NO_NODE);
+ struct sk_buff *n;
+ unsigned int size;
+ int headerlen;
+
+ if (!skb_frags_readable(skb))
+ return NULL;
+ if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
+ return NULL;
+
+ headerlen = skb_headroom(skb);
+ size = skb_end_offset(skb) + skb->data_len;
+ n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
return NULL;
@@ -2272,9 +2268,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb, SKB_CONSUMED, false);
+ skb_release_data(skb, SKB_CONSUMED);
} else {
- skb_free_head(skb, false);
+ skb_free_head(skb);
}
off = (data + nhead) - skb->head;
@@ -2455,12 +2451,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
/*
* Allocate the copy buffer
*/
- struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
- gfp_mask, skb_alloc_rx_flag(skb),
- NUMA_NO_NODE);
- int oldheadroom = skb_headroom(skb);
int head_copy_len, head_copy_off;
+ struct sk_buff *n;
+ int oldheadroom;
+
+ if (!skb_frags_readable(skb))
+ return NULL;
+
+ if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
+ return NULL;
+ oldheadroom = skb_headroom(skb);
+ n = __alloc_skb(newheadroom + skb->len + newtailroom,
+ gfp_mask, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
if (!n)
return NULL;
@@ -2798,6 +2802,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
*/
int i, k, eat = (skb->tail + delta) - skb->end;
+ if (!skb_frags_readable(skb))
+ return NULL;
+
if (eat > 0 || skb_cloned(skb)) {
if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
GFP_ATOMIC))
@@ -2951,6 +2958,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
to += copy;
}
+ if (!skb_frags_readable(skb))
+ goto fault;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
@@ -3139,9 +3149,15 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
/*
* then map the fragments
*/
+ if (!skb_frags_readable(skb))
+ return false;
+
for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
+ if (WARN_ON_ONCE(!skb_frag_page(f)))
+ return false;
+
if (__splice_segment(skb_frag_page(f),
skb_frag_off(f), skb_frag_size(f),
offset, len, spd, false, sk, pipe))
@@ -3217,7 +3233,7 @@ static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
- int len, sendmsg_func sendmsg)
+ int len, sendmsg_func sendmsg, int flags)
{
unsigned int orig_len = len;
struct sk_buff *head = skb;
@@ -3235,7 +3251,7 @@ do_frag_list:
kv.iov_base = skb->data + offset;
kv.iov_len = slen;
memset(&msg, 0, sizeof(msg));
- msg.msg_flags = MSG_DONTWAIT;
+ msg.msg_flags = MSG_DONTWAIT | flags;
iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
@@ -3272,7 +3288,8 @@ do_frag_list:
while (slen) {
struct bio_vec bvec;
struct msghdr msg = {
- .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT,
+ .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT |
+ flags,
};
bvec_set_page(&bvec, skb_frag_page(frag), slen,
@@ -3318,14 +3335,21 @@ error:
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
int len)
{
- return __skb_send_sock(sk, skb, offset, len, sendmsg_locked);
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0);
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);
+int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb,
+ int offset, int len, int flags)
+{
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags);
+}
+EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags);
+
/* Send skb data on a socket. Socket must be unlocked. */
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
{
- return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked);
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0);
}
/**
@@ -3359,6 +3383,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
from += copy;
}
+ if (!skb_frags_readable(skb))
+ goto fault;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
int end;
@@ -3418,8 +3445,7 @@ fault:
EXPORT_SYMBOL(skb_store_bits);
/* Checksum skb data. */
-__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
- __wsum csum, const struct skb_checksum_ops *ops)
+__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
@@ -3430,14 +3456,16 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
if (copy > 0) {
if (copy > len)
copy = len;
- csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
- skb->data + offset, copy, csum);
+ csum = csum_partial(skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
pos = copy;
}
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
@@ -3458,13 +3486,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
- csum2 = INDIRECT_CALL_1(ops->update,
- csum_partial_ext,
- vaddr + p_off, p_len, 0);
+ csum2 = csum_partial(vaddr + p_off, p_len, 0);
kunmap_atomic(vaddr);
- csum = INDIRECT_CALL_1(ops->combine,
- csum_block_add_ext, csum,
- csum2, pos, p_len);
+ csum = csum_block_add(csum, csum2, pos);
pos += p_len;
}
@@ -3485,10 +3509,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
__wsum csum2;
if (copy > len)
copy = len;
- csum2 = __skb_checksum(frag_iter, offset - start,
- copy, 0, ops);
- csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
- csum, csum2, pos, copy);
+ csum2 = skb_checksum(frag_iter, offset - start, copy,
+ 0);
+ csum = csum_block_add(csum, csum2, pos);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -3500,18 +3523,6 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
return csum;
}
-EXPORT_SYMBOL(__skb_checksum);
-
-__wsum skb_checksum(const struct sk_buff *skb, int offset,
- int len, __wsum csum)
-{
- const struct skb_checksum_ops ops = {
- .update = csum_partial_ext,
- .combine = csum_block_add_ext,
- };
-
- return __skb_checksum(skb, offset, len, csum, &ops);
-}
EXPORT_SYMBOL(skb_checksum);
/* Both of above in one bottle. */
@@ -3538,6 +3549,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
pos = copy;
}
+ if (!skb_frags_readable(skb))
+ return 0;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
@@ -3601,6 +3615,78 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
+#ifdef CONFIG_NET_CRC32C
+u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = crc32c(crc, skb->data + offset, copy);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+ if (copy > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ copy = min(copy, len);
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ crc = crc32c(crc, vaddr + p_off, p_len);
+ kunmap_atomic(vaddr);
+ }
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ copy = end - offset;
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = skb_crc32c(frag_iter, offset - start, copy, crc);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+
+ return crc;
+}
+EXPORT_SYMBOL(skb_crc32c);
+#endif /* CONFIG_NET_CRC32C */
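
skb_crc32c() feeds the linear area, every readable page frag and the frag list into crc32c(). A minimal sketch of a caller checksumming an skb's transport payload; the ~0 seed and final inversion follow the common CRC32C convention (e.g. SCTP) and are an assumption about the caller, not part of this patch:

static u32 foo_payload_crc32c(const struct sk_buff *skb)
{
        int offset = skb_transport_offset(skb);

        /* Seed with ~0 and invert the result, per the usual convention. */
        return ~skb_crc32c(skb, offset, skb->len - offset, ~0U);
}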
+
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
__sum16 sum;
@@ -3660,32 +3746,6 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(__skb_checksum_complete);
-static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
-{
- net_warn_ratelimited(
- "%s: attempt to compute crc32c without libcrc32c.ko\n",
- __func__);
- return 0;
-}
-
-static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
- int offset, int len)
-{
- net_warn_ratelimited(
- "%s: attempt to compute crc32c without libcrc32c.ko\n",
- __func__);
- return 0;
-}
-
-static const struct skb_checksum_ops default_crc32c_ops = {
- .update = warn_crc32c_csum_update,
- .combine = warn_crc32c_csum_combine,
-};
-
-const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
- &default_crc32c_ops;
-EXPORT_SYMBOL(crc32c_csum_stub);
-
/**
* skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
* @from: source buffer
@@ -4029,6 +4089,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb,
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
+ skb1->unreadable = skb->unreadable;
skb_shinfo(skb)->nr_frags = 0;
skb1->data_len = skb->data_len;
skb1->len += skb1->data_len;
@@ -4076,6 +4137,8 @@ static inline void skb_split_no_header(struct sk_buff *skb,
pos += size;
}
skb_shinfo(skb1)->nr_frags = k;
+
+ skb1->unreadable = skb->unreadable;
}
/**
@@ -4139,6 +4202,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
if (skb_zcopy(tgt) || skb_zcopy(skb))
return 0;
+ DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle);
+ DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb));
+
todo = shiftlen;
from = 0;
to = skb_shinfo(tgt)->nr_frags;
@@ -4147,8 +4213,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
/* Actual merge is delayed until the point when we know we can
* commit all, so that we don't have to undo partial changes
*/
- if (!to ||
- !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
+ if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
skb_frag_off(fragfrom))) {
merge = -1;
} else {
@@ -4311,6 +4376,9 @@ next_skb:
return block_limit - abs_offset;
}
+ if (!skb_frags_readable(st->cur_skb))
+ return 0;
+
if (st->frag_idx == 0 && !st->frag_data)
st->stepped_offset += skb_headlen(st->cur_skb);
@@ -4387,6 +4455,41 @@ void skb_abort_seq_read(struct skb_seq_state *st)
}
EXPORT_SYMBOL(skb_abort_seq_read);
+/**
+ * skb_copy_seq_read() - copy from a skb_seq_state to a buffer
+ * @st: source skb_seq_state
+ * @offset: offset in source
+ * @to: destination buffer
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes starting at @offset in the source @st to the destination
+ * buffer @to. @offset should increase (or be unchanged) with each subsequent
+ * call to this function. If @offset needs to decrease from the previous call,
+ * @st should be reset first.
+ *
+ * Return: 0 on success or -EINVAL if the copy ended early
+ */
+int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len)
+{
+ const u8 *data;
+ u32 sqlen;
+
+ for (;;) {
+ sqlen = skb_seq_read(offset, &data, st);
+ if (sqlen == 0)
+ return -EINVAL;
+ if (sqlen >= len) {
+ memcpy(to, data, len);
+ return 0;
+ }
+ memcpy(to, data, sqlen);
+ to += sqlen;
+ offset += sqlen;
+ len -= sqlen;
+ }
+}
+EXPORT_SYMBOL(skb_copy_seq_read);
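
A hedged sketch of the intended call pattern: prepare a seq state over the region of interest, copy fixed-size pieces at non-decreasing offsets, then abort the read. The foo_hdr structure and helper are illustrative:

static int foo_read_hdr(struct sk_buff *skb, unsigned int off,
                        struct foo_hdr *hdr)
{
        struct skb_seq_state st;
        int err;

        skb_prepare_seq_read(skb, off, skb->len, &st);
        /* Works even when the header spans page frags or the frag list. */
        err = skb_copy_seq_read(&st, 0, hdr, sizeof(*hdr));
        skb_abort_seq_read(&st);

        return err;     /* 0 on success, -EINVAL if the skb was too short */
}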
+
#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
@@ -5139,7 +5242,7 @@ EXPORT_SYMBOL_GPL(skb_to_sgvec);
* 3. sg_unmark_end
* 4. skb_to_sgvec(payload2)
*
- * When mapping mutilple payload conditionally, skb_to_sgvec_nomark
+ * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
 * is preferable.
*/
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
@@ -5404,7 +5507,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
bool ret;
- if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly))
+ if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data)))
return true;
read_lock_bh(&sk->sk_callback_lock);
@@ -5437,6 +5540,54 @@ err:
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ int tstype)
+{
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP;
+ case SCM_TSTAMP_SND:
+ return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF :
+ SKBTX_SW_TSTAMP);
+ case SCM_TSTAMP_ACK:
+ return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK;
+ case SCM_TSTAMP_COMPLETION:
+ return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP;
+ }
+
+ return false;
+}
+
+static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk,
+ int tstype)
+{
+ int op;
+
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ op = BPF_SOCK_OPS_TSTAMP_SCHED_CB;
+ break;
+ case SCM_TSTAMP_SND:
+ if (hwtstamps) {
+ op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB;
+ *skb_hwtstamps(skb) = *hwtstamps;
+ } else {
+ op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB;
+ }
+ break;
+ case SCM_TSTAMP_ACK:
+ op = BPF_SOCK_OPS_TSTAMP_ACK_CB;
+ break;
+ default:
+ return;
+ }
+
+ bpf_skops_tx_timestamping(sk, skb, op);
+}
+
void __skb_tstamp_tx(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
@@ -5449,6 +5600,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!sk)
return;
+ if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF)
+ skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps,
+ sk, tstype);
+
+ if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype))
+ return;
+
tsflags = READ_ONCE(sk->sk_tsflags);
if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
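On the BPF side, these reports surface as new sock_ops callbacks. A minimal sketch, assuming the BPF_SOCK_OPS_TSTAMP_* values from this series are present in the headers in use; how a socket opts in (the SKBTX_BPF path) is not shown:

```c
// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("sockops")
int tx_tstamp_trace(struct bpf_sock_ops *skops)
{
	switch (skops->op) {
	case BPF_SOCK_OPS_TSTAMP_SCHED_CB:	/* skb handed to the qdisc/driver */
	case BPF_SOCK_OPS_TSTAMP_SND_SW_CB:	/* software transmit timestamp */
	case BPF_SOCK_OPS_TSTAMP_SND_HW_CB:	/* hardware transmit timestamp */
	case BPF_SOCK_OPS_TSTAMP_ACK_CB:	/* data fully acknowledged (TCP) */
		bpf_printk("tstamp op %d at %llu ns", skops->op,
			   bpf_ktime_get_ns());
		break;
	}
	return 1;
}

char _license[] SEC("license") = "GPL";
```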
@@ -5923,7 +6081,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (to->pp_recycle != from->pp_recycle)
return false;
- if (len <= skb_tailroom(to)) {
+ if (skb_frags_readable(from) != skb_frags_readable(to))
+ return false;
+
+ if (len <= skb_tailroom(to) && skb_frags_readable(from)) {
if (len)
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
*delta_truesize = 0;
@@ -5997,7 +6158,7 @@ EXPORT_SYMBOL(skb_try_coalesce);
* @skb: buffer to clean
* @xnet: packet is crossing netns
*
- * skb_scrub_packet can be used after encapsulating or decapsulting a packet
+ * skb_scrub_packet can be used after encapsulating or decapsulating a packet
* into/from a tunnel. Some information have to be cleared during these
* operations.
* skb_scrub_packet can also be used to clean a skb before injecting it in
@@ -6018,11 +6179,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->offload_fwd_mark = 0;
skb->offload_l3_fwd_mark = 0;
#endif
+ ipvs_reset(skb);
if (!xnet)
return;
- ipvs_reset(skb);
skb->mark = 0;
skb_clear_tstamp(skb);
}
@@ -6219,7 +6380,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
return err;
skb->protocol = skb->vlan_proto;
- skb->mac_len += VLAN_HLEN;
+ skb->network_header -= VLAN_HLEN;
skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
}
@@ -6575,12 +6736,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb, SKB_CONSUMED, false);
+ skb_release_data(skb, SKB_CONSUMED);
} else {
/* we can reuse the existing refcount - all we did was
* relocate values
*/
- skb_free_head(skb, false);
+ skb_free_head(skb);
}
skb->head = data;
@@ -6715,7 +6876,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb_kfree_head(data, size);
return -ENOMEM;
}
- skb_release_data(skb, SKB_CONSUMED, false);
+ skb_release_data(skb, SKB_CONSUMED);
skb->head = data;
skb->head_frag = 0;
@@ -6779,7 +6940,7 @@ void skb_condense(struct sk_buff *skb)
{
if (skb->data_len) {
if (skb->data_len > skb->end - skb->tail ||
- skb_cloned(skb))
+ skb_cloned(skb) || !skb_frags_readable(skb))
return;
/* Nice, we can free page frag(s) right now */
@@ -6995,6 +7156,19 @@ free_now:
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */
+static void kfree_skb_napi_cache(struct sk_buff *skb)
+{
+ /* if SKB is a clone, don't handle this case */
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ local_bh_disable();
+ __napi_kfree_skb(skb, SKB_CONSUMED);
+ local_bh_enable();
+}
+
/**
* skb_attempt_defer_free - queue skb for remote freeing
* @skb: buffer
@@ -7010,10 +7184,10 @@ void skb_attempt_defer_free(struct sk_buff *skb)
unsigned int defer_max;
bool kick;
- if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
- !cpu_online(cpu) ||
- cpu == raw_smp_processor_id()) {
-nodefer: __kfree_skb(skb);
+ if (cpu == raw_smp_processor_id() ||
+ WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
+ !cpu_online(cpu)) {
+nodefer: kfree_skb_napi_cache(skb);
return;
}
@@ -7021,7 +7195,7 @@ nodefer: __kfree_skb(skb);
DEBUG_NET_WARN_ON_ONCE(skb->destructor);
sd = &per_cpu(softnet_data, cpu);
- defer_max = READ_ONCE(sysctl_skb_defer_max);
+ defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
if (READ_ONCE(sd->defer_count) >= defer_max)
goto nodefer;
@@ -7039,8 +7213,8 @@ nodefer: __kfree_skb(skb);
/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
* if we are unlucky enough (this seems very unlikely).
*/
- if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
- smp_call_function_single_async(cpu, &sd->defer_csd);
+ if (unlikely(kick))
+ kick_defer_list_purge(sd, cpu);
}
static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
@@ -7073,7 +7247,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
ssize_t maxsize, gfp_t gfp)
{
- size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
+ size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
struct page *pages[8], **ppages = pages;
ssize_t spliced = 0, ret = 0;
unsigned int i;
@@ -7169,3 +7343,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
return false;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);
+
+void get_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_get_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+ get_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(get_netmem);
+
+void put_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_put_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+
+ put_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(put_netmem);
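The point of these helpers is that a frag may now be backed by a net_iov rather than a page, so bare get_page()/put_page() is no longer safe for callers that hold frag references. A sketch, assuming skb_frag_netmem() is available in the tree this applies to:

```c
/* Hypothetical user: pin frag i of an skb that may carry devmem buffers. */
static void example_hold_frag(struct sk_buff *skb, int i)
{
	netmem_ref netmem = skb_frag_netmem(&skb_shinfo(skb)->frags[i]);

	get_netmem(netmem);
	/* ... hand the frag to some asynchronous consumer ... */
	put_netmem(netmem);
}
```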
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 4d75ef9d24bf..34c51eb1a14f 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -293,7 +293,7 @@ out:
/* If we trim data a full sg elem before curr pointer update
* copybreak and current so that any future copy operations
* start at new copy location.
- * However trimed data that has not yet been used in a copy op
+ * However trimmed data that has not yet been used in a copy op
* does not require an update.
*/
if (!msg->sg.size) {
@@ -369,8 +369,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
struct sk_msg *msg, u32 bytes)
{
int ret = -ENOSPC, i = msg->sg.curr;
+ u32 copy, buf_size, copied = 0;
struct scatterlist *sge;
- u32 copy, buf_size;
void *to;
do {
@@ -397,6 +397,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
goto out;
}
bytes -= copy;
+ copied += copy;
if (!bytes)
break;
msg->sg.copybreak = 0;
@@ -404,7 +405,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
} while (i != msg->sg.end);
out:
msg->sg.curr = i;
- return ret;
+ return (ret < 0) ? ret : copied;
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
@@ -434,7 +435,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
page = sg_page(sge);
if (copied + copy > len)
copy = len - copied;
- copy = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (copy)
+ copy = copy_page_to_iter(page, sge->offset, copy, iter);
if (!copy) {
copied = copied ? copied : -EFAULT;
goto out;
@@ -444,8 +446,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
if (likely(!peek)) {
sge->offset += copy;
sge->length -= copy;
- if (!msg_rx->skb)
+ if (!msg_rx->skb) {
sk_mem_uncharge(sk, copy);
+ atomic_sub(copy, &sk->sk_rmem_alloc);
+ }
msg_rx->sg.size -= copy;
if (!sge->length) {
@@ -526,16 +530,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
u32 off, u32 len,
struct sk_psock *psock,
struct sock *sk,
- struct sk_msg *msg)
+ struct sk_msg *msg,
+ bool take_ref)
{
int num_sge, copied;
+ /* skb_to_sgvec will fail when the total number of fragments in
+ * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the
+ * caller may aggregate multiple skbs.
+ */
num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
if (num_sge < 0) {
/* skb linearize may fail with ENOMEM, but lets simply try again
* later if this happens. Under memory pressure we don't want to
* drop the skb. We need to linearize the skb so that the mapping
* in skb_to_sgvec can not error.
+ * Note that skb_linearize requires the skb not to be shared.
*/
if (skb_linearize(skb))
return -EAGAIN;
@@ -545,11 +555,14 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
return num_sge;
}
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+ psock->ingress_bytes += len;
+#endif
copied = len;
msg->sg.start = 0;
msg->sg.size = copied;
msg->sg.end = num_sge;
- msg->skb = skb;
+ msg->skb = take_ref ? skb_get(skb) : skb;
sk_psock_queue_msg(psock, msg);
sk_psock_data_ready(sk, psock);
@@ -557,7 +570,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
}
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
- u32 off, u32 len);
+ u32 off, u32 len, bool take_ref);
static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len)
@@ -571,7 +584,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
* correctly.
*/
if (unlikely(skb->sk == sk))
- return sk_psock_skb_ingress_self(psock, skb, off, len);
+ return sk_psock_skb_ingress_self(psock, skb, off, len, true);
msg = sk_psock_create_ingress_msg(sk, skb);
if (!msg)
return -EAGAIN;
@@ -583,7 +596,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
* into user buffers.
*/
skb_set_owner_r(skb, sk);
- err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
+ err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true);
if (err < 0)
kfree(msg);
return err;
@@ -594,7 +607,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
* because the skb is already accounted for here.
*/
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
- u32 off, u32 len)
+ u32 off, u32 len, bool take_ref)
{
struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
struct sock *sk = psock->sk;
@@ -603,7 +616,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
if (unlikely(!msg))
return -EAGAIN;
skb_set_owner_r(skb, sk);
- err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
+ err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref);
if (err < 0)
kfree(msg);
return err;
@@ -612,18 +625,13 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len, bool ingress)
{
- int err = 0;
-
if (!ingress) {
if (!sock_writeable(psock->sk))
return -EAGAIN;
return skb_send_sock(psock->sk, skb, off, len);
}
- skb_get(skb);
- err = sk_psock_skb_ingress(psock, skb, off, len);
- if (err < 0)
- kfree_skb(skb);
- return err;
+
+ return sk_psock_skb_ingress(psock, skb, off, len);
}
static void sk_psock_skb_state(struct sk_psock *psock,
@@ -648,12 +656,14 @@ static void sk_psock_backlog(struct work_struct *work)
bool ingress;
int ret;
+ /* Increment the psock refcnt to synchronize with the close(fd) path in
+ * sock_map_close(), ensuring we wait for the backlog thread to complete
+ * before sk_socket is freed. If the refcnt increment fails, it means
+ * sock_map_close() has completed and sk_socket may already be freed.
+ */
+ if (!sk_psock_get(psock->sk))
+ return;
mutex_lock(&psock->work_mutex);
- if (unlikely(state->len)) {
- len = state->len;
- off = state->off;
- }
-
while ((skb = skb_peek(&psock->ingress_skb))) {
len = skb->len;
off = 0;
@@ -663,6 +673,13 @@ static void sk_psock_backlog(struct work_struct *work)
off = stm->offset;
len = stm->full_len;
}
+
+ /* Resume processing from previous partial state */
+ if (unlikely(state->len)) {
+ len = state->len;
+ off = state->off;
+ }
+
ingress = skb_bpf_ingress(skb);
skb_bpf_redirect_clear(skb);
do {
@@ -673,7 +690,8 @@ static void sk_psock_backlog(struct work_struct *work)
if (ret <= 0) {
if (ret == -EAGAIN) {
sk_psock_skb_state(psock, state, len, off);
-
+ /* Restore redir info we cleared before */
+ skb_bpf_set_redir(skb, psock->sk, ingress);
/* Delay slightly to prioritize any
* other work that might be here.
*/
@@ -690,11 +708,14 @@ static void sk_psock_backlog(struct work_struct *work)
len -= ret;
} while (len);
+ /* The entire skb sent, clear state */
+ sk_psock_skb_state(psock, state, 0, 0);
skb = skb_dequeue(&psock->ingress_skb);
kfree_skb(skb);
}
end:
mutex_unlock(&psock->work_mutex);
+ sk_psock_put(psock->sk, psock);
}
struct sk_psock *sk_psock_init(struct sock *sk, int node)
@@ -771,6 +792,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
list_del(&msg->list);
+ if (!msg->skb)
+ atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
sk_msg_free(psock->sk, msg);
kfree(msg);
}
@@ -1005,7 +1028,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
off = stm->offset;
len = stm->full_len;
}
- err = sk_psock_skb_ingress_self(psock, skb, off, len);
+ err = sk_psock_skb_ingress_self(psock, skb, off, len, false);
}
if (err < 0) {
spin_lock_bh(&psock->ingress_lock);
@@ -1116,9 +1139,9 @@ static void sk_psock_strp_data_ready(struct sock *sk)
if (tls_sw_has_ctx_rx(sk)) {
psock->saved_data_ready(sk);
} else {
- write_lock_bh(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
strp_data_ready(&psock->strp);
- write_unlock_bh(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
}
rcu_read_unlock();
@@ -1138,6 +1161,10 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
if (!ret)
sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED);
+ if (sk_is_tcp(sk)) {
+ psock->strp.cb.read_sock = tcp_bpf_strp_read_sock;
+ psock->copied_seq = tcp_sk(sk)->copied_seq;
+ }
return ret;
}
@@ -1226,11 +1253,8 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
rcu_read_lock();
psock = sk_psock(sk);
- if (psock) {
- read_lock_bh(&sk->sk_callback_lock);
+ if (psock)
sk_psock_data_ready(sk, psock);
- read_unlock_bh(&sk->sk_callback_lock);
- }
rcu_read_unlock();
}
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 43bf3818c19e..3b409bc8ef6d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -85,7 +85,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
@@ -124,9 +124,11 @@
#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
+#include <net/proto_memory.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
@@ -146,6 +148,8 @@
#include <linux/ethtool.h>
+#include <uapi/linux/pidfd.h>
+
#include "dev.h"
static DEFINE_MUTEX(proto_list_mutex);
@@ -283,9 +287,6 @@ __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
-int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE;
-
-int sysctl_tstamp_allow_data __read_mostly = 1;
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
@@ -455,6 +456,13 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
return 0;
}
+static bool sk_set_prio_allowed(const struct sock *sk, int val)
+{
+ return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
+}
+
static bool sock_needs_netstamp(const struct sock *sk)
{
switch (sk->sk_family) {
@@ -482,7 +490,7 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
- if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
+ if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
atomic_inc(&sk->sk_drops);
trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
@@ -552,7 +560,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
skb->dev = NULL;
- if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
+ if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
atomic_inc(&sk->sk_drops);
goto discard_and_relse;
}
@@ -821,14 +829,11 @@ EXPORT_SYMBOL(sock_set_sndtimeo);
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
+ sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
+ sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
if (val) {
sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
- sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
- sock_set_flag(sk, SOCK_RCVTSTAMP);
sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- } else {
- sock_reset_flag(sk, SOCK_RCVTSTAMP);
- sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
}
}
@@ -935,6 +940,7 @@ int sock_set_timestamping(struct sock *sk, int optname,
WRITE_ONCE(sk->sk_tsflags, val);
sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
@@ -945,6 +951,20 @@ int sock_set_timestamping(struct sock *sk, int optname,
return 0;
}
+#if defined(CONFIG_CGROUP_BPF)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+{
+ struct bpf_sock_ops_kern sock_ops;
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = op;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ bpf_skops_init_skb(&sock_ops, skb, 0);
+ __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
+}
+#endif
+
void sock_set_keepalive(struct sock *sk)
{
lock_sock(sk);
@@ -1049,6 +1069,75 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
return 0;
}
+#ifdef CONFIG_PAGE_POOL
+
+/* This is the maximum number of tokens and frags that the user can pass to
+ * SO_DEVMEM_DONTNEED in one syscall. The limit bounds the amount of memory
+ * the kernel allocates to copy these tokens and prevents looping over the
+ * frags for too long.
+ */
+#define MAX_DONTNEED_TOKENS 128
+#define MAX_DONTNEED_FRAGS 1024
+
+static noinline_for_stack int
+sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+ unsigned int num_tokens, i, j, k, netmem_num = 0;
+ struct dmabuf_token *tokens;
+ int ret = 0, num_frags = 0;
+ netmem_ref netmems[16];
+
+ if (!sk_is_tcp(sk))
+ return -EBADF;
+
+ if (optlen % sizeof(*tokens) ||
+ optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
+ return -EINVAL;
+
+ num_tokens = optlen / sizeof(*tokens);
+ tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
+ if (!tokens)
+ return -ENOMEM;
+
+ if (copy_from_sockptr(tokens, optval, optlen)) {
+ kvfree(tokens);
+ return -EFAULT;
+ }
+
+ xa_lock_bh(&sk->sk_user_frags);
+ for (i = 0; i < num_tokens; i++) {
+ for (j = 0; j < tokens[i].token_count; j++) {
+ if (++num_frags > MAX_DONTNEED_FRAGS)
+ goto frag_limit_reached;
+
+ netmem_ref netmem = (__force netmem_ref)__xa_erase(
+ &sk->sk_user_frags, tokens[i].token_start + j);
+
+ if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+ continue;
+
+ netmems[netmem_num++] = netmem;
+ if (netmem_num == ARRAY_SIZE(netmems)) {
+ xa_unlock_bh(&sk->sk_user_frags);
+ for (k = 0; k < netmem_num; k++)
+ WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
+ netmem_num = 0;
+ xa_lock_bh(&sk->sk_user_frags);
+ }
+ ret++;
+ }
+ }
+
+frag_limit_reached:
+ xa_unlock_bh(&sk->sk_user_frags);
+ for (k = 0; k < netmem_num; k++)
+ WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
+
+ kvfree(tokens);
+ return ret;
+}
+#endif
+
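Userspace returns devmem fragments with a plain setsockopt() carrying an array of dmabuf_token. A sketch under the limits enforced above; it assumes the UAPI headers in use define SO_DEVMEM_DONTNEED and struct dmabuf_token:

```c
#include <sys/socket.h>
#include <linux/uio.h>	/* struct dmabuf_token */

static int devmem_dontneed(int fd, const struct dmabuf_token *tokens,
			   unsigned int ntokens)
{
	if (ntokens > 128)	/* MAX_DONTNEED_TOKENS above */
		return -1;

	/* Per the handler above, a non-negative return is the number of
	 * fragments actually released.
	 */
	return setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED,
			  tokens, ntokens * sizeof(*tokens));
}
```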
void sockopt_lock_sock(struct sock *sk)
{
/* When current->bpf_ctx is set, the setsockopt is called from
@@ -1083,6 +1172,17 @@ bool sockopt_capable(int cap)
}
EXPORT_SYMBOL(sockopt_capable);
+static int sockopt_validate_clockid(__kernel_clockid_t value)
+{
+ switch (value) {
+ case CLOCK_REALTIME:
+ case CLOCK_MONOTONIC:
+ case CLOCK_TAI:
+ return 0;
+ }
+ return -EINVAL;
+}
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
@@ -1117,22 +1217,11 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
/* handle options which do not require locking the socket. */
switch (optname) {
case SO_PRIORITY:
- if ((val >= 0 && val <= 6) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (sk_set_prio_allowed(sk, val)) {
sock_set_priority(sk, val);
return 0;
}
return -EPERM;
- case SO_PASSSEC:
- assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
- return 0;
- case SO_PASSCRED:
- assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
- return 0;
- case SO_PASSPIDFD:
- assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
- return 0;
case SO_TYPE:
case SO_PROTOCOL:
case SO_DOMAIN:
@@ -1180,6 +1269,8 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
return 0;
}
case SO_TXREHASH:
+ if (!sk_is_tcp(sk))
+ return -EOPNOTSUPP;
if (val < -1 || val > 1)
return -EINVAL;
if ((u8)val == SOCK_TXREHASH_DEFAULT)
@@ -1200,6 +1291,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
ret = -EOPNOTSUPP;
return ret;
}
+#ifdef CONFIG_PAGE_POOL
+ case SO_DEVMEM_DONTNEED:
+ return sock_devmem_dontneed(sk, optval, optlen);
+#endif
}
sockopt_lock_sock(sk);
@@ -1215,7 +1310,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
break;
case SO_REUSEPORT:
- sk->sk_reuseport = valbool;
+ if (valbool && !sk_is_inet(sk))
+ ret = -EOPNOTSUPP;
+ else
+ sk->sk_reuseport = valbool;
break;
case SO_DONTROUTE:
sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
@@ -1434,6 +1532,10 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
break;
+ case SO_RCVPRIORITY:
+ sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
+ break;
+
case SO_RXQ_OVFL:
sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
break;
@@ -1450,6 +1552,33 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
break;
+ case SO_PASSCRED:
+ if (sk_may_scm_recv(sk))
+ sk->sk_scm_credentials = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSSEC:
+ if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
+ sk->sk_scm_security = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSPIDFD:
+ if (sk_is_unix(sk))
+ sk->sk_scm_pidfd = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSRIGHTS:
+ if (sk_is_unix(sk))
+ sk->sk_scm_rights = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
case SO_INCOMING_CPU:
reuseport_update_incoming_cpu(sk, val);
@@ -1497,6 +1626,11 @@ set_sndbuf:
ret = -EPERM;
break;
}
+
+ ret = sockopt_validate_clockid(sk_txtime.clockid);
+ if (ret)
+ break;
+
sock_valbool_flag(sk, SOCK_TXTIME, true);
sk->sk_clockid = sk_txtime.clockid;
sk->sk_txtime_deadline_mode =
@@ -1741,11 +1875,24 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
break;
case SO_PASSCRED:
- v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
+ if (!sk_may_scm_recv(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_credentials;
break;
case SO_PASSPIDFD:
- v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
+ if (!sk_is_unix(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_pidfd;
+ break;
+
+ case SO_PASSRIGHTS:
+ if (!sk_is_unix(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_rights;
break;
case SO_PEERCRED:
@@ -1767,6 +1914,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
{
struct pid *peer_pid;
struct file *pidfd_file = NULL;
+ unsigned int flags = 0;
int pidfd;
if (len > sizeof(pidfd))
@@ -1779,7 +1927,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
if (!peer_pid)
return -ENODATA;
- pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
+ /* The use of PIDFD_STALE requires stashing of struct pid
+ * on pidfs with pidfs_register_pid(), and only AF_UNIX
+ * sockets have been prepared for this.
+ */
+ if (sk->sk_family == AF_UNIX)
+ flags = PIDFD_STALE;
+
+ pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
put_pid(peer_pid);
if (pidfd < 0)
return pidfd;
@@ -1842,7 +1997,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
break;
case SO_PASSSEC:
- v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
+ if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_security;
break;
case SO_PEERSEC:
@@ -1857,6 +2015,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
v.val = sock_flag(sk, SOCK_RCVMARK);
break;
+ case SO_RCVPRIORITY:
+ v.val = sock_flag(sk, SOCK_RCVPRIORITY);
+ break;
+
case SO_RXQ_OVFL:
v.val = sock_flag(sk, SOCK_RXQ_OVFL);
break;
@@ -1940,7 +2102,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
v.val = READ_ONCE(sk->sk_napi_id);
/* aggregate non-NAPI IDs down to 0 */
- if (v.val < MIN_NAPI_ID)
+ if (!napi_id_valid(v.val))
v.val = 0;
break;
@@ -1986,6 +2148,9 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
break;
case SO_TXREHASH:
+ if (!sk_is_tcp(sk))
+ return -EOPNOTSUPP;
+
/* Paired with WRITE_ONCE() in sk_setsockopt() */
v.val = READ_ONCE(sk->sk_txrehash);
break;
@@ -2014,6 +2179,8 @@ lenout:
*/
static inline void sock_lock_init(struct sock *sk)
{
+ sk_owner_clear(sk);
+
if (sk->sk_kern_sock)
sock_lock_init_class_and_name(
sk,
@@ -2032,7 +2199,7 @@ static inline void sock_lock_init(struct sock *sk)
/*
* Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
- * even temporarly, because of RCU lookups. sk_node should also be left as is.
+ * even temporarily, because of RCU lookups. sk_node should also be left as is.
* We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
*/
static void sock_copy(struct sock *nsk, const struct sock *osk)
@@ -2110,6 +2277,9 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
cgroup_sk_free(&sk->sk_cgrp_data);
mem_cgroup_sk_free(sk);
security_sk_free(sk);
+
+ sk_owner_put(sk);
+
if (slab != NULL)
kmem_cache_free(slab, sk);
else
@@ -2145,6 +2315,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
get_net_track(net, &sk->ns_tracker, priority);
sock_inuse_add(net, 1);
} else {
+ net_passive_inc(net);
__netns_tracker_alloc(net, &sk->ns_tracker,
false, priority);
}
@@ -2169,6 +2340,7 @@ EXPORT_SYMBOL(sk_alloc);
static void __sk_destruct(struct rcu_head *head)
{
struct sock *sk = container_of(head, struct sock, sk_rcu);
+ struct net *net = sock_net(sk);
struct sk_filter *filter;
if (sk->sk_destruct)
@@ -2200,14 +2372,28 @@ static void __sk_destruct(struct rcu_head *head)
put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
- if (likely(sk->sk_net_refcnt))
- put_net_track(sock_net(sk), &sk->ns_tracker);
- else
- __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
-
+ if (likely(sk->sk_net_refcnt)) {
+ put_net_track(net, &sk->ns_tracker);
+ } else {
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ net_passive_dec(net);
+ }
sk_prot_free(sk->sk_prot_creator, sk);
}
+void sk_net_refcnt_upgrade(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+
+ WARN_ON_ONCE(sk->sk_net_refcnt);
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ net_passive_dec(net);
+ sk->sk_net_refcnt = 1;
+ get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
+}
+EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
+
void sk_destruct(struct sock *sk)
{
bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
@@ -2262,7 +2448,12 @@ static void sk_init_common(struct sock *sk)
lockdep_set_class_and_name(&sk->sk_error_queue.lock,
af_elock_keys + sk->sk_family,
af_family_elock_key_strings[sk->sk_family]);
- lockdep_set_class_and_name(&sk->sk_callback_lock,
+ if (sk->sk_kern_sock)
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
+ af_kern_callback_keys + sk->sk_family,
+ af_family_kern_clock_key_strings[sk->sk_family]);
+ else
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
}
@@ -2299,6 +2490,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
* is not properly dismantling its kernel sockets at netns
* destroy time.
*/
+ net_passive_inc(sock_net(newsk));
__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
false, priority);
}
@@ -2351,17 +2543,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
*/
if (!is_charged)
RCU_INIT_POINTER(newsk->sk_filter, NULL);
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
+
+ goto free;
}
+
RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
- if (bpf_sk_storage_clone(sk, newsk)) {
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
- }
+ if (bpf_sk_storage_clone(sk, newsk))
+ goto free;
/* Clear sk_user_data if parent had the pointer tagged
* as not suitable for copying when cloning.
@@ -2391,18 +2580,17 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
net_enable_timestamp();
out:
return newsk;
-}
-EXPORT_SYMBOL_GPL(sk_clone_lock);
-
-void sk_free_unlock_clone(struct sock *sk)
-{
+free:
/* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- sk->sk_destruct = NULL;
- bh_unlock_sock(sk);
- sk_free(sk);
+ * destructor and make plain sk_free()
+ */
+ newsk->sk_destruct = NULL;
+ bh_unlock_sock(newsk);
+ sk_free(newsk);
+ newsk = NULL;
+ goto out;
}
-EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
+EXPORT_SYMBOL_GPL(sk_clone_lock);
static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
{
@@ -2427,8 +2615,12 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
u32 max_segs = 1;
sk->sk_route_caps = dst->dev->features;
- if (sk_is_tcp(sk))
+ if (sk_is_tcp(sk)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
sk->sk_route_caps |= NETIF_F_GSO;
+ icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
+ }
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
if (unlikely(sk->sk_gso_disabled))
@@ -2505,19 +2697,16 @@ void __sock_wfree(struct sk_buff *skb)
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
skb_orphan(skb);
- skb->sk = sk;
#ifdef CONFIG_INET
- if (unlikely(!sk_fullsock(sk))) {
- skb->destructor = sock_edemux;
- sock_hold(sk);
- return;
- }
+ if (unlikely(!sk_fullsock(sk)))
+ return skb_set_owner_edemux(skb, sk);
#endif
+ skb->sk = sk;
skb->destructor = sock_wfree;
skb_set_hash_from_sk(skb, sk);
/*
* We used to take a refcount on sk, but following operation
- * is enough to guarantee sk_free() wont free this sock until
+ * is enough to guarantee sk_free() won't free this sock until
* all in-flight packets are completed
*/
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
@@ -2526,13 +2715,12 @@ EXPORT_SYMBOL(skb_set_owner_w);
static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
-#ifdef CONFIG_TLS_DEVICE
/* Drivers depend on in-order delivery for crypto offload,
* partial orphan breaks out-of-order-OK logic.
*/
- if (skb->decrypted)
+ if (skb_is_decrypted(skb))
return false;
-#endif
+
return (skb->destructor == sock_wfree ||
(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}
@@ -2702,6 +2890,22 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
}
EXPORT_SYMBOL(sock_kmalloc);
+/*
+ * Duplicate the input "src" memory block using the socket's
+ * option memory buffer.
+ */
+void *sock_kmemdup(struct sock *sk, const void *src,
+ int size, gfp_t priority)
+{
+ void *mem;
+
+ mem = sock_kmalloc(sk, size, priority);
+ if (mem)
+ memcpy(mem, src, size);
+ return mem;
+}
+EXPORT_SYMBOL(sock_kmemdup);
+
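A hypothetical in-kernel caller, replacing the usual sock_kmalloc()+memcpy() pair; the surrounding names are illustrative:

```c
static int example_store_opt(struct sock *sk, const void *src, int len)
{
	void *copy;

	copy = sock_kmemdup(sk, src, len, GFP_KERNEL);
	if (!copy)
		return -ENOMEM;

	/* ... publish 'copy'; it is later released with sock_kfree_s() ... */
	sock_kfree_s(sk, copy, len);
	return 0;
}
```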
/* Free an option memory block. Note, we actually want the inline
* here as this allows gcc to detect the nullify and fold away the
* condition entirely.
@@ -2811,6 +3015,8 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
{
u32 tsflags;
+ BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
+
switch (cmsg->cmsg_type) {
case SO_MARK:
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
@@ -2839,10 +3045,33 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
return -EINVAL;
sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
break;
+ case SCM_TS_OPT_ID:
+ if (sk_is_tcp(sk))
+ return -EINVAL;
+ tsflags = READ_ONCE(sk->sk_tsflags);
+ if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
+ return -EINVAL;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
+ sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
+ break;
/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
case SCM_RIGHTS:
case SCM_CREDENTIALS:
break;
+ case SO_PRIORITY:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
+ return -EPERM;
+ sockc->priority = *(u32 *)CMSG_DATA(cmsg);
+ break;
+ case SCM_DEVMEM_DMABUF:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
+ break;
default:
return -EINVAL;
}
@@ -3055,16 +3284,16 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
struct proto *prot = sk->sk_prot;
- bool charged = false;
+ bool charged = true;
long allocated;
sk_memory_allocated_add(sk, amt);
allocated = sk_memory_allocated(sk);
if (memcg) {
- if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
+ charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge());
+ if (!charged)
goto suppress_allocation;
- charged = true;
}
/* Under limit. */
@@ -3149,7 +3378,7 @@ suppress_allocation:
sk_memory_allocated_sub(sk, amt);
- if (charged)
+ if (memcg && charged)
mem_cgroup_uncharge_skmem(memcg, amt);
return 0;
@@ -3242,8 +3471,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
}
EXPORT_SYMBOL(sock_no_socketpair);
-int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
- bool kern)
+int sock_no_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
return -EOPNOTSUPP;
}
@@ -3338,7 +3567,7 @@ static void sock_def_error_report(struct sock *sk)
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_poll(&wq->wait, EPOLLERR);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+ sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
rcu_read_unlock();
}
@@ -3353,7 +3582,7 @@ void sock_def_readable(struct sock *sk)
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
EPOLLRDNORM | EPOLLRDBAND);
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
@@ -3373,7 +3602,7 @@ static void sock_def_write_space(struct sock *sk)
EPOLLWRNORM | EPOLLWRBAND);
/* Should agree with poll, otherwise some programs break */
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
rcu_read_unlock();
@@ -3398,7 +3627,7 @@ static void sock_def_write_space_wfree(struct sock *sk)
EPOLLWRNORM | EPOLLWRBAND);
/* Should agree with poll, otherwise some programs break */
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
}
@@ -3409,7 +3638,7 @@ static void sock_def_destruct(struct sock *sk)
void sk_send_sigurg(struct sock *sk)
{
if (sk->sk_socket && sk->sk_socket->file)
- if (send_sigurg(&sk->sk_socket->file->f_owner))
+ if (send_sigurg(sk->sk_socket->file))
sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);
@@ -3424,14 +3653,14 @@ EXPORT_SYMBOL(sk_reset_timer);
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
- if (del_timer(timer))
+ if (timer_delete(timer))
__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
- if (del_timer_sync(timer))
+ if (timer_delete_sync(timer))
__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
@@ -3461,18 +3690,6 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
}
sk->sk_uid = uid;
- rwlock_init(&sk->sk_callback_lock);
- if (sk->sk_kern_sock)
- lockdep_set_class_and_name(
- &sk->sk_callback_lock,
- af_kern_callback_keys + sk->sk_family,
- af_family_kern_clock_key_strings[sk->sk_family]);
- else
- lockdep_set_class_and_name(
- &sk->sk_callback_lock,
- af_callback_keys + sk->sk_family,
- af_family_clock_key_strings[sk->sk_family]);
-
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
@@ -3689,7 +3906,7 @@ EXPORT_SYMBOL(sock_recv_errqueue);
*
* FIX: POSIX 1003.1g is very ambiguous here. It states that
* asynchronous errors should be reported by getsockopt. We assume
- * this means if you specify SO_ERROR (otherwise whats the point of it).
+ * this means if you specify SO_ERROR (otherwise what is the point of it).
*/
int sock_common_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
@@ -3771,7 +3988,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
- mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
+ mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
@@ -3837,7 +4054,7 @@ static int assign_proto_idx(struct proto *prot)
{
prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
- if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
+ if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
pr_err("PROTO_INUSE_NR exhausted\n");
return -ENOSPC;
}
@@ -3848,7 +4065,7 @@ static int assign_proto_idx(struct proto *prot)
static void release_proto_idx(struct proto *prot)
{
- if (prot->inuse_idx != PROTO_INUSE_NR - 1)
+ if (prot->inuse_idx != PROTO_INUSE_NR)
clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 654122838025..b23594c767f2 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -18,7 +18,7 @@
static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX];
-static struct sock_diag_inet_compat __rcu *inet_rcv_compat;
+static const struct sock_diag_inet_compat __rcu *inet_rcv_compat;
static struct workqueue_struct *broadcast_wq;
@@ -187,8 +187,7 @@ void sock_diag_broadcast_destroy(struct sock *sk)
void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr)
{
- xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat,
- ptr);
+ xchg(&inet_rcv_compat, RCU_INITIALIZER(ptr));
}
EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
@@ -196,8 +195,7 @@ void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr)
{
const struct sock_diag_inet_compat *old;
- old = xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat,
- NULL);
+ old = unrcu_pointer(xchg(&inet_rcv_compat, NULL));
WARN_ON_ONCE(old != ptr);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
@@ -266,8 +264,6 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
switch (nlh->nlmsg_type) {
case TCPDIAG_GETSOCK:
- case DCCPDIAG_GETSOCK:
-
if (!rcu_access_pointer(inet_rcv_compat))
sock_load_diag_module(AF_INET, 0);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 27d733c0f65e..82a14f131d00 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -24,8 +24,16 @@ struct bpf_stab {
#define SOCK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+/* This mutex is used to
+ * - protect against races between prog/link attach/detach and link prog update, and
+ * - protect against races between releasing and accessing the map in a bpf_link.
+ * A single global mutex is used since contention is expected to be low.
+ */
+static DEFINE_MUTEX(sockmap_mutex);
+
static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
- struct bpf_prog *old, u32 which);
+ struct bpf_prog *old, struct bpf_link *link,
+ u32 which);
static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
@@ -59,55 +67,50 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
{
- u32 ufd = attr->target_fd;
struct bpf_map *map;
- struct fd f;
int ret;
if (attr->attach_flags || attr->replace_bpf_fd)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->target_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);
- fdput(f);
+ mutex_lock(&sockmap_mutex);
+ ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type);
+ mutex_unlock(&sockmap_mutex);
return ret;
}
int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
{
- u32 ufd = attr->target_fd;
struct bpf_prog *prog;
struct bpf_map *map;
- struct fd f;
int ret;
if (attr->attach_flags || attr->replace_bpf_fd)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->target_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
prog = bpf_prog_get(attr->attach_bpf_fd);
- if (IS_ERR(prog)) {
- ret = PTR_ERR(prog);
- goto put_map;
- }
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
if (prog->type != ptype) {
ret = -EINVAL;
goto put_prog;
}
- ret = sock_map_prog_update(map, NULL, prog, attr->attach_type);
+ mutex_lock(&sockmap_mutex);
+ ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type);
+ mutex_unlock(&sockmap_mutex);
put_prog:
bpf_prog_put(prog);
-put_map:
- fdput(f);
return ret;
}
@@ -156,6 +159,7 @@ static void sock_map_del_link(struct sock *sk,
verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
+ break;
}
}
spin_unlock_bh(&psock->link_lock);
@@ -299,7 +303,10 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
write_lock_bh(&sk->sk_callback_lock);
if (stream_parser && stream_verdict && !psock->saved_data_ready) {
- ret = sk_psock_init_strp(sk, psock);
+ if (sk_is_tcp(sk))
+ ret = sk_psock_init_strp(sk, psock);
+ else
+ ret = -EOPNOTSUPP;
if (ret) {
write_unlock_bh(&sk->sk_callback_lock);
sk_psock_put(sk, psock);
@@ -408,12 +415,11 @@ static void *sock_map_lookup_sys(struct bpf_map *map, void *key)
static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
struct sock **psk)
{
- struct sock *sk;
+ struct sock *sk = NULL;
int err = 0;
spin_lock_bh(&stab->lock);
- sk = *psk;
- if (!sk_test || sk_test == sk)
+ if (!sk_test || sk_test == *psk)
sk = xchg(psk, NULL);
if (likely(sk))
@@ -538,6 +544,9 @@ static bool sock_map_sk_state_allowed(const struct sock *sk)
return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
if (sk_is_stream_unix(sk))
return (1 << sk->sk_state) & TCPF_ESTABLISHED;
+ if (sk_is_vsock(sk) &&
+ (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET))
+ return (1 << sk->sk_state) & TCPF_ESTABLISHED;
return true;
}
@@ -644,6 +653,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
sk = __sock_map_lookup_elem(map, key);
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+ if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk))
+ return SK_DROP;
skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
return SK_PASS;
@@ -672,6 +683,8 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
return SK_DROP;
if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
return SK_DROP;
+ if (sk_is_vsock(sk))
+ return SK_DROP;
msg->flags = flags;
msg->sk_redir = sk;
@@ -1171,6 +1184,7 @@ static void sock_hash_free(struct bpf_map *map)
sock_put(elem->sk);
sock_hash_free_elem(htab, elem);
}
+ cond_resched();
}
/* wait for psock readers accessing its map link */
@@ -1245,6 +1259,8 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
sk = __sock_hash_lookup_elem(map, key);
if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+ if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk))
+ return SK_DROP;
skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
return SK_PASS;
@@ -1273,6 +1289,8 @@ BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
return SK_DROP;
if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
return SK_DROP;
+ if (sk_is_vsock(sk))
+ return SK_DROP;
msg->flags = flags;
msg->sk_redir = sk;
@@ -1454,80 +1472,108 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
return NULL;
}
-static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
- u32 which)
+static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
+ struct bpf_link ***plink, u32 which)
{
struct sk_psock_progs *progs = sock_map_progs(map);
+ struct bpf_prog **cur_pprog;
+ struct bpf_link **cur_plink;
if (!progs)
return -EOPNOTSUPP;
switch (which) {
case BPF_SK_MSG_VERDICT:
- *pprog = &progs->msg_parser;
+ cur_pprog = &progs->msg_parser;
+ cur_plink = &progs->msg_parser_link;
break;
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
case BPF_SK_SKB_STREAM_PARSER:
- *pprog = &progs->stream_parser;
+ cur_pprog = &progs->stream_parser;
+ cur_plink = &progs->stream_parser_link;
break;
#endif
case BPF_SK_SKB_STREAM_VERDICT:
if (progs->skb_verdict)
return -EBUSY;
- *pprog = &progs->stream_verdict;
+ cur_pprog = &progs->stream_verdict;
+ cur_plink = &progs->stream_verdict_link;
break;
case BPF_SK_SKB_VERDICT:
if (progs->stream_verdict)
return -EBUSY;
- *pprog = &progs->skb_verdict;
+ cur_pprog = &progs->skb_verdict;
+ cur_plink = &progs->skb_verdict_link;
break;
default:
return -EOPNOTSUPP;
}
+ *pprog = cur_pprog;
+ if (plink)
+ *plink = cur_plink;
return 0;
}
+/* Handle the following four cases:
+ * prog_attach: prog != NULL, old == NULL, link == NULL
+ * prog_detach: prog == NULL, old != NULL, link == NULL
+ * link_attach: prog != NULL, old == NULL, link != NULL
+ * link_detach: prog == NULL, old != NULL, link != NULL
+ */
static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
- struct bpf_prog *old, u32 which)
+ struct bpf_prog *old, struct bpf_link *link,
+ u32 which)
{
struct bpf_prog **pprog;
+ struct bpf_link **plink;
int ret;
- ret = sock_map_prog_lookup(map, &pprog, which);
+ ret = sock_map_prog_link_lookup(map, &pprog, &plink, which);
if (ret)
return ret;
- if (old)
- return psock_replace_prog(pprog, prog, old);
+ /* for prog_attach/prog_detach/link_attach, return error if a bpf_link
+ * exists for that prog.
+ */
+ if ((!link || prog) && *plink)
+ return -EBUSY;
- psock_set_prog(pprog, prog);
- return 0;
+ if (old) {
+ ret = psock_replace_prog(pprog, prog, old);
+ if (!ret)
+ *plink = NULL;
+ } else {
+ psock_set_prog(pprog, prog);
+ if (link)
+ *plink = link;
+ }
+
+ return ret;
}
int sock_map_bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
- u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd;
+ u32 prog_cnt = 0, flags = 0;
struct bpf_prog **pprog;
struct bpf_prog *prog;
struct bpf_map *map;
- struct fd f;
u32 id = 0;
int ret;
if (attr->query.query_flags)
return -EINVAL;
- f = fdget(ufd);
+ CLASS(fd, f)(attr->target_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
rcu_read_lock();
- ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type);
+ ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type);
if (ret)
goto end;
@@ -1552,7 +1598,6 @@ end:
copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
ret = -EFAULT;
- fdput(f);
return ret;
}
@@ -1633,19 +1678,23 @@ void sock_map_close(struct sock *sk, long timeout)
lock_sock(sk);
rcu_read_lock();
- psock = sk_psock_get(sk);
- if (unlikely(!psock)) {
- rcu_read_unlock();
- release_sock(sk);
- saved_close = READ_ONCE(sk->sk_prot)->close;
- } else {
+ psock = sk_psock(sk);
+ if (likely(psock)) {
saved_close = psock->saved_close;
sock_map_remove_links(sk, psock);
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ goto no_psock;
rcu_read_unlock();
sk_psock_stop(psock);
release_sock(sk);
cancel_delayed_work_sync(&psock->work);
sk_psock_put(sk, psock);
+ } else {
+ saved_close = READ_ONCE(sk->sk_prot)->close;
+no_psock:
+ rcu_read_unlock();
+ release_sock(sk);
}
/* Make sure we do not recurse. This is a bug.
@@ -1657,6 +1706,200 @@ void sock_map_close(struct sock *sk, long timeout)
}
EXPORT_SYMBOL_GPL(sock_map_close);
+struct sockmap_link {
+ struct bpf_link link;
+ struct bpf_map *map;
+ enum bpf_attach_type attach_type;
+};
+
+static void sock_map_link_release(struct bpf_link *link)
+{
+ struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+
+ mutex_lock(&sockmap_mutex);
+ if (!sockmap_link->map)
+ goto out;
+
+ WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link,
+ sockmap_link->attach_type));
+
+ bpf_map_put_with_uref(sockmap_link->map);
+ sockmap_link->map = NULL;
+out:
+ mutex_unlock(&sockmap_mutex);
+}
+
+static int sock_map_link_detach(struct bpf_link *link)
+{
+ sock_map_link_release(link);
+ return 0;
+}
+
+static void sock_map_link_dealloc(struct bpf_link *link)
+{
+ kfree(link);
+}
+
+/* Handle the following two cases:
+ * case 1: link != NULL, prog != NULL, old != NULL
+ * case 2: link != NULL, prog != NULL, old == NULL
+ */
+static int sock_map_link_update_prog(struct bpf_link *link,
+ struct bpf_prog *prog,
+ struct bpf_prog *old)
+{
+ const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+ struct bpf_prog **pprog, *old_link_prog;
+ struct bpf_link **plink;
+ int ret = 0;
+
+ mutex_lock(&sockmap_mutex);
+
+ /* If old prog is not NULL, ensure old prog is the same as link->prog. */
+ if (old && link->prog != old) {
+ ret = -EPERM;
+ goto out;
+ }
+ /* Ensure link->prog has the same type/attach_type as the new prog. */
+ if (link->prog->type != prog->type ||
+ link->prog->expected_attach_type != prog->expected_attach_type) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!sockmap_link->map) {
+ ret = -ENOLINK;
+ goto out;
+ }
+
+ ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink,
+ sockmap_link->attach_type);
+ if (ret)
+ goto out;
+
+ /* return error if the stored bpf_link does not match the incoming bpf_link. */
+ if (link != *plink) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (old) {
+ ret = psock_replace_prog(pprog, prog, old);
+ if (ret)
+ goto out;
+ } else {
+ psock_set_prog(pprog, prog);
+ }
+
+ bpf_prog_inc(prog);
+ old_link_prog = xchg(&link->prog, prog);
+ bpf_prog_put(old_link_prog);
+
+out:
+ mutex_unlock(&sockmap_mutex);
+ return ret;
+}
+
+static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link)
+{
+ u32 map_id = 0;
+
+ mutex_lock(&sockmap_mutex);
+ if (sockmap_link->map)
+ map_id = sockmap_link->map->id;
+ mutex_unlock(&sockmap_mutex);
+ return map_id;
+}
+
+static int sock_map_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+ u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+ info->sockmap.map_id = map_id;
+ info->sockmap.attach_type = sockmap_link->attach_type;
+ return 0;
+}
+
+static void sock_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+ u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+ seq_printf(seq, "map_id:\t%u\n", map_id);
+ seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type);
+}
+
+static const struct bpf_link_ops sock_map_link_ops = {
+ .release = sock_map_link_release,
+ .dealloc = sock_map_link_dealloc,
+ .detach = sock_map_link_detach,
+ .update_prog = sock_map_link_update_prog,
+ .fill_link_info = sock_map_link_fill_info,
+ .show_fdinfo = sock_map_link_show_fdinfo,
+};
+
+int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct sockmap_link *sockmap_link;
+ enum bpf_attach_type attach_type;
+ struct bpf_map *map;
+ int ret;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ map = bpf_map_get_with_uref(attr->link_create.target_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER);
+ if (!sockmap_link) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ attach_type = attr->link_create.attach_type;
+ bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog);
+ sockmap_link->map = map;
+ sockmap_link->attach_type = attach_type;
+
+ ret = bpf_link_prime(&sockmap_link->link, &link_primer);
+ if (ret) {
+ kfree(sockmap_link);
+ goto out;
+ }
+
+ mutex_lock(&sockmap_mutex);
+ ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type);
+ mutex_unlock(&sockmap_mutex);
+ if (ret) {
+ bpf_link_cleanup(&link_primer);
+ goto out;
+ }
+
+ /* Increase the prog's refcnt, since psock_replace_prog() and
+ * psock_set_prog() drop a reference when the old prog is replaced.
+ *
+ * Strictly speaking this is unnecessary because the bpf_link already
+ * holds a reference, but the extra refcnt keeps the replace/set prog
+ * paths simpler.
+ */
+ bpf_prog_inc(prog);
+
+ return bpf_link_settle(&link_primer);
+
+out:
+ bpf_map_put_with_uref(map);
+ return ret;
+}
+
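From userspace, the new link type is created through the regular BPF_LINK_CREATE path with the sockmap/sockhash fd as the target. A minimal libbpf sketch (prog and map fds assumed to come from an already loaded object):

```c
#include <bpf/bpf.h>

static int attach_verdict_link(int prog_fd, int sockmap_fd)
{
	LIBBPF_OPTS(bpf_link_create_opts, opts);

	/* Mirrors sock_map_link_create(): the target_fd is the map, and the
	 * attach type selects the psock slot (stream verdict here).
	 */
	return bpf_link_create(prog_fd, sockmap_fd,
			       BPF_SK_SKB_STREAM_VERDICT, &opts);
}
```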
static int sock_map_iter_attach_target(struct bpf_prog *prog,
union bpf_iter_link_info *linfo,
struct bpf_iter_aux_info *aux)
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 5a165286e4d8..4211710393a8 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -173,10 +173,9 @@ static bool __reuseport_detach_closed_sock(struct sock *sk,
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
- unsigned int size = sizeof(struct sock_reuseport) +
- sizeof(struct sock *) * max_socks;
- struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);
+ struct sock_reuseport *reuse;
+ reuse = kzalloc(struct_size(reuse, socks, max_socks), GFP_ATOMIC);
if (!reuse)
return NULL;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 6973dda3abda..5dbb2c6f371d 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -24,6 +24,7 @@
#include <net/busy_poll.h>
#include <net/pkt_sched.h>
#include <net/hotdata.h>
+#include <net/proto_memory.h>
#include <net/rps.h>
#include "dev.h"
@@ -33,6 +34,7 @@ static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;
static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE;
+static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ;
static int net_msg_warn; /* Unused, but still a sysctl */
@@ -50,29 +52,45 @@ int sysctl_devconf_inherit_init_net __read_mostly;
EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS)
-static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos,
- struct cpumask *mask)
+static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos,
+ struct cpumask *mask)
{
- char kbuf[128];
+ char *kbuf;
int len;
if (*ppos || !*lenp) {
*lenp = 0;
- return;
+ return 0;
+ }
+
+ /* CPUs are displayed as a hex bitmap with a comma between each group of 8
+ * nibbles (except the last one, which is followed by a newline instead).
+ * Guesstimate the buffer size at group granularity.
+ */
+ len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp);
+ kbuf = kmalloc(len, GFP_KERNEL);
+ if (!kbuf) {
+ *lenp = 0;
+ return -ENOMEM;
}
- len = min(sizeof(kbuf) - 1, *lenp);
len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
if (!len) {
*lenp = 0;
- return;
+ goto free_buf;
}
- if (len < *lenp)
- kbuf[len++] = '\n';
+	/* scnprintf() writes a trailing NUL that is not counted in the returned
+	 * length; overwrite it with a newline.
+	 */
+ kbuf[len++] = '\n';
memcpy(buffer, kbuf, len);
*lenp = len;
*ppos += len;
+
+free_buf:
+ kfree(kbuf);
+ return 0;
}
#endif
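To make the sizing concrete: each group of 8 nibbles covers 32 CPUs and costs 9 bytes (8 hex digits plus a comma, or the final newline), so e.g. nr_cpumask_bits = 256 budgets DIV_ROUND_UP(256, 32) * (8 + 1) = 72 bytes, clamped to the user-supplied *lenp before the allocation.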
@@ -94,7 +112,7 @@ static struct cpumask *rps_default_mask_cow_alloc(struct net *net)
return rps_default_mask;
}
-static int rps_default_mask_sysctl(struct ctl_table *table, int write,
+static int rps_default_mask_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = (struct net *)table->data;
@@ -116,8 +134,8 @@ static int rps_default_mask_sysctl(struct ctl_table *table, int write,
if (err)
goto done;
} else {
- dump_cpumask(buffer, lenp, ppos,
- net->core.rps_default_mask ? : cpu_none_mask);
+ err = dump_cpumask(buffer, lenp, ppos,
+ net->core.rps_default_mask ? : cpu_none_mask);
}
done:
@@ -125,7 +143,7 @@ done:
return err;
}
-static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
+static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int orig_size, size;
@@ -183,7 +201,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
if (orig_sock_table) {
static_branch_dec(&rps_needed);
static_branch_dec(&rfs_needed);
- kvfree_rcu_mightsleep(orig_sock_table);
+ kvfree_rcu(orig_sock_table, rcu);
}
}
}
@@ -197,7 +215,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_NET_FLOW_LIMIT
static DEFINE_MUTEX(flow_limit_update_mutex);
-static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
+static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct sd_flow_limit *cur;
@@ -221,7 +239,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
lockdep_is_held(&flow_limit_update_mutex));
if (cur && !cpumask_test_cpu(i, mask)) {
RCU_INIT_POINTER(sd->flow_limit, NULL);
- kfree_rcu_mightsleep(cur);
+ kfree_rcu(cur, rcu);
} else if (!cur && cpumask_test_cpu(i, mask)) {
cur = kzalloc_node(len, GFP_KERNEL,
cpu_to_node(i));
@@ -230,7 +248,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
ret = -ENOMEM;
goto write_unlock;
}
- cur->num_buckets = netdev_flow_limit_table_len;
+ cur->log_buckets = ilog2(netdev_flow_limit_table_len);
rcu_assign_pointer(sd->flow_limit, cur);
}
}
@@ -246,7 +264,7 @@ write_unlock:
}
rcu_read_unlock();
- dump_cpumask(buffer, lenp, ppos, mask);
+ ret = dump_cpumask(buffer, lenp, ppos, mask);
}
done:
@@ -254,7 +272,7 @@ done:
return ret;
}
-static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
+static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int old, *ptr;
@@ -276,7 +294,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
#endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_SCHED
-static int set_default_qdisc(struct ctl_table *table, int write,
+static int set_default_qdisc(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
char id[IFNAMSIZ];
@@ -295,14 +313,14 @@ static int set_default_qdisc(struct ctl_table *table, int write,
}
#endif
-static int proc_do_dev_weight(struct ctl_table *table, int write,
+static int proc_do_dev_weight(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
static DEFINE_MUTEX(dev_weight_mutex);
int ret, weight;
mutex_lock(&dev_weight_mutex);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
weight = READ_ONCE(weight_p);
WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias);
@@ -313,7 +331,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write,
return ret;
}
-static int proc_do_rss_key(struct ctl_table *table, int write,
+static int proc_do_rss_key(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table fake_table;
@@ -326,7 +344,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write,
}
#ifdef CONFIG_BPF_JIT
-static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
+static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -359,7 +377,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
# ifdef CONFIG_HAVE_EBPF_JIT
static int
-proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
if (!capable(CAP_SYS_ADMIN))
@@ -370,7 +388,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
# endif /* CONFIG_HAVE_EBPF_JIT */
static int
-proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
if (!capable(CAP_SYS_ADMIN))
@@ -382,40 +400,8 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
static struct ctl_table net_core_table[] = {
{
- .procname = "wmem_max",
- .data = &sysctl_wmem_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_sndbuf,
- },
- {
- .procname = "rmem_max",
- .data = &sysctl_rmem_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_rcvbuf,
- },
- {
- .procname = "wmem_default",
- .data = &sysctl_wmem_default,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_sndbuf,
- },
- {
- .procname = "rmem_default",
- .data = &sysctl_rmem_default,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_rcvbuf,
- },
- {
.procname = "mem_pcpu_rsv",
- .data = &sysctl_mem_pcpu_rsv,
+ .data = &net_hotdata.sysctl_mem_pcpu_rsv,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -427,6 +413,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "dev_weight_rx_bias",
@@ -434,6 +421,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "dev_weight_tx_bias",
@@ -441,6 +429,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "netdev_max_backlog",
@@ -522,15 +511,6 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "tstamp_allow_data",
- .data = &sysctl_tstamp_allow_data,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE
- },
#ifdef CONFIG_RPS
{
.procname = "rps_sock_flow_entries",
@@ -595,7 +575,7 @@ static struct ctl_table net_core_table[] = {
},
{
.procname = "max_skb_frags",
- .data = &sysctl_max_skb_frags,
+ .data = &net_hotdata.sysctl_max_skb_frags,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -608,7 +588,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .extra1 = &netdev_budget_usecs_min,
},
{
.procname = "fb_tunnels_only_for_init_net",
@@ -654,13 +634,12 @@ static struct ctl_table net_core_table[] = {
},
{
.procname = "skb_defer_max",
- .data = &sysctl_skb_defer_max,
+ .data = &net_hotdata.sysctl_skb_defer_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
- { }
};
static struct ctl_table netns_core_table[] = {
@@ -697,7 +676,50 @@ static struct ctl_table netns_core_table[] = {
.extra2 = SYSCTL_ONE,
.proc_handler = proc_dou8vec_minmax,
},
- { }
+ {
+ .procname = "tstamp_allow_data",
+ .data = &init_net.core.sysctl_tstamp_allow_data,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+	/* sysctl_core_net_init() makes the entries from this point on
+	 * read-only in non-init network namespaces.
+	 */
+ {
+ .procname = "wmem_max",
+ .data = &sysctl_wmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem_max",
+ .data = &sysctl_rmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
+ },
+ {
+ .procname = "wmem_default",
+ .data = &sysctl_wmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem_default",
+ .data = &sysctl_rmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
+ },
};
static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str)
@@ -715,20 +737,27 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
static __net_init int sysctl_core_net_init(struct net *net)
{
- struct ctl_table *tbl, *tmp;
+ size_t table_size = ARRAY_SIZE(netns_core_table);
+ struct ctl_table *tbl;
tbl = netns_core_table;
if (!net_eq(net, &init_net)) {
+ int i;
tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
if (tbl == NULL)
goto err_dup;
- for (tmp = tbl; tmp->procname; tmp++)
- tmp->data += (char *)net - (char *)&init_net;
+ for (i = 0; i < table_size; ++i) {
+ if (tbl[i].data == &sysctl_wmem_max)
+ break;
+
+ tbl[i].data += (char *)net - (char *)&init_net;
+ }
+ for (; i < table_size; ++i)
+ tbl[i].mode &= ~0222;
}
- net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl,
- ARRAY_SIZE(netns_core_table));
+ net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size);
if (net->core.sysctl_hdr == NULL)
goto err_reg;
@@ -743,7 +772,7 @@ err_dup:
static __net_exit void sysctl_core_net_exit(struct net *net)
{
- struct ctl_table *tbl;
+ const struct ctl_table *tbl;
tbl = net->core.sysctl_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->core.sysctl_hdr);
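The read-only handling above is just a mode mask: once the loop reaches the wmem_max entry, each remaining entry's default 0644 mode is masked with ~0222, i.e. 0644 & ~0222 = 0444, so the buffer-size knobs stay visible in child namespaces but can only be written from the init namespace.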
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 04840697fe79..a50a7ef49ae8 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -9,6 +9,7 @@
#include <linux/ptp_classify.h>
#include <linux/skbuff.h>
#include <linux/export.h>
+#include <linux/ptp_clock_kernel.h>
static unsigned int classify(const struct sk_buff *skb)
{
@@ -21,18 +22,39 @@ static unsigned int classify(const struct sk_buff *skb)
void skb_clone_tx_timestamp(struct sk_buff *skb)
{
+ struct hwtstamp_provider *hwprov;
struct mii_timestamper *mii_ts;
+ struct phy_device *phydev;
struct sk_buff *clone;
unsigned int type;
- if (!skb->sk)
+ if (!skb->sk || !skb->dev)
return;
+ rcu_read_lock();
+ hwprov = rcu_dereference(skb->dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB ||
+ !hwprov->phydev) {
+ rcu_read_unlock();
+ return;
+ }
+
+ phydev = hwprov->phydev;
+ } else {
+ phydev = skb->dev->phydev;
+ if (!phy_is_default_hwtstamp(phydev)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
type = classify(skb);
if (type == PTP_CLASS_NONE)
return;
- mii_ts = skb->dev->phydev->mii_ts;
+ mii_ts = phydev->mii_ts;
if (likely(mii_ts->txtstamp)) {
clone = skb_clone_sk(skb);
if (!clone)
@@ -44,12 +66,33 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
+ struct hwtstamp_provider *hwprov;
struct mii_timestamper *mii_ts;
+ struct phy_device *phydev;
unsigned int type;
- if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts)
+ if (!skb->dev)
return false;
+ rcu_read_lock();
+ hwprov = rcu_dereference(skb->dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB ||
+ !hwprov->phydev) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ phydev = hwprov->phydev;
+ } else {
+ phydev = skb->dev->phydev;
+ if (!phy_is_default_hwtstamp(phydev)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+
if (skb_headroom(skb) < ETH_HLEN)
return false;
@@ -62,7 +105,7 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
if (type == PTP_CLASS_NONE)
return false;
- mii_ts = skb->dev->phydev->mii_ts;
+ mii_ts = phydev->mii_ts;
if (likely(mii_ts->rxtstamp))
return mii_ts->rxtstamp(mii_ts, skb, type);
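Both paths above now pick the timestamping PHY the same way; a condensed sketch of that selection rule (hypothetical helper, not part of the patch):

#include <linux/net_tstamp.h>
#include <linux/netdevice.h>
#include <linux/phy.h>
#include <linux/rcupdate.h>

/* Hypothetical helper mirroring the logic above: a configured hwtstamp
 * provider wins and must be a phylib source with a PHY attached; otherwise
 * fall back to the netdev's PHY when it is the default timestamper.
 */
static struct phy_device *tstamp_phy_get(struct net_device *dev)
{
	struct hwtstamp_provider *hwprov;
	struct phy_device *phydev = NULL;

	rcu_read_lock();
	hwprov = rcu_dereference(dev->hwprov);
	if (hwprov) {
		if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && hwprov->phydev)
			phydev = hwprov->phydev;
	} else if (phy_is_default_hwtstamp(dev->phydev)) {
		phydev = dev->phydev;
	}
	rcu_read_unlock();

	return phydev;
}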
diff --git a/net/core/tso.c b/net/core/tso.c
index e00796e3b146..6df997b9076e 100644
--- a/net/core/tso.c
+++ b/net/core/tso.c
@@ -3,7 +3,7 @@
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/tso.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso,
int size, bool is_last)
diff --git a/net/core/utils.c b/net/core/utils.c
index c994e95172ac..5e63b0ea21f3 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Generic address resultion entity
+ * Generic address resolution entity
*
* Authors:
* net_random Alan Cox
@@ -399,9 +399,9 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
}
EXPORT_SYMBOL(inet_pton_with_scope);
-bool inet_addr_is_any(struct sockaddr *addr)
+bool inet_addr_is_any(struct sockaddr_storage *addr)
{
- if (addr->sa_family == AF_INET6) {
+ if (addr->ss_family == AF_INET6) {
struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
const struct sockaddr_in6 in6_any =
{ .sin6_addr = IN6ADDR_ANY_INIT };
@@ -409,13 +409,13 @@ bool inet_addr_is_any(struct sockaddr *addr)
if (!memcmp(in6->sin6_addr.s6_addr,
in6_any.sin6_addr.s6_addr, 16))
return true;
- } else if (addr->sa_family == AF_INET) {
+ } else if (addr->ss_family == AF_INET) {
struct sockaddr_in *in = (struct sockaddr_in *)addr;
if (in->sin_addr.s_addr == htonl(INADDR_ANY))
return true;
} else {
- pr_warn("unexpected address family %u\n", addr->sa_family);
+ pr_warn("unexpected address family %u\n", addr->ss_family);
}
return false;
@@ -473,11 +473,11 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
EXPORT_SYMBOL(inet_proto_csum_replace16);
void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
- __wsum diff, bool pseudohdr)
+ __wsum diff, bool pseudohdr, bool ipv6)
{
if (skb->ip_summed != CHECKSUM_PARTIAL) {
csum_replace_by_diff(sum, diff);
- if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr && !ipv6)
skb->csum = ~csum_sub(diff, skb->csum);
} else if (pseudohdr) {
*sum = ~csum_fold(csum_add(diff, csum_unfold(*sum)));
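A minimal caller-side sketch of the new sockaddr_storage-based signature (hypothetical function, kernel context assumed):

#include <asm/byteorder.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/socket.h>

/* Hypothetical check: fill a sockaddr_storage with 0.0.0.0 and ask whether
 * it is a wildcard address; no cast to struct sockaddr is needed any more.
 */
static bool demo_is_wildcard(void)
{
	struct sockaddr_storage ss = {};
	struct sockaddr_in *sin = (struct sockaddr_in *)&ss;

	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = htonl(INADDR_ANY);

	return inet_addr_is_any(&ss);
}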
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 41693154e426..491334b9b8be 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -17,6 +17,7 @@
#include <net/page_pool/helpers.h>
#include <net/hotdata.h>
+#include <net/netdev_lock.h>
#include <net/xdp.h>
#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
#include <trace/events/xdp.h>
@@ -127,10 +128,8 @@ void xdp_unreg_mem_model(struct xdp_mem_info *mem)
return;
if (type == MEM_TYPE_PAGE_POOL) {
- rcu_read_lock();
- xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
page_pool_destroy(xa->page_pool);
- rcu_read_unlock();
}
}
EXPORT_SYMBOL_GPL(xdp_unreg_mem_model);
@@ -188,7 +187,6 @@ int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
xdp_rxq_info_init(xdp_rxq);
xdp_rxq->dev = dev;
xdp_rxq->queue_index = queue_index;
- xdp_rxq->napi_id = napi_id;
xdp_rxq->frag_size = frag_size;
xdp_rxq->reg_state = REG_STATE_REGISTERED;
@@ -295,10 +293,8 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem,
mutex_lock(&mem_id_lock);
ret = __mem_id_init_hash_table();
mutex_unlock(&mem_id_lock);
- if (ret < 0) {
- WARN_ON(1);
+ if (ret < 0)
return ERR_PTR(ret);
- }
}
xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
@@ -362,6 +358,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
if (IS_ERR(xdp_alloc))
return PTR_ERR(xdp_alloc);
+ if (type == MEM_TYPE_XSK_BUFF_POOL && allocator)
+ xsk_pool_set_rxq_info(allocator, xdp_rxq);
+
if (trace_mem_connect_enabled() && xdp_alloc)
trace_mem_connect(xdp_alloc, xdp_rxq);
return 0;
@@ -369,33 +368,87 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
+/**
+ * xdp_reg_page_pool - register &page_pool as a memory provider for XDP
+ * @pool: &page_pool to register
+ *
+ * Can be used to register pools manually without connecting to any XDP RxQ
+ * info, so that the XDP layer will be aware of them. Then, they can be
+ * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool().
+ *
+ * Return: %0 on success, -errno on error.
+ */
+int xdp_reg_page_pool(struct page_pool *pool)
+{
+ struct xdp_mem_info mem;
+
+ return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool);
+}
+EXPORT_SYMBOL_GPL(xdp_reg_page_pool);
+
+/**
+ * xdp_unreg_page_pool - unregister &page_pool from the memory providers list
+ * @pool: &page_pool to unregister
+ *
+ * A shorthand for manual unregistering page pools. If the pool was previously
+ * attached to an RxQ info, it must be detached first.
+ */
+void xdp_unreg_page_pool(const struct page_pool *pool)
+{
+ struct xdp_mem_info mem = {
+ .type = MEM_TYPE_PAGE_POOL,
+ .id = pool->xdp_mem_id,
+ };
+
+ xdp_unreg_mem_model(&mem);
+}
+EXPORT_SYMBOL_GPL(xdp_unreg_page_pool);
+
+/**
+ * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info
+ * @xdp_rxq: XDP RxQ info to attach the pool to
+ * @pool: pool to attach
+ *
+ * If the pool was registered manually, this function must be called instead
+ * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info.
+ */
+void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq,
+ const struct page_pool *pool)
+{
+ struct xdp_mem_info mem = {
+ .type = MEM_TYPE_PAGE_POOL,
+ .id = pool->xdp_mem_id,
+ };
+
+ xdp_rxq_info_attach_mem_model(xdp_rxq, &mem);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool);
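A driver-side sketch of the manual registration flow these helpers enable (hypothetical function and names, assuming the pool is created elsewhere):

#include <net/page_pool/types.h>
#include <net/xdp.h>

/* Hypothetical setup: make a standalone page_pool known to the XDP layer,
 * then attach it to an RxQ info once that exists. Teardown goes the other
 * way: detach from the RxQ info before xdp_unreg_page_pool().
 */
static int demo_attach_pool(struct xdp_rxq_info *rxq, struct page_pool *pool)
{
	int err;

	err = xdp_reg_page_pool(pool);
	if (err)
		return err;

	xdp_rxq_info_attach_page_pool(rxq, pool);
	return 0;
}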
+
/* XDP RX runs under NAPI protection, and in different delivery error
* scenarios (e.g. queue full), it is possible to return the xdp_frame
* while still leveraging this protection. The @napi_direct boolean
* is used for those calls sites. Thus, allowing for faster recycling
* of xdp_frames/pages in those cases.
*/
-void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
- struct xdp_buff *xdp)
+void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
+ bool napi_direct, struct xdp_buff *xdp)
{
- struct page *page;
-
- switch (mem->type) {
+ switch (mem_type) {
case MEM_TYPE_PAGE_POOL:
- page = virt_to_head_page(data);
+ netmem = netmem_compound_head(netmem);
if (napi_direct && xdp_return_frame_no_direct())
napi_direct = false;
- /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
- * as mem->type knows this a page_pool page
+ /* No need to check netmem_is_pp() as mem->type knows this a
+ * page_pool page
*/
- page_pool_put_full_page(page->pp, page, napi_direct);
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem,
+ napi_direct);
break;
case MEM_TYPE_PAGE_SHARED:
- page_frag_free(data);
+ page_frag_free(__netmem_address(netmem));
break;
case MEM_TYPE_PAGE_ORDER0:
- page = virt_to_page(data); /* Assumes order0 page*/
- put_page(page);
+ put_page(__netmem_to_page(netmem));
break;
case MEM_TYPE_XSK_BUFF_POOL:
/* NB! Only valid from an xdp_buff! */
@@ -403,7 +456,7 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
break;
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
- WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
+ WARN(1, "Incorrect XDP memory type (%d) usage", mem_type);
break;
}
}
@@ -411,38 +464,34 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
void xdp_return_frame(struct xdp_frame *xdpf)
{
struct skb_shared_info *sinfo;
- int i;
if (likely(!xdp_frame_has_frags(xdpf)))
goto out;
sinfo = xdp_get_shared_info_from_frame(xdpf);
- for (i = 0; i < sinfo->nr_frags; i++) {
- struct page *page = skb_frag_page(&sinfo->frags[i]);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+ false, NULL);
- __xdp_return(page_address(page), &xdpf->mem, false, NULL);
- }
out:
- __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
struct skb_shared_info *sinfo;
- int i;
if (likely(!xdp_frame_has_frags(xdpf)))
goto out;
sinfo = xdp_get_shared_info_from_frame(xdpf);
- for (i = 0; i < sinfo->nr_frags; i++) {
- struct page *page = skb_frag_page(&sinfo->frags[i]);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+ true, NULL);
- __xdp_return(page_address(page), &xdpf->mem, true, NULL);
- }
out:
- __xdp_return(xdpf->data, &xdpf->mem, true, NULL);
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
@@ -456,46 +505,19 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
* xdp_frame_bulk is usually stored/allocated on the function
* call-stack to avoid locking penalties.
*/
-void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
-{
- struct xdp_mem_allocator *xa = bq->xa;
-
- if (unlikely(!xa || !bq->count))
- return;
-
- page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count);
- /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */
- bq->count = 0;
-}
-EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk);
/* Must be called with rcu_read_lock held */
void xdp_return_frame_bulk(struct xdp_frame *xdpf,
struct xdp_frame_bulk *bq)
{
- struct xdp_mem_info *mem = &xdpf->mem;
- struct xdp_mem_allocator *xa;
-
- if (mem->type != MEM_TYPE_PAGE_POOL) {
+ if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) {
xdp_return_frame(xdpf);
return;
}
- xa = bq->xa;
- if (unlikely(!xa)) {
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- bq->count = 0;
- bq->xa = xa;
- }
-
if (bq->count == XDP_BULK_QUEUE_SIZE)
xdp_flush_frame_bulk(bq);
- if (unlikely(mem->id != xa->mem.id)) {
- xdp_flush_frame_bulk(bq);
- bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- }
-
if (unlikely(xdp_frame_has_frags(xdpf))) {
struct skb_shared_info *sinfo;
int i;
@@ -504,31 +526,40 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf,
for (i = 0; i < sinfo->nr_frags; i++) {
skb_frag_t *frag = &sinfo->frags[i];
- bq->q[bq->count++] = skb_frag_address(frag);
+ bq->q[bq->count++] = skb_frag_netmem(frag);
if (bq->count == XDP_BULK_QUEUE_SIZE)
xdp_flush_frame_bulk(bq);
}
}
- bq->q[bq->count++] = xdpf->data;
+ bq->q[bq->count++] = virt_to_netmem(xdpf->data);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
+/**
+ * xdp_return_frag -- free one XDP frag or decrement its refcount
+ * @netmem: network memory reference to release
+ * @xdp: &xdp_buff to release the frag for
+ */
+void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp)
+{
+ __xdp_return(netmem, xdp->rxq->mem.type, true, NULL);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frag);
+
void xdp_return_buff(struct xdp_buff *xdp)
{
struct skb_shared_info *sinfo;
- int i;
if (likely(!xdp_buff_has_frags(xdp)))
goto out;
sinfo = xdp_get_shared_info_from_buff(xdp);
- for (i = 0; i < sinfo->nr_frags; i++) {
- struct page *page = skb_frag_page(&sinfo->frags[i]);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]),
+ xdp->rxq->mem.type, true, xdp);
- __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp);
- }
out:
- __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
+ __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp);
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
@@ -574,7 +605,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
xdpf->headroom = 0;
xdpf->metasize = metasize;
xdpf->frame_sz = PAGE_SIZE;
- xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+ xdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
xsk_buff_free(xdp);
return xdpf;
@@ -588,15 +619,177 @@ void xdp_warn(const char *msg, const char *func, const int line)
};
EXPORT_SYMBOL_GPL(xdp_warn);
-int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp)
+/**
+ * xdp_build_skb_from_buff - create an skb from &xdp_buff
+ * @xdp: &xdp_buff to convert to an skb
+ *
+ * Perform common operations to create a new skb to pass up the stack from
+ * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize
+ * the skb data pointers and offsets, set the recycle bit if the buff is
+ * page_pool-backed, record the Rx queue index, set the protocol and update
+ * the frags info.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp)
{
- n_skb = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, n_skb, skbs);
- if (unlikely(!n_skb))
- return -ENOMEM;
+ const struct xdp_rxq_info *rxq = xdp->rxq;
+ const struct skb_shared_info *sinfo;
+ struct sk_buff *skb;
+ u32 nr_frags = 0;
+ int metalen;
- return 0;
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = sinfo->nr_frags;
+ }
+
+ skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ __skb_put(skb, xdp->data_end - xdp->data);
+
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen > 0)
+ skb_metadata_set(skb, metalen);
+
+ if (rxq->mem.type == MEM_TYPE_PAGE_POOL)
+ skb_mark_for_recycle(skb);
+
+ skb_record_rx_queue(skb, rxq->queue_index);
+
+ if (unlikely(nr_frags)) {
+ u32 tsize;
+
+ tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz;
+ xdp_update_skb_shared_info(skb, nr_frags,
+ sinfo->xdp_frags_size, tsize,
+ xdp_buff_is_frag_pfmemalloc(xdp));
+ }
+
+ skb->protocol = eth_type_trans(skb, rxq->dev);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff);
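A hypothetical XDP_PASS path built on the new helper (sketch only; error handling reduced to a drop):

#include <linux/netdevice.h>
#include <net/xdp.h>

/* Hypothetical driver path: convert the xdp_buff into an skb after an
 * XDP_PASS verdict and hand it to the stack; on head allocation failure
 * the buff is recycled via xdp_return_buff().
 */
static void demo_xdp_pass(struct napi_struct *napi, struct xdp_buff *xdp)
{
	struct sk_buff *skb;

	skb = xdp_build_skb_from_buff(xdp);
	if (unlikely(!skb)) {
		xdp_return_buff(xdp);
		return;
	}

	napi_gro_receive(napi, skb);
}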
+
+/**
+ * xdp_copy_frags_from_zc - copy frags from XSk buff to skb
+ * @skb: skb to copy frags to
+ * @xdp: XSk &xdp_buff from which the frags will be copied
+ * @pp: &page_pool backing page allocation, if available
+ *
+ * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack.
+ * Allocate a new buffer for each frag, copy it and attach to the skb.
+ *
+ * Return: true on success, false on netmem allocation failure.
+ */
+static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
+ const struct xdp_buff *xdp,
+ struct page_pool *pp)
+{
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+ const struct skb_shared_info *xinfo;
+ u32 nr_frags, tsize = 0;
+ bool pfmemalloc = false;
+
+ xinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = xinfo->nr_frags;
+
+ for (u32 i = 0; i < nr_frags; i++) {
+ const skb_frag_t *frag = &xinfo->frags[i];
+ u32 len = skb_frag_size(frag);
+ u32 offset, truesize = len;
+ struct page *page;
+
+ page = page_pool_dev_alloc(pp, &offset, &truesize);
+ if (unlikely(!page)) {
+ sinfo->nr_frags = i;
+ return false;
+ }
+
+ memcpy(page_address(page) + offset, skb_frag_address(frag),
+ LARGEST_ALIGN(len));
+ __skb_fill_page_desc_noacc(sinfo, i, page, offset, len);
+
+ tsize += truesize;
+ pfmemalloc |= page_is_pfmemalloc(page);
+ }
+
+ xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size,
+ tsize, pfmemalloc);
+
+ return true;
}
-EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk);
+
+/**
+ * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff
+ * @xdp: source XSk buff
+ *
+ * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb
+ * head, new buffer for the head, copy the data and initialize the skb fields.
+ * If there are frags, allocate new buffers for them and copy.
+ * Buffers are allocated from the system per-CPU page pools so they can be
+ * recycled. If the new skb was built successfully, @xdp is returned to the
+ * XSk pool's freelist. On error, it is left untouched and the caller remains
+ * responsible for it.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
+{
+ const struct xdp_rxq_info *rxq = xdp->rxq;
+ u32 len = xdp->data_end - xdp->data_meta;
+ u32 truesize = xdp->frame_sz;
+ struct sk_buff *skb = NULL;
+ struct page_pool *pp;
+ int metalen;
+ void *data;
+
+ if (!IS_ENABLED(CONFIG_PAGE_POOL))
+ return NULL;
+
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ pp = this_cpu_read(system_page_pool.pool);
+ data = page_pool_dev_alloc_va(pp, &truesize);
+ if (unlikely(!data))
+ goto out;
+
+ skb = napi_build_skb(data, truesize);
+ if (unlikely(!skb)) {
+ page_pool_free_va(pp, data, true);
+ goto out;
+ }
+
+ skb_mark_for_recycle(skb);
+ skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
+
+ memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len));
+
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen > 0) {
+ skb_metadata_set(skb, metalen);
+ __skb_pull(skb, metalen);
+ }
+
+ skb_record_rx_queue(skb, rxq->queue_index);
+
+ if (unlikely(xdp_buff_has_frags(xdp)) &&
+ unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) {
+ napi_consume_skb(skb, true);
+ skb = NULL;
+ goto out;
+ }
+
+ xsk_buff_free(xdp);
+
+ skb->protocol = eth_type_trans(skb, rxq->dev);
+
+out:
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
+ return skb;
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc);
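And the equivalent sketch for the zero-copy path (hypothetical; note the asymmetric ownership described in the kernel-doc above):

#include <linux/netdevice.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

/* Hypothetical ZC RX path: on success xdp_build_skb_from_zc() has already
 * returned @xdp to the XSk pool; on failure the caller still owns it and
 * frees it here.
 */
static void demo_zc_pass(struct napi_struct *napi, struct xdp_buff *xdp)
{
	struct sk_buff *skb;

	skb = xdp_build_skb_from_zc(xdp);
	if (unlikely(!skb)) {
		xsk_buff_free(xdp);
		return;
	}

	napi_gro_receive(napi, skb);
}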
struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct sk_buff *skb,
@@ -644,7 +837,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
* - RX ring dev queue index (skb_record_rx_queue)
*/
- if (xdpf->mem.type == MEM_TYPE_PAGE_POOL)
+ if (xdpf->mem_type == MEM_TYPE_PAGE_POOL)
skb_mark_for_recycle(skb);
/* Allow SKB to reuse area used by xdp_frame */
@@ -691,8 +884,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
nxdpf = addr;
nxdpf->data = addr + headroom;
nxdpf->frame_sz = PAGE_SIZE;
- nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
- nxdpf->mem.id = 0;
+ nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
return nxdpf;
}
@@ -805,34 +997,60 @@ static int __init xdp_metadata_init(void)
}
late_initcall(xdp_metadata_init);
-void xdp_set_features_flag(struct net_device *dev, xdp_features_t val)
+void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val)
{
val &= NETDEV_XDP_ACT_MASK;
if (dev->xdp_features == val)
return;
+ netdev_assert_locked_or_invisible(dev);
dev->xdp_features = val;
if (dev->reg_state == NETREG_REGISTERED)
call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev);
}
+EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked);
+
+void xdp_set_features_flag(struct net_device *dev, xdp_features_t val)
+{
+ netdev_lock(dev);
+ xdp_set_features_flag_locked(dev, val);
+ netdev_unlock(dev);
+}
EXPORT_SYMBOL_GPL(xdp_set_features_flag);
-void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg)
+void xdp_features_set_redirect_target_locked(struct net_device *dev,
+ bool support_sg)
{
xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT);
if (support_sg)
val |= NETDEV_XDP_ACT_NDO_XMIT_SG;
- xdp_set_features_flag(dev, val);
+ xdp_set_features_flag_locked(dev, val);
+}
+EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked);
+
+void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg)
+{
+ netdev_lock(dev);
+ xdp_features_set_redirect_target_locked(dev, support_sg);
+ netdev_unlock(dev);
}
EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target);
-void xdp_features_clear_redirect_target(struct net_device *dev)
+void xdp_features_clear_redirect_target_locked(struct net_device *dev)
{
xdp_features_t val = dev->xdp_features;
val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG);
- xdp_set_features_flag(dev, val);
+ xdp_set_features_flag_locked(dev, val);
+}
+EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked);
+
+void xdp_features_clear_redirect_target(struct net_device *dev)
+{
+ netdev_lock(dev);
+ xdp_features_clear_redirect_target_locked(dev);
+ netdev_unlock(dev);
}
EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target);
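A small sketch of how a driver might choose between the two flavours (hypothetical helper; the assumption is that the caller knows whether it already holds the instance lock):

#include <linux/netdevice.h>
#include <net/xdp.h>

/* Hypothetical: advertise basic XDP support. The plain wrapper takes
 * netdev_lock() itself; the _locked variant must only be used when the
 * instance lock is already held (e.g. from a locked ndo callback).
 */
static void demo_set_xdp_caps(struct net_device *dev, bool have_lock)
{
	xdp_features_t val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT;

	if (have_lock)
		xdp_set_features_flag_locked(dev, val);
	else
		xdp_set_features_flag(dev, val);
}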