Diffstat (limited to 'net/core')
59 files changed, 10599 insertions, 4092 deletions
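One of the API changes in the net/core/datagram.c hunks below replaces the ahash-based skb_copy_and_hash_datagram_iter() with skb_copy_and_crc32c_datagram_iter(), which folds the copied bytes into a plain u32 CRC instead of an ahash_request. A minimal caller sketch, assuming CONFIG_NET_CRC32C and using only the signature and return values shown in the diff; the wrapper function name, the ~0U seed and the surrounding error handling are illustrative, not part of this series:

	static int example_copy_with_crc32c(const struct sk_buff *skb,
					    struct iov_iter *to, int len)
	{
		u32 crc = ~0U;	/* illustrative seed; real callers pick their own */
		int err;

		/* copy len bytes from offset 0 and update crc via crc32c() */
		err = skb_copy_and_crc32c_datagram_iter(skb, 0, to, len, &crc);
		if (err)
			return err;	/* -EFAULT if the copy faulted, per the kerneldoc below */

		/* crc now covers exactly the bytes that were copied to the iterator */
		return 0;
	}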
diff --git a/net/core/Makefile b/net/core/Makefile index 6e6548011fae..b2a76ce33932 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,7 +9,7 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ +obj-y += dev.o dev_api.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ fib_notifier.o xdp.o flow_offload.o gro.o \ @@ -19,6 +19,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-y += hotdata.o +obj-y += netdev_rx_queue.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o @@ -26,6 +27,7 @@ obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o @@ -41,4 +43,7 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o -obj-$(CONFIG_NET_TEST) += gso_test.o +obj-$(CONFIG_NET_TEST) += net_test.o +obj-$(CONFIG_NET_DEVMEM) += devmem.o +obj-$(CONFIG_DEBUG_NET) += lock_debug.o +obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 6c4d90b24d46..2e538399757f 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -50,15 +50,16 @@ void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; + migrate_disable(); rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); - if (!sk_storage) { - rcu_read_unlock(); - return; - } + if (!sk_storage) + goto out; bpf_local_storage_destroy(sk_storage); +out: rcu_read_unlock(); + migrate_enable(); } static void bpf_sk_storage_map_free(struct bpf_map *map) @@ -106,7 +107,7 @@ static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, if (sock) { sdata = bpf_local_storage_update( sock->sk, (struct bpf_local_storage_map *)map, value, - map_flags, GFP_ATOMIC); + map_flags, false, GFP_ATOMIC); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } @@ -137,7 +138,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, { struct bpf_local_storage_elem *copy_selem; - copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC); if (!copy_selem) return NULL; @@ -160,6 +161,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); + migrate_disable(); rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); @@ -212,6 +214,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) out: rcu_read_unlock(); + migrate_enable(); /* In case of an error, don't free anything explicitly here, the * caller is responsible to call bpf_sk_storage_free. 
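For readability, this is approximately what bpf_sk_storage_free() reads like once the hunk above is applied: the RCU read side is now bracketed by migrate_disable()/migrate_enable() and the early return is folded into a goto. Reassembled from the flattened diff, so indentation and line breaks are approximate rather than verbatim:

	void bpf_sk_storage_free(struct sock *sk)
	{
		struct bpf_local_storage *sk_storage;

		migrate_disable();
		rcu_read_lock();
		sk_storage = rcu_dereference(sk->sk_bpf_storage);
		if (!sk_storage)
			goto out;

		bpf_local_storage_destroy(sk_storage);
	out:
		rcu_read_unlock();
		migrate_enable();
	}

The same migrate_disable()/rcu_read_lock() bracketing is applied to bpf_sk_storage_clone() in the following hunks.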
@@ -243,7 +246,7 @@ BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, refcount_inc_not_zero(&sk->sk_refcnt)) { sdata = bpf_local_storage_update( sk, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. */ @@ -352,11 +355,6 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) { - const struct btf *btf_vmlinux; - const struct btf_type *t; - const char *tname; - u32 btf_id; - if (prog->aux->dst_prog) return false; @@ -371,13 +369,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: - btf_vmlinux = bpf_get_btf_vmlinux(); - if (IS_ERR_OR_NULL(btf_vmlinux)) - return false; - btf_id = prog->aux->attach_btf_id; - t = btf_type_by_id(btf_vmlinux, btf_id); - tname = btf_name_by_offset(btf_vmlinux, t->name_off); - return !!strncmp(tname, "bpf_sk_storage", + return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: return false; @@ -496,27 +488,22 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) if (!bpf_capable()) return ERR_PTR(-EPERM); - nla_for_each_nested(nla, nla_stgs, rem) { - if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) { - if (nla_len(nla) != sizeof(u32)) - return ERR_PTR(-EINVAL); - nr_maps++; - } + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + if (nla_len(nla) != sizeof(u32)) + return ERR_PTR(-EINVAL); + nr_maps++; } diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); if (!diag) return ERR_PTR(-ENOMEM); - nla_for_each_nested(nla, nla_stgs, rem) { - struct bpf_map *map; - int map_fd; - - if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) - continue; + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + int map_fd = nla_get_u32(nla); + struct bpf_map *map = bpf_map_get(map_fd); - map_fd = nla_get_u32(nla); - map = bpf_map_get(map_fd); if (IS_ERR(map)) { err = PTR_ERR(map); goto err_free; diff --git a/net/core/datagram.c b/net/core/datagram.c index a8b625abe242..94cc4705e91d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -52,6 +52,7 @@ #include <linux/pagemap.h> #include <linux/iov_iter.h> #include <linux/indirect_call_wrapper.h> +#include <linux/crc32.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -61,7 +62,8 @@ #include <net/tcp_states.h> #include <trace/events/skb.h> #include <net/busy_poll.h> -#include <crypto/hash.h> + +#include "devmem.h" /* * Is a socket 'connection oriented' ? @@ -163,8 +165,7 @@ done: return skb; } -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) @@ -261,7 +262,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, * However, this function was correct in any case. 
8) */ spin_lock_irqsave(&queue->lock, cpu_flags); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, + skb = __skb_try_recv_from_queue(queue, flags, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) @@ -324,25 +325,6 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(skb_free_datagram); -void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) -{ - bool slow; - - if (!skb_unref(skb)) { - sk_peek_offset_bwd(sk, len); - return; - } - - slow = lock_sock_fast(sk); - sk_peek_offset_bwd(sk, len); - skb_orphan(skb); - unlock_sock_fast(sk, slow); - - /* skb is now orphaned, can be freed outside of locked section */ - __kfree_skb(skb); -} -EXPORT_SYMBOL(__skb_free_datagram_locked); - int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, @@ -426,6 +408,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, return 0; } + if (!skb_frags_readable(skb)) + goto short_copy; + /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; @@ -435,15 +420,23 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - struct page *page = skb_frag_page(frag); - u8 *vaddr = kmap(page); + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; if (copy > len) copy = len; - n = INDIRECT_CALL_1(cb, simple_copy_to_iter, - vaddr + skb_frag_off(frag) + offset - start, - copy, data, to); - kunmap(page); + + n = 0; + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_local_page(p); + n += INDIRECT_CALL_1(cb, simple_copy_to_iter, + vaddr + p_off, p_len, data, to); + kunmap_local(vaddr); + } + offset += n; if (n != copy) goto short_copy; @@ -490,41 +483,37 @@ short_copy: return 0; } -static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, - struct iov_iter *i) +#ifdef CONFIG_NET_CRC32C +static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes, + void *_crcp, struct iov_iter *i) { -#ifdef CONFIG_CRYPTO_HASH - struct ahash_request *hash = hashp; - struct scatterlist sg; + u32 *crcp = _crcp; size_t copied; copied = copy_to_iter(addr, bytes, i); - sg_init_one(&sg, addr, copied); - ahash_request_set_crypt(hash, &sg, NULL, copied); - crypto_ahash_update(hash); + *crcp = crc32c(*crcp, addr, copied); return copied; -#else - return 0; -#endif } /** - * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator - * and update a hash. + * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator + * and update a CRC32C value. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec - * @hash: hash request to update + * @crcp: pointer to CRC32C value to update + * + * Return: 0 on success, -EFAULT if there was a fault during copy. 
*/ -int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len, - struct ahash_request *hash) +int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, u32 *crcp) { return __skb_datagram_iter(skb, offset, to, len, true, - hash_and_copy_to_iter, hash); + crc32c_and_copy_to_iter, crcp); } -EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); +EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter); +#endif /* CONFIG_NET_CRC32C */ static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i) @@ -629,16 +618,13 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); -int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, - struct sk_buff *skb, struct iov_iter *from, - size_t length) +int zerocopy_fill_skb_from_iter(struct sk_buff *skb, + struct iov_iter *from, size_t length) { - int frag; - - if (msg && msg->msg_ubuf && msg->sg_from_iter) - return msg->sg_from_iter(sk, skb, from, length); + int frag = skb_shinfo(skb)->nr_frags; - frag = skb_shinfo(skb)->nr_frags; + if (!skb_frags_readable(skb)) + return -EFAULT; while (length && iov_iter_count(from)) { struct page *head, *last_head = NULL; @@ -646,7 +632,6 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, int refs, order, n = 0; size_t start; ssize_t copied; - unsigned long truesize; if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; @@ -658,17 +643,9 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, length -= copied; - truesize = PAGE_ALIGN(copied + start); skb->data_len += copied; skb->len += copied; - skb->truesize += truesize; - if (sk && sk->sk_type == SOCK_STREAM) { - sk_wmem_queued_add(sk, truesize); - if (!skb_zcopy_pure(skb)) - sk_mem_charge(sk, truesize); - } else { - refcount_add(truesize, &skb->sk->sk_wmem_alloc); - } + skb->truesize += PAGE_ALIGN(copied + start); head = compound_head(pages[n]); order = compound_order(head); @@ -711,6 +688,73 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, } return 0; } + +static int +zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, + int length, + struct net_devmem_dmabuf_binding *binding) +{ + int i = skb_shinfo(skb)->nr_frags; + size_t virt_addr, size, off; + struct net_iov *niov; + + /* Devmem filling works by taking an IOVEC from the user where the + * iov_addrs are interpreted as an offset in bytes into the dma-buf to + * send from. We do not support other iter types. 
+ */ + if (iov_iter_type(from) != ITER_IOVEC && + iov_iter_type(from) != ITER_UBUF) + return -EFAULT; + + while (length && iov_iter_count(from)) { + if (i == MAX_SKB_FRAGS) + return -EMSGSIZE; + + virt_addr = (size_t)iter_iov_addr(from); + niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size); + if (!niov) + return -EFAULT; + + size = min_t(size_t, size, length); + size = min_t(size_t, size, iter_iov_len(from)); + + get_netmem(net_iov_to_netmem(niov)); + skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off, + size, PAGE_SIZE); + iov_iter_advance(from, size); + length -= size; + i++; + } + + return 0; +} + +int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb, struct iov_iter *from, + size_t length, + struct net_devmem_dmabuf_binding *binding) +{ + unsigned long orig_size = skb->truesize; + unsigned long truesize; + int ret; + + if (msg && msg->msg_ubuf && msg->sg_from_iter) + ret = msg->sg_from_iter(skb, from, length); + else if (binding) + ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding); + else + ret = zerocopy_fill_skb_from_iter(skb, from, length); + + truesize = skb->truesize - orig_size; + if (sk && sk->sk_type == SOCK_STREAM) { + sk_wmem_queued_add(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); + } else { + refcount_add(truesize, &skb->sk->sk_wmem_alloc); + } + return ret; +} EXPORT_SYMBOL(__zerocopy_sg_from_iter); /** @@ -731,7 +775,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL); } EXPORT_SYMBOL(zerocopy_sg_from_iter); diff --git a/net/core/dev.c b/net/core/dev.c index 9a67003e49db..be97c440ecd5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -77,7 +77,9 @@ #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/isolation.h> #include <linux/sched/mm.h> +#include <linux/smpboot.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/string.h> @@ -90,6 +92,7 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> +#include <linux/ethtool_netlink.h> #include <linux/skbuff.h> #include <linux/kthread.h> #include <linux/bpf.h> @@ -103,6 +106,7 @@ #include <net/dst.h> #include <net/dst_metadata.h> #include <net/gro.h> +#include <net/netdev_queues.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> @@ -152,12 +156,16 @@ #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> +#include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <net/rps.h> +#include <linux/phy_link_topology.h> #include "dev.h" +#include "devmem.h" #include "net-sysfs.h" static DEFINE_SPINLOCK(ptype_lock); @@ -176,8 +184,6 @@ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); -static DECLARE_RWSEM(devnet_rename_sem); - static inline void dev_base_seq_inc(struct net *net) { unsigned int val = net->dev_base_seq + 1; @@ -197,37 +203,62 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock_irqsave(struct softnet_data *sd, - unsigned long 
*flags) +#ifndef CONFIG_PREEMPT_RT + +static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); + +static int __init setup_backlog_napi_threads(char *arg) +{ + static_branch_enable(&use_backlog_threads_key); + return 0; +} +early_param("thread_backlog_napi", setup_backlog_napi_threads); + +static bool use_backlog_threads(void) { - if (IS_ENABLED(CONFIG_RPS)) + return static_branch_unlikely(&use_backlog_threads_key); +} + +#else + +static bool use_backlog_threads(void) +{ + return true; +} + +#endif + +static inline void backlog_lock_irq_save(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_save(*flags); } -static inline void rps_lock_irq_disable(struct softnet_data *sd) +static inline void backlog_lock_irq_disable(struct softnet_data *sd) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irq(&sd->input_pkt_queue.lock); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_disable(); } -static inline void rps_unlock_irq_restore(struct softnet_data *sd, - unsigned long *flags) +static inline void backlog_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_restore(*flags); } -static inline void rps_unlock_irq_enable(struct softnet_data *sd) +static inline void backlog_unlock_irq_enable(struct softnet_data *sd) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irq(&sd->input_pkt_queue.lock); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_enable(); } @@ -422,14 +453,18 @@ static RAW_NOTIFIER_HEAD(netdev_chain); * queue in the local softnet handler. */ -DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = { + .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock), +}; EXPORT_PER_CPU_SYMBOL(softnet_data); /* Page_pool has a lockless array/stack to alloc/recycle pages. * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ -static DEFINE_PER_CPU_ALIGNED(struct page_pool *, system_page_pool); +DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; #ifdef CONFIG_LOCKDEP /* @@ -539,10 +574,18 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { - if (pt->type == htons(ETH_P_ALL)) - return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; - else - return pt->dev ? &pt->dev->ptype_specific : + if (pt->type == htons(ETH_P_ALL)) { + if (!pt->af_packet_net && !pt->dev) + return NULL; + + return pt->dev ? &pt->dev->ptype_all : + &pt->af_packet_net->ptype_all; + } + + if (pt->dev) + return &pt->dev->ptype_specific; + + return pt->af_packet_net ? 
&pt->af_packet_net->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } @@ -563,6 +606,9 @@ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); + if (WARN_ON_ONCE(!head)) + return; + spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); @@ -587,6 +633,9 @@ void __dev_remove_pack(struct packet_type *pt) struct list_head *head = ptype_head(pt); struct packet_type *pt1; + if (!head) + return; + spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { @@ -722,6 +771,80 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, } EXPORT_SYMBOL_GPL(dev_fill_forward_path); +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} + +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct * +netdev_napi_by_id(struct net *net, unsigned int napi_id) +{ + struct napi_struct *napi; + + napi = napi_by_id(napi_id); + if (!napi) + return NULL; + + if (WARN_ON_ONCE(!napi->dev)) + return NULL; + if (!net_eq(net, dev_net(napi->dev))) + return NULL; + + return napi; +} + +/** + * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it + * @net: the applicable net namespace + * @napi_id: ID of a NAPI of a target device + * + * Find a NAPI instance with @napi_id. Lock its device. + * The device must be in %NETREG_REGISTERED state for lookup to succeed. + * netdev_unlock() must be called to release it. + * + * Return: pointer to NAPI, its device with lock held, NULL if not found. + */ +struct napi_struct * +netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) +{ + struct napi_struct *napi; + struct net_device *dev; + + rcu_read_lock(); + napi = netdev_napi_by_id(net, napi_id); + if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) { + rcu_read_unlock(); + return NULL; + } + + dev = napi->dev; + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock(dev, net); + if (!dev) + return NULL; + + rcu_read_lock(); + napi = netdev_napi_by_id(net, napi_id); + if (napi && napi->dev != dev) + napi = NULL; + rcu_read_unlock(); + + if (!napi) + netdev_unlock(dev); + return napi; +} + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace @@ -896,21 +1019,150 @@ EXPORT_SYMBOL(netdev_get_by_index); * its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ - struct net_device *dev_get_by_napi_id(unsigned int napi_id) { struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); - if (napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi_id)) return NULL; napi = napi_by_id(napi_id); return napi ? napi->dev : NULL; } -EXPORT_SYMBOL(dev_get_by_napi_id); + +/* Release the held reference on the net_device, and if the net_device + * is still registered try to lock the instance lock. If device is being + * unregistered NULL will be returned (but the reference has been released, + * either way!) + * + * This helper is intended for locking net_device after it has been looked up + * using a lockless lookup helper. Lock prevents the instance from going away. 
+ */ +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net) +{ + netdev_lock(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + +static struct net_device * +__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net) +{ + netdev_lock_ops_compat(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock_ops_compat(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + +/** + * netdev_get_by_index_lock() - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. If a valid device + * with @ifindex is found it will be returned with netdev->lock held. + * netdev_unlock() must be called to release it. + * + * Return: pointer to a device with lock held, NULL if not found. + */ +struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock(dev, net); +} + +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock_ops_compat(dev, net); +} + +struct net_device * +netdev_xa_find_lock(struct net *net, struct net_device *dev, + unsigned long *index) +{ + if (dev) + netdev_unlock(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); +} + +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index) +{ + if (dev) + netdev_unlock_ops_compat(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock_ops_compat(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); +} + +static DEFINE_SEQLOCK(netdev_rename_lock); + +void netdev_copy_name(struct net_device *dev, char *name) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&netdev_rename_lock); + strscpy(name, dev->name, IFNAMSIZ); + } while (read_seqretry(&netdev_rename_lock, seq)); +} /** * netdev_get_name - get a netdevice name, knowing its ifindex. @@ -923,7 +1175,6 @@ int netdev_get_name(struct net *net, char *name, int ifindex) struct net_device *dev; int ret; - down_read(&devnet_rename_sem); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); @@ -932,15 +1183,20 @@ int netdev_get_name(struct net *net, char *name, int ifindex) goto out; } - strcpy(name, dev->name); + netdev_copy_name(dev, name); ret = 0; out: rcu_read_unlock(); - up_read(&devnet_rename_sem); return ret; } +static bool dev_addr_cmp(struct net_device *dev, unsigned short type, + const char *ha) +{ + return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); +} + /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace @@ -949,7 +1205,7 @@ out: * * Search for an interface by MAC address. 
Returns NULL if the device * is not found or a pointer to the device. - * The caller must hold RCU or RTNL. + * The caller must hold RCU. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * @@ -961,14 +1217,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, struct net_device *dev; for_each_netdev_rcu(net, dev) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); +/** + * dev_getbyhwaddr() - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold + * rtnl_lock. + * + * Context: rtnl_lock() must be held. + * Return: pointer to the net_device, or NULL if not found + */ +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *ha) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev_addr_cmp(dev, type, ha)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr); + struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; @@ -1159,41 +1440,27 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev, return ret < 0 ? ret : 0; } -/** - * dev_change_name - change name of a device - * @dev: device - * @newname: name (or format string) must be at least IFNAMSIZ - * - * Change name of a device, can pass format strings "eth%d". - * for wildcarding. - */ -int dev_change_name(struct net_device *dev, const char *newname) +int netif_change_name(struct net_device *dev, const char *newname) { + struct net *net = dev_net(dev); unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; - struct net *net; - - ASSERT_RTNL(); - BUG_ON(!dev_net(dev)); - - net = dev_net(dev); - down_write(&devnet_rename_sem); + ASSERT_RTNL_NET(net); - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { - up_write(&devnet_rename_sem); + if (!strncmp(newname, dev->name, IFNAMSIZ)) return 0; - } memcpy(oldname, dev->name, IFNAMSIZ); + write_seqlock_bh(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); - if (err < 0) { - up_write(&devnet_rename_sem); + write_sequnlock_bh(&netdev_rename_lock); + + if (err < 0) return err; - } if (oldname[0] && !strchr(oldname, '%')) netdev_info(dev, "renamed from %s%s\n", oldname, @@ -1205,14 +1472,13 @@ int dev_change_name(struct net_device *dev, const char *newname) rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { + write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); + write_sequnlock_bh(&netdev_rename_lock); WRITE_ONCE(dev->name_assign_type, old_assign_type); - up_write(&devnet_rename_sem); return ret; } - up_write(&devnet_rename_sem); - netdev_adjacent_rename_links(dev, oldname); netdev_name_node_del(dev->name_node); @@ -1228,8 +1494,9 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; - down_write(&devnet_rename_sem); + write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); + write_sequnlock_bh(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; @@ -1243,15 +1510,7 @@ rollback: return err; } -/** - * dev_set_alias - change ifalias of a device - * @dev: device 
- * @alias: name up to IFALIASZ - * @len: limit of bytes to copy from info - * - * Set ifalias for a device, - */ -int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +int netif_set_alias(struct net_device *dev, const char *alias, size_t len) { struct dev_ifalias *new_alias = NULL; @@ -1277,7 +1536,6 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) return len; } -EXPORT_SYMBOL(dev_set_alias); /** * dev_get_alias - get ifalias of a device @@ -1314,16 +1572,10 @@ void netdev_features_change(struct net_device *dev) } EXPORT_SYMBOL(netdev_features_change); -/** - * netdev_state_change - device changes state - * @dev: device to cause notification - * - * Called to indicate a device has changed state. This function calls - * the notifier chains for netdev_chain and sends a NEWLINK message - * to the routing socket. - */ -void netdev_state_change(struct net_device *dev) +void netif_state_change(struct net_device *dev) { + netdev_ops_assert_locked_or_invisible(dev); + if (dev->flags & IFF_UP) { struct netdev_notifier_change_info change_info = { .info.dev = dev, @@ -1334,7 +1586,6 @@ void netdev_state_change(struct net_device *dev) rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); } } -EXPORT_SYMBOL(netdev_state_change); /** * __netdev_notify_peers - notify network peers about existence of @dev, @@ -1423,6 +1674,8 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) set_bit(__LINK_STATE_START, &dev->state); + netdev_ops_assert_locked(dev); + if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); @@ -1434,7 +1687,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { - dev->flags |= IFF_UP; + netif_set_up(dev, true); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1443,20 +1696,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) return ret; } -/** - * dev_open - prepare an interface for use. - * @dev: device to open - * @extack: netlink extended ack - * - * Takes a device from down to up state. The device's private open - * function is invoked and then the multicast lists are loaded. Finally - * the device is moved into the up state and a %NETDEV_UP message is - * sent to the netdev notifier chain. - * - * Calling this function on an active interface is a nop. On a failure - * a negative errno code is returned. - */ -int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) +int netif_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; @@ -1472,7 +1712,6 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) return ret; } -EXPORT_SYMBOL(dev_open); static void __dev_close_many(struct list_head *head) { @@ -1510,10 +1749,13 @@ static void __dev_close_many(struct list_head *head) * We allow it to be called even after a DETACH hot-plug * event. */ + + netdev_ops_assert_locked(dev); + if (ops->ndo_stop) ops->ndo_stop(dev); - dev->flags &= ~IFF_UP; + netif_set_up(dev, false); netpoll_poll_enable(dev); } } @@ -1547,16 +1789,7 @@ void dev_close_many(struct list_head *head, bool unlink) } EXPORT_SYMBOL(dev_close_many); -/** - * dev_close - shutdown an interface. - * @dev: device to shutdown - * - * This function moves an active device into down state. A - * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. 
The device - * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier - * chain. - */ -void dev_close(struct net_device *dev) +void netif_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); @@ -1566,18 +1799,9 @@ void dev_close(struct net_device *dev) list_del(&single); } } -EXPORT_SYMBOL(dev_close); - +EXPORT_SYMBOL(netif_close); -/** - * dev_disable_lro - disable Large Receive Offload on a device - * @dev: device - * - * Disable Large Receive Offload (LRO) on a net device. Must be - * called under RTNL. This is needed if received packets may be - * forwarded to another interface. - */ -void dev_disable_lro(struct net_device *dev) +void netif_disable_lro(struct net_device *dev) { struct net_device *lower_dev; struct list_head *iter; @@ -1588,10 +1812,13 @@ void dev_disable_lro(struct net_device *dev) if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); - netdev_for_each_lower_dev(dev, lower_dev, iter) - dev_disable_lro(lower_dev); + netdev_for_each_lower_dev(dev, lower_dev, iter) { + netdev_lock_ops(lower_dev); + netif_disable_lro(lower_dev); + netdev_unlock_ops(lower_dev); + } } -EXPORT_SYMBOL(dev_disable_lro); +EXPORT_IPV6_MOD(netif_disable_lro); /** * dev_disable_gro_hw - disable HW Generic Receive Offload on a device @@ -1679,7 +1906,9 @@ static int call_netdevice_register_net_notifiers(struct notifier_block *nb, int err; for_each_netdev(net, dev) { + netdev_lock_ops(dev); err = call_netdevice_register_notifiers(nb, dev); + netdev_unlock_ops(dev); if (err) goto rollback; } @@ -1723,14 +1952,19 @@ int register_netdevice_notifier(struct notifier_block *nb) /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); + + /* When RTNL is removed, we need protection for netdev_chain. 
*/ rtnl_lock(); + err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { + __rtnl_net_lock(net); err = call_netdevice_register_net_notifiers(nb, net); + __rtnl_net_unlock(net); if (err) goto rollback; } @@ -1741,8 +1975,11 @@ unlock: return err; rollback: - for_each_net_continue_reverse(net) + for_each_net_continue_reverse(net) { + __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); + __rtnl_net_unlock(net); + } raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; @@ -1775,8 +2012,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb) if (err) goto unlock; - for_each_net(net) + for_each_net(net) { + __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); + __rtnl_net_unlock(net); + } unlock: rtnl_unlock(); @@ -1840,9 +2080,10 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; - rtnl_lock(); + rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); - rtnl_unlock(); + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(register_netdevice_notifier_net); @@ -1868,9 +2109,10 @@ int unregister_netdevice_notifier_net(struct net *net, { int err; - rtnl_lock(); + rtnl_net_lock(net); err = __unregister_netdevice_notifier_net(net, nb); - rtnl_unlock(); + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_net); @@ -1883,19 +2125,56 @@ static void __move_netdevice_notifier_net(struct net *src_net, __register_netdevice_notifier_net(dst_net, nb, true); } +static void rtnl_net_dev_lock(struct net_device *dev) +{ + bool again; + + do { + struct net *net; + + again = false; + + /* netns might be being dismantled. */ + rcu_read_lock(); + net = dev_net_rcu(dev); + net_passive_inc(net); + rcu_read_unlock(); + + rtnl_net_lock(net); + +#ifdef CONFIG_NET_NS + /* dev might have been moved to another netns. 
*/ + if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) { + rtnl_net_unlock(net); + net_passive_dec(net); + again = true; + } +#endif + } while (again); +} + +static void rtnl_net_dev_unlock(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + rtnl_net_unlock(net); + net_passive_dec(net); +} + int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { int err; - rtnl_lock(); + rtnl_net_dev_lock(dev); err = __register_netdevice_notifier_net(dev_net(dev), nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } - rtnl_unlock(); + rtnl_net_dev_unlock(dev); + return err; } EXPORT_SYMBOL(register_netdevice_notifier_dev_net); @@ -1906,10 +2185,11 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev, { int err; - rtnl_lock(); + rtnl_net_dev_lock(dev); list_del(&nn->list); err = __unregister_netdevice_notifier_net(dev_net(dev), nb); - rtnl_unlock(); + rtnl_net_dev_unlock(dev); + return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); @@ -2057,6 +2337,11 @@ void net_dec_egress_queue(void) EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif +#ifdef CONFIG_NET_CLS_ACT +DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); +EXPORT_SYMBOL(tcf_sw_enabled_key); +#endif + DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL @@ -2113,7 +2398,7 @@ EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; - skb->mono_delivery_time = 0; + skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); } @@ -2220,16 +2505,21 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) } /** - * dev_nit_active - return true if any network interface taps are in use + * dev_nit_active_rcu - return true if any network interface taps are in use + * + * The caller must hold the RCU lock * * @dev: network device to check for the presence of taps */ -bool dev_nit_active(struct net_device *dev) +bool dev_nit_active_rcu(const struct net_device *dev) { - return !list_empty(&net_hotdata.ptype_all) || + /* Callers may hold either RCU or RCU BH lock */ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + return !list_empty(&dev_net(dev)->ptype_all) || !list_empty(&dev->ptype_all); } -EXPORT_SYMBOL_GPL(dev_nit_active); +EXPORT_SYMBOL_GPL(dev_nit_active_rcu); /* * Support routine. 
Sends outgoing frames to any network @@ -2238,11 +2528,12 @@ EXPORT_SYMBOL_GPL(dev_nit_active); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { - struct list_head *ptype_list = &net_hotdata.ptype_all; struct packet_type *ptype, *pt_prev = NULL; + struct list_head *ptype_list; struct sk_buff *skb2 = NULL; rcu_read_lock(); + ptype_list = &dev_net_rcu(dev)->ptype_all; again: list_for_each_entry_rcu(ptype, ptype_list, list) { if (READ_ONCE(ptype->ignore_outgoing)) @@ -2286,7 +2577,7 @@ again: pt_prev = ptype; } - if (ptype_list == &net_hotdata.ptype_all) { + if (ptype_list != &dev->ptype_all) { ptype_list = &dev->ptype_all; goto again; } @@ -2889,6 +3180,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); + netdev_ops_assert_locked(dev); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); @@ -2898,6 +3190,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->num_tc) netif_setup_tc(dev, txq); + net_shaper_set_real_num_tx_queues(dev, txq); + dev_qdisc_change_real_num_tx(dev, txq); dev->real_num_tx_queues = txq; @@ -2917,7 +3211,6 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) } EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device @@ -2937,6 +3230,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); + netdev_ops_assert_locked(dev); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); @@ -2948,7 +3242,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); -#endif /** * netif_set_real_num_queues - set actual number of RX and TX queues used @@ -3180,7 +3473,7 @@ void netif_device_attach(struct net_device *dev) if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); - __netdev_watchdog_up(dev); + netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); @@ -3261,6 +3554,10 @@ int skb_checksum_help(struct sk_buff *skb) return -EINVAL; } + if (!skb_frags_readable(skb)) { + return -EFAULT; + } + /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. 
*/ @@ -3299,9 +3596,10 @@ out: } EXPORT_SYMBOL(skb_checksum_help); +#ifdef CONFIG_NET_CRC32C int skb_crc32c_csum_help(struct sk_buff *skb) { - __le32 crc32c_csum; + u32 crc; int ret = 0, offset, start; if (skb->ip_summed != CHECKSUM_PARTIAL) @@ -3329,14 +3627,14 @@ int skb_crc32c_csum_help(struct sk_buff *skb) if (ret) goto out; - crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, - skb->len - start, ~(__u32)0, - crc32c_csum_stub)); - *(__le32 *)(skb->data + offset) = crc32c_csum; + crc = ~skb_crc32c(skb, start, skb->len - start, ~0); + *(__le32 *)(skb->data + offset) = cpu_to_le32(crc); skb_reset_csum_not_inet(skb); out: return ret; } +EXPORT_SYMBOL(skb_crc32c_csum_help); +#endif /* CONFIG_NET_CRC32C */ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { @@ -3382,8 +3680,9 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = skb_frag_page(frag); - if (PageHighMem(skb_frag_page(frag))) + if (page && PageHighMem(page)) return 1; } } @@ -3455,7 +3754,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; - if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size))) + if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb))) return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { @@ -3523,7 +3822,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, unsigned int len; int rc; - if (dev_nit_active(dev)) + if (dev_nit_active_rcu(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; @@ -3582,6 +3881,11 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, return 0; if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { + if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) + goto sw_checksum; + switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): @@ -3589,14 +3893,48 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, } } +sw_checksum: return skb_checksum_help(skb); } EXPORT_SYMBOL(skb_csum_hwoffload_help); +static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, + struct net_device *dev) +{ + struct skb_shared_info *shinfo; + struct net_iov *niov; + + if (likely(skb_frags_readable(skb))) + goto out; + + if (!dev->netmem_tx) + goto out_free; + + shinfo = skb_shinfo(skb); + + if (shinfo->nr_frags > 0) { + niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); + if (net_is_devmem_iov(niov) && + net_devmem_iov_binding(niov)->dev != dev) + goto out_free; + } + +out: + return skb; + +out_free: + kfree_skb(skb); + return NULL; +} + static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; + skb = validate_xmit_unreadable_skb(skb, dev); + if (unlikely(!skb)) + goto out_null; + features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) @@ -3656,7 +3994,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d next = skb->next; skb_mark_not_on_list(skb); - /* in case skb wont be segmented, point to itself */ + /* in case skb won't be segmented, point to itself */ skb->prev = skb; skb = validate_xmit_skb(skb, dev, again); @@ -3701,7 +4039,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) 
sizeof(_tcphdr), &_tcphdr); if (likely(th)) hdr_len += __tcp_hdrlen(th); - } else { + } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { struct udphdr _udphdr; if (skb_header_pointer(skb, hdr_len, @@ -3709,10 +4047,14 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) hdr_len += sizeof(struct udphdr); } - if (shinfo->gso_type & SKB_GSO_DODGY) - gso_segs = DIV_ROUND_UP(skb->len - hdr_len, - shinfo->gso_size); + if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) { + int payload = skb->len - hdr_len; + /* Malicious packet. */ + if (payload <= 0) + return; + gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); + } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } } @@ -3775,6 +4117,10 @@ no_lock_out: return rc; } + if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { + kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); + return NET_XMIT_DROP; + } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -3814,7 +4160,9 @@ no_lock_out: qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { + WRITE_ONCE(q->owner, smp_processor_id()); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); @@ -3887,6 +4235,7 @@ netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); } +#ifndef CONFIG_PREEMPT_RT static bool netdev_xmit_txqueue_skipped(void) { return __this_cpu_read(softnet_data.xmit.skip_txqueue); @@ -3897,6 +4246,19 @@ void netdev_xmit_skip_txqueue(bool skip) __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); + +#else +static bool netdev_xmit_txqueue_skipped(void) +{ + return current->net_xmit.skip_txqueue; +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + current->net_xmit.skip_txqueue = skip; +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_NET_XGRESS @@ -3911,6 +4273,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, if (!miniq) return ret; + /* Global bypass */ + if (!static_branch_likely(&tcf_sw_enabled_key)) + return ret; + + /* Block-wise bypass */ + if (tcf_block_bypass_sw(miniq->block)) + return ret; + tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); @@ -3971,10 +4341,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, { struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; + + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; @@ -4003,10 +4376,12 @@ ingress_verdict: break; } *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_RX_DROP; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: @@ -4016,8 +4391,10 @@ ingress_verdict: fallthrough; case TC_ACT_CONSUMED: *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; } + bpf_net_ctx_clear(bpf_net_ctx); return skb; } @@ -4027,11 +4404,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); enum skb_drop_reason drop_reason = 
SKB_DROP_REASON_TC_EGRESS; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was * already set by the caller. */ @@ -4047,10 +4427,12 @@ egress_verdict: /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_XMIT_DROP; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: @@ -4060,8 +4442,10 @@ egress_verdict: fallthrough; case TC_ACT_CONSUMED: *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; } + bpf_net_ctx_clear(bpf_net_ctx); return skb; } @@ -4157,13 +4541,6 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, } EXPORT_SYMBOL(dev_pick_tx_zero); -u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev) -{ - return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; -} -EXPORT_SYMBOL(dev_pick_tx_cpu_id); - u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { @@ -4251,7 +4628,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_reset_mac_header(skb); skb_assert_len(skb); - if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + if (unlikely(skb_shinfo(skb)->tx_flags & + (SKBTX_SCHED_TSTAMP | SKBTX_BPF))) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also @@ -4404,8 +4782,8 @@ EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ +static DEFINE_PER_CPU(struct task_struct *, backlog_napi); -unsigned int sysctl_skb_defer_max __read_mostly = 64; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ @@ -4427,25 +4805,24 @@ static inline void ____napi_schedule(struct softnet_data *sd, */ thread = READ_ONCE(napi->thread); if (thread) { - /* Avoid doing set_bit() if the thread is in - * INTERRUPTIBLE state, cause napi_thread_wait() - * makes sure to proceed with napi polling - * if the thread is explicitly woken from here. - */ - if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) - set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); + if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) + goto use_local_napi; + + set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } } +use_local_napi: + DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list)); list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() * we have to raise NET_RX_SOFTIRQ. 
*/ if (!sd->in_net_rx_action) - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS @@ -4455,17 +4832,23 @@ EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); +static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) +{ + return hash_32(hash, flow_table->log); +} + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { if (next_cpu < nr_cpu_ids) { + u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; - u32 flow_id; u16 rxq_index; + u32 flow_id; int rc; /* Should we steer this flow to a different hardware queue? */ @@ -4480,23 +4863,23 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = skb_get_hash(skb) & flow_table->mask; + flow_id = rfs_slot(skb_get_hash(skb), flow_table); rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; old_rflow = rflow; rflow = &flow_table->flows[flow_id]; - rflow->filter = rc; - if (old_rflow->filter == rflow->filter) - old_rflow->filter = RPS_NO_FILTER; + WRITE_ONCE(rflow->filter, rc); + if (old_rflow->filter == rc) + WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: #endif - rflow->last_qtail = - per_cpu(softnet_data, next_cpu).input_queue_head; + head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); + rps_input_queue_tail_save(&rflow->last_qtail, head); } - rflow->cpu = next_cpu; + WRITE_ONCE(rflow->cpu, next_cpu); return rflow; } @@ -4559,7 +4942,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ - rflow = &flow_table->flows[hash & flow_table->mask]; + rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; tcpu = rflow->cpu; /* @@ -4575,7 +4958,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || - ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); @@ -4626,13 +5009,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (flow_table && flow_id <= flow_table->mask) { + if (flow_table && flow_id < (1UL << flow_table->log)) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu < nr_cpu_ids && - ((int)(per_cpu(softnet_data, cpu).input_queue_head - - rflow->last_qtail) < - (int)(10 * flow_table->mask))) + if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && + ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - + READ_ONCE(rflow->last_qtail)) < + (int)(10 << flow_table->log))) expire = false; } rcu_read_unlock(); @@ -4648,7 +5031,8 @@ static void rps_trigger_softirq(void *data) struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); - sd->received_rps++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->received_rps, sd->received_rps + 1); } #endif /* CONFIG_RPS */ @@ -4678,6 +5062,11 @@ static void napi_schedule_rps(struct softnet_data *sd) #ifdef CONFIG_RPS 
if (sd != mysd) { + if (use_backlog_threads()) { + __napi_schedule_irqoff(&sd->backlog); + return; + } + sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; @@ -4692,6 +5081,23 @@ static void napi_schedule_rps(struct softnet_data *sd) __napi_schedule_irqoff(&mysd->backlog); } +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +{ + unsigned long flags; + + if (use_backlog_threads()) { + backlog_lock_irq_save(sd, &flags); + + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + __napi_schedule_irqoff(&sd->backlog); + + backlog_unlock_irq_restore(sd, &flags); + + } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { + smp_call_function_single_async(cpu, &sd->defer_csd); + } +} + #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif @@ -4711,7 +5117,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { - new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); + new_flow = hash_32(skb_get_hash(skb), fl->log_buckets); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; @@ -4722,7 +5128,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { - fl->count++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(fl->count, fl->count + 1); rcu_read_unlock(); return true; } @@ -4743,37 +5150,45 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, struct softnet_data *sd; unsigned long flags; unsigned int qlen; + int max_backlog; + u32 tail; + + reason = SKB_DROP_REASON_DEV_READY; + if (!netif_running(skb->dev)) + goto bad_dev; - reason = SKB_DROP_REASON_NOT_SPECIFIED; + reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); - rps_lock_irqsave(sd, &flags); - if (!netif_running(skb->dev)) - goto drop; + qlen = skb_queue_len_lockless(&sd->input_pkt_queue); + max_backlog = READ_ONCE(net_hotdata.max_backlog); + if (unlikely(qlen > max_backlog)) + goto cpu_backlog_drop; + backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= READ_ONCE(net_hotdata.max_backlog) && - !skb_flow_limit(skb, qlen)) { - if (qlen) { -enqueue: - __skb_queue_tail(&sd->input_pkt_queue, skb); - input_queue_tail_incr_save(sd, qtail); - rps_unlock_irq_restore(sd, &flags); - return NET_RX_SUCCESS; - } - - /* Schedule NAPI for backlog device - * We can use non atomic operation since we own the queue lock - */ - if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) - napi_schedule_rps(sd); - goto enqueue; + if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { + if (!qlen) { + /* Schedule NAPI for backlog device. We can use + * non atomic operation as we own the queue lock. 
+ */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, + &sd->backlog.state)) + napi_schedule_rps(sd); + } + __skb_queue_tail(&sd->input_pkt_queue, skb); + tail = rps_input_queue_tail_incr(sd); + backlog_unlock_irq_restore(sd, &flags); + + /* save the tail outside of the critical section */ + rps_input_queue_tail_save(qtail, tail); + return NET_RX_SUCCESS; } - reason = SKB_DROP_REASON_CPU_BACKLOG; -drop: - sd->dropped++; - rps_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, &flags); +cpu_backlog_drop: + atomic_inc(&sd->dropped); +bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; @@ -4803,7 +5218,7 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; @@ -4905,12 +5320,15 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, } static int -netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) { struct sk_buff *skb = *pskb; int err, hroom, troom; - if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) + local_lock_nested_bh(&system_page_pool.bh_lock); + err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog); + local_unlock_nested_bh(&system_page_pool.bh_lock); + if (!err) return 0; /* In case we have to go down the path and also linearize, @@ -4929,7 +5347,7 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct sk_buff *skb = *pskb; u32 mac_len, act = XDP_DROP; @@ -4982,7 +5400,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue. 
*/ -void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -5007,13 +5425,16 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { @@ -5027,11 +5448,14 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) generic_xdp_tx(*pskb, xdp_prog); break; } + bpf_net_ctx_clear(bpf_net_ctx); return XDP_DROP; } + bpf_net_ctx_clear(bpf_net_ctx); } return XDP_PASS; out_redir: + bpf_net_ctx_clear(bpf_net_ctx); kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } @@ -5125,7 +5549,7 @@ int netif_rx(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx); -static __latent_entropy void net_tx_action(struct softirq_action *h) +static __latent_entropy void net_tx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -5147,7 +5571,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) trace_consume_skb(skb, net_tx_action); else trace_kfree_skb(skb, net_tx_action, - get_kfree_skb_cb(skb)->reason); + get_kfree_skb_cb(skb)->reason, NULL); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); @@ -5340,8 +5764,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, orig_dev = skb->dev; skb_reset_network_header(skb); +#if !defined(CONFIG_DEBUG_NET) + /* We plan to no longer reset the transport header here. + * Give some time to fuzzers and dev build to catch bugs + * in network stacks. + */ if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); +#endif skb_reset_mac_len(skb); pt_prev = NULL; @@ -5377,7 +5807,8 @@ another_round: if (pfmemalloc) goto skip_taps; - list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { + list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all, + list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -5489,6 +5920,14 @@ check_vlan_id: deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); + + /* orig_dev and skb->dev could belong to different netns; + * Even in such case we need to traverse only the list + * coming from skb->dev, as the ptype owner (packet socket) + * will use dev_net(skb->dev) to do namespace filtering. 
+ */ + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &dev_net_rcu(skb->dev)->ptype_specific); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, @@ -5602,10 +6041,9 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo struct packet_type *pt_curr = NULL; /* Current (common) orig_dev of sublist */ struct net_device *od_curr = NULL; - struct list_head sublist; struct sk_buff *skb, *next; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; @@ -5700,7 +6138,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { static_branch_inc(&generic_xdp_needed_key); - dev_disable_lro(dev); + netif_disable_lro(dev); dev_disable_gro_hw(dev); } break; @@ -5743,9 +6181,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb) void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); @@ -5827,35 +6264,39 @@ void netif_receive_skb_list(struct list_head *head) } EXPORT_SYMBOL(netif_receive_skb_list); -static DEFINE_PER_CPU(struct work_struct, flush_works); - /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; + struct sk_buff_head list; struct softnet_data *sd; + __skb_queue_head_init(&list); local_bh_disable(); sd = this_cpu_ptr(&softnet_data); - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); - dev_kfree_skb_irq(skb); - input_queue_head_incr(sd); + __skb_queue_tail(&list, skb); + rps_input_queue_head_incr(sd); } } - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); - kfree_skb(skb); - input_queue_head_incr(sd); + __skb_queue_tail(&list, skb); + rps_input_queue_head_incr(sd); } } + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); + + __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY); } static bool flush_required(int cpu) @@ -5864,14 +6305,14 @@ static bool flush_required(int cpu) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); return do_flush; #endif @@ -5883,36 +6324,54 @@ static bool flush_required(int cpu) return true; } +struct flush_backlogs { + cpumask_t flush_cpus; + struct work_struct w[]; +}; + +static struct flush_backlogs *flush_backlogs_alloc(void) +{ + return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids), + 
GFP_KERNEL); +} + +static struct flush_backlogs *flush_backlogs_fallback; +static DEFINE_MUTEX(flush_backlogs_mutex); + static void flush_all_backlogs(void) { - static cpumask_t flush_cpus; + struct flush_backlogs *ptr = flush_backlogs_alloc(); unsigned int cpu; - /* since we are under rtnl lock protection we can use static data - * for the cpumask and avoid allocating on stack the possibly - * large mask - */ - ASSERT_RTNL(); + if (!ptr) { + mutex_lock(&flush_backlogs_mutex); + ptr = flush_backlogs_fallback; + } + cpumask_clear(&ptr->flush_cpus); cpus_read_lock(); - cpumask_clear(&flush_cpus); for_each_online_cpu(cpu) { if (flush_required(cpu)) { - queue_work_on(cpu, system_highpri_wq, - per_cpu_ptr(&flush_works, cpu)); - cpumask_set_cpu(cpu, &flush_cpus); + INIT_WORK(&ptr->w[cpu], flush_backlog); + queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]); + __cpumask_set_cpu(cpu, &ptr->flush_cpus); } } /* we can have in flight packet[s] on the cpus we are not flushing, * synchronize_net() in unregister_netdevice_many() will take care of - * them + * them. */ - for_each_cpu(cpu, &flush_cpus) - flush_work(per_cpu_ptr(&flush_works, cpu)); + for_each_cpu(cpu, &ptr->flush_cpus) + flush_work(&ptr->w[cpu]); cpus_read_unlock(); + + if (ptr != flush_backlogs_fallback) + kfree(ptr); + else + mutex_unlock(&flush_backlogs_mutex); } static void net_rps_send_ipi(struct softnet_data *remsd) @@ -5937,7 +6396,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; - if (remsd) { + if (!use_backlog_threads() && remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); @@ -5952,7 +6411,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS - return sd->rps_ipi_list != NULL; + return !use_backlog_threads() && sd->rps_ipi_list; #else return false; #endif @@ -5976,17 +6435,22 @@ static int process_backlog(struct napi_struct *napi, int quota) while (again) { struct sk_buff *skb; + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); while ((skb = __skb_dequeue(&sd->process_queue))) { + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); - input_queue_head_incr(sd); - if (++work >= quota) + if (++work >= quota) { + rps_input_queue_head_add(sd, work); return work; + } + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); } + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). @@ -5996,15 +6460,19 @@ static int process_backlog(struct napi_struct *napi, int quota) * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. 
*/ - napi->state = 0; + napi->state &= NAPIF_STATE_THREADED; again = false; } else { + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); } - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); } + if (work) + rps_input_queue_head_add(sd, work); return work; } @@ -6092,25 +6560,24 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; if (work_done) { - if (n->gro_bitmask) - timeout = READ_ONCE(n->dev->gro_flush_timeout); - n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); + if (n->gro.bitmask) + timeout = napi_get_gro_flush_timeout(n); + n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; - timeout = READ_ONCE(n->dev->gro_flush_timeout); + timeout = napi_get_gro_flush_timeout(n); if (timeout) ret = false; } - if (n->gro_bitmask) { - /* When the NAPI instance uses a timeout and keeps postponing - * it, we need to bound somehow the time packets are kept in - * the GRO layer - */ - napi_gro_flush(n, !!timeout); - } - gro_normal_list(n); + /* + * When the NAPI instance uses a timeout and keeps postponing + * it, we need to bound somehow the time packets are kept in + * the GRO layer. + */ + gro_flush(&n->gro, !!timeout); + gro_normal_list(&n->gro); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ @@ -6148,19 +6615,6 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -/* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) -{ - unsigned int hash = napi_id % HASH_SIZE(napi_hash); - struct napi_struct *napi; - - hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) - if (napi->napi_id == napi_id) - return napi; - - return NULL; -} - static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; @@ -6187,19 +6641,15 @@ static void skb_defer_free_flush(struct softnet_data *sd) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { - gro_normal_list(napi); + gro_normal_list(&napi->gro); __napi_schedule(napi); return; } - if (napi->gro_bitmask) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(napi, HZ >= 1000); - } + /* Flush too old packets. 
If HZ < 1000, flush all packets */ + gro_flush(&napi->gro, HZ >= 1000); + gro_normal_list(&napi->gro); - gro_normal_list(napi); clear_bit(NAPI_STATE_SCHED, &napi->state); } @@ -6211,6 +6661,7 @@ enum { static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; bool skip_schedule = false; unsigned long timeout; int rc; @@ -6228,10 +6679,11 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (flags & NAPI_F_PREFER_BUSY_POLL) { - napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); - timeout = READ_ONCE(napi->dev->gro_flush_timeout); + napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); + timeout = napi_get_gro_flush_timeout(napi); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); skip_schedule = true; @@ -6250,6 +6702,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, netpoll_poll_unlock(have_poll_lock); if (rc == budget) __busy_poll_stop(napi, skip_schedule); + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } @@ -6259,6 +6712,7 @@ static void __napi_busy_loop(unsigned int napi_id, { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; void *have_poll_lock = NULL; struct napi_struct *napi; @@ -6277,6 +6731,7 @@ restart: int work = 0; local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); @@ -6301,12 +6756,13 @@ restart: } work = napi_poll(napi, budget); trace_napi_poll(napi, work, budget); - gro_normal_list(napi); + gro_normal_list(&napi->gro); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); skb_defer_free_flush(this_cpu_ptr(&softnet_data)); + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) @@ -6358,26 +6814,84 @@ void napi_busy_loop(unsigned int napi_id, } EXPORT_SYMBOL(napi_busy_loop); +void napi_suspend_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + unsigned long timeout = napi_get_irq_suspend_timeout(napi); + + if (timeout) + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + } + rcu_read_unlock(); +} + +void napi_resume_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + /* If irq_suspend_timeout is set to 0 between the call to + * napi_suspend_irqs and now, the original value still + * determines the safety timeout as intended and napi_watchdog + * will resume irq processing. 
+ */ + if (napi_get_irq_suspend_timeout(napi)) { + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); + } + } + rcu_read_unlock(); +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ +static void __napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) +{ + napi->gro.cached_napi_id = napi_id; + + WRITE_ONCE(napi->napi_id, napi_id); + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); +} + +static void napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) +{ + unsigned long flags; + + spin_lock_irqsave(&napi_hash_lock, flags); + WARN_ON_ONCE(napi_by_id(napi_id)); + __napi_hash_add_with_id(napi, napi_id); + spin_unlock_irqrestore(&napi_hash_lock, flags); +} + static void napi_hash_add(struct napi_struct *napi) { + unsigned long flags; + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return; - spin_lock(&napi_hash_lock); + spin_lock_irqsave(&napi_hash_lock, flags); /* 0..NR_CPUS range is reserved for sender_cpu use */ do { - if (unlikely(++napi_gen_id < MIN_NAPI_ID)) + if (unlikely(!napi_id_valid(++napi_gen_id))) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); - napi->napi_id = napi_gen_id; - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + __napi_hash_add_with_id(napi, napi_gen_id); - spin_unlock(&napi_hash_lock); + spin_unlock_irqrestore(&napi_hash_lock, flags); } /* Warning : caller is responsible to make sure rcu grace period @@ -6385,11 +6899,13 @@ static void napi_hash_add(struct napi_struct *napi) */ static void napi_hash_del(struct napi_struct *napi) { - spin_lock(&napi_hash_lock); + unsigned long flags; + + spin_lock_irqsave(&napi_hash_lock, flags); hlist_del_init_rcu(&napi->napi_hash_node); - spin_unlock(&napi_hash_lock); + spin_unlock_irqrestore(&napi_hash_lock, flags); } static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) @@ -6410,22 +6926,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void init_gro_hash(struct napi_struct *napi) -{ - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - INIT_LIST_HEAD(&napi->gro_hash[i].list); - napi->gro_hash[i].count = 0; - } - napi->gro_bitmask = 0; -} - int dev_set_threaded(struct net_device *dev, bool threaded) { struct napi_struct *napi; int err = 0; + netdev_assert_locked_or_invisible(dev); + if (dev->threaded == threaded) return 0; @@ -6441,7 +6948,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } } - dev->threaded = threaded; + WRITE_ONCE(dev->threaded, threaded); /* Make sure kthread is created before THREADED bit * is set. 
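[Illustrative sketch, not part of the diff above: the dev_set_threaded() hunk now calls netdev_assert_locked_or_invisible(), so a caller must either hold the per-device instance lock or act on a device that is not yet registered. The snippet below shows the second case under those assumptions; foo_priv and foo_probe() are hypothetical names.]

#include <linux/netdevice.h>

struct foo_priv {
	struct net_device *netdev;
};

/* Opt into threaded NAPI before registration: the device is still
 * "invisible", so the netdev_assert_locked_or_invisible() check in
 * dev_set_threaded() is satisfied without taking netdev_lock().
 * NAPI kthreads are then created by netif_napi_add() as instances
 * are registered with dev->threaded already set.
 */
static int foo_probe(struct foo_priv *priv)
{
	int err;

	err = dev_set_threaded(priv->netdev, true);
	if (err)
		return err;

	return register_netdev(priv->netdev);
}
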
@@ -6480,8 +6987,7 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, if (WARN_ON_ONCE(napi && !napi->dev)) return; - if (dev->reg_state >= NETREG_REGISTERED) - ASSERT_RTNL(); + netdev_ops_assert_locked_or_invisible(dev); switch (type) { case NETDEV_QUEUE_TYPE_RX: @@ -6498,20 +7004,255 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, } EXPORT_SYMBOL(netif_queue_set_napi); -void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) +static void +netif_napi_irq_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) { + struct napi_struct *napi = + container_of(notify, struct napi_struct, notify); +#ifdef CONFIG_RFS_ACCEL + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + int err; +#endif + + if (napi->config && napi->dev->irq_affinity_auto) + cpumask_copy(&napi->config->affinity_mask, mask); + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); + if (err) + netdev_warn(napi->dev, "RMAP update failed (%d)\n", + err); + } +#endif +} + +#ifdef CONFIG_RFS_ACCEL +static void netif_napi_affinity_release(struct kref *ref) +{ + struct napi_struct *napi = + container_of(ref, struct napi_struct, notify.kref); + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + + netdev_assert_locked(napi->dev); + WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, + &napi->state)); + + if (!napi->dev->rx_cpu_rmap_auto) + return; + rmap->obj[napi->napi_rmap_idx] = NULL; + napi->napi_rmap_idx = -1; + cpu_rmap_put(rmap); +} + +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + if (dev->rx_cpu_rmap_auto) + return 0; + + dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs); + if (!dev->rx_cpu_rmap) + return -ENOMEM; + + dev->rx_cpu_rmap_auto = true; + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); + +static void netif_del_cpu_rmap(struct net_device *dev) +{ + struct cpu_rmap *rmap = dev->rx_cpu_rmap; + + if (!dev->rx_cpu_rmap_auto) + return; + + /* Free the rmap */ + cpu_rmap_put(rmap); + dev->rx_cpu_rmap = NULL; + dev->rx_cpu_rmap_auto = false; +} + +#else +static void netif_napi_affinity_release(struct kref *ref) +{ +} + +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); + +static void netif_del_cpu_rmap(struct net_device *dev) +{ +} +#endif + +void netif_set_affinity_auto(struct net_device *dev) +{ + unsigned int i, maxqs, numa; + + maxqs = max(dev->num_tx_queues, dev->num_rx_queues); + numa = dev_to_node(&dev->dev); + + for (i = 0; i < maxqs; i++) + cpumask_set_cpu(cpumask_local_spread(i, numa), + &dev->napi_config[i].affinity_mask); + + dev->irq_affinity_auto = true; +} +EXPORT_SYMBOL(netif_set_affinity_auto); + +void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) +{ + int rc; + + netdev_assert_locked_or_invisible(napi->dev); + + if (napi->irq == irq) + return; + + /* Remove existing resources */ + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + + napi->irq = irq; + if (irq < 0 || + (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto)) + return; + + /* Abort for buggy drivers */ + if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config)) + return; + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi); + if (rc < 0) + return; + + 
cpu_rmap_get(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = rc; + } +#endif + + /* Use core IRQ notifier */ + napi->notify.notify = netif_napi_irq_notify; + napi->notify.release = netif_napi_affinity_release; + rc = irq_set_affinity_notifier(irq, &napi->notify); + if (rc) { + netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", + rc); + goto put_rmap; + } + + set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); + return; + +put_rmap: +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL; + cpu_rmap_put(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = -1; + } +#endif + napi->notify.notify = NULL; + napi->notify.release = NULL; +} +EXPORT_SYMBOL(netif_napi_set_irq_locked); + +static void napi_restore_config(struct napi_struct *n) +{ + n->defer_hard_irqs = n->config->defer_hard_irqs; + n->gro_flush_timeout = n->config->gro_flush_timeout; + n->irq_suspend_timeout = n->config->irq_suspend_timeout; + + if (n->dev->irq_affinity_auto && + test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state)) + irq_set_affinity(n->irq, &n->config->affinity_mask); + + /* a NAPI ID might be stored in the config, if so use it. if not, use + * napi_hash_add to generate one for us. + */ + if (n->config->napi_id) { + napi_hash_add_with_id(n, n->config->napi_id); + } else { + napi_hash_add(n); + n->config->napi_id = n->napi_id; + } +} + +static void napi_save_config(struct napi_struct *n) +{ + n->config->defer_hard_irqs = n->defer_hard_irqs; + n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->irq_suspend_timeout = n->irq_suspend_timeout; + napi_hash_del(n); +} + +/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will + * inherit an existing ID try to insert it at the right position. + */ +static void +netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) +{ + unsigned int new_id, pos_id; + struct list_head *higher; + struct napi_struct *pos; + + new_id = UINT_MAX; + if (napi->config && napi->config->napi_id) + new_id = napi->config->napi_id; + + higher = &dev->napi_list; + list_for_each_entry(pos, &dev->napi_list, dev_list) { + if (napi_id_valid(pos->napi_id)) + pos_id = pos->napi_id; + else if (pos->config) + pos_id = pos->config->napi_id; + else + pos_id = UINT_MAX; + + if (pos_id <= new_id) + break; + higher = &pos->dev_list; + } + list_add_rcu(&napi->dev_list, higher); /* adds after higher */ +} + +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. 
+ */ +static void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + +void netif_napi_add_weight_locked(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) +{ + netdev_assert_locked(dev); if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; INIT_LIST_HEAD(&napi->poll_list); INIT_HLIST_NODE(&napi->napi_hash_node); - hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - napi->timer.function = napi_watchdog; - init_gro_hash(napi); + hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + gro_init(&napi->gro); napi->skb = NULL; - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, @@ -6524,24 +7265,32 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); - list_add_rcu(&napi->dev_list, &dev->napi_list); - napi_hash_add(napi); + netif_napi_dev_list_add(dev, napi); + + /* default settings from sysfs are applied to all NAPIs. any per-NAPI + * configuration will be loaded in napi_enable + */ + napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); + napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); + napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that * threaded mode will not be enabled in napi_enable(). */ if (dev->threaded && napi_kthread_create(napi)) - dev->threaded = 0; - netif_napi_set_irq(napi, -1); + dev->threaded = false; + netif_napi_set_irq_locked(napi, -1); } -EXPORT_SYMBOL(netif_napi_add_weight); +EXPORT_SYMBOL(netif_napi_add_weight_locked); -void napi_disable(struct napi_struct *n) +void napi_disable_locked(struct napi_struct *n) { unsigned long val, new; might_sleep(); + netdev_assert_locked(n->dev); + set_bit(NAPI_STATE_DISABLE, &n->state); val = READ_ONCE(n->state); @@ -6557,21 +7306,40 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + if (n->config) + napi_save_config(n); + else + napi_hash_del(n); + clear_bit(NAPI_STATE_DISABLE, &n->state); } -EXPORT_SYMBOL(napi_disable); +EXPORT_SYMBOL(napi_disable_locked); /** - * napi_enable - enable NAPI scheduling - * @n: NAPI context + * napi_disable() - prevent NAPI from scheduling + * @n: NAPI context * - * Resume NAPI from being scheduled on this context. - * Must be paired with napi_disable. + * Stop NAPI from being scheduled on this context. + * Waits till any outstanding processing completes. + * Takes netdev_lock() for associated net_device. 
*/ -void napi_enable(struct napi_struct *n) +void napi_disable(struct napi_struct *n) +{ + netdev_lock(n->dev); + napi_disable_locked(n); + netdev_unlock(n->dev); +} +EXPORT_SYMBOL(napi_disable); + +void napi_enable_locked(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); + if (n->config) + napi_restore_config(n); + else + napi_hash_add(n); + do { BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); @@ -6580,40 +7348,54 @@ void napi_enable(struct napi_struct *n) new |= NAPIF_STATE_THREADED; } while (!try_cmpxchg(&n->state, &val, new)); } -EXPORT_SYMBOL(napi_enable); +EXPORT_SYMBOL(napi_enable_locked); -static void flush_gro_hash(struct napi_struct *napi) +/** + * napi_enable() - enable NAPI scheduling + * @n: NAPI context + * + * Enable scheduling of a NAPI instance. + * Must be paired with napi_disable(). + * Takes netdev_lock() for associated net_device. + */ +void napi_enable(struct napi_struct *n) { - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - struct sk_buff *skb, *n; - - list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) - kfree_skb(skb); - napi->gro_hash[i].count = 0; - } + netdev_lock(n->dev); + napi_enable_locked(n); + netdev_unlock(n->dev); } +EXPORT_SYMBOL(napi_enable); /* Must be called in process context */ -void __netif_napi_del(struct napi_struct *napi) +void __netif_napi_del_locked(struct napi_struct *napi) { + netdev_assert_locked(napi->dev); + if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; - napi_hash_del(napi); + /* Make sure NAPI is disabled (or was never enabled). */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + + if (napi->config) { + napi->index = -1; + napi->config = NULL; + } + list_del_rcu(&napi->dev_list); napi_free_frags(napi); - flush_gro_hash(napi); - napi->gro_bitmask = 0; + gro_cleanup(&napi->gro); if (napi->thread) { kthread_stop(napi->thread); napi->thread = NULL; } } -EXPORT_SYMBOL(__netif_napi_del); +EXPORT_SYMBOL(__netif_napi_del_locked); static int __napi_poll(struct napi_struct *n, bool *repoll) { @@ -6665,14 +7447,9 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) return work; } - if (n->gro_bitmask) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(n, HZ >= 1000); - } - - gro_normal_list(n); + /* Flush too old packets. If HZ < 1000, flush all packets */ + gro_flush(&n->gro, HZ >= 1000); + gro_normal_list(&n->gro); /* Some drivers may have called napi_schedule * prior to exhausting their budget. @@ -6700,9 +7477,14 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) work = __napi_poll(n, &do_repoll); - if (do_repoll) + if (do_repoll) { +#if defined(CONFIG_DEBUG_NET) + if (unlikely(!napi_is_scheduled(n))) + pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n", + n->dev->name, n->poll); +#endif list_add_tail(&n->poll_list, repoll); - + } netpoll_poll_unlock(have); return work; @@ -6710,8 +7492,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) static int napi_thread_wait(struct napi_struct *napi) { - bool woken = false; - set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { @@ -6720,15 +7500,13 @@ static int napi_thread_wait(struct napi_struct *napi) * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable(). 
*/ - if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { + if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); - /* woken being true indicates this thread owns this napi. */ - woken = true; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); @@ -6736,55 +7514,66 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static int napi_threaded_poll(void *data) +static void napi_threaded_poll_loop(struct napi_struct *napi) { - struct napi_struct *napi = data; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; - void *have; + unsigned long last_qs = jiffies; - while (!napi_thread_wait(napi)) { - unsigned long last_qs = jiffies; + for (;;) { + bool repoll = false; + void *have; - for (;;) { - bool repoll = false; + local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); - local_bh_disable(); - sd = this_cpu_ptr(&softnet_data); - sd->in_napi_threaded_poll = true; + sd = this_cpu_ptr(&softnet_data); + sd->in_napi_threaded_poll = true; - have = netpoll_poll_lock(napi); - __napi_poll(napi, &repoll); - netpoll_poll_unlock(have); + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); - sd->in_napi_threaded_poll = false; - barrier(); + sd->in_napi_threaded_poll = false; + barrier(); - if (sd_has_rps_ipi_waiting(sd)) { - local_irq_disable(); - net_rps_action_and_irq_enable(sd); - } - skb_defer_free_flush(sd); - local_bh_enable(); + if (sd_has_rps_ipi_waiting(sd)) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } + skb_defer_free_flush(sd); + bpf_net_ctx_clear(bpf_net_ctx); + local_bh_enable(); - if (!repoll) - break; + if (!repoll) + break; - rcu_softirq_qs_periodic(last_qs); - cond_resched(); - } + rcu_softirq_qs_periodic(last_qs); + cond_resched(); } +} + +static int napi_threaded_poll(void *data) +{ + struct napi_struct *napi = data; + + while (!napi_thread_wait(napi)) + napi_threaded_poll_loop(napi); + return 0; } -static __latent_entropy void net_rx_action(struct softirq_action *h) +static __latent_entropy void net_rx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); start: sd->in_net_rx_action = true; local_irq_disable(); @@ -6821,7 +7610,8 @@ start: */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { - sd->time_squeeze++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1); break; } } @@ -6837,7 +7627,8 @@ start: sd->in_net_rx_action = false; net_rps_action_and_irq_enable(sd); -end:; +end: + bpf_net_ctx_clear(bpf_net_ctx); } struct netdev_adjacent { @@ -8453,27 +9244,29 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; + unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); - dev->flags |= IFF_PROMISC; - dev->promiscuity += inc; - if (dev->promiscuity == 0) { + promiscuity = dev->promiscuity + inc; + if (promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. 
*/ - if (inc < 0) - dev->flags &= ~IFF_PROMISC; - else { - dev->promiscuity -= inc; + if (unlikely(inc > 0)) { netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_PROMISC; + } else { + flags = old_flags | IFF_PROMISC; } - if (dev->flags != old_flags) { + WRITE_ONCE(dev->promiscuity, promiscuity); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s promiscuous mode\n", dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { @@ -8491,23 +9284,20 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) dev_change_rx_flags(dev, IFF_PROMISC); } - if (notify) + if (notify) { + /* The ops lock is only required to ensure consistent locking + * for `NETDEV_CHANGE` notifiers. This function is sometimes + * called without the lock, even for devices that are ops + * locked, such as in `dev_uc_sync_multiple` when using + * bonding or teaming. + */ + netdev_ops_assert_locked(dev); __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); + } return 0; } -/** - * dev_set_promiscuity - update promiscuity count on a device - * @dev: device - * @inc: modifier - * - * Add or remove promiscuity from a device. While the count in the device - * remains above zero the interface remains promiscuous. Once it hits zero - * the device reverts back to normal filtering operation. A negative inc - * value is used to drop promiscuity on the device. - * Return 0 if successful or a negative errno code on error. - */ -int dev_set_promiscuity(struct net_device *dev, int inc) +int netif_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; @@ -8519,30 +9309,31 @@ int dev_set_promiscuity(struct net_device *dev, int inc) dev_set_rx_mode(dev); return err; } -EXPORT_SYMBOL(dev_set_promiscuity); -static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) +int netif_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; + unsigned int allmulti, flags; ASSERT_RTNL(); - dev->flags |= IFF_ALLMULTI; - dev->allmulti += inc; - if (dev->allmulti == 0) { + allmulti = dev->allmulti + inc; + if (allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ - if (inc < 0) - dev->flags &= ~IFF_ALLMULTI; - else { - dev->allmulti -= inc; + if (unlikely(inc > 0)) { netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_ALLMULTI; + } else { + flags = old_flags | IFF_ALLMULTI; } - if (dev->flags ^ old_flags) { + WRITE_ONCE(dev->allmulti, allmulti); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s allmulticast mode\n", dev->flags & IFF_ALLMULTI ? "entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); @@ -8554,25 +9345,6 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) return 0; } -/** - * dev_set_allmulti - update allmulti count on a device - * @dev: device - * @inc: modifier - * - * Add or remove reception of all multicast frames to a device. While the - * count in the device remains above zero the interface remains listening - * to all interfaces. Once it hits zero the device reverts back to normal - * filtering operation. A negative @inc value is used to drop the counter - * when releasing a resource needing all multicasts. 
- * Return 0 if successful or a negative errno code on error. - */ - -int dev_set_allmulti(struct net_device *dev, int inc) -{ - return __dev_set_allmulti(dev, inc, true); -} -EXPORT_SYMBOL(dev_set_allmulti); - /* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast @@ -8688,7 +9460,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; - unsigned int old_flags = dev->flags; + old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; @@ -8705,7 +9477,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; - __dev_set_allmulti(dev, inc, false); + netif_set_allmulti(dev, inc, false); } return ret; @@ -8740,17 +9512,8 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, } } -/** - * dev_change_flags - change device settings - * @dev: device - * @flags: device state flags - * @extack: netlink extended ack - * - * Change settings on device based state flags. The flags are - * in the userspace exported format. - */ -int dev_change_flags(struct net_device *dev, unsigned int flags, - struct netlink_ext_ack *extack) +int netif_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; @@ -8763,7 +9526,6 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } -EXPORT_SYMBOL(dev_change_flags); int __dev_set_mtu(struct net_device *dev, int new_mtu) { @@ -8795,15 +9557,15 @@ int dev_validate_mtu(struct net_device *dev, int new_mtu, } /** - * dev_set_mtu_ext - Change maximum transfer unit + * netif_set_mtu_ext - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit * @extack: netlink extended ack * * Change the maximum transfer size of the network device. 
*/ -int dev_set_mtu_ext(struct net_device *dev, int new_mtu, - struct netlink_ext_ack *extack) +int netif_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) { int err, orig_mtu; @@ -8841,25 +9603,20 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, return err; } -int dev_set_mtu(struct net_device *dev, int new_mtu) +int netif_set_mtu(struct net_device *dev, int new_mtu) { struct netlink_ext_ack extack; int err; memset(&extack, 0, sizeof(extack)); - err = dev_set_mtu_ext(dev, new_mtu, &extack); + err = netif_set_mtu_ext(dev, new_mtu, &extack); if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } -EXPORT_SYMBOL(dev_set_mtu); +EXPORT_SYMBOL(netif_set_mtu); -/** - * dev_change_tx_queue_len - Change TX queue length of a netdevice - * @dev: device - * @new_len: new tx queue length - */ -int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len) { unsigned int orig_len = dev->tx_queue_len; int res; @@ -8868,7 +9625,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) return -ERANGE; if (new_len != orig_len) { - dev->tx_queue_len = new_len; + WRITE_ONCE(dev->tx_queue_len, new_len); res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); res = notifier_to_errno(res); if (res) @@ -8882,16 +9639,11 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) err_rollback: netdev_err(dev, "refused to change device tx_queue_len\n"); - dev->tx_queue_len = orig_len; + WRITE_ONCE(dev->tx_queue_len, orig_len); return res; } -/** - * dev_set_group - Change group this device belongs to - * @dev: device - * @new_group: group this device should belong to - */ -void dev_set_group(struct net_device *dev, int new_group) +void netif_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } @@ -8917,31 +9669,23 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, } EXPORT_SYMBOL(dev_pre_changeaddr_notify); -/** - * dev_set_mac_address - Change Media Access Control Address - * @dev: device - * @sa: new address - * @extack: netlink extended ack - * - * Change the hardware (MAC) address of the device - */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, - struct netlink_ext_ack *extack) +int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; - if (sa->sa_family != dev->type) + if (ss->ss_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); + err = dev_pre_changeaddr_notify(dev, ss->__data, extack); if (err) return err; - if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) { - err = ops->ndo_set_mac_address(dev, sa); + if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) { + err = ops->ndo_set_mac_address(dev, ss); if (err) return err; } @@ -8950,22 +9694,10 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } -EXPORT_SYMBOL(dev_set_mac_address); DECLARE_RWSEM(dev_addr_sem); -int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, - struct netlink_ext_ack *extack) -{ - int ret; - - down_write(&dev_addr_sem); - ret = dev_set_mac_address(dev, sa, extack); - 
up_write(&dev_addr_sem); - return ret; -} -EXPORT_SYMBOL(dev_set_mac_address_user); - +/* "sa" is a true struct sockaddr with limited "sa_data" member. */ int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { size_t size = sizeof(sa->sa_data_min); @@ -8994,14 +9726,7 @@ unlock: } EXPORT_SYMBOL(dev_get_mac_address); -/** - * dev_change_carrier - Change device carrier - * @dev: device - * @new_carrier: new value - * - * Change device carrier - */ -int dev_change_carrier(struct net_device *dev, bool new_carrier) +int netif_change_carrier(struct net_device *dev, bool new_carrier) { const struct net_device_ops *ops = dev->netdev_ops; @@ -9112,15 +9837,9 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) } EXPORT_SYMBOL(netdev_port_same_parent_id); -/** - * dev_change_proto_down - set carrier according to proto_down. - * - * @dev: device - * @proto_down: new value - */ -int dev_change_proto_down(struct net_device *dev, bool proto_down) +int netif_change_proto_down(struct net_device *dev, bool proto_down) { - if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) + if (!dev->change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; @@ -9128,32 +9847,35 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) netif_carrier_off(dev); else netif_carrier_on(dev); - dev->proto_down = proto_down; + WRITE_ONCE(dev->proto_down, proto_down); return 0; } /** - * dev_change_proto_down_reason - proto down reason + * netdev_change_proto_down_reason_locked - proto down reason * * @dev: device * @mask: proto down mask * @value: proto down value */ -void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, - u32 value) +void netdev_change_proto_down_reason_locked(struct net_device *dev, + unsigned long mask, u32 value) { + u32 proto_down_reason; int b; if (!mask) { - dev->proto_down_reason = value; + proto_down_reason = value; } else { + proto_down_reason = dev->proto_down_reason; for_each_set_bit(b, &mask, 32) { if (value & (1 << b)) - dev->proto_down_reason |= BIT(b); + proto_down_reason |= BIT(b); else - dev->proto_down_reason &= ~BIT(b); + proto_down_reason &= ~BIT(b); } } + WRITE_ONCE(dev->proto_down_reason, proto_down_reason); } struct bpf_xdp_link { @@ -9214,6 +9936,40 @@ u8 dev_xdp_prog_count(struct net_device *dev) } EXPORT_SYMBOL_GPL(dev_xdp_prog_count); +u8 dev_xdp_sb_prog_count(struct net_device *dev) +{ + u8 count = 0; + int i; + + for (i = 0; i < __MAX_XDP_MODE; i++) + if (dev->xdp_state[i].prog && + !dev->xdp_state[i].prog->aux->xdp_has_frags) + count++; + return count; +} + +int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +{ + if (!dev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + bpf->command == XDP_SETUP_PROG && + bpf->prog && !bpf->prog->aux->xdp_has_frags) { + NL_SET_ERR_MSG(bpf->extack, + "unable to propagate XDP to device using tcp-data-split"); + return -EBUSY; + } + + if (dev_get_min_mp_channel_count(dev)) { + NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); + return -EBUSY; + } + + return dev->netdev_ops->ndo_bpf(dev, bpf); +} +EXPORT_SYMBOL_GPL(netif_xdp_propagate); + u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); @@ -9242,6 +9998,19 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, struct netdev_bpf xdp; int err; + netdev_ops_assert_locked(dev); + + if 
(dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + prog && !prog->aux->xdp_has_frags) { + NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); + return -EBUSY; + } + + if (dev_get_min_mp_channel_count(dev)) { + NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); + return -EBUSY; + } + memset(&xdp, 0, sizeof(xdp)); xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; @@ -9390,6 +10159,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } + if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { + NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); + return -EINVAL; + } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL; @@ -9459,7 +10232,9 @@ static void bpf_xdp_link_release(struct bpf_link *link) * already NULL, in which case link was already auto-detached */ if (xdp_link->dev) { + netdev_lock_ops(xdp_link->dev); WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + netdev_unlock_ops(xdp_link->dev); xdp_link->dev = NULL; } @@ -9541,10 +10316,12 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, goto out_unlock; } + netdev_lock_ops(xdp_link->dev); mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, xdp_link->flags, new_prog); + netdev_unlock_ops(xdp_link->dev); if (err) goto out_unlock; @@ -9597,7 +10374,9 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } + netdev_lock_ops(dev); err = dev_xdp_attach_link(dev, &extack, link); + netdev_unlock_ops(dev); rtnl_unlock(); if (err) { @@ -9666,6 +10445,20 @@ err_out: return err; } +u32 dev_get_min_mp_channel_count(const struct net_device *dev) +{ + int i; + + netdev_ops_assert_locked(dev); + + for (i = dev->real_num_rx_queues - 1; i >= 0; i--) + if (dev->_rx[i].mp_params.mp_priv) + /* The channel count is the idx plus 1. 
*/ + return i + 1; + + return 0; +} + /** * dev_index_reserve() - allocate an ifindex in a namespace * @net: the applicable net namespace @@ -9703,6 +10496,15 @@ static void dev_index_release(struct net *net, int ifindex) WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } +static bool from_cleanup_net(void) +{ +#ifdef CONFIG_NET_NS + return current == READ_ONCE(cleanup_net_task); +#else + return false; +#endif +} + /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); @@ -9745,6 +10547,7 @@ static void netdev_sync_lower_features(struct net_device *upper, if (!(features & feature) && (lower->features & feature)) { netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", &feature, lower->name); + netdev_lock_ops(lower); lower->wanted_features &= ~feature; __netdev_update_features(lower); @@ -9753,10 +10556,20 @@ static void netdev_sync_lower_features(struct net_device *upper, &feature, lower->name); else netdev_features_change(lower); + netdev_unlock_ops(lower); } } } +static bool netdev_has_ip_or_hw_csum(netdev_features_t features) +{ + netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + bool ip_csum = (features & ip_csum_mask) == ip_csum_mask; + bool hw_csum = features & NETIF_F_HW_CSUM; + + return ip_csum || hw_csum; +} + static netdev_features_t netdev_fix_features(struct net_device *dev, netdev_features_t features) { @@ -9838,15 +10651,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_LRO; } - if (features & NETIF_F_HW_TLS_TX) { - bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == - (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); - bool hw_csum = features & NETIF_F_HW_CSUM; - - if (!ip_csum && !hw_csum) { - netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); - features &= ~NETIF_F_HW_TLS_TX; - } + if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; } if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { @@ -9854,6 +10661,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_HW_TLS_RX; } + if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) { + netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n"); + features &= ~NETIF_F_GSO_UDP_L4; + } + return features; } @@ -9865,6 +10677,7 @@ int __netdev_update_features(struct net_device *dev) int err = -1; ASSERT_RTNL(); + netdev_ops_assert_locked(dev); features = netdev_get_wanted_features(dev); @@ -10158,6 +10971,17 @@ static void netdev_do_free_pcpu_stats(struct net_device *dev) } } +static void netdev_free_phy_link_topology(struct net_device *dev) +{ + struct phy_link_topology *topo = dev->link_topo; + + if (IS_ENABLED(CONFIG_PHYLIB) && topo) { + xa_destroy(&topo->phys); + kfree(topo); + dev->link_topo = NULL; + } +} + /** * register_netdevice() - register a network device * @dev: device to register @@ -10187,6 +11011,10 @@ int register_netdevice(struct net_device *dev) if (ret) return ret; + /* rss ctx ID 0 is reserved for the default context, start from 1 */ + xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1); + mutex_init(&dev->ethtool->rss_lock); + spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); @@ -10276,12 +11104,16 @@ int register_netdevice(struct net_device *dev) ret = netdev_register_kobject(dev); + 
netdev_lock(dev); WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); + netdev_unlock(dev); if (ret) goto err_uninit_notify; + netdev_lock_ops(dev); __netdev_update_features(dev); + netdev_unlock_ops(dev); /* * Default initial state at registry is that the @@ -10307,7 +11139,9 @@ int register_netdevice(struct net_device *dev) memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ + netdev_lock_ops(dev); ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); + netdev_unlock_ops(dev); ret = notifier_to_errno(ret); if (ret) { /* Expect explicit free_netdev() on failure */ @@ -10319,8 +11153,7 @@ int register_netdevice(struct net_device *dev) * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: @@ -10343,47 +11176,26 @@ err_free_name: } EXPORT_SYMBOL(register_netdevice); -/** - * init_dummy_netdev - init a dummy network device for NAPI - * @dev: device to init - * - * This takes a network device structure and initialize the minimum - * amount of fields so it can be used to schedule NAPI polls without - * registering a full blown interface. This is to be used by drivers - * that need to tie several hardware interfaces to a single NAPI - * poll scheduler due to HW limitations. +/* Initialize the core of a dummy net device. + * The setup steps dummy netdevs need which normal netdevs get by going + * through register_netdevice(). */ -void init_dummy_netdev(struct net_device *dev) +static void init_dummy_netdev(struct net_device *dev) { - /* Clear everything. Note we don't initialize spinlocks - * are they aren't supposed to be taken by any of the - * NAPI code and this dummy netdev is supposed to be - * only ever used for NAPI polls - */ - memset(dev, 0, sizeof(struct net_device)); - /* make sure we BUG if trying to hit standard * register/unregister code path */ dev->reg_state = NETREG_DUMMY; - /* NAPI wants this */ - INIT_LIST_HEAD(&dev->napi_list); - /* a dummy interface is started by default */ set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); - /* napi_busy_loop stats accounting wants this */ - dev_net_set(dev, &init_net); - /* Note : We dont allocate pcpu_refcnt for dummy devices, * because users of this 'device' dont need to change * its refcount. 
*/ } -EXPORT_SYMBOL_GPL(init_dummy_netdev); - /** * register_netdev - register a network device @@ -10400,12 +11212,16 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev); */ int register_netdev(struct net_device *dev) { + struct net *net = dev_net(dev); int err; - if (rtnl_lock_killable()) + if (rtnl_net_lock_killable(net)) return -EINTR; + err = register_netdevice(dev); - rtnl_unlock(); + + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(register_netdev); @@ -10482,8 +11298,9 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list) rebroadcast_time = jiffies; } + rcu_barrier(); + if (!wait) { - rcu_barrier(); wait = WAIT_REFS_MIN_MSECS; } else { msleep(wait); @@ -10542,9 +11359,8 @@ void netdev_run_todo(void) list_replace_init(&net_unlink_list, &unlink_list); while (!list_empty(&unlink_list)) { - struct net_device *dev = list_first_entry(&unlink_list, - struct net_device, - unlink_list); + dev = list_first_entry(&unlink_list, struct net_device, + unlink_list); list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } @@ -10566,7 +11382,9 @@ void netdev_run_todo(void) continue; } + netdev_lock(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); + netdev_unlock(dev); linkwatch_sync_dev(dev); } @@ -10597,6 +11415,54 @@ void netdev_run_todo(void) wake_up(&netdev_unregistering_wq); } +/* Collate per-cpu network dstats statistics + * + * Read per-cpu network statistics from dev->dstats and populate the related + * fields in @s. + */ +static void dev_fetch_dstats(struct rtnl_link_stats64 *s, + const struct pcpu_dstats __percpu *dstats) +{ + int cpu; + + for_each_possible_cpu(cpu) { + u64 rx_packets, rx_bytes, rx_drops; + u64 tx_packets, tx_bytes, tx_drops; + const struct pcpu_dstats *stats; + unsigned int start; + + stats = per_cpu_ptr(dstats, cpu); + do { + start = u64_stats_fetch_begin(&stats->syncp); + rx_packets = u64_stats_read(&stats->rx_packets); + rx_bytes = u64_stats_read(&stats->rx_bytes); + rx_drops = u64_stats_read(&stats->rx_drops); + tx_packets = u64_stats_read(&stats->tx_packets); + tx_bytes = u64_stats_read(&stats->tx_bytes); + tx_drops = u64_stats_read(&stats->tx_drops); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + s->rx_packets += rx_packets; + s->rx_bytes += rx_bytes; + s->rx_dropped += rx_drops; + s->tx_packets += tx_packets; + s->tx_bytes += tx_bytes; + s->tx_dropped += tx_drops; + } +} + +/* ndo_get_stats64 implementation for dtstats-based accounting. + * + * Populate @s from dev->stats and dev->dstats. This is used internally by the + * core for NETDEV_PCPU_STAT_DSTAT-type stats collection. + */ +static void dev_get_dstats64(const struct net_device *dev, + struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_dstats(s, dev->dstats); +} + /* Convert net_device_stats to rtnl_link_stats64. 
rtnl_link_stats64 has * all the same fields in the same order as net_device_stats, with only * the type differing, but rtnl_link_stats64 may have additional fields @@ -10645,7 +11511,7 @@ noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset) return; } - field = (__force unsigned long __percpu *)((__force void *)p + offset); + field = (unsigned long __percpu *)((void __percpu *)p + offset); this_cpu_inc(*field); } EXPORT_SYMBOL_GPL(netdev_core_stats_inc); @@ -10666,6 +11532,20 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; const struct net_device_core_stats __percpu *p; + /* + * IPv{4,6} and udp tunnels share common stat helpers and use + * different stat type (NETDEV_PCPU_STAT_TSTATS vs + * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent. + */ + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) != + offsetof(struct pcpu_dstats, rx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) != + offsetof(struct pcpu_dstats, rx_packets)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) != + offsetof(struct pcpu_dstats, tx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) != + offsetof(struct pcpu_dstats, tx_packets)); + if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); @@ -10673,6 +11553,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { dev_get_tstats64(dev, storage); + } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) { + dev_get_dstats64(dev, storage); } else { netdev_stats_to_stats64(storage, &dev->stats); } @@ -10784,19 +11666,12 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) WARN_ON(dev->reg_state == NETREG_REGISTERED); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { - dev->gro_flush_timeout = 20000; - dev->napi_defer_hard_irqs = 1; + netdev_set_gro_flush_timeout(dev, 20000); + netdev_set_defer_hard_irqs(dev, 1); } } EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); -void netdev_freemem(struct net_device *dev) -{ - char *addr = (char *)dev - dev->padded; - - kvfree(addr); -} - /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -10816,8 +11691,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned int txqs, unsigned int rxqs) { struct net_device *dev; - unsigned int alloc_size; - struct net_device *p; + size_t napi_config_sz; + unsigned int maxqs; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -10831,21 +11706,14 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } - alloc_size = sizeof(struct net_device); - if (sizeof_priv) { - /* ensure 32-byte alignment of private area */ - alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); - alloc_size += sizeof_priv; - } - /* ensure 32-byte alignment of whole construct */ - alloc_size += NETDEV_ALIGN - 1; + maxqs = max(txqs, rxqs); - p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); - if (!p) + dev = kvzalloc(struct_size(dev, priv, sizeof_priv), + GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); + if (!dev) return NULL; - dev = PTR_ALIGN(p, NETDEV_ALIGN); - dev->padded = (char *)dev - (char *)p; + dev->priv_len = sizeof_priv; ref_tracker_dir_init(&dev->refcnt_tracker, 128, name); #ifdef CONFIG_PCPU_DEV_REFCNT @@ -10892,6 +11760,9 @@ struct net_device 
*alloc_netdev_mqs(int sizeof_priv, const char *name, #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif + + mutex_init(&dev->lock); + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); @@ -10909,8 +11780,21 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; + dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT); + if (!dev->ethtool) + goto free_all; + + dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); + if (!dev->cfg) + goto free_all; + dev->cfg_pending = dev->cfg; - strcpy(dev->name, name); + napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); + dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); + if (!dev->napi_config) + goto free_all; + + strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) @@ -10929,11 +11813,27 @@ free_pcpu: free_percpu(dev->pcpu_refcnt); free_dev: #endif - netdev_freemem(dev); + kvfree(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); +static void netdev_napi_exit(struct net_device *dev) +{ + if (!list_empty(&dev->napi_list)) { + struct napi_struct *p, *n; + + netdev_lock(dev); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) + __netif_napi_del_locked(p); + netdev_unlock(dev); + + synchronize_net(); + } + + kvfree(dev->napi_config); +} + /** * free_netdev - free network device * @dev: device @@ -10945,8 +11845,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs); */ void free_netdev(struct net_device *dev) { - struct napi_struct *p, *n; - might_sleep(); /* When called immediately after register_netdevice() failed the unwind @@ -10959,6 +11857,9 @@ void free_netdev(struct net_device *dev) return; } + WARN_ON(dev->cfg != dev->cfg_pending); + kfree(dev->cfg); + kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); @@ -10967,8 +11868,9 @@ void free_netdev(struct net_device *dev) /* Flush device addresses */ dev_addr_flush(dev); - list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) - netif_napi_del(p); + netdev_napi_exit(dev); + + netif_del_cpu_rmap(dev); ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT @@ -10980,9 +11882,14 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; + netdev_free_phy_link_topology(dev); + + mutex_destroy(&dev->lock); + /* Compatibility with error handling in drivers */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - netdev_freemem(dev); + if (dev->reg_state == NETREG_UNINITIALIZED || + dev->reg_state == NETREG_DUMMY) { + kvfree(dev); return; } @@ -10995,6 +11902,19 @@ void free_netdev(struct net_device *dev) EXPORT_SYMBOL(free_netdev); /** + * alloc_netdev_dummy - Allocate and initialize a dummy net device. + * @sizeof_priv: size of private data to allocate space for + * + * Return: the allocated net_device on success, NULL otherwise + */ +struct net_device *alloc_netdev_dummy(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, + init_dummy_netdev); +} +EXPORT_SYMBOL_GPL(alloc_netdev_dummy); + +/** * synchronize_net - Synchronize with packet receive processing * * Wait for packets currently being received to be done. 
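With init_dummy_netdev() made static and free_netdev() now accepting NETREG_DUMMY devices, drivers that used to embed a dummy netdev for NAPI hosting are expected to allocate one through the new alloc_netdev_dummy() instead. A minimal sketch of that conversion, under the assumption of a hypothetical driver (the myhw_* names and the myhw_poll callback are illustrative, not part of this series):

#include <linux/netdevice.h>

struct myhw_priv {
	struct net_device *napi_dev;	/* was: struct net_device napi_dev; */
	struct napi_struct napi;
};

static int myhw_napi_init(struct myhw_priv *priv)
{
	/* allocates the device and runs init_dummy_netdev() on it */
	priv->napi_dev = alloc_netdev_dummy(0);
	if (!priv->napi_dev)
		return -ENOMEM;

	netif_napi_add(priv->napi_dev, &priv->napi, myhw_poll);
	return 0;
}

static void myhw_napi_exit(struct myhw_priv *priv)
{
	netif_napi_del(&priv->napi);
	free_netdev(priv->napi_dev);	/* NETREG_DUMMY is freed via kvfree() now */
}

This is also why netdev_freemem() could be dropped above: dummy and uninitialized devices alike go through the plain kvfree() path in free_netdev().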
@@ -11003,13 +11923,41 @@ EXPORT_SYMBOL(free_netdev); void synchronize_net(void) { might_sleep(); - if (rtnl_is_locked()) + if (from_cleanup_net() || rtnl_is_locked()) synchronize_rcu_expedited(); else synchronize_rcu(); } EXPORT_SYMBOL(synchronize_net); +static void netdev_rss_contexts_free(struct net_device *dev) +{ + struct ethtool_rxfh_context *ctx; + unsigned long context; + + mutex_lock(&dev->ethtool->rss_lock); + xa_for_each(&dev->ethtool->rss_ctx, context, ctx) { + struct ethtool_rxfh_param rxfh; + + rxfh.indir = ethtool_rxfh_context_indir(ctx); + rxfh.key = ethtool_rxfh_context_key(ctx); + rxfh.hfunc = ctx->hfunc; + rxfh.input_xfrm = ctx->input_xfrm; + rxfh.rss_context = context; + rxfh.rss_delete = true; + + xa_erase(&dev->ethtool->rss_ctx, context); + if (dev->ethtool_ops->create_rxfh_context) + dev->ethtool_ops->remove_rxfh_context(dev, ctx, + context, NULL); + else + dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL); + kfree(ctx); + } + xa_destroy(&dev->ethtool->rss_ctx); + mutex_unlock(&dev->ethtool->rss_lock); +} + /** * unregister_netdevice_queue - remove device from the kernel * @dev: device @@ -11038,6 +11986,19 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); +static void dev_memory_provider_uninstall(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct netdev_rx_queue *rxq = &dev->_rx[i]; + struct pp_memory_provider_params *p = &rxq->mp_params; + + if (p->mp_ops && p->mp_ops->uninstall) + p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); + } +} + void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { @@ -11068,15 +12029,29 @@ void unregister_netdevice_many_notify(struct list_head *head, BUG_ON(dev->reg_state != NETREG_REGISTERED); } - /* If device is running, close it first. */ - list_for_each_entry(dev, head, unreg_list) - list_add_tail(&dev->close_list, &close_head); + /* If device is running, close it first. Start with ops locked... */ + list_for_each_entry(dev, head, unreg_list) { + if (netdev_need_ops_lock(dev)) { + list_add_tail(&dev->close_list, &close_head); + netdev_lock(dev); + } + } + dev_close_many(&close_head, true); + /* ... now unlock them and go over the rest. */ + list_for_each_entry(dev, head, unreg_list) { + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); + else + list_add_tail(&dev->close_list, &close_head); + } dev_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ unlist_netdevice(dev); + netdev_lock(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); + netdev_unlock(dev); } flush_all_backlogs(); @@ -11086,9 +12061,12 @@ void unregister_netdevice_many_notify(struct list_head *head, struct sk_buff *skb = NULL; /* Shutdown queueing discipline. 
*/ + netdev_lock_ops(dev); dev_shutdown(dev); dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); + dev_memory_provider_uninstall(dev); + netdev_unlock_ops(dev); bpf_dev_bound_netdev_unregister(dev); netdev_offload_xstats_disable_all(dev); @@ -11098,8 +12076,7 @@ void unregister_netdevice_many_notify(struct list_head *head, */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, portid, nlh); @@ -11113,11 +12090,17 @@ void unregister_netdevice_many_notify(struct list_head *head, netdev_name_node_alt_flush(dev); netdev_name_node_free(dev->name_node); + netdev_rss_contexts_free(dev); + call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); + mutex_destroy(&dev->ethtool->rss_lock); + + net_shaper_flush_netdev(dev); + if (skb) rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); @@ -11150,7 +12133,7 @@ void unregister_netdevice_many_notify(struct list_head *head, * @head: list of devices * * Note: As most callers use a stack allocated list_head, - * we force a list_del() to make sure stack wont be corrupted later. + * we force a list_del() to make sure stack won't be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { @@ -11171,30 +12154,15 @@ EXPORT_SYMBOL(unregister_netdevice_many); */ void unregister_netdev(struct net_device *dev) { - rtnl_lock(); + rtnl_net_dev_lock(dev); unregister_netdevice(dev); - rtnl_unlock(); + rtnl_net_dev_unlock(dev); } EXPORT_SYMBOL(unregister_netdev); -/** - * __dev_change_net_namespace - move device to different nethost namespace - * @dev: device - * @net: network namespace - * @pat: If not NULL name pattern to try if the current device name - * is already taken in the destination network namespace. - * @new_ifindex: If not zero, specifies device index in the target - * namespace. - * - * This function shuts down a device interface and moves it - * to a new network namespace. On success 0 is returned, on - * a failure a netagive errno code is returned. - * - * Callers must hold the rtnl semaphore. - */ - int __dev_change_net_namespace(struct net_device *dev, struct net *net, - const char *pat, int new_ifindex) + const char *pat, int new_ifindex, + struct netlink_ext_ack *extack) { struct netdev_name_node *name_node; struct net *net_old = dev_net(dev); @@ -11205,12 +12173,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, /* Don't allow namespace local devices to be moved. 
*/ err = -EINVAL; - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_immutable) { + NL_SET_ERR_MSG(extack, "The interface netns is immutable"); goto out; + } - /* Ensure the device has been registrered */ - if (dev->reg_state != NETREG_REGISTERED) + /* Ensure the device has been registered */ + if (dev->reg_state != NETREG_REGISTERED) { + NL_SET_ERR_MSG(extack, "The interface isn't registered"); goto out; + } /* Get out if there is nothing todo */ err = 0; @@ -11223,30 +12195,49 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, err = -EEXIST; if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ - if (!pat) + if (!pat) { + NL_SET_ERR_MSG(extack, + "An interface with the same name exists in the target netns"); goto out; + } err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "Unable to use '%s' for the new interface name in the target netns", + pat); goto out; + } } /* Check that none of the altnames conflicts. */ err = -EEXIST; - netdev_for_each_altname(dev, name_node) - if (netdev_name_in_use(net, name_node->name)) + netdev_for_each_altname(dev, name_node) { + if (netdev_name_in_use(net, name_node->name)) { + NL_SET_ERR_MSG_FMT(extack, + "An interface with the altname %s exists in the target netns", + name_node->name); goto out; + } + } /* Check that new_ifindex isn't used yet. */ if (new_ifindex) { err = dev_index_reserve(net, new_ifindex); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "The ifindex %d is not available in the target netns", + new_ifindex); goto out; + } } else { /* If there is an ifindex conflict assign a new one */ err = dev_index_reserve(net, dev->ifindex); if (err == -EBUSY) err = dev_index_reserve(net, 0); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Unable to allocate a new ifindex in the target netns"); goto out; + } new_ifindex = err; } @@ -11254,16 +12245,23 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, * And now a mini version of register_netdevice unregister_netdevice. */ + netdev_lock_ops(dev); /* If device is running close it first. */ - dev_close(dev); - + netif_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->moving_ns = true; + netdev_unlock(dev); + synchronize_net(); /* Shutdown queueing discipline. */ + netdev_lock_ops(dev); dev_shutdown(dev); + netdev_unlock_ops(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. 
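The extack plumbing added above means a failed namespace move can now report a specific reason instead of a bare errno. A hedged sketch of how an in-tree caller holding rtnl might use the updated signature (do_move_dev() is an illustrative wrapper, not a function added by this series):

static int do_move_dev(struct net_device *dev, struct net *target_net,
		       const char *name_pat, int new_ifindex,
		       struct netlink_ext_ack *extack)
{
	ASSERT_RTNL();	/* callers must still hold the rtnl semaphore */

	/* On failure, extack carries one of the messages added above, e.g.
	 * "An interface with the same name exists in the target netns".
	 */
	return __dev_change_net_namespace(dev, target_net, name_pat,
					  new_ifindex, extack);
}

Out-of-core callers keep using dev_change_net_namespace(), which, as the new dev_api.c below shows, forwards new_ifindex=0 and a NULL extack.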
@@ -11294,11 +12292,17 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ + netdev_lock(dev); dev_net_set(dev, net); + netdev_unlock(dev); dev->ifindex = new_ifindex; - if (new_name[0]) /* Rename the netdev to prepared name */ + if (new_name[0]) { + /* Rename the netdev to prepared name */ + write_seqlock_bh(&netdev_rename_lock); strscpy(dev->name, new_name, IFNAMSIZ); + write_sequnlock_bh(&netdev_rename_lock); + } /* Fixup kobjects */ dev_set_uevent_suppress(&dev->dev, 1); @@ -11316,11 +12320,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, err = netdev_change_owner(dev, net_old, net); WARN_ON(err); + netdev_lock(dev); + dev->moving_ns = false; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); + /* Add the device back in the hashes */ list_netdevice(dev); - /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); + netdev_unlock_ops(dev); /* * Prevent userspace races by waiting until the network @@ -11333,7 +12342,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, out: return err; } -EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { @@ -11373,7 +12381,7 @@ static int dev_cpu_dead(unsigned int oldcpu) list_del_init(&napi->poll_list); if (napi->poll == process_backlog) - napi->state = 0; + napi->state &= NAPIF_STATE_THREADED; else ____napi_schedule(sd, napi); } @@ -11381,21 +12389,23 @@ static int dev_cpu_dead(unsigned int oldcpu) raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + if (!use_backlog_threads()) { #ifdef CONFIG_RPS - remsd = oldsd->rps_ipi_list; - oldsd->rps_ipi_list = NULL; + remsd = oldsd->rps_ipi_list; + oldsd->rps_ipi_list = NULL; #endif - /* send out pending IPI's on offline CPU */ - net_rps_send_ipi(remsd); + /* send out pending IPI's on offline CPU */ + net_rps_send_ipi(remsd); + } /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } return 0; @@ -11446,7 +12456,7 @@ static struct hlist_head * __net_init netdev_create_hash(void) static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > - 8 * sizeof_field(struct napi_struct, gro_bitmask)); + BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask)); INIT_LIST_HEAD(&net->dev_base_head); @@ -11581,7 +12591,7 @@ static void __net_exit default_device_exit_net(struct net *net) char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. 
loopback) */ - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_immutable) continue; /* Leave virtual devices for the generic cleanup */ @@ -11642,7 +12652,7 @@ static struct pernet_operations __net_initdata default_device_ops = { static void __init net_dev_struct_check(void) { /* TX read-mostly hotpath */ - CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); @@ -11680,8 +12690,6 @@ static void __init net_dev_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); - CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout); - CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); @@ -11693,7 +12701,7 @@ static void __init net_dev_struct_check(void) #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); #endif - CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92); } /* @@ -11712,19 +12720,58 @@ static int net_page_pool_create(int cpuid) struct page_pool_params page_pool_params = { .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, .flags = PP_FLAG_SYSTEM_POOL, - .nid = NUMA_NO_NODE, + .nid = cpu_to_mem(cpuid), }; struct page_pool *pp_ptr; + int err; pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); if (IS_ERR(pp_ptr)) return -ENOMEM; - per_cpu(system_page_pool, cpuid) = pp_ptr; + err = xdp_reg_page_pool(pp_ptr); + if (err) { + page_pool_destroy(pp_ptr); + return err; + } + + per_cpu(system_page_pool.pool, cpuid) = pp_ptr; #endif return 0; } +static int backlog_napi_should_run(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); +} + +static void run_backlog_napi(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + + napi_threaded_poll_loop(&sd->backlog); +} + +static void backlog_napi_setup(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + napi->thread = this_cpu_read(backlog_napi); + set_bit(NAPI_STATE_THREADED, &napi->state); +} + +static struct smp_hotplug_thread backlog_threads = { + .store = &backlog_napi, + .thread_should_run = backlog_napi_should_run, + .thread_fn = run_backlog_napi, + .thread_comm = "backlog_napi/%u", + .setup = backlog_napi_setup, +}; + /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. @@ -11753,12 +12800,13 @@ static int __init net_dev_init(void) * Initialise the packet receive queues. 
*/ + flush_backlogs_fallback = flush_backlogs_alloc(); + if (!flush_backlogs_fallback) + goto out; + for_each_possible_cpu(i) { - struct work_struct *flush = per_cpu_ptr(&flush_works, i); struct softnet_data *sd = &per_cpu(softnet_data, i); - INIT_WORK(flush, flush_backlog); - skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); #ifdef CONFIG_XFRM_OFFLOAD @@ -11773,13 +12821,16 @@ static int __init net_dev_init(void) INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); spin_lock_init(&sd->defer_lock); - init_gro_hash(&sd->backlog); + gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; + INIT_LIST_HEAD(&sd->backlog.poll_list); if (net_page_pool_create(i)) goto out; } + if (use_backlog_threads()) + smpboot_register_percpu_thread(&backlog_threads); dev_boot_phase = 0; @@ -11805,17 +12856,22 @@ static int __init net_dev_init(void) NULL, dev_cpu_dead); WARN_ON(rc < 0); rc = 0; + + /* avoid static key IPIs to isolated CPUs */ + if (housekeeping_enabled(HK_TYPE_MISC)) + net_enable_timestamp(); out: if (rc < 0) { for_each_possible_cpu(i) { struct page_pool *pp_ptr; - pp_ptr = per_cpu(system_page_pool, i); + pp_ptr = per_cpu(system_page_pool.pool, i); if (!pp_ptr) continue; + xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); - per_cpu(system_page_pool, i) = NULL; + per_cpu(system_page_pool.pool, i) = NULL; } } diff --git a/net/core/dev.h b/net/core/dev.h index 2bcaf8eee50c..e93f36b7ddf3 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -2,21 +2,22 @@ #ifndef _NET_CORE_DEV_H #define _NET_CORE_DEV_H +#include <linux/cleanup.h> #include <linux/types.h> #include <linux/rwsem.h> +#include <linux/netdevice.h> +#include <net/netdev_lock.h> struct net; -struct net_device; -struct netdev_bpf; -struct netdev_phys_item_id; struct netlink_ext_ack; struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { - u64 count; - unsigned int num_buckets; + struct rcu_head rcu; + unsigned int count; + u8 log_buckets; unsigned int history_head; u16 history[FLOW_LIMIT_HISTORY]; u8 buckets[]; @@ -24,6 +25,38 @@ struct sd_flow_limit { extern int netdev_flow_limit_table_len; +struct napi_struct * +netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); +struct net_device *dev_get_by_napi_id(unsigned int napi_id); + +struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); +struct net_device * +netdev_xa_find_lock(struct net *net, struct net_device *dev, + unsigned long *index); + +DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T)); + +#define for_each_netdev_lock_scoped(net, var_name, ifindex) \ + for (struct net_device *var_name __free(netdev_unlock) = NULL; \ + (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \ + ifindex++) + +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex); +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index); + +DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *, + if (_T) netdev_unlock_ops_compat(_T)); + +#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \ + for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \ + (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \ + &ifindex)); \ + ifindex++) + #ifdef CONFIG_PROC_FS 
int __init dev_proc_init(void); #else @@ -37,8 +70,17 @@ void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); +#if IS_ENABLED(CONFIG_NET_SHAPER) +void net_shaper_flush_netdev(struct net_device *dev); +void net_shaper_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq); +#else +static inline void net_shaper_flush_netdev(struct net_device *dev) {} +static inline void net_shaper_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq) {} +#endif + /* sysctls not referred to from outside net/core/ */ -extern unsigned int sysctl_skb_defer_max; extern int netdev_unregister_timeout_secs; extern int weight_p; extern int dev_weight_rx_bias; @@ -60,6 +102,7 @@ struct netdev_name_node { }; int netdev_get_name(struct net *net, char *name, int ifindex); +int netif_change_name(struct net_device *dev, const char *newname); int dev_change_name(struct net_device *dev, const char *newname); #define netdev_for_each_altname(dev, namenode) \ @@ -73,24 +116,28 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); int dev_validate_mtu(struct net_device *dev, int mtu, struct netlink_ext_ack *extack); -int dev_set_mtu_ext(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); +int netif_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); +int netif_change_proto_down(struct net_device *dev, bool proto_down); int dev_change_proto_down(struct net_device *dev, bool proto_down); -void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, - u32 value); +void netdev_change_proto_down_reason_locked(struct net_device *dev, + unsigned long mask, u32 value); typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); +int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len); int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len); +void netif_set_group(struct net_device *dev, int new_group); void dev_set_group(struct net_device *dev, int new_group); +int netif_change_carrier(struct net_device *dev, bool new_carrier); int dev_change_carrier(struct net_device *dev, bool new_carrier); void __dev_set_rx_mode(struct net_device *dev); @@ -102,6 +149,20 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh); +static inline void netif_set_up(struct net_device *dev, bool value) +{ + if (value) + dev->flags |= IFF_UP; + else + dev->flags &= ~IFF_UP; + + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->up = value; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); +} + static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { @@ -141,6 +202,119 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_ipv4_max_size, size); } +/** + * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs + * @n: napi struct to get the defer_hard_irqs field from + * + * Return: the per-NAPI value of the defar_hard_irqs field. 
+ */ +static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n) +{ + return READ_ONCE(n->defer_hard_irqs); +} + +/** + * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi + * @n: napi_struct to set the defer_hard_irqs field + * @defer: the value the field should be set to + */ +static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer) +{ + WRITE_ONCE(n->defer_hard_irqs, defer); +} + +/** + * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev + * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set + * @defer: the defer_hard_irqs value to set + */ +static inline void netdev_set_defer_hard_irqs(struct net_device *netdev, + u32 defer) +{ + unsigned int count = max(netdev->num_rx_queues, + netdev->num_tx_queues); + struct napi_struct *napi; + int i; + + WRITE_ONCE(netdev->napi_defer_hard_irqs, defer); + list_for_each_entry(napi, &netdev->napi_list, dev_list) + napi_set_defer_hard_irqs(napi, defer); + + for (i = 0; i < count; i++) + netdev->napi_config[i].defer_hard_irqs = defer; +} + +/** + * napi_get_gro_flush_timeout - get the gro_flush_timeout + * @n: napi struct to get the gro_flush_timeout from + * + * Return: the per-NAPI value of the gro_flush_timeout field. + */ +static inline unsigned long +napi_get_gro_flush_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->gro_flush_timeout); +} + +/** + * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi + * @n: napi struct to set the gro_flush_timeout + * @timeout: timeout value to set + * + * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout + */ +static inline void napi_set_gro_flush_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->gro_flush_timeout, timeout); +} + +/** + * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs + * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set + * @timeout: the timeout value to set + */ +static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, + unsigned long timeout) +{ + unsigned int count = max(netdev->num_rx_queues, + netdev->num_tx_queues); + struct napi_struct *napi; + int i; + + WRITE_ONCE(netdev->gro_flush_timeout, timeout); + list_for_each_entry(napi, &netdev->napi_list, dev_list) + napi_set_gro_flush_timeout(napi, timeout); + + for (i = 0; i < count; i++) + netdev->napi_config[i].gro_flush_timeout = timeout; +} + +/** + * napi_get_irq_suspend_timeout - get the irq_suspend_timeout + * @n: napi struct to get the irq_suspend_timeout from + * + * Return: the per-NAPI value of the irq_suspend_timeout field. 
+ */ +static inline unsigned long +napi_get_irq_suspend_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->irq_suspend_timeout); +} + +/** + * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi + * @n: napi struct to set the irq_suspend_timeout + * @timeout: timeout value to set + * + * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout + */ +static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->irq_suspend_timeout, timeout); +} + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) @@ -149,5 +323,60 @@ void xdp_do_check_flushed(struct napi_struct *napi); static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif -struct napi_struct *napi_by_id(unsigned int napi_id); +/* Best effort check that NAPI is not idle (can't be scheduled to run) */ +static inline void napi_assert_will_not_race(const struct napi_struct *napi) +{ + /* uninitialized instance, can't race */ + if (!napi->poll_list.next) + return; + + /* SCHED bit is set on disabled instances */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + WARN_ON(READ_ONCE(napi->list_owner) != -1); +} + +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); + +#define XMIT_RECURSION_LIMIT 8 + +#ifndef CONFIG_PREEMPT_RT +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} +#else +static inline bool dev_xmit_recursion(void) +{ + return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + current->net_xmit.recursion++; +} + +static inline void dev_xmit_recursion_dec(void) +{ + current->net_xmit.recursion--; +} +#endif + +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); +int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg); +int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg); + #endif diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index baa63dee2829..90716bd736f3 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -242,9 +242,9 @@ static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, __hw_addr_del_entry(from_list, ha, false, false); } -static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len) +int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; @@ -260,9 +260,10 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, } return err; } +EXPORT_SYMBOL(__hw_addr_sync_multiple); /* This function only works where there is a strict 1-1 relationship - * between source and destionation of they synch. If you ever need to + * between source and destination of they synch. If you ever need to * sync addresses to more then 1 destination, you need to use * __hw_addr_sync_multiple(). 
*/ @@ -299,8 +300,8 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, EXPORT_SYMBOL(__hw_addr_unsync); /** - * __hw_addr_sync_dev - Synchonize device's multicast list - * @list: address list to syncronize + * __hw_addr_sync_dev - Synchronize device's multicast list + * @list: address list to synchronize * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c index 4dbd0dc6aea2..8e1dba825e94 100644 --- a/net/core/dev_addr_lists_test.c +++ b/net/core/dev_addr_lists_test.c @@ -49,7 +49,6 @@ static int dev_addr_test_init(struct kunit *test) KUNIT_FAIL(test, "Can't register netdev %d", err); } - rtnl_lock(); return 0; } @@ -57,7 +56,6 @@ static void dev_addr_test_exit(struct kunit *test) { struct net_device *netdev = test->priv; - rtnl_unlock(); unregister_netdev(netdev); free_netdev(netdev); } @@ -67,6 +65,7 @@ static void dev_addr_test_basic(struct kunit *test) struct net_device *netdev = test->priv; u8 addr[ETH_ALEN]; + rtnl_lock(); KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr); memset(addr, 2, sizeof(addr)); @@ -76,6 +75,7 @@ static void dev_addr_test_basic(struct kunit *test) memset(addr, 3, sizeof(addr)); dev_addr_set(netdev, addr); KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr)); + rtnl_unlock(); } static void dev_addr_test_sync_one(struct kunit *test) @@ -86,6 +86,7 @@ static void dev_addr_test_sync_one(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); memset(addr, 1, sizeof(addr)); eth_hw_addr_set(netdev, addr); @@ -103,6 +104,7 @@ static void dev_addr_test_sync_one(struct kunit *test) * considered synced and we overwrite in place. */ KUNIT_EXPECT_EQ(test, 0, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_add_del(struct kunit *test) @@ -114,6 +116,7 @@ static void dev_addr_test_add_del(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); for (i = 1; i < 4; i++) { memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, @@ -143,6 +146,7 @@ static void dev_addr_test_add_del(struct kunit *test) __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, dev_addr_test_unsync); KUNIT_EXPECT_EQ(test, 1, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_del_main(struct kunit *test) @@ -150,6 +154,7 @@ static void dev_addr_test_del_main(struct kunit *test) struct net_device *netdev = test->priv; u8 addr[ETH_ALEN]; + rtnl_lock(); memset(addr, 1, sizeof(addr)); eth_hw_addr_set(netdev, addr); @@ -161,6 +166,7 @@ static void dev_addr_test_del_main(struct kunit *test) NETDEV_HW_ADDR_T_LAN)); KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, NETDEV_HW_ADDR_T_LAN)); + rtnl_unlock(); } static void dev_addr_test_add_set(struct kunit *test) @@ -172,6 +178,7 @@ static void dev_addr_test_add_set(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); /* There is no external API like dev_addr_add_excl(), * so shuffle the tree a little bit and exploit aliasing. 
*/ @@ -191,6 +198,7 @@ static void dev_addr_test_add_set(struct kunit *test) __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, dev_addr_test_unsync); KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_add_excl(struct kunit *test) @@ -199,6 +207,7 @@ static void dev_addr_test_add_excl(struct kunit *test) u8 addr[ETH_ALEN]; int i; + rtnl_lock(); for (i = 0; i < 10; i++) { memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr)); @@ -213,6 +222,7 @@ static void dev_addr_test_add_excl(struct kunit *test) memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); } + rtnl_unlock(); } static struct kunit_case dev_addr_test_cases[] = { diff --git a/net/core/dev_api.c b/net/core/dev_api.c new file mode 100644 index 000000000000..1bf0153195f2 --- /dev/null +++ b/net/core/dev_api.c @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/netdevice.h> +#include <net/netdev_lock.h> + +#include "dev.h" + +/** + * dev_change_name() - change name of a device + * @dev: device + * @newname: name (or format string) must be at least IFNAMSIZ + * + * Change name of a device, can pass format strings "eth%d". + * for wildcarding. + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_name(struct net_device *dev, const char *newname) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_name(dev, newname); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_set_alias() - change ifalias of a device + * @dev: device + * @alias: name up to IFALIASZ + * @len: limit of bytes to copy from info + * + * Set ifalias for a device. + * + * Return: 0 on success, -errno on failure. + */ +int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_alias(dev, alias, len); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_alias); + +/** + * dev_change_flags() - change device settings + * @dev: device + * @flags: device state flags + * @extack: netlink extended ack + * + * Change settings on device based state flags. The flags are + * in the userspace exported format. + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_flags(dev, flags, extack); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_change_flags); + +/** + * dev_set_group() - change group this device belongs to + * @dev: device + * @new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ + netdev_lock_ops(dev); + netif_set_group(dev, new_group); + netdev_unlock_ops(dev); +} + +int dev_set_mac_address_user(struct net_device *dev, + struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) +{ + int ret; + + down_write(&dev_addr_sem); + netdev_lock_ops(dev); + ret = netif_set_mac_address(dev, ss, extack); + netdev_unlock_ops(dev); + up_write(&dev_addr_sem); + + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address_user); + +/** + * dev_change_net_namespace() - move device to different nethost namespace + * @dev: device + * @net: network namespace + * @pat: If not NULL name pattern to try if the current device name + * is already taken in the destination network namespace. + * + * This function shuts down a device interface and moves it + * to a new network namespace. 
On success 0 is returned, on + * a failure a netagive errno code is returned. + * + * Callers must hold the rtnl semaphore. + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat) +{ + return __dev_change_net_namespace(dev, net, pat, 0, NULL); +} +EXPORT_SYMBOL_GPL(dev_change_net_namespace); + +/** + * dev_change_carrier() - change device carrier + * @dev: device + * @new_carrier: new value + * + * Change device carrier + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_carrier(struct net_device *dev, bool new_carrier) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_carrier(dev, new_carrier); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_change_tx_queue_len() - change TX queue length of a netdevice + * @dev: device + * @new_len: new tx queue length + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_tx_queue_len(dev, new_len); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_change_proto_down() - set carrier according to proto_down + * @dev: device + * @proto_down: new value + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_proto_down(struct net_device *dev, bool proto_down) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_proto_down(dev, proto_down); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_open() - prepare an interface for use + * @dev: device to open + * @extack: netlink extended ack + * + * Takes a device from down to up state. The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. + * + * Return: 0 on success, -errno on failure. + */ +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_open(dev, extack); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_open); + +/** + * dev_close() - shutdown an interface + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. + */ +void dev_close(struct net_device *dev) +{ + netdev_lock_ops(dev); + netif_close(dev); + netdev_unlock_ops(dev); +} +EXPORT_SYMBOL(dev_close); + +int dev_eth_ioctl(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int ret = -ENODEV; + + if (!ops->ndo_eth_ioctl) + return -EOPNOTSUPP; + + netdev_lock_ops(dev); + if (netif_device_present(dev)) + ret = ops->ndo_eth_ioctl(dev, ifr, cmd); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_eth_ioctl); + +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_mtu(dev, new_mtu); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_mtu); + +/** + * dev_disable_lro() - disable Large Receive Offload on a device + * @dev: device + * + * Disable Large Receive Offload (LRO) on a net device. Must be + * called under RTNL. 
This is needed if received packets may be + * forwarded to another interface. + */ +void dev_disable_lro(struct net_device *dev) +{ + netdev_lock_ops(dev); + netif_disable_lro(dev); + netdev_unlock_ops(dev); +} +EXPORT_SYMBOL(dev_disable_lro); + +/** + * dev_set_promiscuity() - update promiscuity count on a device + * @dev: device + * @inc: modifier + * + * Add or remove promiscuity from a device. While the count in the device + * remains above zero the interface remains promiscuous. Once it hits zero + * the device reverts back to normal filtering operation. A negative inc + * value is used to drop promiscuity on the device. + * Return 0 if successful or a negative errno code on error. + */ +int dev_set_promiscuity(struct net_device *dev, int inc) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_promiscuity(dev, inc); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_promiscuity); + +/** + * dev_set_allmulti() - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * + * Return: 0 on success, -errno on failure. + */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_allmulti(dev, inc, true); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_allmulti); + +/** + * dev_set_mac_address() - change Media Access Control Address + * @dev: device + * @ss: new address + * @extack: netlink extended ack + * + * Change the hardware (MAC) address of the device + * + * Return: 0 on success, -errno on failure. + */ +int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_mac_address(dev, ss, extack); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address); + +int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_xdp_propagate(dev, bpf); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL_GPL(dev_xdp_propagate); + +/** + * netdev_state_change() - device changes state + * @dev: device to cause notification + * + * Called to indicate a device has changed state. This function calls + * the notifier chains for netdev_chain and sends a NEWLINK message + * to the routing socket. + */ +void netdev_state_change(struct net_device *dev) +{ + netdev_lock_ops(dev); + netif_state_change(dev); + netdev_unlock_ops(dev); +} +EXPORT_SYMBOL(netdev_state_change); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 9a66cf5015f2..616479e71466 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -6,9 +6,11 @@ #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <linux/phylib_stubs.h> +#include <linux/ptp_clock_kernel.h> #include <linux/wireless.h> #include <linux/if_bridge.h> #include <net/dsa_stubs.h> +#include <net/netdev_lock.h> #include <net/wext.h> #include "dev.h" @@ -64,7 +66,7 @@ int dev_ifconf(struct net *net, struct ifconf __user *uifc) } /* Loop over the interfaces, and write an info block for each. 
*/ - rtnl_lock(); + rtnl_net_lock(net); for_each_netdev(net, dev) { if (!pos) done = inet_gifconf(dev, NULL, 0, size); @@ -72,12 +74,12 @@ int dev_ifconf(struct net *net, struct ifconf __user *uifc) done = inet_gifconf(dev, pos + total, len - total, size); if (done < 0) { - rtnl_unlock(); + rtnl_net_unlock(net); return -EFAULT; } total += done; } - rtnl_unlock(); + rtnl_net_unlock(net); return put_user(total, &uifc->ifc_len); } @@ -109,7 +111,7 @@ static int dev_getifmap(struct net_device *dev, struct ifreq *ifr) return 0; } -static int dev_setifmap(struct net_device *dev, struct ifreq *ifr) +static int netif_setifmap(struct net_device *dev, struct ifreq *ifr) { struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map; @@ -184,7 +186,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm return err; } -static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) +int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) { enum hwtstamp_tx_types tx_type; enum hwtstamp_rx_filters rx_filter; @@ -239,39 +241,38 @@ static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) return 0; } -static int dev_eth_ioctl(struct net_device *dev, - struct ifreq *ifr, unsigned int cmd) -{ - const struct net_device_ops *ops = dev->netdev_ops; - - if (!ops->ndo_eth_ioctl) - return -EOPNOTSUPP; - - if (!netif_device_present(dev)) - return -ENODEV; - - return ops->ndo_eth_ioctl(dev, ifr, cmd); -} - /** * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC * or of attached phylib PHY * @dev: Network device * @cfg: Timestamping configuration structure * - * Helper for enforcing a common policy that phylib timestamping, if available, - * should take precedence in front of hardware timestamping provided by the - * netdev. + * Helper for calling the default hardware provider timestamping. * * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and * there only exists a phydev->mii_ts->hwtstamp() method. So this will return * -EOPNOTSUPP for phylib for now, which is still more accurate than letting * the netdev handle the GET request. */ -static int dev_get_hwtstamp_phylib(struct net_device *dev, - struct kernel_hwtstamp_config *cfg) +int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) { - if (phy_has_hwtstamp(dev->phydev)) + struct hwtstamp_provider *hwprov; + + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + cfg->qualifier = hwprov->desc.qualifier; + if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && + hwprov->phydev) + return phy_hwtstamp_get(hwprov->phydev, cfg); + + if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); + + return -EOPNOTSUPP; + } + + if (phy_is_default_hwtstamp(dev->phydev)) return phy_hwtstamp_get(dev->phydev, cfg); return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); @@ -291,7 +292,9 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) return -ENODEV; kernel_cfg.ifr = ifr; + netdev_lock_ops(dev); err = dev_get_hwtstamp_phylib(dev, &kernel_cfg); + netdev_unlock_ops(dev); if (err) return err; @@ -319,28 +322,48 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) * should take precedence in front of hardware timestamping provided by the * netdev. 
If the netdev driver needs to perform specific actions even for PHY * timestamping to work properly (a switch port must trap the timestamped - * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in - * dev->priv_flags. + * frames and not forward them), it must set dev->see_all_hwtstamp_requests. */ int dev_set_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; - bool phy_ts = phy_has_hwtstamp(dev->phydev); struct kernel_hwtstamp_config old_cfg = {}; + struct hwtstamp_provider *hwprov; + struct phy_device *phydev; bool changed = false; + bool phy_ts; int err; + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && + hwprov->phydev) { + phy_ts = true; + phydev = hwprov->phydev; + } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) { + phy_ts = false; + } else { + return -EOPNOTSUPP; + } + + cfg->qualifier = hwprov->desc.qualifier; + } else { + phy_ts = phy_is_default_hwtstamp(dev->phydev); + if (phy_ts) + phydev = dev->phydev; + } + cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; - if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + if (phy_ts && dev->see_all_hwtstamp_requests) { err = ops->ndo_hwtstamp_get(dev, &old_cfg); if (err) return err; } - if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + if (!phy_ts || dev->see_all_hwtstamp_requests) { err = ops->ndo_hwtstamp_set(dev, cfg, extack); if (err) { if (extack->_msg) @@ -349,11 +372,11 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, } } - if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) + if (phy_ts && dev->see_all_hwtstamp_requests) changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); if (phy_ts) { - err = phy_hwtstamp_set(dev->phydev, cfg, extack); + err = phy_hwtstamp_set(phydev, cfg, extack); if (err) { if (changed) ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); @@ -363,7 +386,6 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, return 0; } -EXPORT_SYMBOL_GPL(dev_set_hwtstamp_phylib); static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { @@ -396,7 +418,9 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) if (!netif_device_present(dev)) return -ENODEV; + netdev_lock_ops(dev); err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack); + netdev_unlock_ops(dev); if (err) return err; @@ -471,10 +495,14 @@ static int dev_siocbond(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocbond) { + int ret = -ENODEV; + + netdev_lock_ops(dev); if (netif_device_present(dev)) - return ops->ndo_siocbond(dev, ifr, cmd); - else - return -ENODEV; + ret = ops->ndo_siocbond(dev, ifr, cmd); + netdev_unlock_ops(dev); + + return ret; } return -EOPNOTSUPP; @@ -486,10 +514,14 @@ static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr, const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocdevprivate) { + int ret = -ENODEV; + + netdev_lock_ops(dev); if (netif_device_present(dev)) - return ops->ndo_siocdevprivate(dev, ifr, data, cmd); - else - return -ENODEV; + ret = ops->ndo_siocdevprivate(dev, ifr, data, cmd); + netdev_unlock_ops(dev); + + return ret; } return -EOPNOTSUPP; @@ -500,17 +532,21 @@ static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs) const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocwandev) { + int ret = -ENODEV; + + 
netdev_lock_ops(dev); if (netif_device_present(dev)) - return ops->ndo_siocwandev(dev, ifs); - else - return -ENODEV; + ret = ops->ndo_siocwandev(dev, ifs); + netdev_unlock_ops(dev); + + return ret; } return -EOPNOTSUPP; } /* - * Perform the SIOCxIFxxx calls, inside rtnl_lock() + * Perform the SIOCxIFxxx calls, inside rtnl_net_lock() */ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, unsigned int cmd) @@ -518,7 +554,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); const struct net_device_ops *ops; - netdevice_tracker dev_tracker; if (!dev) return -ENODEV; @@ -537,9 +572,11 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return dev_set_mtu(dev, ifr->ifr_mtu); case SIOCSIFHWADDR: - if (dev->addr_len > sizeof(struct sockaddr)) + if (dev->addr_len > sizeof(ifr->ifr_hwaddr)) return -EINVAL; - return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL); + return dev_set_mac_address_user(dev, + (struct sockaddr_storage *)&ifr->ifr_hwaddr, + NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) @@ -547,11 +584,16 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, min(sizeof(ifr->ifr_hwaddr.sa_data_min), (size_t)dev->addr_len)); + netdev_lock_ops(dev); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + netdev_unlock_ops(dev); return 0; case SIOCSIFMAP: - return dev_setifmap(dev, ifr); + netdev_lock_ops(dev); + err = netif_setifmap(dev, ifr); + netdev_unlock_ops(dev); + return err; case SIOCADDMULTI: if (!ops->ndo_set_rx_mode || @@ -559,7 +601,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_lock_ops(dev); + err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_unlock_ops(dev); + return err; case SIOCDELMULTI: if (!ops->ndo_set_rx_mode || @@ -567,7 +612,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_lock_ops(dev); + err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_unlock_ops(dev); + return err; case SIOCSIFTXQLEN: if (ifr->ifr_qlen < 0) @@ -581,19 +629,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, case SIOCWANDEV: return dev_siocwandev(dev, &ifr->ifr_settings); - case SIOCBRADDIF: - case SIOCBRDELIF: - if (!netif_device_present(dev)) - return -ENODEV; - if (!netif_is_bridge_master(dev)) - return -EOPNOTSUPP; - netdev_hold(dev, &dev_tracker, GFP_KERNEL); - rtnl_unlock(); - err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); - netdev_put(dev, &dev_tracker); - rtnl_lock(); - return err; - case SIOCDEVPRIVATE ... 
SIOCDEVPRIVATE + 15: return dev_siocdevprivate(dev, ifr, data, cmd); @@ -737,9 +772,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, dev_load(net, ifr->ifr_name); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - rtnl_lock(); + + rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); - rtnl_unlock(); + rtnl_net_unlock(net); + if (colon) *colon = ':'; return ret; @@ -774,8 +811,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: case SIOCSHWTSTAMP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -783,9 +818,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: dev_load(net, ifr->ifr_name); - rtnl_lock(); + + rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); - rtnl_unlock(); + rtnl_net_unlock(net); + if (need_copyout) *need_copyout = false; return ret; @@ -808,9 +845,10 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15)) { dev_load(net, ifr->ifr_name); - rtnl_lock(); + + rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); - rtnl_unlock(); + rtnl_net_unlock(net); return ret; } return -ENOTTY; diff --git a/net/core/devmem.c b/net/core/devmem.c new file mode 100644 index 000000000000..b3a62ca0df65 --- /dev/null +++ b/net/core/devmem.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Devmem TCP + * + * Authors: Mina Almasry <almasrymina@google.com> + * Willem de Bruijn <willemdebruijn.kernel@gmail.com> + * Kaiyuan Zhang <kaiyuanz@google.com + */ + +#include <linux/dma-buf.h> +#include <linux/genalloc.h> +#include <linux/mm.h> +#include <linux/netdevice.h> +#include <linux/types.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> +#include <net/sock.h> +#include <trace/events/page_pool.h> + +#include "devmem.h" +#include "mp_dmabuf_devmem.h" +#include "page_pool_priv.h" + +/* Device memory support */ + +static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); + +static const struct memory_provider_ops dmabuf_devmem_ops; + +bool net_is_devmem_iov(struct net_iov *niov) +{ + return niov->type == NET_IOV_DMABUF; +} + +static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, + struct gen_pool_chunk *chunk, + void *not_used) +{ + struct dmabuf_genpool_chunk_owner *owner = chunk->owner; + + kvfree(owner->area.niovs); + kfree(owner); +} + +static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) +{ + struct dmabuf_genpool_chunk_owner *owner; + + owner = net_devmem_iov_to_chunk_owner(niov); + return owner->base_dma_addr + + ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); +} + +void __net_devmem_dmabuf_binding_free(struct work_struct *wq) +{ + struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); + + size_t size, avail; + + gen_pool_for_each_chunk(binding->chunk_pool, + net_devmem_dmabuf_free_chunk_owner, NULL); + + size = gen_pool_size(binding->chunk_pool); + avail = gen_pool_avail(binding->chunk_pool); + + if (!WARN(size != avail, "can't destroy genpool. 
size=%zu, avail=%zu", + size, avail)) + gen_pool_destroy(binding->chunk_pool); + + dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, + DMA_FROM_DEVICE); + dma_buf_detach(binding->dmabuf, binding->attachment); + dma_buf_put(binding->dmabuf); + xa_destroy(&binding->bound_rxqs); + kvfree(binding->tx_vec); + kfree(binding); +} +EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free); + +struct net_iov * +net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + struct dmabuf_genpool_chunk_owner *owner; + unsigned long dma_addr; + struct net_iov *niov; + ssize_t offset; + ssize_t index; + + dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE, + (void **)&owner); + if (!dma_addr) + return NULL; + + offset = dma_addr - owner->base_dma_addr; + index = offset / PAGE_SIZE; + niov = &owner->area.niovs[index]; + + niov->pp_magic = 0; + niov->pp = NULL; + atomic_long_set(&niov->pp_ref_count, 0); + + return niov; +} + +void net_devmem_free_dmabuf(struct net_iov *niov) +{ + struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov); + unsigned long dma_addr = net_devmem_get_dma_addr(niov); + + if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr, + PAGE_SIZE))) + return; + + gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE); +} + +void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + struct netdev_rx_queue *rxq; + unsigned long xa_idx; + unsigned int rxq_idx; + + xa_erase(&net_devmem_dmabuf_bindings, binding->id); + + /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the + * erase. + */ + synchronize_net(); + + if (binding->list.next) + list_del(&binding->list); + + xa_for_each(&binding->bound_rxqs, xa_idx, rxq) { + const struct pp_memory_provider_params mp_params = { + .mp_priv = binding, + .mp_ops = &dmabuf_devmem_ops, + }; + + rxq_idx = get_netdev_rx_queue_index(rxq); + + __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); + } + + net_devmem_dmabuf_binding_put(binding); +} + +int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack) +{ + struct pp_memory_provider_params mp_params = { + .mp_priv = binding, + .mp_ops = &dmabuf_devmem_ops, + }; + struct netdev_rx_queue *rxq; + u32 xa_idx; + int err; + + err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack); + if (err) + return err; + + rxq = __netif_get_rx_queue(dev, rxq_idx); + err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b, + GFP_KERNEL); + if (err) + goto err_close_rxq; + + return 0; + +err_close_rxq: + __net_mp_close_rxq(dev, rxq_idx, &mp_params); + return err; +} + +struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netdev_nl_sock *priv, + struct netlink_ext_ack *extack) +{ + struct net_devmem_dmabuf_binding *binding; + static u32 id_alloc_next; + struct scatterlist *sg; + struct dma_buf *dmabuf; + unsigned int sg_idx, i; + unsigned long virtual; + int err; + + dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR(dmabuf)) + return ERR_CAST(dmabuf); + + binding = kzalloc_node(sizeof(*binding), GFP_KERNEL, + dev_to_node(&dev->dev)); + if (!binding) { + err = -ENOMEM; + goto err_put_dmabuf; + } + + binding->dev = dev; + xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); + + refcount_set(&binding->ref, 1); + + mutex_init(&binding->lock); + + binding->dmabuf = dmabuf; + + binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); + 
if (IS_ERR(binding->attachment)) { + err = PTR_ERR(binding->attachment); + NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); + goto err_free_binding; + } + + binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, + direction); + if (IS_ERR(binding->sgt)) { + err = PTR_ERR(binding->sgt); + NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); + goto err_detach; + } + + if (direction == DMA_TO_DEVICE) { + binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE, + sizeof(struct net_iov *), + GFP_KERNEL); + if (!binding->tx_vec) { + err = -ENOMEM; + goto err_unmap; + } + } + + /* For simplicity we expect to make PAGE_SIZE allocations, but the + * binding can be much more flexible than that. We may be able to + * allocate MTU sized chunks here. Leave that for future work... + */ + binding->chunk_pool = gen_pool_create(PAGE_SHIFT, + dev_to_node(&dev->dev)); + if (!binding->chunk_pool) { + err = -ENOMEM; + goto err_tx_vec; + } + + virtual = 0; + for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) { + dma_addr_t dma_addr = sg_dma_address(sg); + struct dmabuf_genpool_chunk_owner *owner; + size_t len = sg_dma_len(sg); + struct net_iov *niov; + + owner = kzalloc_node(sizeof(*owner), GFP_KERNEL, + dev_to_node(&dev->dev)); + if (!owner) { + err = -ENOMEM; + goto err_free_chunks; + } + + owner->area.base_virtual = virtual; + owner->base_dma_addr = dma_addr; + owner->area.num_niovs = len / PAGE_SIZE; + owner->binding = binding; + + err = gen_pool_add_owner(binding->chunk_pool, dma_addr, + dma_addr, len, dev_to_node(&dev->dev), + owner); + if (err) { + kfree(owner); + err = -EINVAL; + goto err_free_chunks; + } + + owner->area.niovs = kvmalloc_array(owner->area.num_niovs, + sizeof(*owner->area.niovs), + GFP_KERNEL); + if (!owner->area.niovs) { + err = -ENOMEM; + goto err_free_chunks; + } + + for (i = 0; i < owner->area.num_niovs; i++) { + niov = &owner->area.niovs[i]; + niov->type = NET_IOV_DMABUF; + niov->owner = &owner->area; + page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), + net_devmem_get_dma_addr(niov)); + if (direction == DMA_TO_DEVICE) + binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov; + } + + virtual += len; + } + + err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, + binding, xa_limit_32b, &id_alloc_next, + GFP_KERNEL); + if (err < 0) + goto err_free_chunks; + + list_add(&binding->list, &priv->bindings); + + return binding; + +err_free_chunks: + gen_pool_for_each_chunk(binding->chunk_pool, + net_devmem_dmabuf_free_chunk_owner, NULL); + gen_pool_destroy(binding->chunk_pool); +err_tx_vec: + kvfree(binding->tx_vec); +err_unmap: + dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, + DMA_FROM_DEVICE); +err_detach: + dma_buf_detach(dmabuf, binding->attachment); +err_free_binding: + kfree(binding); +err_put_dmabuf: + dma_buf_put(dmabuf); + return ERR_PTR(err); +} + +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + struct net_devmem_dmabuf_binding *binding; + + rcu_read_lock(); + binding = xa_load(&net_devmem_dmabuf_bindings, id); + if (binding) { + if (!net_devmem_dmabuf_binding_get(binding)) + binding = NULL; + } + rcu_read_unlock(); + + return binding; +} + +void net_devmem_get_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov)); +} + +void net_devmem_put_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov)); +} + +struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, + unsigned int dmabuf_id) 
+{ + struct net_devmem_dmabuf_binding *binding; + struct dst_entry *dst = __sk_dst_get(sk); + int err = 0; + + binding = net_devmem_lookup_dmabuf(dmabuf_id); + if (!binding || !binding->tx_vec) { + err = -EINVAL; + goto out_err; + } + + /* The dma-addrs in this binding are only reachable to the corresponding + * net_device. + */ + if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) { + err = -ENODEV; + goto out_err; + } + + return binding; + +out_err: + if (binding) + net_devmem_dmabuf_binding_put(binding); + + return ERR_PTR(err); +} + +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, + size_t virt_addr, size_t *off, size_t *size) +{ + if (virt_addr >= binding->dmabuf->size) + return NULL; + + *off = virt_addr % PAGE_SIZE; + *size = PAGE_SIZE - *off; + + return binding->tx_vec[virt_addr / PAGE_SIZE]; +} + +/*** "Dmabuf devmem memory provider" ***/ + +int mp_dmabuf_devmem_init(struct page_pool *pool) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + + if (!binding) + return -EINVAL; + + /* dma-buf dma addresses do not need and should not be used with + * dma_sync_for_cpu/device. Force disable dma_sync. + */ + pool->dma_sync = false; + pool->dma_sync_for_cpu = false; + + if (pool->p.order != 0) + return -E2BIG; + + net_devmem_dmabuf_binding_get(binding); + return 0; +} + +netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + struct net_iov *niov; + netmem_ref netmem; + + niov = net_devmem_alloc_dmabuf(binding); + if (!niov) + return 0; + + netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); + return netmem; +} + +void mp_dmabuf_devmem_destroy(struct page_pool *pool) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + + net_devmem_dmabuf_binding_put(binding); +} + +bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) +{ + long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem)); + + if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + return false; + + if (WARN_ON_ONCE(refcount != 1)) + return false; + + page_pool_clear_pp_info(netmem); + + net_devmem_free_dmabuf(netmem_to_net_iov(netmem)); + + /* We don't want the page pool put_page()ing our net_iovs. */ + return false; +} + +static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + const struct net_devmem_dmabuf_binding *binding = mp_priv; + int type = rxq ? 
NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF; + + return nla_put_u32(rsp, type, binding->id); +} + +static void mp_dmabuf_devmem_uninstall(void *mp_priv, + struct netdev_rx_queue *rxq) +{ + struct net_devmem_dmabuf_binding *binding = mp_priv; + struct netdev_rx_queue *bound_rxq; + unsigned long xa_idx; + + xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) { + if (bound_rxq == rxq) { + xa_erase(&binding->bound_rxqs, xa_idx); + if (xa_empty(&binding->bound_rxqs)) { + mutex_lock(&binding->lock); + binding->dev = NULL; + mutex_unlock(&binding->lock); + } + break; + } + } +} + +static const struct memory_provider_ops dmabuf_devmem_ops = { + .init = mp_dmabuf_devmem_init, + .destroy = mp_dmabuf_devmem_destroy, + .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, + .release_netmem = mp_dmabuf_devmem_release_page, + .nl_fill = mp_dmabuf_devmem_nl_fill, + .uninstall = mp_dmabuf_devmem_uninstall, +}; diff --git a/net/core/devmem.h b/net/core/devmem.h new file mode 100644 index 000000000000..0a3b28ba5c13 --- /dev/null +++ b/net/core/devmem.h @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Device memory TCP support + * + * Authors: Mina Almasry <almasrymina@google.com> + * Willem de Bruijn <willemb@google.com> + * Kaiyuan Zhang <kaiyuanz@google.com> + * + */ +#ifndef _NET_DEVMEM_H +#define _NET_DEVMEM_H + +#include <net/netmem.h> +#include <net/netdev_netlink.h> + +struct netlink_ext_ack; + +struct net_devmem_dmabuf_binding { + struct dma_buf *dmabuf; + struct dma_buf_attachment *attachment; + struct sg_table *sgt; + struct net_device *dev; + struct gen_pool *chunk_pool; + /* Protect dev */ + struct mutex lock; + + /* The user holds a ref (via the netlink API) for as long as they want + * the binding to remain alive. Each page pool using this binding holds + * a ref to keep the binding alive. The page_pool does not release the + * ref until all the net_iovs allocated from this binding are released + * back to the page_pool. + * + * The binding undos itself and unmaps the underlying dmabuf once all + * those refs are dropped and the binding is no longer desired or in + * use. + * + * net_devmem_get_net_iov() on dmabuf net_iovs will increment this + * reference, making sure that the binding remains alive until all the + * net_iovs are no longer used. net_iovs allocated from this binding + * that are stuck in the TX path for any reason (such as awaiting + * retransmits) hold a reference to the binding until the skb holding + * them is freed. + */ + refcount_t ref; + + /* The list of bindings currently active. Used for netlink to notify us + * of the user dropping the bind. + */ + struct list_head list; + + /* rxq's this binding is active on. */ + struct xarray bound_rxqs; + + /* ID of this binding. Globally unique to all bindings currently + * active. + */ + u32 id; + + /* Array of net_iov pointers for this binding, sorted by virtual + * address. This array is convenient to map the virtual addresses to + * net_iovs in the TX path. + */ + struct net_iov **tx_vec; + + struct work_struct unbind_w; +}; + +#if defined(CONFIG_NET_DEVMEM) +/* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist + * entry from the dmabuf is inserted into the genpool as a chunk, and needs + * this owner struct to keep track of some metadata necessary to create + * allocations from this chunk. + */ +struct dmabuf_genpool_chunk_owner { + struct net_iov_area area; + struct net_devmem_dmabuf_binding *binding; + + /* dma_addr of the start of the chunk. 
*/ + dma_addr_t base_dma_addr; +}; + +void __net_devmem_dmabuf_binding_free(struct work_struct *wq); +struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netdev_nl_sock *priv, + struct netlink_ext_ack *extack); +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id); +void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); +int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack); +void net_devmem_bind_tx_release(struct sock *sk); + +static inline struct dmabuf_genpool_chunk_owner * +net_devmem_iov_to_chunk_owner(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct dmabuf_genpool_chunk_owner, area); +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_iov_binding(const struct net_iov *niov) +{ + return net_devmem_iov_to_chunk_owner(niov)->binding; +} + +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) +{ + return net_devmem_iov_binding(niov)->id; +} + +static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return owner->base_virtual + + ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); +} + +static inline bool +net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) +{ + return refcount_inc_not_zero(&binding->ref); +} + +static inline void +net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ + if (!refcount_dec_and_test(&binding->ref)) + return; + + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); +} + +void net_devmem_get_net_iov(struct net_iov *niov); +void net_devmem_put_net_iov(struct net_iov *niov); + +struct net_iov * +net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); +void net_devmem_free_dmabuf(struct net_iov *ppiov); + +bool net_is_devmem_iov(struct net_iov *niov); +struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size); + +#else +struct net_devmem_dmabuf_binding; + +static inline void +net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline void net_devmem_get_net_iov(struct net_iov *niov) +{ +} + +static inline void net_devmem_put_net_iov(struct net_iov *niov) +{ +} + +static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq) +{ +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, + struct netdev_nl_sock *priv, + struct netlink_ext_ack *extack) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + return NULL; +} + +static inline void +net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline int +net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack) + +{ + return -EOPNOTSUPP; +} + +static inline struct net_iov * +net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + return NULL; +} + +static inline void 
net_devmem_free_dmabuf(struct net_iov *ppiov) +{ +} + +static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) +{ + return 0; +} + +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) +{ + return 0; +} + +static inline bool net_is_devmem_iov(struct net_iov *niov) +{ + return false; +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size) +{ + return NULL; +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_iov_binding(const struct net_iov *niov) +{ + return NULL; +} +#endif + +#endif /* _NET_DEVMEM_H */ diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index b0f221d658be..60d31c2feed3 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -37,7 +37,7 @@ #include <trace/events/napi.h> #include <trace/events/devlink.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #define TRACE_ON 1 #define TRACE_OFF 0 @@ -74,7 +74,7 @@ struct net_dm_hw_entries { }; struct per_cpu_dm_data { - spinlock_t lock; /* Protects 'skb', 'hw_entries' and + raw_spinlock_t lock; /* Protects 'skb', 'hw_entries' and * 'send_timer' */ union { @@ -109,7 +109,8 @@ static u32 net_dm_queue_len = 1000; struct net_dm_alert_ops { void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, void *location, - enum skb_drop_reason reason); + enum skb_drop_reason reason, + struct sock *rx_sk); void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, int work, int budget); void (*work_item_func)(struct work_struct *work); @@ -168,9 +169,9 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) err: mod_timer(&data->send_timer, jiffies + HZ / 10); out: - spin_lock_irqsave(&data->lock, flags); + raw_spin_lock_irqsave(&data->lock, flags); swap(data->skb, skb); - spin_unlock_irqrestore(&data->lock, flags); + raw_spin_unlock_irqrestore(&data->lock, flags); if (skb) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; @@ -207,7 +208,7 @@ static void send_dm_alert(struct work_struct *work) */ static void sched_send_work(struct timer_list *t) { - struct per_cpu_dm_data *data = from_timer(data, t, send_timer); + struct per_cpu_dm_data *data = timer_container_of(data, t, send_timer); schedule_work(&data->dm_alert_work); } @@ -225,7 +226,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) local_irq_save(flags); data = this_cpu_ptr(&dm_cpu_data); - spin_lock(&data->lock); + raw_spin_lock(&data->lock); dskb = data->skb; if (!dskb) @@ -259,12 +260,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location) } out: - spin_unlock_irqrestore(&data->lock, flags); + raw_spin_unlock_irqrestore(&data->lock, flags); } static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, - enum skb_drop_reason reason) + enum skb_drop_reason reason, + struct sock *rx_sk) { trace_drop_common(skb, location); } @@ -314,9 +316,9 @@ net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) mod_timer(&hw_data->send_timer, jiffies + HZ / 10); } - spin_lock_irqsave(&hw_data->lock, flags); + raw_spin_lock_irqsave(&hw_data->lock, flags); swap(hw_data->hw_entries, hw_entries); - spin_unlock_irqrestore(&hw_data->lock, flags); + raw_spin_unlock_irqrestore(&hw_data->lock, flags); return hw_entries; } @@ -448,7 +450,7 @@ net_dm_hw_trap_summary_probe(void *ignore, 
const struct devlink *devlink, return; hw_data = this_cpu_ptr(&dm_hw_cpu_data); - spin_lock_irqsave(&hw_data->lock, flags); + raw_spin_lock_irqsave(&hw_data->lock, flags); hw_entries = hw_data->hw_entries; if (!hw_entries) @@ -477,7 +479,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, } out: - spin_unlock_irqrestore(&hw_data->lock, flags); + raw_spin_unlock_irqrestore(&hw_data->lock, flags); } static const struct net_dm_alert_ops net_dm_alert_summary_ops = { @@ -491,7 +493,8 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = { static void net_dm_packet_trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, - enum skb_drop_reason reason) + enum skb_drop_reason reason, + struct sock *rx_sk) { ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *data; @@ -1085,7 +1088,7 @@ err_module_put: struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; - del_timer_sync(&hw_data->send_timer); + timer_delete_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; @@ -1119,7 +1122,7 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; - del_timer_sync(&hw_data->send_timer); + timer_delete_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; @@ -1180,7 +1183,7 @@ err_module_put: struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; - del_timer_sync(&data->send_timer); + timer_delete_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); @@ -1208,7 +1211,7 @@ static void net_dm_trace_off_set(void) struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; - del_timer_sync(&data->send_timer); + timer_delete_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); @@ -1673,7 +1676,7 @@ static struct notifier_block dropmon_net_notifier = { static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data) { - spin_lock_init(&data->lock); + raw_spin_lock_init(&data->lock); skb_queue_head_init(&data->drop_queue); u64_stats_init(&data->stats.syncp); } @@ -1731,30 +1734,30 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family(&net_drop_monitor_family); - if (rc) { - pr_err("Could not create drop monitor netlink family\n"); - return rc; + for_each_possible_cpu(cpu) { + net_dm_cpu_data_init(cpu); + net_dm_hw_cpu_data_init(cpu); } - WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); + return rc; + } + + rc = genl_register_family(&net_drop_monitor_family); + if (rc) { + pr_err("Could not create drop monitor netlink family\n"); goto out_unreg; } + WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = 0; - for_each_possible_cpu(cpu) { - net_dm_cpu_data_init(cpu); - net_dm_hw_cpu_data_init(cpu); - } - goto out; out_unreg: - genl_unregister_family(&net_drop_monitor_family); + WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); out: return rc; } @@ -1763,19 +1766,18 @@ static void 
exit_net_drop_monitor(void) { int cpu; - BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); - /* * Because of the module_get/put we do in the trace state change path * we are guaranteed not to have any current users when we get here */ + BUG_ON(genl_unregister_family(&net_drop_monitor_family)); + + BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); for_each_possible_cpu(cpu) { net_dm_hw_cpu_data_fini(cpu); net_dm_cpu_data_fini(cpu); } - - BUG_ON(genl_unregister_family(&net_drop_monitor_family)); } module_init(init_net_drop_monitor); diff --git a/net/core/dst.c b/net/core/dst.c index 95f533844f17..795ca07e28a4 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -109,9 +109,6 @@ static void dst_destroy(struct dst_entry *dst) child = xdst->child; } #endif - if (!(dst->flags & DST_NOCOUNT)) - dst_entries_add(dst->ops, -1); - if (dst->ops->destroy) dst->ops->destroy(dst); netdev_put(dst->dev, &dst->dev_tracker); @@ -159,17 +156,35 @@ void dst_dev_put(struct dst_entry *dst) } EXPORT_SYMBOL(dst_dev_put); +static void dst_count_dec(struct dst_entry *dst) +{ + if (!(dst->flags & DST_NOCOUNT)) + dst_entries_add(dst->ops, -1); +} + void dst_release(struct dst_entry *dst) { - if (dst && rcuref_put(&dst->__rcuref)) + if (dst && rcuref_put(&dst->__rcuref)) { +#ifdef CONFIG_DST_CACHE + if (dst->flags & DST_METADATA) { + struct metadata_dst *md_dst = (struct metadata_dst *)dst; + + if (md_dst->type == METADATA_IP_TUNNEL) + dst_cache_reset_now(&md_dst->u.tun_info.dst_cache); + } +#endif + dst_count_dec(dst); call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); + } } EXPORT_SYMBOL(dst_release); void dst_release_immediate(struct dst_entry *dst) { - if (dst && rcuref_put(&dst->__rcuref)) + if (dst && rcuref_put(&dst->__rcuref)) { + dst_count_dec(dst); dst_destroy(dst); + } } EXPORT_SYMBOL(dst_release_immediate); @@ -279,7 +294,8 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, { struct metadata_dst *md_dst; - md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + md_dst = kmalloc(struct_size(md_dst, u.tun_info.options, optslen), + flags); if (!md_dst) return NULL; @@ -307,7 +323,8 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) int cpu; struct metadata_dst __percpu *md_dst; - md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, + md_dst = __alloc_percpu_gfp(struct_size(md_dst, u.tun_info.options, + optslen), __alignof__(struct metadata_dst), flags); if (!md_dst) return NULL; diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 0ccfd5fa5cb9..93a04d18e505 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -17,6 +17,7 @@ struct dst_cache_pcpu { unsigned long refresh_ts; struct dst_entry *dst; + local_lock_t bh_lock; u32 cookie; union { struct in_addr in_saddr; @@ -27,6 +28,7 @@ struct dst_cache_pcpu { static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, struct dst_entry *dst, u32 cookie) { + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst_release(dst_cache->dst); if (dst) dst_hold(dst); @@ -40,6 +42,7 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, { struct dst_entry *dst; + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst = idst->dst; if (!dst) goto fail; @@ -47,7 +50,8 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, /* the cache already hold a dst reference; it can't go away */ dst_hold(dst); - if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || + if (unlikely(!time_after(idst->refresh_ts, + 
READ_ONCE(dst_cache->reset_ts)) || (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { dst_cache_per_cpu_dst_set(idst, NULL, 0); dst_release(dst); @@ -62,10 +66,15 @@ fail: struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) { + struct dst_entry *dst; + if (!dst_cache->cache) return NULL; - return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_lock_nested_bh(&dst_cache->cache->bh_lock); + dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst; } EXPORT_SYMBOL_GPL(dst_cache_get); @@ -77,13 +86,17 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) if (!dst_cache->cache) return NULL; + local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; + } *saddr = idst->in_saddr.s_addr; - return container_of(dst, struct rtable, dst); + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); @@ -95,9 +108,11 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, if (!dst_cache->cache) return; + local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, 0); idst->in_saddr.s_addr = saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip4); @@ -110,10 +125,13 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, if (!dst_cache->cache) return; + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); - dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, - rt6_get_cookie((struct rt6_info *)dst)); + dst_cache_per_cpu_dst_set(idst, dst, + rt6_get_cookie(dst_rt6_info(dst))); idst->in6_saddr = *saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); @@ -126,12 +144,17 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, if (!dst_cache->cache) return NULL; + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; + } *saddr = idst->in6_saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst; } EXPORT_SYMBOL_GPL(dst_cache_get_ip6); @@ -139,10 +162,14 @@ EXPORT_SYMBOL_GPL(dst_cache_get_ip6); int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) { + unsigned int i; + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, gfp | __GFP_ZERO); if (!dst_cache->cache) return -ENOMEM; + for_each_possible_cpu(i) + local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock); dst_cache_reset(dst_cache); return 0; @@ -170,7 +197,7 @@ void dst_cache_reset_now(struct dst_cache *dst_cache) if (!dst_cache->cache) return; - dst_cache->reset_ts = jiffies; + dst_cache_reset(dst_cache); for_each_possible_cpu(i) { struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i); struct dst_entry *dst = idst->dst; diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index fc96259807b6..5cdca49b1d7c 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -43,7 +43,6 @@ static unsigned int fib_seq_sum(struct net *net) struct fib_notifier_ops *ops; unsigned int fib_seq = 0; - 
rtnl_lock(); rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) @@ -52,7 +51,6 @@ static unsigned int fib_seq_sum(struct net *net) module_put(ops->owner); } rcu_read_unlock(); - rtnl_unlock(); return fib_seq; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 3f933ffcefc3..8ca634964e36 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -11,6 +11,7 @@ #include <linux/list.h> #include <linux/module.h> #include <net/net_namespace.h> +#include <net/inet_dscp.h> #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> @@ -36,8 +37,8 @@ static const struct fib_kuid_range fib_kuid_range_unset = { bool fib_rule_matchall(const struct fib_rule *rule) { - if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || - rule->flags) + if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) || + rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; @@ -72,7 +73,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; - /* The lock is not required here, the list in unreacheable + /* The lock is not required here, the list in unreachable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); return 0; @@ -100,7 +101,8 @@ static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); -static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) +static struct fib_rules_ops *lookup_rules_ops(const struct net *net, + int family) { struct fib_rules_ops *ops; @@ -255,16 +257,36 @@ static int nla_put_port_range(struct sk_buff *skb, int attrtype, return nla_put(skb, attrtype, sizeof(*range), range); } +static bool fib_rule_iif_match(const struct fib_rule *rule, int iifindex, + const struct flowi *fl) +{ + u8 iif_is_l3_master = READ_ONCE(rule->iif_is_l3_master); + + return iif_is_l3_master ? l3mdev_fib_rule_iif_match(fl, iifindex) : + fl->flowi_iif == iifindex; +} + +static bool fib_rule_oif_match(const struct fib_rule *rule, int oifindex, + const struct flowi *fl) +{ + u8 oif_is_l3_master = READ_ONCE(rule->oif_is_l3_master); + + return oif_is_l3_master ? 
l3mdev_fib_rule_oif_match(fl, oifindex) : + fl->flowi_oif == oifindex; +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { - int ret = 0; + int iifindex, oifindex, ret = 0; - if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) + iifindex = READ_ONCE(rule->iifindex); + if (iifindex && !fib_rule_iif_match(rule, iifindex, fl)) goto out; - if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) + oifindex = READ_ONCE(rule->oifindex); + if (oifindex && !fib_rule_oif_match(rule, oifindex, fl)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) @@ -369,7 +391,10 @@ static int call_fib_rule_notifiers(struct net *net, .rule = rule, }; - ops->fib_rules_seq++; + ASSERT_RTNL_NET(net); + + /* Paired with READ_ONCE() in fib_rules_seq() */ + WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1); return call_fib_notifiers(net, event_type, &info.info); } @@ -396,17 +421,16 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, } EXPORT_SYMBOL_GPL(fib_rules_dump); -unsigned int fib_rules_seq_read(struct net *net, int family) +unsigned int fib_rules_seq_read(const struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; - ASSERT_RTNL(); - ops = lookup_rules_ops(net, family); if (!ops) return 0; - fib_rules_seq = ops->fib_rules_seq; + /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */ + fib_rules_seq = READ_ONCE(ops->fib_rules_seq); rules_ops_put(ops); return fib_rules_seq; @@ -456,9 +480,6 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, if (rule->tun_id && r->tun_id != rule->tun_id) continue; - if (r->fr_net != rule->fr_net) - continue; - if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; @@ -478,11 +499,17 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, &rule->sport_range)) continue; + if (rule->sport_mask && r->sport_mask != rule->sport_mask) + continue; + if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; + if (rule->dport_mask && r->dport_mask != rule->dport_mask) + continue; + if (!ops->compare(r, frh, tb)) continue; return r; @@ -512,14 +539,40 @@ static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, } #endif -static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, +static int fib_nl2rule_port_mask(const struct nlattr *mask_attr, + const struct fib_rule_port_range *range, + u16 *port_mask, + struct netlink_ext_ack *extack) +{ + if (!fib_rule_port_range_valid(range)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, + "Cannot specify port mask without port value"); + return -EINVAL; + } + + if (fib_rule_port_is_range(range)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, + "Cannot specify port mask for port range"); + return -EINVAL; + } + + if (range->start & ~nla_get_u16(mask_attr)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask"); + return -EINVAL; + } + + *port_mask = nla_get_u16(mask_attr); + + return 0; +} + +static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, struct nlattr *tb[], struct fib_rule **rule, bool *user_priority) { - struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rule *nlrule = NULL; int err = -EINVAL; @@ -551,31 +604,18 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[FRA_PRIORITY]) { nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); 
*user_priority = true; - } else { - nlrule->pref = fib_default_rule_pref(ops); } - nlrule->proto = tb[FRA_PROTOCOL] ? - nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; + nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC); if (tb[FRA_IIFNAME]) { - struct net_device *dev; - nlrule->iifindex = -1; nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, nlrule->iifname); - if (dev) - nlrule->iifindex = dev->ifindex; } if (tb[FRA_OIFNAME]) { - struct net_device *dev; - nlrule->oifindex = -1; nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, nlrule->oifname); - if (dev) - nlrule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { @@ -617,11 +657,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, } nlrule->target = nla_get_u32(tb[FRA_GOTO]); - /* Backward jumps are prohibited to avoid endless loops */ - if (nlrule->target <= nlrule->pref) { - NL_SET_ERR_MSG(extack, "Backward goto not supported"); - goto errout_free; - } } else if (nlrule->action == FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; @@ -660,6 +695,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } + if (!fib_rule_port_is_range(&nlrule->sport_range)) + nlrule->sport_mask = U16_MAX; + } + + if (tb[FRA_SPORT_MASK]) { + err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK], + &nlrule->sport_range, + &nlrule->sport_mask, extack); + if (err) + goto errout_free; } if (tb[FRA_DPORT_RANGE]) { @@ -669,6 +714,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } + if (!fib_rule_port_is_range(&nlrule->dport_range)) + nlrule->dport_mask = U16_MAX; + } + + if (tb[FRA_DPORT_MASK]) { + err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK], + &nlrule->dport_range, + &nlrule->dport_mask, extack); + if (err) + goto errout_free; } *rule = nlrule; @@ -681,6 +736,43 @@ errout: return err; } +static int fib_nl2rule_rtnl(struct fib_rule *nlrule, + struct fib_rules_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + if (!tb[FRA_PRIORITY]) + nlrule->pref = fib_default_rule_pref(ops); + + /* Backward jumps are prohibited to avoid endless loops */ + if (tb[FRA_GOTO] && nlrule->target <= nlrule->pref) { + NL_SET_ERR_MSG(extack, "Backward goto not supported"); + return -EINVAL; + } + + if (tb[FRA_IIFNAME]) { + struct net_device *dev; + + dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname); + if (dev) { + nlrule->iifindex = dev->ifindex; + nlrule->iif_is_l3_master = netif_is_l3_master(dev); + } + } + + if (tb[FRA_OIFNAME]) { + struct net_device *dev; + + dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname); + if (dev) { + nlrule->oifindex = dev->ifindex; + nlrule->oif_is_l3_master = netif_is_l3_master(dev); + } + } + + return 0; +} + static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule) { @@ -717,9 +809,6 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, if (r->tun_id != rule->tun_id) continue; - if (r->fr_net != rule->fr_net) - continue; - if (r->l3mdev != rule->l3mdev) continue; @@ -737,10 +826,16 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, &rule->sport_range)) continue; + if (r->sport_mask != rule->sport_mask) + continue; + if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) 
continue; + if (r->dport_mask != rule->dport_mask) + continue; + if (!ops->compare(r, frh, tb)) continue; return 1; @@ -766,21 +861,27 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_PROTOCOL] = { .type = NLA_U8 }, [FRA_IP_PROTO] = { .type = NLA_U8 }, [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, - [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) } + [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, + [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), + [FRA_FLOWLABEL] = { .type = NLA_BE32 }, + [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 }, + [FRA_SPORT_MASK] = { .type = NLA_U16 }, + [FRA_DPORT_MASK] = { .type = NLA_U16 }, + [FRA_DSCP_MASK] = NLA_POLICY_MASK(NLA_U8, INET_DSCP_MASK >> 2), }; -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held) { - struct net *net = sock_net(skb->sk); - struct fib_rule_hdr *frh = nlmsg_data(nlh); - struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *last = NULL; - struct nlattr *tb[FRA_MAX + 1]; int err = -EINVAL, unresolved = 0; + struct fib_rules_ops *ops = NULL; + struct nlattr *tb[FRA_MAX + 1]; bool user_priority = false; + struct fib_rule_hdr *frh; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } @@ -799,10 +900,17 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); + err = fib_nl2rule(net, nlh, extack, ops, tb, &rule, &user_priority); if (err) goto errout; + if (!rtnl_held) + rtnl_net_lock(net); + + err = fib_nl2rule_rtnl(rule, ops, tb, extack); + if (err) + goto errout_free; + if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; @@ -864,31 +972,45 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (rule->tun_id) ip_tunnel_need_metadata(); + fib_rule_get(rule); + + if (!rtnl_held) + rtnl_net_unlock(net); + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); + fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: + if (!rtnl_held) + rtnl_net_unlock(net); kfree(rule); errout: rules_ops_put(ops); return err; } -EXPORT_SYMBOL_GPL(fib_nl_newrule); +EXPORT_SYMBOL_GPL(fib_newrule); -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); - struct fib_rule_hdr *frh = nlmsg_data(nlh); + return fib_newrule(sock_net(skb->sk), skb, nlh, extack, false); +} + +int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held) +{ + struct fib_rule *rule = NULL, *nlrule = NULL; struct fib_rules_ops *ops = NULL; - struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; - int err = -EINVAL; bool user_priority = false; + struct fib_rule_hdr *frh; + int err = -EINVAL; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } @@ -907,25 +1029,32 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, 
goto errout; } - err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); + err = fib_nl2rule(net, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) goto errout; + if (!rtnl_held) + rtnl_net_lock(net); + + err = fib_nl2rule_rtnl(nlrule, ops, tb, extack); + if (err) + goto errout_free; + rule = rule_find(ops, frh, tb, nlrule, user_priority); if (!rule) { err = -ENOENT; - goto errout; + goto errout_free; } if (rule->flags & FIB_RULE_PERMANENT) { err = -EPERM; - goto errout; + goto errout_free; } if (ops->delete) { err = ops->delete(rule); if (err) - goto errout; + goto errout_free; } if (rule->tun_id) @@ -947,7 +1076,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { - struct fib_rule *n; + struct fib_rule *n, *r; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) @@ -961,22 +1090,33 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } } - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, - NULL); - notify_rule_change(RTM_DELRULE, rule, ops, nlh, - NETLINK_CB(skb).portid); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); + + if (!rtnl_held) + rtnl_net_unlock(net); + + notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); kfree(nlrule); return 0; -errout: +errout_free: + if (!rtnl_held) + rtnl_net_unlock(net); kfree(nlrule); +errout: rules_ops_put(ops); return err; } -EXPORT_SYMBOL_GPL(fib_nl_delrule); +EXPORT_SYMBOL_GPL(fib_delrule); + +static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return fib_delrule(sock_net(skb->sk), skb, nlh, extack, false); +} static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) @@ -995,7 +1135,9 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ - + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ + + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */ + + nla_total_size(2) /* FRA_SPORT_MASK */ + + nla_total_size(2); /* FRA_DPORT_MASK */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -1036,14 +1178,14 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; - if (rule->iifindex == -1) + if (READ_ONCE(rule->iifindex) == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; - if (rule->oifindex == -1) + if (READ_ONCE(rule->oifindex) == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } @@ -1063,8 +1205,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || + (rule->sport_mask && nla_put_u16(skb, FRA_SPORT_MASK, + rule->sport_mask)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || + (rule->dport_mask && nla_put_u16(skb, FRA_DPORT_MASK, + rule->dport_mask)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, 
rule->ip_proto))) goto nla_put_failure; @@ -1116,12 +1262,12 @@ static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, { struct fib_rule_hdr *frh; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } - frh = nlmsg_data(nlh); if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, @@ -1142,10 +1288,10 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; - int idx = 0, family; + int err, idx = 0, family; if (cb->strict_check) { - int err = fib_valid_dumprule_req(nlh, cb->extack); + err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; @@ -1158,17 +1304,17 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) if (ops == NULL) return -EAFNOSUPPORT; - dump_rules(skb, cb, ops); - - return skb->len; + return dump_rules(skb, cb, ops); } + err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; - if (dump_rules(skb, cb, ops) < 0) + err = dump_rules(skb, cb, ops); + if (err < 0) break; cb->args[1] = 0; @@ -1178,7 +1324,7 @@ skip: rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static void notify_rule_change(int event, struct fib_rule *rule, @@ -1205,8 +1351,7 @@ static void notify_rule_change(int event, struct fib_rule *rule, rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); return; errout: - if (err < 0) - rtnl_set_sk_err(net, ops->nlgroup, err); + rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) @@ -1215,11 +1360,17 @@ static void attach_rules(struct list_head *rules, struct net_device *dev) list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && - strcmp(dev->name, rule->iifname) == 0) - rule->iifindex = dev->ifindex; + strcmp(dev->name, rule->iifname) == 0) { + WRITE_ONCE(rule->iifindex, dev->ifindex); + WRITE_ONCE(rule->iif_is_l3_master, + netif_is_l3_master(dev)); + } if (rule->oifindex == -1 && - strcmp(dev->name, rule->oifname) == 0) - rule->oifindex = dev->ifindex; + strcmp(dev->name, rule->oifname) == 0) { + WRITE_ONCE(rule->oifindex, dev->ifindex); + WRITE_ONCE(rule->oif_is_l3_master, + netif_is_l3_master(dev)); + } } } @@ -1228,10 +1379,14 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) struct fib_rule *rule; list_for_each_entry(rule, rules, list) { - if (rule->iifindex == dev->ifindex) - rule->iifindex = -1; - if (rule->oifindex == dev->ifindex) - rule->oifindex = -1; + if (rule->iifindex == dev->ifindex) { + WRITE_ONCE(rule->iifindex, -1); + WRITE_ONCE(rule->iif_is_l3_master, false); + } + if (rule->oifindex == dev->ifindex) { + WRITE_ONCE(rule->oifindex, -1); + WRITE_ONCE(rule->oif_is_l3_master, false); + } } } @@ -1288,12 +1443,20 @@ static struct pernet_operations fib_rules_net_ops = { .exit = fib_rules_net_exit, }; +static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule, + .flags = 
RTNL_FLAG_DUMP_UNLOCKED}, +}; + static int __init fib_rules_init(void) { int err; - rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); + + rtnl_register_many(fib_rules_rtnl_msg_handlers); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) @@ -1308,9 +1471,7 @@ static int __init fib_rules_init(void) fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: - rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); - rtnl_unregister(PF_UNSPEC, RTM_DELRULE); - rtnl_unregister(PF_UNSPEC, RTM_GETRULE); + rtnl_unregister_many(fib_rules_rtnl_msg_handlers); return err; } diff --git a/net/core/filter.c b/net/core/filter.c index 8adf95765cdd..7a72f766aacf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -42,7 +42,7 @@ #include <linux/errno.h> #include <linux/timer.h> #include <linux/uaccess.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/filter.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> @@ -84,9 +84,13 @@ #include <net/netkit.h> #include <linux/un.h> #include <net/xdp_sock_drv.h> +#include <net/inet_dscp.h> #include "dev.h" +/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ +static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); + static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -214,24 +218,36 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } +static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset) +{ + if (likely(offset >= 0)) + return offset; + + if (offset >= SKF_NET_OFF) + return offset - SKF_NET_OFF + skb_network_offset(skb); + + if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb)) + return offset - SKF_LL_OFF + skb_mac_offset(skb); + + return INT_MIN; +} + BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - u8 tmp, *ptr; + u8 tmp; const int len = sizeof(tmp); - if (offset >= 0) { - if (headlen - offset >= len) - return *(u8 *)(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return tmp; - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return *(u8 *)ptr; - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, @@ -244,21 +260,19 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - __be16 tmp, *ptr; + __be16 tmp; const int len = sizeof(tmp); - if (offset >= 0) { - if (headlen - offset >= len) - return get_unaligned_be16(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return be16_to_cpu(tmp); - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return get_unaligned_be16(ptr); - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); 
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, @@ -271,21 +285,19 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - __be32 tmp, *ptr; + __be32 tmp; const int len = sizeof(tmp); - if (likely(offset >= 0)) { - if (headlen - offset >= len) - return get_unaligned_be32(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return be32_to_cpu(tmp); - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return get_unaligned_be32(ptr); - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, @@ -1262,8 +1274,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) * so we need to keep the user BPF around until the 2nd * pass. At this time, the user BPF is stored in fp->insns. */ - old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), - GFP_KERNEL | __GFP_NOWARN); + old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter), + GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; @@ -1650,18 +1662,14 @@ void sk_reuseport_prog_free(struct bpf_prog *prog) bpf_prog_destroy(prog); } -struct bpf_scratchpad { - union { - __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; - u8 buff[MAX_BPF_STACK]; - }; -}; - -static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); - static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { +#ifdef CONFIG_DEBUG_NET + /* Avoid a splat in pskb_may_pull_reason() */ + if (write_len > INT_MAX) + return -EINVAL; +#endif return skb_ensure_writable(skb, write_len); } @@ -1960,10 +1968,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; + bool is_ipv6 = flags & BPF_F_IPV6; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | - BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; @@ -1979,7 +1988,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, if (unlikely(from != 0)) return -EINVAL; - inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); @@ -2010,10 +2019,6 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = { BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, __be32 *, to, u32, to_size, __wsum, seed) { - struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); - u32 diff_size = from_size + to_size; - int i, j = 0; - /* This is quite flexible, some examples: * * from_size == 0, to_size > 0, seed := csum --> pushing data @@ -2022,16 +2027,19 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, * * Even for diffing, from_size and to_size don't need to be equal. 
*/ - if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || - diff_size > sizeof(sp->diff))) - return -EINVAL; - for (i = 0; i < from_size / sizeof(__be32); i++, j++) - sp->diff[j] = ~from[i]; - for (i = 0; i < to_size / sizeof(__be32); i++, j++) - sp->diff[j] = to[i]; + __wsum ret = seed; - return csum_partial(sp->diff, diff_size, seed); + if (from_size && to_size) + ret = csum_sub(csum_partial(to, to_size, ret), + csum_partial(from, from_size, 0)); + else if (to_size) + ret = csum_partial(to, to_size, ret); + + else if (from_size) + ret = ~csum_partial(from, from_size, ~ret); + + return csum_from32to16((__force unsigned int)ret); } static const struct bpf_func_proto bpf_csum_diff_proto = { @@ -2215,7 +2223,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, rcu_read_lock(); if (!nh) { dst = skb_dst(skb); - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); } else { nexthop = &nh->ipv6_nh; @@ -2233,7 +2241,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, rcu_read_unlock(); return ret; } - rcu_read_unlock_bh(); + rcu_read_unlock(); if (dst) IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: @@ -2271,12 +2279,12 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; @@ -2314,8 +2322,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, rcu_read_lock(); if (!nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); + struct rtable *rt = skb_rtable(skb); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { @@ -2357,7 +2364,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, @@ -2378,12 +2385,12 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; @@ -2423,9 +2430,9 @@ out: /* Internal, non-exposed redirect flags. 
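[Editor's note] The bpf_csum_diff() rewrite above drops the per-CPU scratch buffer and works directly in ones'-complement arithmetic: sum the "to" buffer, subtract the "from" buffer, then fold the 32-bit result to 16 bits. A minimal userspace model of that arithmetic, not the kernel csum helpers themselves:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static uint32_t csum_add32(uint32_t a, uint32_t b)
{
        uint64_t s = (uint64_t)a + b;
        return (uint32_t)(s + (s >> 32));      /* end-around carry */
}

static uint32_t csum_words(const uint32_t *p, size_t words, uint32_t seed)
{
        while (words--)
                seed = csum_add32(seed, *p++);
        return seed;
}

static uint16_t csum_fold16(uint32_t sum)
{
        sum = (sum & 0xffff) + (sum >> 16);
        sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)sum;
}

int main(void)
{
        uint32_t from[1] = { 0x0a000001 };     /* old field value */
        uint32_t to[1]   = { 0x0a000002 };     /* new field value */
        /* diff = csum(to) - csum(from), as the helper returns it */
        uint32_t diff = csum_add32(csum_words(to, 1, 0),
                                   ~csum_words(from, 1, 0));
        printf("0x%04x\n", csum_fold16(diff));
        return 0;
}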
*/ enum { - BPF_F_NEIGH = (1ULL << 1), - BPF_F_PEER = (1ULL << 2), - BPF_F_NEXTHOP = (1ULL << 3), + BPF_F_NEIGH = (1ULL << 16), + BPF_F_PEER = (1ULL << 17), + BPF_F_NEXTHOP = (1ULL << 18), #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; @@ -2435,6 +2442,8 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) struct sk_buff *clone; int ret; + BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS); + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; @@ -2469,9 +2478,6 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; -DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); -EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); - static struct net_device *skb_get_peer_dev(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; @@ -2484,7 +2490,7 @@ static struct net_device *skb_get_peer_dev(struct net_device *dev) int skb_do_redirect(struct sk_buff *skb) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; @@ -2504,6 +2510,7 @@ int skb_do_redirect(struct sk_buff *skb) goto out_drop; skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); + skb_scrub_packet(skb, false); return -EAGAIN; } return flags & BPF_F_NEIGH ? @@ -2517,7 +2524,7 @@ out_drop: BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return TC_ACT_SHOT; @@ -2538,7 +2545,7 @@ static const struct bpf_func_proto bpf_redirect_proto = { BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return TC_ACT_SHOT; @@ -2560,7 +2567,7 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, int, plen, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; @@ -2607,18 +2614,16 @@ BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) static void sk_msg_reset_curr(struct sk_msg *msg) { - u32 i = msg->sg.start; - u32 len = 0; - - do { - len += sk_msg_elem(msg, i)->length; - sk_msg_iter_var_next(i); - if (len >= msg->sg.size) - break; - } while (i != msg->sg.end); + if (!msg->sg.size) { + msg->sg.curr = msg->sg.start; + msg->sg.copybreak = 0; + } else { + u32 i = msg->sg.end; - msg->sg.curr = i; - msg->sg.copybreak = 0; + sk_msg_iter_var_prev(i); + msg->sg.curr = i; + msg->sg.copybreak = msg->sg.data[i].length; + } } static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { @@ -2781,7 +2786,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); } while (i != msg->sg.end); - if (start >= offset + l) + if (start > offset + l) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); @@ -2806,6 +2811,8 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, raw = page_address(page); + if (i == msg->sg.end) + sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); front = start - offset; back = 
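[Editor's note] The redirect helpers touched above now take their scratch state from bpf_net_ctx_get_ri() instead of the removed per-CPU bpf_redirect_info, and the peer path additionally scrubs the skb when it crosses a netns. Program-side usage is unchanged; a minimal tc sketch, where the peer ifindex is an assumption for the example:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define PEER_IFINDEX 4  /* assumed ifindex of the veth/netkit peer device */

SEC("tc")
int redirect_to_peer(struct __sk_buff *skb)
{
        /* flags must currently be zero for bpf_redirect_peer() */
        return bpf_redirect_peer(PEER_IFINDEX, 0);
}

char _license[] SEC("license") = "GPL";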
psge->length - front; @@ -2822,7 +2829,13 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, } put_page(sg_page(psge)); - } else if (start - offset) { + new = i; + goto place_new; + } + + if (start - offset) { + if (i == msg->sg.end) + sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); @@ -2833,39 +2846,44 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); sg_unmark_end(psge); sg_unmark_end(&rsge); - sk_msg_iter_next(msg, end); } /* Slot(s) to place newly allocated data */ + sk_msg_iter_next(msg, end); new = i; + sk_msg_iter_var_next(i); + + if (i == msg->sg.end) { + if (!rsge.length) + goto place_new; + sk_msg_iter_next(msg, end); + goto place_new; + } /* Shift one or two slots as needed */ - if (!copy) { - sge = sk_msg_elem_cpy(msg, i); + sge = sk_msg_elem_cpy(msg, new); + sg_unmark_end(&sge); + nsge = sk_msg_elem_cpy(msg, i); + if (rsge.length) { sk_msg_iter_var_next(i); - sg_unmark_end(&sge); + nnsge = sk_msg_elem_cpy(msg, i); sk_msg_iter_next(msg, end); + } - nsge = sk_msg_elem_cpy(msg, i); + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sge = nsge; + sk_msg_iter_var_next(i); if (rsge.length) { - sk_msg_iter_var_next(i); + nsge = nnsge; nnsge = sk_msg_elem_cpy(msg, i); - } - - while (i != msg->sg.end) { - msg->sg.data[i] = sge; - sge = nsge; - sk_msg_iter_var_next(i); - if (rsge.length) { - nsge = nnsge; - nnsge = sk_msg_elem_cpy(msg, i); - } else { - nsge = sk_msg_elem_cpy(msg, i); - } + } else { + nsge = sk_msg_elem_cpy(msg, i); } } +place_new: /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; @@ -2894,8 +2912,10 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { static void sk_msg_shift_left(struct sk_msg *msg, int i) { + struct scatterlist *sge = sk_msg_elem(msg, i); int prev; + put_page(sg_page(sge)); do { prev = i; sk_msg_iter_var_next(i); @@ -2932,6 +2952,9 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, if (unlikely(flags)) return -EINVAL; + if (unlikely(len == 0)) + return 0; + /* First find the starting scatterlist element */ i = msg->sg.start; do { @@ -2944,7 +2967,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, } while (i != msg->sg.end); /* Bounds checks: start and pop must be inside message */ - if (start >= offset + l || last >= msg->sg.size) + if (start >= offset + l || last > msg->sg.size) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); @@ -2973,12 +2996,12 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, */ if (start != offset) { struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); - int a = start; + int a = start - offset; int b = sge->length - pop - a; sk_msg_iter_var_next(i); - if (pop < sge->length - a) { + if (b > 0) { if (space) { sge->length = a; sk_msg_shift_right(msg, i); @@ -2997,7 +3020,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, if (unlikely(!page)) return -ENOMEM; - sge->length = a; orig = sg_page(sge); from = sg_virt(sge); to = page_address(page); @@ -3007,7 +3029,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, put_page(orig); } pop = 0; - } else if (pop >= sge->length - a) { + } else { pop -= (sge->length - a); sge->length = a; } @@ -3041,7 +3063,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, pop -= sge->length; sk_msg_shift_left(msg, i); } - sk_msg_iter_var_next(i); } sk_mem_uncharge(msg->sk, len - pop); @@ -3178,6 +3199,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct 
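[Editor's note] A small sk_msg sketch of the two helpers whose corner cases are reworked above, bpf_msg_push_data() and bpf_msg_pop_data(); the offset and length are arbitrary for illustration and must stay within the message, per the tightened bounds checks:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sk_msg")
int msg_edit(struct sk_msg_md *msg)
{
        /* insert 8 bytes of room at offset 0, then remove them again */
        if (bpf_msg_push_data(msg, 0, 8, 0))
                return SK_DROP;
        if (bpf_msg_pop_data(msg, 0, 8, 0))
                return SK_DROP;
        return SK_PASS;
}

char _license[] SEC("license") = "GPL";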
sk_buff *, skb, __be16, vlan_proto, bpf_push_mac_rcsum(skb); ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); + skb_reset_mac_len(skb); bpf_compute_data_pointers(skb); return ret; @@ -3211,6 +3233,13 @@ static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto) +{ + skb->protocol = htons(proto); + if (skb_valid_dst(skb)) + skb_dst_drop(skb); +} + static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { /* Caller already did skb_cow() with len as headroom, @@ -3307,7 +3336,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) } } - skb->protocol = htons(ETH_P_IPV6); + bpf_skb_change_protocol(skb, ETH_P_IPV6); skb_clear_hash(skb); return 0; @@ -3337,7 +3366,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) } } - skb->protocol = htons(ETH_P_IP); + bpf_skb_change_protocol(skb, ETH_P_IP); skb_clear_hash(skb); return 0; @@ -3528,22 +3557,29 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) - skb->protocol = htons(ETH_P_IPV6); + bpf_skb_change_protocol(skb, ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) - skb->protocol = htons(ETH_P_IP); + bpf_skb_change_protocol(skb, ETH_P_IP); } if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* Due to header grow, MSS needs to be downgraded. */ - if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) - skb_decrease_gso_size(shinfo, len_diff); - /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; + + /* Due to header growth, MSS needs to be downgraded. + * There is a BUG_ON() when segmenting the frag_list with + * head_frag true, so linearize the skb after downgrading + * the MSS. 
+ */ + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { + skb_decrease_gso_size(shinfo, len_diff); + if (shinfo->frag_list) + return skb_linearize(skb); + } } return 0; @@ -3577,10 +3613,10 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) - skb->protocol = htons(ETH_P_IPV6); + bpf_skb_change_protocol(skb, ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) - skb->protocol = htons(ETH_P_IP); + bpf_skb_change_protocol(skb, ETH_P_IP); if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -3715,13 +3751,22 @@ static const struct bpf_func_proto bpf_skb_adjust_room_proto = { static u32 __bpf_skb_min_len(const struct sk_buff *skb) { - u32 min_len = skb_network_offset(skb); + int offset = skb_network_offset(skb); + u32 min_len = 0; - if (skb_transport_header_was_set(skb)) - min_len = skb_transport_offset(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) - min_len = skb_checksum_start_offset(skb) + - skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + if (skb_transport_header_was_set(skb)) { + offset = skb_transport_offset(skb); + if (offset > 0) + min_len = offset; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) { + offset = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + } return min_len; } @@ -4100,13 +4145,13 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) } static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, - struct xdp_mem_info *mem_info, bool release) + enum xdp_mem_type mem_type, bool release) { struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); if (release) { xsk_buff_del_tail(zc_frag); - __xdp_return(NULL, mem_info, false, zc_frag); + __xdp_return(0, mem_type, false, zc_frag); } else { zc_frag->data_end -= shrink; } @@ -4115,19 +4160,16 @@ static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) { - struct xdp_mem_info *mem_info = &xdp->rxq->mem; + enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; - if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { - bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release); + if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { + bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); goto out; } - if (release) { - struct page *page = skb_frag_page(frag); - - __xdp_return(page_address(page), mem_info, false, NULL); - } + if (release) + __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); out: return release; @@ -4266,50 +4308,50 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { */ void xdp_do_flush(void) { - __dev_flush(); - __cpu_map_flush(); - __xsk_map_flush(); + struct list_head *lh_map, *lh_dev, *lh_xsk; + + bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); + if (lh_dev) + __dev_flush(lh_dev); + if (lh_map) + __cpu_map_flush(lh_map); + if (lh_xsk) + __xsk_map_flush(lh_xsk); } EXPORT_SYMBOL_GPL(xdp_do_flush); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) void xdp_do_check_flushed(struct napi_struct *napi) { - bool ret; + struct list_head *lh_map, *lh_dev, *lh_xsk; + bool missed = false; - ret = dev_check_flush(); - ret |= cpu_map_check_flush(); - ret |= xsk_map_check_flush(); + bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, 
&lh_xsk); + if (lh_dev) { + __dev_flush(lh_dev); + missed = true; + } + if (lh_map) { + __cpu_map_flush(lh_map); + missed = true; + } + if (lh_xsk) { + __xsk_map_flush(lh_xsk); + missed = true; + } - WARN_ONCE(ret, "Missing xdp_do_flush() invocation after NAPI by %ps\n", + WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n", napi->poll); } #endif -void bpf_clear_redirect_map(struct bpf_map *map) -{ - struct bpf_redirect_info *ri; - int cpu; - - for_each_possible_cpu(cpu) { - ri = per_cpu_ptr(&bpf_redirect_info, cpu); - /* Avoid polluting remote cacheline due to writes if - * not needed. Once we pass this test, we need the - * cmpxchg() to make sure it hasn't been changed in - * the meantime by remote CPU. - */ - if (unlikely(READ_ONCE(ri->map) == map)) - cmpxchg(&ri->map, map, NULL); - } -} - DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp) { + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net_device *master, *slave; - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); @@ -4329,9 +4371,9 @@ u32 xdp_master_redirect(struct xdp_buff *xdp) EXPORT_SYMBOL_GPL(xdp_master_redirect); static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, - struct net_device *dev, + const struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; @@ -4352,18 +4394,20 @@ err: return err; } -static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, - struct net_device *dev, - struct xdp_frame *xdpf, - struct bpf_prog *xdp_prog) +static __always_inline int +__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + u32 flags = ri->flags; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (unlikely(!xdpf)) { @@ -4375,11 +4419,20 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - map = READ_ONCE(ri->map); - if (unlikely(map)) { + if (unlikely(flags & BPF_F_BROADCAST)) { + map = READ_ONCE(ri->map); + + /* The map pointer is cleared when the map is being torn + * down by dev_map_free() + */ + if (unlikely(!map)) { + err = -ENOENT; + break; + } + WRITE_ONCE(ri->map, NULL); err = dev_map_enqueue_multi(xdpf, dev, map, - ri->flags & BPF_F_EXCLUDE_INGRESS); + flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_enqueue(fwd, xdpf, dev); } @@ -4413,9 +4466,9 @@ err: } int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) @@ -4427,9 +4480,10 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, - struct xdp_frame *xdpf, struct 
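[Editor's note] The BPF_F_BROADCAST branch added in __xdp_do_redirect_frame() above is only taken when the program asked for broadcast replication. A hedged XDP sketch of that request (the devmap size is an assumption):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_DEVMAP);
        __uint(max_entries, 32);
        __type(key, __u32);
        __type(value, __u32);
} tx_ports SEC(".maps");

SEC("xdp")
int broadcast(struct xdp_md *ctx)
{
        /* replicate the frame to every device in the map except the one it
         * arrived on; only on this path does the kernel consult ri->map.
         */
        return bpf_redirect_map(&tx_ports, 0,
                                BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
}

char _license[] SEC("license") = "GPL";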
bpf_prog *xdp_prog) + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) @@ -4442,11 +4496,11 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, - void *fwd, - enum bpf_map_type map_type, u32 map_id) + const struct bpf_prog *xdp_prog, + void *fwd, enum bpf_map_type map_type, + u32 map_id, u32 flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; int err; @@ -4454,11 +4508,20 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - map = READ_ONCE(ri->map); - if (unlikely(map)) { + if (unlikely(flags & BPF_F_BROADCAST)) { + map = READ_ONCE(ri->map); + + /* The map pointer is cleared when the map is being torn + * down by dev_map_free() + */ + if (unlikely(!map)) { + err = -ENOENT; + break; + } + WRITE_ONCE(ri->map, NULL); err = dev_map_redirect_multi(dev, skb, xdp_prog, map, - ri->flags & BPF_F_EXCLUDE_INGRESS); + flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_generic_redirect(fwd, skb, xdp_prog); } @@ -4489,15 +4552,18 @@ err: } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct xdp_buff *xdp, struct bpf_prog *xdp_prog) + struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + u32 flags = ri->flags; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { @@ -4517,7 +4583,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, return 0; } - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id); + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags); err: _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; @@ -4525,7 +4591,7 @@ err: BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return XDP_ABORTED; @@ -4662,7 +4728,7 @@ set_compat: to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; if (flags & BPF_F_TUNINFO_FLAGS) - to->tunnel_flags = info->key.tun_flags; + to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags); else to->tunnel_ext = 0; @@ -4705,7 +4771,7 @@ BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) int err; if (unlikely(!info || - !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { + !ip_tunnel_is_options_present(info->key.tun_flags))) { err = -ENOENT; goto err_clear; } @@ -4775,15 +4841,15 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, memset(info, 0, sizeof(*info)); info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; - if (flags & BPF_F_DONT_FRAGMENT) - info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; - if (flags & 
BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; - if (flags & BPF_F_SEQ_NUMBER) - info->key.tun_flags |= TUNNEL_SEQ; - if (flags & BPF_F_NO_TUNNEL_KEY) - info->key.tun_flags &= ~TUNNEL_KEY; + __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); + __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, + flags & BPF_F_DONT_FRAGMENT); + __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, + !(flags & BPF_F_ZERO_CSUM_TX)); + __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, + flags & BPF_F_SEQ_NUMBER); + __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, + !(flags & BPF_F_NO_TUNNEL_KEY)); info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -4821,13 +4887,15 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); + IP_TUNNEL_DECLARE_FLAGS(present) = { }; if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) return -EINVAL; if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; - ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); + ip_tunnel_set_options_present(present); + ip_tunnel_info_opts_set(info, from, size, present); return 0; } @@ -5094,6 +5162,17 @@ static u64 __bpf_get_netns_cookie(struct sock *sk) return net->net_cookie; } +BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) +{ + return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_proto = { + .func = bpf_get_netns_cookie, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) { return __bpf_get_netns_cookie(ctx); @@ -5160,6 +5239,25 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt) +{ + u32 sk_bpf_cb_flags; + + if (getopt) { + *(u32 *)optval = sk->sk_bpf_cb_flags; + return 0; + } + + sk_bpf_cb_flags = *(u32 *)optval; + + if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK) + return -EINVAL; + + sk->sk_bpf_cb_flags = sk_bpf_cb_flags; + + return 0; +} + static int sol_socket_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) @@ -5176,6 +5274,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname, case SO_MAX_PACING_RATE: case SO_BINDTOIFINDEX: case SO_TXREHASH: + case SK_BPF_CB_FLAGS: if (*optlen != sizeof(int)) return -EINVAL; break; @@ -5185,6 +5284,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname, return -EINVAL; } + if (optname == SK_BPF_CB_FLAGS) + return sk_bpf_set_get_cb_flags(sk, optval, getopt); + if (getopt) { if (optname == SO_BINDTODEVICE) return -EINVAL; @@ -5197,6 +5299,38 @@ static int sol_socket_sockopt(struct sock *sk, int optname, KERNEL_SOCKPTR(optval), *optlen); } +static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname, + char *optval, int optlen) +{ + if (optlen != sizeof(int)) + return -EINVAL; + + switch (optname) { + case TCP_BPF_SOCK_OPS_CB_FLAGS: { + int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags; + + memcpy(optval, &cb_flags, optlen); + break; + } + case TCP_BPF_RTO_MIN: { + int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min); + + memcpy(optval, &rto_min_us, optlen); + break; + } + case TCP_BPF_DELACK_MAX: { + int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max); + + memcpy(optval, &delack_max_us, optlen); + break; + } + default: + return -EINVAL; + } + + 
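[Editor's note] A sockops sketch of the read-side support added in bpf_sol_tcp_getsockopt() above. This assumes the TCP_BPF_RTO_MIN and TCP_BPF_SOCK_OPS_CB_FLAGS optnames are exported by the bpf UAPI header in use, as the hunk implies; SOL_TCP is defined locally to keep the snippet self-contained.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_TCP
#define SOL_TCP 6       /* IPPROTO_TCP, used as the sockopt level */
#endif

SEC("sockops")
int read_tcp_bpf_opts(struct bpf_sock_ops *skops)
{
        int rto_min_us = 0, cb_flags = 0;

        if (skops->op != BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
                return 1;

        /* previously -EINVAL, now served by bpf_sol_tcp_getsockopt() */
        bpf_getsockopt(skops, SOL_TCP, TCP_BPF_RTO_MIN,
                       &rto_min_us, sizeof(rto_min_us));
        bpf_getsockopt(skops, SOL_TCP, TCP_BPF_SOCK_OPS_CB_FLAGS,
                       &cb_flags, sizeof(cb_flags));
        return 1;
}

char _license[] SEC("license") = "GPL";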
return 0; +} + static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, char *optval, int optlen) { @@ -5236,6 +5370,11 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, return -EINVAL; inet_csk(sk)->icsk_rto_min = timeout; break; + case TCP_BPF_SOCK_OPS_CB_FLAGS: + if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS)) + return -EINVAL; + tp->bpf_sock_ops_cb_flags = val; + break; default: return -EINVAL; } @@ -5315,6 +5454,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, case TCP_USER_TIMEOUT: case TCP_NOTSENT_LOWAT: case TCP_SAVE_SYN: + case TCP_RTO_MAX_MS: if (*optlen != sizeof(int)) return -EINVAL; break; @@ -5326,7 +5466,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, break; default: if (getopt) - return -EINVAL; + return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen); return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); } @@ -5422,6 +5562,11 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; } +static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock) +{ + return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB; +} + static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { @@ -5572,6 +5717,9 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } @@ -5657,6 +5805,9 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { int ret, copy_len = 0; @@ -5699,6 +5850,9 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; @@ -5857,7 +6011,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } - fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; + fl4.flowi4_tos = params->tos & INET_DSCP_MASK; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; @@ -5884,7 +6038,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { - fl4.flowi4_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl4.flowi4_mark = params->mark; + else + fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); @@ -6027,7 +6184,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { - fl6.flowi6_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl6.flowi6_mark = params->mark; + else + fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); @@ -6105,7 +6265,7 @@ set_fwd_params: #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ 
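[Editor's note] The BPF_FIB_LOOKUP_MARK plumbing above is honoured on the rule-based lookup path (not with BPF_FIB_LOOKUP_DIRECT, which bypasses fib rules). A minimal tc sketch; header parsing is elided, so the addresses stay zero and only the flag and the params.mark field are shown, with AF_INET defined locally for self-containment:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#ifndef AF_INET
#define AF_INET 2
#endif

SEC("tc")
int fib_with_mark(struct __sk_buff *skb)
{
        struct bpf_fib_lookup params = {};
        long rc;

        params.family  = AF_INET;
        params.ifindex = skb->ingress_ifindex;
        params.mark    = skb->mark;     /* consulted only with the flag below */
        /* a real program would fill ipv4_src/ipv4_dst from the parsed header */

        rc = bpf_fib_lookup(skb, &params, sizeof(params), BPF_FIB_LOOKUP_MARK);
        if (rc == BPF_FIB_LKUP_RET_SUCCESS)
                bpf_printk("nexthop ifindex %d", params.ifindex);
        return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";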
BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ - BPF_FIB_LOOKUP_SRC) + BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) @@ -6213,12 +6373,10 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, { int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; struct net_device *dev = skb->dev; - int skb_len, dev_len; - int mtu; + int mtu, dev_len, skb_len; if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) return -EINVAL; - if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len))) return -EINVAL; @@ -6227,7 +6385,6 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, return -ENODEV; mtu = READ_ONCE(dev->mtu); - dev_len = mtu + dev->hard_header_len; /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ @@ -6245,15 +6402,12 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, */ if (skb_is_gso(skb)) { ret = BPF_MTU_CHK_RET_SUCCESS; - if (flags & BPF_MTU_CHK_SEGS && !skb_gso_validate_network_len(skb, mtu)) ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; } out: - /* BPF verifier guarantees valid pointer */ *mtu_len = mtu; - return ret; } @@ -6274,8 +6428,6 @@ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, return -ENODEV; mtu = READ_ONCE(dev->mtu); - - /* Add L2-header as dev MTU is L3 size */ dev_len = mtu + dev->hard_header_len; /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ @@ -6286,9 +6438,7 @@ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, if (xdp_len > dev_len) ret = BPF_MTU_CHK_RET_FRAG_NEEDED; - /* BPF verifier guarantees valid pointer */ *mtu_len = mtu; - return ret; } @@ -6298,7 +6448,8 @@ static const struct bpf_func_proto bpf_skb_check_mtu_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_INT, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, + .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6309,7 +6460,8 @@ static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_INT, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, + .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6418,6 +6570,7 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, void *srh_tlvs, *srh_end, *ptr; int srhoff = 0; + lockdep_assert_held(&srh_state->bh_lock); if (srh == NULL) return -EINVAL; @@ -6474,6 +6627,7 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, int hdroff = 0; int err; + lockdep_assert_held(&srh_state->bh_lock); switch (action) { case SEG6_LOCAL_ACTION_END_X: if (!seg6_bpf_has_valid_srh(skb)) @@ -6550,6 +6704,7 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, int srhoff = 0; int ret; + lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return -EINVAL; @@ -6705,8 +6860,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. 
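[Editor's note] The bpf_skb_check_mtu() cleanup above keeps the in/out mtu_len argument but now types it as fixed-size writable memory for the verifier. Typical usage from tc stays the same; a short sketch:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int drop_oversized(struct __sk_buff *skb)
{
        __u32 mtu_len = 0;      /* in: 0 uses skb->len; out: device L3 MTU */
        int ret;

        ret = bpf_check_mtu(skb, 0, &mtu_len, 0, 0);
        if (ret == BPF_MTU_CHK_RET_FRAG_NEEDED)
                return TC_ACT_SHOT;
        return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";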
*/ - if (!sk_fullsock(sk2)) - sk2 = NULL; if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ @@ -6753,8 +6906,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ - if (!sk_fullsock(sk2)) - sk2 = NULL; if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ @@ -6783,7 +6934,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6802,7 +6953,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6821,7 +6972,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6845,7 +6996,7 @@ static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6869,7 +7020,7 @@ static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6893,7 +7044,7 @@ static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6931,7 +7082,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6955,7 +7106,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6979,7 +7130,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6999,7 +7150,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .ret_type = 
RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -7018,7 +7169,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -7037,7 +7188,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -7203,7 +7354,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) { sk = sk_to_full_sk(sk); - if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) + if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) return (unsigned long)sk; return (unsigned long)NULL; @@ -7511,6 +7662,9 @@ BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, u8 search_kind, search_len, copy_len, magic_len; int ret; + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + /* 2 byte is the minimal option len except TCPOPT_NOP and * TCPOPT_EOL which are useless for the bpf prog to learn * and this helper disallow loading them also. @@ -7576,7 +7730,7 @@ static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -7694,17 +7848,21 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, return -EOPNOTSUPP; switch (tstamp_type) { - case BPF_SKB_TSTAMP_DELIVERY_MONO: + case BPF_SKB_CLOCK_REALTIME: + skb->tstamp = tstamp; + skb->tstamp_type = SKB_CLOCK_REALTIME; + break; + case BPF_SKB_CLOCK_MONOTONIC: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; - skb->mono_delivery_time = 1; + skb->tstamp_type = SKB_CLOCK_MONOTONIC; break; - case BPF_SKB_TSTAMP_UNSPEC: - if (tstamp) + case BPF_SKB_CLOCK_TAI: + if (!tstamp) return -EINVAL; - skb->tstamp = 0; - skb->mono_delivery_time = 0; + skb->tstamp = tstamp; + skb->tstamp_type = SKB_CLOCK_TAI; break; default: return -EINVAL; @@ -7828,42 +7986,37 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { #endif /* CONFIG_INET */ -bool bpf_helper_changes_pkt_data(void *func) -{ - if (func == bpf_skb_vlan_push || - func == bpf_skb_vlan_pop || - func == bpf_skb_store_bytes || - func == bpf_skb_change_proto || - func == bpf_skb_change_head || - func == sk_skb_change_head || - func == bpf_skb_change_tail || - func == sk_skb_change_tail || - func == bpf_skb_adjust_room || - func == sk_skb_adjust_room || - func == bpf_skb_pull_data || - func == sk_skb_pull_data || - func == bpf_clone_redirect || - func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data || - func == bpf_msg_push_data || - func == bpf_msg_pop_data || - func == bpf_xdp_adjust_tail || -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) - func == bpf_lwt_seg6_store_bytes || - func == bpf_lwt_seg6_adjust_srh || - func == bpf_lwt_seg6_action || -#endif -#ifdef 
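[Editor's note] The reworked bpf_skb_set_tstamp() above accepts BPF_SKB_CLOCK_REALTIME, BPF_SKB_CLOCK_MONOTONIC and BPF_SKB_CLOCK_TAI instead of the old BPF_SKB_TSTAMP_* values. A minimal tc egress sketch using the TAI clock, e.g. to pace a packet for an ETF qdisc; the 1 ms delay is arbitrary:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int set_txtime(struct __sk_buff *skb)
{
        __u64 when = bpf_ktime_get_tai_ns() + 1000000; /* ~1 ms in the future */

        bpf_skb_set_tstamp(skb, when, BPF_SKB_CLOCK_TAI);
        return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";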
CONFIG_INET - func == bpf_sock_ops_store_hdr_opt || -#endif - func == bpf_lwt_in_push_encap || - func == bpf_lwt_xmit_push_encap) +bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_clone_redirect: + case BPF_FUNC_l3_csum_replace: + case BPF_FUNC_l4_csum_replace: + case BPF_FUNC_lwt_push_encap: + case BPF_FUNC_lwt_seg6_action: + case BPF_FUNC_lwt_seg6_adjust_srh: + case BPF_FUNC_lwt_seg6_store_bytes: + case BPF_FUNC_msg_pop_data: + case BPF_FUNC_msg_pull_data: + case BPF_FUNC_msg_push_data: + case BPF_FUNC_skb_adjust_room: + case BPF_FUNC_skb_change_head: + case BPF_FUNC_skb_change_proto: + case BPF_FUNC_skb_change_tail: + case BPF_FUNC_skb_pull_data: + case BPF_FUNC_skb_store_bytes: + case BPF_FUNC_skb_vlan_pop: + case BPF_FUNC_skb_vlan_push: + case BPF_FUNC_store_hdr_opt: + case BPF_FUNC_xdp_adjust_head: + case BPF_FUNC_xdp_adjust_meta: + case BPF_FUNC_xdp_adjust_tail: + /* tail-called program could call any of the above */ + case BPF_FUNC_tail_call: return true; - - return false; + default: + return false; + } } const struct bpf_func_proto bpf_event_output_data_proto __weak; @@ -7878,10 +8031,6 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; @@ -7907,10 +8056,6 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_bind: switch (prog->expected_attach_type) { @@ -8001,6 +8146,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_perf_event_output: @@ -8132,6 +8279,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_under_cgroup_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: @@ -8340,20 +8489,12 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pop_data_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sk_msg_proto; -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif default: return bpf_sk_base_func_proto(func_id, prog); } @@ -8526,13 +8667,16 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): + 
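[Editor's note] Two of the hunks above are directly visible to tc programs: bpf_get_netns_cookie() is now callable on an skb context, and bpf_helper_changes_pkt_data() keeps recording, per func_id and conservatively for tail calls, which helpers invalidate packet pointers. A combined sketch under those assumptions:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int cookie_and_pull(struct __sk_buff *skb)
{
        __u64 cookie = bpf_get_netns_cookie(skb);
        void *data, *data_end;

        /* bpf_skb_pull_data() may change the packet, so data/data_end must
         * be re-read afterwards; this is the property the
         * bpf_helper_changes_pkt_data() switch above encodes.
         */
        if (bpf_skb_pull_data(skb, 14))
                return TC_ACT_OK;
        data = (void *)(long)skb->data;
        data_end = (void *)(long)skb->data_end;
        if (data + 14 > data_end)
                return TC_ACT_OK;

        bpf_printk("netns cookie %llu", cookie);
        return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";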
case bpf_ctx_range(struct __sk_buff, data_end): + if (info->is_ldsx || size != size_default) + return false; + break; case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): - case bpf_ctx_range(struct __sk_buff, data): - case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; break; @@ -8976,6 +9120,14 @@ static bool xdp_is_valid_access(int off, int size, } } return false; + } else { + switch (off) { + case offsetof(struct xdp_md, data_meta): + case offsetof(struct xdp_md, data): + case offsetof(struct xdp_md, data_end): + if (info->is_ldsx) + return false; + } } switch (off) { @@ -8993,7 +9145,8 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act) +void bpf_warn_invalid_xdp_action(const struct net_device *dev, + const struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; @@ -9301,12 +9454,12 @@ static bool flow_dissector_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, data): - if (size != size_default) + if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET; return true; case bpf_ctx_range(struct __sk_buff, data_end): - if (size != size_default) + if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET_END; return true; @@ -9357,16 +9510,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; - /* AX is needed because src_reg and dst_reg could be the same */ - __u8 tmp_reg = BPF_REG_AX; - - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - SKB_BF_MONO_TC_OFFSET); - *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, - SKB_MONO_DELIVERY_TIME_MASK, 2); - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); - *insn++ = BPF_JMP_A(1); - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); + BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); + BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); + BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); + BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); + *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); +#else + BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); +#endif return insn; } @@ -9409,11 +9563,12 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); - /* skb->tc_at_ingress && skb->mono_delivery_time, + /* check if ingress mask bits is set */ + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); + *insn++ = BPF_JMP_A(4); + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); + *insn++ = BPF_JMP_A(2); + /* skb->tc_at_ingress && 
skb->tstamp_type, * read 0 as the (rcv) timestamp. */ *insn++ = BPF_MOV64_IMM(value_reg, 0); @@ -9438,7 +9593,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the - * mono_delivery_time bit also. + * skb->tstamp_type bit also. */ if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; @@ -9448,8 +9603,8 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); /* goto <store> */ *insn++ = BPF_JMP_A(2); - /* <clear>: mono_delivery_time */ - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); + /* <clear>: skb->tstamp_type */ + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); } #endif @@ -9547,7 +9702,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, queue_mapping): if (type == BPF_WRITE) { - u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); + u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { *insn++ = BPF_JMP_A(0); /* noop */ @@ -9556,7 +9711,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, if (BPF_CLASS(si->code) == BPF_STX) *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); - *insn++ = BPF_EMIT_STORE(BPF_H, si, off); + *insn++ = BPF_EMIT_STORE(BPF_H, si, offset); } else { *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, @@ -10153,10 +10308,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, } \ } while (0) -#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ - S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) - static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -10274,10 +10425,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ - is_fullsock), \ + is_locked_tcp_sock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ + is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ @@ -10362,10 +10513,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ - is_fullsock), \ + is_locked_tcp_sock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ + is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ @@ -11005,7 +11156,6 @@ const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { }; const struct bpf_prog_ops lwt_seg6local_prog_ops = { - .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops cg_sock_verifier_ops = { @@ -11168,6 +11318,7 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; struct sock_reuseport *reuse; struct sock *selected_sk; + int err; selected_sk 
= map->ops->map_lookup_elem(map, key); if (!selected_sk) @@ -11175,10 +11326,6 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { - /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ - if (sk_is_refcounted(selected_sk)) - sock_put(selected_sk); - /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. @@ -11186,24 +11333,33 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, * Other maps (e.g. sock_map) do not provide this guarantee and * the sk may never be in the reuseport group to begin with. */ - return is_sockarray ? -ENOENT : -EINVAL; + err = is_sockarray ? -ENOENT : -EINVAL; + goto error; } if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { struct sock *sk = reuse_kern->sk; - if (sk->sk_protocol != selected_sk->sk_protocol) - return -EPROTOTYPE; - else if (sk->sk_family != selected_sk->sk_family) - return -EAFNOSUPPORT; - - /* Catch all. Likely bound to a different sockaddr. */ - return -EBADFD; + if (sk->sk_protocol != selected_sk->sk_protocol) { + err = -EPROTOTYPE; + } else if (sk->sk_family != selected_sk->sk_family) { + err = -EAFNOSUPPORT; + } else { + /* Catch all. Likely bound to a different sockaddr. */ + err = -EBADFD; + } + goto error; } reuse_kern->selected_sk = selected_sk; return 0; +error: + /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ + if (sk_is_refcounted(selected_sk)) + sock_put(selected_sk); + + return err; } static const struct bpf_func_proto sk_select_reuseport_proto = { @@ -11823,28 +11979,34 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } __bpf_kfunc_start_defs(); -__bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags, - struct bpf_dynptr_kern *ptr__uninit) +__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, + struct bpf_dynptr *ptr__uninit) { + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct sk_buff *skb = (struct sk_buff *)s; + if (flags) { - bpf_dynptr_set_null(ptr__uninit); + bpf_dynptr_set_null(ptr); return -EINVAL; } - bpf_dynptr_init(ptr__uninit, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); + bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); return 0; } -__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags, - struct bpf_dynptr_kern *ptr__uninit) +__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, + struct bpf_dynptr *ptr__uninit) { + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct xdp_buff *xdp = (struct xdp_buff *)x; + if (flags) { - bpf_dynptr_set_null(ptr__uninit); + bpf_dynptr_set_null(ptr); return -EINVAL; } - bpf_dynptr_init(ptr__uninit, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); + bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); return 0; } @@ -11870,10 +12032,11 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, return 0; } -__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk, +__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, struct bpf_tcp_req_attrs *attrs, int attrs__sz) { #if IS_ENABLED(CONFIG_SYN_COOKIES) + struct sk_buff *skb = (struct sk_buff *)s; const struct request_sock_ops *ops; struct inet_request_sock *ireq; struct tcp_request_sock *treq; @@ -11966,24 +12129,44 @@ __bpf_kfunc int 
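[Editor's note] After the signature change above, bpf_dynptr_from_skb() is declared with the program-visible types (__sk_buff and the UAPI bpf_dynptr). A hedged BPF-side sketch; the extern declaration mirrors what the selftests' kfunc header provides and is an assumption here:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags,
                               struct bpf_dynptr *ptr) __ksym;

SEC("tc")
int peek_eth(struct __sk_buff *skb)
{
        struct bpf_dynptr ptr;
        __u8 eth[14];

        if (bpf_dynptr_from_skb(skb, 0, &ptr))
                return TC_ACT_OK;
        /* copy the Ethernet header out of the (possibly non-linear) skb */
        if (bpf_dynptr_read(eth, sizeof(eth), &ptr, 0, 0))
                return TC_ACT_OK;
        return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";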
bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk, #endif } +__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, + u64 flags) +{ + struct sk_buff *skb; + + if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) + return -EOPNOTSUPP; + + if (flags) + return -EINVAL; + + skb = skops->skb; + skb_shinfo(skb)->tx_flags |= SKBTX_BPF; + TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF; + skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + + return 0; +} + __bpf_kfunc_end_defs(); -int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, - struct bpf_dynptr_kern *ptr__uninit) +int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, + struct bpf_dynptr *ptr__uninit) { + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; int err; err = bpf_dynptr_from_skb(skb, flags, ptr__uninit); if (err) return err; - bpf_dynptr_set_rdonly(ptr__uninit); + bpf_dynptr_set_rdonly(ptr); return 0; } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) @@ -11998,6 +12181,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) +BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) +BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) + static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, @@ -12018,6 +12205,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { .set = &bpf_kfunc_check_set_tcp_reqsk, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_sock_ops, +}; + static int __init bpf_kfunc_init(void) { int ret; @@ -12032,10 +12224,12 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); - return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops); } late_initcall(bpf_kfunc_init); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 272f09251343..1b61bb25ba0e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -106,7 +106,7 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, #endif /* CONFIG_BPF_SYSCALL */ /** - * __skb_flow_get_ports - extract the upper layer ports and return them + * skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset @@ -116,8 +116,8 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, * The function will try to retrieve the ports at offset thoff + poff 
where poff * is the protocol port offset returned from proto_ports_offset */ -__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - const void *data, int hlen) +__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); @@ -137,7 +137,7 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, return 0; } -EXPORT_SYMBOL(__skb_flow_get_ports); +EXPORT_SYMBOL(skb_flow_get_ports); static bool icmp_has_id(u8 type) { @@ -299,9 +299,10 @@ void skb_flow_dissect_meta(const struct sk_buff *skb, EXPORT_SYMBOL(skb_flow_dissect_meta); static void -skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, - struct flow_dissector *flow_dissector, - void *target_container) +skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type, + u32 ctrl_flags, + struct flow_dissector *flow_dissector, + void *target_container) { struct flow_dissector_key_control *ctrl; @@ -312,6 +313,7 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, FLOW_DISSECTOR_KEY_ENC_CONTROL, target_container); ctrl->addr_type = type; + ctrl->flags = ctrl_flags; } void @@ -367,6 +369,7 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, { struct ip_tunnel_info *info; struct ip_tunnel_key *key; + u32 ctrl_flags = 0; /* A quick check to see if there might be something to do. */ if (!dissector_uses_key(flow_dissector, @@ -391,11 +394,20 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, key = &info->key; + if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) + ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM; + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) + ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT; + if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags)) + ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM; + if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags)) + ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT; + switch (ip_tunnel_info_af(info)) { case AF_INET: - skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS, - flow_dissector, - target_container); + skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS, + ctrl_flags, flow_dissector, + target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { struct flow_dissector_key_ipv4_addrs *ipv4; @@ -408,9 +420,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, } break; case AF_INET6: - skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS, - flow_dissector, - target_container); + skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS, + ctrl_flags, flow_dissector, + target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *ipv6; @@ -422,6 +434,10 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, ipv6->dst = key->u.ipv6.dst; } break; + default: + skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector, + target_container); + break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { @@ -455,17 +471,25 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { struct flow_dissector_key_enc_opts *enc_opt; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + u32 val; enc_opt = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, target_container); - if (info->options_len) { - enc_opt->len = info->options_len; - ip_tunnel_info_opts_get(enc_opt->data, info); - 
enc_opt->dst_opt_type = info->key.tun_flags & - TUNNEL_OPTIONS_PRESENT; - } + if (!info->options_len) + return; + + enc_opt->len = info->options_len; + ip_tunnel_info_opts_get(enc_opt->data, info); + + ip_tunnel_set_options_present(flags); + ip_tunnel_flags_and(flags, info->key.tun_flags, flags); + + val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, + IP_TUNNEL_GENEVE_OPT_BIT); + enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0; } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); @@ -829,23 +853,30 @@ __skb_flow_dissect_ports(const struct sk_buff *skb, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { - enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; - struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_ports_range *key_ports_range = NULL; + struct flow_dissector_key_ports *key_ports = NULL; + __be32 ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS; - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); - if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + + if (!key_ports && !key_ports_range) return; - key_ports = skb_flow_dissector_target(flow_dissector, - dissector_ports, - target_container); - key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); + + if (key_ports) + key_ports->ports = ports; + + if (key_ports_range) + key_ports_range->tp.ports = ports; } static void @@ -900,6 +931,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { + struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -944,20 +976,21 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE, - target_container); - - if (key_ports) { key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) { + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + key_ports_range->tp.src = flow_keys->sport; + key_ports_range->tp.dst = flow_keys->dport; + } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { @@ -1084,21 +1117,22 @@ bool __skb_flow_dissect(const struct net *net, FLOW_DISSECTOR_KEY_BASIC, target_container); + rcu_read_lock(); + if (skb) { if (!net) { if (skb->dev) - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); else if 
(skb->sk) net = sock_net(skb->sk); } } - WARN_ON_ONCE(!net); + DEBUG_NET_WARN_ON_ONCE(!net); if (net) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; - rcu_read_lock(); run_array = rcu_dereference(init_net.bpf.run_array[type]); if (!run_array) run_array = rcu_dereference(net->bpf.run_array[type]); @@ -1126,17 +1160,17 @@ bool __skb_flow_dissect(const struct net *net, prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); - if (result == BPF_FLOW_DISSECTOR_CONTINUE) - goto dissect_continue; - __skb_flow_bpf_to_target(&flow_keys, flow_dissector, - target_container); - rcu_read_unlock(); - return result == BPF_OK; + if (result != BPF_FLOW_DISSECTOR_CONTINUE) { + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + rcu_read_unlock(); + return result == BPF_OK; + } } -dissect_continue: - rcu_read_unlock(); } + rcu_read_unlock(); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); @@ -1784,6 +1818,13 @@ u32 flow_hash_from_keys(struct flow_keys *keys) } EXPORT_SYMBOL(flow_hash_from_keys); +u32 flow_hash_from_keys_seed(struct flow_keys *keys, + const siphash_key_t *keyval) +{ + return __flow_hash_from_keys(keys, keyval); +} +EXPORT_SYMBOL(flow_hash_from_keys_seed); + static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, const siphash_key_t *keyval) @@ -1823,22 +1864,23 @@ EXPORT_SYMBOL(make_flow_keys_digest); static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; -u32 __skb_get_hash_symmetric(const struct sk_buff *skb) +u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb) { struct flow_keys keys; __flow_hash_secret_init(); memset(&keys, 0, sizeof(keys)); - __skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric, + __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric, &keys, NULL, 0, 0, 0, 0); return __flow_hash_from_keys(&keys, &hashrnd); } -EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); +EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net); /** - * __skb_get_hash: calculate a flow hash + * __skb_get_hash_net: calculate a flow hash + * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to calculate flow hash from * * This function calculates a flow hash based on src/dst addresses @@ -1846,18 +1888,24 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. 
*/ -void __skb_get_hash(struct sk_buff *skb) +void __skb_get_hash_net(const struct net *net, struct sk_buff *skb) { struct flow_keys keys; u32 hash; + memset(&keys, 0, sizeof(keys)); + + __skb_flow_dissect(net, skb, &flow_keys_dissector, + &keys, NULL, 0, 0, 0, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + __flow_hash_secret_init(); - hash = ___skb_get_hash(skb, &keys, &hashrnd); + hash = __flow_hash_from_keys(&keys, &hashrnd); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } -EXPORT_SYMBOL(__skb_get_hash); +EXPORT_SYMBOL(__skb_get_hash_net); __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb) diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index fae9c4694186..7d426a8e29f3 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -75,7 +75,7 @@ static void est_fetch_counters(struct net_rate_estimator *e, static void est_timer(struct timer_list *t) { - struct net_rate_estimator *est = from_timer(est, t, timer); + struct net_rate_estimator *est = timer_container_of(est, t, timer); struct gnet_stats_basic_sync b; u64 b_bytes, b_packets; u64 rate, brate; @@ -177,7 +177,7 @@ int gen_new_estimator(struct gnet_stats_basic_sync *bstats, spin_lock_bh(lock); old = rcu_dereference_protected(*rate_est, 1); if (old) { - del_timer_sync(&old->timer); + timer_delete_sync(&old->timer); est->avbps = old->avbps; est->avpps = old->avpps; } @@ -206,7 +206,7 @@ void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est) { struct net_rate_estimator *est; - est = xchg((__force struct net_rate_estimator **)rate_est, NULL); + est = unrcu_pointer(xchg(rate_est, NULL)); if (est) { timer_shutdown_sync(&est->timer); kfree_rcu(est, rcu); diff --git a/net/core/gro.c b/net/core/gro.c index ee30d4f0c038..b350e5b69549 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -3,12 +3,10 @@ #include <net/dst_metadata.h> #include <net/busy_poll.h> #include <trace/events/net.h> +#include <linux/skbuff_ref.h> #define MAX_GRO_SKBS 8 -/* This should be increased if a protocol with a bigger head is added. */ -#define GRO_MAX_HEAD (MAX_HEADER + 128) - static DEFINE_SPINLOCK(offload_lock); /** @@ -97,7 +95,6 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); unsigned int delta_truesize; - unsigned int gro_max_size; unsigned int new_truesize; struct sk_buff *lp; int segs; @@ -111,12 +108,8 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (p->pp_recycle != skb->pp_recycle) return -ETOOMANYREFS; - /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */ - gro_max_size = p->protocol == htons(ETH_P_IPV6) ? 
- READ_ONCE(p->dev->gro_max_size) : - READ_ONCE(p->dev->gro_ipv4_max_size); - - if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) + if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) || + NAPI_GRO_CB(skb)->flush)) return -E2BIG; if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { @@ -192,8 +185,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) } merge: - /* sk owenrship - if any - completely transferred to the aggregated packet */ + /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; + skb->sk = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; @@ -229,8 +223,34 @@ done: return 0; } +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) +{ + if (unlikely(p->len + skb->len >= 65536)) + return -E2BIG; + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + + skb_pull(skb, skb_gro_offset(skb)); + + NAPI_GRO_CB(p)->last = skb; + NAPI_GRO_CB(p)->count++; + p->data_len += skb->len; + + /* sk ownership - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; + skb->sk = NULL; + p->truesize += skb->truesize; + p->len += skb->len; + + NAPI_GRO_CB(skb)->same_flow = 1; + + return 0; +} -static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) +static void gro_complete(struct gro_node *gro, struct sk_buff *skb) { struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; @@ -263,43 +283,43 @@ static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) } out: - gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); + gro_normal_one(gro, skb, NAPI_GRO_CB(skb)->count); } -static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, - bool flush_old) +static void __gro_flush_chain(struct gro_node *gro, u32 index, bool flush_old) { - struct list_head *head = &napi->gro_hash[index].list; + struct list_head *head = &gro->hash[index].list; struct sk_buff *skb, *p; list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; skb_list_del_init(skb); - napi_gro_complete(napi, skb); - napi->gro_hash[index].count--; + gro_complete(gro, skb); + gro->hash[index].count--; } - if (!napi->gro_hash[index].count) - __clear_bit(index, &napi->gro_bitmask); + if (!gro->hash[index].count) + __clear_bit(index, &gro->bitmask); } -/* napi->gro_hash[].list contains packets ordered by age. +/* + * gro->hash[].list contains packets ordered by age. * youngest packets at the head of it. * Complete skbs in reverse order to reduce latencies. 
*/ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) +void __gro_flush(struct gro_node *gro, bool flush_old) { - unsigned long bitmask = napi->gro_bitmask; + unsigned long bitmask = gro->bitmask; unsigned int i, base = ~0U; while ((i = ffs(bitmask)) != 0) { bitmask >>= i; base += i; - __napi_gro_flush_chain(napi, base, flush_old); + __gro_flush_chain(gro, base, flush_old); } } -EXPORT_SYMBOL(napi_gro_flush); +EXPORT_SYMBOL(__gro_flush); static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb, const struct sk_buff *p, @@ -329,8 +349,6 @@ static void gro_list_prepare(const struct list_head *head, list_for_each_entry(p, head, list) { unsigned long diffs; - NAPI_GRO_CB(p)->flush = 0; - if (hash != skb_get_hash_raw(p)) { NAPI_GRO_CB(p)->same_flow = 0; continue; @@ -347,7 +365,7 @@ static void gro_list_prepare(const struct list_head *head, skb_mac_header(skb), maclen); - /* in most common scenarions 'slow_gro' is 0 + /* in most common scenarios 'slow_gro' is 0 * otherwise we are already on some slower paths * either skip all the infrequent tests altogether or * avoid trying too hard to skip each of them individually @@ -370,6 +388,7 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) const skb_frag_t *frag0; unsigned int headlen; + NAPI_GRO_CB(skb)->network_offset = 0; NAPI_GRO_CB(skb)->data_offset = 0; headlen = skb_headlen(skb); NAPI_GRO_CB(skb)->frag0 = skb->data; @@ -380,7 +399,8 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) pinfo = skb_shinfo(skb); frag0 = &pinfo->frags[0]; - if (pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && + if (pinfo->nr_frags && skb_frag_page(frag0) && + !PageHighMem(skb_frag_page(frag0)) && (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, @@ -418,7 +438,7 @@ static void gro_try_pull_from_frag0(struct sk_buff *skb) gro_pull_from_frag0(skb, grow); } -static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) +static void gro_flush_oldest(struct gro_node *gro, struct list_head *head) { struct sk_buff *oldest; @@ -434,14 +454,15 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) * SKB to the chain. 
*/ skb_list_del_init(oldest); - napi_gro_complete(napi, oldest); + gro_complete(gro, oldest); } -static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static enum gro_result dev_gro_receive(struct gro_node *gro, + struct sk_buff *skb) { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *head = &net_hotdata.offload_base; + struct gro_list *gro_list = &gro->hash[bucket]; struct packet_offload *ptype; __be16 type = skb->protocol; struct sk_buff *pp = NULL; @@ -469,7 +490,6 @@ found_ptype: sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); - NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->count = 1; if (unlikely(skb_is_gso(skb))) { NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; @@ -506,7 +526,7 @@ found_ptype: if (pp) { skb_list_del_init(pp); - napi_gro_complete(napi, pp); + gro_complete(gro, pp); gro_list->count--; } @@ -517,7 +537,7 @@ found_ptype: goto normal; if (unlikely(gro_list->count >= MAX_GRO_SKBS)) - gro_flush_oldest(napi, &gro_list->list); + gro_flush_oldest(gro, &gro_list->list); else gro_list->count++; @@ -531,10 +551,10 @@ found_ptype: ret = GRO_HELD; ok: if (gro_list->count) { - if (!test_bit(bucket, &napi->gro_bitmask)) - __set_bit(bucket, &napi->gro_bitmask); - } else if (test_bit(bucket, &napi->gro_bitmask)) { - __clear_bit(bucket, &napi->gro_bitmask); + if (!test_bit(bucket, &gro->bitmask)) + __set_bit(bucket, &gro->bitmask); + } else if (test_bit(bucket, &gro->bitmask)) { + __clear_bit(bucket, &gro->bitmask); } return ret; @@ -573,13 +593,12 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); -static gro_result_t napi_skb_finish(struct napi_struct *napi, - struct sk_buff *skb, - gro_result_t ret) +static gro_result_t gro_skb_finish(struct gro_node *gro, struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: - gro_normal_one(napi, skb, 1); + gro_normal_one(gro, skb, 1); break; case GRO_MERGED_FREE: @@ -600,21 +619,21 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi, return ret; } -gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb) { gro_result_t ret; - skb_mark_napi_id(skb, napi); + __skb_mark_napi_id(skb, gro); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb, 0); - ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = gro_skb_finish(gro, skb, dev_gro_receive(gro, skb)); trace_napi_gro_receive_exit(ret); return ret; } -EXPORT_SYMBOL(napi_gro_receive); +EXPORT_SYMBOL(gro_receive_skb); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { @@ -633,6 +652,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; + skb->ip_summed = CHECKSUM_NONE; skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->gso_size = 0; if (unlikely(skb->slow_gro)) { @@ -670,7 +690,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) - gro_normal_one(napi, skb, 1); + gro_normal_one(&napi->gro, skb, 1); break; case GRO_MERGED_FREE: @@ -739,7 +759,7 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) trace_napi_gro_frags_entry(skb); - ret = napi_frags_finish(napi, skb, 
dev_gro_receive(napi, skb)); + ret = napi_frags_finish(napi, skb, dev_gro_receive(&napi->gro, skb)); trace_napi_gro_frags_exit(ret); return ret; @@ -771,3 +791,37 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) return sum; } EXPORT_SYMBOL(__skb_gro_checksum_complete); + +void gro_init(struct gro_node *gro) +{ + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { + INIT_LIST_HEAD(&gro->hash[i].list); + gro->hash[i].count = 0; + } + + gro->bitmask = 0; + gro->cached_napi_id = 0; + + INIT_LIST_HEAD(&gro->rx_list); + gro->rx_count = 0; +} + +void gro_cleanup(struct gro_node *gro) +{ + struct sk_buff *skb, *n; + + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { + list_for_each_entry_safe(skb, n, &gro->hash[i].list, list) + kfree_skb(skb); + + gro->hash[i].count = 0; + } + + gro->bitmask = 0; + gro->cached_napi_id = 0; + + list_for_each_entry_safe(skb, n, &gro->rx_list, list) + kfree_skb(skb); + + gro->rx_count = 0; +} diff --git a/net/core/hotdata.c b/net/core/hotdata.c index c8a7a451c18a..0bc893d5f07b 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -1,13 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include <net/hotdata.h> #include <linux/cache.h> #include <linux/jiffies.h> #include <linux/list.h> - +#include <net/hotdata.h> +#include <net/proto_memory.h> struct net_hotdata net_hotdata __cacheline_aligned = { .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), - .ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all), .gro_normal_batch = 8, .netdev_budget = 300, @@ -18,5 +17,8 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .max_backlog = 1000, .dev_tx_weight = 64, .dev_rx_weight = 64, + .sysctl_max_skb_frags = MAX_SKB_FRAGS, + .sysctl_skb_defer_max = 64, + .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE }; EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c new file mode 100644 index 000000000000..759a9b9f3f89 --- /dev/null +++ b/net/core/ieee8021q_helpers.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> + +#include <linux/array_size.h> +#include <linux/printk.h> +#include <linux/types.h> +#include <net/dscp.h> +#include <net/ieee8021q.h> + +/* The following arrays map Traffic Types (TT) to traffic classes (TC) for + * different number of queues as shown in the example provided by + * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and + * Table I-1 "Traffic type to traffic class mapping". 
+ */ +static const u8 ieee8021q_8queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, + [IEEE8021Q_TT_CA] = 3, + [IEEE8021Q_TT_VI] = 4, + [IEEE8021Q_TT_VO] = 5, + [IEEE8021Q_TT_IC] = 6, + [IEEE8021Q_TT_NC] = 7, +}; + +static const u8 ieee8021q_7queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, + [IEEE8021Q_TT_CA] = 3, + [IEEE8021Q_TT_VI] = 4, [IEEE8021Q_TT_VO] = 4, + [IEEE8021Q_TT_IC] = 5, + [IEEE8021Q_TT_NC] = 6, +}; + +static const u8 ieee8021q_6queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, [IEEE8021Q_TT_CA] = 2, + [IEEE8021Q_TT_VI] = 3, [IEEE8021Q_TT_VO] = 3, + [IEEE8021Q_TT_IC] = 4, + [IEEE8021Q_TT_NC] = 5, +}; + +static const u8 ieee8021q_5queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, + [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, + [IEEE8021Q_TT_IC] = 3, + [IEEE8021Q_TT_NC] = 4, +}; + +static const u8 ieee8021q_4queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, + [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, + [IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3, +}; + +static const u8 ieee8021q_3queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, + [IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2, +}; + +static const u8 ieee8021q_2queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, + [IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1, +}; + +static const u8 ieee8021q_1queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0, + [IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0, +}; + +/** + * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class + * @tt: IEEE 802.1Q Traffic Type + * @num_queues: Number of queues + * + * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based + * on the number of queues configured on the NIC. The mapping is based on the + * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic + * class mapping" and Table I-1 "Traffic type to traffic class mapping". + * + * Return: Traffic Class corresponding to the given Traffic Type or negative + * value in case of error. 
+ */ +int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues) +{ + if (tt < 0 || tt >= IEEE8021Q_TT_MAX) { + pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt, + IEEE8021Q_TT_MAX); + return -EINVAL; + } + + switch (num_queues) { + case 8: + compiletime_assert(ARRAY_SIZE(ieee8021q_8queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_8queue_tt_tc_map != max - 1"); + return ieee8021q_8queue_tt_tc_map[tt]; + case 7: + compiletime_assert(ARRAY_SIZE(ieee8021q_7queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_7queue_tt_tc_map != max - 1"); + + return ieee8021q_7queue_tt_tc_map[tt]; + case 6: + compiletime_assert(ARRAY_SIZE(ieee8021q_6queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_6queue_tt_tc_map != max - 1"); + + return ieee8021q_6queue_tt_tc_map[tt]; + case 5: + compiletime_assert(ARRAY_SIZE(ieee8021q_5queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_5queue_tt_tc_map != max - 1"); + + return ieee8021q_5queue_tt_tc_map[tt]; + case 4: + compiletime_assert(ARRAY_SIZE(ieee8021q_4queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_4queue_tt_tc_map != max - 1"); + + return ieee8021q_4queue_tt_tc_map[tt]; + case 3: + compiletime_assert(ARRAY_SIZE(ieee8021q_3queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_3queue_tt_tc_map != max - 1"); + + return ieee8021q_3queue_tt_tc_map[tt]; + case 2: + compiletime_assert(ARRAY_SIZE(ieee8021q_2queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_2queue_tt_tc_map != max - 1"); + + return ieee8021q_2queue_tt_tc_map[tt]; + case 1: + compiletime_assert(ARRAY_SIZE(ieee8021q_1queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_1queue_tt_tc_map != max - 1"); + + return ieee8021q_1queue_tt_tc_map[tt]; + } + + pr_err("Invalid number of queues %d\n", num_queues); + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc); + +/** + * ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type + * @dscp: IETF DSCP value + * + * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT). + * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic + * Type, this function is inspired by the RFC8325 documentation which describe + * the mapping between DSCP and 802.11 User Priority (UP) values. + * + * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value + */ +int ietf_dscp_to_ieee8021q_tt(u8 dscp) +{ + switch (dscp) { + case DSCP_CS0: + /* Comment from RFC8325: + * [RFC4594], Section 4.8, recommends High-Throughput Data be marked + * AF1x (that is, AF11, AF12, and AF13, according to the rules defined + * in [RFC2475]). + * + * By default (as described in Section 2.3), High-Throughput Data will + * map to UP 1 and, thus, to the Background Access Category (AC_BK), + * which is contrary to the intent expressed in [RFC4594]. + + * Unfortunately, there really is no corresponding fit for the High- + * Throughput Data service class within the constrained 4 Access + * Category [IEEE.802.11-2016] model. If the High-Throughput Data + * service class is assigned to the Best Effort Access Category (AC_BE), + * then it would contend with Low-Latency Data (while [RFC4594] + * recommends a distinction in servicing between these service classes) + * as well as with the default service class; alternatively, if it is + * assigned to the Background Access Category (AC_BK), then it would + * receive a less-then-best-effort service and contend with Low-Priority + * Data (as discussed in Section 4.2.10). 
+ * + * As such, since there is no directly corresponding fit for the High- + * Throughout Data service class within the [IEEE.802.11-2016] model, it + * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby + * admitting it to the Best Effort Access Category (AC_BE). + * + * Note: The above text is from RFC8325 which is describing the mapping + * between DSCP and 802.11 User Priority (UP) values. The mapping + * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but + * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q + * Traffic Types BE and BK. + */ + case DSCP_AF11: + case DSCP_AF12: + case DSCP_AF13: + return IEEE8021Q_TT_BE; + /* Comment from RFC8325: + * RFC3662 and RFC4594 both recommend Low-Priority Data be marked + * with DSCP CS1. The Low-Priority Data service class loosely + * corresponds to the [IEEE.802.11-2016] Background Access Category + */ + case DSCP_CS1: + return IEEE8021Q_TT_BK; + case DSCP_CS2: + case DSCP_AF21: + case DSCP_AF22: + case DSCP_AF23: + return IEEE8021Q_TT_EE; + case DSCP_CS3: + case DSCP_AF31: + case DSCP_AF32: + case DSCP_AF33: + return IEEE8021Q_TT_CA; + case DSCP_CS4: + case DSCP_AF41: + case DSCP_AF42: + case DSCP_AF43: + return IEEE8021Q_TT_VI; + case DSCP_CS5: + case DSCP_EF: + case DSCP_VOICE_ADMIT: + return IEEE8021Q_TT_VO; + case DSCP_CS6: + return IEEE8021Q_TT_IC; + case DSCP_CS7: + return IEEE8021Q_TT_NC; + } + + return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp); +} +EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 8ec35194bfcb..864f3bbc3a4c 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -42,12 +42,21 @@ static unsigned int default_operstate(const struct net_device *dev) * first check whether lower is indeed the source of its down state. */ if (!netif_carrier_ok(dev)) { - int iflink = dev_get_iflink(dev); struct net_device *peer; + int iflink; + + /* If called from netdev_run_todo()/linkwatch_sync_dev(), + * dev_net(dev) can be already freed, and RTNL is not held. + */ + if (dev->reg_state <= NETREG_REGISTERED) + iflink = dev_get_iflink(dev); + else + iflink = dev->ifindex; if (iflink == dev->ifindex) return IF_OPER_DOWN; + ASSERT_RTNL(); peer = __dev_get_by_index(dev_net(dev), iflink); if (!peer) return IF_OPER_DOWN; @@ -148,9 +157,9 @@ static void linkwatch_schedule_work(int urgent) * override the existing timer. */ if (test_bit(LW_URGENT, &linkwatch_flags)) - mod_delayed_work(system_wq, &linkwatch_work, 0); + mod_delayed_work(system_unbound_wq, &linkwatch_work, 0); else - schedule_delayed_work(&linkwatch_work, delay); + queue_delayed_work(system_unbound_wq, &linkwatch_work, delay); } @@ -174,7 +183,7 @@ static void linkwatch_do_dev(struct net_device *dev) else dev_deactivate(dev); - netdev_state_change(dev); + netif_state_change(dev); } /* Note: our callers are responsible for calling netdev_tracker_free(). * This is the reason we use __dev_put() instead of dev_put(). 
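The two helpers added in net/core/ieee8021q_helpers.c above can be chained: ietf_dscp_to_ieee8021q_tt() turns a DSCP codepoint into an IEEE 802.1Q traffic type, and ieee8021q_tt_to_tc() collapses that traffic type onto however many hardware queues the device exposes. A minimal usage sketch, assuming a hypothetical driver-side caller (example_dscp_to_tc() and its num_tx_queues argument are made up for illustration; only the two exported helpers are taken from the hunks above):

#include <net/dscp.h>
#include <net/ieee8021q.h>

/* Map a packet's DSCP value to an egress traffic class on a device
 * with num_tx_queues TX queues.
 */
static int example_dscp_to_tc(u8 dscp, unsigned int num_tx_queues)
{
	int tt = ietf_dscp_to_ieee8021q_tt(dscp); /* DSCP -> 802.1Q traffic type */

	/* Rejects out-of-range traffic types and unsupported queue
	 * counts with a negative errno.
	 */
	return ieee8021q_tt_to_tc(tt, num_tx_queues);
}

With eight queues every traffic type gets its own class, per Table I-1; with fewer queues adjacent traffic types are merged, down to a single class 0 on a one-queue device.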
@@ -231,7 +240,9 @@ static void __linkwatch_run_queue(int urgent_only) */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); spin_unlock_irq(&lweventlist_lock); + netdev_lock_ops(dev); linkwatch_do_dev(dev); + netdev_unlock_ops(dev); do_dev--; spin_lock_irq(&lweventlist_lock); } @@ -244,25 +255,41 @@ static void __linkwatch_run_queue(int urgent_only) spin_unlock_irq(&lweventlist_lock); } -void linkwatch_sync_dev(struct net_device *dev) +static bool linkwatch_clean_dev(struct net_device *dev) { unsigned long flags; - int clean = 0; + bool clean = false; spin_lock_irqsave(&lweventlist_lock, flags); if (!list_empty(&dev->link_watch_list)) { list_del_init(&dev->link_watch_list); - clean = 1; + clean = true; /* We must release netdev tracker under * the spinlock protection. */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); } spin_unlock_irqrestore(&lweventlist_lock, flags); - if (clean) + + return clean; +} + +void __linkwatch_sync_dev(struct net_device *dev) +{ + netdev_ops_assert_locked(dev); + + if (linkwatch_clean_dev(dev)) linkwatch_do_dev(dev); } +void linkwatch_sync_dev(struct net_device *dev) +{ + if (linkwatch_clean_dev(dev)) { + netdev_lock_ops(dev); + linkwatch_do_dev(dev); + netdev_unlock_ops(dev); + } +} /* Must be called with the rtnl semaphore held */ void linkwatch_run_queue(void) diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c new file mode 100644 index 000000000000..9e9fb25314b9 --- /dev/null +++ b/net/core/lock_debug.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright Amazon.com Inc. or its affiliates. */ + +#include <linux/init.h> +#include <linux/netdevice.h> +#include <linux/notifier.h> +#include <linux/rtnetlink.h> +#include <net/net_namespace.h> +#include <net/netdev_lock.h> +#include <net/netns/generic.h> + +int netdev_debug_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + enum netdev_cmd cmd = event; + + /* Keep enum and don't add default to trigger -Werror=switch */ + switch (cmd) { + case NETDEV_XDP_FEAT_CHANGE: + netdev_assert_locked(dev); + fallthrough; + case NETDEV_CHANGE: + case NETDEV_REGISTER: + case NETDEV_UP: + netdev_ops_assert_locked(dev); + fallthrough; + case NETDEV_DOWN: + case NETDEV_REBOOT: + case NETDEV_UNREGISTER: + case NETDEV_CHANGEMTU: + case NETDEV_CHANGEADDR: + case NETDEV_PRE_CHANGEADDR: + case NETDEV_GOING_DOWN: + case NETDEV_FEAT_CHANGE: + case NETDEV_BONDING_FAILOVER: + case NETDEV_PRE_UP: + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_POST_TYPE_CHANGE: + case NETDEV_POST_INIT: + case NETDEV_PRE_UNINIT: + case NETDEV_RELEASE: + case NETDEV_NOTIFY_PEERS: + case NETDEV_JOIN: + case NETDEV_CHANGEUPPER: + case NETDEV_RESEND_IGMP: + case NETDEV_PRECHANGEMTU: + case NETDEV_CHANGEINFODATA: + case NETDEV_BONDING_INFO: + case NETDEV_PRECHANGEUPPER: + case NETDEV_CHANGELOWERSTATE: + case NETDEV_UDP_TUNNEL_PUSH_INFO: + case NETDEV_UDP_TUNNEL_DROP_INFO: + case NETDEV_CHANGE_TX_QUEUE_LEN: + case NETDEV_CVLAN_FILTER_PUSH_INFO: + case NETDEV_CVLAN_FILTER_DROP_INFO: + case NETDEV_SVLAN_FILTER_PUSH_INFO: + case NETDEV_SVLAN_FILTER_DROP_INFO: + case NETDEV_OFFLOAD_XSTATS_ENABLE: + case NETDEV_OFFLOAD_XSTATS_DISABLE: + case NETDEV_OFFLOAD_XSTATS_REPORT_USED: + case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: + ASSERT_RTNL(); + break; + + case NETDEV_CHANGENAME: + ASSERT_RTNL_NET(net); + break; + } + + return NOTIFY_DONE; +} +EXPORT_SYMBOL_NS_GPL(netdev_debug_event, "NETDEV_INTERNAL"); + +static int 
rtnl_net_debug_net_id; + +static int __net_init rtnl_net_debug_net_init(struct net *net) +{ + struct notifier_block *nb; + + nb = net_generic(net, rtnl_net_debug_net_id); + nb->notifier_call = netdev_debug_event; + + return register_netdevice_notifier_net(net, nb); +} + +static void __net_exit rtnl_net_debug_net_exit(struct net *net) +{ + struct notifier_block *nb; + + nb = net_generic(net, rtnl_net_debug_net_id); + unregister_netdevice_notifier_net(net, nb); +} + +static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = { + .init = rtnl_net_debug_net_init, + .exit = rtnl_net_debug_net_exit, + .id = &rtnl_net_debug_net_id, + .size = sizeof(struct notifier_block), +}; + +static struct notifier_block rtnl_net_debug_block = { + .notifier_call = netdev_debug_event, +}; + +static int __init rtnl_net_debug_init(void) +{ + int ret; + + ret = register_pernet_subsys(&rtnl_net_debug_net_ops); + if (ret) + return ret; + + ret = register_netdevice_notifier(&rtnl_net_debug_block); + if (ret) + unregister_pernet_subsys(&rtnl_net_debug_net_ops); + + return ret; +} + +subsys_initcall(rtnl_net_debug_init); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 4a0797f0a154..ae74634310a3 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -10,8 +10,10 @@ #include <linux/bpf.h> #include <net/lwtunnel.h> #include <net/gre.h> +#include <net/ip.h> #include <net/ip6_route.h> #include <net/ipv6_stubs.h> +#include <net/inet_dscp.h> struct bpf_lwt_prog { struct bpf_prog *prog; @@ -38,13 +40,14 @@ static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, struct dst_entry *dst, bool can_redirect) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int ret; - /* Migration disable and BH disable are needed to protect per-cpu - * redirect_info between BPF prog and skb_do_redirect(). + /* Disabling BH is needed to protect per-CPU bpf_redirect_info between + * BPF prog and skb_do_redirect(). */ - migrate_disable(); local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); @@ -77,24 +80,26 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, break; } + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); - migrate_enable(); return ret; } static int bpf_lwt_input_reroute(struct sk_buff *skb) { + enum skb_drop_reason reason; int err = -EINVAL; if (skb->protocol == htons(ETH_P_IP)) { struct net_device *dev = skb_dst(skb)->dev; - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); dev_hold(dev); skb_dst_drop(skb); - err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); + reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + err = reason ? 
-EINVAL : 0; dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { skb_dst_drop(skb); @@ -204,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); - fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 711cd3b4347a..f9d76d85d04f 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -23,6 +23,8 @@ #include <net/ip6_fib.h> #include <net/rtnh.h> +#include "dev.h" + DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); @@ -158,21 +160,14 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) return ret; } - rcu_read_lock(); - ops = rcu_dereference(lwtun_encaps[encap_type]); - rcu_read_unlock(); + ops = rcu_access_pointer(lwtun_encaps[encap_type]); #ifdef CONFIG_MODULES if (!ops) { const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { - __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); - rtnl_lock(); - - rcu_read_lock(); - ops = rcu_dereference(lwtun_encaps[encap_type]); - rcu_read_unlock(); + ops = rcu_access_pointer(lwtun_encaps[encap_type]); } } #endif @@ -206,8 +201,7 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, } encap_type = nla_get_u16(nla_entype); - if (lwtunnel_valid_encap_type(encap_type, - extack) != 0) + if (lwtunnel_valid_encap_type(encap_type, extack)) return -EOPNOTSUPP; } } @@ -325,82 +319,132 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; + + local_bh_disable(); - if (!dst) + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || - lwtstate->type > LWTUNNEL_ENCAP_MAX) - return 0; + lwtstate->type > LWTUNNEL_ENCAP_MAX) { + ret = 0; + goto out; + } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->output)) + if (likely(ops && ops->output)) { + dev_xmit_recursion_inc(); ret = ops->output(net, sk, skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; - return ret; + goto out; drop: kfree_skb(skb); +out: + local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_output); int lwtunnel_xmit(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; - if (!dst) + local_bh_disable(); + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || - lwtstate->type > LWTUNNEL_ENCAP_MAX) - return 0; + lwtstate->type > LWTUNNEL_ENCAP_MAX) { + ret = 0; + goto out; + } ret = -EOPNOTSUPP; rcu_read_lock(); ops = 
rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->xmit)) + if (likely(ops && ops->xmit)) { + dev_xmit_recursion_inc(); ret = ops->xmit(skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; - return ret; + goto out; drop: kfree_skb(skb); +out: + local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_xmit); int lwtunnel_input(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; - if (!dst) + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || @@ -410,8 +454,11 @@ int lwtunnel_input(struct sk_buff *skb) ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->input)) + if (likely(ops && ops->input)) { + dev_xmit_recursion_inc(); ret = ops->input(skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) diff --git a/net/core/mp_dmabuf_devmem.h b/net/core/mp_dmabuf_devmem.h new file mode 100644 index 000000000000..67cd0dd7319c --- /dev/null +++ b/net/core/mp_dmabuf_devmem.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Dmabuf device memory provider. + * + * Authors: Mina Almasry <almasrymina@google.com> + * + */ +#ifndef _NET_MP_DMABUF_DEVMEM_H +#define _NET_MP_DMABUF_DEVMEM_H + +#include <net/netmem.h> + +#if defined(CONFIG_NET_DEVMEM) +int mp_dmabuf_devmem_init(struct page_pool *pool); + +netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp); + +void mp_dmabuf_devmem_destroy(struct page_pool *pool); + +bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem); +#else +static inline int mp_dmabuf_devmem_init(struct page_pool *pool) +{ + return -EOPNOTSUPP; +} + +static inline netmem_ref +mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp) +{ + return 0; +} + +static inline void mp_dmabuf_devmem_destroy(struct page_pool *pool) +{ +} + +static inline bool +mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) +{ + return false; +} +#endif + +#endif /* _NET_MP_DMABUF_DEVMEM_H */ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 552719c3bbc3..49dce9a82295 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -14,7 +14,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> -#include <linux/kmemleak.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> @@ -61,6 +60,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, static const struct seq_operations neigh_stat_seq_ops; #endif +static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family) +{ + int i; + + switch (family) { + default: + DEBUG_NET_WARN_ON_ONCE(1); + fallthrough; /* to avoid panic by null-ptr-deref */ + case AF_INET: + i = NEIGH_ARP_TABLE; + break; + case AF_INET6: + i = NEIGH_ND_TABLE; + break; + } + + return &dev->neighbours[i]; +} + /* Neighbour hash table buckets are protected with rwlock tbl->lock. 
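The neighbour.c rework that follows replaces the open-coded singly linked hash chains with hlist heads and, via neigh_get_dev_table() above, adds a second per-device list keyed by address family. A minimal sketch of the bookkeeping this enables, assuming it sits inside net/core/neighbour.c with tbl->lock held (example_flush_dev_family() itself is hypothetical; the real logic is in the neigh_remove_one() and neigh_flush_dev() hunks below):

/* Walk one device's neighbours for the table's family and unlink the
 * non-permanent ones from both the global hash bucket and the
 * per-device list, without scanning every hash bucket.
 */
static void example_flush_dev_family(struct neigh_table *tbl,
				     struct net_device *dev)
{
	struct hlist_head *head = neigh_get_dev_table(dev, tbl->family);
	struct hlist_node *tmp;
	struct neighbour *n;

	hlist_for_each_entry_safe(n, tmp, head, dev_list) {
		if (n->nud_state & NUD_PERMANENT)
			continue;
		hlist_del_rcu(&n->hash);	/* global hash chain */
		hlist_del_rcu(&n->dev_list);	/* per-device list */
		/* the real code also stops timers, marks the entry dead
		 * and drops the reference under n->lock
		 */
	}
}

This is why the neigh_flush_dev() hunk below no longer needs to iterate every nht->hash_buckets slot for a single device.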
@@ -205,18 +223,14 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, } } -static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, - struct neigh_table *tbl) +bool neigh_remove_one(struct neighbour *n) { bool retval = false; write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { - struct neighbour *neigh; - - neigh = rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock)); - rcu_assign_pointer(*np, neigh); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); retval = true; } @@ -226,29 +240,6 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, return retval; } -bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) -{ - struct neigh_hash_table *nht; - void *pkey = ndel->primary_key; - u32 hash_val; - struct neighbour *n; - struct neighbour __rcu **np; - - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd); - hash_val = hash_val >> (32 - nht->hash_shift); - - np = &nht->hash_buckets[hash_val]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock)))) { - if (n == ndel) - return neigh_del(n, np, tbl); - np = &n->next; - } - return false; -} - static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - @@ -276,7 +267,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) remove = true; write_unlock(&n->lock); - if (remove && neigh_remove_one(n, tbl)) + if (remove && neigh_remove_one(n)) shrunk++; if (shrunk >= max_clean) break; @@ -318,7 +309,7 @@ static void neigh_add_timer(struct neighbour *n, unsigned long when) static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && - del_timer(&n->timer)) { + timer_delete(&n->timer)) { neigh_release(n); return 1; } @@ -380,54 +371,42 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - int i; - struct neigh_hash_table *nht; + struct hlist_head *dev_head; + struct hlist_node *tmp; + struct neighbour *n; - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); + dev_head = neigh_get_dev_table(dev, tbl->family); - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour *n; - struct neighbour __rcu **np = &nht->hash_buckets[i]; + hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { + if (skip_perm && n->nud_state & NUD_PERMANENT) + continue; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { - if (dev && n->dev != dev) { - np = &n->next; - continue; - } - if (skip_perm && n->nud_state & NUD_PERMANENT) { - np = &n->next; - continue; - } - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); - write_lock(&n->lock); - neigh_del_timer(n); - neigh_mark_dead(n); - if (refcount_read(&n->refcnt) != 1) { - /* The most unpleasant situation. - We must destroy neighbour entry, - but someone still uses it. - - The destroy will be delayed until - the last user releases us, but - we must kill timers etc. and move - it to safe state. 
- */ - __skb_queue_purge(&n->arp_queue); - n->arp_queue_len_bytes = 0; - WRITE_ONCE(n->output, neigh_blackhole); - if (n->nud_state & NUD_VALID) - n->nud_state = NUD_NOARP; - else - n->nud_state = NUD_NONE; - neigh_dbg(2, "neigh %p is stray\n", n); - } - write_unlock(&n->lock); - neigh_cleanup_and_release(n); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); + write_lock(&n->lock); + neigh_del_timer(n); + neigh_mark_dead(n); + if (refcount_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + * We must destroy neighbour entry, + * but someone still uses it. + * + * The destroy will be delayed until + * the last user releases us, but + * we must kill timers etc. and move + * it to safe state. + */ + __skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; + WRITE_ONCE(n->output, neigh_blackhole); + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + neigh_dbg(2, "neigh %p is stray\n", n); } + write_unlock(&n->lock); + neigh_cleanup_and_release(n); } } @@ -448,7 +427,7 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) - del_timer_sync(&tbl->proxy_timer); + timer_delete_sync(&tbl->proxy_timer); return 0; } @@ -530,27 +509,21 @@ static void neigh_get_hash_rnd(u32 *x) static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { - size_t size = (1 << shift) * sizeof(struct neighbour *); + size_t size = (1 << shift) * sizeof(struct hlist_head); + struct hlist_head *hash_heads; struct neigh_hash_table *ret; - struct neighbour __rcu **buckets; int i; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; - if (size <= PAGE_SIZE) { - buckets = kzalloc(size, GFP_ATOMIC); - } else { - buckets = (struct neighbour __rcu **) - __get_free_pages(GFP_ATOMIC | __GFP_ZERO, - get_order(size)); - kmemleak_alloc(buckets, size, 1, GFP_ATOMIC); - } - if (!buckets) { + + hash_heads = kzalloc(size, GFP_ATOMIC); + if (!hash_heads) { kfree(ret); return NULL; } - ret->hash_buckets = buckets; + ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); @@ -562,15 +535,8 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); - size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); - struct neighbour __rcu **buckets = nht->hash_buckets; - if (size <= PAGE_SIZE) { - kfree(buckets); - } else { - kmemleak_free(buckets); - free_pages((unsigned long)buckets, get_order(size)); - } + kfree(nht->hash_heads); kfree(nht); } @@ -589,24 +555,17 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { - struct neighbour *n, *next; + struct hlist_node *tmp; + struct neighbour *n; - for (n = rcu_dereference_protected(old_nht->hash_buckets[i], - lockdep_is_held(&tbl->lock)); - n != NULL; - n = next) { + neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); - next = rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock)); - rcu_assign_pointer(n->next, - rcu_dereference_protected( - new_nht->hash_buckets[hash], - lockdep_is_held(&tbl->lock))); - rcu_assign_pointer(new_nht->hash_buckets[hash], n); + hlist_del_rcu(&n->hash); + 
hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } } @@ -693,11 +652,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, goto out_tbl_unlock; } - for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], - lockdep_is_held(&tbl->lock)); - n1 != NULL; - n1 = rcu_dereference_protected(n1->next, - lockdep_is_held(&tbl->lock))) { + neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) { if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); @@ -713,10 +668,11 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); - rcu_assign_pointer(n->next, - rcu_dereference_protected(nht->hash_buckets[hash_val], - lockdep_is_held(&tbl->lock))); - rcu_assign_pointer(nht->hash_buckets[hash_val], n); + hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); + + hlist_add_head_rcu(&n->dev_list, + neigh_get_dev_table(dev, tbl->family)); + write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; @@ -734,7 +690,9 @@ out_neigh_release: struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { - return ___neigh_create(tbl, pkey, dev, 0, false, want_ref); + bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); + + return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); } EXPORT_SYMBOL(__neigh_create); @@ -874,12 +832,10 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, return -ENOENT; } -static void neigh_parms_destroy(struct neigh_parms *parms); - static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) - neigh_parms_destroy(parms); + kfree(parms); } /* @@ -946,10 +902,10 @@ static void neigh_connect(struct neighbour *neigh) static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); + struct neigh_hash_table *nht; + struct hlist_node *tmp; struct neighbour *n; - struct neighbour __rcu **np; unsigned int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); @@ -974,10 +930,7 @@ static void neigh_periodic_work(struct work_struct *work) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { - np = &nht->hash_buckets[i]; - - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; write_lock(&n->lock); @@ -986,7 +939,7 @@ static void neigh_periodic_work(struct work_struct *work) if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || (n->flags & NTF_EXT_LEARNED)) { write_unlock(&n->lock); - goto next_elt; + continue; } if (time_before(n->used, n->confirmed) && @@ -997,18 +950,14 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); - -next_elt: - np = &n->next; } /* * It's fine to release lock here, even if hash table @@ -1082,7 +1031,7 @@ static void neigh_probe(struct neighbour *neigh) static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; - struct neighbour *neigh = 
from_timer(neigh, t, timer); + struct neighbour *neigh = timer_container_of(neigh, t, timer); unsigned int state; int notify = 0; @@ -1568,7 +1517,7 @@ out: return rc; out_kfree_skb: rc = -EINVAL; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); goto out; } EXPORT_SYMBOL(neigh_resolve_output); @@ -1592,7 +1541,7 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) err = dev_queue_xmit(skb); else { err = -EINVAL; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); } return err; } @@ -1620,7 +1569,7 @@ static void neigh_managed_work(struct work_struct *work) static void neigh_proxy_process(struct timer_list *t) { - struct neigh_table *tbl = from_timer(tbl, t, proxy_timer); + struct neigh_table *tbl = timer_container_of(tbl, t, proxy_timer); long sched_next = 0; unsigned long now = jiffies; struct sk_buff *skb, *n; @@ -1648,7 +1597,7 @@ static void neigh_proxy_process(struct timer_list *t) } else if (!sched_next || tdif < sched_next) sched_next = tdif; } - del_timer(&tbl->proxy_timer); + timer_delete(&tbl->proxy_timer); if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); spin_unlock(&tbl->proxy_queue.lock); @@ -1679,7 +1628,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); - if (del_timer(&tbl->proxy_timer)) { + if (timer_delete(&tbl->proxy_timer)) { if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } @@ -1762,14 +1711,9 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) } EXPORT_SYMBOL(neigh_parms_release); -static void neigh_parms_destroy(struct neigh_parms *parms) -{ - kfree(parms); -} - static struct lock_class_key neigh_table_proxy_queue_class; -static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; +static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { @@ -1826,17 +1770,23 @@ void neigh_table_init(int index, struct neigh_table *tbl) tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; - neigh_tables[index] = tbl; + rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); +/* + * Only called from ndisc_cleanup(), which means this is dead code + * because we no longer can unload IPv6 module. + */ int neigh_table_clear(int index, struct neigh_table *tbl) { - neigh_tables[index] = NULL; + RCU_INIT_POINTER(neigh_tables[index], NULL); + synchronize_rcu(); + /* It is not clean... 
Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); - del_timer_sync(&tbl->proxy_timer); + timer_delete_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) @@ -1864,10 +1814,10 @@ static struct neigh_table *neigh_find_table(int family) switch (family) { case AF_INET: - tbl = neigh_tables[NEIGH_ARP_TABLE]; + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); break; case AF_INET6: - tbl = neigh_tables[NEIGH_ND_TABLE]; + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); break; } @@ -1949,7 +1899,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); - neigh_remove_one(neigh, tbl); + neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); out: @@ -2293,6 +2243,7 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, + [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, @@ -2331,7 +2282,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, ndtmsg = nlmsg_data(nlh); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { - tbl = neigh_tables[tidx]; + tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) @@ -2479,12 +2430,12 @@ static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, { struct ndtmsg *ndtm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) { + ndtm = nlmsg_payload(nlh, sizeof(*ndtm)); + if (!ndtm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); return -EINVAL; } - ndtm = nlmsg_data(nlh); if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); return -EINVAL; @@ -2519,7 +2470,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; - tbl = neigh_tables[tidx]; + tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; @@ -2674,7 +2625,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) if (!master_idx) return false; - master = dev ? netdev_master_upper_dev_get(dev) : NULL; + master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another * invalid value for ifindex to denote "no master". 
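The neighbour.c hunks above replace the open-coded RCU bucket chains (struct neighbour __rcu **hash_buckets walked by hand with rcu_dereference_protected()/rcu_assign_pointer()) with plain hlist_head buckets, so insertion, removal and table growth can lean on the generic RCU hlist helpers, which the patch wraps as neigh_for_each_in_bucket*(). As a rough kernel-style sketch of that rehash pattern — with made-up names (struct entry, rehash_bucket), not the functions from this patch — it boils down to:

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/types.h>

struct entry {
	struct hlist_node hash;		/* replaces the old __rcu *next chain */
	u32 key;
};

/*
 * Sketch only: move every entry hanging off @old into @new_tbl,
 * re-bucketed by @hashfn. Assumes the caller holds the table's write
 * lock and that readers only traverse the lists under rcu_read_lock(),
 * hence the _rcu list helpers. @shift is assumed to be < 32, as for
 * neigh_hash_table::hash_shift.
 */
static void rehash_bucket(struct hlist_head *old, struct hlist_head *new_tbl,
			  u32 (*hashfn)(u32 key), unsigned int shift)
{
	struct hlist_node *tmp;
	struct entry *e;

	hlist_for_each_entry_safe(e, tmp, old, hash) {
		u32 h = hashfn(e->key) >> (32 - shift);

		hlist_del_rcu(&e->hash);
		hlist_add_head_rcu(&e->hash, &new_tbl[h]);
	}
}

The _safe iterator is what allows the loop to unlink the node it is currently standing on, which is exactly what neigh_hash_grow(), neigh_periodic_work() and __neigh_for_each_release() need once the chains are hlists.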
@@ -2707,7 +2658,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, { struct net *net = sock_net(skb->sk); struct neighbour *n; - int rc, h, s_h = cb->args[1]; + int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; unsigned int flags = NLM_F_MULTI; @@ -2715,37 +2666,31 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; - rcu_read_lock(); nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; - for (n = rcu_dereference(nht->hash_buckets[h]), idx = 0; - n != NULL; - n = rcu_dereference(n->next)) { + idx = 0; + neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; - if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - flags) < 0) { - rc = -1; + err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags); + if (err < 0) goto out; - } next: idx++; } } - rc = skb->len; out: - rcu_read_unlock(); cb->args[1] = h; cb->args[2] = idx; - return rc; + return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, @@ -2754,7 +2699,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); - int rc, h, s_h = cb->args[3]; + int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; unsigned int flags = NLM_F_MULTI; @@ -2772,11 +2717,11 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; - if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, flags, tbl) < 0) { + err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags, tbl); + if (err < 0) { read_unlock_bh(&tbl->lock); - rc = -1; goto out; } next: @@ -2785,12 +2730,10 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, } read_unlock_bh(&tbl->lock); - rc = skb->len; out: cb->args[3] = h; cb->args[4] = idx; - return rc; - + return err; } static int neigh_valid_dump_req(const struct nlmsghdr *nlh, @@ -2804,12 +2747,12 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, if (strict_check) { struct ndmsg *ndm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); @@ -2875,11 +2818,13 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); if (err < 0 && cb->strict_check) return err; + err = 0; s_t = cb->args[0]; + rcu_read_lock(); for (t = 0; t < NEIGH_NR_TABLES; t++) { - tbl = neigh_tables[t]; + tbl = rcu_dereference(neigh_tables[t]); if (!tbl) continue; @@ -2895,9 +2840,10 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) 
break; } + rcu_read_unlock(); cb->args[0] = t; - return skb->len; + return err; } static int neigh_valid_get_req(const struct nlmsghdr *nlh, @@ -2909,12 +2855,12 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); @@ -3094,9 +3040,7 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; - for (n = rcu_dereference(nht->hash_buckets[chain]); - n != NULL; - n = rcu_dereference(n->next)) + neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } read_unlock_bh(&tbl->lock); @@ -3108,29 +3052,25 @@ EXPORT_SYMBOL(neigh_for_each); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { - int chain; struct neigh_hash_table *nht; + int chain; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { + struct hlist_node *tmp; struct neighbour *n; - struct neighbour __rcu **np; - np = &nht->hash_buckets[chain]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); release = cb(n); if (release) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); - } else - np = &n->next; + } write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); @@ -3143,14 +3083,15 @@ int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; + if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; - tbl = neigh_tables[index]; - if (!tbl) - goto out; rcu_read_lock(); + tbl = rcu_dereference(neigh_tables[index]); + if (!tbl) + goto out_unlock; if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); @@ -3166,6 +3107,7 @@ int neigh_xmit(int index, struct net_device *dev, goto out_kfree_skb; } err = READ_ONCE(neigh->output)(neigh, skb); +out_unlock: rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { @@ -3185,43 +3127,53 @@ EXPORT_SYMBOL(neigh_xmit); #ifdef CONFIG_PROC_FS -static struct neighbour *neigh_get_first(struct seq_file *seq) +static struct neighbour *neigh_get_valid(struct seq_file *seq, + struct neighbour *n, + loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); + + if (!net_eq(dev_net(n->dev), net)) + return NULL; + + if (state->neigh_sub_iter) { + loff_t fakep = 0; + void *v; + + v = state->neigh_sub_iter(state, n, pos ? 
pos : &fakep); + if (!v) + return NULL; + if (pos) + return v; + } + + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + return n; + + if (READ_ONCE(n->nud_state) & ~NUD_NOARP) + return n; + + return NULL; +} + +static struct neighbour *neigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; struct neigh_hash_table *nht = state->nht; - struct neighbour *n = NULL; - int bucket; + struct neighbour *n, *tmp; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; - for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) { - n = rcu_dereference(nht->hash_buckets[bucket]); - while (n) { - if (!net_eq(dev_net(n->dev), net)) - goto next; - if (state->neigh_sub_iter) { - loff_t fakep = 0; - void *v; - - v = state->neigh_sub_iter(state, n, &fakep); - if (!v) - goto next; - } - if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) - break; - if (READ_ONCE(n->nud_state) & ~NUD_NOARP) - break; -next: - n = rcu_dereference(n->next); + while (++state->bucket < (1 << nht->hash_shift)) { + neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) { + tmp = neigh_get_valid(seq, n, NULL); + if (tmp) + return tmp; } - - if (n) - break; } - state->bucket = bucket; - return n; + return NULL; } static struct neighbour *neigh_get_next(struct seq_file *seq, @@ -3229,46 +3181,28 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; - struct net *net = seq_file_net(seq); - struct neigh_hash_table *nht = state->nht; + struct neighbour *tmp; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); + if (v) return n; } - n = rcu_dereference(n->next); - while (1) { - while (n) { - if (!net_eq(dev_net(n->dev), net)) - goto next; - if (state->neigh_sub_iter) { - void *v = state->neigh_sub_iter(state, n, pos); - if (v) - return n; - goto next; - } - if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) - break; - - if (READ_ONCE(n->nud_state) & ~NUD_NOARP) - break; -next: - n = rcu_dereference(n->next); + hlist_for_each_entry_continue(n, hash) { + tmp = neigh_get_valid(seq, n, pos); + if (tmp) { + n = tmp; + goto out; } - - if (n) - break; - - if (++state->bucket >= (1 << nht->hash_shift)) - break; - - n = rcu_dereference(nht->hash_buckets[state->bucket]); } + n = neigh_get_first(seq); +out: if (n && pos) --(*pos); + return n; } @@ -3371,7 +3305,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl struct neigh_seq_state *state = seq->private; state->tbl = tbl; - state->bucket = 0; + state->bucket = -1; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); rcu_read_lock(); @@ -3507,10 +3441,12 @@ static const struct seq_operations neigh_stat_seq_ops = { static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { - struct net *net = dev_net(n->dev); struct sk_buff *skb; int err = -ENOBUFS; + struct net *net; + rcu_read_lock(); + net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; @@ -3523,10 +3459,11 @@ static void __neigh_notify(struct neighbour *n, int type, int flags, goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); - return; + goto out; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +out: + rcu_read_unlock(); } void neigh_app_ns(struct neighbour *n) @@ -3538,7 +3475,7 @@ EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); -static int proc_unres_qlen(struct ctl_table *ctl, 
int write, +static int proc_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; @@ -3573,7 +3510,7 @@ static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, rcu_read_unlock(); } -static void neigh_proc_update(struct ctl_table *ctl, int write) +static void neigh_proc_update(const struct ctl_table *ctl, int write) { struct net_device *dev = ctl->extra1; struct neigh_parms *p = ctl->extra2; @@ -3590,7 +3527,7 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) neigh_copy_dflt_parms(net, p, index); } -static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, +static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3605,7 +3542,7 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } -static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write, +static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; @@ -3621,7 +3558,7 @@ static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int wr return ret; } -int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, +int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -3631,7 +3568,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, } EXPORT_SYMBOL(neigh_proc_dointvec); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, +int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); @@ -3641,7 +3578,7 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, } EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); -static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, +static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3651,7 +3588,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, return ret; } -int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, +int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); @@ -3661,7 +3598,7 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, } EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); -static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, +static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3671,7 +3608,7 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, return ret; } -static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, +static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3728,7 +3665,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; + struct ctl_table 
neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), @@ -3779,7 +3716,6 @@ static struct neigh_sysctl_table { .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, - {}, }, }; @@ -3807,8 +3743,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, if (dev) { dev_name_source = dev->name; /* Terminate the table early */ - memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, - sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; @@ -3885,16 +3819,18 @@ EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ +static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, + {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, + {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, + {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, +}; + static int __init neigh_init(void) { - rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); - - rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, - 0); - rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0); - + rtnl_register_many(neigh_rtnl_msg_handlers); return 0; } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index a97eceb84e61..4f0f0709a1cb 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -132,8 +132,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); + /* Pairs with WRITE_ONCE() in skb_flow_limit() */ if (fl) - flow_limit_count = fl->count; + flow_limit_count = READ_ONCE(fl->count); rcu_read_unlock(); #endif @@ -144,10 +145,11 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - sd->processed, sd->dropped, sd->time_squeeze, 0, + READ_ONCE(sd->processed), atomic_read(&sd->dropped), + READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ - sd->received_rps, flow_limit_count, + READ_ONCE(sd->received_rps), flow_limit_count, input_qlen + process_qlen, (int)seq->index, input_qlen, process_qlen); return 0; @@ -184,7 +186,13 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos) } } - list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) { + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) { + if (i == pos) + return pt; + ++i; + } + + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) { if (i == pos) return pt; ++i; @@ -209,6 +217,7 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net *net = seq_file_net(seq); struct net_device *dev; struct packet_type *pt; struct list_head *nxt; @@ -231,15 +240,22 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) goto found; } } - - nxt = net_hotdata.ptype_all.next; - goto ptype_all; + nxt = net->ptype_all.next; + goto net_ptype_all; } - if (pt->type == htons(ETH_P_ALL)) { -ptype_all: - if (nxt != &net_hotdata.ptype_all) + if 
(pt->af_packet_net) { +net_ptype_all: + if (nxt != &net->ptype_all && nxt != &net->ptype_specific) goto found; + + if (nxt == &net->ptype_all) { + /* continue with ->ptype_specific if it's not empty */ + nxt = net->ptype_specific.next; + if (nxt != &net->ptype_specific) + goto found; + } + hash = 0; nxt = ptype_base[0].next; } else diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e3d7a8cfa20b..1ace0cd01adc 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -23,6 +23,7 @@ #include <linux/of.h> #include <linux/of_net.h> #include <linux/cpu.h> +#include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/rps.h> @@ -32,15 +33,97 @@ #ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; static const char fmt_dec[] = "%d\n"; +static const char fmt_uint[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; -/* Caller holds RTNL or RCU */ +/* Caller holds RTNL, netdev->lock or RCU */ static inline int dev_isalive(const struct net_device *dev) { return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; } +/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active, + * when unregistering a net device and accessing associated sysfs files. The + * potential deadlock is as follow: + * + * CPU 0 CPU 1 + * + * rtnl_lock vfs_read + * unregister_netdevice_many kernfs_seq_start + * device_del / kobject_put kernfs_get_active (kn->active++) + * kernfs_drain sysfs_kf_seq_show + * wait_event( rtnl_lock + * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release + * -> waits on CPU 1 to decrease kn->active the rtnl lock. + * + * The historical fix was to use rtnl_trylock with restart_syscall to bail out + * of sysfs operations when the lock couldn't be taken. This fixed the above + * issue as it allowed CPU 1 to bail out of the ABBA situation. + * + * But it came with performances issues, as syscalls are being restarted in + * loops when there was contention on the rtnl lock, with huge slow downs in + * specific scenarios (e.g. lots of virtual interfaces created and userspace + * daemons querying their attributes). + * + * The idea below is to bail out of the active kernfs_node protection + * (kn->active) while trying to take the rtnl lock. + * + * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The + * net device is guaranteed to be alive if this returns successfully. + */ +static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr, + struct net_device *ndev) +{ + struct kernfs_node *kn; + int ret = 0; + + /* First, we hold a reference to the net device as the unregistration + * path might run in parallel. This will ensure the net device and the + * associated sysfs objects won't be freed while we try to take the rtnl + * lock. + */ + dev_hold(ndev); + /* sysfs_break_active_protection was introduced to allow self-removal of + * devices and their associated sysfs files by bailing out of the + * sysfs/kernfs protection. We do this here to allow the unregistration + * path to complete in parallel. The following takes a reference on the + * kobject and the kernfs_node being accessed. + * + * This works because we hold a reference onto the net device and the + * unregistration path will wait for us eventually in netdev_run_todo + * (outside an rtnl lock section). + */ + kn = sysfs_break_active_protection(kobj, attr); + /* We can now try to take the rtnl lock. 
This can't deadlock us as the + * unregistration path is able to drain sysfs files (kernfs_node) thanks + * to the above dance. + */ + if (rtnl_lock_interruptible()) { + ret = -ERESTARTSYS; + goto unbreak; + } + /* Check dismantle on the device hasn't started, otherwise deny the + * operation. + */ + if (!dev_isalive(ndev)) { + rtnl_unlock(); + ret = -ENODEV; + goto unbreak; + } + /* We are now sure the device dismantle hasn't started nor that it can + * start before we exit the locking section as we hold the rtnl lock. + * There's no need to keep unbreaking the sysfs protection nor to hold + * a net device reference from that point; that was only needed to take + * the rtnl lock. + */ +unbreak: + sysfs_unbreak_active_protection(kn); + dev_put(ndev); + + return ret; +} + /* use same locking rules as GIF* ioctl's */ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, @@ -94,16 +177,46 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, if (ret) goto err; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + goto err; + + ret = (*set)(netdev, new); + if (ret == 0) + ret = len; + + rtnl_unlock(); + err: + return ret; +} + +/* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */ +static ssize_t +netdev_lock_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len, + int (*set)(struct net_device *, unsigned long)) +{ + struct net_device *netdev = to_net_dev(dev); + struct net *net = dev_net(netdev); + unsigned long new; + int ret; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + ret = kstrtoul(buf, 0, &new); + if (ret) + return ret; + + netdev_lock(netdev); if (dev_isalive(netdev)) { ret = (*set)(netdev, new); if (ret == 0) ret = len; } - rtnl_unlock(); - err: + netdev_unlock(netdev); + return ret; } @@ -189,7 +302,7 @@ static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); /* The check is also done in change_carrier; this helps returning early - * without hitting the trylock/restart in netdev_store. + * without hitting the locking section in netdev_store. */ if (!netdev->netdev_ops->ndo_change_carrier) return -EOPNOTSUPP; @@ -201,11 +314,13 @@ static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - int ret = -EINVAL; + int ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + ret = -EINVAL; if (netif_running(netdev)) { /* Synchronize carrier state with link watch, * see also rtnl_getlink(). @@ -214,8 +329,8 @@ static ssize_t carrier_show(struct device *dev, ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); } - rtnl_unlock(); + rtnl_unlock(); return ret; } static DEVICE_ATTR_RW(carrier); @@ -227,15 +342,17 @@ static ssize_t speed_show(struct device *dev, int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. 
*/ if (!netdev->ethtool_ops->get_link_ksettings) return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (netif_running(netdev) && netif_device_present(netdev)) { + ret = -EINVAL; + if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) @@ -253,14 +370,16 @@ static ssize_t duplex_show(struct device *dev, int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + ret = -EINVAL; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; @@ -408,7 +527,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { - WRITE_ONCE(dev->gro_flush_timeout, val); + netdev_set_gro_flush_timeout(dev, val); return 0; } @@ -419,13 +538,16 @@ static ssize_t gro_flush_timeout_store(struct device *dev, if (!capable(CAP_NET_ADMIN)) return -EPERM; - return netdev_store(dev, attr, buf, len, change_gro_flush_timeout); + return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout); } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) { - WRITE_ONCE(dev->napi_defer_hard_irqs, val); + if (val > S32_MAX) + return -ERANGE; + + netdev_set_defer_hard_irqs(dev, (u32)val); return 0; } @@ -436,9 +558,10 @@ static ssize_t napi_defer_hard_irqs_store(struct device *dev, if (!capable(CAP_NET_ADMIN)) return -EPERM; - return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); + return netdev_lock_store(dev, attr, buf, len, + change_napi_defer_hard_irqs); } -NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint); static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -446,7 +569,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); size_t count = len; - ssize_t ret = 0; + ssize_t ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -455,16 +578,15 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - ret = dev_set_alias(netdev, buf, count); - if (ret < 0) - goto err; - ret = len; - netdev_state_change(netdev); - } + ret = dev_set_alias(netdev, buf, count); + if (ret < 0) + goto err; + ret = len; + netdev_state_change(netdev); err: rtnl_unlock(); @@ -476,7 +598,7 @@ static ssize_t ifalias_show(struct device *dev, { const struct net_device *netdev = to_net_dev(dev); char tmp[IFALIASZ]; - ssize_t ret = 0; + ssize_t ret; ret = dev_get_alias(netdev, tmp, sizeof(tmp)); if (ret > 0) @@ -516,24 +638,23 @@ static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; + struct netdev_phys_item_id ppid; + 
ssize_t ret; /* The check is also done in dev_get_phys_port_id; this helps returning - * early without hitting the trylock/restart below. + * early without hitting the locking section below. */ if (!netdev->netdev_ops->ndo_get_phys_port_id) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid; + ret = dev_get_phys_port_id(netdev, &ppid); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - ret = dev_get_phys_port_id(netdev, &ppid); - if (!ret) - ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - } rtnl_unlock(); return ret; @@ -544,25 +665,24 @@ static ssize_t phys_port_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; + char name[IFNAMSIZ]; + ssize_t ret; /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->netdev_ops->ndo_get_phys_port_name && !netdev->devlink_port) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - char name[IFNAMSIZ]; + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sysfs_emit(buf, "%s\n", name); - ret = dev_get_phys_port_name(netdev, name, sizeof(name)); - if (!ret) - ret = sysfs_emit(buf, "%s\n", name); - } rtnl_unlock(); return ret; @@ -573,26 +693,25 @@ static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; + struct netdev_phys_item_id ppid = { }; + ssize_t ret; /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the trylock/restart below. This works + * returning early without hitting the locking section below. This works * because recurse is false when calling dev_get_port_parent_id. 
*/ if (!netdev->netdev_ops->ndo_get_port_parent_id && !netdev->devlink_port) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid = { }; + ret = dev_get_port_parent_id(netdev, &ppid, false); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - ret = dev_get_port_parent_id(netdev, &ppid, false); - if (!ret) - ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - } rtnl_unlock(); return ret; @@ -605,13 +724,13 @@ static ssize_t threaded_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (!rtnl_trylock()) - return restart_syscall(); + rcu_read_lock(); if (dev_isalive(netdev)) - ret = sysfs_emit(buf, fmt_dec, netdev->threaded); + ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded)); + + rcu_read_unlock(); - rtnl_unlock(); return ret; } @@ -634,7 +753,7 @@ static ssize_t threaded_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - return netdev_store(dev, attr, buf, len, modify_napi_threaded); + return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded); } static DEVICE_ATTR_RW(threaded); @@ -937,7 +1056,7 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, rcu_read_lock(); flow_table = rcu_dereference(queue->rps_flow_table); if (flow_table) - val = (unsigned long)flow_table->mask + 1; + val = 1UL << flow_table->log; rcu_read_unlock(); return sysfs_emit(buf, "%lu\n", val); @@ -990,7 +1109,7 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, if (!table) return -ENOMEM; - table->mask = mask; + table->log = ilog2(mask) + 1; for (count = 0; count <= mask; count++) table->flows[count].cpu = RPS_NO_CPU; } else { @@ -1056,7 +1175,7 @@ static const void *rx_queue_namespace(const struct kobject *kobj) struct device *dev = &queue->dev->dev; const void *ns = NULL; - if (dev->class && dev->class->ns_type) + if (dev->class && dev->class->namespace) ns = dev->class->namespace(dev); return ns; @@ -1073,7 +1192,6 @@ static void rx_queue_get_ownership(const struct kobject *kobj, static const struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, - .default_groups = rx_queue_default_groups, .namespace = rx_queue_namespace, .get_ownership = rx_queue_get_ownership, }; @@ -1096,6 +1214,22 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) struct kobject *kobj = &queue->kobj; int error = 0; + /* Rx queues are cleared in rx_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. + * + * If a queue is removed while both a read (or write) operation and a + * the re-addition of the same queue are pending (waiting on rntl_lock) + * it might happen that the re-addition will execute before the read, + * making the initial removal to never happen (queue's kobj refcount + * won't drop enough because of the pending read). In such rare case, + * return to allow the removal operation to complete. 
+ */ + if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed"); + return -EAGAIN; + } + /* Kobject_put later will trigger rx_queue_release call which * decreases dev refcount: Take that reference here */ @@ -1107,20 +1241,27 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = rx_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) - goto err; + goto err_default_groups; } error = rx_queue_default_mask(dev, queue); if (error) - goto err; + goto err_default_groups; kobject_uevent(kobj, KOBJ_ADD); return error; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; @@ -1165,12 +1306,14 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) } while (--i >= new_num) { - struct kobject *kobj = &dev->_rx[i].kobj; + struct netdev_rx_queue *queue = &dev->_rx[i]; + struct kobject *kobj = &queue->kobj; if (!refcount_read(&dev_net(dev)->ns.count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); + sysfs_remove_groups(kobj, queue->groups); kobject_put(kobj); } @@ -1209,9 +1352,11 @@ static int net_rx_queue_change_owner(struct net_device *dev, int num, */ struct netdev_queue_attribute { struct attribute attr; - ssize_t (*show)(struct netdev_queue *queue, char *buf); - ssize_t (*store)(struct netdev_queue *queue, - const char *buf, size_t len); + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len); }; #define to_netdev_queue_attr(_attr) \ container_of(_attr, struct netdev_queue_attribute, attr) @@ -1228,7 +1373,7 @@ static ssize_t netdev_queue_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - return attribute->show(queue, buf); + return attribute->show(kobj, attr, queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, @@ -1242,7 +1387,7 @@ static ssize_t netdev_queue_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - return attribute->store(queue, buf, count); + return attribute->store(kobj, attr, queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { @@ -1250,7 +1395,8 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { .store = netdev_queue_attr_store, }; -static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) +static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); @@ -1268,18 +1414,18 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue) return i; } -static ssize_t traffic_class_show(struct netdev_queue *queue, - char *buf) +static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; - int num_tc, tc; - int index; + int num_tc, tc, index, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; index = get_netdev_queue_index(queue); @@ -1306,24 
+1452,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, } #ifdef CONFIG_XPS -static ssize_t tx_maxrate_show(struct netdev_queue *queue, - char *buf) +static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); } -static ssize_t tx_maxrate_store(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { - struct net_device *dev = queue->dev; int err, index = get_netdev_queue_index(queue); + struct net_device *dev = queue->dev; u32 rate = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; /* The check is also done later; this helps returning early without - * hitting the trylock/restart below. + * hitting the locking section below. */ if (!dev->netdev_ops->ndo_set_tx_maxrate) return -EOPNOTSUPP; @@ -1332,18 +1479,23 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue, if (err < 0) return err; - if (!rtnl_trylock()) - return restart_syscall(); + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) + return err; err = -EOPNOTSUPP; + netdev_lock_ops(dev); if (dev->netdev_ops->ndo_set_tx_maxrate) err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); + netdev_unlock_ops(dev); - rtnl_unlock(); if (!err) { queue->tx_maxrate = rate; + rtnl_unlock(); return len; } + + rtnl_unlock(); return err; } @@ -1387,16 +1539,17 @@ static ssize_t bql_set(const char *buf, const size_t count, return count; } -static ssize_t bql_show_hold_time(struct netdev_queue *queue, - char *buf) +static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } -static ssize_t bql_set_hold_time(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct dql *dql = &queue->dql; unsigned int value; @@ -1415,15 +1568,17 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); -static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; - return sprintf(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); } -static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct dql *dql = &queue->dql; unsigned int value; @@ -1449,13 +1604,15 @@ static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs); -static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { - return sprintf(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); + return sysfs_emit(buf, "%u\n", 
READ_ONCE(queue->dql.stall_max)); } -static ssize_t bql_set_stall_max(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { WRITE_ONCE(queue->dql.stall_max, 0); return len; @@ -1464,18 +1621,19 @@ static ssize_t bql_set_stall_max(struct netdev_queue *queue, static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init = __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max); -static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; - return sprintf(buf, "%lu\n", dql->stall_cnt); + return sysfs_emit(buf, "%lu\n", dql->stall_cnt); } static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL); -static ssize_t bql_show_inflight(struct netdev_queue *queue, - char *buf) +static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; @@ -1486,13 +1644,16 @@ static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = __ATTR(inflight, 0444, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ -static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ - char *buf) \ +static ssize_t bql_show_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, char *buf) \ { \ return bql_show(buf, queue->dql.FIELD); \ } \ \ -static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ +static ssize_t bql_set_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ @@ -1524,7 +1685,7 @@ static const struct attribute_group dql_group = { }; #else /* Fake declaration, all the code using it should be dead */ -extern const struct attribute_group dql_group; +static const struct attribute_group dql_group = {}; #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS @@ -1578,19 +1739,21 @@ out_no_maps: return len < PAGE_SIZE ? len : -EINVAL; } -static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) +static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; - int len, tc; + int len, tc, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; index = get_netdev_queue_index(queue); - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; @@ -1601,18 +1764,21 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) return -EINVAL; } - /* Make sure the subordinate device can't be freed */ - get_device(&dev->dev); + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. 
+ */ + dev_hold(dev); rtnl_unlock(); len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); - put_device(&dev->dev); + dev_put(dev); return len; } -static ssize_t xps_cpus_store(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct net_device *dev = queue->dev; unsigned int index; @@ -1636,9 +1802,10 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } - if (!rtnl_trylock()) { + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { free_cpumask_var(mask); - return restart_syscall(); + return err; } err = netif_set_xps_queue(dev, mask, index); @@ -1652,26 +1819,34 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); -static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) +static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; - int tc; + int tc, ret; index = get_netdev_queue_index(queue); - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, dev); + if (ret) + return ret; tc = netdev_txq_to_tc(dev, index); + + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. + */ + dev_hold(dev); rtnl_unlock(); - if (tc < 0) - return -EINVAL; - return xps_queue_show(dev, index, tc, buf, XPS_RXQS); + ret = tc >= 0 ? xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL; + dev_put(dev); + return ret; } -static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, +static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; @@ -1695,9 +1870,10 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } - if (!rtnl_trylock()) { + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { bitmap_free(mask); - return restart_syscall(); + return err; } cpus_read_lock(); @@ -1740,7 +1916,7 @@ static const void *netdev_queue_namespace(const struct kobject *kobj) struct device *dev = &queue->dev->dev; const void *ns = NULL; - if (dev->class && dev->class->ns_type) + if (dev->class && dev->class->namespace) ns = dev->class->namespace(dev); return ns; @@ -1757,15 +1933,13 @@ static void netdev_queue_get_ownership(const struct kobject *kobj, static const struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, - .default_groups = netdev_queue_default_groups, .namespace = netdev_queue_namespace, .get_ownership = netdev_queue_get_ownership, }; static bool netdev_uses_bql(const struct net_device *dev) { - if (dev->features & NETIF_F_LLTX || - dev->priv_flags & IFF_NO_QUEUE) + if (dev->lltx || (dev->priv_flags & IFF_NO_QUEUE)) return false; return IS_ENABLED(CONFIG_BQL); @@ -1777,6 +1951,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) struct kobject *kobj = &queue->kobj; int error = 0; + /* Tx queues are cleared in netdev_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. 
+ * + * If a queue is removed while both a read (or write) operation and a + * the re-addition of the same queue are pending (waiting on rntl_lock) + * it might happen that the re-addition will execute before the read, + * making the initial removal to never happen (queue's kobj refcount + * won't drop enough because of the pending read). In such rare case, + * return to allow the removal operation to complete. + */ + if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed"); + return -EAGAIN; + } + /* Kobject_put later will trigger netdev_queue_release call * which decreases dev refcount: Take that reference here */ @@ -1788,15 +1978,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = netdev_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (netdev_uses_bql(dev)) { error = sysfs_create_group(kobj, &dql_group); if (error) - goto err; + goto err_default_groups; } kobject_uevent(kobj, KOBJ_ADD); return 0; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; @@ -1851,6 +2048,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) if (netdev_uses_bql(dev)) sysfs_remove_group(&queue->kobj, &dql_group); + sysfs_remove_groups(&queue->kobj, queue->groups); kobject_put(&queue->kobj); } @@ -1950,8 +2148,10 @@ static void remove_queue_kobjects(struct net_device *dev) net_rx_queue_update_kobjects(dev, real_rx, 0); netdev_queue_update_kobjects(dev, real_tx, 0); + netdev_lock_ops(dev); dev->real_num_rx_queues = 0; dev->real_num_tx_queues = 0; + netdev_unlock_ops(dev); #ifdef CONFIG_SYSFS kset_unregister(dev->queues_kset); #endif @@ -2028,7 +2228,7 @@ static void netdev_release(struct device *d) * device is dead and about to be freed. */ kfree(rcu_access_pointer(dev->ifalias)); - netdev_freemem(dev); + kvfree(dev); } static const void *net_namespace(const struct device *d) @@ -2046,7 +2246,7 @@ static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid) net_ns_get_ownership(net, uid, gid); } -static struct class net_class __ro_after_init = { +static const struct class net_class = { .name = "net", .dev_release = netdev_release, .dev_groups = net_class_groups, diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 6aef976bc1da..f2fa34b1d78d 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -23,7 +23,7 @@ #include <linux/net_dropmon.h> #include <linux/slab.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <asm/bitops.h> #define CREATE_TRACE_POINTS diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index f0540c557515..ae54f26709ca 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -56,7 +56,6 @@ static bool init_net_initialized; * outside. 
*/ DECLARE_RWSEM(pernet_ops_rwsem); -EXPORT_SYMBOL_GPL(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) @@ -69,12 +68,15 @@ DEFINE_COOKIE(net_cookie); static struct net_generic *net_alloc_generic(void) { + unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs); + unsigned int generic_size; struct net_generic *ng; - unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + + generic_size = offsetof(struct net_generic, ptr[gen_ptrs]); ng = kzalloc(generic_size, GFP_KERNEL); if (ng) - ng->s.len = max_gen_ptrs; + ng->s.len = gen_ptrs; return ng; } @@ -122,7 +124,7 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) int err = -ENOMEM; void *data = NULL; - if (ops->id && ops->size) { + if (ops->id) { data = kzalloc(ops->size, GFP_KERNEL); if (!data) goto out; @@ -137,7 +139,7 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) if (!err) return 0; - if (ops->id && ops->size) { + if (ops->id) { ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); ng->ptr[*ops->id] = NULL; @@ -161,16 +163,45 @@ static void ops_pre_exit_list(const struct pernet_operations *ops, } } +static void ops_exit_rtnl_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + const struct pernet_operations *saved_ops = ops; + LIST_HEAD(dev_kill_list); + struct net *net; + + rtnl_lock(); + + list_for_each_entry(net, net_exit_list, exit_list) { + __rtnl_net_lock(net); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) { + if (ops->exit_rtnl) + ops->exit_rtnl(net, &dev_kill_list); + } + + __rtnl_net_unlock(net); + } + + unregister_netdevice_many(&dev_kill_list); + + rtnl_unlock(); +} + static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { - struct net *net; if (ops->exit) { + struct net *net; + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); cond_resched(); } } + if (ops->exit_batch) ops->exit_batch(net_exit_list); } @@ -179,12 +210,63 @@ static void ops_free_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; - if (ops->size && ops->id) { + + if (ops->id) { list_for_each_entry(net, net_exit_list, exit_list) kfree(net_generic(net, *ops->id)); } } +static void ops_undo_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list, + bool expedite_rcu) +{ + const struct pernet_operations *saved_ops; + bool hold_rtnl = false; + + if (!ops) + ops = list_entry(ops_list, typeof(*ops), list); + + saved_ops = ops; + + list_for_each_entry_continue_reverse(ops, ops_list, list) { + hold_rtnl |= !!ops->exit_rtnl; + ops_pre_exit_list(ops, net_exit_list); + } + + /* Another CPU might be rcu-iterating the list, wait for it. + * This needs to be before calling the exit() notifiers, so the + * rcu_barrier() after ops_undo_list() isn't sufficient alone. + * Also the pre_exit() and exit() methods need this barrier. 
+ */ + if (expedite_rcu) + synchronize_rcu_expedited(); + else + synchronize_rcu(); + + if (hold_rtnl) + ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_exit_list(ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_free_list(ops, net_exit_list); +} + +static void ops_undo_single(struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + LIST_HEAD(ops_list); + + list_add(&ops->list, &ops_list); + ops_undo_list(&ops_list, NULL, net_exit_list, false); + list_del(&ops->list); +} + /* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { @@ -305,36 +387,56 @@ struct net *get_net_ns_by_id(const struct net *net, int id) } EXPORT_SYMBOL_GPL(get_net_ns_by_id); +static __net_init void preinit_net_sysctl(struct net *net) +{ + net->core.sysctl_somaxconn = SOMAXCONN; + /* Limits per socket sk_omem_alloc usage. + * TCP zerocopy regular usage needs 128 KB. + */ + net->core.sysctl_optmem_max = 128 * 1024; + net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; + net->core.sysctl_tstamp_allow_data = 1; +} + /* init code that must occur even if setup_net() is not called. */ -static __net_init void preinit_net(struct net *net) +static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns) { + refcount_set(&net->passive, 1); + refcount_set(&net->ns.count, 1); + ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt"); ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt"); + + get_random_bytes(&net->hash_mix, sizeof(u32)); + net->dev_base_seq = 1; + net->user_ns = user_ns; + + idr_init(&net->netns_ids); + spin_lock_init(&net->nsid_lock); + mutex_init(&net->ipv4.ra_mutex); + +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + mutex_init(&net->rtnl_mutex); + lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); +#endif + + INIT_LIST_HEAD(&net->ptype_all); + INIT_LIST_HEAD(&net->ptype_specific); + preinit_net_sysctl(net); } /* * setup_net runs the initializers for the network namespace object. */ -static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) +static __net_init int setup_net(struct net *net) { /* Must be called with pernet_ops_rwsem held */ - const struct pernet_operations *ops, *saved_ops; + const struct pernet_operations *ops; LIST_HEAD(net_exit_list); - LIST_HEAD(dev_kill_list); int error = 0; - refcount_set(&net->ns.count, 1); - ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt"); - - refcount_set(&net->passive, 1); - get_random_bytes(&net->hash_mix, sizeof(u32)); preempt_disable(); net->net_cookie = gen_cookie_next(&net_cookie); preempt_enable(); - net->dev_base_seq = 1; - net->user_ns = user_ns; - idr_init(&net->netns_ids); - spin_lock_init(&net->nsid_lock); - mutex_init(&net->ipv4.ra_mutex); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -352,59 +454,11 @@ out_undo: * for the pernet modules whose init functions did not fail. 
*/ list_add(&net->exit_list, &net_exit_list); - saved_ops = ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_pre_exit_list(ops, &net_exit_list); - - synchronize_rcu(); - - ops = saved_ops; - rtnl_lock(); - list_for_each_entry_continue_reverse(ops, &pernet_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); - + ops_undo_list(&pernet_list, ops, &net_exit_list, false); rcu_barrier(); goto out; } -static int __net_init net_defaults_init_net(struct net *net) -{ - net->core.sysctl_somaxconn = SOMAXCONN; - /* Limits per socket sk_omem_alloc usage. - * TCP zerocopy regular usage needs 128 KB. - */ - net->core.sysctl_optmem_max = 128 * 1024; - net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; - - return 0; -} - -static struct pernet_operations net_defaults_ops = { - .init = net_defaults_init_net, -}; - -static __init int net_defaults_init(void) -{ - if (register_pernet_subsys(&net_defaults_ops)) - panic("Cannot initialize net default settings"); - - return 0; -} - -core_initcall(net_defaults_init); - #ifdef CONFIG_NET_NS static struct ucounts *inc_net_namespaces(struct user_namespace *ns) { @@ -453,7 +507,22 @@ out_free: goto out; } -static void net_free(struct net *net) +static LLIST_HEAD(defer_free_list); + +static void net_complete_free(void) +{ + struct llist_node *kill_list; + struct net *net, *next; + + /* Get the list of namespaces to free from last round. */ + kill_list = llist_del_all(&defer_free_list); + + llist_for_each_entry_safe(net, next, kill_list, defer_free_list) + kmem_cache_free(net_cachep, net); + +} + +void net_passive_dec(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); @@ -461,7 +530,8 @@ static void net_free(struct net *net) /* There should not be any trackers left there. */ ref_tracker_dir_exit(&net->notrefcnt_tracker); - kmem_cache_free(net_cachep, net); + /* Wait for an extra rcu_barrier() before final free. 
*/ + llist_add(&net->defer_free_list, &defer_free_list); } } @@ -470,7 +540,7 @@ void net_drop_ns(void *p) struct net *net = (struct net *)p; if (net) - net_free(net); + net_passive_dec(net); } struct net *copy_net_ns(unsigned long flags, @@ -493,8 +563,7 @@ struct net *copy_net_ns(unsigned long flags, goto dec_ucounts; } - preinit_net(net); - refcount_set(&net->passive, 1); + preinit_net(net, user_ns); net->ucounts = ucounts; get_user_ns(user_ns); @@ -502,7 +571,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) goto put_userns; - rv = setup_net(net, user_ns); + rv = setup_net(net); up_read(&pernet_ops_rwsem); @@ -512,7 +581,7 @@ put_userns: key_remove_domain(net->key_domain); #endif put_user_ns(user_ns); - net_free(net); + net_passive_dec(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); @@ -577,13 +646,15 @@ static void unhash_nsid(struct net *net, struct net *last) static LLIST_HEAD(cleanup_list); +struct task_struct *cleanup_net_task; + static void cleanup_net(struct work_struct *work) { - const struct pernet_operations *ops; - struct net *net, *tmp, *last; struct llist_node *net_kill_list; + struct net *net, *tmp, *last; LIST_HEAD(net_exit_list); - LIST_HEAD(dev_kill_list); + + WRITE_ONCE(cleanup_net_task, current); /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); @@ -612,33 +683,7 @@ static void cleanup_net(struct work_struct *work) list_add_tail(&net->exit_list, &net_exit_list); } - /* Run all of the network namespace pre_exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_pre_exit_list(ops, &net_exit_list); - - /* - * Another CPU might be rcu-iterating the list, wait for it. - * This needs to be before calling the exit() notifiers, so - * the rcu_barrier() below isn't sufficient alone. - * Also the pre_exit() and exit() methods need this barrier. - */ - synchronize_rcu_expedited(); - - rtnl_lock(); - list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - - /* Run all of the network namespace exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - /* Free the net generic variables */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); + ops_undo_list(&pernet_list, NULL, &net_exit_list, true); up_read(&pernet_ops_rwsem); @@ -647,6 +692,8 @@ static void cleanup_net(struct work_struct *work) */ rcu_barrier(); + net_complete_free(); + /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); @@ -655,8 +702,9 @@ static void cleanup_net(struct work_struct *work) key_remove_domain(net->key_domain); #endif put_user_ns(net->user_ns); - net_free(net); + net_passive_dec(net); } + WRITE_ONCE(cleanup_net_task, NULL); } /** @@ -690,30 +738,33 @@ EXPORT_SYMBOL_GPL(__put_net); * get_net_ns - increment the refcount of the network namespace * @ns: common namespace (net) * - * Returns the net's common namespace. + * Returns the net's common namespace or ERR_PTR() if ref is zero. 
*/ struct ns_common *get_net_ns(struct ns_common *ns) { - return &get_net(container_of(ns, struct net, ns))->ns; + struct net *net; + + net = maybe_get_net(container_of(ns, struct net, ns)); + if (net) + return &net->ns; + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns); struct net *get_net_ns_by_fd(int fd) { - struct fd f = fdget(fd); - struct net *net = ERR_PTR(-EINVAL); + CLASS(fd, f)(fd); - if (!f.file) + if (fd_empty(f)) return ERR_PTR(-EBADF); - if (proc_ns_file(f.file)) { - struct ns_common *ns = get_proc_ns(file_inode(f.file)); + if (proc_ns_file(fd_file(f))) { + struct ns_common *ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ops == &netns_operations) - net = get_net(container_of(ns, struct net, ns)); + return get_net(container_of(ns, struct net, ns)); } - fdput(f); - return net; + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns_by_fd); #endif @@ -1090,7 +1141,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) end: if (net_cb.fillargs.add_ref) put_net(net_cb.tgt_net); - return err < 0 ? err : skb->len; + return err; } static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, @@ -1159,13 +1210,23 @@ static void __init netns_ipv4_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_l3mdev_accept); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_rmem); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22); } #endif +static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid, + .dumpit = rtnl_net_dumpid, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, +}; + void __init net_ns_init(void) { struct net_generic *ng; @@ -1191,9 +1252,10 @@ void __init net_ns_init(void) #ifdef CONFIG_KEYS init_net.key_domain = &init_net_key_domain; #endif + preinit_net(&init_net, &init_user_ns); + down_write(&pernet_ops_rwsem); - preinit_net(&init_net); - if (setup_net(&init_net, &init_user_ns)) + if (setup_net(&init_net)) panic("Could not setup the initial network namespace"); init_net_initialized = true; @@ -1202,40 +1264,19 @@ void __init net_ns_init(void) if (register_pernet_subsys(&net_ns_ops)) panic("Could not register network namespace subsystems"); - rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, - RTNL_FLAG_DOIT_UNLOCKED); - rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, - RTNL_FLAG_DOIT_UNLOCKED); -} - -static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) -{ - ops_pre_exit_list(ops, net_exit_list); - synchronize_rcu(); - - if (ops->exit_batch_rtnl) { - LIST_HEAD(dev_kill_list); - - rtnl_lock(); - ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } - ops_exit_list(ops, net_exit_list); - - ops_free_list(ops, net_exit_list); + rtnl_register_many(net_ns_rtnl_msg_handlers); } #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { + LIST_HEAD(net_exit_list); struct net *net; int error; - 
LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); - if (ops->init || (ops->id && ops->size)) { + if (ops->init || ops->id) { /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ @@ -1251,21 +1292,21 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - free_exit_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { - struct net *net; LIST_HEAD(net_exit_list); + struct net *net; - list_del(&ops->list); /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); - free_exit_list(ops, &net_exit_list); + list_del(&ops->list); + ops_undo_single(ops, &net_exit_list); } #else @@ -1287,8 +1328,9 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) list_del(&ops->list); } else { LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); - free_exit_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); } } @@ -1301,13 +1343,20 @@ static int register_pernet_operations(struct list_head *list, { int error; + if (WARN_ON(!!ops->id ^ !!ops->size)) + return -EINVAL; + if (ops->id) { error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID, GFP_KERNEL); if (error < 0) return error; *ops->id = error; - max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1); + /* This does not require READ_ONCE as writers already hold + * pernet_ops_rwsem. But WRITE_ONCE is needed to protect + * net_alloc_generic. + */ + WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1)); } error = __register_pernet_operations(list, ops); if (error) { diff --git a/net/core/gso_test.c b/net/core/net_test.c index 358c44680d91..9c3a590865d2 100644 --- a/net/core/gso_test.c +++ b/net/core/net_test.c @@ -1,6 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include <kunit/test.h> + +/* GSO */ + #include <linux/skbuff.h> static const char hdr[] = "abcdefgh"; @@ -258,17 +261,127 @@ free_gso_skb: consume_skb(skb); } -static struct kunit_case gso_test_cases[] = { - KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), - {} +/* IP tunnel flags */ + +#include <net/ip_tunnels.h> + +struct ip_tunnel_flags_test { + const char *name; + + const u16 *src_bits; + const u16 *exp_bits; + u8 src_num; + u8 exp_num; + + __be16 exp_val; + bool exp_comp; +}; + +#define IP_TUNNEL_FLAGS_TEST(n, src, comp, eval, exp) { \ + .name = (n), \ + .src_bits = (src), \ + .src_num = ARRAY_SIZE(src), \ + .exp_comp = (comp), \ + .exp_val = (eval), \ + .exp_bits = (exp), \ + .exp_num = ARRAY_SIZE(exp), \ +} + +/* These are __be16-compatible and can be compared as is */ +static const u16 ip_tunnel_flags_1[] = { + IP_TUNNEL_KEY_BIT, + IP_TUNNEL_STRICT_BIT, + IP_TUNNEL_ERSPAN_OPT_BIT, +}; + +/* Due to the previous flags design limitation, setting either + * ``IP_TUNNEL_CSUM_BIT`` (on Big Endian) or ``IP_TUNNEL_DONT_FRAGMENT_BIT`` + * (on Little) also sets VTI/ISATAP bit. In the bitmap implementation, they + * correspond to ``BIT(16)``, which is bigger than ``U16_MAX``, but still is + * backward-compatible. 
+ */ +#ifdef __LITTLE_ENDIAN +#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_DONT_FRAGMENT_BIT +#else +#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_CSUM_BIT +#endif + +static const u16 ip_tunnel_flags_2_src[] = { + IP_TUNNEL_CONFLICT_BIT, +}; + +static const u16 ip_tunnel_flags_2_exp[] = { + IP_TUNNEL_CONFLICT_BIT, + IP_TUNNEL_SIT_ISATAP_BIT, }; -static struct kunit_suite gso_test_suite = { - .name = "net_core_gso", - .test_cases = gso_test_cases, +/* Bits 17 and higher are not compatible with __be16 flags */ +static const u16 ip_tunnel_flags_3_src[] = { + IP_TUNNEL_VXLAN_OPT_BIT, + 17, + 18, + 20, }; -kunit_test_suite(gso_test_suite); +static const u16 ip_tunnel_flags_3_exp[] = { + IP_TUNNEL_VXLAN_OPT_BIT, +}; + +static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = { + IP_TUNNEL_FLAGS_TEST("compat", ip_tunnel_flags_1, true, + cpu_to_be16(BIT(IP_TUNNEL_KEY_BIT) | + BIT(IP_TUNNEL_STRICT_BIT) | + BIT(IP_TUNNEL_ERSPAN_OPT_BIT)), + ip_tunnel_flags_1), + IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true, + VTI_ISVTI, ip_tunnel_flags_2_exp), + IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false, + cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)), + ip_tunnel_flags_3_exp), +}; + +static void +ip_tunnel_flags_test_case_to_desc(const struct ip_tunnel_flags_test *t, + char *desc) +{ + strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE); +} +KUNIT_ARRAY_PARAM(ip_tunnel_flags_test, ip_tunnel_flags_test, + ip_tunnel_flags_test_case_to_desc); + +static void ip_tunnel_flags_test_run(struct kunit *test) +{ + const struct ip_tunnel_flags_test *t = test->param_value; + IP_TUNNEL_DECLARE_FLAGS(src) = { }; + IP_TUNNEL_DECLARE_FLAGS(exp) = { }; + IP_TUNNEL_DECLARE_FLAGS(out); + + for (u32 j = 0; j < t->src_num; j++) + __set_bit(t->src_bits[j], src); + for (u32 j = 0; j < t->exp_num; j++) + __set_bit(t->exp_bits[j], exp); + + KUNIT_ASSERT_EQ(test, t->exp_comp, + ip_tunnel_flags_is_be16_compat(src)); + KUNIT_ASSERT_EQ(test, (__force u16)t->exp_val, + (__force u16)ip_tunnel_flags_to_be16(src)); + + ip_tunnel_flags_from_be16(out, t->exp_val); + KUNIT_ASSERT_TRUE(test, __ipt_flag_op(bitmap_equal, exp, out)); +} + +static struct kunit_case net_test_cases[] = { + KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), + KUNIT_CASE_PARAM(ip_tunnel_flags_test_run, + ip_tunnel_flags_test_gen_params), + { }, +}; + +static struct kunit_suite net_test_suite = { + .name = "net_core", + .test_cases = net_test_cases, +}; +kunit_test_suite(net_test_suite); +MODULE_DESCRIPTION("KUnit tests for networking core"); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("KUnit tests for segmentation offload"); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 8d8ace9ef87f..4fc44587f493 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -9,16 +9,21 @@ #include "netdev-genl-gen.h" #include <uapi/linux/netdev.h> +#include <net/netdev_netlink.h> /* Integer value ranges */ static const struct netlink_range_validation netdev_a_page_pool_id_range = { .min = 1ULL, - .max = 4294967295ULL, + .max = U32_MAX, }; static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = { .min = 1ULL, - .max = 2147483647ULL, + .max = S32_MAX, +}; + +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = { + .max = S32_MAX, }; /* Common nested types */ @@ -27,6 +32,11 @@ const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFIND [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), }; +const struct 
nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = { + [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, }, + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), +}; + /* NETDEV_CMD_DEV_GET - do */ static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = { [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), @@ -70,9 +80,31 @@ static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFIN /* NETDEV_CMD_QSTATS_GET - dump */ static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = { + [NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1), }; +/* NETDEV_CMD_BIND_RX - do */ +static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, + [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), +}; + +/* NETDEV_CMD_NAPI_SET - do */ +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = { + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, + [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), + [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, +}; + +/* NETDEV_CMD_BIND_TX - do */ +static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -150,6 +182,27 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_QSTATS_SCOPE, .flags = GENL_CMD_CAP_DUMP, }, + { + .cmd = NETDEV_CMD_BIND_RX, + .doit = netdev_nl_bind_rx_doit, + .policy = netdev_bind_rx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_NAPI_SET, + .doit = netdev_nl_napi_set_doit, + .policy = netdev_napi_set_nl_policy, + .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_BIND_TX, + .doit = netdev_nl_bind_tx_doit, + .policy = netdev_bind_tx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { @@ -157,6 +210,16 @@ static const struct genl_multicast_group netdev_nl_mcgrps[] = { [NETDEV_NLGRP_PAGE_POOL] = { "page-pool", }, }; +static void __netdev_nl_sock_priv_init(void *priv) +{ + netdev_nl_sock_priv_init(priv); +} + +static void __netdev_nl_sock_priv_destroy(void *priv) +{ + netdev_nl_sock_priv_destroy(priv); +} + struct genl_family netdev_nl_family __ro_after_init = { .name = NETDEV_FAMILY_NAME, .version = NETDEV_FAMILY_VERSION, @@ -167,4 +230,7 @@ struct genl_family netdev_nl_family __ro_after_init = { .n_split_ops = ARRAY_SIZE(netdev_nl_ops), .mcgrps = netdev_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps), + .sock_priv_size = sizeof(struct netdev_nl_sock), + .sock_priv_init = __netdev_nl_sock_priv_init, + .sock_priv_destroy = __netdev_nl_sock_priv_destroy, }; diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 4db40fd5b4a9..cf3fad74511f 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -10,9 +10,11 @@ #include <net/genetlink.h> #include <uapi/linux/netdev.h> +#include 
<net/netdev_netlink.h> /* Common nested types */ extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; +extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); @@ -30,6 +32,9 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, @@ -38,4 +43,7 @@ enum { extern struct genl_family netdev_nl_family; +void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv); +void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv); + #endif /* _LINUX_NETDEV_GEN_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 7004b3399c2b..2afa7b2141aa 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -3,16 +3,18 @@ #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> +#include <net/busy_poll.h> #include <net/net_namespace.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> #include <net/sock.h> #include <net/xdp.h> #include <net/xdp_sock.h> -#include <net/netdev_rx_queue.h> -#include <net/netdev_queues.h> -#include <net/busy_poll.h> +#include <net/page_pool/memory_provider.h> -#include "netdev-genl-gen.h" #include "dev.h" +#include "devmem.h" +#include "netdev-genl-gen.h" struct netdev_nl_dump_ctx { unsigned long ifindex; @@ -23,7 +25,7 @@ struct netdev_nl_dump_ctx { static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) { - NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx); + NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx); return (struct netdev_nl_dump_ctx *)cb->ctx; } @@ -36,6 +38,8 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, u64 xdp_rx_meta = 0; void *hdr; + netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! 
*/ + hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; @@ -51,6 +55,8 @@ XDP_METADATA_KFUNC_xxx xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; + if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time) + xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO; } if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || @@ -59,22 +65,22 @@ XDP_METADATA_KFUNC_xxx nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, xdp_rx_meta, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, - xsk_features, NETDEV_A_DEV_PAD)) { - genlmsg_cancel(rsp, hdr); - return -EINVAL; - } + xsk_features, NETDEV_A_DEV_PAD)) + goto err_cancel_msg; if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, - netdev->xdp_zc_max_segs)) { - genlmsg_cancel(rsp, hdr); - return -EINVAL; - } + netdev->xdp_zc_max_segs)) + goto err_cancel_msg; } genlmsg_end(rsp, hdr); return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; } static void @@ -118,15 +124,14 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) if (!rsp) return -ENOMEM; - rtnl_lock(); - - netdev = __dev_get_by_index(genl_info_net(info), ifindex); - if (netdev) - err = netdev_nl_dev_fill(netdev, rsp, info); - else + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { err = -ENODEV; + goto err_free_msg; + } - rtnl_unlock(); + err = netdev_nl_dev_fill(netdev, rsp, info); + netdev_unlock(netdev); if (err) goto err_free_msg; @@ -142,38 +147,35 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); - struct net_device *netdev; - int err = 0; + int err; - rtnl_lock(); - for_each_netdev_dump(net, netdev, ctx->ifindex) { + for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) - break; + return err; } - rtnl_unlock(); - return err; + return 0; } static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + unsigned long irq_suspend_timeout; + unsigned long gro_flush_timeout; + u32 napi_defer_hard_irqs; void *hdr; pid_t pid; - if (WARN_ON_ONCE(!napi->dev)) - return -EINVAL; - if (!(napi->dev->flags & IFF_UP)) + if (!napi->dev->up) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; - if (napi->napi_id >= MIN_NAPI_ID && - nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) + if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) @@ -188,6 +190,21 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, goto nla_put_failure; } + napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi); + if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS, + napi_defer_hard_irqs)) + goto nla_put_failure; + + irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + irq_suspend_timeout)) + goto nla_put_failure; + + gro_flush_timeout = napi_get_gro_flush_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + gro_flush_timeout)) + goto nla_put_failure; + genlmsg_end(rsp, hdr); return 0; @@ -213,18 +230,21 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) if (!rsp) return 
-ENOMEM; - rtnl_lock(); - - napi = napi_by_id(napi_id); - if (napi) + napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); + if (napi) { err = netdev_nl_napi_fill_one(rsp, napi, info); - else - err = -EINVAL; - - rtnl_unlock(); + netdev_unlock(napi->dev); + } else { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); + err = -ENOENT; + } - if (err) + if (err) { + goto err_free_msg; + } else if (!rsp->len) { + err = -ENOENT; goto err_free_msg; + } return genlmsg_reply(rsp, info); @@ -239,12 +259,21 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, struct netdev_nl_dump_ctx *ctx) { struct napi_struct *napi; + unsigned int prev_id; int err = 0; - if (!(netdev->flags & IFF_UP)) + if (!netdev->up) return err; + prev_id = UINT_MAX; list_for_each_entry(napi, &netdev->napi_list, dev_list) { + if (!napi_id_valid(napi->napi_id)) + continue; + + /* Dump continuation below depends on the list being sorted */ + WARN_ON_ONCE(napi->napi_id >= prev_id); + prev_id = napi->napi_id; + if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; @@ -268,30 +297,86 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (info->attrs[NETDEV_A_NAPI_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); - rtnl_lock(); if (ifindex) { - netdev = __dev_get_by_index(net, ifindex); - if (netdev) + netdev = netdev_get_by_index_lock(net, ifindex); + if (netdev) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); - else + netdev_unlock(netdev); + } else { err = -ENODEV; + } } else { - for_each_netdev_dump(net, netdev, ctx->ifindex) { + for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->napi_id = 0; } } - rtnl_unlock(); return err; } static int +netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) +{ + u64 irq_suspend_timeout = 0; + u64 gro_flush_timeout = 0; + u32 defer = 0; + + if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { + defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); + napi_set_defer_hard_irqs(napi, defer); + } + + if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { + irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); + napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); + } + + if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { + gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); + napi_set_gro_flush_timeout(napi, gro_flush_timeout); + } + + return 0; +} + +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct napi_struct *napi; + unsigned int napi_id; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) + return -EINVAL; + + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); + + napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); + if (napi) { + err = netdev_nl_napi_set_config(napi, info); + netdev_unlock(napi->dev); + } else { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); + err = -ENOENT; + } + + return err; +} + +static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi) +{ + if (napi && napi_id_valid(napi->napi_id)) + return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id); + return 0; +} + +static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { + struct pp_memory_provider_params *params; struct netdev_rx_queue *rxq; struct netdev_queue 
*txq; void *hdr; @@ -308,15 +393,30 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); - if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, - rxq->napi->napi_id)) + if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; + + params = &rxq->mp_params; + if (params->mp_ops && + params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) + goto nla_put_failure; +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) + if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) + goto nla_put_failure; +#endif + break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); - if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, - txq->napi->napi_id)) + if (nla_put_napi_id(rsp, txq->napi)) goto nla_put_failure; +#ifdef CONFIG_XDP_SOCKETS + if (txq->pool) + if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) + goto nla_put_failure; +#endif + break; } genlmsg_end(rsp, hdr); @@ -347,10 +447,10 @@ static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { - int err = 0; + int err; - if (!(netdev->flags & IFF_UP)) - return err; + if (!netdev->up) + return -ENOENT; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) @@ -379,15 +479,14 @@ int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) if (!rsp) return -ENOMEM; - rtnl_lock(); - - netdev = __dev_get_by_index(genl_info_net(info), ifindex); - if (netdev) + netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info), + ifindex); + if (netdev) { err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); - else + netdev_unlock_ops_compat(netdev); + } else { err = -ENODEV; - - rtnl_unlock(); + } if (err) goto err_free_msg; @@ -405,24 +504,21 @@ netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, struct netdev_nl_dump_ctx *ctx) { int err = 0; - int i; - if (!(netdev->flags & IFF_UP)) + if (!netdev->up) return err; - for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) { - err = netdev_nl_queue_fill_one(rsp, netdev, i, + for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; - ctx->rxq_idx = i++; } - for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) { - err = netdev_nl_queue_fill_one(rsp, netdev, i, + for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; - ctx->txq_idx = i++; } return err; @@ -440,15 +536,17 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); - rtnl_lock(); if (ifindex) { - netdev = __dev_get_by_index(net, ifindex); - if (netdev) + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); + if (netdev) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); - else + netdev_unlock_ops_compat(netdev); + } else { err = -ENODEV; + } } else { - for_each_netdev_dump(net, netdev, ctx->ifindex) { + for_each_netdev_lock_ops_compat_scoped(net, netdev, + ctx->ifindex) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); if (err < 0) break; @@ -456,7 +554,6 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) ctx->txq_idx = 0; } } - rtnl_unlock(); return err; } @@ -489,7 +586,18 @@ 
netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || - netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail)) + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits)) return -EMSGSIZE; return 0; } @@ -498,7 +606,18 @@ static int netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || - netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes)) + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake)) return -EMSGSIZE; return 0; } @@ -567,7 +686,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, i, info); if (err) return err; - ctx->rxq_idx = i++; + ctx->rxq_idx = ++i; } i = ctx->txq_idx; while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { @@ -575,7 +694,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, i, info); if (err) return err; - ctx->txq_idx = i++; + ctx->txq_idx = ++i; } ctx->rxq_idx = 0; @@ -583,25 +702,66 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, return 0; } +/** + * netdev_stat_queue_sum() - add up queue stats from range of queues + * @netdev: net_device + * @rx_start: index of the first Rx queue to query + * @rx_end: index after the last Rx queue (first *not* to query) + * @rx_sum: output Rx stats, should be already initialized + * @tx_start: index of the first Tx queue to query + * @tx_end: index after the last Tx queue (first *not* to query) + * @tx_sum: output Tx stats, should be already initialized + * + * Add stats from [start, end) range of queue IDs to *x_sum structs. 
+ * The sum structs must be already initialized. Usually this + * helper is invoked from the .get_base_stats callbacks of drivers + * to account for stats of disabled queues. In that case the ranges + * are usually [netdev->real_num_*x_queues, netdev->num_*x_queues). + */ +void netdev_stat_queue_sum(struct net_device *netdev, + int rx_start, int rx_end, + struct netdev_queue_stats_rx *rx_sum, + int tx_start, int tx_end, + struct netdev_queue_stats_tx *tx_sum) +{ + const struct netdev_stat_ops *ops; + struct netdev_queue_stats_rx rx; + struct netdev_queue_stats_tx tx; + int i; + + ops = netdev->stat_ops; + + for (i = rx_start; i < rx_end; i++) { + memset(&rx, 0xff, sizeof(rx)); + if (ops->get_queue_stats_rx) + ops->get_queue_stats_rx(netdev, i, &rx); + netdev_nl_stats_add(rx_sum, &rx, sizeof(rx)); + } + for (i = tx_start; i < tx_end; i++) { + memset(&tx, 0xff, sizeof(tx)); + if (ops->get_queue_stats_tx) + ops->get_queue_stats_tx(netdev, i, &tx); + netdev_nl_stats_add(tx_sum, &tx, sizeof(tx)); + } +} +EXPORT_SYMBOL(netdev_stat_queue_sum); + static int netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { - struct netdev_queue_stats_rx rx_sum, rx; - struct netdev_queue_stats_tx tx_sum, tx; - const struct netdev_stat_ops *ops; + struct netdev_queue_stats_rx rx_sum; + struct netdev_queue_stats_tx tx_sum; void *hdr; - int i; - ops = netdev->stat_ops; /* Netdev can't guarantee any complete counters */ - if (!ops->get_base_stats) + if (!netdev->stat_ops->get_base_stats) return 0; memset(&rx_sum, 0xff, sizeof(rx_sum)); memset(&tx_sum, 0xff, sizeof(tx_sum)); - ops->get_base_stats(netdev, &rx_sum, &tx_sum); + netdev->stat_ops->get_base_stats(netdev, &rx_sum, &tx_sum); /* The op was there, but nothing reported, don't bother */ if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && @@ -614,18 +774,8 @@ netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) goto nla_put_failure; - for (i = 0; i < netdev->real_num_rx_queues; i++) { - memset(&rx, 0xff, sizeof(rx)); - if (ops->get_queue_stats_rx) - ops->get_queue_stats_rx(netdev, i, &rx); - netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx)); - } - for (i = 0; i < netdev->real_num_tx_queues; i++) { - memset(&tx, 0xff, sizeof(tx)); - if (ops->get_queue_stats_tx) - ops->get_queue_stats_tx(netdev, i, &tx); - netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx)); - } + netdev_stat_queue_sum(netdev, 0, netdev->real_num_rx_queues, &rx_sum, + 0, netdev->real_num_tx_queues, &tx_sum); if (netdev_nl_stats_write_rx(rsp, &rx_sum) || netdev_nl_stats_write_tx(rsp, &tx_sum)) @@ -639,6 +789,24 @@ nla_put_failure: return -EMSGSIZE; } +static int +netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, + struct sk_buff *skb, const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + if (!netdev->stat_ops) + return 0; + + switch (scope) { + case 0: + return netdev_nl_stats_by_netdev(netdev, skb, info); + case NETDEV_QSTATS_SCOPE_QUEUE: + return netdev_nl_stats_by_queue(netdev, skb, info, ctx); + } + + return -EINVAL; /* Should not happen, per netlink policy */ +} + int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { @@ -646,6 +814,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; + unsigned int ifindex; unsigned int scope; int err = 0; @@ -653,27 +822,259 @@ int 
netdev_nl_qstats_get_dumpit(struct sk_buff *skb, if (info->attrs[NETDEV_A_QSTATS_SCOPE]) scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); - rtnl_lock(); - for_each_netdev_dump(net, netdev, ctx->ifindex) { - if (!netdev->stat_ops) - continue; + ifindex = 0; + if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); - switch (scope) { - case 0: - err = netdev_nl_stats_by_netdev(netdev, skb, info); - break; - case NETDEV_QSTATS_SCOPE_QUEUE: - err = netdev_nl_stats_by_queue(netdev, skb, info, ctx); - break; + if (ifindex) { + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); + if (!netdev) { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + return -ENODEV; + } + if (netdev->stat_ops) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + } else { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + err = -EOPNOTSUPP; } + netdev_unlock_ops_compat(netdev); + return err; + } + + for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); if (err < 0) break; } - rtnl_unlock(); return err; } +int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; + struct net_devmem_dmabuf_binding *binding; + u32 ifindex, dmabuf_fd, rxq_idx; + struct netdev_nl_sock *priv; + struct net_device *netdev; + struct sk_buff *rsp; + struct nlattr *attr; + int rem, err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + mutex_lock(&priv->lock); + + err = 0; + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_unlock_sock; + } + if (!netif_device_present(netdev)) + err = -ENODEV; + else if (!netdev_need_ops_lock(netdev)) + err = -EOPNOTSUPP; + if (err) { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_DEV_IFINDEX]); + goto err_unlock; + } + + binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, + priv, info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_unlock; + } + + nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, + genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + err = nla_parse_nested( + tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr, + netdev_queue_id_nl_policy, info->extack); + if (err < 0) + goto err_unbind; + + if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || + NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) { + err = -EINVAL; + goto err_unbind; + } + + if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); + err = -EINVAL; + goto err_unbind; + } + + rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); + + err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, + info->extack); + if (err) + goto err_unbind; + } + + nla_put_u32(rsp, 
NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + err = genlmsg_reply(rsp, info); + if (err) + goto err_unbind; + + netdev_unlock(netdev); + + mutex_unlock(&priv->lock); + + return 0; + +err_unbind: + net_devmem_unbind_dmabuf(binding); +err_unlock: + netdev_unlock(netdev); +err_unlock_sock: + mutex_unlock(&priv->lock); +err_genlmsg_free: + nlmsg_free(rsp); + return err; +} + +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net_devmem_dmabuf_binding *binding; + struct netdev_nl_sock *priv; + struct net_device *netdev; + u32 ifindex, dmabuf_fd; + struct sk_buff *rsp; + int err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + mutex_lock(&priv->lock); + + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_unlock_sock; + } + + if (!netif_device_present(netdev)) { + err = -ENODEV; + goto err_unlock_netdev; + } + + if (!netdev->netmem_tx) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(info->extack, + "Driver does not support netmem TX"); + goto err_unlock_netdev; + } + + binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv, + info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_unlock_netdev; + } + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + netdev_unlock(netdev); + mutex_unlock(&priv->lock); + + return genlmsg_reply(rsp, info); + +err_unlock_netdev: + netdev_unlock(netdev); +err_unlock_sock: + mutex_unlock(&priv->lock); +err_genlmsg_free: + nlmsg_free(rsp); + return err; +} + +void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) +{ + INIT_LIST_HEAD(&priv->bindings); + mutex_init(&priv->lock); +} + +void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv) +{ + struct net_devmem_dmabuf_binding *binding; + struct net_devmem_dmabuf_binding *temp; + netdevice_tracker dev_tracker; + struct net_device *dev; + + mutex_lock(&priv->lock); + list_for_each_entry_safe(binding, temp, &priv->bindings, list) { + mutex_lock(&binding->lock); + dev = binding->dev; + if (!dev) { + mutex_unlock(&binding->lock); + net_devmem_unbind_dmabuf(binding); + continue; + } + netdev_hold(dev, &dev_tracker, GFP_KERNEL); + mutex_unlock(&binding->lock); + + netdev_lock(dev); + net_devmem_unbind_dmabuf(binding); + netdev_unlock(dev); + netdev_put(dev, &dev_tracker); + } + mutex_unlock(&priv->lock); +} + static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { @@ -681,10 +1082,14 @@ static int netdev_genl_netdevice_event(struct notifier_block *nb, switch (event) { case NETDEV_REGISTER: + netdev_lock_ops_to_full(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); + netdev_unlock_full_to_ops(netdev); break; case NETDEV_UNREGISTER: + netdev_lock(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); + netdev_unlock(netdev); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); diff --git a/net/core/netdev_rx_queue.c 
b/net/core/netdev_rx_queue.c new file mode 100644 index 000000000000..d126f10197bf --- /dev/null +++ b/net/core/netdev_rx_queue.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/ethtool_netlink.h> +#include <linux/netdevice.h> +#include <net/netdev_lock.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/memory_provider.h> + +#include "page_pool_priv.h" + +int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) +{ + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); + const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; + void *new_mem, *old_mem; + int err; + + if (!qops || !qops->ndo_queue_stop || !qops->ndo_queue_mem_free || + !qops->ndo_queue_mem_alloc || !qops->ndo_queue_start) + return -EOPNOTSUPP; + + netdev_assert_locked(dev); + + new_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); + if (!new_mem) + return -ENOMEM; + + old_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); + if (!old_mem) { + err = -ENOMEM; + goto err_free_new_mem; + } + + err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); + if (err) + goto err_free_old_mem; + + err = page_pool_check_memory_provider(dev, rxq); + if (err) + goto err_free_new_queue_mem; + + if (netif_running(dev)) { + err = qops->ndo_queue_stop(dev, old_mem, rxq_idx); + if (err) + goto err_free_new_queue_mem; + + err = qops->ndo_queue_start(dev, new_mem, rxq_idx); + if (err) + goto err_start_queue; + } else { + swap(new_mem, old_mem); + } + + qops->ndo_queue_mem_free(dev, old_mem); + + kvfree(old_mem); + kvfree(new_mem); + + return 0; + +err_start_queue: + /* Restarting the queue with old_mem should be successful as we haven't + * changed any of the queue configuration, and there is not much we can + * do to recover from a failure here. + * + * WARN if we fail to recover the old rx queue, and at least free + * old_mem so we don't also leak that. + */ + if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) { + WARN(1, + "Failed to restart old queue in error path. 
RX queue %d may be unhealthy.", + rxq_idx); + qops->ndo_queue_mem_free(dev, old_mem); + } + +err_free_new_queue_mem: + qops->ndo_queue_mem_free(dev, new_mem); + +err_free_old_mem: + kvfree(old_mem); + +err_free_new_mem: + kvfree(new_mem); + + return err; +} +EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); + +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack) +{ + struct netdev_rx_queue *rxq; + int ret; + + if (!netdev_need_ops_lock(dev)) + return -EOPNOTSUPP; + + if (rxq_idx >= dev->real_num_rx_queues) + return -EINVAL; + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); + + if (rxq_idx >= dev->real_num_rx_queues) { + NL_SET_ERR_MSG(extack, "rx queue index out of range"); + return -ERANGE; + } + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { + NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); + return -EINVAL; + } + if (dev->cfg->hds_thresh) { + NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); + return -EINVAL; + } + if (dev_xdp_prog_count(dev)) { + NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); + return -EEXIST; + } + + rxq = __netif_get_rx_queue(dev, rxq_idx); + if (rxq->mp_params.mp_ops) { + NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); + return -EEXIST; + } +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) { + NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); + return -EBUSY; + } +#endif + + rxq->mp_params = *p; + ret = netdev_rx_queue_restart(dev, rxq_idx); + if (ret) { + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + } + return ret; +} + +int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + struct pp_memory_provider_params *p) +{ + int ret; + + netdev_lock(dev); + ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL); + netdev_unlock(dev); + return ret; +} + +void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, + const struct pp_memory_provider_params *old_p) +{ + struct netdev_rx_queue *rxq; + int err; + + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) + return; + + rxq = __netif_get_rx_queue(dev, ifq_idx); + + /* Callers holding a netdev ref may get here after we already + * went thru shutdown via dev_memory_provider_uninstall(). 
+ */ + if (dev->reg_state > NETREG_REGISTERED && + !rxq->mp_params.mp_ops) + return; + + if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || + rxq->mp_params.mp_priv != old_p->mp_priv)) + return; + + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + err = netdev_rx_queue_restart(dev, ifq_idx); + WARN_ON(err && err != -ENETDOWN); +} + +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + netdev_lock(dev); + __net_mp_close_rxq(dev, ifq_idx, old_p); + netdev_unlock(dev); +} diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h new file mode 100644 index 000000000000..cd95394399b4 --- /dev/null +++ b/net/core/netmem_priv.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NETMEM_PRIV_H +#define __NETMEM_PRIV_H + +static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) +{ + return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; +} + +static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) +{ + __netmem_clear_lsb(netmem)->pp_magic |= pp_magic; +} + +static inline void netmem_clear_pp_magic(netmem_ref netmem) +{ + WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK); + + __netmem_clear_lsb(netmem)->pp_magic = 0; +} + +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; +} + +static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) +{ + __netmem_clear_lsb(netmem)->pp = pool; +} + +static inline void netmem_set_dma_addr(netmem_ref netmem, + unsigned long dma_addr) +{ + __netmem_clear_lsb(netmem)->dma_addr = dma_addr; +} + +static inline unsigned long netmem_get_dma_index(netmem_ref netmem) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return 0; + + magic = __netmem_clear_lsb(netmem)->pp_magic; + + return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; +} + +static inline void netmem_set_dma_index(netmem_ref netmem, + unsigned long id) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return; + + magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); + __netmem_clear_lsb(netmem)->pp_magic = magic; +} +#endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 543007f159f9..6ad84d4a2b46 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -34,7 +34,7 @@ #include <net/addrconf.h> #include <net/ndisc.h> #include <net/ip6_checksum.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <trace/events/napi.h> #include <linux/kconfig.h> @@ -45,11 +45,6 @@ #define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 - -static struct sk_buff_head skb_pool; - -DEFINE_STATIC_SRCU(netpoll_srcu); - #define USEC_PER_POLL 50 #define MAX_SKB_SIZE \ @@ -162,7 +157,7 @@ static void poll_one_napi(struct napi_struct *napi) if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) return; - /* We explicilty pass the polling call a budget of 0 to + /* We explicitly pass the polling call a budget of 0 to * indicate that we are clearing the Tx path only. 
*/ work = napi->poll(napi, 0); @@ -220,41 +215,39 @@ EXPORT_SYMBOL(netpoll_poll_dev); void netpoll_poll_disable(struct net_device *dev) { struct netpoll_info *ni; - int idx; + might_sleep(); - idx = srcu_read_lock(&netpoll_srcu); - ni = srcu_dereference(dev->npinfo, &netpoll_srcu); + ni = rtnl_dereference(dev->npinfo); if (ni) down(&ni->dev_lock); - srcu_read_unlock(&netpoll_srcu, idx); } -EXPORT_SYMBOL(netpoll_poll_disable); void netpoll_poll_enable(struct net_device *dev) { struct netpoll_info *ni; - rcu_read_lock(); - ni = rcu_dereference(dev->npinfo); + + ni = rtnl_dereference(dev->npinfo); if (ni) up(&ni->dev_lock); - rcu_read_unlock(); } -EXPORT_SYMBOL(netpoll_poll_enable); -static void refill_skbs(void) +static void refill_skbs(struct netpoll *np) { + struct sk_buff_head *skb_pool; struct sk_buff *skb; unsigned long flags; - spin_lock_irqsave(&skb_pool.lock, flags); - while (skb_pool.qlen < MAX_SKBS) { + skb_pool = &np->skb_pool; + + spin_lock_irqsave(&skb_pool->lock, flags); + while (skb_pool->qlen < MAX_SKBS) { skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); if (!skb) break; - __skb_queue_tail(&skb_pool, skb); + __skb_queue_tail(skb_pool, skb); } - spin_unlock_irqrestore(&skb_pool.lock, flags); + spin_unlock_irqrestore(&skb_pool->lock, flags); } static void zap_completion_queue(void) @@ -291,12 +284,13 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) struct sk_buff *skb; zap_completion_queue(); - refill_skbs(); repeat: skb = alloc_skb(len, GFP_ATOMIC); - if (!skb) - skb = skb_dequeue(&skb_pool); + if (!skb) { + skb = skb_dequeue(&np->skb_pool); + schedule_work(&np->refill_wq); + } if (!skb) { if (++count < 10) { @@ -316,7 +310,7 @@ static int netpoll_owner_active(struct net_device *dev) struct napi_struct *napi; list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner == smp_processor_id()) + if (READ_ONCE(napi->poll_owner) == smp_processor_id()) return 1; } return 0; @@ -326,6 +320,7 @@ static int netpoll_owner_active(struct net_device *dev) static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; + netdev_tx_t ret = NET_XMIT_DROP; struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. 
*/ @@ -334,11 +329,12 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) lockdep_assert_irqs_disabled(); dev = np->dev; + rcu_read_lock(); npinfo = rcu_dereference_bh(dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); - return NET_XMIT_DROP; + goto out; } /* don't get messages out of order, and no recursion */ @@ -377,7 +373,10 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } - return NETDEV_TX_OK; + ret = NETDEV_TX_OK; +out: + rcu_read_unlock(); + return ret; } netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) @@ -397,7 +396,7 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) } EXPORT_SYMBOL(netpoll_send_skb); -void netpoll_send_udp(struct netpoll *np, const char *msg, int len) +int netpoll_send_udp(struct netpoll *np, const char *msg, int len) { int total_len, ip_len, udp_len; struct sk_buff *skb; @@ -421,7 +420,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb = find_skb(np, total_len + np->dev->needed_tailroom, total_len - len); if (!skb) - return; + return -ENOMEM; skb_copy_to_linear_data(skb, msg, len); skb_put(skb, len); @@ -433,8 +432,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) udph->dest = htons(np->remote_port); udph->len = htons(udp_len); + udph->check = 0; if (np->ipv6) { - udph->check = 0; udph->check = csum_ipv6_magic(&np->local_ip.in6, &np->remote_ip.in6, udp_len, IPPROTO_UDP, @@ -462,7 +461,6 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IPV6); } else { - udph->check = 0; udph->check = csum_tcpudp_magic(np->local_ip.ip, np->remote_ip.ip, udp_len, IPPROTO_UDP, @@ -497,7 +495,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb->dev = np->dev; - netpoll_send_skb(np, skb); + return (int)netpoll_send_skb(np, skb); } EXPORT_SYMBOL(netpoll_send_udp); @@ -508,7 +506,8 @@ void netpoll_print_options(struct netpoll *np) np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); else np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); - np_info(np, "interface '%s'\n", np->dev_name); + np_info(np, "interface name '%s'\n", np->dev_name); + np_info(np, "local ethernet address '%pM'\n", np->dev_mac); np_info(np, "remote port %d\n", np->remote_port); if (np->ipv6) np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); @@ -538,6 +537,15 @@ static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) return -1; } +static void skb_pool_flush(struct netpoll *np) +{ + struct sk_buff_head *skb_pool; + + cancel_work_sync(&np->refill_wq); + skb_pool = &np->skb_pool; + skb_queue_purge_reason(skb_pool, SKB_CONSUMED); +} + int netpoll_parse_options(struct netpoll *np, char *opt) { char *cur=opt, *delim; @@ -569,11 +577,18 @@ int netpoll_parse_options(struct netpoll *np, char *opt) cur++; if (*cur != ',') { - /* parse out dev name */ + /* parse out dev_name or dev_mac */ if ((delim = strchr(cur, ',')) == NULL) goto parse_failed; *delim = 0; - strscpy(np->dev_name, cur, sizeof(np->dev_name)); + + np->dev_name[0] = '\0'; + eth_broadcast_addr(np->dev_mac); + if (!strchr(cur, ':')) + strscpy(np->dev_name, cur, sizeof(np->dev_name)); + else if (!mac_pton(cur, np->dev_mac)) + goto parse_failed; + cur = delim; } cur++; @@ -620,23 +635,31 @@ int 
netpoll_parse_options(struct netpoll *np, char *opt) } EXPORT_SYMBOL(netpoll_parse_options); +static void refill_skbs_work_handler(struct work_struct *work) +{ + struct netpoll *np = + container_of(work, struct netpoll, refill_wq); + + refill_skbs(np); +} + int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { struct netpoll_info *npinfo; const struct net_device_ops *ops; int err; - np->dev = ndev; - strscpy(np->dev_name, ndev->name, IFNAMSIZ); + skb_queue_head_init(&np->skb_pool); if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", - np->dev_name); + ndev->name); err = -ENOTSUPP; goto out; } - if (!ndev->npinfo) { + npinfo = rtnl_dereference(ndev->npinfo); + if (!npinfo) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; @@ -649,19 +672,24 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) refcount_set(&npinfo->refcnt, 1); - ops = np->dev->netdev_ops; + ops = ndev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo); + err = ops->ndo_netpoll_setup(ndev); if (err) goto free_npinfo; } } else { - npinfo = rtnl_dereference(ndev->npinfo); refcount_inc(&npinfo->refcnt); } + np->dev = ndev; + strscpy(np->dev_name, ndev->name, IFNAMSIZ); npinfo->netpoll = np; + /* fill up the skb queue */ + refill_skbs(np); + INIT_WORK(&np->refill_wq, refill_skbs_work_handler); + /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); @@ -674,26 +702,45 @@ out: } EXPORT_SYMBOL_GPL(__netpoll_setup); +/* + * Returns a pointer to a string representation of the identifier used + * to select the egress interface for the given netpoll instance. buf + * must be a buffer of length at least MAC_ADDR_STR_LEN + 1. 
+ */ +static char *egress_dev(struct netpoll *np, char *buf) +{ + if (np->dev_name[0]) + return np->dev_name; + + snprintf(buf, MAC_ADDR_STR_LEN, "%pM", np->dev_mac); + return buf; +} + int netpoll_setup(struct netpoll *np) { + struct net *net = current->nsproxy->net_ns; + char buf[MAC_ADDR_STR_LEN + 1]; struct net_device *ndev = NULL; + bool ip_overwritten = false; struct in_device *in_dev; int err; rtnl_lock(); - if (np->dev_name[0]) { - struct net *net = current->nsproxy->net_ns; + if (np->dev_name[0]) ndev = __dev_get_by_name(net, np->dev_name); - } + else if (is_valid_ether_addr(np->dev_mac)) + ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac); + if (!ndev) { - np_err(np, "%s doesn't exist, aborting\n", np->dev_name); + np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf)); err = -ENODEV; goto unlock; } netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL); if (netdev_master_upper_dev_get(ndev)) { - np_err(np, "%s is a slave device, aborting\n", np->dev_name); + np_err(np, "%s is a slave device, aborting\n", + egress_dev(np, buf)); err = -EBUSY; goto put; } @@ -701,7 +748,8 @@ int netpoll_setup(struct netpoll *np) if (!netif_running(ndev)) { unsigned long atmost; - np_info(np, "device %s not up yet, forcing it\n", np->dev_name); + np_info(np, "device %s not up yet, forcing it\n", + egress_dev(np, buf)); err = dev_open(ndev, NULL); @@ -735,12 +783,13 @@ int netpoll_setup(struct netpoll *np) if (!ifa) { put_noaddr: np_err(np, "no IP address for %s, aborting\n", - np->dev_name); + egress_dev(np, buf)); err = -EDESTADDRREQ; goto put; } np->local_ip.ip = ifa->ifa_local; + ip_overwritten = true; np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { #if IS_ENABLED(CONFIG_IPV6) @@ -757,6 +806,7 @@ put_noaddr: !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) continue; np->local_ip.in6 = ifp->addr; + ip_overwritten = true; err = 0; break; } @@ -764,29 +814,31 @@ put_noaddr: } if (err) { np_err(np, "no IPv6 address for %s, aborting\n", - np->dev_name); + egress_dev(np, buf)); goto put; } else np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); #else np_err(np, "IPv6 is not supported %s, aborting\n", - np->dev_name); + egress_dev(np, buf)); err = -EINVAL; goto put; #endif } } - /* fill up the skb queue */ - refill_skbs(); - err = __netpoll_setup(np, ndev); if (err) - goto put; + goto flush; rtnl_unlock(); return 0; +flush: + skb_pool_flush(np); put: + DEBUG_NET_WARN_ON_ONCE(np->dev); + if (ip_overwritten) + memset(&np->local_ip, 0, sizeof(np->local_ip)); netdev_put(ndev, &np->dev_tracker); unlock: rtnl_unlock(); @@ -794,13 +846,6 @@ unlock: } EXPORT_SYMBOL(netpoll_setup); -static int __init netpoll_init(void) -{ - skb_queue_head_init(&skb_pool); - return 0; -} -core_initcall(netpoll_init); - static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) { struct netpoll_info *npinfo = @@ -826,8 +871,6 @@ void __netpoll_cleanup(struct netpoll *np) if (!npinfo) return; - synchronize_srcu(&netpoll_srcu); - if (refcount_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; @@ -839,6 +882,8 @@ void __netpoll_cleanup(struct netpoll *np) call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); } else RCU_INIT_POINTER(np->dev->npinfo, NULL); + + skb_pool_flush(np); } EXPORT_SYMBOL_GPL(__netpoll_cleanup); @@ -853,14 +898,20 @@ void __netpoll_free(struct netpoll *np) } EXPORT_SYMBOL_GPL(__netpoll_free); +void do_netpoll_cleanup(struct netpoll *np) +{ + __netpoll_cleanup(np); + netdev_put(np->dev, &np->dev_tracker); + np->dev = NULL; +} +EXPORT_SYMBOL(do_netpoll_cleanup); 
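[Editor's note — illustrative sketch, not part of the patch.] A minimal client of the reworked netpoll API shown above, assuming a hypothetical module named "example"; the target string, the np.name label and the module boilerplate are placeholders. It only relies on calls visible in this diff: netpoll_parse_options(), netpoll_setup() (which now fills the per-netpoll skb pool), the int-returning netpoll_send_udp(), and netpoll_cleanup(). With this series the interface token in the option string may also be a MAC address instead of a device name.

#include <linux/module.h>
#include <linux/netpoll.h>

static struct netpoll example_np = {
	.name = "example",
};

static int __init example_init(void)
{
	/* local port@ip/ifname (or MAC), remote port@ip/mac -- placeholders */
	char opt[] = "6665@10.0.0.1/eth0,6666@10.0.0.2/ff:ff:ff:ff:ff:ff";
	int err;

	err = netpoll_parse_options(&example_np, opt);
	if (err)
		return err;

	err = netpoll_setup(&example_np);	/* fills the per-netpoll skb pool */
	if (err)
		return err;

	/* netpoll_send_udp() now reports errors instead of returning void */
	err = netpoll_send_udp(&example_np, "hello\n", 6);
	if (err)
		pr_warn("example: netpoll send failed (%d)\n", err);
	return 0;
}
module_init(example_init);

static void __exit example_exit(void)
{
	netpoll_cleanup(&example_np);	/* drops the dev ref and flushes the skb pool */
}
module_exit(example_exit);

MODULE_LICENSE("GPL");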
+ void netpoll_cleanup(struct netpoll *np) { rtnl_lock(); if (!np->dev) goto out; - __netpoll_cleanup(np); - netdev_put(np->dev, &np->dev_tracker); - np->dev = NULL; + do_netpoll_cleanup(np); out: rtnl_unlock(); } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index dd364d738c00..ba7cf3e3c32f 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -5,12 +5,16 @@ * Copyright (C) 2016 Red Hat, Inc. */ +#include <linux/error-injection.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/device.h> +#include <net/netdev_lock.h> +#include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <net/xdp.h> #include <linux/dma-direction.h> @@ -23,8 +27,13 @@ #include <trace/events/page_pool.h> +#include "dev.h" +#include "mp_dmabuf_devmem.h" +#include "netmem_priv.h" #include "page_pool_priv.h" +DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); + #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) @@ -123,9 +132,9 @@ int page_pool_ethtool_stats_get_count(void) } EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); -u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { - struct page_pool_stats *pool_stats = stats; + const struct page_pool_stats *pool_stats = stats; *data++ = pool_stats->alloc_stats.fast; *data++ = pool_stats->alloc_stats.slow; @@ -144,9 +153,9 @@ u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) EXPORT_SYMBOL(page_pool_ethtool_stats_get); #else -#define alloc_stat_inc(pool, __stat) -#define recycle_stat_inc(pool, __stat) -#define recycle_stat_add(pool, __stat, val) +#define alloc_stat_inc(...) do { } while (0) +#define recycle_stat_inc(...) do { } while (0) +#define recycle_stat_add(...) do { } while (0) #endif static bool page_pool_producer_lock(struct page_pool *pool) @@ -172,19 +181,33 @@ static void page_pool_producer_unlock(struct page_pool *pool, spin_unlock_bh(&pool->ring.producer_lock); } +static void page_pool_struct_check(void) +{ + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); + CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, + PAGE_POOL_FRAG_GROUP_ALIGN); +} + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params, int cpuid) { unsigned int ring_qsize = 1024; /* Default */ + struct netdev_rx_queue *rxq; + int err; + + page_pool_struct_check(); memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); pool->cpuid = cpuid; + pool->dma_sync_for_cpu = true; /* Validate only known flags were used */ - if (pool->p.flags & ~(PP_FLAG_ALL)) + if (pool->slow.flags & ~PP_FLAG_ALL) return -EINVAL; if (pool->p.pool_size) @@ -198,22 +221,26 @@ static int page_pool_init(struct page_pool *pool, * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, * which is the XDP_TX use-case. 
*/ - if (pool->p.flags & PP_FLAG_DMA_MAP) { + if (pool->slow.flags & PP_FLAG_DMA_MAP) { if ((pool->p.dma_dir != DMA_FROM_DEVICE) && (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; + + pool->dma_map = true; } - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { + if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { /* In order to request DMA-sync-for-device the page * needs to be mapped */ - if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) return -EINVAL; if (!pool->p.max_len) return -EINVAL; + pool->dma_sync = true; + /* pool->p.offset has to be set according to the address * offset used by the DMA engine to start copying rx data */ @@ -222,7 +249,7 @@ static int page_pool_init(struct page_pool *pool, pool->has_init_callback = !!pool->slow.init_callback; #ifdef CONFIG_PAGE_POOL_STATS - if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) { + if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) { pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); if (!pool->recycle_stats) return -ENOMEM; @@ -232,12 +259,13 @@ static int page_pool_init(struct page_pool *pool, * (also percpu) page pool instance. */ pool->recycle_stats = &pp_system_recycle_stats; + pool->system = true; } #endif if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { #ifdef CONFIG_PAGE_POOL_STATS - if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) + if (!pool->system) free_percpu(pool->recycle_stats); #endif return -ENOMEM; @@ -248,21 +276,53 @@ static int page_pool_init(struct page_pool *pool, /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); - if (pool->p.flags & PP_FLAG_DMA_MAP) - get_device(pool->p.dev); + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); + + if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { + netdev_assert_locked(pool->slow.netdev); + rxq = __netif_get_rx_queue(pool->slow.netdev, + pool->slow.queue_idx); + pool->mp_priv = rxq->mp_params.mp_priv; + pool->mp_ops = rxq->mp_params.mp_ops; + } + + if (pool->mp_ops) { + if (!pool->dma_map || !pool->dma_sync) + return -EOPNOTSUPP; + + if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { + err = -EFAULT; + goto free_ptr_ring; + } + + err = pool->mp_ops->init(pool); + if (err) { + pr_warn("%s() mem-provider init failed %d\n", __func__, + err); + goto free_ptr_ring; + } + + static_branch_inc(&page_pool_mem_providers); + } return 0; + +free_ptr_ring: + ptr_ring_cleanup(&pool->ring, NULL); +#ifdef CONFIG_PAGE_POOL_STATS + if (!pool->system) + free_percpu(pool->recycle_stats); +#endif + return err; } static void page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); - - if (pool->p.flags & PP_FLAG_DMA_MAP) - put_device(pool->p.dev); + xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS - if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) + if (!pool->system) free_percpu(pool->recycle_stats); #endif } @@ -311,19 +371,18 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) } EXPORT_SYMBOL(page_pool_create); -static void page_pool_return_page(struct page_pool *pool, struct page *page); +static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem); -noinline -static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) +static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; - struct page *page; + netmem_ref netmem; int pref_nid; /* preferred NUMA node */ /* Quicker fallback, avoid locks when ring is empty */ if 
(__ptr_ring_empty(r)) { alloc_stat_inc(pool, empty); - return NULL; + return 0; } /* Softirq guarantee CPU and thus NUMA node is stable. This, @@ -338,118 +397,127 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) /* Refill alloc array, but only if NUMA match */ do { - page = __ptr_ring_consume(r); - if (unlikely(!page)) + netmem = (__force netmem_ref)__ptr_ring_consume(r); + if (unlikely(!netmem)) break; - if (likely(page_to_nid(page) == pref_nid)) { - pool->alloc.cache[pool->alloc.count++] = page; + if (likely(netmem_is_pref_nid(netmem, pref_nid))) { + pool->alloc.cache[pool->alloc.count++] = netmem; } else { /* NUMA mismatch; * (1) release 1 page to page-allocator and * (2) break out to fallthrough to alloc_pages_node. * This limit stress on page buddy alloactor. */ - page_pool_return_page(pool, page); + page_pool_return_page(pool, netmem); alloc_stat_inc(pool, waive); - page = NULL; + netmem = 0; break; } } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ if (likely(pool->alloc.count > 0)) { - page = pool->alloc.cache[--pool->alloc.count]; + netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, refill); } - return page; + return netmem; } /* fast path */ -static struct page *__page_pool_get_cached(struct page_pool *pool) +static netmem_ref __page_pool_get_cached(struct page_pool *pool) { - struct page *page; + netmem_ref netmem; /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ if (likely(pool->alloc.count)) { /* Fast-path */ - page = pool->alloc.cache[--pool->alloc.count]; + netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, fast); } else { - page = page_pool_refill_alloc_cache(pool); + netmem = page_pool_refill_alloc_cache(pool); } - return page; + return netmem; } -static void page_pool_dma_sync_for_device(struct page_pool *pool, - struct page *page, - unsigned int dma_sync_size) +static void __page_pool_dma_sync_for_device(const struct page_pool *pool, + netmem_ref netmem, + u32 dma_sync_size) { - dma_addr_t dma_addr = page_pool_get_dma_addr(page); +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) + dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem); dma_sync_size = min(dma_sync_size, pool->p.max_len); - dma_sync_single_range_for_device(pool->p.dev, dma_addr, - pool->p.offset, dma_sync_size, - pool->p.dma_dir); + __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, + dma_sync_size, pool->p.dma_dir); +#endif +} + +static __always_inline void +page_pool_dma_sync_for_device(const struct page_pool *pool, + netmem_ref netmem, + u32 dma_sync_size) +{ + if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { + rcu_read_lock(); + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ + if (pool->dma_sync) + __page_pool_dma_sync_for_device(pool, netmem, + dma_sync_size); + rcu_read_unlock(); + } } -static bool page_pool_dma_map(struct page_pool *pool, struct page *page) +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; + int err; + u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit * into page private data (i.e 32bit cpu with 64bit DMA caps) * This mapping is kept for lifetime of page, until leaving pool. 
*/ - dma = dma_map_page_attrs(pool->p.dev, page, 0, - (PAGE_SIZE << pool->p.order), - pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | - DMA_ATTR_WEAK_ORDERING); + dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0, + (PAGE_SIZE << pool->p.order), pool->p.dma_dir, + DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_WEAK_ORDERING); if (dma_mapping_error(pool->p.dev, dma)) return false; - if (page_pool_set_dma_addr(page, dma)) + if (page_pool_set_dma_addr_netmem(netmem, dma)) { + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); goto unmap_failed; + } + + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto unset_failed; + } - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + netmem_set_dma_index(netmem, id); + page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; +unset_failed: + page_pool_set_dma_addr_netmem(netmem, 0); unmap_failed: - WARN_ON_ONCE("unexpected DMA address, please report to netdev@"); dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); return false; } -static void page_pool_set_pp_info(struct page_pool *pool, - struct page *page) -{ - page->pp = pool; - page->pp_magic |= PP_SIGNATURE; - - /* Ensuring all pages have been split into one fragment initially: - * page_pool_set_pp_info() is only called once for every page when it - * is allocated from the page allocator and page_pool_fragment_page() - * is dirtying the same cache line as the page->pp_magic above, so - * the overhead is negligible. 
- */ - page_pool_fragment_page(page, 1); - if (pool->has_init_callback) - pool->slow.init_callback(page, pool->slow.init_arg); -} - -static void page_pool_clear_pp_info(struct page *page) -{ - page->pp_magic = 0; - page->pp = NULL; -} - static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { @@ -460,94 +528,102 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, if (unlikely(!page)) return NULL; - if ((pool->p.flags & PP_FLAG_DMA_MAP) && - unlikely(!page_pool_dma_map(pool, page))) { + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } alloc_stat_inc(pool, slow_high_order); - page_pool_set_pp_info(pool, page); + page_pool_set_pp_info(pool, page_to_netmem(page)); /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; - trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); + trace_page_pool_state_hold(pool, page_to_netmem(page), + pool->pages_state_hold_cnt); return page; } /* slow path */ -noinline -static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, - gfp_t gfp) +static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, + gfp_t gfp) { const int bulk = PP_ALLOC_CACHE_REFILL; - unsigned int pp_flags = pool->p.flags; unsigned int pp_order = pool->p.order; - struct page *page; + bool dma_map = pool->dma_map; + netmem_ref netmem; int i, nr_pages; /* Don't support bulk alloc for high-order pages */ if (unlikely(pp_order)) - return __page_pool_alloc_page_order(pool, gfp); + return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); /* Unnecessary as alloc cache is empty, but guarantees zero count */ if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; - /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); - nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk, - pool->alloc.cache); + nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, + (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) - return NULL; + return 0; /* Pages have been filled into alloc.cache array, but count is zero and * page element have not been (possibly) DMA mapped. */ for (i = 0; i < nr_pages; i++) { - page = pool->alloc.cache[i]; - if ((pp_flags & PP_FLAG_DMA_MAP) && - unlikely(!page_pool_dma_map(pool, page))) { - put_page(page); + netmem = pool->alloc.cache[i]; + if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { + put_page(netmem_to_page(netmem)); continue; } - page_pool_set_pp_info(pool, page); - pool->alloc.cache[pool->alloc.count++] = page; + page_pool_set_pp_info(pool, netmem); + pool->alloc.cache[pool->alloc.count++] = netmem; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; - trace_page_pool_state_hold(pool, page, + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); } /* Return last page */ if (likely(pool->alloc.count > 0)) { - page = pool->alloc.cache[--pool->alloc.count]; + netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, slow); } else { - page = NULL; + netmem = 0; } /* When page just alloc'ed is should/must have refcnt 1. */ - return page; + return netmem; } /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. 
*/ -struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) { - struct page *page; + netmem_ref netmem; /* Fast-path: Get a page from cache */ - page = __page_pool_get_cached(pool); - if (page) - return page; + netmem = __page_pool_get_cached(pool); + if (netmem) + return netmem; /* Slow-path: cache empty, do real allocation */ - page = __page_pool_alloc_pages_slow(pool, gfp); - return page; + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + netmem = pool->mp_ops->alloc_netmems(pool, gfp); + else + netmem = __page_pool_alloc_pages_slow(pool, gfp); + return netmem; +} +EXPORT_SYMBOL(page_pool_alloc_netmems); +ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL); + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); } EXPORT_SYMBOL(page_pool_alloc_pages); @@ -575,24 +651,60 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict) return inflight; } -static __always_inline -void __page_pool_release_page_dma(struct page_pool *pool, struct page *page) +void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) { + netmem_set_pp(netmem, pool); + netmem_or_pp_magic(netmem, PP_SIGNATURE); + + /* Ensuring all pages have been split into one fragment initially: + * page_pool_set_pp_info() is only called once for every page when it + * is allocated from the page allocator and page_pool_fragment_page() + * is dirtying the same cache line as the page->pp_magic above, so + * the overhead is negligible. + */ + page_pool_fragment_netmem(netmem, 1); + if (pool->has_init_callback) + pool->slow.init_callback(netmem, pool->slow.init_arg); +} + +void page_pool_clear_pp_info(netmem_ref netmem) +{ + netmem_clear_pp_magic(netmem); + netmem_set_pp(netmem, NULL); +} + +static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, + netmem_ref netmem) +{ + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; dma_addr_t dma; - if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + if (!pool->dma_map) /* Always account for inflight pages, even if we didn't * map them */ return; - dma = page_pool_get_dma_addr(page); + id = netmem_get_dma_index(netmem); + if (!id) + return; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return; + + dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); - page_pool_set_dma_addr(page, 0); + page_pool_set_dma_addr_netmem(netmem, 0); + netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need @@ -600,42 +712,45 @@ void __page_pool_release_page_dma(struct page_pool *pool, struct page *page) * a regular page (that will eventually be returned to the normal * page-allocator via put_page). 
*/ -void page_pool_return_page(struct page_pool *pool, struct page *page) +void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) { int count; + bool put; - __page_pool_release_page_dma(pool, page); - - page_pool_clear_pp_info(page); + put = true; + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + put = pool->mp_ops->release_netmem(pool, netmem); + else + __page_pool_release_page_dma(pool, netmem); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); - trace_page_pool_state_release(pool, page, count); + trace_page_pool_state_release(pool, netmem, count); - put_page(page); + if (put) { + page_pool_clear_pp_info(netmem); + put_page(netmem_to_page(netmem)); + } /* An optimization would be to call __free_pages(page, pool->p.order) * knowing page is not part of page-cache (thus avoiding a * __page_cache_release() call). */ } -static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) +static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) { - int ret; - /* BH protection not needed if current is softirq */ - if (in_softirq()) - ret = ptr_ring_produce(&pool->ring, page); - else - ret = ptr_ring_produce_bh(&pool->ring, page); + bool in_softirq, ret; - if (!ret) { + /* BH protection not needed if current is softirq */ + in_softirq = page_pool_producer_lock(pool); + ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem); + if (ret) recycle_stat_inc(pool, ring); - return true; - } + page_pool_producer_unlock(pool, in_softirq); - return false; + return ret; } /* Only allow direct recycling in special circumstances, into the @@ -643,7 +758,7 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) * * Caller must provide appropriate safe context. */ -static bool page_pool_recycle_in_cache(struct page *page, +static bool page_pool_recycle_in_cache(netmem_ref netmem, struct page_pool *pool) { if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { @@ -652,24 +767,26 @@ static bool page_pool_recycle_in_cache(struct page *page, } /* Caller MUST have verified/know (page_ref_count(page) == 1) */ - pool->alloc.cache[pool->alloc.count++] = page; + pool->alloc.cache[pool->alloc.count++] = netmem; recycle_stat_inc(pool, cached); return true; } -static bool __page_pool_page_can_be_recycled(const struct page *page) +static bool __page_pool_page_can_be_recycled(netmem_ref netmem) { - return page_ref_count(page) == 1 && !page_is_pfmemalloc(page); + return netmem_is_net_iov(netmem) || + (page_ref_count(netmem_to_page(netmem)) == 1 && + !page_is_pfmemalloc(netmem_to_page(netmem))); } /* If the page refcnt == 1, this will try to recycle the page. - * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for + * If pool->dma_sync is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ -static __always_inline struct page * -__page_pool_put_page(struct page_pool *pool, struct page *page, +static __always_inline netmem_ref +__page_pool_put_page(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { lockdep_assert_no_hardirq(); @@ -683,20 +800,18 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * page is NOT reusable when allocated when system is under * some pressure. 
(page_is_pfmemalloc) */ - if (likely(__page_pool_page_can_be_recycled(page))) { + if (likely(__page_pool_page_can_be_recycled(netmem))) { /* Read barrier done in page_ref_count / READ_ONCE */ - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, - dma_sync_size); + page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); - if (allow_direct && in_softirq() && - page_pool_recycle_in_cache(page, pool)) - return NULL; + if (allow_direct && page_pool_recycle_in_cache(netmem, pool)) + return 0; /* Page found as candidate for recycling */ - return page; + return netmem; } + /* Fallback/non-XDP mode: API user have elevated refcnt. * * Many drivers split up the page into fragments, and some @@ -711,174 +826,257 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * will be invoking put_page. */ recycle_stat_inc(pool, released_refcnt); - page_pool_return_page(pool, page); + page_pool_return_page(pool, netmem); - return NULL; + return 0; } -void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +static bool page_pool_napi_local(const struct page_pool *pool) { - page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); - if (page && !page_pool_recycle_in_ring(pool, page)) { + const struct napi_struct *napi; + u32 cpuid; + + /* On PREEMPT_RT the softirq can be preempted by the consumer */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; + + if (unlikely(!in_softirq())) + return false; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. + */ + cpuid = smp_processor_id(); + if (READ_ONCE(pool->cpuid) == cpuid) + return true; + + napi = READ_ONCE(pool->p.napi); + + return napi && READ_ONCE(napi->list_owner) == cpuid; +} + +void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, + unsigned int dma_sync_size, bool allow_direct) +{ + if (!allow_direct) + allow_direct = page_pool_napi_local(pool); + + netmem = __page_pool_put_page(pool, netmem, dma_sync_size, + allow_direct); + if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { /* Cache full, fallback to free pages */ recycle_stat_inc(pool, ring_full); - page_pool_return_page(pool, page); + page_pool_return_page(pool, netmem); } } +EXPORT_SYMBOL(page_pool_put_unrefed_netmem); + +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size, + allow_direct); +} EXPORT_SYMBOL(page_pool_put_unrefed_page); -/** - * page_pool_put_page_bulk() - release references on multiple pages - * @pool: pool from which pages were allocated - * @data: array holding page pointers - * @count: number of pages in @data - * - * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring - * producer lock. If the ptr_ring is full, page_pool_put_page_bulk() - * will release leftover pages to the page allocator. - * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx - * completion loop for the XDP_REDIRECT use case. - * - * Please note the caller must not use data area after running - * page_pool_put_page_bulk(), as this function overwrites it. 
- */ -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) +static void page_pool_recycle_ring_bulk(struct page_pool *pool, + netmem_ref *bulk, + u32 bulk_len) { - int i, bulk_len = 0; bool in_softirq; + u32 i; - for (i = 0; i < count; i++) { - struct page *page = virt_to_head_page(data[i]); - - /* It is not the last user for the page frag case */ - if (!page_pool_is_last_ref(page)) - continue; - - page = __page_pool_put_page(pool, page, -1, false); - /* Approved for bulk recycling in ptr_ring cache */ - if (page) - data[bulk_len++] = page; - } - - if (unlikely(!bulk_len)) - return; - - /* Bulk producer into ptr_ring page_pool cache */ + /* Bulk produce into ptr_ring page_pool cache */ in_softirq = page_pool_producer_lock(pool); + for (i = 0; i < bulk_len; i++) { - if (__ptr_ring_produce(&pool->ring, data[i])) { + if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { /* ring full */ recycle_stat_inc(pool, ring_full); break; } } - recycle_stat_add(pool, ring, i); + page_pool_producer_unlock(pool, in_softirq); + recycle_stat_add(pool, ring, i); - /* Hopefully all pages was return into ptr_ring */ + /* Hopefully all pages were returned into ptr_ring */ if (likely(i == bulk_len)) return; - /* ptr_ring cache full, free remaining pages outside producer lock - * since put_page() with refcnt == 1 can be an expensive operation + /* + * ptr_ring cache is full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation. */ for (; i < bulk_len; i++) - page_pool_return_page(pool, data[i]); + page_pool_return_page(pool, bulk[i]); } -EXPORT_SYMBOL(page_pool_put_page_bulk); -static struct page *page_pool_drain_frag(struct page_pool *pool, - struct page *page) +/** + * page_pool_put_netmem_bulk() - release references on multiple netmems + * @data: array holding netmem references + * @count: number of entries in @data + * + * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring + * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk() + * will release leftover netmems to the memory provider. + * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx + * completion loop for the XDP_REDIRECT use case. + * + * Please note the caller must not use data area after running + * page_pool_put_netmem_bulk(), as this function overwrites it. + */ +void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) +{ + u32 bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + netmem_ref netmem = netmem_compound_head(data[i]); + + if (page_pool_unref_and_test(netmem)) + data[bulk_len++] = netmem; + } + + count = bulk_len; + while (count) { + netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; + struct page_pool *pool = NULL; + bool allow_direct; + u32 foreign = 0; + + bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + struct page_pool *netmem_pp; + netmem_ref netmem = data[i]; + + netmem_pp = netmem_get_pp(netmem); + if (unlikely(!pool)) { + pool = netmem_pp; + allow_direct = page_pool_napi_local(pool); + } else if (netmem_pp != pool) { + /* + * If the netmem belongs to a different + * page_pool, save it for another round. 
+ */ + data[foreign++] = netmem; + continue; + } + + netmem = __page_pool_put_page(pool, netmem, -1, + allow_direct); + /* Approved for bulk recycling in ptr_ring cache */ + if (netmem) + bulk[bulk_len++] = netmem; + } + + if (bulk_len) + page_pool_recycle_ring_bulk(pool, bulk, bulk_len); + + count = foreign; + } +} +EXPORT_SYMBOL(page_pool_put_netmem_bulk); + +static netmem_ref page_pool_drain_frag(struct page_pool *pool, + netmem_ref netmem) { long drain_count = BIAS_MAX - pool->frag_users; /* Some user is still using the page frag */ - if (likely(page_pool_unref_page(page, drain_count))) - return NULL; - - if (__page_pool_page_can_be_recycled(page)) { - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, -1); + if (likely(page_pool_unref_netmem(netmem, drain_count))) + return 0; - return page; + if (__page_pool_page_can_be_recycled(netmem)) { + page_pool_dma_sync_for_device(pool, netmem, -1); + return netmem; } - page_pool_return_page(pool, page); - return NULL; + page_pool_return_page(pool, netmem); + return 0; } static void page_pool_free_frag(struct page_pool *pool) { long drain_count = BIAS_MAX - pool->frag_users; - struct page *page = pool->frag_page; + netmem_ref netmem = pool->frag_page; - pool->frag_page = NULL; + pool->frag_page = 0; - if (!page || page_pool_unref_page(page, drain_count)) + if (!netmem || page_pool_unref_netmem(netmem, drain_count)) return; - page_pool_return_page(pool, page); + page_pool_return_page(pool, netmem); } -struct page *page_pool_alloc_frag(struct page_pool *pool, - unsigned int *offset, - unsigned int size, gfp_t gfp) +netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, + unsigned int *offset, unsigned int size, + gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; - struct page *page = pool->frag_page; + netmem_ref netmem = pool->frag_page; if (WARN_ON(size > max_size)) - return NULL; + return 0; size = ALIGN(size, dma_get_cache_alignment()); *offset = pool->frag_offset; - if (page && *offset + size > max_size) { - page = page_pool_drain_frag(pool, page); - if (page) { + if (netmem && *offset + size > max_size) { + netmem = page_pool_drain_frag(pool, netmem); + if (netmem) { + recycle_stat_inc(pool, cached); alloc_stat_inc(pool, fast); goto frag_reset; } } - if (!page) { - page = page_pool_alloc_pages(pool, gfp); - if (unlikely(!page)) { - pool->frag_page = NULL; - return NULL; + if (!netmem) { + netmem = page_pool_alloc_netmems(pool, gfp); + if (unlikely(!netmem)) { + pool->frag_page = 0; + return 0; } - pool->frag_page = page; + pool->frag_page = netmem; frag_reset: pool->frag_users = 1; *offset = 0; pool->frag_offset = size; - page_pool_fragment_page(page, BIAS_MAX); - return page; + page_pool_fragment_netmem(netmem, BIAS_MAX); + return netmem; } pool->frag_users++; pool->frag_offset = *offset + size; - alloc_stat_inc(pool, fast); - return page; + return netmem; +} +EXPORT_SYMBOL(page_pool_alloc_frag_netmem); + +struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, + unsigned int size, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size, + gfp)); } EXPORT_SYMBOL(page_pool_alloc_frag); static void page_pool_empty_ring(struct page_pool *pool) { - struct page *page; + netmem_ref netmem; /* Empty recycle ring */ - while ((page = ptr_ring_consume_bh(&pool->ring))) { + while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { /* Verify the refcnt invariant of cached pages */ - if (!(page_ref_count(page) == 1)) + if 
(!(netmem_ref_count(netmem) == 1)) pr_crit("%s() page_pool refcnt %d violation\n", - __func__, page_ref_count(page)); + __func__, netmem_ref_count(netmem)); - page_pool_return_page(pool, page); + page_pool_return_page(pool, netmem); } } @@ -889,12 +1087,18 @@ static void __page_pool_destroy(struct page_pool *pool) page_pool_unlist(pool); page_pool_uninit(pool); + + if (pool->mp_ops) { + pool->mp_ops->destroy(pool); + static_branch_dec(&page_pool_mem_providers); + } + kfree(pool); } static void page_pool_empty_alloc_cache_once(struct page_pool *pool) { - struct page *page; + netmem_ref netmem; if (pool->destroy_cnt) return; @@ -904,15 +1108,36 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) * call concurrently. */ while (pool->alloc.count) { - page = pool->alloc.cache[--pool->alloc.count]; - page_pool_return_page(pool, page); + netmem = pool->alloc.cache[--pool->alloc.count]; + page_pool_return_page(pool, netmem); } } static void page_pool_scrub(struct page_pool *pool) { + unsigned long id; + void *ptr; + page_pool_empty_alloc_cache_once(pool); - pool->destroy_cnt++; + if (!pool->destroy_cnt++ && pool->dma_map) { + if (pool->dma_sync) { + /* Disable page_pool_dma_sync_for_device() */ + pool->dma_sync = false; + + /* Make sure all concurrent returns that may see the old + * value of dma_sync (and thus perform a sync) have + * finished before doing the unmapping below. Skip the + * wait if the device doesn't actually need syncing, or + * if there are no outstanding mapped pages. + */ + if (dma_dev_need_sync(pool->p.dev) && + !xa_empty(&pool->dma_mapped)) + synchronize_net(); + } + + xa_for_each(&pool->dma_mapped, id, ptr) + __page_pool_release_page_dma(pool, page_to_netmem(ptr)); + } /* No more consumers should exist, but producers could still * be in-flight. @@ -922,10 +1147,14 @@ static void page_pool_scrub(struct page_pool *pool) static int page_pool_release(struct page_pool *pool) { + bool in_softirq; int inflight; page_pool_scrub(pool); inflight = page_pool_inflight(pool, true); + /* Acquire producer lock to make sure producers have exited. */ + in_softirq = page_pool_producer_lock(pool); + page_pool_producer_unlock(pool, in_softirq); if (!inflight) __page_pool_destroy(pool); @@ -940,7 +1169,13 @@ static void page_pool_release_retry(struct work_struct *wq) int inflight; inflight = page_pool_release(pool); - if (!inflight) + /* In rare cases, a driver bug may cause inflight to go negative. + * Don't reschedule release if inflight is 0 or negative. + * - If 0, the page_pool has been destroyed + * - if negative, we will never recover + * in both cases no reschedule is necessary. + */ + if (inflight <= 0) return; /* Periodic warning for page pools the user can't see */ @@ -959,31 +1194,30 @@ static void page_pool_release_retry(struct work_struct *wq) } void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), - struct xdp_mem_info *mem) + const struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; pool->xdp_mem_id = mem->id; } -static void page_pool_disable_direct_recycling(struct page_pool *pool) +void page_pool_disable_direct_recycling(struct page_pool *pool) { /* Disable direct recycling based on pool->cpuid. - * Paired with READ_ONCE() in napi_pp_put_page(). + * Paired with READ_ONCE() in page_pool_napi_local(). */ WRITE_ONCE(pool->cpuid, -1); if (!pool->p.napi) return; - /* To avoid races with recycling and additional barriers make sure - * pool and NAPI are unlinked when NAPI is disabled. 
- */ - WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) || - READ_ONCE(pool->p.napi->list_owner) != -1); + napi_assert_will_not_race(pool->p.napi); + mutex_lock(&page_pools_lock); WRITE_ONCE(pool->p.napi, NULL); + mutex_unlock(&page_pools_lock); } +EXPORT_SYMBOL(page_pool_disable_direct_recycling); void page_pool_destroy(struct page_pool *pool) { @@ -1011,15 +1245,43 @@ EXPORT_SYMBOL(page_pool_destroy); /* Caller must provide appropriate safe context, e.g. NAPI. */ void page_pool_update_nid(struct page_pool *pool, int new_nid) { - struct page *page; + netmem_ref netmem; trace_page_pool_update_nid(pool, new_nid); pool->p.nid = new_nid; /* Flush pool alloc cache, as refill will check NUMA node */ while (pool->alloc.count) { - page = pool->alloc.cache[--pool->alloc.count]; - page_pool_return_page(pool, page); + netmem = pool->alloc.cache[--pool->alloc.count]; + page_pool_return_page(pool, netmem); } } EXPORT_SYMBOL(page_pool_update_nid); + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); +} + +/* Associate a niov with a page pool. Should follow with a matching + * net_mp_niov_clear_page_pool() + */ +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); +} + +/* Disassociate a niov from a page pool. Should only be used in the + * ->release_netmem() path. + */ +void net_mp_niov_clear_page_pool(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_clear_pp_info(netmem); +} diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h index 90665d40f1eb..2fb06d5f6d55 100644 --- a/net/core/page_pool_priv.h +++ b/net/core/page_pool_priv.h @@ -3,10 +3,58 @@ #ifndef __PAGE_POOL_PRIV_H #define __PAGE_POOL_PRIV_H +#include <net/page_pool/helpers.h> + +#include "netmem_priv.h" + +extern struct mutex page_pools_lock; + s32 page_pool_inflight(const struct page_pool *pool, bool strict); int page_pool_list(struct page_pool *pool); void page_pool_detached(struct page_pool *pool); void page_pool_unlist(struct page_pool *pool); +static inline bool +page_pool_set_dma_addr_netmem(netmem_ref netmem, dma_addr_t addr) +{ + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) { + netmem_set_dma_addr(netmem, addr >> PAGE_SHIFT); + + /* We assume page alignment to shave off bottom bits, + * if this "compression" doesn't work we need to drop. 
+ */ + return addr != (dma_addr_t)netmem_get_dma_addr(netmem) + << PAGE_SHIFT; + } + + netmem_set_dma_addr(netmem, addr); + return false; +} + +static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr); +} + +#if defined(CONFIG_PAGE_POOL) +void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem); +void page_pool_clear_pp_info(netmem_ref netmem); +int page_pool_check_memory_provider(struct net_device *dev, + struct netdev_rx_queue *rxq); +#else +static inline void page_pool_set_pp_info(struct page_pool *pool, + netmem_ref netmem) +{ +} +static inline void page_pool_clear_pp_info(netmem_ref netmem) +{ +} +static inline int page_pool_check_memory_provider(struct net_device *dev, + struct netdev_rx_queue *rxq) +{ + return 0; +} +#endif + #endif diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 3a3277ba167b..c82a95beceff 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -3,19 +3,23 @@ #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/xarray.h> +#include <net/busy_poll.h> #include <net/net_debug.h> -#include <net/page_pool/types.h> +#include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/types.h> +#include <net/page_pool/memory_provider.h> #include <net/sock.h> #include "page_pool_priv.h" #include "netdev-genl-gen.h" static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); -/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user. +/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev, + * pool->user. * Ordering: inside rtnl_lock */ -static DEFINE_MUTEX(page_pools_lock); +DEFINE_MUTEX(page_pools_lock); /* Page pools are only reachable from user space (via netlink) if they are * linked to a netdev at creation time. Following page pool "visibility" @@ -213,6 +217,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { size_t inflight, refsz; + unsigned int napi_id; void *hdr; hdr = genlmsg_iput(rsp, info); @@ -226,8 +231,10 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, pool->slow.netdev->ifindex)) goto err_cancel; - if (pool->user.napi_id && - nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id)) + + napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0; + if (napi_id_valid(napi_id) && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id)) goto err_cancel; inflight = page_pool_inflight(pool, false); @@ -241,6 +248,9 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, pool->user.detach_time)) goto err_cancel; + if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) + goto err_cancel; + genlmsg_end(rsp, hdr); return 0; @@ -313,8 +323,6 @@ int page_pool_list(struct page_pool *pool) if (pool->slow.netdev) { hlist_add_head(&pool->user.list, &pool->slow.netdev->page_pools); - pool->user.napi_id = pool->p.napi ? 
pool->p.napi->napi_id : 0; - netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); } @@ -344,6 +352,30 @@ void page_pool_unlist(struct page_pool *pool) mutex_unlock(&page_pools_lock); } +int page_pool_check_memory_provider(struct net_device *dev, + struct netdev_rx_queue *rxq) +{ + void *binding = rxq->mp_params.mp_priv; + struct page_pool *pool; + struct hlist_node *n; + + if (!binding) + return 0; + + mutex_lock(&page_pools_lock); + hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) { + if (pool->mp_priv != binding) + continue; + + if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) { + mutex_unlock(&page_pools_lock); + return 0; + } + } + mutex_unlock(&page_pools_lock); + return -ENODATA; +} + static void page_pool_unreg_netdev_wipe(struct net_device *netdev) { struct page_pool *pool; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index ea55a758a475..0ebe5461d4d9 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -69,7 +69,7 @@ * * By design there should only be *one* "controlling" process. In practice * multiple write accesses gives unpredictable result. Understood by "write" - * to /proc gives result code thats should be read be the "writer". + * to /proc gives result code that should be read be the "writer". * For practical use this should be no problem. * * Note when adding devices to a specific CPU there good idea to also assign @@ -158,9 +158,7 @@ #include <net/udp.h> #include <net/ip6_checksum.h> #include <net/addrconf.h> -#ifdef CONFIG_XFRM #include <net/xfrm.h> -#endif #include <net/netns/generic.h> #include <asm/byteorder.h> #include <linux/rcupdate.h> @@ -179,7 +177,7 @@ #define MAX_IMIX_ENTRIES 20 #define IMIX_PRECISION 100 /* Precision of IMIX distribution */ -#define func_enter() pr_debug("entering %s\n", __func__); +#define func_enter() pr_debug("entering %s\n", __func__) #define PKT_FLAGS \ pf(IPV6) /* Interface in IPV6 Mode */ \ @@ -229,12 +227,12 @@ static char *pkt_flag_names[] = { /* Xmit modes */ #define M_START_XMIT 0 /* Default normal TX */ -#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ +#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ #define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ /* If lock -- protects updating of if_list */ -#define if_lock(t) mutex_lock(&(t->if_lock)); -#define if_unlock(t) mutex_unlock(&(t->if_lock)); +#define if_lock(t) mutex_lock(&(t->if_lock)) +#define if_unlock(t) mutex_unlock(&(t->if_lock)) /* Used to help with determining the pkts on receive */ #define PKTGEN_MAGIC 0xbe9be955 @@ -285,7 +283,8 @@ struct pktgen_dev { int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ + * removal by worker thread + */ struct page *page; u64 delay; /* nano-seconds */ @@ -348,10 +347,12 @@ struct pktgen_dev { __u16 udp_dst_max; /* exclusive, dest UDP port */ /* DSCP + ECN */ - __u8 tos; /* six MSB of (former) IPv4 TOS - are for dscp codepoint */ - __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 - (see RFC 3260, sec. 4) */ + __u8 tos; /* six MSB of (former) IPv4 TOS + * are for dscp codepoint + */ + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 + * (see RFC 3260, sec. 
4) + */ /* IMIX */ unsigned int n_imix_entries; @@ -391,12 +392,12 @@ struct pktgen_dev { __u8 hh[14]; /* = { - 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, - - We fill in SRC address later - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x08, 0x00 - }; + * 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + * + * We fill in SRC address later + * 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + * 0x08, 0x00 + * }; */ __u16 pad; /* pad out the hh struct to an even 16 bytes */ @@ -460,7 +461,8 @@ struct pktgen_thread { char result[512]; /* Field for thread to receive "posted" events terminate, - stop ifs etc. */ + * stop ifs etc. + */ u32 control; int cpu; @@ -474,8 +476,7 @@ struct pktgen_thread { #define FIND 0 static const char version[] = - "Packet Generator for packet performance testing. " - "Version: " VERSION "\n"; + "Packet Generator for packet performance testing. Version: " VERSION "\n"; static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); @@ -517,21 +518,23 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char data[128]; + size_t max; struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id); if (!capable(CAP_NET_ADMIN)) return -EPERM; - if (count == 0) + if (count < 1) return -EINVAL; - if (count > sizeof(data)) - count = sizeof(data); - - if (copy_from_user(data, buf, count)) + max = min(count, sizeof(data) - 1); + if (copy_from_user(data, buf, max)) return -EFAULT; - data[count - 1] = 0; /* Strip trailing '\n' and terminate string */ + if (data[max - 1] == '\n') + data[max - 1] = 0; /* strip trailing '\n', terminate string */ + else + data[max] = 0; /* terminate string */ if (!strcmp(data, "stop")) pktgen_stop_all_threads(pn); @@ -624,8 +627,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_printf(seq, "%pM\n", pkt_dev->dst_mac); seq_printf(seq, - " udp_src_min: %d udp_src_max: %d" - " udp_dst_min: %d udp_dst_max: %d\n", + " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, pkt_dev->udp_dst_max); @@ -744,34 +746,37 @@ static int pktgen_if_show(struct seq_file *seq, void *v) } -static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, - __u32 *num) +static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen, + __u32 *num) { - int i = 0; + size_t i = 0; + *num = 0; for (; i < maxlen; i++) { int value; char c; - *num <<= 4; + if (get_user(c, &user_buffer[i])) return -EFAULT; value = hex_to_bin(c); - if (value >= 0) + if (value >= 0) { + *num <<= 4; *num |= value; - else + } else { break; + } } return i; } -static int count_trail_chars(const char __user * user_buffer, - unsigned int maxlen) +static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen) { - int i; + size_t i; for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -790,14 +795,15 @@ done: return i; } -static long num_arg(const char __user *user_buffer, unsigned long maxlen, - unsigned long *num) +static ssize_t num_arg(const char __user *user_buffer, size_t maxlen, + unsigned long *num) { - int i; + size_t i; *num = 0; for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; if ((c >= '0') && (c <= '9')) { @@ -809,12 +815,13 @@ static long num_arg(const char __user *user_buffer, unsigned long maxlen, return i; } -static int strn_len(const char __user * 
user_buffer, unsigned int maxlen) +static ssize_t strn_len(const char __user *user_buffer, size_t maxlen) { - int i; + size_t i; for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -823,6 +830,7 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) case '\r': case '\t': case ' ': + case '=': goto done_str; default: break; @@ -838,11 +846,11 @@ done_str: * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example. */ static ssize_t get_imix_entries(const char __user *buffer, + size_t maxlen, struct pktgen_dev *pkt_dev) { - const int max_digits = 10; - int i = 0; - long len; + size_t i = 0, max; + ssize_t len; char c; pkt_dev->n_imix_entries = 0; @@ -851,21 +859,33 @@ static ssize_t get_imix_entries(const char __user *buffer, unsigned long weight; unsigned long size; - len = num_arg(&buffer[i], max_digits, &size); + if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES) + return -E2BIG; + + if (i >= maxlen) + return -EINVAL; + + max = min(10, maxlen - i); + len = num_arg(&buffer[i], max, &size); if (len < 0) return len; i += len; + if (i >= maxlen) + return -EINVAL; if (get_user(c, &buffer[i])) return -EFAULT; /* Check for comma between size_i and weight_i */ if (c != ',') return -EINVAL; i++; + if (i >= maxlen) + return -EINVAL; if (size < 14 + 20 + 8) size = 14 + 20 + 8; - len = num_arg(&buffer[i], max_digits, &weight); + max = min(10, maxlen - i); + len = num_arg(&buffer[i], max, &weight); if (len < 0) return len; if (weight <= 0) @@ -875,42 +895,55 @@ static ssize_t get_imix_entries(const char __user *buffer, pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight; i += len; + pkt_dev->n_imix_entries++; + + if (i >= maxlen) + break; if (get_user(c, &buffer[i])) return -EFAULT; - i++; - pkt_dev->n_imix_entries++; - - if (pkt_dev->n_imix_entries > MAX_IMIX_ENTRIES) - return -E2BIG; } while (c == ' '); return i; } -static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) +static ssize_t get_labels(const char __user *buffer, + size_t maxlen, struct pktgen_dev *pkt_dev) { unsigned int n = 0; + size_t i = 0, max; + ssize_t len; char c; - ssize_t i = 0; - int len; pkt_dev->nr_labels = 0; do { __u32 tmp; - len = hex32_arg(&buffer[i], 8, &tmp); - if (len <= 0) + + if (n >= MAX_MPLS_LABELS) + return -E2BIG; + + if (i >= maxlen) + return -EINVAL; + + max = min(8, maxlen - i); + len = hex32_arg(&buffer[i], max, &tmp); + if (len < 0) return len; + + /* return empty list in case of invalid input or zero value */ + if (len == 0 || tmp == 0) + return maxlen; + pkt_dev->labels[n] = htonl(tmp); if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM) pkt_dev->flags |= F_MPLS_RND; i += len; + n++; + if (i >= maxlen) + break; if (get_user(c, &buffer[i])) return -EFAULT; i++; - n++; - if (n >= MAX_MPLS_LABELS) - return -E2BIG; } while (c == ','); pkt_dev->nr_labels = n; @@ -947,16 +980,16 @@ static __u32 pktgen_read_flag(const char *f, bool *disable) } static ssize_t pktgen_if_write(struct file *file, - const char __user * user_buffer, size_t count, - loff_t * offset) + const char __user *user_buffer, size_t count, + loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_dev *pkt_dev = seq->private; - int i, max, len; + size_t i, max; + ssize_t len; char name[16], valstr[32]; unsigned long value = 0; char *pg_result = NULL; - int tmp = 0; char buf[128]; pg_result = &(pkt_dev->result[0]); @@ -967,16 +1000,16 @@ static ssize_t pktgen_if_write(struct file *file, } max = count; - tmp = 
count_trail_chars(user_buffer, max); - if (tmp < 0) { + len = count_trail_chars(user_buffer, max); + if (len < 0) { pr_warn("illegal format\n"); - return tmp; + return len; } - i = tmp; + i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1004,11 +1037,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "min_pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->min_pkt_size) { @@ -1021,11 +1054,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "max_pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->max_pkt_size) { @@ -1040,11 +1073,11 @@ static ssize_t pktgen_if_write(struct file *file, /* Shortcut for min = max */ if (!strcmp(name, "pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->min_pkt_size) { @@ -1060,43 +1093,43 @@ static ssize_t pktgen_if_write(struct file *file, if (pkt_dev->clone_skb > 0) return -EINVAL; - len = get_imix_entries(&user_buffer[i], pkt_dev); + max = count - i; + len = get_imix_entries(&user_buffer[i], max, pkt_dev); if (len < 0) return len; fill_imix_distribution(pkt_dev); - i += len; return count; } if (!strcmp(name, "debug")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; debug = value; sprintf(pg_result, "OK: debug=%u", debug); return count; } if (!strcmp(name, "frags")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->nfrags = value; sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags); return count; } if (!strcmp(name, "delay")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value == 0x7FFFFFFF) pkt_dev->delay = ULLONG_MAX; else @@ -1107,13 +1140,13 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "rate")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (!value) - return len; + return -EINVAL; pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value; if (debug) pr_info("Delay set at: %llu ns\n", pkt_dev->delay); @@ -1122,13 +1155,13 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "ratep")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (!value) - return len; + return -EINVAL; pkt_dev->delay = NSEC_PER_SEC/value; if (debug) pr_info("Delay set at: %llu ns\n", pkt_dev->delay); @@ -1137,11 +1170,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } 
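
The hunks above and below all apply the same bounded-parsing pattern: before each num_arg()/strn_len()/hex32_arg() call, the scan width is clamped to min(field width, count - i), so the helpers can never walk past the bytes the writer actually supplied, and the helpers now return ssize_t so an -EFAULT from get_user() can propagate cleanly. A minimal userspace sketch of the idea follows; num_arg() here mirrors the helper shown in this patch, while the buffer, the offset and the main() harness are invented purely for illustration.

#include <stdio.h>
#include <stddef.h>
#include <sys/types.h>

/* Simplified analog of pktgen's num_arg(): parse at most maxlen decimal
 * characters and report how many were consumed (the kernel version reads
 * each byte with get_user() and can also return -EFAULT). */
static ssize_t num_arg(const char *buf, size_t maxlen, unsigned long *num)
{
	size_t i;

	*num = 0;
	for (i = 0; i < maxlen; i++) {
		char c = buf[i];

		if (c < '0' || c > '9')
			break;
		*num = *num * 10 + (c - '0');
	}
	return i;	/* characters consumed */
}

int main(void)
{
	const char buf[] = "min_pkt_size 1500\n";	/* hypothetical write */
	size_t count = sizeof(buf) - 1;
	size_t i = 13;					/* start of the value */
	size_t max = count - i < 10 ? count - i : 10;	/* min(10, count - i) */
	unsigned long value;
	ssize_t len = num_arg(&buf[i], max, &value);

	if (len > 0)
		printf("parsed %lu using %zd of %zu remaining bytes\n",
		       value, len, count - i);
	return 0;
}
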
if (!strcmp(name, "udp_src_min")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_src_min) { pkt_dev->udp_src_min = value; pkt_dev->cur_udp_src = value; @@ -1150,11 +1183,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_dst_min")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_dst_min) { pkt_dev->udp_dst_min = value; pkt_dev->cur_udp_dst = value; @@ -1163,11 +1196,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_src_max")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_src_max) { pkt_dev->udp_src_max = value; pkt_dev->cur_udp_src = value; @@ -1176,11 +1209,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_dst_max")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_dst_max) { pkt_dev->udp_dst_max = value; pkt_dev->cur_udp_dst = value; @@ -1189,7 +1222,8 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "clone_skb")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; /* clone_skb is not supported for netif_receive xmit_mode and @@ -1198,34 +1232,33 @@ static ssize_t pktgen_if_write(struct file *file, if ((value > 0) && ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) - return -ENOTSUPP; + return -EOPNOTSUPP; if (value > 0 && (pkt_dev->n_imix_entries > 0 || !(pkt_dev->flags & F_SHARED))) return -EINVAL; - i += len; pkt_dev->clone_skb = value; sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb); return count; } if (!strcmp(name, "count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->count = value; sprintf(pg_result, "OK: count=%llu", (unsigned long long)pkt_dev->count); return count; } if (!strcmp(name, "src_mac_count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (pkt_dev->src_mac_count != value) { pkt_dev->src_mac_count = value; pkt_dev->cur_src_mac_offset = 0; @@ -1235,11 +1268,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "dst_mac_count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (pkt_dev->dst_mac_count != value) { pkt_dev->dst_mac_count = value; pkt_dev->cur_dst_mac_offset = 0; @@ -1249,16 +1282,16 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "burst")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value > 1) && ((pkt_dev->xmit_mode == M_QUEUE_XMIT) || 
((pkt_dev->xmit_mode == M_START_XMIT) && (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))))) - return -ENOTSUPP; + return -EOPNOTSUPP; if (value > 1 && !(pkt_dev->flags & F_SHARED)) return -EINVAL; @@ -1268,12 +1301,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "node")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; - if (node_possible(value)) { pkt_dev->node = value; sprintf(pg_result, "OK: node=%d", pkt_dev->node); @@ -1281,29 +1313,29 @@ static ssize_t pktgen_if_write(struct file *file, put_page(pkt_dev->page); pkt_dev->page = NULL; } - } - else + } else { sprintf(pg_result, "ERROR: node not possible"); + } return count; } if (!strcmp(name, "xmit_mode")) { char f[32]; - memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; + memset(f, 0, sizeof(f)); if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; if (strcmp(f, "start_xmit") == 0) { pkt_dev->xmit_mode = M_START_XMIT; } else if (strcmp(f, "netif_receive") == 0) { /* clone_skb set earlier, not supported in this mode */ if (pkt_dev->clone_skb > 0) - return -ENOTSUPP; + return -EOPNOTSUPP; pkt_dev->xmit_mode = M_NETIF_RECEIVE; @@ -1329,14 +1361,14 @@ static ssize_t pktgen_if_write(struct file *file, char f[32]; char *end; - memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; + memset(f, 0, 32); if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; flag = pktgen_read_flag(f, &disable); if (flag) { @@ -1378,7 +1410,8 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1); + max = min(sizeof(pkt_dev->dst_min) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1386,19 +1419,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_min) != 0) { - memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); - strcpy(pkt_dev->dst_min, buf); + strscpy_pad(pkt_dev->dst_min, buf); pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); pkt_dev->cur_daddr = pkt_dev->daddr_min; } if (debug) pr_debug("dst_min set to: %s\n", pkt_dev->dst_min); - i += len; + sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min); return count; } if (!strcmp(name, "dst_max")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1); + max = min(sizeof(pkt_dev->dst_max) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1406,19 +1439,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_max) != 0) { - memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); - strcpy(pkt_dev->dst_max, buf); + strscpy_pad(pkt_dev->dst_max, buf); pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); pkt_dev->cur_daddr = pkt_dev->daddr_max; } if (debug) pr_debug("dst_max set to: %s\n", pkt_dev->dst_max); - i += len; + sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max); return count; } if (!strcmp(name, "dst6")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 
0) return len; @@ -1436,12 +1469,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6 set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6=%s", buf); return count; } if (!strcmp(name, "dst6_min")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1458,12 +1491,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6_min set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6_min=%s", buf); return count; } if (!strcmp(name, "dst6_max")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1479,12 +1512,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6_max set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6_max=%s", buf); return count; } if (!strcmp(name, "src6")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1502,12 +1535,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("src6 set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: src6=%s", buf); return count; } if (!strcmp(name, "src_min")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1); + max = min(sizeof(pkt_dev->src_min) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1515,19 +1548,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_min) != 0) { - memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); - strcpy(pkt_dev->src_min, buf); + strscpy_pad(pkt_dev->src_min, buf); pkt_dev->saddr_min = in_aton(pkt_dev->src_min); pkt_dev->cur_saddr = pkt_dev->saddr_min; } if (debug) pr_debug("src_min set to: %s\n", pkt_dev->src_min); - i += len; + sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min); return count; } if (!strcmp(name, "src_max")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1); + max = min(sizeof(pkt_dev->src_max) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1535,19 +1568,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_max) != 0) { - memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); - strcpy(pkt_dev->src_max, buf); + strscpy_pad(pkt_dev->src_max, buf); pkt_dev->saddr_max = in_aton(pkt_dev->src_max); pkt_dev->cur_saddr = pkt_dev->saddr_max; } if (debug) pr_debug("src_max set to: %s\n", pkt_dev->src_max); - i += len; + sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max); return count; } if (!strcmp(name, "dst_mac")) { - len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + max = min(sizeof(valstr) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1564,7 +1597,8 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "src_mac")) { - len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + max = min(sizeof(valstr) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1588,11 +1622,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "flows")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += 
len; if (value > MAX_CFLOWS) value = MAX_CFLOWS; @@ -1602,44 +1636,44 @@ static ssize_t pktgen_if_write(struct file *file, } #ifdef CONFIG_XFRM if (!strcmp(name, "spi")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->spi = value; sprintf(pg_result, "OK: spi=%u", pkt_dev->spi); return count; } #endif if (!strcmp(name, "flowlen")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->lflow = value; sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow); return count; } if (!strcmp(name, "queue_map_min")) { - len = num_arg(&user_buffer[i], 5, &value); + max = min(5, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->queue_map_min = value; sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min); return count; } if (!strcmp(name, "queue_map_max")) { - len = num_arg(&user_buffer[i], 5, &value); + max = min(5, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->queue_map_max = value; sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max); return count; @@ -1648,10 +1682,11 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "mpls")) { unsigned int n, cnt; - len = get_labels(&user_buffer[i], pkt_dev); + max = count - i; + len = get_labels(&user_buffer[i], max, pkt_dev); if (len < 0) return len; - i += len; + cnt = sprintf(pg_result, "OK: mpls="); for (n = 0; n < pkt_dev->nr_labels; n++) cnt += sprintf(pg_result + cnt, @@ -1669,11 +1704,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_id")) { - len = num_arg(&user_buffer[i], 4, &value); + max = min(4, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value <= 4095) { pkt_dev->vlan_id = value; /* turn on VLAN */ @@ -1696,11 +1731,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_p")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) { pkt_dev->vlan_p = value; sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p); @@ -1711,11 +1746,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_cfi")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) { pkt_dev->vlan_cfi = value; sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi); @@ -1726,11 +1761,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_id")) { - len = num_arg(&user_buffer[i], 4, &value); + max = min(4, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) { pkt_dev->svlan_id = value; /* turn on SVLAN */ @@ -1753,11 +1788,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_p")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) { pkt_dev->svlan_p = value; 
sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p); @@ -1768,11 +1803,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_cfi")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) { pkt_dev->svlan_cfi = value; sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi); @@ -1783,12 +1818,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "tos")) { - __u32 tmp_value = 0; - len = hex32_arg(&user_buffer[i], 2, &tmp_value); + __u32 tmp_value; + + max = min(2, count - i); + len = hex32_arg(&user_buffer[i], max, &tmp_value); if (len < 0) return len; - i += len; if (len == 2) { pkt_dev->tos = tmp_value; sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos); @@ -1799,12 +1835,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "traffic_class")) { - __u32 tmp_value = 0; - len = hex32_arg(&user_buffer[i], 2, &tmp_value); + __u32 tmp_value; + + max = min(2, count - i); + len = hex32_arg(&user_buffer[i], max, &tmp_value); if (len < 0) return len; - i += len; if (len == 2) { pkt_dev->traffic_class = tmp_value; sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class); @@ -1815,11 +1852,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "skb_priority")) { - len = num_arg(&user_buffer[i], 9, &value); + max = min(9, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->skb_priority = value; sprintf(pg_result, "OK: skb_priority=%i", pkt_dev->skb_priority); @@ -1874,12 +1911,13 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) } static ssize_t pktgen_thread_write(struct file *file, - const char __user * user_buffer, - size_t count, loff_t * offset) + const char __user *user_buffer, + size_t count, loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_thread *t = seq->private; - int i, max, len, ret; + size_t i, max; + ssize_t len, ret; char name[40]; char *pg_result; @@ -1896,8 +1934,8 @@ static ssize_t pktgen_thread_write(struct file *file, i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1926,15 +1964,17 @@ static ssize_t pktgen_thread_write(struct file *file, if (!strcmp(name, "add_device")) { char f[32]; + memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) { ret = len; goto out; } if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; + mutex_lock(&pktgen_thread_lock); ret = pktgen_add_device(t, f); mutex_unlock(&pktgen_thread_lock); @@ -2285,7 +2325,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) s64 remaining; struct hrtimer_sleeper t; - hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_set_expires(&t.timer, spin_until); remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); @@ -2358,24 +2398,25 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) } -#ifdef CONFIG_XFRM /* If there was already an IPSEC SA, we keep it as is, else * we go look for it ... 
-*/ + */ #define DUMMY_MARK 0 static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) { +#ifdef CONFIG_XFRM struct xfrm_state *x = pkt_dev->flows[flow].x; struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); + if (!x) { if (pkt_dev->spi) { /* We need as quick as possible to find the right SA - * Searching with minimum criteria to archieve this. + * Searching with minimum criteria to achieve, this. */ x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); } else { - /* slow path: we dont already have xfrm_state */ + /* slow path: we don't already have xfrm_state */ x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0, (xfrm_address_t *)&pkt_dev->cur_daddr, (xfrm_address_t *)&pkt_dev->cur_saddr, @@ -2390,16 +2431,16 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) } } -} #endif +} static void set_cur_queue_map(struct pktgen_dev *pkt_dev) { - if (pkt_dev->flags & F_QUEUE_MAP_CPU) pkt_dev->cur_queue_map = smp_processor_id(); else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { t = get_random_u32_inclusive(pkt_dev->queue_map_min, pkt_dev->queue_map_max); @@ -2481,6 +2522,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_MPLS_RND) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | @@ -2525,6 +2567,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) imx = ntohl(pkt_dev->saddr_max); if (imn < imx) { __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) t = get_random_u32_inclusive(imn, imx - 1); else { @@ -2545,6 +2588,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; __be32 s; + if (pkt_dev->flags & F_IPDST_RND) { do { @@ -2569,10 +2613,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->flows[flow].flags |= F_INIT; pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr; -#ifdef CONFIG_XFRM if (pkt_dev->flags & F_IPSEC) get_ipsec_sa(pkt_dev, flow); -#endif pkt_dev->nflows++; } } @@ -2594,6 +2636,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { t = get_random_u32_inclusive(pkt_dev->min_pkt_size, pkt_dev->max_pkt_size - 1); @@ -2660,7 +2703,8 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) if (!x) return 0; /* XXX: we dont support tunnel mode for now until - * we resolve the dst issue */ + * we resolve the dst issue + */ if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0)) return 0; @@ -2695,8 +2739,10 @@ static void free_SAs(struct pktgen_dev *pkt_dev) if (pkt_dev->cflows) { /* let go of the SAs if we have them */ int i; + for (i = 0; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; + if (x) { xfrm_state_put(x); pkt_dev->flows[i].x = NULL; @@ -2711,6 +2757,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, if (pkt_dev->flags & F_IPSEC) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; + if (x) { struct ethhdr *eth; struct iphdr *iph; @@ -2754,6 +2801,7 @@ err: static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; @@ -2866,7 +2914,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, skb->dev = dev; } } else { - skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); + skb = 
__netdev_alloc_skb(dev, size, GFP_NOWAIT); } /* the caller pre-fetches from skb->data and reserves for the mac hdr */ @@ -2947,7 +2995,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->priority = pkt_dev->skb_priority; memcpy(eth, pkt_dev->hh, 12); - *(__be16 *) & eth[12] = protocol; + *(__be16 *)ð[12] = protocol; /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - @@ -3176,11 +3224,11 @@ static void pktgen_run(struct pktgen_thread *t) set_pkt_overhead(pkt_dev); - strcpy(pkt_dev->result, "Starting"); + strscpy(pkt_dev->result, "Starting"); pkt_dev->running = 1; /* Cranke yeself! */ started++; } else - strcpy(pkt_dev->result, "Error starting"); + strscpy(pkt_dev->result, "Error starting"); } rcu_read_unlock(); if (started) @@ -3439,6 +3487,7 @@ static void pktgen_rem_thread(struct pktgen_thread *t) static void pktgen_resched(struct pktgen_dev *pkt_dev) { ktime_t idle_start = ktime_get(); + schedule(); pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start)); } @@ -3654,7 +3703,7 @@ static int pktgen_thread_worker(void *arg) struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; - WARN_ON(smp_processor_id() != cpu); + WARN_ON_ONCE(smp_processor_id() != cpu); init_waitqueue_head(&t->queue); complete(&t->start_done); @@ -3754,7 +3803,8 @@ static int add_dev_to_thread(struct pktgen_thread *t, * userspace on another CPU than the kthread. The if_lock() * is used here to sync with concurrent instances of * _rem_dev_from_if_list() invoked via kthread, which is also - * updating the if_list */ + * updating the if_list + */ if_lock(t); if (pkt_dev->pg_thread) { @@ -3792,7 +3842,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) if (!pkt_dev) return -ENOMEM; - strcpy(pkt_dev->odevname, ifname); + strscpy(pkt_dev->odevname, ifname); pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS, sizeof(struct flow_state)), node); @@ -3838,8 +3888,8 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->ipsmode = XFRM_MODE_TRANSPORT; pkt_dev->ipsproto = IPPROTO_ESP; - /* xfrm tunnel mode needs additional dst to extract outter - * ip header protocol/ttl/id field, here creat a phony one. + /* xfrm tunnel mode needs additional dst to extract outer + * ip header protocol/ttl/id field, here create a phony one. * instead of looking for a valid rt, which definitely hurting * performance under such circumstance. 
*/ @@ -3883,17 +3933,14 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) list_add_tail(&t->th_list, &pn->pktgen_threads); init_completion(&t->start_done); - p = kthread_create_on_node(pktgen_thread_worker, - t, - cpu_to_node(cpu), - "kpktgend_%d", cpu); + p = kthread_create_on_cpu(pktgen_thread_worker, t, cpu, "kpktgend_%d"); if (IS_ERR(p)) { pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu); list_del(&t->th_list); kfree(t); return PTR_ERR(p); } - kthread_bind(p, cpu); + t->tsk = p; pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir, @@ -3952,7 +3999,8 @@ static int pktgen_remove_device(struct pktgen_thread *t, /* Remove proc before if_list entry, because add_device uses * list to determine if interface already exist, avoid race - * with proc_create_data() */ + * with proc_create_data() + */ proc_remove(pkt_dev->entry); /* And update the thread if_list */ @@ -3989,6 +4037,7 @@ static int __net_init pg_net_init(struct net *net) goto remove; } + cpus_read_lock(); for_each_online_cpu(cpu) { int err; @@ -3997,6 +4046,7 @@ static int __net_init pg_net_init(struct net *net) pr_warn("Cannot create thread for cpu %d (%d)\n", cpu, err); } + cpus_read_unlock(); if (list_empty(&pn->pktgen_threads)) { pr_err("Initialization failed for all threads\n"); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a3d7847ce69d..c57692eb8da9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -53,6 +53,7 @@ #include <net/fib_rules.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> +#include <net/netdev_lock.h> #include <net/devlink.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/addrconf.h> @@ -80,11 +81,15 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +int rtnl_lock_interruptible(void) +{ + return mutex_lock_interruptible(&rtnl_mutex); +} + int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); } -EXPORT_SYMBOL(rtnl_lock_killable); static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) @@ -179,6 +184,176 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL +void __rtnl_net_lock(struct net *net) +{ + ASSERT_RTNL(); + + mutex_lock(&net->rtnl_mutex); +} +EXPORT_SYMBOL(__rtnl_net_lock); + +void __rtnl_net_unlock(struct net *net) +{ + ASSERT_RTNL(); + + mutex_unlock(&net->rtnl_mutex); +} +EXPORT_SYMBOL(__rtnl_net_unlock); + +void rtnl_net_lock(struct net *net) +{ + rtnl_lock(); + __rtnl_net_lock(net); +} +EXPORT_SYMBOL(rtnl_net_lock); + +void rtnl_net_unlock(struct net *net) +{ + __rtnl_net_unlock(net); + rtnl_unlock(); +} +EXPORT_SYMBOL(rtnl_net_unlock); + +int rtnl_net_trylock(struct net *net) +{ + int ret = rtnl_trylock(); + + if (ret) + __rtnl_net_lock(net); + + return ret; +} +EXPORT_SYMBOL(rtnl_net_trylock); + +int rtnl_net_lock_killable(struct net *net) +{ + int ret = rtnl_lock_killable(); + + if (!ret) + __rtnl_net_lock(net); + + return ret; +} + +static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) +{ + if (net_eq(net_a, net_b)) + return 0; + + /* always init_net first */ + if (net_eq(net_a, &init_net)) + return -1; + + if (net_eq(net_b, &init_net)) + return 1; + + /* otherwise lock in ascending order */ + return net_a < net_b ? 
-1 : 1; +} + +int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) +{ + const struct net *net_a, *net_b; + + net_a = container_of(a, struct net, rtnl_mutex.dep_map); + net_b = container_of(b, struct net, rtnl_mutex.dep_map); + + return rtnl_net_cmp_locks(net_a, net_b); +} + +bool rtnl_net_is_locked(struct net *net) +{ + return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_net_is_locked); + +bool lockdep_rtnl_net_is_held(struct net *net) +{ + return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex); +} +EXPORT_SYMBOL(lockdep_rtnl_net_is_held); +#else +static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) +{ + /* No need to swap */ + return -1; +} +#endif + +struct rtnl_nets { + /* ->newlink() needs to freeze 3 netns at most; + * 2 for the new device, 1 for its peer. + */ + struct net *net[3]; + unsigned char len; +}; + +static void rtnl_nets_init(struct rtnl_nets *rtnl_nets) +{ + memset(rtnl_nets, 0, sizeof(*rtnl_nets)); +} + +static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets) +{ + int i; + + for (i = 0; i < rtnl_nets->len; i++) { + put_net(rtnl_nets->net[i]); + rtnl_nets->net[i] = NULL; + } + + rtnl_nets->len = 0; +} + +/** + * rtnl_nets_add - Add netns to be locked before ->newlink(). + * + * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net(). + * @net: netns pointer with an extra refcnt held. + * + * The extra refcnt is released in rtnl_nets_destroy(). + */ +static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net) +{ + int i; + + DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net)); + + for (i = 0; i < rtnl_nets->len; i++) { + switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) { + case 0: + put_net(net); + return; + case 1: + swap(rtnl_nets->net[i], net); + } + } + + rtnl_nets->net[i] = net; + rtnl_nets->len++; +} + +static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets) +{ + int i; + + rtnl_lock(); + + for (i = 0; i < rtnl_nets->len; i++) + __rtnl_net_lock(rtnl_nets->net[i]); +} + +static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets) +{ + int i; + + for (i = 0; i < rtnl_nets->len; i++) + __rtnl_net_unlock(rtnl_nets->net[i]); + + rtnl_unlock(); +} + static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) @@ -269,64 +444,13 @@ unlock: } /** - * rtnl_register_module - Register a rtnetlink message type - * - * @owner: module registering the hook (THIS_MODULE) - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions - * - * Like rtnl_register, but for use by removable modules. 
- */ -int rtnl_register_module(struct module *owner, - int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) -{ - return rtnl_register_internal(owner, protocol, msgtype, - doit, dumpit, flags); -} -EXPORT_SYMBOL_GPL(rtnl_register_module); - -/** - * rtnl_register - Register a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions - * - * Registers the specified function pointers (at least one of them has - * to be non-NULL) to be called whenever a request message for the - * specified protocol family and message type is received. - * - * The special protocol family PF_UNSPEC may be used to define fallback - * function pointers for the case when no entry for the specific protocol - * family exists. - */ -void rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) -{ - int err; - - err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, - flags); - if (err) - pr_err("Unable to register rtnetlink message handler, " - "protocol = %d, message type = %d\n", protocol, msgtype); -} - -/** * rtnl_unregister - Unregister a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * * Returns 0 on success or a negative error code. */ -int rtnl_unregister(int protocol, int msgtype) +static int rtnl_unregister(int protocol, int msgtype) { struct rtnl_link __rcu **tab; struct rtnl_link *link; @@ -349,7 +473,6 @@ int rtnl_unregister(int protocol, int msgtype) return 0; } -EXPORT_SYMBOL_GPL(rtnl_unregister); /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol @@ -384,46 +507,86 @@ void rtnl_unregister_all(int protocol) } EXPORT_SYMBOL_GPL(rtnl_unregister_all); -static LIST_HEAD(link_ops); - -static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +/** + * __rtnl_register_many - Register rtnetlink message types + * @handlers: Array of struct rtnl_msg_handlers + * @n: The length of @handlers + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. + * + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. + * + * When one element of @handlers fails to register, + * 1) built-in: panics. + * 2) modules : the previous successful registrations are unwinded + * and an error is returned. + * + * Use rtnl_register_many(). 
+ */ +int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) { - const struct rtnl_link_ops *ops; + const struct rtnl_msg_handler *handler; + int i, err; + + for (i = 0, handler = handlers; i < n; i++, handler++) { + err = rtnl_register_internal(handler->owner, handler->protocol, + handler->msgtype, handler->doit, + handler->dumpit, handler->flags); + if (err) { + if (!handler->owner) + panic("Unable to register rtnetlink message " + "handlers, %pS\n", handlers); - list_for_each_entry(ops, &link_ops, list) { - if (!strcmp(ops->kind, kind)) - return ops; + __rtnl_unregister_many(handlers, i); + break; + } } - return NULL; + + return err; } +EXPORT_SYMBOL_GPL(__rtnl_register_many); -/** - * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. - * @ops: struct rtnl_link_ops * to register - * - * The caller must hold the rtnl_mutex. This function should be used - * by drivers that create devices during module initialization. It - * must be called before registering the devices. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_link_register(struct rtnl_link_ops *ops) +void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) { - if (rtnl_link_ops_get(ops->kind)) - return -EEXIST; + const struct rtnl_msg_handler *handler; + int i; - /* The check for alloc/setup is here because if ops - * does not have that filled up, it is not possible - * to use the ops for creating device. So do not - * fill up dellink as well. That disables rtnl_dellink. - */ - if ((ops->alloc || ops->setup) && !ops->dellink) - ops->dellink = unregister_netdevice_queue; + for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--) + rtnl_unregister(handler->protocol, handler->msgtype); +} +EXPORT_SYMBOL_GPL(__rtnl_unregister_many); - list_add_tail(&ops->list, &link_ops); - return 0; +static DEFINE_MUTEX(link_ops_mutex); +static LIST_HEAD(link_ops); + +static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) +{ + struct rtnl_link_ops *ops; + + rcu_read_lock(); + + list_for_each_entry_rcu(ops, &link_ops, list) { + if (!strcmp(ops->kind, kind)) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } + } + + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; +} + +static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } -EXPORT_SYMBOL_GPL(__rtnl_link_register); /** * rtnl_link_register - Register rtnl_link_ops with rtnetlink. @@ -433,6 +596,7 @@ EXPORT_SYMBOL_GPL(__rtnl_link_register); */ int rtnl_link_register(struct rtnl_link_ops *ops) { + struct rtnl_link_ops *tmp; int err; /* Sanity-check max sizes to avoid stack buffer overflow. */ @@ -440,9 +604,31 @@ int rtnl_link_register(struct rtnl_link_ops *ops) ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) return -EINVAL; - rtnl_lock(); - err = __rtnl_link_register(ops); - rtnl_unlock(); + /* The check for alloc/setup is here because if ops + * does not have that filled up, it is not possible + * to use the ops for creating device. So do not + * fill up dellink as well. That disables rtnl_dellink. 
+ */ + if ((ops->alloc || ops->setup) && !ops->dellink) + ops->dellink = unregister_netdevice_queue; + + err = init_srcu_struct(&ops->srcu); + if (err) + return err; + + mutex_lock(&link_ops_mutex); + + list_for_each_entry(tmp, &link_ops, list) { + if (!strcmp(ops->kind, tmp->kind)) { + err = -EEXIST; + goto unlock; + } + } + + list_add_tail_rcu(&ops->list, &link_ops); +unlock: + mutex_unlock(&link_ops_mutex); + return err; } EXPORT_SYMBOL_GPL(rtnl_link_register); @@ -459,25 +645,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) unregister_netdevice_many(&list_kill); } -/** - * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. - * @ops: struct rtnl_link_ops * to unregister - * - * The caller must hold the rtnl_mutex and guarantee net_namespace_list - * integrity (hold pernet_ops_rwsem for writing to close the race - * with setup_net() and cleanup_net()). - */ -void __rtnl_link_unregister(struct rtnl_link_ops *ops) -{ - struct net *net; - - for_each_net(net) { - __rtnl_kill_links(net, ops); - } - list_del(&ops->list); -} -EXPORT_SYMBOL_GPL(__rtnl_link_unregister); - /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ @@ -506,10 +673,22 @@ static void rtnl_lock_unregistering_all(void) */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { + struct net *net; + + mutex_lock(&link_ops_mutex); + list_del_rcu(&ops->list); + mutex_unlock(&link_ops_mutex); + + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); + /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); - __rtnl_link_unregister(ops); + + for_each_net(net) + __rtnl_kill_links(net, ops); + rtnl_unlock(); up_write(&pernet_ops_rwsem); } @@ -566,31 +745,51 @@ static size_t rtnl_link_get_size(const struct net_device *dev) static LIST_HEAD(rtnl_af_ops); -static const struct rtnl_af_ops *rtnl_af_lookup(const int family) +static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index) { - const struct rtnl_af_ops *ops; + struct rtnl_af_ops *ops; ASSERT_RTNL(); - list_for_each_entry(ops, &rtnl_af_ops, list) { - if (ops->family == family) - return ops; + rcu_read_lock(); + + list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { + if (ops->family == family) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } } - return NULL; + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; +} + +static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } /** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * - * Returns 0 on success or a negative error code. + * Return: 0 on success or a negative error code. 
*/ -void rtnl_af_register(struct rtnl_af_ops *ops) +int rtnl_af_register(struct rtnl_af_ops *ops) { + int err = init_srcu_struct(&ops->srcu); + + if (err) + return err; + rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); + + return 0; } EXPORT_SYMBOL_GPL(rtnl_af_register); @@ -605,6 +804,8 @@ void rtnl_af_unregister(struct rtnl_af_ops *ops) rtnl_unlock(); synchronize_rcu(); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); @@ -842,7 +1043,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); -void netdev_set_operstate(struct net_device *dev, int newstate) +void netif_set_operstate(struct net_device *dev, int newstate) { unsigned int old = READ_ONCE(dev->operstate); @@ -851,9 +1052,9 @@ void netdev_set_operstate(struct net_device *dev, int newstate) return; } while (!try_cmpxchg(&dev->operstate, &old, newstate)); - netdev_state_change(dev); + netif_state_change(dev); } -EXPORT_SYMBOL(netdev_set_operstate); +EXPORT_SYMBOL(netif_set_operstate); static void set_operstate(struct net_device *dev, unsigned char transition) { @@ -879,7 +1080,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition) break; } - netdev_set_operstate(dev, operstate); + netif_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) @@ -976,6 +1177,9 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, /* IFLA_VF_STATS_TX_DROPPED */ nla_total_size_64bit(sizeof(__u64))); } + if (dev->netdev_ops->ndo_get_vf_guid) + size += num_vfs * 2 * + nla_total_size(sizeof(struct ifla_vf_guid)); return size; } else return 0; @@ -1036,8 +1240,8 @@ static size_t rtnl_proto_down_size(const struct net_device *dev) { size_t size = nla_total_size(1); - if (dev->proto_down_reason) - size += nla_total_size(0) + nla_total_size(4); + /* Assume dev->proto_down_reason is not zero. 
*/ + size += nla_total_size(0) + nla_total_size(4); return size; } @@ -1092,6 +1296,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + + nla_total_size(1) /* IFLA_NETNS_IMMUTABLE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(4) /* IFLA_LINK_NETNSID */ + nla_total_size(4) /* IFLA_GROUP */ @@ -1118,6 +1323,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + 0; } @@ -1477,13 +1683,15 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, static u32 rtnl_xdp_prog_skb(struct net_device *dev) { const struct bpf_prog *generic_xdp_prog; + u32 res = 0; - ASSERT_RTNL(); + rcu_read_lock(); + generic_xdp_prog = rcu_dereference(dev->xdp_prog); + if (generic_xdp_prog) + res = generic_xdp_prog->aux->id; + rcu_read_unlock(); - generic_xdp_prog = rtnl_dereference(dev->xdp_prog); - if (!generic_xdp_prog) - return 0; - return generic_xdp_prog->aux->id; + return res; } static u32 rtnl_xdp_prog_drv(struct net_device *dev) @@ -1603,7 +1811,8 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) - ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); + ret = nla_put_u32(skb, IFLA_MASTER, + READ_ONCE(upper_dev->ifindex)); rcu_read_unlock(); return ret; @@ -1736,10 +1945,10 @@ static int rtnl_fill_proto_down(struct sk_buff *skb, struct nlattr *pr; u32 preason; - if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down))) goto nla_put_failure; - preason = dev->proto_down_reason; + preason = READ_ONCE(dev->proto_down_reason); if (!preason) return 0; @@ -1812,6 +2021,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, u32 event, int *new_nsid, int new_ifindex, int tgt_netnsid, gfp_t gfp) { + char devname[IFNAMSIZ]; struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct Qdisc *qdisc; @@ -1824,41 +2034,54 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; - ifm->ifi_type = dev->type; - ifm->ifi_index = dev->ifindex; + ifm->ifi_type = READ_ONCE(dev->type); + ifm->ifi_index = READ_ONCE(dev->ifindex); ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; - qdisc = rtnl_dereference(dev->qdisc); - if (nla_put_string(skb, IFLA_IFNAME, dev->name) || - nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || + netdev_copy_name(dev, devname); + if (nla_put_string(skb, IFLA_IFNAME, devname)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) || nla_put_u8(skb, IFLA_OPERSTATE, - netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN) || - nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || - nla_put_u32(skb, IFLA_MTU, dev->mtu) || - nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) || - nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || - nla_put_u32(skb, IFLA_GROUP, dev->group) || - nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || - nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || - nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || - nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || - nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || - nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || - nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) || - nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) || - nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || - nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || + netif_running(dev) ? READ_ONCE(dev->operstate) : + IF_OPER_DOWN) || + nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || + nla_put_u8(skb, IFLA_NETNS_IMMUTABLE, dev->netns_immutable) || + nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || + nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || + nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || + nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) || + nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) || + nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) || + nla_put_u32(skb, IFLA_NUM_TX_QUEUES, + READ_ONCE(dev->num_tx_queues)) || + nla_put_u32(skb, IFLA_GSO_MAX_SEGS, + READ_ONCE(dev->gso_max_segs)) || + nla_put_u32(skb, IFLA_GSO_MAX_SIZE, + READ_ONCE(dev->gso_max_size)) || + nla_put_u32(skb, IFLA_GRO_MAX_SIZE, + READ_ONCE(dev->gro_max_size)) || + nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, + READ_ONCE(dev->gso_ipv4_max_size)) || + nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, + READ_ONCE(dev->gro_ipv4_max_size)) || + nla_put_u32(skb, IFLA_TSO_MAX_SIZE, + READ_ONCE(dev->tso_max_size)) || + nla_put_u32(skb, IFLA_TSO_MAX_SEGS, + READ_ONCE(dev->tso_max_segs)) || + nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON, + READ_ONCE(dev->max_pacing_offload_horizon)) || #ifdef CONFIG_RPS - nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || + nla_put_u32(skb, IFLA_NUM_RX_QUEUES, + READ_ONCE(dev->num_rx_queues)) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || - (qdisc && - nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + @@ -1909,9 +2132,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; } - if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) - goto nla_put_failure; - if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; @@ -1924,6 +2144,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; rcu_read_lock(); + if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC)) + goto nla_put_failure_rcu; + qdisc = rcu_dereference(dev->qdisc); + if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) + goto nla_put_failure_rcu; if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; if (rtnl_fill_link_ifmap(skb, dev)) @@ -1959,6 +2184,7 @@ nla_put_failure: } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN }, [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = 
MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, @@ -1987,7 +2213,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, - [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, @@ -2012,8 +2238,9 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, [IFLA_ALLMULTI] = { .type = NLA_REJECT }, - [IFLA_GSO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2067,10 +2294,11 @@ static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; -static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) +static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla, + int *ops_srcu_index) { - const struct rtnl_link_ops *ops = NULL; struct nlattr *linfo[IFLA_INFO_MAX + 1]; + struct rtnl_link_ops *ops = NULL; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; @@ -2079,7 +2307,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); + ops = rtnl_link_ops_get(kind, ops_srcu_index); } return ops; @@ -2162,12 +2390,12 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, if (strict_check) { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); @@ -2199,8 +2427,8 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { - const struct rtnl_link_ops *kind_ops = NULL; struct netlink_ext_ack *extack = cb->extack; + struct rtnl_link_ops *kind_ops = NULL; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; @@ -2211,6 +2439,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net *tgt_net = net; u32 ext_filter_mask = 0; struct net_device *dev; + int ops_srcu_index; int master_idx = 0; int netnsid = -1; int err, i; @@ -2234,7 +2463,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); if (IS_ERR(tgt_net)) { NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); - return PTR_ERR(tgt_net); + err = PTR_ERR(tgt_net); + netnsid = -1; + goto out; } break; case IFLA_EXT_MASK: @@ -2244,12 +2475,13 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback 
*cb) master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: - kind_ops = linkinfo_to_kind_ops(tb[i]); + kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index); break; default: if (cb->strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); - return -EINVAL; + err = -EINVAL; + goto out; } } } @@ -2270,8 +2502,15 @@ walk_entries: if (err < 0) break; } + + cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + +out: + + if (kind_ops) + rtnl_link_ops_put(kind_ops, ops_srcu_index); if (netnsid >= 0) put_net(tgt_net); @@ -2300,9 +2539,10 @@ int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, } EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); -struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[]) { - struct net *net; + struct net *net = NULL; + /* Examine the link attributes and figure out which * network namespace we are talking about. */ @@ -2310,8 +2550,17 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); - else + + return net; +} + +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net = rtnl_link_get_net_ifla(tb); + + if (!net) net = get_net(src_net); + return net; } EXPORT_SYMBOL(rtnl_link_get_net); @@ -2451,20 +2700,24 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - af_ops = rtnl_af_lookup(nla_type(af)); + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) - return -EOPNOTSUPP; - - if (af_ops->validate_link_af) { + err = -EOPNOTSUPP; + else if (af_ops->validate_link_af) err = af_ops->validate_link_af(dev, af, extack); - if (err < 0) - return err; - } + else + err = 0; + + rtnl_af_put(af_ops, af_ops_srcu_index); + + if (err < 0) + return err; } } @@ -2530,7 +2783,7 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { if (nla_type(attr) != IFLA_VF_VLAN_INFO || - nla_len(attr) < NLA_HDRLEN) { + nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) { return -EINVAL; } if (len >= MAX_VLAN_LIST_LEN) @@ -2663,12 +2916,19 @@ static int do_set_master(struct net_device *dev, int ifindex, const struct net_device_ops *ops; int err; + /* Release the lower lock, the upper is responsible for locking + * the lower if needed. None of the existing upper devices + * use netdev instance lock, so don't grab it. 
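The do_set_master() hunk here establishes the locking rule spelled out in the new comment: the lower device's instance lock is dropped around the master's ndo_{add,del}_slave callbacks, because the upper driver decides whether the lower needs locking at all. A minimal sketch of that calling pattern, written as if it sat next to do_set_master() in rtnetlink.c where netdev_lock_ops()/netdev_unlock_ops() are in scope; the enslave_one() helper name is invented for illustration:

#include <linux/errno.h>
#include <linux/netdevice.h>

/* Sketch only: hand a lower device to an upper driver while the caller
 * holds the lower's instance lock (as do_setlink() now does).
 */
static int enslave_one(struct net_device *dev, struct net_device *master,
		       struct netlink_ext_ack *extack)
{
	const struct net_device_ops *ops = master->netdev_ops;
	int err;

	if (!ops->ndo_add_slave)
		return -EOPNOTSUPP;

	netdev_unlock_ops(dev);	/* upper takes the lower lock itself if needed */
	err = ops->ndo_add_slave(master, dev, extack);
	netdev_lock_ops(dev);	/* restore the caller's locking state */

	return err;
}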
+ */ + if (upper_dev) { if (upper_dev->ifindex == ifindex) return 0; ops = upper_dev->netdev_ops; if (ops->ndo_del_slave) { + netdev_unlock_ops(dev); err = ops->ndo_del_slave(upper_dev, dev); + netdev_lock_ops(dev); if (err) return err; } else { @@ -2682,7 +2942,9 @@ static int do_set_master(struct net_device *dev, int ifindex, return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { + netdev_unlock_ops(dev); err = ops->ndo_add_slave(upper_dev, dev, extack); + netdev_lock_ops(dev); if (err) return err; } else { @@ -2708,7 +2970,7 @@ static int do_set_proto_down(struct net_device *dev, bool proto_down; int err; - if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) { + if (!dev->change_proto_down) { NL_SET_ERR_MSG(extack, "Protodown not supported by device"); return -EOPNOTSUPP; } @@ -2732,7 +2994,7 @@ static int do_set_proto_down(struct net_device *dev, if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); - dev_change_proto_down_reason(dev, mask, value); + netdev_change_proto_down_reason_locked(dev, mask, value); } if (nl_proto_down) { @@ -2743,8 +3005,7 @@ static int do_set_proto_down(struct net_device *dev, NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); return -EBUSY; } - err = dev_change_proto_down(dev, - proto_down); + err = netif_change_proto_down(dev, proto_down); if (err) return err; } @@ -2755,8 +3016,8 @@ static int do_set_proto_down(struct net_device *dev, #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 -static int do_setlink(const struct sk_buff *skb, - struct net_device *dev, struct ifinfomsg *ifm, +static int do_setlink(const struct sk_buff *skb, struct net_device *dev, + struct net *tgt_net, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb, int status) { @@ -2764,35 +3025,31 @@ static int do_setlink(const struct sk_buff *skb, char ifname[IFNAMSIZ]; int err; + err = validate_linkmsg(dev, tb, extack); + if (err < 0) + return err; + if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; - if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { + if (!net_eq(tgt_net, dev_net(dev))) { const char *pat = ifname[0] ? 
ifname : NULL; - struct net *net; int new_ifindex; - net = rtnl_link_get_net_capable(skb, dev_net(dev), - tb, CAP_NET_ADMIN); - if (IS_ERR(net)) { - err = PTR_ERR(net); - goto errout; - } - - if (tb[IFLA_NEW_IFINDEX]) - new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); - else - new_ifindex = 0; + new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0); - err = __dev_change_net_namespace(dev, net, pat, new_ifindex); - put_net(net); + err = __dev_change_net_namespace(dev, tgt_net, pat, + new_ifindex, extack); if (err) - goto errout; + return err; + status |= DO_SETLINK_MODIFIED; } + netdev_lock_ops(dev); + if (tb[IFLA_MAP]) { struct rtnl_link_ifmap *u_map; struct ifmap k_map; @@ -2823,35 +3080,35 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_ADDRESS]) { - struct sockaddr *sa; - int len; - - len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, - sizeof(*sa)); - sa = kmalloc(len, GFP_KERNEL); - if (!sa) { - err = -ENOMEM; + struct sockaddr_storage ss = { }; + + netdev_unlock_ops(dev); + + /* dev_addr_sem is an outer lock, enforce proper ordering */ + down_write(&dev_addr_sem); + netdev_lock_ops(dev); + + ss.ss_family = dev->type; + memcpy(ss.__data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); + err = netif_set_mac_address(dev, &ss, extack); + if (err) { + up_write(&dev_addr_sem); goto errout; } - sa->sa_family = dev->type; - memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), - dev->addr_len); - err = dev_set_mac_address_user(dev, sa, extack); - kfree(sa); - if (err) - goto errout; status |= DO_SETLINK_MODIFIED; + + up_write(&dev_addr_sem); } if (tb[IFLA_MTU]) { - err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); + err = netif_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { - dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); status |= DO_SETLINK_NOTIFY; } @@ -2861,15 +3118,15 @@ static int do_setlink(const struct sk_buff *skb, * requested. 
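The IFLA_ADDRESS hunk just above changes two things at once: the temporary kmalloc'ed sockaddr becomes an on-stack sockaddr_storage, and dev_addr_sem becomes the outer lock relative to the netdev instance lock. A condensed sketch of that ordering, assuming it lives in rtnetlink.c where dev_addr_sem and netif_set_mac_address() are visible; set_mac_from_attr() is an invented helper name and the caller is assumed to enter with the instance lock held:

#include <linux/netdevice.h>
#include <linux/string.h>
#include <net/netlink.h>

/* Sketch: the new MAC-change lock ordering in do_setlink().
 * dev_addr_sem (outer) must be taken before re-taking the instance lock.
 */
static int set_mac_from_attr(struct net_device *dev, const struct nlattr *attr,
			     struct netlink_ext_ack *extack)
{
	struct sockaddr_storage ss = { };
	int err;

	netdev_unlock_ops(dev);		/* drop the inner lock first */
	down_write(&dev_addr_sem);	/* outer lock */
	netdev_lock_ops(dev);		/* inner lock, now in the right order */

	ss.ss_family = dev->type;
	memcpy(ss.__data, nla_data(attr), dev->addr_len);
	err = netif_set_mac_address(dev, &ss, extack);

	up_write(&dev_addr_sem);	/* instance lock stays held, as in do_setlink() */
	return err;
}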
*/ if (ifm->ifi_index > 0 && ifname[0]) { - err = dev_change_name(dev, ifname); + err = netif_change_name(dev, ifname); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { - err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), - nla_len(tb[IFLA_IFALIAS])); + err = netif_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), + nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; @@ -2881,8 +3138,8 @@ static int do_setlink(const struct sk_buff *skb, } if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), - extack); + err = netif_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); if (err < 0) goto errout; } @@ -2895,7 +3152,7 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_CARRIER]) { - err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); + err = netif_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; status |= DO_SETLINK_MODIFIED; @@ -2904,7 +3161,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_TXQLEN]) { unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); - err = dev_change_tx_queue_len(dev, value); + err = netif_change_tx_queue_len(dev, value); if (err) goto errout; status |= DO_SETLINK_MODIFIED; @@ -3048,11 +3305,18 @@ static int do_setlink(const struct sk_buff *skb, int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); + if (!af_ops) { + err = -EAFNOSUPPORT; + goto errout; + } err = af_ops->set_link_af(dev, af, extack); + rtnl_af_put(af_ops, af_ops_srcu_index); + if (err < 0) goto errout; @@ -3121,13 +3385,15 @@ static int do_setlink(const struct sk_buff *skb, errout: if (status & DO_SETLINK_MODIFIED) { if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY) - netdev_state_change(dev); + netif_state_change(dev); if (err < 0) net_warn_ratelimited("A link change request failed with some changes committed already. 
Interface %s may have been left with an inconsistent configuration, please check.\n", dev->name); } + netdev_unlock_ops(dev); + return err; } @@ -3149,11 +3415,13 @@ static struct net_device *rtnl_dev_get(struct net *net, static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); - struct ifinfomsg *ifm; - struct net_device *dev; - int err; struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev = NULL; + struct rtnl_nets rtnl_nets; + struct net *tgt_net; + int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); @@ -3164,25 +3432,32 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; - err = -EINVAL; - ifm = nlmsg_data(nlh); + tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(tgt_net)) { + err = PTR_ERR(tgt_net); + goto errout; + } + + rtnl_nets_init(&rtnl_nets); + rtnl_nets_add(&rtnl_nets, get_net(net)); + rtnl_nets_add(&rtnl_nets, tgt_net); + + rtnl_nets_lock(&rtnl_nets); + if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else - goto errout; + err = -EINVAL; - if (dev == NULL) { + if (dev) + err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); + else if (!err) err = -ENODEV; - goto errout; - } - - err = validate_linkmsg(dev, tb, extack); - if (err < 0) - goto errout; - err = do_setlink(skb, dev, ifm, extack, tb, 0); + rtnl_nets_unlock(&rtnl_nets); + rtnl_nets_destroy(&rtnl_nets); errout: return err; } @@ -3242,14 +3517,14 @@ EXPORT_SYMBOL_GPL(rtnl_delete_link); static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; - struct net *tgt_net = net; - struct net_device *dev = NULL; - struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; - int err; + struct net_device *dev = NULL; + struct net *tgt_net = net; int netnsid = -1; + int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); @@ -3267,27 +3542,24 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } - err = -EINVAL; - ifm = nlmsg_data(nlh); + rtnl_net_lock(tgt_net); + if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) - dev = rtnl_dev_get(net, tb); + dev = rtnl_dev_get(tgt_net, tb); + + if (dev) + err = rtnl_delete_link(dev, portid, nlh); + else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + err = -ENODEV; else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else - goto out; - - if (!dev) { - if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME] || ifm->ifi_index > 0) - err = -ENODEV; - - goto out; - } + err = -EINVAL; - err = rtnl_delete_link(dev, portid, nlh); + rtnl_net_unlock(tgt_net); -out: if (netnsid >= 0) put_net(tgt_net); @@ -3297,7 +3569,7 @@ out: int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh) { - unsigned int old_flags; + unsigned int old_flags, changed; int err; old_flags = dev->flags; @@ -3308,12 +3580,13 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, return err; } - if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - 
__dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh); - } else { - dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - __dev_notify_flags(dev, old_flags, ~0U, portid, nlh); + changed = old_flags ^ dev->flags; + if (dev->rtnl_link_initializing) { + dev->rtnl_link_initializing = false; + changed = ~0U; } + + __dev_notify_flags(dev, old_flags, changed, portid, nlh); return 0; } EXPORT_SYMBOL(rtnl_configure_link); @@ -3371,7 +3644,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, dev_net_set(dev, net); dev->rtnl_link_ops = ops; - dev->rtnl_link_state = RTNL_LINK_INITIALIZING; + dev->rtnl_link_initializing = true; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); @@ -3398,7 +3671,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) - dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) @@ -3414,21 +3687,90 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, } EXPORT_SYMBOL(rtnl_create_link); +struct rtnl_newlink_tbs { + struct nlattr *tb[IFLA_MAX + 1]; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + struct nlattr *attr[RTNL_MAX_TYPE + 1]; + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; +}; + +static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, + const struct rtnl_link_ops *ops, + struct net_device *dev, struct net *tgt_net, + struct rtnl_newlink_tbs *tbs, + struct nlattr **data, + struct netlink_ext_ack *extack) +{ + struct nlattr ** const linkinfo = tbs->linkinfo; + struct nlattr ** const tb = tbs->tb; + int status = 0; + int err; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) + return -EOPNOTSUPP; + + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + + status |= DO_SETLINK_NOTIFY; + } + + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + const struct rtnl_link_ops *m_ops = NULL; + struct nlattr **slave_data = NULL; + struct net_device *master_dev; + + master_dev = netdev_master_upper_dev_get(dev); + if (master_dev) + m_ops = master_dev->rtnl_link_ops; + + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; + + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; + + if (m_ops->slave_maxtype) { + err = nla_parse_nested_deprecated(tbs->slave_attr, + m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, extack); + if (err < 0) + return err; + + slave_data = tbs->slave_attr; + } + + err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack); + if (err < 0) + return err; + + status |= DO_SETLINK_NOTIFY; + } + + return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status); +} + static int rtnl_group_changelink(const struct sk_buff *skb, - struct net *net, int group, - struct ifinfomsg *ifm, - struct netlink_ext_ack *extack, - struct nlattr **tb) + struct net *net, struct net *tgt_net, + int group, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, + struct nlattr **tb) { struct net_device *dev, *aux; int err; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { - err = validate_linkmsg(dev, tb, extack); - if (err < 0) - return err; - err = 
do_setlink(skb, dev, ifm, extack, tb, 0); + err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); if (err < 0) return err; } @@ -3439,14 +3781,21 @@ static int rtnl_group_changelink(const struct sk_buff *skb, static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, + struct net *tgt_net, struct net *link_net, + struct net *peer_net, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) { unsigned char name_assign_type = NET_NAME_USER; - struct net *net = sock_net(skb->sk); + struct rtnl_newlink_params params = { + .src_net = sock_net(skb->sk), + .link_net = link_net, + .peer_net = peer_net, + .tb = tb, + .data = data, + }; u32 portid = NETLINK_CB(skb).portid; - struct net *dest_net, *link_net; struct net_device *dev; char ifname[IFNAMSIZ]; int err; @@ -3461,28 +3810,8 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, name_assign_type = NET_NAME_ENUM; } - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); - - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - - link_net = get_net_ns_by_id(dest_net, id); - if (!link_net) { - NL_SET_ERR_MSG(extack, "Unknown network namespace id"); - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } else { - link_net = NULL; - } - - dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb, extack); + dev = rtnl_create_link(tgt_net, ifname, name_assign_type, ops, tb, + extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; @@ -3491,7 +3820,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, dev->ifindex = ifm->ifi_index; if (ops->newlink) - err = ops->newlink(link_net ? 
: net, dev, tb, data, extack); + err = ops->newlink(dev, &params, extack); else err = register_netdevice(dev); if (err < 0) { @@ -3499,25 +3828,22 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, goto out; } + netdev_lock_ops(dev); + err = rtnl_configure_link(dev, ifm, portid, nlh); if (err < 0) goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); - if (err < 0) - goto out_unregister; - } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto out_unregister; } + + netdev_unlock_ops(dev); out: - if (link_net) - put_net(link_net); - put_net(dest_net); return err; out_unregister: + netdev_unlock_ops(dev); if (ops->newlink) { LIST_HEAD(list_kill); @@ -3529,202 +3855,212 @@ out_unregister: goto out; } -struct rtnl_newlink_tbs { +static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, + struct nlattr *tbp[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ struct nlattr *tb[IFLA_MAX + 1]; - struct nlattr *attr[RTNL_MAX_TYPE + 1]; - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; -}; + int err; + + if (!data || !data[ops->peer_type]) + return rtnl_link_get_net_ifla(tbp); + + err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); + if (err < 0) + return ERR_PTR(err); + + if (ops->validate) { + err = ops->validate(tb, NULL, extack); + if (err < 0) + return ERR_PTR(err); + } + + return rtnl_link_get_net_ifla(tb); +} static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + const struct rtnl_link_ops *ops, + struct net *tgt_net, struct net *link_net, + struct net *peer_net, struct rtnl_newlink_tbs *tbs, + struct nlattr **data, struct netlink_ext_ack *extack) { - struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; struct nlattr ** const tb = tbs->tb; - const struct rtnl_link_ops *m_ops; - struct net_device *master_dev; struct net *net = sock_net(skb->sk); - const struct rtnl_link_ops *ops; - struct nlattr **slave_data; - char kind[MODULE_NAME_LEN]; + struct net *device_net; struct net_device *dev; struct ifinfomsg *ifm; - struct nlattr **data; bool link_specified; - int err; -#ifdef CONFIG_MODULES -replay: -#endif - err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, - ifla_policy, extack); - if (err < 0) - return err; - - err = rtnl_ensure_unique_netns(tb, extack, false); - if (err < 0) - return err; + /* When creating, lookup for existing device in target net namespace */ + device_net = (nlh->nlmsg_flags & NLM_F_CREATE) && + (nlh->nlmsg_flags & NLM_F_EXCL) ? + tgt_net : net; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { link_specified = true; - dev = __dev_get_by_index(net, ifm->ifi_index); + dev = __dev_get_by_index(device_net, ifm->ifi_index); } else if (ifm->ifi_index < 0) { NL_SET_ERR_MSG(extack, "ifindex can't be negative"); return -EINVAL; } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { link_specified = true; - dev = rtnl_dev_get(net, tb); + dev = rtnl_dev_get(device_net, tb); } else { link_specified = false; dev = NULL; } - master_dev = NULL; - m_ops = NULL; - if (dev) { - master_dev = netdev_master_upper_dev_get(dev); - if (master_dev) - m_ops = master_dev->rtnl_link_ops; + if (dev) + return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack); + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + /* No dev found and NLM_F_CREATE not set. 
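From this point in the series, ->newlink() no longer receives the namespace and the attribute tables as separate arguments; rtnl_newlink_create() packs them into struct rtnl_newlink_params (src_net, link_net, peer_net, tb, data) and passes a single pointer. A hedged sketch of what a converted driver callback could look like; foo_newlink() is an invented example and only the params fields initialized above are relied on:

#include <linux/errno.h>
#include <linux/netdevice.h>
#include <net/rtnetlink.h>

/* Sketch, not a real driver: ->newlink() after the rtnl_newlink_params
 * conversion. The prototype mirrors the ops->newlink(dev, &params, extack)
 * call site above.
 */
static int foo_newlink(struct net_device *dev,
		       struct rtnl_newlink_params *params,
		       struct netlink_ext_ack *extack)
{
	/* IFLA_LINK_NETNSID target if given, otherwise the requesting netns */
	struct net *link_net = params->link_net ? : params->src_net;
	struct nlattr **tb = params->tb;

	if (tb[IFLA_ADDRESS]) {
		NL_SET_ERR_MSG(extack, "foo does not support setting a MAC address");
		return -EOPNOTSUPP;
	}

	(void)link_net;	/* a real driver would look up its lower device here */

	return register_netdevice(dev);
}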
Requested dev does not exist, + * or it's for a group + */ + if (link_specified || !tb[IFLA_GROUP]) + return -ENODEV; + + return rtnl_group_changelink(skb, net, tgt_net, + nla_get_u32(tb[IFLA_GROUP]), + ifm, extack, tb); } + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; + + if (!ops) { + NL_SET_ERR_MSG(extack, "Unknown device type"); + return -EOPNOTSUPP; + } + + return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh, + tb, data, extack); +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *tgt_net, *link_net = NULL, *peer_net = NULL; + struct nlattr **tb, **linkinfo, **data = NULL; + struct rtnl_link_ops *ops = NULL; + struct rtnl_newlink_tbs *tbs; + struct rtnl_nets rtnl_nets; + int ops_srcu_index; + int ret; + + tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); + if (!tbs) + return -ENOMEM; + + tb = tbs->tb; + ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, extack); + if (ret < 0) + goto free; + + ret = rtnl_ensure_unique_netns(tb, extack, false); + if (ret < 0) + goto free; + + linkinfo = tbs->linkinfo; if (tb[IFLA_LINKINFO]) { - err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, + ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], ifla_info_policy, NULL); - if (err < 0) - return err; - } else - memset(linkinfo, 0, sizeof(linkinfo)); + if (ret < 0) + goto free; + } else { + memset(linkinfo, 0, sizeof(tbs->linkinfo)); + } if (linkinfo[IFLA_INFO_KIND]) { + char kind[MODULE_NAME_LEN]; + nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); - } else { - kind[0] = '\0'; - ops = NULL; + ops = rtnl_link_ops_get(kind, &ops_srcu_index); +#ifdef CONFIG_MODULES + if (!ops) { + request_module("rtnl-link-%s", kind); + ops = rtnl_link_ops_get(kind, &ops_srcu_index); + } +#endif } - data = NULL; + rtnl_nets_init(&rtnl_nets); + if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; + if (ops->maxtype > RTNL_MAX_TYPE) { + ret = -EINVAL; + goto put_ops; + } if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, + ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], ops->policy, extack); - if (err < 0) - return err; + if (ret < 0) + goto put_ops; + data = tbs->attr; } + if (ops->validate) { - err = ops->validate(tb, data, extack); - if (err < 0) - return err; + ret = ops->validate(tb, data, extack); + if (ret < 0) + goto put_ops; } - } - slave_data = NULL; - if (m_ops) { - if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) - return -EINVAL; - - if (m_ops->slave_maxtype && - linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested_deprecated(tbs->slave_attr, - m_ops->slave_maxtype, - linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy, - extack); - if (err < 0) - return err; - slave_data = tbs->slave_attr; + if (ops->peer_type) { + peer_net = rtnl_get_peer_net(ops, tb, data, extack); + if (IS_ERR(peer_net)) { + ret = PTR_ERR(peer_net); + goto put_ops; + } + if (peer_net) + rtnl_nets_add(&rtnl_nets, peer_net); } } - if (dev) { - int status = 0; - - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; + tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN); + if (IS_ERR(tgt_net)) { + ret = PTR_ERR(tgt_net); + goto put_net; + } - err = validate_linkmsg(dev, tb, extack); - if (err < 0) - return err; + 
rtnl_nets_add(&rtnl_nets, tgt_net); - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - err = ops->changelink(dev, tb, data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; + link_net = get_net_ns_by_id(tgt_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + ret = -EINVAL; + goto put_net; } - if (linkinfo[IFLA_INFO_SLAVE_DATA]) { - if (!m_ops || !m_ops->slave_changelink) - return -EOPNOTSUPP; + rtnl_nets_add(&rtnl_nets, link_net); - err = m_ops->slave_changelink(master_dev, dev, tb, - slave_data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + goto put_net; } - - return do_setlink(skb, dev, ifm, extack, tb, status); } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, - * or it's for a group - */ - if (link_specified) - return -ENODEV; - if (tb[IFLA_GROUP]) - return rtnl_group_changelink(skb, net, - nla_get_u32(tb[IFLA_GROUP]), - ifm, extack, tb); - return -ENODEV; - } + rtnl_nets_lock(&rtnl_nets); + ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack); + rtnl_nets_unlock(&rtnl_nets); - if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) - return -EOPNOTSUPP; - - if (!ops) { -#ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; - } -#endif - NL_SET_ERR_MSG(extack, "Unknown device type"); - return -EOPNOTSUPP; - } - - return rtnl_newlink_create(skb, ifm, ops, nlh, tb, data, extack); -} - -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct rtnl_newlink_tbs *tbs; - int ret; - - tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); - if (!tbs) - return -ENOMEM; - - ret = __rtnl_newlink(skb, nlh, tbs, extack); +put_net: + rtnl_nets_destroy(&rtnl_nets); +put_ops: + if (ops) + rtnl_link_ops_put(ops, ops_srcu_index); +free: kfree(tbs); return ret; } @@ -3737,7 +4073,8 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, struct ifinfomsg *ifm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for get link"); return -EINVAL; } @@ -3746,7 +4083,6 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); @@ -3953,22 +4289,28 @@ static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); } -static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) +static noinline_for_stack u32 rtnl_calcit(struct sk_buff *skb, + struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); size_t min_ifinfo_dump_size = 0; - struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; struct net_device *dev; - int hdrlen; + struct nlattr *nla; + int hdrlen, rem; /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? 
sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); - if (nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { - if (tb[IFLA_EXT_MASK]) - ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return NLMSG_GOODSIZE; + + nla_for_each_attr_type(nla, IFLA_EXT_MASK, + nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), rem) { + if (nla_len(nla) == sizeof(u32)) + ext_filter_mask = nla_get_u32(nla); } if (!ext_filter_mask) @@ -4065,8 +4407,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, } return skb; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_LINK, err); + rtnl_set_sk_err(net, RTNLGRP_LINK, err); return NULL; } @@ -4291,9 +4632,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; + bool notified = false; err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, - nlh->nlmsg_flags, extack); + nlh->nlmsg_flags, &notified, extack); if (err) goto out; else @@ -4302,16 +4644,18 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF)) { + bool notified = false; + if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, - extack); + &notified, extack); else err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); - if (!err) { + if (!err && !notified) { rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; @@ -4411,11 +4755,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); + bool notified = false; ops = br_dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, + &notified, extack); } else { if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(nlh, dev, extack); @@ -4429,10 +4775,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { + bool notified = false; + ops = dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, + &notified, extack); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); } else { @@ -4443,7 +4792,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, } if (!err) { - if (!del_bulk) + if (!del_bulk && !notified) rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; @@ -4459,15 +4808,16 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, int *idx, struct netdev_hw_addr_list *list) { + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct netdev_hw_addr *ha; - int err; u32 portid, seq; + int err; portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { - if (*idx < cb->args[2]) + if (*idx < ctx->fdb_idx) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, @@ -4523,12 +4873,12 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < 
nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); @@ -4606,18 +4956,16 @@ static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { - struct net_device *dev; - struct net_device *br_dev = NULL; - const struct net_device_ops *ops = NULL; - const struct net_device_ops *cops = NULL; + const struct net_device_ops *ops = NULL, *cops = NULL; + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; + struct net_device *dev, *br_dev = NULL; struct net *net = sock_net(skb->sk); - struct hlist_head *head; int brport_idx = 0; int br_idx = 0; - int h, s_h; - int idx = 0, s_idx; - int err = 0; int fidx = 0; + int err; + + NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context); if (cb->strict_check) err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, @@ -4636,70 +4984,51 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } - s_h = cb->args[0]; - s_idx = cb->args[1]; - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - - if (brport_idx && (dev->ifindex != brport_idx)) - continue; - - if (!br_idx) { /* user did not specify a specific bridge */ - if (netif_is_bridge_port(dev)) { - br_dev = netdev_master_upper_dev_get(dev); - cops = br_dev->netdev_ops; - } - } else { - if (dev != br_dev && - !netif_is_bridge_port(dev)) - continue; + for_each_netdev_dump(net, dev, ctx->ifindex) { + if (brport_idx && (dev->ifindex != brport_idx)) + continue; - if (br_dev != netdev_master_upper_dev_get(dev) && - !netif_is_bridge_master(dev)) - continue; - cops = ops; + if (!br_idx) { /* user did not specify a specific bridge */ + if (netif_is_bridge_port(dev)) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; } + } else { + if (dev != br_dev && + !netif_is_bridge_port(dev)) + continue; - if (idx < s_idx) - goto cont; + if (br_dev != netdev_master_upper_dev_get(dev) && + !netif_is_bridge_master(dev)) + continue; + cops = ops; + } - if (netif_is_bridge_port(dev)) { - if (cops && cops->ndo_fdb_dump) { - err = cops->ndo_fdb_dump(skb, cb, - br_dev, dev, - &fidx); - if (err == -EMSGSIZE) - goto out; - } + if (netif_is_bridge_port(dev)) { + if (cops && cops->ndo_fdb_dump) { + err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, + &fidx); + if (err == -EMSGSIZE) + break; } + } - if (dev->netdev_ops->ndo_fdb_dump) - err = dev->netdev_ops->ndo_fdb_dump(skb, cb, - dev, NULL, - &fidx); - else - err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, - &fidx); - if (err == -EMSGSIZE) - goto out; + if (dev->netdev_ops->ndo_fdb_dump) + err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, + &fidx); + else + err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx); + if (err == -EMSGSIZE) + break; - cops = NULL; + cops = NULL; - /* reset fdb offset to 0 for rest of the interfaces */ - cb->args[2] = 0; - fidx = 0; -cont: - idx++; - } + /* reset fdb offset to 0 for rest of the interfaces */ + ctx->fdb_idx = 0; + fidx = 0; } -out: - cb->args[0] = h; - cb->args[1] = idx; - cb->args[2] = fidx; + ctx->fdb_idx = fidx; return skb->len; } @@ -4712,12 +5041,12 @@ static int valid_fdb_get_strict(const struct nlmsghdr 
*nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); @@ -4984,12 +5313,12 @@ static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, if (strict_check) { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); @@ -5245,15 +5574,14 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { - nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { - if (nla_len(attr) < sizeof(flags)) - return -EINVAL; + nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec, + rem) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; - have_flags = true; - flags = nla_get_u16(attr); - break; - } + have_flags = true; + flags = nla_get_u16(attr); + break; } } @@ -5882,7 +6210,8 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, { struct if_stats_msg *ifsm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { + ifsm = nlmsg_payload(nlh, sizeof(*ifsm)); + if (!ifsm) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } @@ -5890,8 +6219,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, if (!strict_check) return 0; - ifsm = nlmsg_data(nlh); - /* only requests using strict checks can pass data to influence * the dump. The legacy exception is filter_mask. 
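Every strict request validator in this file (link, fdb, bridge, stats and mdb dumps) is converted to the same shape: one nlmsg_payload() call replaces the explicit nlmsg_len check plus nlmsg_data(). A minimal sketch of that pattern; foo_valid_getlink_req() is a stand-in name, the rest mirrors rtnl_valid_getlink_req() above:

#include <linux/rtnetlink.h>
#include <net/netlink.h>

/* Sketch: strict header validation after the nlmsg_payload() conversion. */
static int foo_valid_getlink_req(const struct nlmsghdr *nlh,
				 struct netlink_ext_ack *extack)
{
	const struct ifinfomsg *ifm;

	/* NULL means the message cannot even hold the fixed header */
	ifm = nlmsg_payload(nlh, sizeof(*ifm));
	if (!ifm) {
		NL_SET_ERR_MSG(extack, "Invalid header for get link");
		return -EINVAL;
	}

	if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
	    ifm->ifi_change) {
		NL_SET_ERR_MSG(extack, "Invalid values in header for get link request");
		return -EINVAL;
	}

	return 0;
}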
*/ @@ -5962,19 +6289,17 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; - int h, s_h, err, s_idx, s_idxattr, s_prividx; struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; - struct hlist_head *head; + struct { + unsigned long ifindex; + int idxattr; + int prividx; + } *ctx = (void *)cb->ctx; struct net_device *dev; - int idx = 0; - - s_h = cb->args[0]; - s_idx = cb->args[1]; - s_idxattr = cb->args[2]; - s_prividx = cb->args[3]; + int err; cb->seq = net->dev_base_seq; @@ -5993,39 +6318,26 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) if (err) return err; - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - flags, &filters, - &s_idxattr, &s_prividx, - extack); - /* If we ran out of room on the first message, - * we're in trouble - */ - WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + for_each_netdev_dump(net, dev, ctx->ifindex) { + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + flags, &filters, + &ctx->idxattr, &ctx->prividx, + extack); + /* If we ran out of room on the first message, + * we're in trouble. + */ + WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); - if (err < 0) - goto out; - s_prividx = 0; - s_idxattr = 0; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } + if (err < 0) + break; + ctx->prividx = 0; + ctx->idxattr = 0; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } -out: - cb->args[3] = s_prividx; - cb->args[2] = s_idxattr; - cb->args[1] = idx; - cb->args[0] = h; - return skb->len; + return err; } void rtnl_offload_xstats_notify(struct net_device *dev) @@ -6134,12 +6446,12 @@ static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh, { struct br_port_msg *bpm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) { + bpm = nlmsg_payload(nlh, sizeof(*bpm)); + if (!bpm) { NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request"); return -EINVAL; } - bpm = nlmsg_data(nlh); if (bpm->ifindex) { NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request"); return -EINVAL; @@ -6164,7 +6476,7 @@ static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int idx, s_idx; int err; - NL_ASSERT_DUMP_CTX_FITS(struct rtnl_mdb_dump_ctx); + NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx); if (cb->strict_check) { err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack); @@ -6484,6 +6796,52 @@ static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, /* Process one rtnetlink message. */ +static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED); + rtnl_dumpit_func dumpit = cb->data; + int err; + + /* Previous iteration have already finished, avoid calling->dumpit() + * again, it may not expect to be called after it reached the end. + */ + if (!dumpit) + return 0; + + if (needs_lock) + rtnl_lock(); + err = dumpit(skb, cb); + if (needs_lock) + rtnl_unlock(); + + /* Old dump handlers used to send NLM_DONE as in a separate recvmsg(). 
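Both rtnl_fdb_dump() and rtnl_stats_dump() drop the dev_index_head hash walk in favour of for_each_netdev_dump(), with the per-dump cursor stored in a small structure overlaid on cb->ctx (guarded by NL_ASSERT_CTX_FITS). A rough sketch of that style for a hypothetical dumper; foo_dump() and foo_fill() are invented names and foo_fill() is only a stub:

#include <linux/netdevice.h>
#include <net/netlink.h>
#include <net/sock.h>

struct foo_dump_ctx {		/* overlaid on cb->ctx, hence the size assert */
	unsigned long ifindex;
};

static int foo_fill(struct sk_buff *skb, struct net_device *dev)
{
	/* placeholder: a real dumper would append one RTM_* message here
	 * and return -EMSGSIZE when the skb runs out of room
	 */
	return 0;
}

static int foo_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct foo_dump_ctx *ctx = (void *)cb->ctx;
	struct net *net = sock_net(skb->sk);
	struct net_device *dev;
	int err = 0;

	NL_ASSERT_CTX_FITS(struct foo_dump_ctx);

	for_each_netdev_dump(net, dev, ctx->ifindex) {
		err = foo_fill(skb, dev);
		if (err < 0)
			break;	/* ctx->ifindex still names this dev, so the
				 * next dump round resumes right here
				 */
	}

	return err;
}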
+ * Some applications which parse netlink manually depend on this. + */ + if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) { + if (err < 0 && err != -EMSGSIZE) + return err; + if (!err) + cb->data = NULL; + + return skb->len; + } + return err; +} + +static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *control) +{ + if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE || + !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) { + WARN_ON(control->data); + control->data = control->dump; + control->dump = rtnl_dumpit; + } + + return netlink_dump_start(ssk, skb, nlh, control); +} + static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -6548,7 +6906,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, .module = owner, .flags = flags, }; - err = netlink_dump_start(rtnl, skb, nlh, &c); + err = rtnetlink_dump_start(rtnl, skb, nlh, &c); /* netlink_dump_start() will keep a reference on * module if dump is still in progress. */ @@ -6663,7 +7021,6 @@ static int __net_init rtnetlink_net_init(struct net *net) struct netlink_kernel_cfg cfg = { .groups = RTNLGRP_MAX, .input = rtnetlink_rcv, - .cb_mutex = &rtnl_mutex, .flags = NL_CFG_F_NONROOT_RECV, .bind = rtnetlink_bind, }; @@ -6686,6 +7043,41 @@ static struct pernet_operations rtnetlink_net_ops = { .exit = rtnetlink_net_exit, }; +static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_DELLINK, .doit = rtnl_dellink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, + {.msgtype = RTM_GETLINK, .doit = rtnl_getlink, + .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, + {.msgtype = RTM_SETLINK, .doit = rtnl_setlink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, + {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETSTATS, .doit = rtnl_stats_get, + .dumpit = rtnl_stats_dump}, + {.msgtype = RTM_SETSTATS, .doit = rtnl_stats_set}, + {.msgtype = RTM_NEWLINKPROP, .doit = rtnl_newlinkprop}, + {.msgtype = RTM_DELLINKPROP, .doit = rtnl_dellinkprop}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETLINK, + .dumpit = rtnl_bridge_getlink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELLINK, + .doit = rtnl_bridge_dellink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_SETLINK, + .doit = rtnl_bridge_setlink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_NEWNEIGH, .doit = rtnl_fdb_add}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELNEIGH, .doit = rtnl_fdb_del, + .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETNEIGH, .doit = rtnl_fdb_get, + .dumpit = rtnl_fdb_dump}, + {.protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB, .doit = rtnl_mdb_add}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELMDB, .doit = rtnl_mdb_del, + .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETMDB, .doit = rtnl_mdb_get, + .dumpit = rtnl_mdb_dump}, +}; + void __init rtnetlink_init(void) { if (register_pernet_subsys(&rtnetlink_net_ops)) @@ -6693,34 +7085,5 @@ void __init rtnetlink_init(void) register_netdevice_notifier(&rtnetlink_dev_notifier); - rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, - rtnl_dump_ifinfo, 0); - rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0); - 
rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); - - rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0); - rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); - rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); - - rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); - - rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, - RTNL_FLAG_BULK_DEL_SUPPORTED); - rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); - - rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); - rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0); - - rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, - 0); - rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0); - - rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0); - rtnl_register(PF_BRIDGE, RTM_NEWMDB, rtnl_mdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, - RTNL_FLAG_BULK_DEL_SUPPORTED); + rtnl_register_many(rtnetlink_rtnl_msg_handlers); } diff --git a/net/core/scm.c b/net/core/scm.c index 9cd4b0a01cd6..0225bd94170f 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -89,6 +89,12 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; +#if IS_ENABLED(CONFIG_UNIX) + fpl->inflight = false; + fpl->dead = false; + fpl->edges = NULL; + INIT_LIST_HEAD(&fpl->vertices); +#endif } fpp = &fpl->fp[fpl->count]; @@ -276,6 +282,16 @@ efault: } EXPORT_SYMBOL(put_cmsg); +int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data) +{ + /* Don't produce truncated CMSGs */ + if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len)) + return -ETOOSMALL; + + return put_cmsg(msg, level, type, len, data); +} + void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) { struct scm_timestamping64 tss; @@ -376,9 +392,137 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); + new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); +#if IS_ENABLED(CONFIG_UNIX) + new_fpl->inflight = false; + new_fpl->edges = NULL; + INIT_LIST_HEAD(&new_fpl->vertices); +#endif } return new_fpl; } EXPORT_SYMBOL(scm_fp_dup); + +#ifdef CONFIG_SECURITY_NETWORK +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) +{ + struct lsm_context ctx; + int err; + + if (sk->sk_scm_security) { + err = security_secid_to_secctx(scm->secid, &ctx); + + if (err >= 0) { + put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, + ctx.context); + + security_release_secctx(&ctx); + } + } +} + +static bool scm_has_secdata(struct sock *sk) +{ + return sk->sk_scm_security; +} +#else +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) +{ +} + +static bool scm_has_secdata(struct sock *sk) +{ + return false; +} +#endif + +static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) +{ + struct file *pidfd_file = NULL; + int len, pidfd; + + /* put_cmsg() doesn't return an error if CMSG is truncated, + * that's why we need to opencode these checks here. 
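put_cmsg_notrunc(), added in the scm.c hunk above, is an all-or-nothing variant of put_cmsg(): instead of copying what fits, it refuses with -ETOOSMALL when the control buffer cannot hold the complete CMSG. A hedged usage sketch; foo_recv_mark() and SCM_FOO_MARK are invented, and the declaration of put_cmsg_notrunc() is assumed to sit next to put_cmsg():

#include <linux/errno.h>
#include <linux/socket.h>
#include <linux/types.h>

#define SCM_FOO_MARK	0x42	/* placeholder cmsg type, not a real ABI value */

/* Sketch: deliver an ancillary u32 only if it fits completely. */
static void foo_recv_mark(struct msghdr *msg, u32 mark)
{
	int err;

	err = put_cmsg_notrunc(msg, SOL_SOCKET, SCM_FOO_MARK,
			       sizeof(mark), &mark);
	if (err == -ETOOSMALL)
		msg->msg_flags |= MSG_CTRUNC;	/* let userspace know it lost data */
}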
+ */ + if (msg->msg_flags & MSG_CMSG_COMPAT) + len = sizeof(struct compat_cmsghdr) + sizeof(int); + else + len = sizeof(struct cmsghdr) + sizeof(int); + + if (msg->msg_controllen < len) { + msg->msg_flags |= MSG_CTRUNC; + return; + } + + if (!scm->pid) + return; + + pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); + + if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { + if (pidfd_file) { + put_unused_fd(pidfd); + fput(pidfd_file); + } + + return; + } + + if (pidfd_file) + fd_install(pidfd, pidfd_file); +} + +static bool __scm_recv_common(struct sock *sk, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!msg->msg_control) { + if (sk->sk_scm_credentials || sk->sk_scm_pidfd || + scm->fp || scm_has_secdata(sk)) + msg->msg_flags |= MSG_CTRUNC; + + scm_destroy(scm); + return false; + } + + if (sk->sk_scm_credentials) { + struct user_namespace *current_ns = current_user_ns(); + struct ucred ucreds = { + .pid = scm->creds.pid, + .uid = from_kuid_munged(current_ns, scm->creds.uid), + .gid = from_kgid_munged(current_ns, scm->creds.gid), + }; + + put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); + } + + scm_passec(sk, msg, scm); + + if (scm->fp) + scm_detach_fds(msg, scm); + + return true; +} + +void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock->sk, msg, scm, flags)) + return; + + scm_destroy_cred(scm); +} +EXPORT_SYMBOL(scm_recv); + +void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock->sk, msg, scm, flags)) + return; + + if (sock->sk->sk_scm_pidfd) + scm_pidfd_recv(msg, scm); + + scm_destroy_cred(scm); +} diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index b0ff6153be62..9a3965680451 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -71,7 +71,7 @@ u32 secure_tcpv6_ts_off(const struct net *net, return siphash(&combined, offsetofend(typeof(combined), daddr), &ts_secret); } -EXPORT_SYMBOL(secure_tcpv6_ts_off); +EXPORT_IPV6_MOD(secure_tcpv6_ts_off); u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport) @@ -156,45 +156,3 @@ u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif - -#if IS_ENABLED(CONFIG_IP_DCCP) -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) -{ - u64 seq; - net_secret_init(); - seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u32)sport << 16 | (__force u32)dport, - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccp_sequence_number); - -#if IS_ENABLED(CONFIG_IPV6) -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport) -{ - const struct { - struct in6_addr saddr; - struct in6_addr daddr; - __be16 sport; - __be16 dport; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .saddr = *(struct in6_addr *)saddr, - .daddr = *(struct in6_addr *)daddr, - .sport = sport, - .dport = dport - }; - u64 seq; - net_secret_init(); - seq = siphash(&combined, offsetofend(typeof(combined), dport), - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccpv6_sequence_number); -#endif -#endif diff --git a/net/core/selftests.c b/net/core/selftests.c index 8f801e6e3b91..406faf8e5f3f 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -100,10 +100,10 
@@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, ehdr->h_proto = htons(ETH_P_IP); if (attr->tcp) { + memset(thdr, 0, sizeof(*thdr)); thdr->source = htons(attr->sport); thdr->dest = htons(attr->dport); thdr->doff = sizeof(struct tcphdr) / 4; - thdr->check = 0; } else { uhdr->source = htons(attr->sport); uhdr->dest = htons(attr->dport); @@ -144,16 +144,25 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, attr->id = net_test_next_id; shdr->id = net_test_next_id++; - if (attr->size) - skb_put(skb, attr->size); - if (attr->max_size && attr->max_size > skb->len) - skb_put(skb, attr->max_size - skb->len); + if (attr->size) { + void *payload = skb_put(skb, attr->size); + + memset(payload, 0, attr->size); + } + + if (attr->max_size && attr->max_size > skb->len) { + size_t pad_len = attr->max_size - skb->len; + void *pad = skb_put(skb, pad_len); + + memset(pad, 0, pad_len); + } skb->csum = 0; skb->ip_summed = CHECKSUM_PARTIAL; if (attr->tcp) { - thdr->check = ~tcp_v4_check(skb->len, ihdr->saddr, - ihdr->daddr, 0); + int l4len = skb->len - skb_transport_offset(skb); + + thdr->check = ~tcp_v4_check(l4len, ihdr->saddr, ihdr->daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } else { @@ -299,7 +308,7 @@ static int net_test_phy_loopback_enable(struct net_device *ndev) if (!ndev->phydev) return -EOPNOTSUPP; - return phy_loopback(ndev->phydev, true); + return phy_loopback(ndev->phydev, true, 0); } static int net_test_phy_loopback_disable(struct net_device *ndev) @@ -307,7 +316,7 @@ static int net_test_phy_loopback_disable(struct net_device *ndev) if (!ndev->phydev) return -EOPNOTSUPP; - return phy_loopback(ndev->phydev, false); + return phy_loopback(ndev->phydev, false, 0); } static int net_test_phy_loopback_udp(struct net_device *ndev) diff --git a/net/core/skb_fault_injection.c b/net/core/skb_fault_injection.c new file mode 100644 index 000000000000..4235db6bdfad --- /dev/null +++ b/net/core/skb_fault_injection.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/debugfs.h> +#include <linux/fault-inject.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> + +static struct { + struct fault_attr attr; + char devname[IFNAMSIZ]; + bool filtered; +} skb_realloc = { + .attr = FAULT_ATTR_INITIALIZER, + .filtered = false, +}; + +static bool should_fail_net_realloc_skb(struct sk_buff *skb) +{ + struct net_device *net = skb->dev; + + if (skb_realloc.filtered && + strncmp(net->name, skb_realloc.devname, IFNAMSIZ)) + /* device name filter set, but names do not match */ + return false; + + if (!should_fail(&skb_realloc.attr, 1)) + return false; + + return true; +} +ALLOW_ERROR_INJECTION(should_fail_net_realloc_skb, TRUE); + +void skb_might_realloc(struct sk_buff *skb) +{ + if (!should_fail_net_realloc_skb(skb)) + return; + + pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_might_realloc); + +static int __init fail_skb_realloc_setup(char *str) +{ + return setup_fault_attr(&skb_realloc.attr, str); +} +__setup("fail_skb_realloc=", fail_skb_realloc_setup); + +static void reset_settings(void) +{ + skb_realloc.filtered = false; + memset(&skb_realloc.devname, 0, IFNAMSIZ); +} + +static ssize_t devname_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + ssize_t ret; + + reset_settings(); + ret = simple_write_to_buffer(&skb_realloc.devname, IFNAMSIZ, + ppos, buffer, count); + if (ret < 0) + return ret; + + skb_realloc.devname[IFNAMSIZ - 1] 
= '\0'; + /* Remove a possible \n at the end of devname */ + strim(skb_realloc.devname); + + if (strnlen(skb_realloc.devname, IFNAMSIZ)) + skb_realloc.filtered = true; + + return count; +} + +static ssize_t devname_read(struct file *file, + char __user *buffer, + size_t size, loff_t *ppos) +{ + if (!skb_realloc.filtered) + return 0; + + return simple_read_from_buffer(buffer, size, ppos, &skb_realloc.devname, + strlen(skb_realloc.devname)); +} + +static const struct file_operations devname_ops = { + .write = devname_write, + .read = devname_read, +}; + +static int __init fail_skb_realloc_debugfs(void) +{ + umode_t mode = S_IFREG | 0600; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_skb_realloc", NULL, + &skb_realloc.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + debugfs_create_file("devname", mode, dir, NULL, &devname_ops); + + return 0; +} + +late_initcall(fail_skb_realloc_debugfs); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b99127712e67..d6420b74ea9c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -51,6 +51,7 @@ #endif #include <linux/string.h> #include <linux/skbuff.h> +#include <linux/skbuff_ref.h> #include <linux/splice.h> #include <linux/cache.h> #include <linux/rtnetlink.h> @@ -63,11 +64,13 @@ #include <linux/mpls.h> #include <linux/kcov.h> #include <linux/iov_iter.h> +#include <linux/crc32.h> #include <net/protocol.h> #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> +#include <net/gro.h> #include <net/gso.h> #include <net/hotdata.h> #include <net/ip6_checksum.h> @@ -87,13 +90,17 @@ #include <linux/textsearch.h> #include "dev.h" +#include "devmem.h" +#include "netmem_priv.h" #include "sock_destructor.h" #ifdef CONFIG_SKB_EXTENSIONS static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif -#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) +#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN) +#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \ + GRO_MAX_HEAD_PAD)) /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique @@ -108,9 +115,6 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; #define SKB_SMALL_HEAD_HEADROOM \ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) -int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; -EXPORT_SYMBOL(sysctl_max_skb_frags); - /* kcm_write_msgs() relies on casting paged frags to bio_vec to use * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the * netmem is a page. @@ -221,98 +225,31 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) -#if PAGE_SIZE == SZ_4K - -#define NAPI_HAS_SMALL_PAGE_FRAG 1 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) - -/* specialized page frag allocator using a single order 0 page - * and slicing it into 1K sized fragment. 
Constrained to systems - * with a very limited amount of 1K fragments fitting a single - * page - to avoid excessive truesize underestimation - */ - -struct page_frag_1k { - void *va; - u16 offset; - bool pfmemalloc; -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) -{ - struct page *page; - int offset; - - offset = nc->offset - SZ_1K; - if (likely(offset >= 0)) - goto use_frag; - - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - if (!page) - return NULL; - - nc->va = page_address(page); - nc->pfmemalloc = page_is_pfmemalloc(page); - offset = PAGE_SIZE - SZ_1K; - page_ref_add(page, offset / SZ_1K); - -use_frag: - nc->offset = offset; - return nc->va + offset; -} -#else - -/* the small page is actually unused in this build; add dummy helpers - * to please the compiler and avoid later preprocessor's conditionals - */ -#define NAPI_HAS_SMALL_PAGE_FRAG 0 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false - -struct page_frag_1k { -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) -{ - return NULL; -} - -#endif - struct napi_alloc_cache { + local_lock_t bh_lock; struct page_frag_cache page; - struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); - -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + void *data; fragsz = SKB_DATA_ALIGN(fragsz); - return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, - align_mask); + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + data = __page_frag_alloc_align(&nc->page, fragsz, + GFP_ATOMIC | __GFP_NOWARN, align_mask); + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + return data; + } EXPORT_SYMBOL(__napi_alloc_frag_align); @@ -320,19 +257,16 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { void *data; - fragsz = SKB_DATA_ALIGN(fragsz); if (in_hardirq() || irqs_disabled()) { struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); - data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, + fragsz = SKB_DATA_ALIGN(fragsz); + data = __page_frag_alloc_align(nc, fragsz, + GFP_ATOMIC | __GFP_NOWARN, align_mask); } else { - struct napi_alloc_cache *nc; - local_bh_disable(); - nc = this_cpu_ptr(&napi_alloc_cache); - data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, - align_mask); + data = __napi_alloc_frag_align(fragsz, align_mask); local_bh_enable(); } return data; @@ -344,21 +278,87 @@ static struct sk_buff *napi_skb_cache_get(void) struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; + local_lock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!nc->skb_count)) { nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, - 
GFP_ATOMIC, + GFP_ATOMIC | __GFP_NOWARN, NAPI_SKB_CACHE_BULK, nc->skb_cache); - if (unlikely(!nc->skb_count)) + if (unlikely(!nc->skb_count)) { + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return NULL; + } } skb = nc->skb_cache[--nc->skb_count]; + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache)); return skb; } +/** + * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache + * @skbs: pointer to an at least @n-sized array to fill with skb pointers + * @n: number of entries to provide + * + * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes + * the pointers into the provided array @skbs. If there are less entries + * available, tries to replenish the cache and bulk-allocates the diff from + * the MM layer if needed. + * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are + * ready for {,__}build_skb_around() and don't have any data buffers attached. + * Must be called *only* from the BH context. + * + * Return: number of successfully allocated skbs (@n if no actual allocation + * needed or kmem_cache_alloc_bulk() didn't fail). + */ +u32 napi_skb_cache_get_bulk(void **skbs, u32 n) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + u32 bulk, total = n; + + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + + if (nc->skb_count >= n) + goto get; + + /* No enough cached skbs. Try refilling the cache first */ + bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK); + nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN, bulk, + &nc->skb_cache[nc->skb_count]); + if (likely(nc->skb_count >= n)) + goto get; + + /* Still not enough. Bulk-allocate the missing part directly, zeroed */ + n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN, + n - nc->skb_count, &skbs[nc->skb_count]); + if (likely(nc->skb_count >= n)) + goto get; + + /* kmem_cache didn't allocate the number we need, limit the output */ + total -= n - nc->skb_count; + n = nc->skb_count; + +get: + for (u32 base = nc->skb_count - n, i = 0; i < n; i++) { + u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache); + + skbs[i] = nc->skb_cache[base + i]; + + kasan_mempool_unpoison_object(skbs[i], cache_size); + memset(skbs[i], 0, offsetof(struct sk_buff, tail)); + } + + nc->skb_count -= n; + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + + return total; +} +EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk); + static inline void __finalize_skb_around(struct sk_buff *skb, void *data, unsigned int size) { @@ -412,7 +412,8 @@ struct sk_buff *slab_build_skb(void *data) struct sk_buff *skb; unsigned int size; - skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; @@ -463,7 +464,8 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; - skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; @@ -726,7 +728,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
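To make the bulk helper above concrete, here is a hedged sketch of a hypothetical BH-context consumer; BATCH, my_rx_refill() and the bufs array are invented for illustration, while napi_skb_cache_get_bulk() and build_skb_around() are the helpers named in the hunk and its kernel-doc (the heads come back zeroed and ready for build_skb_around()):

#include <linux/skbuff.h>

#define BATCH 8

/* Must run in BH context, per the napi_skb_cache_get_bulk() kernel-doc. */
static int my_rx_refill(void *bufs[BATCH], unsigned int frag_size)
{
        void *skbs[BATCH];
        u32 i, got;

        got = napi_skb_cache_get_bulk(skbs, BATCH);
        for (i = 0; i < got; i++) {
                struct sk_buff *skb = skbs[i];

                /* attach a pre-allocated data buffer to the zeroed head */
                build_skb_around(skb, bufs[i], frag_size);
        }
        /* may be smaller than BATCH if the slab allocator ran dry */
        return got;
}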
*/ - if (len <= SKB_WITH_OVERHEAD(1024) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); @@ -743,12 +745,16 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, if (in_hardirq() || irqs_disabled()) { nc = this_cpu_ptr(&netdev_alloc_cache); data = page_frag_alloc(nc, len, gfp_mask); - pfmemalloc = nc->pfmemalloc; + pfmemalloc = page_frag_cache_is_pfmemalloc(nc); } else { local_bh_disable(); + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + nc = this_cpu_ptr(&napi_alloc_cache.page); data = page_frag_alloc(nc, len, gfp_mask); - pfmemalloc = nc->pfmemalloc; + pfmemalloc = page_frag_cache_is_pfmemalloc(nc); + + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); local_bh_enable(); } @@ -775,10 +781,9 @@ skb_fail: EXPORT_SYMBOL(__netdev_alloc_skb); /** - * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for * @len: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages * * Allocate a new sk_buff for use in NAPI receive. This buffer will * attempt to allocate the head from a special reserved region used @@ -787,9 +792,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, - gfp_t gfp_mask) +struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) { + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; struct napi_alloc_cache *nc; struct sk_buff *skb; bool pfmemalloc; @@ -800,10 +805,8 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. - * When the small frag allocator is available, prefer it over kmalloc - * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -813,32 +816,17 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, goto skb_success; } - nc = this_cpu_ptr(&napi_alloc_cache); + len = SKB_HEAD_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { - /* we are artificially inflating the allocation size, but - * that is not as bad as it may look like, as: - * - 'len' less than GRO_MAX_HEAD makes little sense - * - On most systems, larger 'len' values lead to fragment - * size above 512 bytes - * - kmalloc would use the kmalloc-1k slab for such values - * - Builds with smaller GRO_MAX_HEAD will very likely do - * little networking, as that implies no WiFi and no - * tunnels support, and 32 bits arches. 
- */ - len = SZ_1K; - - data = page_frag_alloc_1k(&nc->page_small, gfp_mask); - pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); - } else { - len = SKB_HEAD_ALIGN(len); + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + nc = this_cpu_ptr(&napi_alloc_cache); - data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = nc->page.pfmemalloc; - } + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) return NULL; @@ -860,7 +848,7 @@ skb_success: skb_fail: return skb; } -EXPORT_SYMBOL(__napi_alloc_skb); +EXPORT_SYMBOL(napi_alloc_skb); void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size, unsigned int truesize) @@ -907,11 +895,6 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } -static bool is_pp_page(struct page *page) -{ - return (page->pp_magic & ~0x3UL) == PP_SIGNATURE; -} - int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom) { @@ -995,7 +978,7 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, EXPORT_SYMBOL(skb_pp_cow_data); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, - struct bpf_prog *prog) + const struct bpf_prog *prog) { if (!prog->aux->xdp_has_frags) return -EINVAL; @@ -1005,56 +988,25 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, EXPORT_SYMBOL(skb_cow_data_for_xdp); #if IS_ENABLED(CONFIG_PAGE_POOL) -bool napi_pp_put_page(struct page *page, bool napi_safe) +bool napi_pp_put_page(netmem_ref netmem) { - bool allow_direct = false; - struct page_pool *pp; - - page = compound_head(page); + netmem = netmem_compound_head(netmem); - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. - */ - if (unlikely(!is_pp_page(page))) + if (unlikely(!netmem_is_pp(netmem))) return false; - pp = page->pp; - - /* Allow direct recycle if we have reasons to believe that we are - * in the same context as the consumer would run, so there's - * no possible race. - * __page_pool_put_page() makes sure we're not in hardirq context - * and interrupts are enabled prior to accessing the cache. - */ - if (napi_safe || in_softirq()) { - const struct napi_struct *napi = READ_ONCE(pp->p.napi); - unsigned int cpuid = smp_processor_id(); - - allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid; - allow_direct |= READ_ONCE(pp->cpuid) == cpuid; - } - - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. - * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. 
- */ - page_pool_put_full_page(pp, page, allow_direct); + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); return true; } EXPORT_SYMBOL(napi_pp_put_page); #endif -static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) +static bool skb_pp_recycle(struct sk_buff *skb, void *data) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return napi_pp_put_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(page_to_netmem(virt_to_page(data))); } /** @@ -1070,7 +1022,7 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) static int skb_pp_frag_ref(struct sk_buff *skb) { struct skb_shared_info *shinfo; - struct page *head_page; + netmem_ref head_netmem; int i; if (!skb->pp_recycle) @@ -1079,11 +1031,11 @@ static int skb_pp_frag_ref(struct sk_buff *skb) shinfo = skb_shinfo(skb); for (i = 0; i < shinfo->nr_frags; i++) { - head_page = compound_head(skb_frag_page(&shinfo->frags[i])); - if (likely(is_pp_page(head_page))) - page_pool_ref_page(head_page); + head_netmem = netmem_compound_head(shinfo->frags[i].netmem); + if (likely(netmem_is_pp(head_netmem))) + page_pool_ref_netmem(head_netmem); else - page_ref_inc(head_page); + page_ref_inc(netmem_to_page(head_netmem)); } return 0; } @@ -1096,12 +1048,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset) kfree(head); } -static void skb_free_head(struct sk_buff *skb, bool napi_safe) +static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; if (skb->head_frag) { - if (skb_pp_recycle(skb, head, napi_safe)) + if (skb_pp_recycle(skb, head)) return; skb_free_frag(head); } else { @@ -1109,8 +1061,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe) } } -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, - bool napi_safe) +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; @@ -1127,13 +1078,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, } for (i = 0; i < shinfo->nr_frags; i++) - napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe); + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); free_head: if (shinfo->frag_list) kfree_skb_list_reason(shinfo->frag_list, reason); - skb_free_head(skb, napi_safe); + skb_free_head(skb); exit: /* When we clone an SKB we copy the reycling bit. The pp_recycle * bit is only set on the head though, so in order to avoid races @@ -1194,12 +1145,11 @@ void skb_release_head_state(struct sk_buff *skb) } /* Free everything but the sk_buff shell. 
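For callers outside this file, the visible change is that napi_pp_put_page() now takes a netmem_ref and no longer accepts the napi_safe hint; direct-recycle decisions are left to the page_pool core. A minimal sketch of adapting a page-based call site, with the header locations assumed rather than taken from this diff:

#include <linux/mm.h>
#include <linux/skbuff_ref.h>
#include <net/netmem.h>

static void my_free_rx_page(struct page *page)
{
        /* old call: napi_pp_put_page(page, napi_safe); */
        if (!napi_pp_put_page(page_to_netmem(page)))
                put_page(page);     /* not a page_pool page: plain free */
}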
*/ -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, - bool napi_safe) +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_head_state(skb); if (likely(skb->head)) - skb_release_data(skb, reason, napi_safe); + skb_release_data(skb, reason); } /** @@ -1213,13 +1163,14 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, void __kfree_skb(struct sk_buff *skb) { - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false); + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); static __always_inline -bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason reason) { if (unlikely(!skb_unref(skb))) return false; @@ -1232,26 +1183,27 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) if (reason == SKB_CONSUMED) trace_consume_skb(skb, __builtin_return_address(0)); else - trace_kfree_skb(skb, __builtin_return_address(0), reason); + trace_kfree_skb(skb, __builtin_return_address(0), reason, sk); return true; } /** - * kfree_skb_reason - free an sk_buff with special reason + * sk_skb_reason_drop - free an sk_buff with special reason + * @sk: the socket to receive @skb, or NULL if not applicable * @skb: buffer to free * @reason: reason why this skb is dropped * - * Drop a reference to the buffer and free it if the usage count has - * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' - * tracepoint. + * Drop a reference to the buffer and free it if the usage count has hit + * zero. Meanwhile, pass the receiving socket and drop reason to + * 'kfree_skb' tracepoint. */ void __fix_address -kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - if (__kfree_skb_reason(skb, reason)) + if (__sk_skb_reason_drop(sk, skb, reason)) __kfree_skb(skb); } -EXPORT_SYMBOL(kfree_skb_reason); +EXPORT_SYMBOL(sk_skb_reason_drop); #define KFREE_SKB_BULK_SIZE 16 @@ -1270,7 +1222,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb, return; } - skb_release_all(skb, reason, false); + skb_release_all(skb, reason); sa->skb_array[sa->skb_count++] = skb; if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { @@ -1290,7 +1242,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) while (segs) { struct sk_buff *next = segs->next; - if (__kfree_skb_reason(segs, reason)) { + if (__sk_skb_reason_drop(NULL, segs, reason)) { skb_poison_list(segs); kfree_skb_add_bulk(segs, &sa, reason); } @@ -1331,22 +1283,28 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) has_trans = skb_transport_header_was_set(skb); printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" - "mac=(%d,%d) net=(%d,%d) trans=%d\n" + "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n" "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" - "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" - "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", + "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n" + "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" + "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n" + "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n", level, skb->len, headroom, skb_headlen(skb), tailroom, has_mac ? skb->mac_header : -1, has_mac ? 
skb_mac_header_len(skb) : -1, + skb->mac_len, skb->network_header, has_trans ? skb_network_header_len(skb) : -1, has_trans ? skb->transport_header : -1, sh->tx_flags, sh->nr_frags, sh->gso_size, sh->gso_type, sh->gso_segs, - skb->csum, skb->ip_summed, skb->csum_complete_sw, - skb->csum_valid, skb->csum_level, + skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level, skb->hash, skb->sw_hash, skb->l4_hash, - ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); + ntohs(skb->protocol), skb->pkt_type, skb->skb_iif, + skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all, + skb->encapsulation, skb->inner_protocol, skb->inner_mac_header, + skb->inner_network_header, skb->inner_transport_header); if (dev) printk("%sdev name=%s feat=%pNF\n", @@ -1375,6 +1333,14 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) struct page *p; u8 *vaddr; + if (skb_frag_is_net_iov(frag)) { + printk("%sskb frag %d: not readable\n", level, i); + len -= skb_frag_size(frag); + if (!len) + break; + continue; + } + skb_frag_foreach_page(frag, skb_frag_off(frag), skb_frag_size(frag), p, p_off, p_len, copied) { @@ -1444,7 +1410,7 @@ EXPORT_SYMBOL(consume_skb); void __consume_stateless_skb(struct sk_buff *skb) { trace_consume_skb(skb, __builtin_return_address(0)); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); kfree_skbmem(skb); } @@ -1456,6 +1422,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) if (!kasan_mempool_poison_object(skb)) return; + local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { @@ -1467,11 +1434,12 @@ static void napi_skb_cache_put(struct sk_buff *skb) nc->skb_cache + NAPI_SKB_CACHE_HALF); nc->skb_count = NAPI_SKB_CACHE_HALF; } + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); } void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) { - skb_release_all(skb, reason, true); + skb_release_all(skb, reason); napi_skb_cache_put(skb); } @@ -1509,7 +1477,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - skb_release_all(skb, SKB_CONSUMED, !!budget); + skb_release_all(skb, SKB_CONSUMED); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); @@ -1640,7 +1608,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg); */ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { - skb_release_all(dst, SKB_CONSUMED, false); + skb_release_all(dst, SKB_CONSUMED); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); @@ -1688,7 +1656,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size, + bool devmem) { struct ubuf_info_msgzc *uarg; struct sk_buff *skb; @@ -1703,12 +1672,12 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) uarg = (void *)skb->cb; uarg->mmp.user = NULL; - if (mm_account_pinned_pages(&uarg->mmp, size)) { + if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) { kfree_skb(skb); return NULL; } - uarg->ubuf.callback = msg_zerocopy_callback; + uarg->ubuf.ops = &msg_zerocopy_ubuf_ops; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; @@ -1726,7 +1695,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) } struct ubuf_info 
*msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg) + struct ubuf_info *uarg, bool devmem) { if (uarg) { struct ubuf_info_msgzc *uarg_zc; @@ -1734,7 +1703,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, u32 bytelen, next; /* there might be non MSG_ZEROCOPY users */ - if (uarg->callback != msg_zerocopy_callback) + if (uarg->ops != &msg_zerocopy_ubuf_ops) return NULL; /* realloc only when socket is locked (TCP, UDP cork), @@ -1756,7 +1725,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { - if (mm_account_pinned_pages(&uarg_zc->mmp, size)) + if (likely(!devmem) && + mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; uarg_zc->len++; uarg_zc->bytelen = bytelen; @@ -1771,7 +1741,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, } new_alloc: - return msg_zerocopy_alloc(sk, size); + return msg_zerocopy_alloc(sk, size, devmem); } EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); @@ -1845,8 +1815,8 @@ release: sock_put(sk); } -void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) +static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); @@ -1855,7 +1825,6 @@ void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, if (refcount_dec_and_test(&uarg->refcnt)) __msg_zerocopy_callback(uarg_zc); } -EXPORT_SYMBOL_GPL(msg_zerocopy_callback); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { @@ -1865,24 +1834,39 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) uarg_to_msgzc(uarg)->len--; if (have_uref) - msg_zerocopy_callback(NULL, uarg, true); + msg_zerocopy_complete(NULL, uarg, true); } EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); +const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { + .complete = msg_zerocopy_complete, +}; +EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg) + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding) { - struct ubuf_info *orig_uarg = skb_zcopy(skb); int err, orig_len = skb->len; - /* An skb can only point to one uarg. This edge case happens when - * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. - */ - if (orig_uarg && uarg != orig_uarg) - return -EEXIST; + if (uarg->ops->link_skb) { + err = uarg->ops->link_skb(skb, uarg); + if (err) + return err; + } else { + struct ubuf_info *orig_uarg = skb_zcopy(skb); - err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); + /* An skb can only point to one uarg. This edge case happens + * when TCP appends to an skb, but zerocopy_realloc triggered + * a new alloc. 
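Since the single callback pointer has been replaced by const struct ubuf_info_ops (see msg_zerocopy_ubuf_ops above, and the optional link_skb hook consulted by skb_zerocopy_iter_stream()), a zerocopy provider now registers an ops table. A rough sketch with hypothetical my_* names; only the .complete and .link_skb members are implied by this diff:

#include <linux/skbuff.h>

static void my_complete(struct sk_buff *skb, struct ubuf_info *uarg,
                        bool success)
{
        /* release provider-private state tied to uarg */
}

static int my_link_skb(struct sk_buff *skb, struct ubuf_info *uarg)
{
        /* called instead of the generic "one uarg per skb" check */
        return 0;
}

static const struct ubuf_info_ops my_ubuf_ops = {
        .complete = my_complete,
        .link_skb = my_link_skb,
};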
+ */ + if (orig_uarg && uarg != orig_uarg) + return -EEXIST; + } + + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len, + binding); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; @@ -1954,6 +1938,9 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL; + if (!skb_frags_readable(skb)) + return -EFAULT; + if (!num_frags) goto release; @@ -2123,11 +2110,20 @@ static inline int skb_alloc_rx_flag(const struct sk_buff *skb) struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { - int headerlen = skb_headroom(skb); - unsigned int size = skb_end_offset(skb) + skb->data_len; - struct sk_buff *n = __alloc_skb(size, gfp_mask, - skb_alloc_rx_flag(skb), NUMA_NO_NODE); + struct sk_buff *n; + unsigned int size; + int headerlen; + + if (!skb_frags_readable(skb)) + return NULL; + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) + return NULL; + + headerlen = skb_headroom(skb); + size = skb_end_offset(skb) + skb->data_len; + n = __alloc_skb(size, gfp_mask, + skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; @@ -2272,9 +2268,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); } else { - skb_free_head(skb, false); + skb_free_head(skb); } off = (data + nhead) - skb->head; @@ -2455,12 +2451,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, /* * Allocate the copy buffer */ - struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, - gfp_mask, skb_alloc_rx_flag(skb), - NUMA_NO_NODE); - int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; + struct sk_buff *n; + int oldheadroom; + + if (!skb_frags_readable(skb)) + return NULL; + + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) + return NULL; + oldheadroom = skb_headroom(skb); + n = __alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); if (!n) return NULL; @@ -2798,6 +2802,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) */ int i, k, eat = (skb->tail + delta) - skb->end; + if (!skb_frags_readable(skb)) + return NULL; + if (eat > 0 || skb_cloned(skb)) { if (pskb_expand_head(skb, 0, eat > 0 ? 
eat + 128 : 0, GFP_ATOMIC)) @@ -2951,6 +2958,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) to += copy; } + if (!skb_frags_readable(skb)) + goto fault; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *f = &skb_shinfo(skb)->frags[i]; @@ -3139,9 +3149,15 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, /* * then map the fragments */ + if (!skb_frags_readable(skb)) + return false; + for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; + if (WARN_ON_ONCE(!skb_frag_page(f))) + return false; + if (__splice_segment(skb_frag_page(f), skb_frag_off(f), skb_frag_size(f), offset, len, spd, false, sk, pipe)) @@ -3217,7 +3233,7 @@ static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, - int len, sendmsg_func sendmsg) + int len, sendmsg_func sendmsg, int flags) { unsigned int orig_len = len; struct sk_buff *head = skb; @@ -3235,7 +3251,7 @@ do_frag_list: kv.iov_base = skb->data + offset; kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; + msg.msg_flags = MSG_DONTWAIT | flags; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, @@ -3272,7 +3288,8 @@ do_frag_list: while (slen) { struct bio_vec bvec; struct msghdr msg = { - .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT, + .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | + flags, }; bvec_set_page(&bvec, skb_frag_page(frag), slen, @@ -3318,14 +3335,21 @@ error: int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_locked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0); } EXPORT_SYMBOL_GPL(skb_send_sock_locked); +int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, + int offset, int len, int flags) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags); +} +EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags); + /* Send skb data on a socket. Socket must be unlocked. */ int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0); } /** @@ -3359,6 +3383,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) from += copy; } + if (!skb_frags_readable(skb)) + goto fault; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; int end; @@ -3418,8 +3445,7 @@ fault: EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. 
*/ -__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, - __wsum csum, const struct skb_checksum_ops *ops) +__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -3430,14 +3456,16 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, if (copy > 0) { if (copy > len) copy = len; - csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, - skb->data + offset, copy, csum); + csum = csum_partial(skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; pos = copy; } + if (WARN_ON_ONCE(!skb_frags_readable(skb))) + return 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; @@ -3458,13 +3486,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); - csum2 = INDIRECT_CALL_1(ops->update, - csum_partial_ext, - vaddr + p_off, p_len, 0); + csum2 = csum_partial(vaddr + p_off, p_len, 0); kunmap_atomic(vaddr); - csum = INDIRECT_CALL_1(ops->combine, - csum_block_add_ext, csum, - csum2, pos, p_len); + csum = csum_block_add(csum, csum2, pos); pos += p_len; } @@ -3485,10 +3509,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum2; if (copy > len) copy = len; - csum2 = __skb_checksum(frag_iter, offset - start, - copy, 0, ops); - csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, - csum, csum2, pos, copy); + csum2 = skb_checksum(frag_iter, offset - start, copy, + 0); + csum = csum_block_add(csum, csum2, pos); if ((len -= copy) == 0) return csum; offset += copy; @@ -3500,18 +3523,6 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, return csum; } -EXPORT_SYMBOL(__skb_checksum); - -__wsum skb_checksum(const struct sk_buff *skb, int offset, - int len, __wsum csum) -{ - const struct skb_checksum_ops ops = { - .update = csum_partial_ext, - .combine = csum_block_add_ext, - }; - - return __skb_checksum(skb, offset, len, csum, &ops); -} EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. 
*/ @@ -3538,6 +3549,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, pos = copy; } + if (!skb_frags_readable(skb)) + return 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; @@ -3601,6 +3615,78 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +#ifdef CONFIG_NET_CRC32C +u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + if (copy > 0) { + copy = min(copy, len); + crc = crc32c(crc, skb->data + offset, copy); + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + + if (WARN_ON_ONCE(!skb_frags_readable(skb))) + return 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + copy = end - offset; + if (copy > 0) { + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + copy = min(copy, len); + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + crc = crc32c(crc, vaddr + p_off, p_len); + kunmap_atomic(vaddr); + } + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + copy = end - offset; + if (copy > 0) { + copy = min(copy, len); + crc = skb_crc32c(frag_iter, offset - start, copy, crc); + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + start = end; + } + BUG_ON(len); + + return crc; +} +EXPORT_SYMBOL(skb_crc32c); +#endif /* CONFIG_NET_CRC32C */ + __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) { __sum16 sum; @@ -3660,32 +3746,6 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) } EXPORT_SYMBOL(__skb_checksum_complete); -static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) -{ - net_warn_ratelimited( - "%s: attempt to compute crc32c without libcrc32c.ko\n", - __func__); - return 0; -} - -static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, - int offset, int len) -{ - net_warn_ratelimited( - "%s: attempt to compute crc32c without libcrc32c.ko\n", - __func__); - return 0; -} - -static const struct skb_checksum_ops default_crc32c_ops = { - .update = warn_crc32c_csum_update, - .combine = warn_crc32c_csum_combine, -}; - -const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = - &default_crc32c_ops; -EXPORT_SYMBOL(crc32c_csum_stub); - /** * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() * @from: source buffer @@ -4029,6 +4089,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb, skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb1->unreadable = skb->unreadable; skb_shinfo(skb)->nr_frags = 0; skb1->data_len = skb->data_len; skb1->len += skb1->data_len; @@ -4076,6 +4137,8 @@ static inline void skb_split_no_header(struct sk_buff *skb, pos += size; } skb_shinfo(skb1)->nr_frags = k; + + skb1->unreadable = skb->unreadable; } /** @@ -4139,6 +4202,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) if (skb_zcopy(tgt) || skb_zcopy(skb)) return 0; + DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle); + DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb)); + todo = shiftlen; from = 0; to = 
skb_shinfo(tgt)->nr_frags; @@ -4147,8 +4213,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) /* Actual merge is delayed until the point when we know we can * commit all, so that we don't have to undo partial changes */ - if (!to || - !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), + if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), skb_frag_off(fragfrom))) { merge = -1; } else { @@ -4311,6 +4376,9 @@ next_skb: return block_limit - abs_offset; } + if (!skb_frags_readable(st->cur_skb)) + return 0; + if (st->frag_idx == 0 && !st->frag_data) st->stepped_offset += skb_headlen(st->cur_skb); @@ -4387,6 +4455,41 @@ void skb_abort_seq_read(struct skb_seq_state *st) } EXPORT_SYMBOL(skb_abort_seq_read); +/** + * skb_copy_seq_read() - copy from a skb_seq_state to a buffer + * @st: source skb_seq_state + * @offset: offset in source + * @to: destination buffer + * @len: number of bytes to copy + * + * Copy @len bytes from @offset bytes into the source @st to the destination + * buffer @to. `offset` should increase (or be unchanged) with each subsequent + * call to this function. If offset needs to decrease from the previous use `st` + * should be reset first. + * + * Return: 0 on success or -EINVAL if the copy ended early + */ +int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len) +{ + const u8 *data; + u32 sqlen; + + for (;;) { + sqlen = skb_seq_read(offset, &data, st); + if (sqlen == 0) + return -EINVAL; + if (sqlen >= len) { + memcpy(to, data, len); + return 0; + } + memcpy(to, data, sqlen); + to += sqlen; + offset += sqlen; + len -= sqlen; + } +} +EXPORT_SYMBOL(skb_copy_seq_read); + #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, @@ -5139,7 +5242,7 @@ EXPORT_SYMBOL_GPL(skb_to_sgvec); * 3. sg_unmark_end * 4. skb_to_sgvec(payload2) * - * When mapping mutilple payload conditionally, skb_to_sgvec_nomark + * When mapping multiple payload conditionally, skb_to_sgvec_nomark * is more preferable. */ int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, @@ -5404,7 +5507,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { bool ret; - if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) + if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data))) return true; read_lock_bh(&sk->sk_callback_lock); @@ -5437,6 +5540,54 @@ err: } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); +static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps, + int tstype) +{ + switch (tstype) { + case SCM_TSTAMP_SCHED: + return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP; + case SCM_TSTAMP_SND: + return skb_shinfo(skb)->tx_flags & (hwtstamps ? 
SKBTX_HW_TSTAMP_NOBPF : + SKBTX_SW_TSTAMP); + case SCM_TSTAMP_ACK: + return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK; + case SCM_TSTAMP_COMPLETION: + return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP; + } + + return false; +} + +static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, + int tstype) +{ + int op; + + switch (tstype) { + case SCM_TSTAMP_SCHED: + op = BPF_SOCK_OPS_TSTAMP_SCHED_CB; + break; + case SCM_TSTAMP_SND: + if (hwtstamps) { + op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB; + *skb_hwtstamps(skb) = *hwtstamps; + } else { + op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB; + } + break; + case SCM_TSTAMP_ACK: + op = BPF_SOCK_OPS_TSTAMP_ACK_CB; + break; + default: + return; + } + + bpf_skops_tx_timestamping(sk, skb, op); +} + void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, @@ -5449,6 +5600,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!sk) return; + if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF) + skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps, + sk, tstype); + + if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype)) + return; + tsflags = READ_ONCE(sk->sk_tsflags); if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) @@ -5923,7 +6081,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (to->pp_recycle != from->pp_recycle) return false; - if (len <= skb_tailroom(to)) { + if (skb_frags_readable(from) != skb_frags_readable(to)) + return false; + + if (len <= skb_tailroom(to) && skb_frags_readable(from)) { if (len) BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); *delta_truesize = 0; @@ -5997,7 +6158,7 @@ EXPORT_SYMBOL(skb_try_coalesce); * @skb: buffer to clean * @xnet: packet is crossing netns * - * skb_scrub_packet can be used after encapsulating or decapsulting a packet + * skb_scrub_packet can be used after encapsulating or decapsulating a packet * into/from a tunnel. Some information have to be cleared during these * operations. 
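The SKBTX_BPF path above hands SCM_TSTAMP_* events to a cgroup sockops program via bpf_skops_tx_timestamping(). A minimal sockops sketch that merely logs the new callback ops, assuming a standard libbpf build; enabling SKBTX_BPF on outgoing skbs is a separate step not shown here:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int tx_tstamp(struct bpf_sock_ops *skops)
{
        switch (skops->op) {
        case BPF_SOCK_OPS_TSTAMP_SCHED_CB:
        case BPF_SOCK_OPS_TSTAMP_SND_SW_CB:
        case BPF_SOCK_OPS_TSTAMP_SND_HW_CB:
        case BPF_SOCK_OPS_TSTAMP_ACK_CB:
                /* count or log the delivery point for this skb */
                bpf_printk("tx tstamp op %u", skops->op);
                break;
        }
        return 1;
}

char _license[] SEC("license") = "GPL";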
* skb_scrub_packet can also be used to clean a skb before injecting it in @@ -6018,11 +6179,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->offload_fwd_mark = 0; skb->offload_l3_fwd_mark = 0; #endif + ipvs_reset(skb); if (!xnet) return; - ipvs_reset(skb); skb->mark = 0; skb_clear_tstamp(skb); } @@ -6219,7 +6380,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) return err; skb->protocol = skb->vlan_proto; - skb->mac_len += VLAN_HLEN; + skb->network_header -= VLAN_HLEN; skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); } @@ -6575,12 +6736,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); } else { /* we can reuse existing recount- all we did was * relocate values */ - skb_free_head(skb, false); + skb_free_head(skb); } skb->head = data; @@ -6715,7 +6876,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_kfree_head(data, size); return -ENOMEM; } - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); skb->head = data; skb->head_frag = 0; @@ -6779,7 +6940,7 @@ void skb_condense(struct sk_buff *skb) { if (skb->data_len) { if (skb->data_len > skb->end - skb->tail || - skb_cloned(skb)) + skb_cloned(skb) || !skb_frags_readable(skb)) return; /* Nice, we can free page frag(s) right now */ @@ -6995,6 +7156,19 @@ free_now: EXPORT_SYMBOL(__skb_ext_put); #endif /* CONFIG_SKB_EXTENSIONS */ +static void kfree_skb_napi_cache(struct sk_buff *skb) +{ + /* if SKB is a clone, don't handle this case */ + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { + __kfree_skb(skb); + return; + } + + local_bh_disable(); + __napi_kfree_skb(skb, SKB_CONSUMED); + local_bh_enable(); +} + /** * skb_attempt_defer_free - queue skb for remote freeing * @skb: buffer @@ -7010,10 +7184,10 @@ void skb_attempt_defer_free(struct sk_buff *skb) unsigned int defer_max; bool kick; - if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || - !cpu_online(cpu) || - cpu == raw_smp_processor_id()) { -nodefer: __kfree_skb(skb); + if (cpu == raw_smp_processor_id() || + WARN_ON_ONCE(cpu >= nr_cpu_ids) || + !cpu_online(cpu)) { +nodefer: kfree_skb_napi_cache(skb); return; } @@ -7021,7 +7195,7 @@ nodefer: __kfree_skb(skb); DEBUG_NET_WARN_ON_ONCE(skb->destructor); sd = &per_cpu(softnet_data, cpu); - defer_max = READ_ONCE(sysctl_skb_defer_max); + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); if (READ_ONCE(sd->defer_count) >= defer_max) goto nodefer; @@ -7039,8 +7213,8 @@ nodefer: __kfree_skb(skb); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). 
*/ - if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) - smp_call_function_single_async(cpu, &sd->defer_csd); + if (unlikely(kick)) + kick_defer_list_purge(sd, cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, @@ -7073,7 +7247,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp) { - size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); + size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); struct page *pages[8], **ppages = pages; ssize_t spliced = 0, ret = 0; unsigned int i; @@ -7169,3 +7343,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, return false; } EXPORT_SYMBOL(csum_and_copy_from_iter_full); + +void get_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_get_net_iov(netmem_to_net_iov(netmem)); + return; + } + get_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(get_netmem); + +void put_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_put_net_iov(netmem_to_net_iov(netmem)); + return; + } + + put_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(put_netmem); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4d75ef9d24bf..34c51eb1a14f 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -293,7 +293,7 @@ out: /* If we trim data a full sg elem before curr pointer update * copybreak and current so that any future copy operations * start at new copy location. - * However trimed data that has not yet been used in a copy op + * However trimmed data that has not yet been used in a copy op * does not require an update. */ if (!msg->sg.size) { @@ -369,8 +369,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int ret = -ENOSPC, i = msg->sg.curr; + u32 copy, buf_size, copied = 0; struct scatterlist *sge; - u32 copy, buf_size; void *to; do { @@ -397,6 +397,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, goto out; } bytes -= copy; + copied += copy; if (!bytes) break; msg->sg.copybreak = 0; @@ -404,7 +405,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, } while (i != msg->sg.end); out: msg->sg.curr = i; - return ret; + return (ret < 0) ? ret : copied; } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); @@ -434,7 +435,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, page = sg_page(sge); if (copied + copy > len) copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (copy) + copy = copy_page_to_iter(page, sge->offset, copy, iter); if (!copy) { copied = copied ? 
copied : -EFAULT; goto out; @@ -444,8 +446,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - if (!msg_rx->skb) + if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); + atomic_sub(copy, &sk->sk_rmem_alloc); + } msg_rx->sg.size -= copy; if (!sge->length) { @@ -526,16 +530,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, u32 off, u32 len, struct sk_psock *psock, struct sock *sk, - struct sk_msg *msg) + struct sk_msg *msg, + bool take_ref) { int num_sge, copied; + /* skb_to_sgvec will fail when the total number of fragments in + * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the + * caller may aggregate multiple skbs. + */ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (num_sge < 0) { /* skb linearize may fail with ENOMEM, but lets simply try again * later if this happens. Under memory pressure we don't want to * drop the skb. We need to linearize the skb so that the mapping * in skb_to_sgvec can not error. + * Note that skb_linearize requires the skb not to be shared. */ if (skb_linearize(skb)) return -EAGAIN; @@ -545,11 +555,14 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, return num_sge; } +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + psock->ingress_bytes += len; +#endif copied = len; msg->sg.start = 0; msg->sg.size = copied; msg->sg.end = num_sge; - msg->skb = skb; + msg->skb = take_ref ? skb_get(skb) : skb; sk_psock_queue_msg(psock, msg); sk_psock_data_ready(sk, psock); @@ -557,7 +570,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, } static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len); + u32 off, u32 len, bool take_ref); static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) @@ -571,7 +584,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * correctly. */ if (unlikely(skb->sk == sk)) - return sk_psock_skb_ingress_self(psock, skb, off, len); + return sk_psock_skb_ingress_self(psock, skb, off, len, true); msg = sk_psock_create_ingress_msg(sk, skb); if (!msg) return -EAGAIN; @@ -583,7 +596,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * into user buffers. */ skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true); if (err < 0) kfree(msg); return err; @@ -594,7 +607,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * because the skb is already accounted for here. 
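With sk_msg_memcopy_from_iter() now returning the number of bytes actually copied (or a negative errno) rather than 0/-ENOSPC, callers can account partial progress. A hypothetical call site, for illustration only:

#include <linux/skmsg.h>

static int my_push_bytes(struct sock *sk, struct iov_iter *from,
                         struct sk_msg *msg, u32 bytes)
{
        int copied = sk_msg_memcopy_from_iter(sk, from, msg, bytes);

        if (copied < 0)
                return copied;          /* e.g. -EFAULT from the iterator */
        if ((u32)copied < bytes)
                return -ENOSPC;         /* sg ring filled up mid-copy */
        return 0;
}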
*/ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len) + u32 off, u32 len, bool take_ref) { struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; @@ -603,7 +616,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb if (unlikely(!msg)) return -EAGAIN; skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); if (err < 0) kfree(msg); return err; @@ -612,18 +625,13 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { - int err = 0; - if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } - skb_get(skb); - err = sk_psock_skb_ingress(psock, skb, off, len); - if (err < 0) - kfree_skb(skb); - return err; + + return sk_psock_skb_ingress(psock, skb, off, len); } static void sk_psock_skb_state(struct sk_psock *psock, @@ -648,12 +656,14 @@ static void sk_psock_backlog(struct work_struct *work) bool ingress; int ret; + /* Increment the psock refcnt to synchronize with close(fd) path in + * sock_map_close(), ensuring we wait for backlog thread completion + * before sk_socket freed. If refcnt increment fails, it indicates + * sock_map_close() completed with sk_socket potentially already freed. + */ + if (!sk_psock_get(psock->sk)) + return; mutex_lock(&psock->work_mutex); - if (unlikely(state->len)) { - len = state->len; - off = state->off; - } - while ((skb = skb_peek(&psock->ingress_skb))) { len = skb->len; off = 0; @@ -663,6 +673,13 @@ static void sk_psock_backlog(struct work_struct *work) off = stm->offset; len = stm->full_len; } + + /* Resume processing from previous partial state */ + if (unlikely(state->len)) { + len = state->len; + off = state->off; + } + ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); do { @@ -673,7 +690,8 @@ static void sk_psock_backlog(struct work_struct *work) if (ret <= 0) { if (ret == -EAGAIN) { sk_psock_skb_state(psock, state, len, off); - + /* Restore redir info we cleared before */ + skb_bpf_set_redir(skb, psock->sk, ingress); /* Delay slightly to prioritize any * other work that might be here. 
*/ @@ -690,11 +708,14 @@ static void sk_psock_backlog(struct work_struct *work) len -= ret; } while (len); + /* The entire skb sent, clear state */ + sk_psock_skb_state(psock, state, 0, 0); skb = skb_dequeue(&psock->ingress_skb); kfree_skb(skb); } end: mutex_unlock(&psock->work_mutex); + sk_psock_put(psock->sk, psock); } struct sk_psock *sk_psock_init(struct sock *sk, int node) @@ -771,6 +792,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } @@ -1005,7 +1028,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, off = stm->offset; len = stm->full_len; } - err = sk_psock_skb_ingress_self(psock, skb, off, len); + err = sk_psock_skb_ingress_self(psock, skb, off, len, false); } if (err < 0) { spin_lock_bh(&psock->ingress_lock); @@ -1116,9 +1139,9 @@ static void sk_psock_strp_data_ready(struct sock *sk) if (tls_sw_has_ctx_rx(sk)) { psock->saved_data_ready(sk); } else { - write_lock_bh(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); strp_data_ready(&psock->strp); - write_unlock_bh(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); @@ -1138,6 +1161,10 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) if (!ret) sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + if (sk_is_tcp(sk)) { + psock->strp.cb.read_sock = tcp_bpf_strp_read_sock; + psock->copied_seq = tcp_sk(sk)->copied_seq; + } return ret; } @@ -1226,11 +1253,8 @@ static void sk_psock_verdict_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); - if (psock) { - read_lock_bh(&sk->sk_callback_lock); + if (psock) sk_psock_data_ready(sk, psock); - read_unlock_bh(&sk->sk_callback_lock); - } rcu_read_unlock(); } } diff --git a/net/core/sock.c b/net/core/sock.c index 43bf3818c19e..3b409bc8ef6d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -85,7 +85,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> @@ -124,9 +124,11 @@ #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> +#include <linux/skbuff_ref.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> +#include <net/proto_memory.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> @@ -146,6 +148,8 @@ #include <linux/ethtool.h> +#include <uapi/linux/pidfd.h> + #include "dev.h" static DEFINE_MUTEX(proto_list_mutex); @@ -283,9 +287,6 @@ __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE; - -int sysctl_tstamp_allow_data __read_mostly = 1; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); @@ -455,6 +456,13 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, return 0; } +static bool sk_set_prio_allowed(const struct sock *sk, int val) +{ + return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); +} + static bool sock_needs_netstamp(const struct sock *sk) { 
switch (sk->sk_family) { @@ -482,7 +490,7 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { + if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; @@ -552,7 +560,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, skb->dev = NULL; - if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } @@ -821,14 +829,11 @@ EXPORT_SYMBOL(sock_set_sndtimeo); static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) { + sock_valbool_flag(sk, SOCK_RCVTSTAMP, val); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns); if (val) { sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); - sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); - sock_set_flag(sk, SOCK_RCVTSTAMP); sock_enable_timestamp(sk, SOCK_TIMESTAMP); - } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); } } @@ -935,6 +940,7 @@ int sock_set_timestamping(struct sock *sk, int optname, WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY)); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, @@ -945,6 +951,20 @@ int sock_set_timestamping(struct sock *sk, int optname, return 0; } +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ + struct bpf_sock_ops_kern sock_ops; + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = op; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, 0); + __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); +} +#endif + void sock_set_keepalive(struct sock *sk) { lock_sock(sk); @@ -1049,6 +1069,75 @@ static int sock_reserve_memory(struct sock *sk, int bytes) return 0; } +#ifdef CONFIG_PAGE_POOL + +/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED + * in 1 syscall. The limit exists to limit the amount of memory the kernel + * allocates to copy these tokens, and to prevent looping over the frags for + * too long. 
+ */ +#define MAX_DONTNEED_TOKENS 128 +#define MAX_DONTNEED_FRAGS 1024 + +static noinline_for_stack int +sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) +{ + unsigned int num_tokens, i, j, k, netmem_num = 0; + struct dmabuf_token *tokens; + int ret = 0, num_frags = 0; + netmem_ref netmems[16]; + + if (!sk_is_tcp(sk)) + return -EBADF; + + if (optlen % sizeof(*tokens) || + optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) + return -EINVAL; + + num_tokens = optlen / sizeof(*tokens); + tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); + if (!tokens) + return -ENOMEM; + + if (copy_from_sockptr(tokens, optval, optlen)) { + kvfree(tokens); + return -EFAULT; + } + + xa_lock_bh(&sk->sk_user_frags); + for (i = 0; i < num_tokens; i++) { + for (j = 0; j < tokens[i].token_count; j++) { + if (++num_frags > MAX_DONTNEED_FRAGS) + goto frag_limit_reached; + + netmem_ref netmem = (__force netmem_ref)__xa_erase( + &sk->sk_user_frags, tokens[i].token_start + j); + + if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + continue; + + netmems[netmem_num++] = netmem; + if (netmem_num == ARRAY_SIZE(netmems)) { + xa_unlock_bh(&sk->sk_user_frags); + for (k = 0; k < netmem_num; k++) + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); + netmem_num = 0; + xa_lock_bh(&sk->sk_user_frags); + } + ret++; + } + } + +frag_limit_reached: + xa_unlock_bh(&sk->sk_user_frags); + for (k = 0; k < netmem_num; k++) + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); + + kvfree(tokens); + return ret; +} +#endif + void sockopt_lock_sock(struct sock *sk) { /* When current->bpf_ctx is set, the setsockopt is called from @@ -1083,6 +1172,17 @@ bool sockopt_capable(int cap) } EXPORT_SYMBOL(sockopt_capable); +static int sockopt_validate_clockid(__kernel_clockid_t value) +{ + switch (value) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_TAI: + return 0; + } + return -EINVAL; +} + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -1117,22 +1217,11 @@ int sk_setsockopt(struct sock *sk, int level, int optname, /* handle options which do not require locking the socket. */ switch (optname) { case SO_PRIORITY: - if ((val >= 0 && val <= 6) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (sk_set_prio_allowed(sk, val)) { sock_set_priority(sk, val); return 0; } return -EPERM; - case SO_PASSSEC: - assign_bit(SOCK_PASSSEC, &sock->flags, valbool); - return 0; - case SO_PASSCRED: - assign_bit(SOCK_PASSCRED, &sock->flags, valbool); - return 0; - case SO_PASSPIDFD: - assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); - return 0; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: @@ -1180,6 +1269,8 @@ int sk_setsockopt(struct sock *sk, int level, int optname, return 0; } case SO_TXREHASH: + if (!sk_is_tcp(sk)) + return -EOPNOTSUPP; if (val < -1 || val > 1) return -EINVAL; if ((u8)val == SOCK_TXREHASH_DEFAULT) @@ -1200,6 +1291,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, ret = -EOPNOTSUPP; return ret; } +#ifdef CONFIG_PAGE_POOL + case SO_DEVMEM_DONTNEED: + return sock_devmem_dontneed(sk, optval, optlen); +#endif } sockopt_lock_sock(sk); @@ -1215,7 +1310,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, sk->sk_reuse = (valbool ? 
SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: - sk->sk_reuseport = valbool; + if (valbool && !sk_is_inet(sk)) + ret = -EOPNOTSUPP; + else + sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); @@ -1434,6 +1532,10 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; + case SO_RCVPRIORITY: + sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); + break; + case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; @@ -1450,6 +1552,33 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; + case SO_PASSCRED: + if (sk_may_scm_recv(sk)) + sk->sk_scm_credentials = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSSEC: + if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) + sk->sk_scm_security = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSPIDFD: + if (sk_is_unix(sk)) + sk->sk_scm_pidfd = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSRIGHTS: + if (sk_is_unix(sk)) + sk->sk_scm_rights = valbool; + else + ret = -EOPNOTSUPP; + break; case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); @@ -1497,6 +1626,11 @@ set_sndbuf: ret = -EPERM; break; } + + ret = sockopt_validate_clockid(sk_txtime.clockid); + if (ret) + break; + sock_valbool_flag(sk, SOCK_TXTIME, true); sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = @@ -1741,11 +1875,24 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PASSCRED: - v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); + if (!sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_credentials; break; case SO_PASSPIDFD: - v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_pidfd; + break; + + case SO_PASSRIGHTS: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_rights; break; case SO_PEERCRED: @@ -1767,6 +1914,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, { struct pid *peer_pid; struct file *pidfd_file = NULL; + unsigned int flags = 0; int pidfd; if (len > sizeof(pidfd)) @@ -1779,7 +1927,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, if (!peer_pid) return -ENODATA; - pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); + /* The use of PIDFD_STALE requires stashing of struct pid + * on pidfs with pidfs_register_pid() and only AF_UNIX + * were prepared for this. 
+ */ + if (sk->sk_family == AF_UNIX) + flags = PIDFD_STALE; + + pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); put_pid(peer_pid); if (pidfd < 0) return pidfd; @@ -1842,7 +1997,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PASSSEC: - v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); + if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_security; break; case SO_PEERSEC: @@ -1857,6 +2015,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = sock_flag(sk, SOCK_RCVMARK); break; + case SO_RCVPRIORITY: + v.val = sock_flag(sk, SOCK_RCVPRIORITY); + break; + case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; @@ -1940,7 +2102,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = READ_ONCE(sk->sk_napi_id); /* aggregate non-NAPI IDs down to 0 */ - if (v.val < MIN_NAPI_ID) + if (!napi_id_valid(v.val)) v.val = 0; break; @@ -1986,6 +2148,9 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_TXREHASH: + if (!sk_is_tcp(sk)) + return -EOPNOTSUPP; + /* Paired with WRITE_ONCE() in sk_setsockopt() */ v.val = READ_ONCE(sk->sk_txrehash); break; @@ -2014,6 +2179,8 @@ lenout: */ static inline void sock_lock_init(struct sock *sk) { + sk_owner_clear(sk); + if (sk->sk_kern_sock) sock_lock_init_class_and_name( sk, @@ -2032,7 +2199,7 @@ static inline void sock_lock_init(struct sock *sk) /* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, - * even temporarly, because of RCU lookups. sk_node should also be left as is. + * even temporarily, because of RCU lookups. sk_node should also be left as is. * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end */ static void sock_copy(struct sock *nsk, const struct sock *osk) @@ -2110,6 +2277,9 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) cgroup_sk_free(&sk->sk_cgrp_data); mem_cgroup_sk_free(sk); security_sk_free(sk); + + sk_owner_put(sk); + if (slab != NULL) kmem_cache_free(slab, sk); else @@ -2145,6 +2315,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } else { + net_passive_inc(net); __netns_tracker_alloc(net, &sk->ns_tracker, false, priority); } @@ -2169,6 +2340,7 @@ EXPORT_SYMBOL(sk_alloc); static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); + struct net *net = sock_net(sk); struct sk_filter *filter; if (sk->sk_destruct) @@ -2200,14 +2372,28 @@ static void __sk_destruct(struct rcu_head *head) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - if (likely(sk->sk_net_refcnt)) - put_net_track(sock_net(sk), &sk->ns_tracker); - else - __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); - + if (likely(sk->sk_net_refcnt)) { + put_net_track(net, &sk->ns_tracker); + } else { + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + } sk_prot_free(sk->sk_prot_creator, sk); } +void sk_net_refcnt_upgrade(struct sock *sk) +{ + struct net *net = sock_net(sk); + + WARN_ON_ONCE(sk->sk_net_refcnt); + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); +} +EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); + void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); @@ -2262,7 +2448,12 @@ static void sk_init_common(struct sock *sk) 
lockdep_set_class_and_name(&sk->sk_error_queue.lock, af_elock_keys + sk->sk_family, af_family_elock_key_strings[sk->sk_family]); - lockdep_set_class_and_name(&sk->sk_callback_lock, + if (sk->sk_kern_sock) + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_kern_callback_keys + sk->sk_family, + af_family_kern_clock_key_strings[sk->sk_family]); + else + lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); } @@ -2299,6 +2490,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) * is not properly dismantling its kernel sockets at netns * destroy time. */ + net_passive_inc(sock_net(newsk)); __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } @@ -2351,17 +2543,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; + + goto free; } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); - if (bpf_sk_storage_clone(sk, newsk)) { - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } + if (bpf_sk_storage_clone(sk, newsk)) + goto free; /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. @@ -2391,18 +2580,17 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) net_enable_timestamp(); out: return newsk; -} -EXPORT_SYMBOL_GPL(sk_clone_lock); - -void sk_free_unlock_clone(struct sock *sk) -{ +free: /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - sk->sk_destruct = NULL; - bh_unlock_sock(sk); - sk_free(sk); + * destructor and make plain sk_free() + */ + newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); + sk_free(newsk); + newsk = NULL; + goto out; } -EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +EXPORT_SYMBOL_GPL(sk_clone_lock); static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { @@ -2427,8 +2615,12 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) u32 max_segs = 1; sk->sk_route_caps = dst->dev->features; - if (sk_is_tcp(sk)) + if (sk_is_tcp(sk)) { + struct inet_connection_sock *icsk = inet_csk(sk); + sk->sk_route_caps |= NETIF_F_GSO; + icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); + } if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; if (unlikely(sk->sk_gso_disabled)) @@ -2505,19 +2697,16 @@ void __sock_wfree(struct sk_buff *skb) void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); - skb->sk = sk; #ifdef CONFIG_INET - if (unlikely(!sk_fullsock(sk))) { - skb->destructor = sock_edemux; - sock_hold(sk); - return; - } + if (unlikely(!sk_fullsock(sk))) + return skb_set_owner_edemux(skb, sk); #endif + skb->sk = sk; skb->destructor = sock_wfree; skb_set_hash_from_sk(skb, sk); /* * We used to take a refcount on sk, but following operation - * is enough to guarantee sk_free() wont free this sock until + * is enough to guarantee sk_free() won't free this sock until * all in-flight packets are completed */ refcount_add(skb->truesize, &sk->sk_wmem_alloc); @@ -2526,13 +2715,12 @@ EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { -#ifdef CONFIG_TLS_DEVICE /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. 
*/ - if (skb->decrypted) + if (skb_is_decrypted(skb)) return false; -#endif + return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } @@ -2702,6 +2890,22 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) } EXPORT_SYMBOL(sock_kmalloc); +/* + * Duplicate the input "src" memory block using the socket's + * option memory buffer. + */ +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority) +{ + void *mem; + + mem = sock_kmalloc(sk, size, priority); + if (mem) + memcpy(mem, src, size); + return mem; +} +EXPORT_SYMBOL(sock_kmemdup); + /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. @@ -2811,6 +3015,8 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, { u32 tsflags; + BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); + switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && @@ -2839,10 +3045,33 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, return -EINVAL; sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); break; + case SCM_TS_OPT_ID: + if (sk_is_tcp(sk)) + return -EINVAL; + tsflags = READ_ONCE(sk->sk_tsflags); + if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); + sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; + break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: break; + case SO_PRIORITY: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) + return -EPERM; + sockc->priority = *(u32 *)CMSG_DATA(cmsg); + break; + case SCM_DEVMEM_DMABUF: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); + break; default: return -EINVAL; } @@ -3055,16 +3284,16 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; struct proto *prot = sk->sk_prot; - bool charged = false; + bool charged = true; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); if (memcg) { - if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge())) + charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); + if (!charged) goto suppress_allocation; - charged = true; } /* Under limit. 
*/ @@ -3149,7 +3378,7 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); - if (charged) + if (memcg && charged) mem_cgroup_uncharge_skmem(memcg, amt); return 0; @@ -3242,8 +3471,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2) } EXPORT_SYMBOL(sock_no_socketpair); -int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +int sock_no_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { return -EOPNOTSUPP; } @@ -3338,7 +3567,7 @@ static void sock_def_error_report(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } @@ -3353,7 +3582,7 @@ void sock_def_readable(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } @@ -3373,7 +3602,7 @@ static void sock_def_write_space(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); @@ -3398,7 +3627,7 @@ static void sock_def_write_space_wfree(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } } @@ -3409,7 +3638,7 @@ static void sock_def_destruct(struct sock *sk) void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) - if (send_sigurg(&sk->sk_socket->file->f_owner)) + if (send_sigurg(sk->sk_socket->file)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); @@ -3424,14 +3653,14 @@ EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { - if (del_timer(timer)) + if (timer_delete(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) { - if (del_timer_sync(timer)) + if (timer_delete_sync(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer_sync); @@ -3461,18 +3690,6 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) } sk->sk_uid = uid; - rwlock_init(&sk->sk_callback_lock); - if (sk->sk_kern_sock) - lockdep_set_class_and_name( - &sk->sk_callback_lock, - af_kern_callback_keys + sk->sk_family, - af_family_kern_clock_key_strings[sk->sk_family]); - else - lockdep_set_class_and_name( - &sk->sk_callback_lock, - af_callback_keys + sk->sk_family, - af_family_clock_key_strings[sk->sk_family]); - sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; sk->sk_write_space = sock_def_write_space; @@ -3689,7 +3906,7 @@ EXPORT_SYMBOL(sock_recv_errqueue); * * FIX: POSIX 1003.1g is very ambiguous here. It states that * asynchronous errors should be reported by getsockopt. We assume - * this means if you specify SO_ERROR (otherwise whats the point of it). + * this means if you specify SO_ERROR (otherwise what is the point of it). 
*/ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) @@ -3771,7 +3988,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); - mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); + mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); @@ -3837,7 +4054,7 @@ static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); - if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { + if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } @@ -3848,7 +4065,7 @@ static int assign_proto_idx(struct proto *prot) static void release_proto_idx(struct proto *prot) { - if (prot->inuse_idx != PROTO_INUSE_NR - 1) + if (prot->inuse_idx != PROTO_INUSE_NR) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 654122838025..b23594c767f2 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -18,7 +18,7 @@ static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX]; -static struct sock_diag_inet_compat __rcu *inet_rcv_compat; +static const struct sock_diag_inet_compat __rcu *inet_rcv_compat; static struct workqueue_struct *broadcast_wq; @@ -187,8 +187,7 @@ void sock_diag_broadcast_destroy(struct sock *sk) void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr) { - xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat, - ptr); + xchg(&inet_rcv_compat, RCU_INITIALIZER(ptr)); } EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); @@ -196,8 +195,7 @@ void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr) { const struct sock_diag_inet_compat *old; - old = xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat, - NULL); + old = unrcu_pointer(xchg(&inet_rcv_compat, NULL)); WARN_ON_ONCE(old != ptr); } EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); @@ -266,8 +264,6 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, switch (nlh->nlmsg_type) { case TCPDIAG_GETSOCK: - case DCCPDIAG_GETSOCK: - if (!rcu_access_pointer(inet_rcv_compat)) sock_load_diag_module(AF_INET, 0); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 27d733c0f65e..82a14f131d00 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,8 +24,16 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +/* This mutex is used to + * - protect race between prog/link attach/detach and link prog update, and + * - protect race between releasing and accessing map in bpf_link. + * A single global mutex lock is used since it is expected contention is low. 
+ */ +static DEFINE_MUTEX(sockmap_mutex); + static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); + struct bpf_prog *old, struct bpf_link *link, + u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) @@ -59,55 +67,50 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { - u32 ufd = attr->target_fd; struct bpf_map *map; - struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); - fdput(f); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); return ret; } int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { - u32 ufd = attr->target_fd; struct bpf_prog *prog; struct bpf_map *map; - struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); prog = bpf_prog_get(attr->attach_bpf_fd); - if (IS_ERR(prog)) { - ret = PTR_ERR(prog); - goto put_map; - } + if (IS_ERR(prog)) + return PTR_ERR(prog); if (prog->type != ptype) { ret = -EINVAL; goto put_prog; } - ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); -put_map: - fdput(f); return ret; } @@ -156,6 +159,7 @@ static void sock_map_del_link(struct sock *sk, verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); + break; } } spin_unlock_bh(&psock->link_lock); @@ -299,7 +303,10 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { - ret = sk_psock_init_strp(sk, psock); + if (sk_is_tcp(sk)) + ret = sk_psock_init_strp(sk, psock); + else + ret = -EOPNOTSUPP; if (ret) { write_unlock_bh(&sk->sk_callback_lock); sk_psock_put(sk, psock); @@ -408,12 +415,11 @@ static void *sock_map_lookup_sys(struct bpf_map *map, void *key) static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock **psk) { - struct sock *sk; + struct sock *sk = NULL; int err = 0; spin_lock_bh(&stab->lock); - sk = *psk; - if (!sk_test || sk_test == sk) + if (!sk_test || sk_test == *psk) sk = xchg(psk, NULL); if (likely(sk)) @@ -538,6 +544,9 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; + if (sk_is_vsock(sk) && + (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } @@ -644,6 +653,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) + return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; @@ -672,6 +683,8 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, return SK_DROP; 
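A minimal BPF-side sketch (assumed usage, not from this series) of the redirect path touched above; with the new checks, redirecting an sk_msg to a vsock socket stored in the map now yields SK_DROP.

	/* Sketch of an sk_msg verdict program; the map layout and key used
	 * here are illustrative only.
	 */
	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_SOCKMAP);
		__uint(max_entries, 2);
		__type(key, __u32);
		__type(value, __u64);
	} sock_map SEC(".maps");

	SEC("sk_msg")
	int msg_redir(struct sk_msg_md *msg)
	{
		/* Redirect to the socket stored at index 1; returns SK_PASS on
		 * success, SK_DROP if the target is not allowed (e.g. vsock).
		 */
		return bpf_msg_redirect_map(msg, &sock_map, 1, BPF_F_INGRESS);
	}

	char _license[] SEC("license") = "GPL";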
if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; + if (sk_is_vsock(sk)) + return SK_DROP; msg->flags = flags; msg->sk_redir = sk; @@ -1171,6 +1184,7 @@ static void sock_hash_free(struct bpf_map *map) sock_put(elem->sk); sock_hash_free_elem(htab, elem); } + cond_resched(); } /* wait for psock readers accessing its map link */ @@ -1245,6 +1259,8 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) + return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; @@ -1273,6 +1289,8 @@ BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; + if (sk_is_vsock(sk)) + return SK_DROP; msg->flags = flags; msg->sk_redir = sk; @@ -1454,80 +1472,108 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, - u32 which) +static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + struct bpf_link ***plink, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **cur_pprog; + struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - *pprog = &progs->msg_parser; + cur_pprog = &progs->msg_parser; + cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - *pprog = &progs->stream_parser; + cur_pprog = &progs->stream_parser; + cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - *pprog = &progs->stream_verdict; + cur_pprog = &progs->stream_verdict; + cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; - *pprog = &progs->skb_verdict; + cur_pprog = &progs->skb_verdict; + cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } + *pprog = cur_pprog; + if (plink) + *plink = cur_plink; return 0; } +/* Handle the following four cases: + * prog_attach: prog != NULL, old == NULL, link == NULL + * prog_detach: prog == NULL, old != NULL, link == NULL + * link_attach: prog != NULL, old == NULL, link != NULL + * link_detach: prog == NULL, old != NULL, link != NULL + */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) + struct bpf_prog *old, struct bpf_link *link, + u32 which) { struct bpf_prog **pprog; + struct bpf_link **plink; int ret; - ret = sock_map_prog_lookup(map, &pprog, which); + ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; - if (old) - return psock_replace_prog(pprog, prog, old); + /* for prog_attach/prog_detach/link_attach, return error if a bpf_link + * exists for that prog. 
+ */ + if ((!link || prog) && *plink) + return -EBUSY; - psock_set_prog(pprog, prog); - return 0; + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (!ret) + *plink = NULL; + } else { + psock_set_prog(pprog, prog); + if (link) + *plink = link; + } + + return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd; + u32 prog_cnt = 0, flags = 0; struct bpf_prog **pprog; struct bpf_prog *prog; struct bpf_map *map; - struct fd f; u32 id = 0; int ret; if (attr->query.query_flags) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); rcu_read_lock(); - ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; @@ -1552,7 +1598,6 @@ end: copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) ret = -EFAULT; - fdput(f); return ret; } @@ -1633,19 +1678,23 @@ void sock_map_close(struct sock *sk, long timeout) lock_sock(sk); rcu_read_lock(); - psock = sk_psock_get(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - saved_close = READ_ONCE(sk->sk_prot)->close; - } else { + psock = sk_psock(sk); + if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); + psock = sk_psock_get(sk); + if (unlikely(!psock)) + goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); + } else { + saved_close = READ_ONCE(sk->sk_prot)->close; +no_psock: + rcu_read_unlock(); + release_sock(sk); } /* Make sure we do not recurse. This is a bug. @@ -1657,6 +1706,200 @@ void sock_map_close(struct sock *sk, long timeout) } EXPORT_SYMBOL_GPL(sock_map_close); +struct sockmap_link { + struct bpf_link link; + struct bpf_map *map; + enum bpf_attach_type attach_type; +}; + +static void sock_map_link_release(struct bpf_link *link) +{ + struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + + mutex_lock(&sockmap_mutex); + if (!sockmap_link->map) + goto out; + + WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, + sockmap_link->attach_type)); + + bpf_map_put_with_uref(sockmap_link->map); + sockmap_link->map = NULL; +out: + mutex_unlock(&sockmap_mutex); +} + +static int sock_map_link_detach(struct bpf_link *link) +{ + sock_map_link_release(link); + return 0; +} + +static void sock_map_link_dealloc(struct bpf_link *link) +{ + kfree(link); +} + +/* Handle the following two cases: + * case 1: link != NULL, prog != NULL, old != NULL + * case 2: link != NULL, prog != NULL, old == NULL + */ +static int sock_map_link_update_prog(struct bpf_link *link, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + struct bpf_prog **pprog, *old_link_prog; + struct bpf_link **plink; + int ret = 0; + + mutex_lock(&sockmap_mutex); + + /* If old prog is not NULL, ensure old prog is the same as link->prog. */ + if (old && link->prog != old) { + ret = -EPERM; + goto out; + } + /* Ensure link->prog has the same type/attach_type as the new prog. 
*/ + if (link->prog->type != prog->type || + link->prog->expected_attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + if (!sockmap_link->map) { + ret = -ENOLINK; + goto out; + } + + ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, + sockmap_link->attach_type); + if (ret) + goto out; + + /* return error if the stored bpf_link does not match the incoming bpf_link. */ + if (link != *plink) { + ret = -EBUSY; + goto out; + } + + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (ret) + goto out; + } else { + psock_set_prog(pprog, prog); + } + + bpf_prog_inc(prog); + old_link_prog = xchg(&link->prog, prog); + bpf_prog_put(old_link_prog); + +out: + mutex_unlock(&sockmap_mutex); + return ret; +} + +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) +{ + u32 map_id = 0; + + mutex_lock(&sockmap_mutex); + if (sockmap_link->map) + map_id = sockmap_link->map->id; + mutex_unlock(&sockmap_mutex); + return map_id; +} + +static int sock_map_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + info->sockmap.map_id = map_id; + info->sockmap.attach_type = sockmap_link->attach_type; + return 0; +} + +static void sock_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + seq_printf(seq, "map_id:\t%u\n", map_id); + seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); +} + +static const struct bpf_link_ops sock_map_link_ops = { + .release = sock_map_link_release, + .dealloc = sock_map_link_dealloc, + .detach = sock_map_link_detach, + .update_prog = sock_map_link_update_prog, + .fill_link_info = sock_map_link_fill_info, + .show_fdinfo = sock_map_link_show_fdinfo, +}; + +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct sockmap_link *sockmap_link; + enum bpf_attach_type attach_type; + struct bpf_map *map; + int ret; + + if (attr->link_create.flags) + return -EINVAL; + + map = bpf_map_get_with_uref(attr->link_create.target_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { + ret = -EINVAL; + goto out; + } + + sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); + if (!sockmap_link) { + ret = -ENOMEM; + goto out; + } + + attach_type = attr->link_create.attach_type; + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); + sockmap_link->map = map; + sockmap_link->attach_type = attach_type; + + ret = bpf_link_prime(&sockmap_link->link, &link_primer); + if (ret) { + kfree(sockmap_link); + goto out; + } + + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); + mutex_unlock(&sockmap_mutex); + if (ret) { + bpf_link_cleanup(&link_primer); + goto out; + } + + /* Increase refcnt for the prog since when old prog is replaced with + * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. + * + * Actually, we do not need to increase refcnt for the prog since bpf_link + * will hold a reference. But in order to have less complexity w.r.t. + * replacing/setting prog, let us increase the refcnt to make things simpler. 
+ */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out: + bpf_map_put_with_uref(map); + return ret; +} + static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 5a165286e4d8..4211710393a8 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -173,10 +173,9 @@ static bool __reuseport_detach_closed_sock(struct sock *sk, static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { - unsigned int size = sizeof(struct sock_reuseport) + - sizeof(struct sock *) * max_socks; - struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC); + struct sock_reuseport *reuse; + reuse = kzalloc(struct_size(reuse, socks, max_socks), GFP_ATOMIC); if (!reuse) return NULL; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 6973dda3abda..5dbb2c6f371d 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@ #include <net/busy_poll.h> #include <net/pkt_sched.h> #include <net/hotdata.h> +#include <net/proto_memory.h> #include <net/rps.h> #include "dev.h" @@ -33,6 +34,7 @@ static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; +static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -50,29 +52,45 @@ int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS) -static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, - struct cpumask *mask) +static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, + struct cpumask *mask) { - char kbuf[128]; + char *kbuf; int len; if (*ppos || !*lenp) { *lenp = 0; - return; + return 0; + } + + /* CPUs are displayed as a hex bitmap + a comma between each groups of 8 + * nibbles (except the last one which has a newline instead). + * Guesstimate the buffer size at the group granularity level. + */ + len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp); + kbuf = kmalloc(len, GFP_KERNEL); + if (!kbuf) { + *lenp = 0; + return -ENOMEM; } - len = min(sizeof(kbuf) - 1, *lenp); len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); if (!len) { *lenp = 0; - return; + goto free_buf; } - if (len < *lenp) - kbuf[len++] = '\n'; + /* scnprintf writes a trailing null char not counted in the returned + * length, override it with a newline. + */ + kbuf[len++] = '\n'; memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; + +free_buf: + kfree(kbuf); + return 0; } #endif @@ -94,7 +112,7 @@ static struct cpumask *rps_default_mask_cow_alloc(struct net *net) return rps_default_mask; } -static int rps_default_mask_sysctl(struct ctl_table *table, int write, +static int rps_default_mask_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)table->data; @@ -116,8 +134,8 @@ static int rps_default_mask_sysctl(struct ctl_table *table, int write, if (err) goto done; } else { - dump_cpumask(buffer, lenp, ppos, - net->core.rps_default_mask ? : cpu_none_mask); + err = dump_cpumask(buffer, lenp, ppos, + net->core.rps_default_mask ? 
: cpu_none_mask); } done: @@ -125,7 +143,7 @@ done: return err; } -static int rps_sock_flow_sysctl(struct ctl_table *table, int write, +static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; @@ -183,7 +201,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - kvfree_rcu_mightsleep(orig_sock_table); + kvfree_rcu(orig_sock_table, rcu); } } } @@ -197,7 +215,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_FLOW_LIMIT static DEFINE_MUTEX(flow_limit_update_mutex); -static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, +static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; @@ -221,7 +239,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - kfree_rcu_mightsleep(cur); + kfree_rcu(cur, rcu); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); @@ -230,7 +248,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, ret = -ENOMEM; goto write_unlock; } - cur->num_buckets = netdev_flow_limit_table_len; + cur->log_buckets = ilog2(netdev_flow_limit_table_len); rcu_assign_pointer(sd->flow_limit, cur); } } @@ -246,7 +264,7 @@ write_unlock: } rcu_read_unlock(); - dump_cpumask(buffer, lenp, ppos, mask); + ret = dump_cpumask(buffer, lenp, ppos, mask); } done: @@ -254,7 +272,7 @@ done: return ret; } -static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, +static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; @@ -276,7 +294,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_SCHED -static int set_default_qdisc(struct ctl_table *table, int write, +static int set_default_qdisc(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; @@ -295,14 +313,14 @@ static int set_default_qdisc(struct ctl_table *table, int write, } #endif -static int proc_do_dev_weight(struct ctl_table *table, int write, +static int proc_do_dev_weight(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { static DEFINE_MUTEX(dev_weight_mutex); int ret, weight; mutex_lock(&dev_weight_mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); @@ -313,7 +331,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, return ret; } -static int proc_do_rss_key(struct ctl_table *table, int write, +static int proc_do_rss_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; @@ -326,7 +344,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write, } #ifdef CONFIG_BPF_JIT -static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, +static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write, void *buffer, size_t 
*lenp, loff_t *ppos) { @@ -359,7 +377,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, # ifdef CONFIG_HAVE_EBPF_JIT static int -proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, +proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) @@ -370,7 +388,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, # endif /* CONFIG_HAVE_EBPF_JIT */ static int -proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, +proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) @@ -382,40 +400,8 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, static struct ctl_table net_core_table[] = { { - .procname = "wmem_max", - .data = &sysctl_wmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_max", - .data = &sysctl_rmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, - }, - { - .procname = "wmem_default", - .data = &sysctl_wmem_default, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_default", - .data = &sysctl_rmem_default, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, - }, - { .procname = "mem_pcpu_rsv", - .data = &sysctl_mem_pcpu_rsv, + .data = &net_hotdata.sysctl_mem_pcpu_rsv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -427,6 +413,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_rx_bias", @@ -434,6 +421,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_tx_bias", @@ -441,6 +429,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "netdev_max_backlog", @@ -522,15 +511,6 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "tstamp_allow_data", - .data = &sysctl_tstamp_allow_data, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE - }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", @@ -595,7 +575,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "max_skb_frags", - .data = &sysctl_max_skb_frags, + .data = &net_hotdata.sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -608,7 +588,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = &netdev_budget_usecs_min, }, { .procname = "fb_tunnels_only_for_init_net", @@ -654,13 +634,12 @@ static struct ctl_table net_core_table[] = { }, { .procname = "skb_defer_max", - .data = &sysctl_skb_defer_max, + .data = &net_hotdata.sysctl_skb_defer_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { 
} }; static struct ctl_table netns_core_table[] = { @@ -697,7 +676,50 @@ static struct ctl_table netns_core_table[] = { .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, - { } + { + .procname = "tstamp_allow_data", + .data = &init_net.core.sysctl_tstamp_allow_data, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + /* sysctl_core_net_init() will set the values after this + * to readonly in network namespaces + */ + { + .procname = "wmem_max", + .data = &sysctl_wmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_max", + .data = &sysctl_rmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, + { + .procname = "wmem_default", + .data = &sysctl_wmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_default", + .data = &sysctl_rmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) @@ -715,20 +737,27 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl, *tmp; + size_t table_size = ARRAY_SIZE(netns_core_table); + struct ctl_table *tbl; tbl = netns_core_table; if (!net_eq(net, &init_net)) { + int i; tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; - for (tmp = tbl; tmp->procname; tmp++) - tmp->data += (char *)net - (char *)&init_net; + for (i = 0; i < table_size; ++i) { + if (tbl[i].data == &sysctl_wmem_max) + break; + + tbl[i].data += (char *)net - (char *)&init_net; + } + for (; i < table_size; ++i) + tbl[i].mode &= ~0222; } - net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, - ARRAY_SIZE(netns_core_table)); + net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size); if (net->core.sysctl_hdr == NULL) goto err_reg; @@ -743,7 +772,7 @@ err_dup: static __net_exit void sysctl_core_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 04840697fe79..a50a7ef49ae8 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -9,6 +9,7 @@ #include <linux/ptp_classify.h> #include <linux/skbuff.h> #include <linux/export.h> +#include <linux/ptp_clock_kernel.h> static unsigned int classify(const struct sk_buff *skb) { @@ -21,18 +22,39 @@ static unsigned int classify(const struct sk_buff *skb) void skb_clone_tx_timestamp(struct sk_buff *skb) { + struct hwtstamp_provider *hwprov; struct mii_timestamper *mii_ts; + struct phy_device *phydev; struct sk_buff *clone; unsigned int type; - if (!skb->sk) + if (!skb->sk || !skb->dev) return; + rcu_read_lock(); + hwprov = rcu_dereference(skb->dev->hwprov); + if (hwprov) { + if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB || + !hwprov->phydev) { + rcu_read_unlock(); + return; + } + + phydev = hwprov->phydev; + } else { + phydev = skb->dev->phydev; + if (!phy_is_default_hwtstamp(phydev)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + type = classify(skb); if (type == PTP_CLASS_NONE) return; - mii_ts 
= skb->dev->phydev->mii_ts; + mii_ts = phydev->mii_ts; if (likely(mii_ts->txtstamp)) { clone = skb_clone_sk(skb); if (!clone) @@ -44,12 +66,33 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); bool skb_defer_rx_timestamp(struct sk_buff *skb) { + struct hwtstamp_provider *hwprov; struct mii_timestamper *mii_ts; + struct phy_device *phydev; unsigned int type; - if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts) + if (!skb->dev) return false; + rcu_read_lock(); + hwprov = rcu_dereference(skb->dev->hwprov); + if (hwprov) { + if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB || + !hwprov->phydev) { + rcu_read_unlock(); + return false; + } + + phydev = hwprov->phydev; + } else { + phydev = skb->dev->phydev; + if (!phy_is_default_hwtstamp(phydev)) { + rcu_read_unlock(); + return false; + } + } + rcu_read_unlock(); + if (skb_headroom(skb) < ETH_HLEN) return false; @@ -62,7 +105,7 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) if (type == PTP_CLASS_NONE) return false; - mii_ts = skb->dev->phydev->mii_ts; + mii_ts = phydev->mii_ts; if (likely(mii_ts->rxtstamp)) return mii_ts->rxtstamp(mii_ts, skb, type); diff --git a/net/core/tso.c b/net/core/tso.c index e00796e3b146..6df997b9076e 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -3,7 +3,7 @@ #include <linux/if_vlan.h> #include <net/ip.h> #include <net/tso.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last) diff --git a/net/core/utils.c b/net/core/utils.c index c994e95172ac..5e63b0ea21f3 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Generic address resultion entity + * Generic address resolution entity * * Authors: * net_random Alan Cox @@ -399,9 +399,9 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, } EXPORT_SYMBOL(inet_pton_with_scope); -bool inet_addr_is_any(struct sockaddr *addr) +bool inet_addr_is_any(struct sockaddr_storage *addr) { - if (addr->sa_family == AF_INET6) { + if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr; const struct sockaddr_in6 in6_any = { .sin6_addr = IN6ADDR_ANY_INIT }; @@ -409,13 +409,13 @@ bool inet_addr_is_any(struct sockaddr *addr) if (!memcmp(in6->sin6_addr.s6_addr, in6_any.sin6_addr.s6_addr, 16)) return true; - } else if (addr->sa_family == AF_INET) { + } else if (addr->ss_family == AF_INET) { struct sockaddr_in *in = (struct sockaddr_in *)addr; if (in->sin_addr.s_addr == htonl(INADDR_ANY)) return true; } else { - pr_warn("unexpected address family %u\n", addr->sa_family); + pr_warn("unexpected address family %u\n", addr->ss_family); } return false; @@ -473,11 +473,11 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, EXPORT_SYMBOL(inet_proto_csum_replace16); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, - __wsum diff, bool pseudohdr) + __wsum diff, bool pseudohdr, bool ipv6) { if (skb->ip_summed != CHECKSUM_PARTIAL) { csum_replace_by_diff(sum, diff); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr && !ipv6) skb->csum = ~csum_sub(diff, skb->csum); } else if (pseudohdr) { *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); diff --git a/net/core/xdp.c b/net/core/xdp.c index 41693154e426..491334b9b8be 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include <net/page_pool/helpers.h> #include <net/hotdata.h> +#include <net/netdev_lock.h> 
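For reference, a hypothetical in-kernel caller showing the reworked inet_addr_is_any() prototype from the utils.c hunk above, which now takes struct sockaddr_storage directly instead of a cast to struct sockaddr.

	#include <linux/types.h>
	#include <linux/socket.h>
	#include <linux/inet.h>

	/* Hypothetical caller, for illustration only. */
	static bool bound_to_wildcard(struct sockaddr_storage *ss)
	{
		/* true for 0.0.0.0 (AF_INET) or :: (AF_INET6); other address
		 * families log a warning and return false.
		 */
		return inet_addr_is_any(ss);
	}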
#include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ #include <trace/events/xdp.h> @@ -127,10 +128,8 @@ void xdp_unreg_mem_model(struct xdp_mem_info *mem) return; if (type == MEM_TYPE_PAGE_POOL) { - rcu_read_lock(); - xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); page_pool_destroy(xa->page_pool); - rcu_read_unlock(); } } EXPORT_SYMBOL_GPL(xdp_unreg_mem_model); @@ -188,7 +187,6 @@ int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; - xdp_rxq->napi_id = napi_id; xdp_rxq->frag_size = frag_size; xdp_rxq->reg_state = REG_STATE_REGISTERED; @@ -295,10 +293,8 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, mutex_lock(&mem_id_lock); ret = __mem_id_init_hash_table(); mutex_unlock(&mem_id_lock); - if (ret < 0) { - WARN_ON(1); + if (ret < 0) return ERR_PTR(ret); - } } xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); @@ -362,6 +358,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); + if (type == MEM_TYPE_XSK_BUFF_POOL && allocator) + xsk_pool_set_rxq_info(allocator, xdp_rxq); + if (trace_mem_connect_enabled() && xdp_alloc) trace_mem_connect(xdp_alloc, xdp_rxq); return 0; @@ -369,33 +368,87 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); +/** + * xdp_reg_page_pool - register &page_pool as a memory provider for XDP + * @pool: &page_pool to register + * + * Can be used to register pools manually without connecting to any XDP RxQ + * info, so that the XDP layer will be aware of them. Then, they can be + * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool(). + * + * Return: %0 on success, -errno on error. + */ +int xdp_reg_page_pool(struct page_pool *pool) +{ + struct xdp_mem_info mem; + + return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool); +} +EXPORT_SYMBOL_GPL(xdp_reg_page_pool); + +/** + * xdp_unreg_page_pool - unregister &page_pool from the memory providers list + * @pool: &page_pool to unregister + * + * A shorthand for manual unregistering page pools. If the pool was previously + * attached to an RxQ info, it must be detached first. + */ +void xdp_unreg_page_pool(const struct page_pool *pool) +{ + struct xdp_mem_info mem = { + .type = MEM_TYPE_PAGE_POOL, + .id = pool->xdp_mem_id, + }; + + xdp_unreg_mem_model(&mem); +} +EXPORT_SYMBOL_GPL(xdp_unreg_page_pool); + +/** + * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info + * @xdp_rxq: XDP RxQ info to attach the pool to + * @pool: pool to attach + * + * If the pool was registered manually, this function must be called instead + * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info. + */ +void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, + const struct page_pool *pool) +{ + struct xdp_mem_info mem = { + .type = MEM_TYPE_PAGE_POOL, + .id = pool->xdp_mem_id, + }; + + xdp_rxq_info_attach_mem_model(xdp_rxq, &mem); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool); + /* XDP RX runs under NAPI protection, and in different delivery error * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. 
*/ -void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - struct xdp_buff *xdp) +void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, + bool napi_direct, struct xdp_buff *xdp) { - struct page *page; - - switch (mem->type) { + switch (mem_type) { case MEM_TYPE_PAGE_POOL: - page = virt_to_head_page(data); + netmem = netmem_compound_head(netmem); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) - * as mem->type knows this a page_pool page + /* No need to check netmem_is_pp() as mem->type knows this a + * page_pool page */ - page_pool_put_full_page(page->pp, page, napi_direct); + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, + napi_direct); break; case MEM_TYPE_PAGE_SHARED: - page_frag_free(data); + page_frag_free(__netmem_address(netmem)); break; case MEM_TYPE_PAGE_ORDER0: - page = virt_to_page(data); /* Assumes order0 page*/ - put_page(page); + put_page(__netmem_to_page(netmem)); break; case MEM_TYPE_XSK_BUFF_POOL: /* NB! Only valid from an xdp_buff! */ @@ -403,7 +456,7 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ - WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); + WARN(1, "Incorrect XDP memory type (%d) usage", mem_type); break; } } @@ -411,38 +464,34 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; - int i; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, + false, NULL); - __xdp_return(page_address(page), &xdpf->mem, false, NULL); - } out: - __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; - int i; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, + true, NULL); - __xdp_return(page_address(page), &xdpf->mem, true, NULL); - } out: - __xdp_return(xdpf->data, &xdpf->mem, true, NULL); + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); @@ -456,46 +505,19 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); * xdp_frame_bulk is usually stored/allocated on the function * call-stack to avoid locking penalties. 
*/ -void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) -{ - struct xdp_mem_allocator *xa = bq->xa; - - if (unlikely(!xa || !bq->count)) - return; - - page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count); - /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ - bq->count = 0; -} -EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk); /* Must be called with rcu_read_lock held */ void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq) { - struct xdp_mem_info *mem = &xdpf->mem; - struct xdp_mem_allocator *xa; - - if (mem->type != MEM_TYPE_PAGE_POOL) { + if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) { xdp_return_frame(xdpf); return; } - xa = bq->xa; - if (unlikely(!xa)) { - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - bq->count = 0; - bq->xa = xa; - } - if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); - if (unlikely(mem->id != xa->mem.id)) { - xdp_flush_frame_bulk(bq); - bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - } - if (unlikely(xdp_frame_has_frags(xdpf))) { struct skb_shared_info *sinfo; int i; @@ -504,31 +526,40 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf, for (i = 0; i < sinfo->nr_frags; i++) { skb_frag_t *frag = &sinfo->frags[i]; - bq->q[bq->count++] = skb_frag_address(frag); + bq->q[bq->count++] = skb_frag_netmem(frag); if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); } } - bq->q[bq->count++] = xdpf->data; + bq->q[bq->count++] = virt_to_netmem(xdpf->data); } EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); +/** + * xdp_return_frag -- free one XDP frag or decrement its refcount + * @netmem: network memory reference to release + * @xdp: &xdp_buff to release the frag for + */ +void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp) +{ + __xdp_return(netmem, xdp->rxq->mem.type, true, NULL); +} +EXPORT_SYMBOL_GPL(xdp_return_frag); + void xdp_return_buff(struct xdp_buff *xdp) { struct skb_shared_info *sinfo; - int i; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), + xdp->rxq->mem.type, true, xdp); - __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp); - } out: - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); + __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); @@ -574,7 +605,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->headroom = 0; xdpf->metasize = metasize; xdpf->frame_sz = PAGE_SIZE; - xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + xdpf->mem_type = MEM_TYPE_PAGE_ORDER0; xsk_buff_free(xdp); return xdpf; @@ -588,15 +619,177 @@ void xdp_warn(const char *msg, const char *func, const int line) }; EXPORT_SYMBOL_GPL(xdp_warn); -int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) +/** + * xdp_build_skb_from_buff - create an skb from &xdp_buff + * @xdp: &xdp_buff to convert to an skb + * + * Perform common operations to create a new skb to pass up the stack from + * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize + * skb data pointers and offsets, set the recycle bit if the buff is + * PP-backed, Rx queue index, protocol and update frags info. + * + * Return: new &sk_buff on success, %NULL on error. 
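/*
 * Illustrative sketch, not part of this patch: a batched Tx-completion
 * cleanup using the reworked bulk API (reusing the hypothetical foo_tx_ring
 * from the previous sketch). The bulk queue now carries netmem references
 * and no longer caches an &xdp_mem_allocator, so the caller does no mem.id
 * bookkeeping: it only initializes the queue, feeds it frames and flushes it
 * at the end, still under rcu_read_lock().
 */
static void foo_clean_xdp_tx_bulk(struct foo_tx_ring *ring, unsigned int done)
{
	struct xdp_frame_bulk bq;

	xdp_frame_bulk_init(&bq);

	rcu_read_lock();

	for (unsigned int i = 0; i < done; i++)
		xdp_return_frame_bulk(ring->xdpf[i], &bq);

	xdp_flush_frame_bulk(&bq);
	rcu_read_unlock();
}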
+ */ +struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) { - n_skb = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, n_skb, skbs); - if (unlikely(!n_skb)) - return -ENOMEM; + const struct xdp_rxq_info *rxq = xdp->rxq; + const struct skb_shared_info *sinfo; + struct sk_buff *skb; + u32 nr_frags = 0; + int metalen; - return 0; + if (unlikely(xdp_buff_has_frags(xdp))) { + sinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = sinfo->nr_frags; + } + + skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, xdp->data - xdp->data_hard_start); + __skb_put(skb, xdp->data_end - xdp->data); + + metalen = xdp->data - xdp->data_meta; + if (metalen > 0) + skb_metadata_set(skb, metalen); + + if (rxq->mem.type == MEM_TYPE_PAGE_POOL) + skb_mark_for_recycle(skb); + + skb_record_rx_queue(skb, rxq->queue_index); + + if (unlikely(nr_frags)) { + u32 tsize; + + tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz; + xdp_update_skb_shared_info(skb, nr_frags, + sinfo->xdp_frags_size, tsize, + xdp_buff_is_frag_pfmemalloc(xdp)); + } + + skb->protocol = eth_type_trans(skb, rxq->dev); + + return skb; +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff); + +/** + * xdp_copy_frags_from_zc - copy frags from XSk buff to skb + * @skb: skb to copy frags to + * @xdp: XSk &xdp_buff from which the frags will be copied + * @pp: &page_pool backing page allocation, if available + * + * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack. + * Allocate a new buffer for each frag, copy it and attach to the skb. + * + * Return: true on success, false on netmem allocation fail. + */ +static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, + const struct xdp_buff *xdp, + struct page_pool *pp) +{ + struct skb_shared_info *sinfo = skb_shinfo(skb); + const struct skb_shared_info *xinfo; + u32 nr_frags, tsize = 0; + bool pfmemalloc = false; + + xinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = xinfo->nr_frags; + + for (u32 i = 0; i < nr_frags; i++) { + const skb_frag_t *frag = &xinfo->frags[i]; + u32 len = skb_frag_size(frag); + u32 offset, truesize = len; + struct page *page; + + page = page_pool_dev_alloc(pp, &offset, &truesize); + if (unlikely(!page)) { + sinfo->nr_frags = i; + return false; + } + + memcpy(page_address(page) + offset, skb_frag_address(frag), + LARGEST_ALIGN(len)); + __skb_fill_page_desc_noacc(sinfo, i, page, offset, len); + + tsize += truesize; + pfmemalloc |= page_is_pfmemalloc(page); + } + + xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size, + tsize, pfmemalloc); + + return true; } -EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); + +/** + * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff + * @xdp: source XSk buff + * + * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb + * head, new buffer for the head, copy the data and initialize the skb fields. + * If there are frags, allocate new buffers for them and copy. + * Buffers are allocated from the system percpu pools to try recycling them. + * If new skb was built successfully, @xdp is returned to XSk pool's freelist. + * On error, it remains untouched and the caller must take care of this. + * + * Return: new &sk_buff on success, %NULL on error. 
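/*
 * Illustrative sketch, not part of this patch: an Rx path consuming the new
 * xdp_build_skb_from_buff() after XDP_PASS instead of open-coding the skb
 * setup. Per the kernel-doc above, protocol, Rx queue recording and
 * page_pool recycling are already handled by the helper. foo_* identifiers
 * are hypothetical.
 */
static void foo_rx_xdp_pass(struct napi_struct *napi, struct xdp_buff *xdp)
{
	struct sk_buff *skb;

	skb = xdp_build_skb_from_buff(xdp);
	if (unlikely(!skb)) {
		/* head allocation failed; recycle the buffer and its frags */
		xdp_return_buff(xdp);
		return;
	}

	napi_gro_receive(napi, skb);
}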
+ */ +struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) +{ + const struct xdp_rxq_info *rxq = xdp->rxq; + u32 len = xdp->data_end - xdp->data_meta; + u32 truesize = xdp->frame_sz; + struct sk_buff *skb = NULL; + struct page_pool *pp; + int metalen; + void *data; + + if (!IS_ENABLED(CONFIG_PAGE_POOL)) + return NULL; + + local_lock_nested_bh(&system_page_pool.bh_lock); + pp = this_cpu_read(system_page_pool.pool); + data = page_pool_dev_alloc_va(pp, &truesize); + if (unlikely(!data)) + goto out; + + skb = napi_build_skb(data, truesize); + if (unlikely(!skb)) { + page_pool_free_va(pp, data, true); + goto out; + } + + skb_mark_for_recycle(skb); + skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); + + memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len)); + + metalen = xdp->data - xdp->data_meta; + if (metalen > 0) { + skb_metadata_set(skb, metalen); + __skb_pull(skb, metalen); + } + + skb_record_rx_queue(skb, rxq->queue_index); + + if (unlikely(xdp_buff_has_frags(xdp)) && + unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) { + napi_consume_skb(skb, true); + skb = NULL; + goto out; + } + + xsk_buff_free(xdp); + + skb->protocol = eth_type_trans(skb, rxq->dev); + +out: + local_unlock_nested_bh(&system_page_pool.bh_lock); + return skb; +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, @@ -644,7 +837,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, * - RX ring dev queue index (skb_record_rx_queue) */ - if (xdpf->mem.type == MEM_TYPE_PAGE_POOL) + if (xdpf->mem_type == MEM_TYPE_PAGE_POOL) skb_mark_for_recycle(skb); /* Allow SKB to reuse area used by xdp_frame */ @@ -691,8 +884,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) nxdpf = addr; nxdpf->data = addr + headroom; nxdpf->frame_sz = PAGE_SIZE; - nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0; - nxdpf->mem.id = 0; + nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0; return nxdpf; } @@ -805,34 +997,60 @@ static int __init xdp_metadata_init(void) } late_initcall(xdp_metadata_init); -void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val) { val &= NETDEV_XDP_ACT_MASK; if (dev->xdp_features == val) return; + netdev_assert_locked_or_invisible(dev); dev->xdp_features = val; if (dev->reg_state == NETREG_REGISTERED) call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); } +EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked); + +void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +{ + netdev_lock(dev); + xdp_set_features_flag_locked(dev, val); + netdev_unlock(dev); +} EXPORT_SYMBOL_GPL(xdp_set_features_flag); -void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg) { xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT); if (support_sg) val |= NETDEV_XDP_ACT_NDO_XMIT_SG; - xdp_set_features_flag(dev, val); + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked); + +void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +{ + netdev_lock(dev); + xdp_features_set_redirect_target_locked(dev, support_sg); + netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target); -void xdp_features_clear_redirect_target(struct net_device *dev) +void xdp_features_clear_redirect_target_locked(struct net_device *dev) { xdp_features_t val = 
dev->xdp_features; val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG); - xdp_set_features_flag(dev, val); + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked); + +void xdp_features_clear_redirect_target(struct net_device *dev) +{ + netdev_lock(dev); + xdp_features_clear_redirect_target_locked(dev); + netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target);
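/*
 * Illustrative sketch, not part of this patch: with the _locked variants
 * added above, code that already runs under the netdev instance lock calls
 * them directly, while code that does not hold the lock keeps using the
 * plain helpers, which now take netdev_lock()/netdev_unlock() internally.
 * foo_* identifiers are hypothetical.
 */
static void foo_announce_xdp_caps(struct net_device *dev, bool zc_ready)
{
	xdp_features_t val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT;

	if (zc_ready)
		val |= NETDEV_XDP_ACT_XSK_ZEROCOPY;

	/* not holding the instance lock here: the plain helper locks for us */
	xdp_set_features_flag(dev, val);
}

static void foo_enable_xdp_tx(struct net_device *dev)
{
	netdev_lock(dev);

	/* already under the instance lock: must use the _locked variant */
	xdp_features_set_redirect_target_locked(dev, true);

	netdev_unlock(dev);
}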