Diffstat (limited to 'net/core')
69 files changed, 19613 insertions, 19962 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 5857cec87b83..9ef2099c5426 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,21 +9,26 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ +obj-y += dev.o dev_api.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o xdp.o flow_offload.o gro.o + fib_notifier.o xdp.o flow_offload.o gro.o \ + netdev-genl.o netdev-genl-gen.o gso.o obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o -obj-$(CONFIG_PAGE_POOL) += page_pool.o +obj-y += hotdata.o +obj-y += netdev_rx_queue.o +obj-y += netdev_queues.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o @@ -33,10 +38,13 @@ obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o -obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o +obj-$(CONFIG_NET_TEST) += net_test.o +obj-$(CONFIG_NET_DEVMEM) += devmem.o +obj-$(CONFIG_DEBUG_NET) += lock_debug.o +obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index bb378c33f542..850dd736ccd1 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,7 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), true); + bpf_selem_unlink(SELEM(sdata), false); return 0; } @@ -49,22 +49,15 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; - bool free_sk_storage = false; - rcu_read_lock(); + rcu_read_lock_dont_migrate(); sk_storage = rcu_dereference(sk->sk_bpf_storage); - if (!sk_storage) { - rcu_read_unlock(); - return; - } - - raw_spin_lock_bh(&sk_storage->lock); - free_sk_storage = bpf_local_storage_unlink_nolock(sk_storage); - raw_spin_unlock_bh(&sk_storage->lock); - rcu_read_unlock(); + if (!sk_storage) + goto out; - if (free_sk_storage) - kfree_rcu(sk_storage, rcu); + bpf_local_storage_destroy(sk_storage); +out: + rcu_read_unlock_migrate(); } static void bpf_sk_storage_map_free(struct bpf_map *map) @@ -74,7 +67,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) { - return bpf_local_storage_map_alloc(attr, &sk_cache); + return bpf_local_storage_map_alloc(attr, &sk_cache, false); } static int notsupp_get_next_key(struct bpf_map *map, void *key, @@ -100,8 +93,8 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) return ERR_PTR(err); } -static int bpf_fd_sk_storage_update_elem(struct 
bpf_map *map, void *key, - void *value, u64 map_flags) +static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct socket *sock; @@ -112,7 +105,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, if (sock) { sdata = bpf_local_storage_update( sock->sk, (struct bpf_local_storage_map *)map, value, - map_flags, GFP_ATOMIC); + map_flags, false, GFP_ATOMIC); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } @@ -120,7 +113,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, return err; } -static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) { struct socket *sock; int fd, err; @@ -143,7 +136,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, { struct bpf_local_storage_elem *copy_selem; - copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, false, GFP_ATOMIC); if (!copy_selem) return NULL; @@ -166,7 +159,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage || hlist_empty(&sk_storage->list)) @@ -203,7 +196,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); if (ret) { - kfree(copy_selem); + bpf_selem_free(copy_selem, true); atomic_sub(smap->elem_size, &newsk->sk_omem_alloc); bpf_map_put(map); @@ -217,7 +210,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } out: - rcu_read_unlock(); + rcu_read_unlock_migrate(); /* In case of an error, don't free anything explicitly here, the * caller is responsible to call bpf_sk_storage_free. @@ -249,7 +242,7 @@ BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, refcount_inc_not_zero(&sk->sk_refcnt)) { sdata = bpf_local_storage_update( sk, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. 
*/ @@ -281,9 +274,10 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { - int optmem_max = READ_ONCE(sysctl_optmem_max); struct sock *sk = (struct sock *)owner; + int optmem_max; + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); /* same check as in sock_kmalloc() */ if (size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { @@ -324,6 +318,7 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_local_storage_charge = bpf_sk_storage_charge, .map_local_storage_uncharge = bpf_sk_storage_uncharge, .map_owner_storage_ptr = bpf_sk_storage_ptr, + .map_mem_usage = bpf_local_storage_map_mem_usage, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { @@ -356,11 +351,6 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) { - const struct btf *btf_vmlinux; - const struct btf_type *t; - const char *tname; - u32 btf_id; - if (prog->aux->dst_prog) return false; @@ -375,13 +365,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: - btf_vmlinux = bpf_get_btf_vmlinux(); - if (IS_ERR_OR_NULL(btf_vmlinux)) - return false; - btf_id = prog->aux->attach_btf_id; - t = btf_type_by_id(btf_vmlinux, btf_id); - tname = btf_name_by_offset(btf_vmlinux, t->name_off); - return !!strncmp(tname, "bpf_sk_storage", + return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: return false; @@ -417,7 +401,7 @@ const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, @@ -429,7 +413,7 @@ const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .allowed = bpf_sk_storage_tracing_allowed, }; @@ -500,24 +484,22 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) if (!bpf_capable()) return ERR_PTR(-EPERM); - nla_for_each_nested(nla, nla_stgs, rem) { - if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) - nr_maps++; + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + if (nla_len(nla) != sizeof(u32)) + return ERR_PTR(-EINVAL); + nr_maps++; } diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); if (!diag) return ERR_PTR(-ENOMEM); - nla_for_each_nested(nla, nla_stgs, rem) { - struct bpf_map *map; - int map_fd; - - if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) - continue; + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + int map_fd = nla_get_u32(nla); + struct bpf_map *map = bpf_map_get(map_fd); - map_fd = nla_get_u32(nla); - map = bpf_map_get(map_fd); if (IS_ERR(map)) { err = PTR_ERR(map); goto err_free; diff --git a/net/core/datagram.c b/net/core/datagram.c index e4ff2db40c98..c285c6465923 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -50,8 +50,9 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/pagemap.h> -#include <linux/uio.h> 
+#include <linux/iov_iter.h> #include <linux/indirect_call_wrapper.h> +#include <linux/crc32.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -62,6 +63,8 @@ #include <trace/events/skb.h> #include <net/busy_poll.h> +#include "devmem.h" + /* * Is a socket 'connection oriented' ? */ @@ -162,8 +165,7 @@ done: return skb; } -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) @@ -260,7 +262,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, * However, this function was correct in any case. 8) */ spin_lock_irqsave(&queue->lock, cpu_flags); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, + skb = __skb_try_recv_from_queue(queue, flags, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) @@ -323,25 +325,6 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(skb_free_datagram); -void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) -{ - bool slow; - - if (!skb_unref(skb)) { - sk_peek_offset_bwd(sk, len); - return; - } - - slow = lock_sock_fast(sk); - sk_peek_offset_bwd(sk, len); - skb_orphan(skb); - unlock_sock_fast(sk, slow); - - /* skb is now orphaned, can be freed outside of locked section */ - __kfree_skb(skb); -} -EXPORT_SYMBOL(__skb_free_datagram_locked); - int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, @@ -362,7 +345,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, spin_unlock_bh(&sk_queue->lock); } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return err; } EXPORT_SYMBOL(__sk_queue_drop_skb); @@ -425,6 +408,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, return 0; } + if (!skb_frags_readable(skb)) + goto short_copy; + /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; @@ -434,15 +420,23 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - struct page *page = skb_frag_page(frag); - u8 *vaddr = kmap(page); + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; if (copy > len) copy = len; - n = INDIRECT_CALL_1(cb, simple_copy_to_iter, - vaddr + skb_frag_off(frag) + offset - start, - copy, data, to); - kunmap(page); + + n = 0; + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_local_page(p); + n += INDIRECT_CALL_1(cb, simple_copy_to_iter, + vaddr + p_off, p_len, data, to); + kunmap_local(vaddr); + } + offset += n; if (n != copy) goto short_copy; @@ -489,23 +483,37 @@ short_copy: return 0; } +#ifdef CONFIG_NET_CRC32C +static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes, + void *_crcp, struct iov_iter *i) +{ + u32 *crcp = _crcp; + size_t copied; + + copied = copy_to_iter(addr, bytes, i); + *crcp = crc32c(*crcp, addr, copied); + return copied; +} + /** - * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator - * and update a hash. + * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator + * and update a CRC32C value. 
* @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec - * @hash: hash request to update + * @crcp: pointer to CRC32C value to update + * + * Return: 0 on success, -EFAULT if there was a fault during copy. */ -int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len, - struct ahash_request *hash) +int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, u32 *crcp) { return __skb_datagram_iter(skb, offset, to, len, true, - hash_and_copy_to_iter, hash); + crc32c_and_copy_to_iter, crcp); } -EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); +EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter); +#endif /* CONFIG_NET_CRC32C */ static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i) @@ -610,24 +618,34 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); -int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, - struct sk_buff *skb, struct iov_iter *from, - size_t length) +int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset, + struct iov_iter *from, int len) { - int frag; + struct iov_iter_state state; + int ret; - if (msg && msg->msg_ubuf && msg->sg_from_iter) - return msg->sg_from_iter(sk, skb, from, length); + iov_iter_save_state(from, &state); + ret = skb_copy_datagram_from_iter(skb, offset, from, len); + if (ret) + iov_iter_restore(from, &state); + return ret; +} +EXPORT_SYMBOL(skb_copy_datagram_from_iter_full); - frag = skb_shinfo(skb)->nr_frags; +int zerocopy_fill_skb_from_iter(struct sk_buff *skb, + struct iov_iter *from, size_t length) +{ + int frag = skb_shinfo(skb)->nr_frags; + + if (!skb_frags_readable(skb)) + return -EFAULT; while (length && iov_iter_count(from)) { + struct page *head, *last_head = NULL; struct page *pages[MAX_SKB_FRAGS]; - struct page *last_head = NULL; + int refs, order, n = 0; size_t start; ssize_t copied; - unsigned long truesize; - int refs, n = 0; if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; @@ -639,20 +657,20 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, length -= copied; - truesize = PAGE_ALIGN(copied + start); skb->data_len += copied; skb->len += copied; - skb->truesize += truesize; - if (sk && sk->sk_type == SOCK_STREAM) { - sk_wmem_queued_add(sk, truesize); - if (!skb_zcopy_pure(skb)) - sk_mem_charge(sk, truesize); - } else { - refcount_add(truesize, &skb->sk->sk_wmem_alloc); - } + skb->truesize += PAGE_ALIGN(copied + start); + + head = compound_head(pages[n]); + order = compound_order(head); + for (refs = 0; copied != 0; start = 0) { int size = min_t(int, copied, PAGE_SIZE - start); - struct page *head = compound_head(pages[n]); + + if (pages[n] - head > (1UL << order) - 1) { + head = compound_head(pages[n]); + order = compound_order(head); + } start += (pages[n] - head) << PAGE_SHIFT; copied -= size; @@ -684,6 +702,73 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, } return 0; } + +static int +zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, + int length, + struct net_devmem_dmabuf_binding *binding) +{ + int i = skb_shinfo(skb)->nr_frags; + size_t virt_addr, size, off; + struct net_iov *niov; + + /* Devmem filling works by taking an IOVEC from the user where the + * iov_addrs are interpreted as an offset in bytes into the dma-buf to + * send from. We do not support other iter types. 
+ */ + if (iov_iter_type(from) != ITER_IOVEC && + iov_iter_type(from) != ITER_UBUF) + return -EFAULT; + + while (length && iov_iter_count(from)) { + if (i == MAX_SKB_FRAGS) + return -EMSGSIZE; + + virt_addr = (size_t)iter_iov_addr(from); + niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size); + if (!niov) + return -EFAULT; + + size = min_t(size_t, size, length); + size = min_t(size_t, size, iter_iov_len(from)); + + get_netmem(net_iov_to_netmem(niov)); + skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off, + size, PAGE_SIZE); + iov_iter_advance(from, size); + length -= size; + i++; + } + + return 0; +} + +int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb, struct iov_iter *from, + size_t length, + struct net_devmem_dmabuf_binding *binding) +{ + unsigned long orig_size = skb->truesize; + unsigned long truesize; + int ret; + + if (msg && msg->msg_ubuf && msg->sg_from_iter) + ret = msg->sg_from_iter(skb, from, length); + else if (binding) + ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding); + else + ret = zerocopy_fill_skb_from_iter(skb, from, length); + + truesize = skb->truesize - orig_size; + if (sk && sk->sk_type == SOCK_STREAM) { + sk_wmem_queued_add(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); + } else { + refcount_add(truesize, &skb->sk->sk_wmem_alloc); + } + return ret; +} EXPORT_SYMBOL(__zerocopy_sg_from_iter); /** @@ -704,10 +789,64 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL); } EXPORT_SYMBOL(zerocopy_sg_from_iter); +static __always_inline +size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress, + size_t len, void *from, void *priv2) +{ + __wsum next, *csum = priv2; + + next = csum_and_copy_to_user(from + progress, iter_to, len); + *csum = csum_block_add(*csum, next, progress); + return next ? 0 : len; +} + +static __always_inline +size_t memcpy_to_iter_csum(void *iter_to, size_t progress, + size_t len, void *from, void *priv2) +{ + __wsum *csum = priv2; + __wsum next = csum_partial_copy_nocheck(from + progress, iter_to, len); + + *csum = csum_block_add(*csum, next, progress); + return 0; +} + +struct csum_state { + __wsum csum; + size_t off; +}; + +static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, + struct iov_iter *i) +{ + struct csum_state *csstate = _csstate; + __wsum sum; + + if (WARN_ON_ONCE(i->data_source)) + return 0; + if (unlikely(iov_iter_is_discard(i))) { + // can't use csum_memcpy() for that one - data is not copied + csstate->csum = csum_block_add(csstate->csum, + csum_partial(addr, bytes, 0), + csstate->off); + csstate->off += bytes; + return bytes; + } + + sum = csum_shift(csstate->csum, csstate->off); + + bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum, + copy_to_user_iter_csum, + memcpy_to_iter_csum); + csstate->csum = csum_shift(sum, csstate->off); + csstate->off += bytes; + return bytes; +} + /** * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator * and update a checksum. 
@@ -781,48 +920,54 @@ fault: EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** - * datagram_poll - generic datagram poll + * datagram_poll_queue - same as datagram_poll, but on a specific receive + * queue * @file: file struct * @sock: socket * @wait: poll table + * @rcv_queue: receive queue to poll * - * Datagram poll: Again totally generic. This also handles - * sequenced packet sockets providing the socket receive queue - * is only ever holding data ready to receive. + * Performs polling on the given receive queue, handling shutdown, error, + * and connection state. This is useful for protocols that deliver + * userspace-bound packets through a custom queue instead of + * sk->sk_receive_queue. * - * Note: when you *don't* use this routine for this protocol, - * and you use a different write policy from sock_writeable() - * then please supply your own write_space callback. + * Return: poll bitmask indicating the socket's current state */ -__poll_t datagram_poll(struct file *file, struct socket *sock, - poll_table *wait) +__poll_t datagram_poll_queue(struct file *file, struct socket *sock, + poll_table *wait, struct sk_buff_head *rcv_queue) { struct sock *sk = sock->sk; __poll_t mask; + u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ - if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) + if (READ_ONCE(sk->sk_err) || + !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); - if (sk->sk_shutdown & RCV_SHUTDOWN) + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(rcv_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (connection_based(sk)) { - if (sk->sk_state == TCP_CLOSE) + int state = READ_ONCE(sk->sk_state); + + if (state == TCP_CLOSE) mask |= EPOLLHUP; /* connection hasn't started yet? */ - if (sk->sk_state == TCP_SYN_SENT) + if (state == TCP_SYN_SENT) return mask; } @@ -834,4 +979,27 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, return mask; } +EXPORT_SYMBOL(datagram_poll_queue); + +/** + * datagram_poll - generic datagram poll + * @file: file struct + * @sock: socket + * @wait: poll table + * + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. + * + * Note: when you *don't* use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. 
+ * + * Return: poll bitmask indicating the socket's current state + */ +__poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + return datagram_poll_queue(file, sock, wait, + &sock->sk->sk_receive_queue); +} EXPORT_SYMBOL(datagram_poll); diff --git a/net/core/dev.c b/net/core/dev.c index b76fb37b381e..9094c0fb8c68 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -69,7 +69,7 @@ */ #include <linux/uaccess.h> -#include <linux/bitops.h> +#include <linux/bitmap.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/types.h> @@ -77,7 +77,9 @@ #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/isolation.h> #include <linux/sched/mm.h> +#include <linux/smpboot.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/string.h> @@ -90,6 +92,7 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> +#include <linux/ethtool_netlink.h> #include <linux/skbuff.h> #include <linux/kthread.h> #include <linux/bpf.h> @@ -103,10 +106,12 @@ #include <net/dst.h> #include <net/dst_metadata.h> #include <net/gro.h> +#include <net/netdev_queues.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> +#include <net/tcx.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> @@ -132,6 +137,7 @@ #include <trace/events/net.h> #include <trace/events/skb.h> #include <trace/events/qdisc.h> +#include <trace/events/xdp.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> @@ -150,44 +156,25 @@ #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> +#include <net/netdev_lock.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/types.h> +#include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> +#include <net/rps.h> +#include <linux/phy_link_topology.h> #include "dev.h" +#include "devmem.h" #include "net-sysfs.h" - static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; -struct list_head ptype_all __read_mostly; /* Taps */ static int netif_rx_internal(struct sk_buff *skb); -static int call_netdevice_notifiers_info(unsigned long val, - struct netdev_notifier_info *info); static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); -static struct napi_struct *napi_by_id(unsigned int napi_id); - -/* - * The @dev_base_head list is protected by @dev_base_lock and the rtnl - * semaphore. - * - * Pure readers hold dev_base_lock for reading, or rcu_read_lock() - * - * Writers must hold the rtnl semaphore while they loop through the - * dev_base_head list, and hold dev_base_lock for writing when they do the - * actual updates. This allows pure readers to access the list even - * while a writer is preparing to update it. - * - * To put it another way, dev_base_lock is held for writing only to - * protect against pure readers; the rtnl semaphore provides the - * protection against other writers. - * - * See, for example usages, register_netdevice() and - * unregister_netdevice(), which must be called with the rtnl - * semaphore held. 
- */ -DEFINE_RWLOCK(dev_base_lock); -EXPORT_SYMBOL(dev_base_lock); static DEFINE_MUTEX(ifalias_mutex); @@ -197,12 +184,11 @@ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); -static DECLARE_RWSEM(devnet_rename_sem); - static inline void dev_base_seq_inc(struct net *net) { - while (++net->dev_base_seq == 0) - ; + unsigned int val = net->dev_base_seq + 1; + + WRITE_ONCE(net->dev_base_seq, val ?: 1); } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) @@ -217,37 +203,62 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock_irqsave(struct softnet_data *sd, - unsigned long *flags) +#ifndef CONFIG_PREEMPT_RT + +static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); + +static int __init setup_backlog_napi_threads(char *arg) { - if (IS_ENABLED(CONFIG_RPS)) + static_branch_enable(&use_backlog_threads_key); + return 0; +} +early_param("thread_backlog_napi", setup_backlog_napi_threads); + +static bool use_backlog_threads(void) +{ + return static_branch_unlikely(&use_backlog_threads_key); +} + +#else + +static bool use_backlog_threads(void) +{ + return true; +} + +#endif + +static inline void backlog_lock_irq_save(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_save(*flags); } -static inline void rps_lock_irq_disable(struct softnet_data *sd) +static inline void backlog_lock_irq_disable(struct softnet_data *sd) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irq(&sd->input_pkt_queue.lock); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_disable(); } -static inline void rps_unlock_irq_restore(struct softnet_data *sd, - unsigned long *flags) +static inline void backlog_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_restore(*flags); } -static inline void rps_unlock_irq_enable(struct softnet_data *sd) +static inline void backlog_unlock_irq_enable(struct softnet_data *sd) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irq(&sd->input_pkt_queue.lock); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_enable(); } @@ -337,19 +348,27 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name) return -ENOMEM; netdev_name_node_add(net, name_node); /* The node that holds dev->name acts as a head of per-device list. 
*/ - list_add_tail(&name_node->list, &dev->name_node->list); + list_add_tail_rcu(&name_node->list, &dev->name_node->list); return 0; } -static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +static void netdev_name_node_alt_free(struct rcu_head *head) { - list_del(&name_node->list); - netdev_name_node_del(name_node); + struct netdev_name_node *name_node = + container_of(head, struct netdev_name_node, rcu); + kfree(name_node->name); netdev_name_node_free(name_node); } +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + netdev_name_node_del(name_node); + list_del(&name_node->list); + call_rcu(&name_node->rcu, netdev_name_node_alt_free); +} + int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; @@ -365,7 +384,6 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) return -EINVAL; __netdev_name_node_alt_destroy(name_node); - return 0; } @@ -373,23 +391,30 @@ static void netdev_name_node_alt_flush(struct net_device *dev) { struct netdev_name_node *name_node, *tmp; - list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) - __netdev_name_node_alt_destroy(name_node); + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) { + list_del(&name_node->list); + netdev_name_node_alt_free(&name_node->rcu); + } } /* Device list insertion */ static void list_netdevice(struct net_device *dev) { + struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); - write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock(&dev_base_lock); + + netdev_for_each_altname(dev, name_node) + netdev_name_node_add(net, name_node); + + /* We reserved the ifindex, this can't fail */ + WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } @@ -397,18 +422,22 @@ static void list_netdevice(struct net_device *dev) /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ -static void unlist_netdevice(struct net_device *dev, bool lock) +static void unlist_netdevice(struct net_device *dev) { + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + ASSERT_RTNL(); + xa_erase(&net->dev_by_index, dev->ifindex); + + netdev_for_each_altname(dev, name_node) + netdev_name_node_del(name_node); + /* Unlink dev from the device chain */ - if (lock) - write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - if (lock) - write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -424,9 +453,19 @@ static RAW_NOTIFIER_HEAD(netdev_chain); * queue in the local softnet handler. */ -DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = { + .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock), +}; EXPORT_PER_CPU_SYMBOL(softnet_data); +/* Page_pool has a lockless array/stack to alloc/recycle pages. + * PP consumers must pay attention to run APIs in the appropriate context + * (e.g. NAPI context). 
+ */ +DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; + #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class @@ -535,10 +574,18 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { - if (pt->type == htons(ETH_P_ALL)) - return pt->dev ? &pt->dev->ptype_all : &ptype_all; - else - return pt->dev ? &pt->dev->ptype_specific : + if (pt->type == htons(ETH_P_ALL)) { + if (!pt->af_packet_net && !pt->dev) + return NULL; + + return pt->dev ? &pt->dev->ptype_all : + &pt->af_packet_net->ptype_all; + } + + if (pt->dev) + return &pt->dev->ptype_specific; + + return pt->af_packet_net ? &pt->af_packet_net->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } @@ -559,6 +606,9 @@ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); + if (WARN_ON_ONCE(!head)) + return; + spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); @@ -583,6 +633,9 @@ void __dev_remove_pack(struct packet_type *pt) struct list_head *head = ptype_head(pt); struct packet_type *pt1; + if (!head) + return; + spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { @@ -638,7 +691,7 @@ int dev_get_iflink(const struct net_device *dev) if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev); - return dev->ifindex; + return READ_ONCE(dev->ifindex); } EXPORT_SYMBOL(dev_get_iflink); @@ -718,14 +771,88 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, } EXPORT_SYMBOL_GPL(dev_fill_forward_path); +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} + +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct * +netdev_napi_by_id(struct net *net, unsigned int napi_id) +{ + struct napi_struct *napi; + + napi = napi_by_id(napi_id); + if (!napi) + return NULL; + + if (WARN_ON_ONCE(!napi->dev)) + return NULL; + if (!net_eq(net, dev_net(napi->dev))) + return NULL; + + return napi; +} + +/** + * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it + * @net: the applicable net namespace + * @napi_id: ID of a NAPI of a target device + * + * Find a NAPI instance with @napi_id. Lock its device. + * The device must be in %NETREG_REGISTERED state for lookup to succeed. + * netdev_unlock() must be called to release it. + * + * Return: pointer to NAPI, its device with lock held, NULL if not found. 
+ */ +struct napi_struct * +netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) +{ + struct napi_struct *napi; + struct net_device *dev; + + rcu_read_lock(); + napi = netdev_napi_by_id(net, napi_id); + if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) { + rcu_read_unlock(); + return NULL; + } + + dev = napi->dev; + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock(dev, net); + if (!dev) + return NULL; + + rcu_read_lock(); + napi = netdev_napi_by_id(net, napi_id); + if (napi && napi->dev != dev) + napi = NULL; + rcu_read_unlock(); + + if (!napi) + netdev_unlock(dev); + return napi; +} + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find * - * Find an interface by name. Must be called under RTNL semaphore - * or @dev_base_lock. If the name is found a pointer to the device - * is returned. If the name is not found then %NULL is returned. The + * Find an interface by name. Must be called under RTNL semaphore. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ @@ -760,29 +887,43 @@ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) } EXPORT_SYMBOL(dev_get_by_name_rcu); +/* Deprecated for new users, call netdev_get_by_name() instead */ +struct net_device *dev_get_by_name(struct net *net, const char *name) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_name); + /** - * dev_get_by_name - find a device by its name + * netdev_get_by_name() - find a device by its name * @net: the applicable net namespace * @name: name to find + * @tracker: tracking object for the acquired reference + * @gfp: allocation flags for the tracker * * Find an interface by name. This can be called from any * context and does its own locking. The returned handle has - * the usage count incremented and the caller must use dev_put() to + * the usage count incremented and the caller must use netdev_put() to * release it when it is no longer needed. %NULL is returned if no * matching device is found. */ - -struct net_device *dev_get_by_name(struct net *net, const char *name) +struct net_device *netdev_get_by_name(struct net *net, const char *name, + netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; - rcu_read_lock(); - dev = dev_get_by_name_rcu(net, name); - dev_hold(dev); - rcu_read_unlock(); + dev = dev_get_by_name(net, name); + if (dev) + netdev_tracker_alloc(dev, tracker, gfp); return dev; } -EXPORT_SYMBOL(dev_get_by_name); +EXPORT_SYMBOL(netdev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex @@ -792,8 +933,7 @@ EXPORT_SYMBOL(dev_get_by_name); * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful - * about locking. The caller must hold either the RTNL semaphore - * or @dev_base_lock. + * about locking. The caller must hold the RTNL semaphore. 
*/ struct net_device *__dev_get_by_index(struct net *net, int ifindex) @@ -833,29 +973,42 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) } EXPORT_SYMBOL(dev_get_by_index_rcu); +/* Deprecated for new users, call netdev_get_by_index() instead */ +struct net_device *dev_get_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_index); /** - * dev_get_by_index - find a device by its ifindex + * netdev_get_by_index() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device + * @tracker: tracking object for the acquired reference + * @gfp: allocation flags for the tracker * * Search for an interface by index. Returns NULL if the device * is not found or a pointer to the device. The device returned has * had a reference added and the pointer is safe until the user calls - * dev_put to indicate they have finished with it. + * netdev_put() to indicate they have finished with it. */ - -struct net_device *dev_get_by_index(struct net *net, int ifindex) +struct net_device *netdev_get_by_index(struct net *net, int ifindex, + netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, ifindex); - dev_hold(dev); - rcu_read_unlock(); + dev = dev_get_by_index(net, ifindex); + if (dev) + netdev_tracker_alloc(dev, tracker, gfp); return dev; } -EXPORT_SYMBOL(dev_get_by_index); +EXPORT_SYMBOL(netdev_get_by_index); /** * dev_get_by_napi_id - find a device by napi_id @@ -866,21 +1019,151 @@ EXPORT_SYMBOL(dev_get_by_index); * its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ - struct net_device *dev_get_by_napi_id(unsigned int napi_id) { struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); - if (napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi_id)) return NULL; napi = napi_by_id(napi_id); return napi ? napi->dev : NULL; } -EXPORT_SYMBOL(dev_get_by_napi_id); + +/* Release the held reference on the net_device, and if the net_device + * is still registered try to lock the instance lock. If device is being + * unregistered NULL will be returned (but the reference has been released, + * either way!) + * + * This helper is intended for locking net_device after it has been looked up + * using a lockless lookup helper. Lock prevents the instance from going away. + */ +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net) +{ + netdev_lock(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + +static struct net_device * +__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net) +{ + netdev_lock_ops_compat(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock_ops_compat(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + +/** + * netdev_get_by_index_lock() - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. If a valid device + * with @ifindex is found it will be returned with netdev->lock held. + * netdev_unlock() must be called to release it. + * + * Return: pointer to a device with lock held, NULL if not found. 
+ */ +struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock(dev, net); +} + +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock_ops_compat(dev, net); +} + +struct net_device * +netdev_xa_find_lock(struct net *net, struct net_device *dev, + unsigned long *index) +{ + if (dev) + netdev_unlock(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); +} + +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index) +{ + if (dev) + netdev_unlock_ops_compat(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock_ops_compat(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); +} + +static DEFINE_SEQLOCK(netdev_rename_lock); + +void netdev_copy_name(struct net_device *dev, char *name) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&netdev_rename_lock); + strscpy(name, dev->name, IFNAMSIZ); + } while (read_seqretry(&netdev_rename_lock, seq)); +} +EXPORT_IPV6_MOD_GPL(netdev_copy_name); /** * netdev_get_name - get a netdevice name, knowing its ifindex. @@ -893,7 +1176,6 @@ int netdev_get_name(struct net *net, char *name, int ifindex) struct net_device *dev; int ret; - down_read(&devnet_rename_sem); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); @@ -902,15 +1184,20 @@ int netdev_get_name(struct net *net, char *name, int ifindex) goto out; } - strcpy(name, dev->name); + netdev_copy_name(dev, name); ret = 0; out: rcu_read_unlock(); - up_read(&devnet_rename_sem); return ret; } +static bool dev_addr_cmp(struct net_device *dev, unsigned short type, + const char *ha) +{ + return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); +} + /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace @@ -919,7 +1206,7 @@ out: * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. - * The caller must hold RCU or RTNL. + * The caller must hold RCU. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * @@ -931,14 +1218,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, struct net_device *dev; for_each_netdev_rcu(net, dev) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); +/** + * dev_getbyhwaddr() - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold + * rtnl_lock. + * + * Context: rtnl_lock() must be held. 
+ * Return: pointer to the net_device, or NULL if not found + */ +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *ha) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev_addr_cmp(dev, type, ha)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr); + struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; @@ -956,33 +1268,32 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * __dev_get_by_flags - find any device with given flags - * @net: the applicable net namespace - * @if_flags: IFF_* values - * @mask: bitmask of bits in if_flags to check + * netdev_get_by_flags_rcu - find any device with given flags + * @net: the applicable net namespace + * @tracker: tracking object for the acquired reference + * @if_flags: IFF_* values + * @mask: bitmask of bits in if_flags to check + * + * Search for any interface with the given flags. * - * Search for any interface with the given flags. Returns NULL if a device - * is not found or a pointer to the device. Must be called inside - * rtnl_lock(), and result refcount is unchanged. + * Context: rcu_read_lock() must be held. + * Returns: NULL if a device is not found or a pointer to the device. */ - -struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, - unsigned short mask) +struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, + unsigned short if_flags, unsigned short mask) { - struct net_device *dev, *ret; - - ASSERT_RTNL(); + struct net_device *dev; - ret = NULL; - for_each_netdev(net, dev) { - if (((dev->flags ^ if_flags) & mask) == 0) { - ret = dev; - break; + for_each_netdev_rcu(net, dev) { + if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) { + netdev_hold(dev, tracker, GFP_ATOMIC); + return dev; } } - return ret; + + return NULL; } -EXPORT_SYMBOL(__dev_get_by_flags); +EXPORT_IPV6_MOD(netdev_get_by_flags_rcu); /** * dev_valid_name - check if name is okay for network device @@ -1014,7 +1325,7 @@ EXPORT_SYMBOL(dev_valid_name); * __dev_alloc_name - allocate a name for a device * @net: network namespace to allocate the device name in * @name: name format string - * @buf: scratch buffer and result name string + * @res: result name string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses @@ -1025,83 +1336,79 @@ EXPORT_SYMBOL(dev_valid_name); * Returns the number of the unit assigned or a negative errno code. */ -static int __dev_alloc_name(struct net *net, const char *name, char *buf) +static int __dev_alloc_name(struct net *net, const char *name, char *res) { int i = 0; const char *p; const int max_netdevices = 8*PAGE_SIZE; unsigned long *inuse; struct net_device *d; + char buf[IFNAMSIZ]; - if (!dev_valid_name(name)) - return -EINVAL; - + /* Verify the string as this thing may have come from the user. + * There must be one "%d" and no other "%" characters. + */ p = strchr(name, '%'); - if (p) { - /* - * Verify the string as this thing may have come from - * the user. There must be either one "%d" and no other "%" - * characters. 
- */ - if (p[1] != 'd' || strchr(p + 2, '%')) - return -EINVAL; + if (!p || p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; - /* Use one page as a bit array of possible slots */ - inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); - if (!inuse) - return -ENOMEM; + /* Use one page as a bit array of possible slots */ + inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC); + if (!inuse) + return -ENOMEM; - for_each_netdev(net, d) { - struct netdev_name_node *name_node; - list_for_each_entry(name_node, &d->name_node->list, list) { - if (!sscanf(name_node->name, name, &i)) - continue; - if (i < 0 || i >= max_netdevices) - continue; + for_each_netdev(net, d) { + struct netdev_name_node *name_node; - /* avoid cases where sscanf is not exact inverse of printf */ - snprintf(buf, IFNAMSIZ, name, i); - if (!strncmp(buf, name_node->name, IFNAMSIZ)) - __set_bit(i, inuse); - } - if (!sscanf(d->name, name, &i)) + netdev_for_each_altname(d, name_node) { + if (!sscanf(name_node->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; - /* avoid cases where sscanf is not exact inverse of printf */ + /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); - if (!strncmp(buf, d->name, IFNAMSIZ)) + if (!strncmp(buf, name_node->name, IFNAMSIZ)) __set_bit(i, inuse); } + if (!sscanf(d->name, name, &i)) + continue; + if (i < 0 || i >= max_netdevices) + continue; - i = find_first_zero_bit(inuse, max_netdevices); - free_page((unsigned long) inuse); + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, IFNAMSIZ, name, i); + if (!strncmp(buf, d->name, IFNAMSIZ)) + __set_bit(i, inuse); } - snprintf(buf, IFNAMSIZ, name, i); - if (!netdev_name_in_use(net, buf)) - return i; + i = find_first_zero_bit(inuse, max_netdevices); + bitmap_free(inuse); + if (i == max_netdevices) + return -ENFILE; - /* It is possible to run out of possible slots - * when the name is long and there isn't enough space left - * for the digits, or if all bits are used. 
- */ - return -ENFILE; + /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ + strscpy(buf, name, IFNAMSIZ); + snprintf(res, IFNAMSIZ, buf, i); + return i; } -static int dev_alloc_name_ns(struct net *net, - struct net_device *dev, - const char *name) +/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */ +static int dev_prep_valid_name(struct net *net, struct net_device *dev, + const char *want_name, char *out_name, + int dup_errno) { - char buf[IFNAMSIZ]; - int ret; + if (!dev_valid_name(want_name)) + return -EINVAL; - BUG_ON(!net); - ret = __dev_alloc_name(net, name, buf); - if (ret >= 0) - strscpy(dev->name, buf, IFNAMSIZ); - return ret; + if (strchr(want_name, '%')) + return __dev_alloc_name(net, want_name, out_name); + + if (netdev_name_in_use(net, want_name)) + return -dup_errno; + if (out_name != want_name) + strscpy(out_name, want_name, IFNAMSIZ); + return 0; } /** @@ -1120,93 +1427,65 @@ static int dev_alloc_name_ns(struct net *net, int dev_alloc_name(struct net_device *dev, const char *name) { - return dev_alloc_name_ns(dev_net(dev), dev, name); + return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE); } EXPORT_SYMBOL(dev_alloc_name); static int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { - BUG_ON(!net); - - if (!dev_valid_name(name)) - return -EINVAL; - - if (strchr(name, '%')) - return dev_alloc_name_ns(net, dev, name); - else if (netdev_name_in_use(net, name)) - return -EEXIST; - else if (dev->name != name) - strscpy(dev->name, name, IFNAMSIZ); + int ret; - return 0; + ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST); + return ret < 0 ? ret : 0; } -/** - * dev_change_name - change name of a device - * @dev: device - * @newname: name (or format string) must be at least IFNAMSIZ - * - * Change name of a device, can pass format strings "eth%d". - * for wildcarding. - */ -int dev_change_name(struct net_device *dev, const char *newname) +int netif_change_name(struct net_device *dev, const char *newname) { + struct net *net = dev_net(dev); unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; - struct net *net; - - ASSERT_RTNL(); - BUG_ON(!dev_net(dev)); - net = dev_net(dev); + ASSERT_RTNL_NET(net); - down_write(&devnet_rename_sem); - - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { - up_write(&devnet_rename_sem); + if (!strncmp(newname, dev->name, IFNAMSIZ)) return 0; - } memcpy(oldname, dev->name, IFNAMSIZ); + write_seqlock_bh(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); - if (err < 0) { - up_write(&devnet_rename_sem); + write_sequnlock_bh(&netdev_rename_lock); + + if (err < 0) return err; - } if (oldname[0] && !strchr(oldname, '%')) netdev_info(dev, "renamed from %s%s\n", oldname, dev->flags & IFF_UP ? 
" (while UP)" : ""); old_assign_type = dev->name_assign_type; - dev->name_assign_type = NET_NAME_RENAMED; + WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { + write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); - dev->name_assign_type = old_assign_type; - up_write(&devnet_rename_sem); + write_sequnlock_bh(&netdev_rename_lock); + WRITE_ONCE(dev->name_assign_type, old_assign_type); return ret; } - up_write(&devnet_rename_sem); - netdev_adjacent_rename_links(dev, oldname); - write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); - write_unlock(&dev_base_lock); - synchronize_rcu(); + synchronize_net(); - write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); - write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); @@ -1215,10 +1494,11 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; - down_write(&devnet_rename_sem); + write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); + write_sequnlock_bh(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); - dev->name_assign_type = old_assign_type; + WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; goto rollback; } else { @@ -1230,15 +1510,7 @@ rollback: return err; } -/** - * dev_set_alias - change ifalias of a device - * @dev: device - * @alias: name up to IFALIASZ - * @len: limit of bytes to copy from info - * - * Set ifalias for a device, - */ -int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +int netif_set_alias(struct net_device *dev, const char *alias, size_t len) { struct dev_ifalias *new_alias = NULL; @@ -1264,7 +1536,6 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) return len; } -EXPORT_SYMBOL(dev_set_alias); /** * dev_get_alias - get ifalias of a device @@ -1301,16 +1572,10 @@ void netdev_features_change(struct net_device *dev) } EXPORT_SYMBOL(netdev_features_change); -/** - * netdev_state_change - device changes state - * @dev: device to cause notification - * - * Called to indicate a device has changed state. This function calls - * the notifier chains for netdev_chain and sends a NEWLINK message - * to the routing socket. 
- */ -void netdev_state_change(struct net_device *dev) +void netif_state_change(struct net_device *dev) { + netdev_ops_assert_locked_or_invisible(dev); + if (dev->flags & IFF_UP) { struct netdev_notifier_change_info change_info = { .info.dev = dev, @@ -1321,7 +1586,6 @@ void netdev_state_change(struct net_device *dev) rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); } } -EXPORT_SYMBOL(netdev_state_change); /** * __netdev_notify_peers - notify network peers about existence of @dev, @@ -1410,6 +1674,8 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) set_bit(__LINK_STATE_START, &dev->state); + netdev_ops_assert_locked(dev); + if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); @@ -1421,7 +1687,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { - dev->flags |= IFF_UP; + netif_set_up(dev, true); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1430,20 +1696,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) return ret; } -/** - * dev_open - prepare an interface for use. - * @dev: device to open - * @extack: netlink extended ack - * - * Takes a device from down to up state. The device's private open - * function is invoked and then the multicast lists are loaded. Finally - * the device is moved into the up state and a %NETDEV_UP message is - * sent to the netdev notifier chain. - * - * Calling this function on an active interface is a nop. On a failure - * a negative errno code is returned. - */ -int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) +int netif_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; @@ -1459,7 +1712,6 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) return ret; } -EXPORT_SYMBOL(dev_open); static void __dev_close_many(struct list_head *head) { @@ -1497,10 +1749,13 @@ static void __dev_close_many(struct list_head *head) * We allow it to be called even after a DETACH hot-plug * event. */ + + netdev_ops_assert_locked(dev); + if (ops->ndo_stop) ops->ndo_stop(dev); - dev->flags &= ~IFF_UP; + netif_set_up(dev, false); netpoll_poll_enable(dev); } } @@ -1514,7 +1769,7 @@ static void __dev_close(struct net_device *dev) list_del(&single); } -void dev_close_many(struct list_head *head, bool unlink) +void netif_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; @@ -1532,39 +1787,21 @@ void dev_close_many(struct list_head *head, bool unlink) list_del_init(&dev->close_list); } } -EXPORT_SYMBOL(dev_close_many); +EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL"); -/** - * dev_close - shutdown an interface. - * @dev: device to shutdown - * - * This function moves an active device into down state. A - * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device - * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier - * chain. - */ -void dev_close(struct net_device *dev) +void netif_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); list_add(&dev->close_list, &single); - dev_close_many(&single, true); + netif_close_many(&single, true); list_del(&single); } } -EXPORT_SYMBOL(dev_close); - +EXPORT_SYMBOL(netif_close); -/** - * dev_disable_lro - disable Large Receive Offload on a device - * @dev: device - * - * Disable Large Receive Offload (LRO) on a net device. Must be - * called under RTNL. 
This is needed if received packets may be - * forwarded to another interface. - */ -void dev_disable_lro(struct net_device *dev) +void netif_disable_lro(struct net_device *dev) { struct net_device *lower_dev; struct list_head *iter; @@ -1575,10 +1812,13 @@ void dev_disable_lro(struct net_device *dev) if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); - netdev_for_each_lower_dev(dev, lower_dev, iter) - dev_disable_lro(lower_dev); + netdev_for_each_lower_dev(dev, lower_dev, iter) { + netdev_lock_ops(lower_dev); + netif_disable_lro(lower_dev); + netdev_unlock_ops(lower_dev); + } } -EXPORT_SYMBOL(dev_disable_lro); +EXPORT_IPV6_MOD(netif_disable_lro); /** * dev_disable_gro_hw - disable HW Generic Receive Offload on a device @@ -1614,6 +1854,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) + N(XDP_FEAT_CHANGE) } #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -1665,7 +1906,9 @@ static int call_netdevice_register_net_notifiers(struct notifier_block *nb, int err; for_each_netdev(net, dev) { + netdev_lock_ops(dev); err = call_netdevice_register_notifiers(nb, dev); + netdev_unlock_ops(dev); if (err) goto rollback; } @@ -1709,14 +1952,19 @@ int register_netdevice_notifier(struct notifier_block *nb) /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); + + /* When RTNL is removed, we need protection for netdev_chain. */ rtnl_lock(); + err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { + __rtnl_net_lock(net); err = call_netdevice_register_net_notifiers(nb, net); + __rtnl_net_unlock(net); if (err) goto rollback; } @@ -1727,8 +1975,11 @@ unlock: return err; rollback: - for_each_net_continue_reverse(net) + for_each_net_continue_reverse(net) { + __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); + __rtnl_net_unlock(net); + } raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; @@ -1761,8 +2012,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb) if (err) goto unlock; - for_each_net(net) + for_each_net(net) { + __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); + __rtnl_net_unlock(net); + } unlock: rtnl_unlock(); @@ -1826,9 +2080,10 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; - rtnl_lock(); + rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); - rtnl_unlock(); + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(register_netdevice_notifier_net); @@ -1840,7 +2095,7 @@ EXPORT_SYMBOL(register_netdevice_notifier_net); * @nb: notifier * * Unregister a notifier previously registered by - * register_netdevice_notifier(). The notifier is unlinked into the + * register_netdevice_notifier_net(). The notifier is unlinked from the * kernel structures and may then be reused. A negative errno code * is returned on a failure. 
* @@ -1854,9 +2109,10 @@ int unregister_netdevice_notifier_net(struct net *net, { int err; - rtnl_lock(); + rtnl_net_lock(net); err = __unregister_netdevice_notifier_net(net, nb); - rtnl_unlock(); + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_net); @@ -1869,12 +2125,40 @@ static void __move_netdevice_notifier_net(struct net *src_net, __register_netdevice_notifier_net(dst_net, nb, true); } -void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, - struct notifier_block *nb) +static void rtnl_net_dev_lock(struct net_device *dev) { - rtnl_lock(); - __move_netdevice_notifier_net(src_net, dst_net, nb); - rtnl_unlock(); + bool again; + + do { + struct net *net; + + again = false; + + /* netns might be being dismantled. */ + rcu_read_lock(); + net = dev_net_rcu(dev); + net_passive_inc(net); + rcu_read_unlock(); + + rtnl_net_lock(net); + +#ifdef CONFIG_NET_NS + /* dev might have been moved to another netns. */ + if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) { + rtnl_net_unlock(net); + net_passive_dec(net); + again = true; + } +#endif + } while (again); +} + +static void rtnl_net_dev_unlock(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + rtnl_net_unlock(net); + net_passive_dec(net); } int register_netdevice_notifier_dev_net(struct net_device *dev, @@ -1883,13 +2167,14 @@ int register_netdevice_notifier_dev_net(struct net_device *dev, { int err; - rtnl_lock(); + rtnl_net_dev_lock(dev); err = __register_netdevice_notifier_net(dev_net(dev), nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } - rtnl_unlock(); + rtnl_net_dev_unlock(dev); + return err; } EXPORT_SYMBOL(register_netdevice_notifier_dev_net); @@ -1900,10 +2185,11 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev, { int err; - rtnl_lock(); + rtnl_net_dev_lock(dev); list_del(&nn->list); err = __unregister_netdevice_notifier_net(dev_net(dev), nb); - rtnl_unlock(); + rtnl_net_dev_unlock(dev); + return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); @@ -1926,8 +2212,8 @@ static void move_netdevice_notifiers_dev_net(struct net_device *dev, * are as for raw_notifier_call_chain(). 
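For context, a consumer of these per-netns chains looks roughly like the sketch below (illustrative only, not part of the patch; the handler name is made up):

	static int my_netdev_event(struct notifier_block *nb,
				   unsigned long event, void *ptr)
	{
		struct net_device *dev = netdev_notifier_info_to_dev(ptr);

		switch (event) {
		case NETDEV_UP:
			netdev_info(dev, "up\n");
			break;
		case NETDEV_GOING_DOWN:
			netdev_info(dev, "going down\n");
			break;
		}
		return NOTIFY_DONE;
	}

	static struct notifier_block my_nb = { .notifier_call = my_netdev_event };

	/* With this patch, registration serializes under rtnl_net_lock(net)
	 * instead of the global RTNL:
	 *
	 *	err = register_netdevice_notifier_net(&init_net, &my_nb);
	 */
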
*/ -static int call_netdevice_notifiers_info(unsigned long val, - struct netdev_notifier_info *info) +int call_netdevice_notifiers_info(unsigned long val, + struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); int ret; @@ -2051,6 +2337,11 @@ void net_dec_egress_queue(void) EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif +#ifdef CONFIG_NET_CLS_ACT +DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); +EXPORT_SYMBOL(tcf_sw_enabled_key); +#endif + DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL @@ -2107,7 +2398,7 @@ EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; - skb->mono_delivery_time = 0; + skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); } @@ -2172,9 +2463,9 @@ int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); } -static inline int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev, - struct net_device *orig_dev) +static int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; @@ -2193,7 +2484,7 @@ static inline void deliver_ptype_list_skb(struct sk_buff *skb, list_for_each_entry_rcu(ptype, ptype_list, list) { if (ptype->type != type) continue; - if (pt_prev) + if (unlikely(pt_prev)) deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } @@ -2214,15 +2505,21 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) } /** - * dev_nit_active - return true if any network interface taps are in use + * dev_nit_active_rcu - return true if any network interface taps are in use + * + * The caller must hold the RCU lock * * @dev: network device to check for the presence of taps */ -bool dev_nit_active(struct net_device *dev) +bool dev_nit_active_rcu(const struct net_device *dev) { - return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); + /* Callers may hold either RCU or RCU BH lock */ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + return !list_empty(&dev_net(dev)->ptype_all) || + !list_empty(&dev->ptype_all); } -EXPORT_SYMBOL_GPL(dev_nit_active); +EXPORT_SYMBOL_GPL(dev_nit_active_rcu); /* * Support routine. 
Sends outgoing frames to any network @@ -2231,15 +2528,15 @@ EXPORT_SYMBOL_GPL(dev_nit_active); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { - struct packet_type *ptype; + struct packet_type *ptype, *pt_prev = NULL; + struct list_head *ptype_list; struct sk_buff *skb2 = NULL; - struct packet_type *pt_prev = NULL; - struct list_head *ptype_list = &ptype_all; rcu_read_lock(); + ptype_list = &dev_net_rcu(dev)->ptype_all; again: list_for_each_entry_rcu(ptype, ptype_list, list) { - if (ptype->ignore_outgoing) + if (READ_ONCE(ptype->ignore_outgoing)) continue; /* Never send packets back to the socket @@ -2248,7 +2545,7 @@ again: if (skb_loop_sk(ptype, skb)) continue; - if (pt_prev) { + if (unlikely(pt_prev)) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; continue; @@ -2280,7 +2577,7 @@ again: pt_prev = ptype; } - if (ptype_list == &ptype_all) { + if (ptype_list != &dev->ptype_all) { ptype_list = &dev->ptype_all; goto again; } @@ -2366,8 +2663,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_map *map = NULL; int pos; - if (dev_maps) - map = xmap_dereference(dev_maps->attr_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; @@ -2542,6 +2838,8 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, struct xps_map *map, *new_map; unsigned int nr_ids; + WARN_ON_ONCE(index >= dev->num_tx_queues); + if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; @@ -2881,7 +3179,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { - ASSERT_RTNL(); + netdev_ops_assert_locked(dev); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); @@ -2891,6 +3189,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->num_tc) netif_setup_tc(dev, txq); + net_shaper_set_real_num_tx_queues(dev, txq); + dev_qdisc_change_real_num_tx(dev, txq); dev->real_num_tx_queues = txq; @@ -2910,7 +3210,6 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) } EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device @@ -2929,7 +3228,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { - ASSERT_RTNL(); + netdev_ops_assert_locked(dev); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); @@ -2941,7 +3240,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); -#endif /** * netif_set_real_num_queues - set actual number of RX and TX queues used @@ -3001,6 +3299,8 @@ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); + if (size < READ_ONCE(dev->gso_ipv4_max_size)) + netif_set_gso_ipv4_max_size(dev, size); } EXPORT_SYMBOL(netif_set_tso_max_size); @@ -3074,13 +3374,20 @@ static void __netif_reschedule(struct Qdisc *q) void __netif_schedule(struct Qdisc *q) { + /* If q->defer_list is not empty, at least one thread is + * in __dev_xmit_skb() before llist_del_all(&q->defer_list). + * This thread will attempt to run the queue. 
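In isolation, the producer/consumer hand-off that makes this early return safe looks as follows (condensed, annotated sketch of the code added below, not a faithful copy):

	/* producer side, any CPU in __dev_xmit_skb() */
	first = READ_ONCE(q->defer_list.first);
	do {
		skb->ll_node.next = first;
	} while (!try_cmpxchg(&q->defer_list.first, &first, &skb->ll_node));
	if (first)
		return NET_XMIT_SUCCESS;	/* first enqueuer flushes for us */

	/* consumer side, only the CPU that pushed the first entry */
	spin_lock(qdisc_lock(q));
	ll_list = llist_del_all(&q->defer_list);
	ll_list = llist_reverse_order(ll_list);	/* llist is LIFO; restore FIFO */

If defer_list is non-empty in __netif_schedule(), some producer pushed the first entry and is bound to take the qdisc lock and run the queue, so scheduling the qdisc again would be redundant.
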
+ */ + if (!llist_empty(&q->defer_list)) + return; + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } EXPORT_SYMBOL(__netif_schedule); struct dev_kfree_skb_cb { - enum skb_free_reason reason; + enum skb_drop_reason reason; }; static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) @@ -3113,7 +3420,7 @@ void netif_tx_wake_queue(struct netdev_queue *dev_queue) } EXPORT_SYMBOL(netif_tx_wake_queue); -void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) +void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason) { unsigned long flags; @@ -3133,16 +3440,16 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } -EXPORT_SYMBOL(__dev_kfree_skb_irq); +EXPORT_SYMBOL(dev_kfree_skb_irq_reason); -void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) +void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason) { if (in_hardirq() || irqs_disabled()) - __dev_kfree_skb_irq(skb, reason); + dev_kfree_skb_irq_reason(skb, reason); else - dev_kfree_skb(skb); + kfree_skb_reason(skb, reason); } -EXPORT_SYMBOL(__dev_kfree_skb_any); +EXPORT_SYMBOL(dev_kfree_skb_any_reason); /** @@ -3171,7 +3478,7 @@ void netif_device_attach(struct net_device *dev) if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); - __netdev_watchdog_up(dev); + netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); @@ -3202,6 +3509,7 @@ static u16 skb_tx_hash(const struct net_device *dev, } if (skb_rx_queue_recorded(skb)) { + DEBUG_NET_WARN_ON_ONCE(qcount == 0); hash = skb_get_rx_queue(skb); if (hash >= qoffset) hash -= qoffset; @@ -3213,7 +3521,7 @@ static u16 skb_tx_hash(const struct net_device *dev, return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } -static void skb_warn_bad_offload(const struct sk_buff *skb) +void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features; struct net_device *dev = skb->dev; @@ -3251,6 +3559,10 @@ int skb_checksum_help(struct sk_buff *skb) return -EINVAL; } + if (!skb_frags_readable(skb)) { + return -EFAULT; + } + /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. 
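As an aside on skb_tx_hash() above: it maps a 32-bit flow hash into [qoffset, qoffset + qcount) without a division, since reciprocal_scale() is just a 64-bit multiply and shift. A self-contained illustration (userspace sketch, assumed equivalent to the kernel helper):

	#include <stdint.h>
	#include <stdio.h>

	/* same arithmetic as the kernel's reciprocal_scale(): maps val
	 * uniformly into [0, ep_ro) with one multiply, no modulo
	 */
	static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
	{
		return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
	}

	int main(void)
	{
		uint32_t hash = 0xdeadbeef, qcount = 8, qoffset = 16;

		/* 0xdeadbeef/2^32 ~ 0.87, so this prints "txq = 22" */
		printf("txq = %u\n", reciprocal_scale(hash, qcount) + qoffset);
		return 0;
	}
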
*/ @@ -3262,15 +3574,19 @@ int skb_checksum_help(struct sk_buff *skb) offset = skb_checksum_start_offset(skb); ret = -EINVAL; - if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + if (unlikely(offset >= skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n", + offset, skb_headlen(skb)); goto out; } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; - if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) { + if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n", + offset + sizeof(__sum16), skb_headlen(skb)); goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); @@ -3285,9 +3601,10 @@ out: } EXPORT_SYMBOL(skb_checksum_help); +#ifdef CONFIG_NET_CRC32C int skb_crc32c_csum_help(struct sk_buff *skb) { - __le32 crc32c_csum; + u32 crc; int ret = 0, offset, start; if (skb->ip_summed != CHECKSUM_PARTIAL) @@ -3315,15 +3632,14 @@ int skb_crc32c_csum_help(struct sk_buff *skb) if (ret) goto out; - crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, - skb->len - start, ~(__u32)0, - crc32c_csum_stub)); - *(__le32 *)(skb->data + offset) = crc32c_csum; - skb->ip_summed = CHECKSUM_NONE; - skb->csum_not_inet = 0; + crc = ~skb_crc32c(skb, start, skb->len - start, ~0); + *(__le32 *)(skb->data + offset) = cpu_to_le32(crc); + skb_reset_csum_not_inet(skb); out: return ret; } +EXPORT_SYMBOL(skb_crc32c_csum_help); +#endif /* CONFIG_NET_CRC32C */ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { @@ -3340,77 +3656,9 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) type = eth->h_proto; } - return __vlan_get_protocol(skb, type, depth); + return vlan_get_protocol_and_depth(skb, type, depth); } -/* openvswitch calls this on rx path, so we need a different check. - */ -static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) -{ - if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_UNNECESSARY; - - return skb->ip_summed == CHECKSUM_NONE; -} - -/** - * __skb_gso_segment - Perform segmentation on skb. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * @tx_path: whether it is called in TX path - * - * This function segments the given skb and returns a list of segments. - * - * It may return NULL if the skb requires no segmentation. This is - * only possible when GSO is used for verifying header integrity. - * - * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. - */ -struct sk_buff *__skb_gso_segment(struct sk_buff *skb, - netdev_features_t features, bool tx_path) -{ - struct sk_buff *segs; - - if (unlikely(skb_needs_check(skb, tx_path))) { - int err; - - /* We're going to init ->check field in TCP or UDP header */ - err = skb_cow_head(skb, 0); - if (err < 0) - return ERR_PTR(err); - } - - /* Only report GSO partial support if it will enable us to - * support segmentation on this frame without needing additional - * work. 
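skb_crc32c_csum_help() above follows the SCTP convention: seed with ~0, invert the final CRC, and store the result little-endian at csum_start + csum_offset. For reference, a bit-at-a-time CRC32c (Castagnoli) core, assumed equivalent to what skb_crc32c() computes over the skb data (the kernel uses table-driven or hardware-accelerated versions):

	#include <stddef.h>
	#include <stdint.h>

	static uint32_t crc32c(uint32_t crc, const uint8_t *p, size_t len)
	{
		while (len--) {
			crc ^= *p++;
			for (int k = 0; k < 8; k++)
				/* reflected Castagnoli polynomial */
				crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1));
		}
		return crc;
	}

	/* matching the patch: crc = ~crc32c(~0u, data, len) is what ends
	 * up in the packet as a __le32
	 */
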
- */ - if (features & NETIF_F_GSO_PARTIAL) { - netdev_features_t partial_features = NETIF_F_GSO_ROBUST; - struct net_device *dev = skb->dev; - - partial_features |= dev->features & dev->gso_partial_features; - if (!skb_gso_ok(skb, features | partial_features)) - features &= ~NETIF_F_GSO_PARTIAL; - } - - BUILD_BUG_ON(SKB_GSO_CB_OFFSET + - sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); - - SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); - SKB_GSO_CB(skb)->encap_level = 0; - - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - - segs = skb_mac_gso_segment(skb, features); - - if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) - skb_warn_bad_offload(skb); - - return segs; -} -EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG @@ -3437,8 +3685,9 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = skb_frag_page(frag); - if (PageHighMem(skb_frag_page(frag))) + if (page && PageHighMem(page)) return 1; } } @@ -3510,6 +3759,9 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; + if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb))) + return features & ~NETIF_F_GSO_MASK; + if (!skb_shinfo(skb)->gso_type) { skb_warn_bad_offload(skb); return features & ~NETIF_F_GSO_MASK; @@ -3524,8 +3776,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; - /* Make sure to clear the IPv4 ID mangling feature if the - * IPv4 header has the potential to be fragmented. + /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header + * has the potential to be fragmented so that TSO does not generate + * segments with the same ID. For encapsulated packets, the ID mangling + * feature is guaranteed not to use the same ID for the outer IPv4 + * headers of the generated segments if the headers have the potential + * to be fragmented, so there is no need to clear the IPv4 ID mangling + * feature (see the section about NETIF_F_TSO_MANGLEID in + * segmentation-offloads.rst). */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? @@ -3535,6 +3793,18 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, features &= ~NETIF_F_TSO_MANGLEID; } + /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, + * so neither does TSO that depends on it. 
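The test that follows leans on a single invariant: a bare IPv6 header is exactly 40 bytes, so a larger network-header span means extension headers sit between the IPv6 header and the transport header (with the hop-by-hop jumbo option explicitly exempted via ipv6_has_hopopt_jumbo()). As a standalone predicate (made-up helper name, illustrative only):

	static bool ipv6_has_ext_headers(const struct sk_buff *skb)
	{
		/* only meaningful once both header offsets are set */
		return skb_transport_header_was_set(skb) &&
		       skb_network_header_len(skb) != sizeof(struct ipv6hdr);
	}
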
+ */ + if (features & NETIF_F_IPV6_CSUM && + (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 || + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && + skb_transport_header_was_set(skb) && + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) + features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4); + return features; } @@ -3575,7 +3845,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, unsigned int len; int rc; - if (dev_nit_active(dev)) + if (dev_nit_active_rcu(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; @@ -3634,6 +3904,11 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, return 0; if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { + if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) + goto sw_checksum; + switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): @@ -3641,14 +3916,80 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, } } +sw_checksum: return skb_checksum_help(skb); } EXPORT_SYMBOL(skb_csum_hwoffload_help); +/* Checks if this SKB belongs to an HW offloaded socket + * and whether any SW fallbacks are required based on dev. + * Check decrypted mark in case skb_orphan() cleared socket. + */ +static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, + struct net_device *dev) +{ +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); + struct sock *sk = skb->sk; + + sk_validate = NULL; + if (sk) { + if (sk_fullsock(sk)) + sk_validate = sk->sk_validate_xmit_skb; + else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT) + sk_validate = inet_twsk(sk)->tw_validate_xmit_skb; + } + + if (sk_validate) { + skb = sk_validate(sk, dev, skb); + } else if (unlikely(skb_is_decrypted(skb))) { + pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); + kfree_skb(skb); + skb = NULL; + } +#endif + + return skb; +} + +static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, + struct net_device *dev) +{ + struct skb_shared_info *shinfo; + struct net_iov *niov; + + if (likely(skb_frags_readable(skb))) + goto out; + + if (!dev->netmem_tx) + goto out_free; + + shinfo = skb_shinfo(skb); + + if (shinfo->nr_frags > 0) { + niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); + if (net_is_devmem_iov(niov) && + net_devmem_iov_binding(niov)->dev != dev) + goto out_free; + } + +out: + return skb; + +out_free: + kfree_skb(skb); + return NULL; +} + static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; + skb = validate_xmit_unreadable_skb(skb, dev); + if (unlikely(!skb)) + goto out_null; + features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) @@ -3708,7 +4049,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d next = skb->next; skb_mark_not_on_list(skb); - /* in case skb wont be segmented, point to itself */ + /* in case skb won't be segmented, point to itself */ skb->prev = skb; skb = validate_xmit_skb(skb, dev, again); @@ -3728,43 +4069,58 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); -static void qdisc_pkt_len_init(struct sk_buff *skb) +static void qdisc_pkt_len_segs_init(struct sk_buff 
*skb) { - const struct skb_shared_info *shinfo = skb_shinfo(skb); + struct skb_shared_info *shinfo = skb_shinfo(skb); + u16 gso_segs; qdisc_skb_cb(skb)->pkt_len = skb->len; + if (!shinfo->gso_size) { + qdisc_skb_cb(skb)->pkt_segs = 1; + return; + } + + qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ - if (shinfo->gso_size && skb_transport_header_was_set(skb)) { + if (skb_transport_header_was_set(skb)) { unsigned int hdr_len; - u16 gso_segs = shinfo->gso_segs; /* mac layer + network layer */ - hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + if (!skb->encapsulation) + hdr_len = skb_transport_offset(skb); + else + hdr_len = skb_inner_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { const struct tcphdr *th; struct tcphdr _tcphdr; - th = skb_header_pointer(skb, skb_transport_offset(skb), + th = skb_header_pointer(skb, hdr_len, sizeof(_tcphdr), &_tcphdr); if (likely(th)) hdr_len += __tcp_hdrlen(th); - } else { + } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { struct udphdr _udphdr; - if (skb_header_pointer(skb, skb_transport_offset(skb), + if (skb_header_pointer(skb, hdr_len, sizeof(_udphdr), &_udphdr)) hdr_len += sizeof(struct udphdr); } - if (shinfo->gso_type & SKB_GSO_DODGY) - gso_segs = DIV_ROUND_UP(skb->len - hdr_len, - shinfo->gso_size); + if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) { + int payload = skb->len - hdr_len; + /* Malicious packet. */ + if (payload <= 0) + return; + gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); + shinfo->gso_segs = gso_segs; + qdisc_skb_cb(skb)->pkt_segs = gso_segs; + } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } } @@ -3785,13 +4141,16 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { + struct sk_buff *next, *to_free = NULL, *to_free2 = NULL; spinlock_t *root_lock = qdisc_lock(q); - struct sk_buff *to_free = NULL; - bool contended; + struct llist_node *ll_list, *first_n; + unsigned long defer_count = 0; int rc; qdisc_calculate_pkt_len(skb, q); + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP); + if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && qdisc_run_begin(q)) { @@ -3801,9 +4160,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, if (unlikely(!nolock_qdisc_is_empty(q))) { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); - qdisc_run_end(q); + to_free2 = qdisc_run_end(q); - goto no_lock_out; + goto free_skbs; } qdisc_bstats_cpu_update(q, skb); @@ -3811,74 +4170,93 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, !nolock_qdisc_is_empty(q)) __qdisc_run(q); - qdisc_run_end(q); - return NET_XMIT_SUCCESS; + to_free2 = qdisc_run_end(q); + rc = NET_XMIT_SUCCESS; + goto free_skbs; } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); - qdisc_run(q); - -no_lock_out: - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, - SKB_DROP_REASON_QDISC_DROP); - return rc; + to_free2 = qdisc_run(q); + goto free_skbs; } - /* - * Heuristic to force contended enqueues to serialize on a - * separate lock before trying to get qdisc main lock. - * This permits qdisc->running owner to get the lock more - * often and dequeue packets faster. - * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit - * and then other tasks will only enqueue packets. 
The packets will be - * sent after the qdisc owner is scheduled again. To prevent this - * scenario the task always serialize on the lock. + /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit. + * In the try_cmpxchg() loop, we want to increment q->defer_count + * at most once to limit the number of skbs in defer_list. + * We perform the defer_count increment only if the list is not empty, + * because some arches have slow atomic_long_inc_return(). + */ + first_n = READ_ONCE(q->defer_list.first); + do { + if (first_n && !defer_count) { + defer_count = atomic_long_inc_return(&q->defer_count); + if (unlikely(defer_count > READ_ONCE(q->limit))) { + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + return NET_XMIT_DROP; + } + } + skb->ll_node.next = first_n; + } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node)); + + /* If defer_list was not empty, we know the cpu which queued + * the first skb will process the whole list for us. */ - contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); - if (unlikely(contended)) - spin_lock(&q->busylock); + if (first_n) + return NET_XMIT_SUCCESS; spin_lock(root_lock); + + ll_list = llist_del_all(&q->defer_list); + /* There is a small race because we clear defer_count not atomically + * with the prior llist_del_all(). This means defer_list could grow + * over q->limit. + */ + atomic_long_set(&q->defer_count, 0); + + ll_list = llist_reverse_order(ll_list); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - __qdisc_drop(skb, &to_free); + llist_for_each_entry_safe(skb, next, ll_list, ll_node) + __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; - } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && - qdisc_run_begin(q)) { + goto unlock; + } + if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && + !llist_next(ll_list) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. 
*/ + DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list, + struct sk_buff, + ll_node)); qdisc_bstats_update(q, skb); - - if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) __qdisc_run(q); - } - - qdisc_run_end(q); + to_free2 = qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { - rc = dev_qdisc_enqueue(skb, q, &to_free, txq); - if (qdisc_run_begin(q)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } - __qdisc_run(q); - qdisc_run_end(q); + int count = 0; + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + prefetch(next); + prefetch(&next->priority); + skb_mark_not_on_list(skb); + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + count++; } + to_free2 = qdisc_run(q); + if (count != 1) + rc = NET_XMIT_SUCCESS; } +unlock: spin_unlock(root_lock); - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); - if (unlikely(contended)) - spin_unlock(&q->busylock); + +free_skbs: + tcf_kfree_skb_list(to_free); + tcf_kfree_skb_list(to_free2); return rc; } @@ -3928,69 +4306,242 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS -static struct sk_buff * -sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) +{ + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); +} + +#ifndef CONFIG_PREEMPT_RT +static bool netdev_xmit_txqueue_skipped(void) { + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); + +#else +static bool netdev_xmit_txqueue_skipped(void) +{ + return current->net_xmit.skip_txqueue; +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + current->net_xmit.skip_txqueue = skip; +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif +#endif /* CONFIG_NET_EGRESS */ + +#ifdef CONFIG_NET_XGRESS +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, + enum skb_drop_reason *drop_reason) +{ + int ret = TC_ACT_UNSPEC; #ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); - struct tcf_result cl_res; + struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); + struct tcf_result res; if (!miniq) - return skb; + return ret; + + /* Global bypass */ + if (!static_branch_likely(&tcf_sw_enabled_key)) + return ret; + + /* Block-wise bypass */ + if (tcf_block_bypass_sw(miniq->block)) + return ret; - /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; - mini_qdisc_bstats_cpu_update(miniq, skb); + qdisc_skb_cb(skb)->post_ct = false; + tcf_set_drop_reason(skb, *drop_reason); - switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { + mini_qdisc_bstats_cpu_update(miniq, skb); + ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); + /* Only tcf related quirks below. 
*/ + switch (ret) { + case TC_ACT_SHOT: + *drop_reason = tcf_get_drop_reason(skb); + mini_qdisc_qstats_cpu_drop(miniq); + break; case TC_ACT_OK: case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(cl_res.classid); + skb->tc_index = TC_H_MIN(res.classid); break; + } +#endif /* CONFIG_NET_CLS_ACT */ + return ret; +} + +static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); + +void tcx_inc(void) +{ + static_branch_inc(&tcx_needed_key); +} + +void tcx_dec(void) +{ + static_branch_dec(&tcx_needed_key); +} + +static __always_inline enum tcx_action_base +tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, + const bool needs_mac) +{ + const struct bpf_mprog_fp *fp; + const struct bpf_prog *prog; + int ret = TCX_NEXT; + + if (needs_mac) + __skb_push(skb, skb->mac_len); + bpf_mprog_foreach_prog(entry, fp, prog) { + bpf_compute_data_pointers(skb); + ret = bpf_prog_run(prog, skb); + if (ret != TCX_NEXT) + break; + } + if (needs_mac) + __skb_pull(skb, skb->mac_len); + return tcx_action_code(skb, ret); +} + +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); + enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + int sch_ret; + + if (!entry) + return skb; + + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + if (unlikely(*pt_prev)) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + qdisc_pkt_len_segs_init(skb); + tcx_set_ingress(skb, true); + + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, true); + if (sch_ret != TC_ACT_UNSPEC) + goto ingress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); +ingress_verdict: + switch (sch_ret) { + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by BPF, so we can safely + * push the L2 header back before redirecting to another + * netdev. + */ + __skb_push(skb, skb->mac_len); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } + *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - *ret = NET_XMIT_DROP; - kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + kfree_skb_reason(skb, drop_reason); + *ret = NET_RX_DROP; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; + /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: - *ret = NET_XMIT_SUCCESS; consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; + } + bpf_net_ctx_clear(bpf_net_ctx); + + return skb; +} + +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); + enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + int sch_ret; + + if (!entry) + return skb; + + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + + /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was + * already set by the caller. 
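From the BPF side, a program attached to this hook returns one of the tcx verdicts that tcx_action_code() translates for the switch statements here; a minimal example (illustrative, assuming libbpf's tcx section naming; the program name is made up):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	SEC("tcx/egress")
	int tcx_noop(struct __sk_buff *skb)
	{
		/* TCX_NEXT falls through to the next program in the mprog
		 * array; TCX_PASS maps to TC_ACT_OK, TCX_DROP to
		 * TC_ACT_SHOT, TCX_REDIRECT to TC_ACT_REDIRECT
		 */
		return TCX_PASS;
	}

	char _license[] SEC("license") = "GPL";
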
+ */ + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, false); + if (sch_ret != TC_ACT_UNSPEC) + goto egress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); +egress_verdict: + switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + case TC_ACT_SHOT: + kfree_skb_reason(skb, drop_reason); + *ret = NET_XMIT_DROP; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + /* used by tc_run */ + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: + *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; - default: - break; } -#endif /* CONFIG_NET_CLS_ACT */ + bpf_net_ctx_clear(bpf_net_ctx); return skb; } - -static struct netdev_queue * -netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) -{ - int qm = skb_get_queue_mapping(skb); - - return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); -} - -static bool netdev_xmit_txqueue_skipped(void) +#else +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) { - return __this_cpu_read(softnet_data.xmit.skip_txqueue); + return skb; } -void netdev_xmit_skip_txqueue(bool skip) +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { - __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); + return skb; } -EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); -#endif /* CONFIG_NET_EGRESS */ +#endif /* CONFIG_NET_XGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, @@ -4069,12 +4620,31 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, } EXPORT_SYMBOL(dev_pick_tx_zero); -u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev) +int sk_tx_queue_get(const struct sock *sk) { - return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; + int resel, val; + + if (!sk) + return -1; + /* Paired with WRITE_ONCE() in sk_tx_queue_clear() + * and sk_tx_queue_set(). + */ + val = READ_ONCE(sk->sk_tx_queue_mapping); + + if (val == NO_QUEUE_MAPPING) + return -1; + + if (!sk_fullsock(sk)) + return val; + + resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection); + if (resel && time_is_before_jiffies( + READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel)) + return -1; + + return val; } -EXPORT_SYMBOL(dev_pick_tx_cpu_id); +EXPORT_SYMBOL(sk_tx_queue_get); u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) @@ -4091,8 +4661,7 @@ u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, if (new_index < 0) new_index = skb_tx_hash(dev, sb_dev, skb); - if (queue_index != new_index && sk && - sk_fullsock(sk) && + if (sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); @@ -4163,7 +4732,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_reset_mac_header(skb); skb_assert_len(skb); - if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + if (unlikely(skb_shinfo(skb)->tx_flags & + (SKBTX_SCHED_TSTAMP | SKBTX_BPF))) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. 
Also @@ -4173,10 +4743,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); - qdisc_pkt_len_init(skb); -#ifdef CONFIG_NET_CLS_ACT - skb->tc_at_ingress = 0; -#endif + qdisc_pkt_len_segs_init(skb); + tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { @@ -4318,20 +4886,11 @@ EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ +static DEFINE_PER_CPU(struct task_struct *, backlog_napi); -int netdev_max_backlog __read_mostly = 1000; -EXPORT_SYMBOL(netdev_max_backlog); - -int netdev_tstamp_prequeue __read_mostly = 1; -unsigned int sysctl_skb_defer_max __read_mostly = 64; -int netdev_budget __read_mostly = 300; -/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ -unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ -int dev_rx_weight __read_mostly = 64; -int dev_tx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -4343,52 +4902,88 @@ static inline void ____napi_schedule(struct softnet_data *sd, if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in - * napi_enable()/dev_set_threaded(). + * napi_enable()/netif_set_threaded(). * Use READ_ONCE() to guarantee a complete * read on napi->thread. Only call * wake_up_process() when it's not NULL. */ thread = READ_ONCE(napi->thread); if (thread) { - /* Avoid doing set_bit() if the thread is in - * INTERRUPTIBLE state, cause napi_thread_wait() - * makes sure to proceed with napi polling - * if the thread is explicitly woken from here. - */ - if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) - set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); + if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) + goto use_local_napi; + + set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } } +use_local_napi: + DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list)); list_add_tail(&napi->poll_list, &sd->poll_list); - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + WRITE_ONCE(napi->list_owner, smp_processor_id()); + /* If not called from net_rx_action() + * we have to raise NET_RX_SOFTIRQ. + */ + if (!sd->in_net_rx_action) + raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS -/* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; -EXPORT_SYMBOL(rps_sock_flow_table); -u32 rps_cpu_mask __read_mostly; -EXPORT_SYMBOL(rps_cpu_mask); - struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); +static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) +{ + return hash_32(hash, flow_table->log); +} + +#ifdef CONFIG_RFS_ACCEL +/** + * rps_flow_is_active - check whether the flow is recently active. + * @rflow: Specific flow to check activity. + * @flow_table: per-queue flowtable that @rflow belongs to. + * @cpu: CPU saved in @rflow. 
+ * + * If the CPU has processed many packets since the flow's last activity + * (beyond 10 times the table size), the flow is considered stale. + * + * Return: true if flow was recently active. + */ +static bool rps_flow_is_active(struct rps_dev_flow *rflow, + struct rps_dev_flow_table *flow_table, + unsigned int cpu) +{ + unsigned int flow_last_active; + unsigned int sd_input_head; + + if (cpu >= nr_cpu_ids) + return false; + + sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head); + flow_last_active = READ_ONCE(rflow->last_qtail); + + return (int)(sd_input_head - flow_last_active) < + (int)(10 << flow_table->log); +} +#endif + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, - struct rps_dev_flow *rflow, u16 next_cpu) + struct rps_dev_flow *rflow, u16 next_cpu, u32 hash, + u32 flow_id) { if (next_cpu < nr_cpu_ids) { + u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; - u32 flow_id; + struct rps_dev_flow *tmp_rflow; + unsigned int tmp_cpu; u16 rxq_index; int rc; @@ -4404,23 +4999,38 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = skb_get_hash(skb) & flow_table->mask; + + tmp_rflow = &flow_table->flows[flow_id]; + tmp_cpu = READ_ONCE(tmp_rflow->cpu); + + if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { + if (rps_flow_is_active(tmp_rflow, flow_table, + tmp_cpu)) { + if (hash != READ_ONCE(tmp_rflow->hash) || + next_cpu == tmp_cpu) + goto out; + } + } + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; + old_rflow = rflow; - rflow = &flow_table->flows[flow_id]; - rflow->filter = rc; - if (old_rflow->filter == rflow->filter) - old_rflow->filter = RPS_NO_FILTER; + rflow = tmp_rflow; + WRITE_ONCE(rflow->filter, rc); + WRITE_ONCE(rflow->hash, hash); + + if (old_rflow->filter == rc) + WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: #endif - rflow->last_qtail = - per_cpu(softnet_data, next_cpu).input_queue_head; + head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); + rps_input_queue_tail_save(&rflow->last_qtail, head); } - rflow->cpu = next_cpu; + WRITE_ONCE(rflow->cpu, next_cpu); return rflow; } @@ -4437,6 +5047,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow_table *flow_table; struct rps_map *map; int cpu = -1; + u32 flow_id; u32 tcpu; u32 hash; @@ -4465,23 +5076,26 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (!hash) goto done; - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (flow_table && sock_flow_table) { struct rps_dev_flow *rflow; u32 next_cpu; u32 ident; - /* First check into global flow table if there is a match */ - ident = sock_flow_table->ents[hash & sock_flow_table->mask]; - if ((ident ^ hash) & ~rps_cpu_mask) + /* First check into global flow table if there is a match. + * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). 
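The ident word packs the flow hash and the steering CPU into one u32: rps_record_sock_flow() stores (hash & ~rps_cpu_mask) | cpu, so the low bits name the CPU while the high bits fingerprint the hash to detect slot collisions. A worked example (illustrative, assuming an 8-bit CPU mask):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t rps_cpu_mask = 0xff;	/* illustrative width */
		uint32_t hash = 0xabcd1234, cpu = 3;

		/* what rps_record_sock_flow() stores in the slot */
		uint32_t ident = (hash & ~rps_cpu_mask) | cpu;	/* 0xabcd1203 */

		/* the check here: high bits must match this flow's hash */
		if (!((ident ^ hash) & ~rps_cpu_mask))
			printf("match, next_cpu = %u\n", ident & rps_cpu_mask);
		return 0;
	}

A different flow that hashes into the same slot almost surely differs in the high bits, so the lookup falls through to plain RPS instead of trusting a stale CPU.
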
+ */ + ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); + if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps; - next_cpu = ident & rps_cpu_mask; + next_cpu = ident & net_hotdata.rps_cpu_mask; /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ - rflow = &flow_table->flows[hash & flow_table->mask]; + flow_id = rfs_slot(hash, flow_table); + rflow = &flow_table->flows[flow_id]; tcpu = rflow->cpu; /* @@ -4497,10 +5111,11 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || - ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; - rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash, + flow_id); } if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { @@ -4544,17 +5159,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (flow_table && flow_id <= flow_table->mask) { + if (flow_table && flow_id < (1UL << flow_table->log)) { + unsigned int cpu; + rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu < nr_cpu_ids && - ((int)(per_cpu(softnet_data, cpu).input_queue_head - - rflow->last_qtail) < - (int)(10 * flow_table->mask))) + if (READ_ONCE(rflow->filter) == filter_id && + rps_flow_is_active(rflow, flow_table, cpu)) expire = false; } rcu_read_unlock(); @@ -4570,7 +5184,8 @@ static void rps_trigger_softirq(void *data) struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); - sd->received_rps++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->received_rps, sd->received_rps + 1); } #endif /* CONFIG_RPS */ @@ -4585,39 +5200,71 @@ static void trigger_rx_softirq(void *data) } /* - * Check if this softnet_data structure is another cpu one - * If yes, queue it to our IPI list and return 1 - * If no, return 0 + * After we queued a packet into sd->input_pkt_queue, + * we need to make sure this queue is serviced soon. + * + * - If this is another cpu queue, link it to our rps_ipi_list, + * and make sure we will process rps_ipi_list from net_rx_action(). + * + * - If this is our own queue, NAPI schedule our backlog. + * Note that this also raises NET_RX_SOFTIRQ. */ -static int napi_schedule_rps(struct softnet_data *sd) +static void napi_schedule_rps(struct softnet_data *sd) { struct softnet_data *mysd = this_cpu_ptr(&softnet_data); #ifdef CONFIG_RPS if (sd != mysd) { + if (use_backlog_threads()) { + __napi_schedule_irqoff(&sd->backlog); + return; + } + sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - return 1; + /* If not called from net_rx_action() or napi_threaded_poll() + * we have to raise NET_RX_SOFTIRQ. 
+ */ + if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + return; } #endif /* CONFIG_RPS */ __napi_schedule_irqoff(&mysd->backlog); - return 0; +} + +void kick_defer_list_purge(unsigned int cpu) +{ + struct softnet_data *sd = &per_cpu(softnet_data, cpu); + unsigned long flags; + + if (use_backlog_threads()) { + backlog_lock_irq_save(sd, &flags); + + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + __napi_schedule_irqoff(&sd->backlog); + + backlog_unlock_irq_restore(sd, &flags); + + } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { + smp_call_function_single_async(cpu, &sd->defer_csd); + } } #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif -static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen, + int max_backlog) { #ifdef CONFIG_NET_FLOW_LIMIT - struct sd_flow_limit *fl; - struct softnet_data *sd; unsigned int old_flow, new_flow; + const struct softnet_data *sd; + struct sd_flow_limit *fl; - if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) + if (likely(qlen < (max_backlog >> 1))) return false; sd = this_cpu_ptr(&softnet_data); @@ -4625,7 +5272,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { - new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); + new_flow = hash_32(skb_get_hash(skb), fl->log_buckets); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; @@ -4636,7 +5283,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { - fl->count++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(fl->count, fl->count + 1); rcu_read_unlock(); return true; } @@ -4657,36 +5305,46 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, struct softnet_data *sd; unsigned long flags; unsigned int qlen; + int max_backlog; + u32 tail; + + reason = SKB_DROP_REASON_DEV_READY; + if (unlikely(!netif_running(skb->dev))) + goto bad_dev; - reason = SKB_DROP_REASON_NOT_SPECIFIED; sd = &per_cpu(softnet_data, cpu); - rps_lock_irqsave(sd, &flags); - if (!netif_running(skb->dev)) - goto drop; + qlen = skb_queue_len_lockless(&sd->input_pkt_queue); + max_backlog = READ_ONCE(net_hotdata.max_backlog); + if (unlikely(qlen > max_backlog) || + skb_flow_limit(skb, qlen, max_backlog)) + goto cpu_backlog_drop; + backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { - if (qlen) { -enqueue: - __skb_queue_tail(&sd->input_pkt_queue, skb); - input_queue_tail_incr_save(sd, qtail); - rps_unlock_irq_restore(sd, &flags); - return NET_RX_SUCCESS; + if (likely(qlen <= max_backlog)) { + if (!qlen) { + /* Schedule NAPI for backlog device. We can use + * non atomic operation as we own the queue lock. 
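skb_flow_limit() above now indexes its history buckets with hash_32(skb_get_hash(skb), fl->log_buckets) instead of masking low bits; multiplicative hashing keeps the top log_buckets bits of a multiply by a golden-ratio-derived constant, which tolerates poorly distributed inputs better. Assumed-equivalent sketch of the helper:

	#include <stdint.h>

	#define GOLDEN_RATIO_32 0x61C88647u

	/* same shape as the kernel's hash_32(): multiplicative hash
	 * returning the top 'bits' bits
	 */
	static uint32_t hash_32(uint32_t val, unsigned int bits)
	{
		return (val * GOLDEN_RATIO_32) >> (32 - bits);
	}

	/* e.g. with log_buckets = 6, every skb hash maps into one of 64
	 * history buckets used to spot dominating flows
	 */
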
+ */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, + &sd->backlog.state)) + napi_schedule_rps(sd); } + __skb_queue_tail(&sd->input_pkt_queue, skb); + tail = rps_input_queue_tail_incr(sd); + backlog_unlock_irq_restore(sd, &flags); - /* Schedule NAPI for backlog device - * We can use non atomic operation since we own the queue lock - */ - if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) - napi_schedule_rps(sd); - goto enqueue; + /* save the tail outside of the critical section */ + rps_input_queue_tail_save(qtail, tail); + return NET_RX_SUCCESS; } - reason = SKB_DROP_REASON_CPU_BACKLOG; -drop: - sd->dropped++; - rps_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, &flags); +cpu_backlog_drop: + reason = SKB_DROP_REASON_CPU_BACKLOG; + numa_drop_add(&sd->drop_counters, 1); +bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; @@ -4716,7 +5374,7 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; @@ -4741,6 +5399,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); + if (skb_is_nonlinear(skb)) { + skb_shinfo(skb)->xdp_frags_size = skb->data_len; + xdp_buff_set_frags_flag(xdp); + } else { + xdp_buff_clear_frags_flag(xdp); + } orig_data_end = xdp->data_end; orig_data = xdp->data; @@ -4770,6 +5434,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, skb->len += off; /* positive on grow, negative on shrink */ } + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. + */ + if (xdp_buff_has_frags(xdp)) + skb->data_len = skb_shinfo(skb)->xdp_frags_size; + else + skb->data_len = 0; + /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || @@ -4803,11 +5475,38 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, return act; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, +static int +netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) +{ + struct sk_buff *skb = *pskb; + int err, hroom, troom; + + local_lock_nested_bh(&system_page_pool.bh_lock); + err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog); + local_unlock_nested_bh(&system_page_pool.bh_lock); + if (!err) + return 0; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + troom = skb->tail + skb->data_len - skb->end; + err = pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC); + if (err) + return err; + + return skb_linearize(skb); +} + +static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { - u32 act = XDP_DROP; + struct sk_buff *skb = *pskb; + u32 mac_len, act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. 
@@ -4815,41 +5514,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_is_redirected(skb)) return XDP_PASS; - /* XDP packets must be linear and must have sufficient headroom - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also - * native XDP provides, thus we need to do it here as well. + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM + * bytes. This is the guarantee that also native XDP provides, + * thus we need to do it here as well. */ + mac_len = skb->data - skb_mac_header(skb); + __skb_push(skb, mac_len); + if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { - int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); - int troom = skb->tail + skb->data_len - skb->end; - - /* In case we have to go down the path and also linearize, - * then lets do the pskb_expand_head() work just once here. - */ - if (pskb_expand_head(skb, - hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, - troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) - goto do_drop; - if (skb_linearize(skb)) + if (netif_skb_check_for_xdp(pskb, xdp_prog)) goto do_drop; } - act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); + __skb_pull(*pskb, mac_len); + + act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); switch (act) { case XDP_REDIRECT: case XDP_TX: case XDP_PASS: break; default: - bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); + bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: - trace_xdp_exception(skb->dev, xdp_prog, act); + trace_xdp_exception((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_DROP: do_drop: - kfree_skb(skb); + kfree_skb(*pskb); break; } @@ -4862,7 +5556,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue. 
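The generic XDP path here handles programs installed in skb mode, either requested explicitly or chosen as a fallback when the driver lacks native XDP. An illustrative userspace attach helper (assuming libbpf's bpf_xdp_attach(); prog_fd is an already-loaded XDP program):

	#include <bpf/libbpf.h>
	#include <linux/if_link.h>
	#include <net/if.h>

	int attach_generic_xdp(const char *ifname, int prog_fd)
	{
		int ifindex = if_nametoindex(ifname);

		if (!ifindex)
			return -1;
		/* XDP_FLAGS_SKB_MODE forces the generic path above even
		 * on drivers that support native XDP
		 */
		return bpf_xdp_attach(ifindex, prog_fd, XDP_FLAGS_SKB_MODE,
				      NULL);
	}
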
*/ -void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -4887,32 +5581,38 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; - act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: - err = xdp_do_generic_redirect(skb->dev, skb, + err = xdp_do_generic_redirect((*pskb)->dev, *pskb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: - generic_xdp_tx(skb, xdp_prog); + generic_xdp_tx(*pskb, xdp_prog); break; } + bpf_net_ctx_clear(bpf_net_ctx); return XDP_DROP; } + bpf_net_ctx_clear(bpf_net_ctx); } return XDP_PASS; out_redir: - kfree_skb_reason(skb, SKB_DROP_REASON_XDP); + bpf_net_ctx_clear(bpf_net_ctx); + kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); @@ -4921,7 +5621,7 @@ static int netif_rx_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_rx(skb); @@ -5005,7 +5705,7 @@ int netif_rx(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx); -static __latent_entropy void net_tx_action(struct softirq_action *h) +static __latent_entropy void net_tx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -5023,16 +5723,17 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) clist = clist->next; WARN_ON(refcount_read(&skb->users)); - if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) - trace_consume_skb(skb); + if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED)) + trace_consume_skb(skb, net_tx_action); else trace_kfree_skb(skb, net_tx_action, - SKB_DROP_REASON_NOT_SPECIFIED); + get_kfree_skb_cb(skb)->reason, NULL); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else - __kfree_skb_defer(skb); + __napi_kfree_skb(skb, + get_kfree_skb_cb(skb)->reason); } } @@ -5048,8 +5749,9 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) rcu_read_lock(); while (head) { - struct Qdisc *q = head; spinlock_t *root_lock = NULL; + struct sk_buff *to_free; + struct Qdisc *q = head; head = head->next_sched; @@ -5076,9 +5778,10 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) } clear_bit(__QDISC_STATE_SCHED, &q->state); - qdisc_run(q); + to_free = qdisc_run(q); if (root_lock) spin_unlock(root_lock); + tcf_kfree_skb_list(to_free); } rcu_read_unlock(); @@ -5094,72 +5797,6 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -static inline struct sk_buff * -sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev, bool *another) -{ -#ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); - struct tcf_result cl_res; - - /* If there's at least one ingress present somewhere (so - * we get here via enabled static key), remaining devices - * that 
are not configured with an ingress qdisc will bail - * out here. - */ - if (!miniq) - return skb; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - qdisc_skb_cb(skb)->pkt_len = skb->len; - tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; - skb->tc_at_ingress = 1; - mini_qdisc_bstats_cpu_update(miniq, skb); - - switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { - case TC_ACT_OK: - case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(cl_res.classid); - break; - case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); - *ret = NET_RX_DROP; - return NULL; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - case TC_ACT_TRAP: - consume_skb(skb); - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_REDIRECT: - /* skb_mac_header check was done by cls/act_bpf, so - * we can safely push the L2 header back before - * redirecting to another netdev - */ - __skb_push(skb, skb->mac_len); - if (skb_do_redirect(skb) == -EAGAIN) { - __skb_pull(skb, skb->mac_len); - *another = true; - break; - } - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_CONSUMED: - *ret = NET_RX_SUCCESS; - return NULL; - default: - break; - } -#endif /* CONFIG_NET_CLS_ACT */ - return skb; -} - /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check @@ -5254,7 +5891,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, if (nf_hook_ingress_active(skb)) { int ingress_retval; - if (*pt_prev) { + if (unlikely(*pt_prev)) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } @@ -5270,6 +5907,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, struct packet_type **ppt_prev) { + enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO; struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct sk_buff *skb = *pskb; @@ -5278,15 +5916,21 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, int ret = NET_RX_DROP; __be16 type; - net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_receive_skb(skb); orig_dev = skb->dev; skb_reset_network_header(skb); +#if !defined(CONFIG_DEBUG_NET) + /* We plan to no longer reset the transport header here. + * Give some time to fuzzers and dev build to catch bugs + * in network stacks. 
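Editor's note: the do_xdp_generic() change above, from `struct sk_buff *skb` to `struct sk_buff **pskb`, is load-bearing. Generic XDP may have to reallocate or re-head the skb, and the callee must be able to hand the replacement pointer back to __netif_receive_skb_core(). Below is a minimal standalone userspace sketch of the double-pointer idiom, assuming a hypothetical `struct buf`; it is not kernel code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct buf { size_t headroom; char data[64]; };

/* May need to reallocate to gain headroom; updates the caller's pointer. */
static int ensure_headroom(struct buf **pbuf, size_t need)
{
	struct buf *nb;

	if ((*pbuf)->headroom >= need)
		return 0;
	nb = malloc(sizeof(*nb));
	if (!nb)
		return -1;
	memcpy(nb, *pbuf, sizeof(*nb));
	nb->headroom = need;
	free(*pbuf);
	*pbuf = nb;		/* the caller observes the replacement */
	return 0;
}

int main(void)
{
	struct buf *b = calloc(1, sizeof(*b));

	if (!b || ensure_headroom(&b, 256) < 0)
		return 1;
	printf("headroom now %zu\n", b->headroom);
	free(b);
	return 0;
}

With a single pointer, the reallocation in ensure_headroom() would be invisible to the caller, which would keep using freed memory; the same reasoning applies to the skb passed through generic XDP.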
+ */ if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); +#endif skb_reset_mac_len(skb); pt_prev = NULL; @@ -5300,7 +5944,8 @@ another_round: int ret2; migrate_disable(); - ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + &skb); migrate_enable(); if (ret2 != XDP_PASS) { @@ -5321,14 +5966,15 @@ another_round: if (pfmemalloc) goto skip_taps; - list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (pt_prev) + list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all, + list) { + if (unlikely(pt_prev)) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { - if (pt_prev) + if (unlikely(pt_prev)) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } @@ -5353,11 +5999,13 @@ skip_taps: #endif skb_reset_redirect(skb); skip_classify: - if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) + if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) { + drop_reason = SKB_DROP_REASON_PFMEMALLOC; goto drop; + } if (skb_vlan_tag_present(skb)) { - if (pt_prev) { + if (unlikely(pt_prev)) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } @@ -5369,7 +6017,7 @@ skip_classify: rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { - if (pt_prev) { + if (unlikely(pt_prev)) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } @@ -5433,6 +6081,14 @@ check_vlan_id: deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); + + /* orig_dev and skb->dev could belong to different netns; + * Even in such case we need to traverse only the list + * coming from skb->dev, as the ptype owner (packet socket) + * will use dev_net(skb->dev) to do namespace filtering. + */ + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &dev_net_rcu(skb->dev)->ptype_specific); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, @@ -5444,8 +6100,6 @@ check_vlan_id: } if (pt_prev) { - if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) - goto drop; *ppt_prev = pt_prev; } else { drop: @@ -5453,7 +6107,8 @@ drop: dev_core_stats_rx_dropped_inc(skb->dev); else dev_core_stats_rx_nohandler_inc(skb->dev); - kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); + + kfree_skb_reason(skb, drop_reason); /* Jamal, now you will not able to escape explaining * me how you were going to use this. 
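Editor's note: __netif_receive_skb_core() now seeds `drop_reason` with SKB_DROP_REASON_UNHANDLED_PROTO and refines it at the specific bail-out site (SKB_DROP_REASON_PFMEMALLOC above), so the single kfree_skb_reason() behind the shared `drop:` label reports the true cause. A minimal sketch of the one-label, many-reasons pattern follows; the enum and helpers are simplified stand-ins, not the kernel's.

#include <stdio.h>

enum drop_reason { DROP_UNHANDLED_PROTO, DROP_PFMEMALLOC };

static const char *reason_str(enum drop_reason r)
{
	return r == DROP_PFMEMALLOC ? "pfmemalloc" : "unhandled_proto";
}

static int receive(int pfmemalloc, int has_handler)
{
	enum drop_reason reason = DROP_UNHANDLED_PROTO;	/* default cause */

	if (pfmemalloc) {
		reason = DROP_PFMEMALLOC;	/* refine at the failure site */
		goto drop;
	}
	if (!has_handler)
		goto drop;
	return 0;
drop:
	printf("dropped: %s\n", reason_str(reason));
	return -1;
}

int main(void)
{
	receive(1, 1);	/* dropped: pfmemalloc */
	receive(0, 0);	/* dropped: unhandled_proto */
	return 0;
}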
:-) */ @@ -5546,10 +6201,9 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo struct packet_type *pt_curr = NULL; /* Current (common) orig_dev of sublist */ struct net_device *od_curr = NULL; - struct list_head sublist; struct sk_buff *skb, *next; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; @@ -5644,7 +6298,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { static_branch_inc(&generic_xdp_needed_key); - dev_disable_lro(dev); + netif_disable_lro(dev); dev_disable_gro_hw(dev); } break; @@ -5661,7 +6315,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; @@ -5687,11 +6341,11 @@ static int netif_receive_skb_internal(struct sk_buff *skb) void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), + skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); @@ -5770,35 +6424,39 @@ void netif_receive_skb_list(struct list_head *head) } EXPORT_SYMBOL(netif_receive_skb_list); -static DEFINE_PER_CPU(struct work_struct, flush_works); - /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; + struct sk_buff_head list; struct softnet_data *sd; + __skb_queue_head_init(&list); local_bh_disable(); sd = this_cpu_ptr(&softnet_data); - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); - dev_kfree_skb_irq(skb); - input_queue_head_incr(sd); + __skb_queue_tail(&list, skb); + rps_input_queue_head_incr(sd); } } - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); - kfree_skb(skb); - input_queue_head_incr(sd); + __skb_queue_tail(&list, skb); + rps_input_queue_head_incr(sd); } } + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); + + __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY); } static bool flush_required(int cpu) @@ -5807,14 +6465,14 @@ static bool flush_required(int cpu) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); return do_flush; #endif @@ 
-5826,36 +6484,54 @@ static bool flush_required(int cpu) return true; } +struct flush_backlogs { + cpumask_t flush_cpus; + struct work_struct w[]; +}; + +static struct flush_backlogs *flush_backlogs_alloc(void) +{ + return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids), + GFP_KERNEL); +} + +static struct flush_backlogs *flush_backlogs_fallback; +static DEFINE_MUTEX(flush_backlogs_mutex); + static void flush_all_backlogs(void) { - static cpumask_t flush_cpus; + struct flush_backlogs *ptr = flush_backlogs_alloc(); unsigned int cpu; - /* since we are under rtnl lock protection we can use static data - * for the cpumask and avoid allocating on stack the possibly - * large mask - */ - ASSERT_RTNL(); + if (!ptr) { + mutex_lock(&flush_backlogs_mutex); + ptr = flush_backlogs_fallback; + } + cpumask_clear(&ptr->flush_cpus); cpus_read_lock(); - cpumask_clear(&flush_cpus); for_each_online_cpu(cpu) { if (flush_required(cpu)) { - queue_work_on(cpu, system_highpri_wq, - per_cpu_ptr(&flush_works, cpu)); - cpumask_set_cpu(cpu, &flush_cpus); + INIT_WORK(&ptr->w[cpu], flush_backlog); + queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]); + __cpumask_set_cpu(cpu, &ptr->flush_cpus); } } /* we can have in flight packet[s] on the cpus we are not flushing, * synchronize_net() in unregister_netdevice_many() will take care of - * them + * them. */ - for_each_cpu(cpu, &flush_cpus) - flush_work(per_cpu_ptr(&flush_works, cpu)); + for_each_cpu(cpu, &ptr->flush_cpus) + flush_work(&ptr->w[cpu]); cpus_read_unlock(); + + if (ptr != flush_backlogs_fallback) + kfree(ptr); + else + mutex_unlock(&flush_backlogs_mutex); } static void net_rps_send_ipi(struct softnet_data *remsd) @@ -5880,7 +6556,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; - if (remsd) { + if (!use_backlog_threads() && remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); @@ -5895,7 +6571,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS - return sd->rps_ipi_list != NULL; + return !use_backlog_threads() && sd->rps_ipi_list; #else return false; #endif @@ -5915,21 +6591,26 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = READ_ONCE(dev_rx_weight); + napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); while (again) { struct sk_buff *skb; + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); while ((skb = __skb_dequeue(&sd->process_queue))) { + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); - input_queue_head_incr(sd); - if (++work >= quota) + if (++work >= quota) { + rps_input_queue_head_add(sd, work); return work; + } + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); } + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). @@ -5939,15 +6620,19 @@ static int process_backlog(struct napi_struct *napi, int quota) * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. 
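Editor's note: flush_all_backlogs() above sizes one allocation holding a cpumask plus an nr_cpu_ids-long flexible array of work items via struct_size_t(), with a mutex-guarded preallocated fallback for when kmalloc() fails. A standalone sketch of the overflow-checked flexible-array sizing is below; batch_alloc() is a hypothetical helper, not a kernel API.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct batch {
	size_t nr;
	int items[];		/* flexible array member */
};

/* Overflow-checked equivalent of struct_size(): base + n * elem. */
static struct batch *batch_alloc(size_t n)
{
	struct batch *b;

	if (n > (SIZE_MAX - sizeof(struct batch)) / sizeof(int))
		return NULL;	/* n * sizeof(int) would overflow */
	b = malloc(sizeof(*b) + n * sizeof(int));
	if (b)
		b->nr = n;
	return b;
}

int main(void)
{
	struct batch *b = batch_alloc(8);
	size_t i;

	if (!b)
		return 1;
	for (i = 0; i < b->nr; i++)
		b->items[i] = (int)i;
	printf("allocated %zu items\n", b->nr);
	free(b);
	return 0;
}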
*/ - napi->state = 0; + napi->state &= NAPIF_STATE_THREADED; again = false; } else { + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); } - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); } + if (work) + rps_input_queue_head_add(sd, work); return work; } @@ -6035,25 +6720,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; if (work_done) { - if (n->gro_bitmask) - timeout = READ_ONCE(n->dev->gro_flush_timeout); - n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); + if (n->gro.bitmask) + timeout = napi_get_gro_flush_timeout(n); + n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; - timeout = READ_ONCE(n->dev->gro_flush_timeout); + timeout = napi_get_gro_flush_timeout(n); if (timeout) ret = false; } - if (n->gro_bitmask) { - /* When the NAPI instance uses a timeout and keeps postponing - * it, we need to bound somehow the time packets are kept in - * the GRO layer - */ - napi_gro_flush(n, !!timeout); - } - gro_normal_list(n); + /* + * When the NAPI instance uses a timeout and keeps postponing + * it, we need to bound somehow the time packets are kept in + * the GRO layer. + */ + gro_flush_normal(&n->gro, !!timeout); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ @@ -6061,6 +6744,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) list_del_init(&n->poll_list); local_irq_restore(flags); } + WRITE_ONCE(n->list_owner, -1); val = READ_ONCE(n->state); do { @@ -6090,17 +6774,26 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -/* must be called under rcu_read_lock(), as we dont take a reference */ -static struct napi_struct *napi_by_id(unsigned int napi_id) +static void skb_defer_free_flush(void) { - unsigned int hash = napi_id % HASH_SIZE(napi_hash); - struct napi_struct *napi; + struct llist_node *free_list; + struct sk_buff *skb, *next; + struct skb_defer_node *sdn; + int node; - hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) - if (napi->napi_id == napi_id) - return napi; + for_each_node(node) { + sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node; - return NULL; + if (llist_empty(&sdn->defer_list)) + continue; + atomic_long_set(&sdn->defer_count, 0); + free_list = llist_del_all(&sdn->defer_list); + + llist_for_each_entry_safe(skb, next, free_list, ll_node) { + prefetch(next); + napi_consume_skb(skb, 1); + } + } } #if defined(CONFIG_NET_RX_BUSY_POLL) @@ -6108,25 +6801,26 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { - gro_normal_list(napi); + gro_normal_list(&napi->gro); __napi_schedule(napi); return; } - if (napi->gro_bitmask) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(napi, HZ >= 1000); - } + /* Flush too old packets. 
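Editor's note: in napi_complete_done(), fresh work reloads defer_hard_irqs_count, and each idle completion decrements it while re-arming the gro_flush_timeout hrtimer, so hard interrupts stay masked for a bounded number of rounds. The following is a rough standalone model of that countdown; `struct napi_sim` and the printout are illustrative only.

#include <stdio.h>
#include <stdbool.h>

struct napi_sim {
	unsigned int defer_count;	/* rounds left with irqs masked */
	unsigned int defer_limit;	/* configured defer_hard_irqs */
	unsigned long timeout_ns;	/* configured gro_flush_timeout */
};

/* Returns true when the hard interrupt may be re-enabled now. */
static bool complete_done(struct napi_sim *n, int work_done)
{
	if (work_done)
		n->defer_count = n->defer_limit; /* fresh work: restart */

	if (n->defer_count > 0 && n->timeout_ns) {
		n->defer_count--;
		printf("arm timer for %lu ns, keep irq masked\n",
		       n->timeout_ns);
		return false;
	}
	return true;
}

int main(void)
{
	struct napi_sim n = { .defer_limit = 2, .timeout_ns = 20000 };

	complete_done(&n, 8);		/* work seen: start deferring */
	while (!complete_done(&n, 0))
		;			/* idle rounds drain the countdown */
	puts("irq re-enabled");
	return 0;
}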
If HZ < 1000, flush all packets */ + gro_flush_normal(&napi->gro, HZ >= 1000); - gro_normal_list(napi); clear_bit(NAPI_STATE_SCHED, &napi->state); } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, - u16 budget) +enum { + NAPI_F_PREFER_BUSY_POLL = 1, + NAPI_F_END_ON_RESCHED = 2, +}; + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, + unsigned flags, u16 budget) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; bool skip_schedule = false; unsigned long timeout; int rc; @@ -6144,10 +6838,11 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); - if (prefer_busy_poll) { - napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); - timeout = READ_ONCE(napi->dev->gro_flush_timeout); + if (flags & NAPI_F_PREFER_BUSY_POLL) { + napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); + timeout = napi_get_gro_flush_timeout(napi); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); skip_schedule = true; @@ -6166,32 +6861,36 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool netpoll_poll_unlock(have_poll_lock); if (rc == budget) __busy_poll_stop(napi, skip_schedule); + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } -void napi_busy_loop(unsigned int napi_id, - bool (*loop_end)(void *, unsigned long), - void *loop_end_arg, bool prefer_busy_poll, u16 budget) +static void __napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, unsigned flags, u16 budget) { unsigned long start_time = loop_end ? 
busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; void *have_poll_lock = NULL; struct napi_struct *napi; + WARN_ON_ONCE(!rcu_read_lock_held()); + restart: napi_poll = NULL; - rcu_read_lock(); - napi = napi_by_id(napi_id); if (!napi) - goto out; + return; - preempt_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); for (;;) { int work = 0; local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); @@ -6200,14 +6899,14 @@ restart: */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) { - if (prefer_busy_poll) + if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) { - if (prefer_busy_poll) + if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } @@ -6216,22 +6915,28 @@ restart: } work = napi_poll(napi, budget); trace_napi_poll(napi, work, budget); - gro_normal_list(napi); + gro_normal_list(&napi->gro); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); + skb_defer_free_flush(); + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { + if (flags & NAPI_F_END_ON_RESCHED) + break; if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); - preempt_enable(); + busy_poll_stop(napi, have_poll_lock, flags, budget); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); rcu_read_unlock(); cond_resched(); + rcu_read_lock(); if (loop_end(loop_end_arg, start_time)) return; goto restart; @@ -6239,33 +6944,113 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); - preempt_enable(); -out: + busy_poll_stop(napi, have_poll_lock, flags, budget); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); +} + +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = NAPI_F_END_ON_RESCHED; + + if (prefer_busy_poll) + flags |= NAPI_F_PREFER_BUSY_POLL; + + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); +} + +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0; + + rcu_read_lock(); + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); +void napi_suspend_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + unsigned long timeout = napi_get_irq_suspend_timeout(napi); + + if (timeout) + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + } + rcu_read_unlock(); +} + +void napi_resume_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + /* If irq_suspend_timeout is set to 0 between the call to + * napi_suspend_irqs and now, the original value still + * determines the safety timeout as intended and napi_watchdog + * will resume irq processing. 
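Editor's note: __napi_busy_loop() claims ownership with a cmpxchg on napi->state: bail out if DISABLE, SCHED, or IN_BUSY_POLL is already set, otherwise set SCHED and IN_BUSY_POLL in a single atomic step. A C11-atomics sketch of the same claim loop, with stand-in bit names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define ST_SCHED	(1u << 0)
#define ST_IN_BUSY_POLL	(1u << 1)
#define ST_DISABLE	(1u << 2)

/* Atomically claim the instance for busy polling: fail if anyone
 * else owns it, otherwise set both ownership bits in one CAS.
 */
static bool claim_for_busy_poll(atomic_uint *state)
{
	unsigned int val = atomic_load(state);

	do {
		if (val & (ST_DISABLE | ST_SCHED | ST_IN_BUSY_POLL))
			return false;	/* busy or being torn down */
	} while (!atomic_compare_exchange_weak(state, &val,
			val | ST_SCHED | ST_IN_BUSY_POLL));
	return true;
}

int main(void)
{
	atomic_uint state = 0;

	printf("first claim:  %d\n", claim_for_busy_poll(&state));
	printf("second claim: %d\n", claim_for_busy_poll(&state));
	return 0;
}

The weak compare-exchange reloads `val` on failure, so the precondition is re-checked against the freshly observed state on every retry, mirroring the kernel's read-then-cmpxchg loop.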
+ */ + if (napi_get_irq_suspend_timeout(napi)) { + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); + } + } + rcu_read_unlock(); +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ +static void __napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) +{ + napi->gro.cached_napi_id = napi_id; + + WRITE_ONCE(napi->napi_id, napi_id); + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); +} + +static void napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) +{ + unsigned long flags; + + spin_lock_irqsave(&napi_hash_lock, flags); + WARN_ON_ONCE(napi_by_id(napi_id)); + __napi_hash_add_with_id(napi, napi_id); + spin_unlock_irqrestore(&napi_hash_lock, flags); +} + static void napi_hash_add(struct napi_struct *napi) { + unsigned long flags; + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return; - spin_lock(&napi_hash_lock); + spin_lock_irqsave(&napi_hash_lock, flags); /* 0..NR_CPUS range is reserved for sender_cpu use */ do { - if (unlikely(++napi_gen_id < MIN_NAPI_ID)) + if (unlikely(!napi_id_valid(++napi_gen_id))) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); - napi->napi_id = napi_gen_id; - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + __napi_hash_add_with_id(napi, napi_gen_id); - spin_unlock(&napi_hash_lock); + spin_unlock_irqrestore(&napi_hash_lock, flags); } /* Warning : caller is responsible to make sure rcu grace period @@ -6273,11 +7058,13 @@ static void napi_hash_add(struct napi_struct *napi) */ static void napi_hash_del(struct napi_struct *napi) { - spin_lock(&napi_hash_lock); + unsigned long flags; + + spin_lock_irqsave(&napi_hash_lock, flags); hlist_del_init_rcu(&napi->napi_hash_node); - spin_unlock(&napi_hash_lock); + spin_unlock_irqrestore(&napi_hash_lock, flags); } static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) @@ -6298,75 +7085,423 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void init_gro_hash(struct napi_struct *napi) +static void napi_stop_kthread(struct napi_struct *napi) { - int i; + unsigned long val, new; + + /* Wait until the napi STATE_THREADED is unset. */ + while (true) { + val = READ_ONCE(napi->state); - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - INIT_LIST_HEAD(&napi->gro_hash[i].list); - napi->gro_hash[i].count = 0; + /* If napi kthread own this napi or the napi is idle, + * STATE_THREADED can be unset here. + */ + if ((val & NAPIF_STATE_SCHED_THREADED) || + !(val & NAPIF_STATE_SCHED)) { + new = val & (~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL)); + } else { + msleep(20); + continue; + } + + if (try_cmpxchg(&napi->state, &val, new)) + break; + } + + /* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by + * the kthread. 
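Editor's note: napi_hash_add() above allocates IDs by incrementing napi_gen_id, wrapping back to MIN_NAPI_ID so the low 0..NR_CPUS range stays reserved for sender_cpu use, and probing the hash until a free ID turns up. A compact sketch of that generator, where id_in_use() stands in for the napi_by_id() hash lookup:

#include <stdbool.h>
#include <stdio.h>

#define MIN_ID 256U	/* low IDs reserved, mirroring MIN_NAPI_ID */

static unsigned int gen_id;

static bool id_in_use(unsigned int id)
{
	(void)id;	/* stand-in for a real hash-table lookup */
	return false;
}

/* Skip the reserved low range (and wraparound) and any live ID. */
static unsigned int alloc_id(void)
{
	do {
		if (++gen_id < MIN_ID)
			gen_id = MIN_ID;
	} while (id_in_use(gen_id));
	return gen_id;
}

int main(void)
{
	printf("%u %u\n", alloc_id(), alloc_id());	/* 256 257 */
	return 0;
}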
+ */ + while (true) { + if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) + break; + + msleep(20); } - napi->gro_bitmask = 0; + + kthread_stop(napi->thread); + napi->thread = NULL; } -int dev_set_threaded(struct net_device *dev, bool threaded) +static void napi_set_threaded_state(struct napi_struct *napi, + enum netdev_napi_threaded threaded_mode) +{ + bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED; + bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL; + + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll); +} + +int napi_set_threaded(struct napi_struct *napi, + enum netdev_napi_threaded threaded) +{ + if (threaded) { + if (!napi->thread) { + int err = napi_kthread_create(napi); + + if (err) + return err; + } + } + + if (napi->config) + napi->config->threaded = threaded; + + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. + */ + if (!threaded && napi->thread) { + napi_stop_kthread(napi); + } else { + /* Make sure kthread is created before THREADED bit is set. */ + smp_mb__before_atomic(); + napi_set_threaded_state(napi, threaded); + } + + return 0; +} + +int netif_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded) { struct napi_struct *napi; - int err = 0; + int i, err = 0; - if (dev->threaded == threaded) - return 0; + netdev_assert_locked_or_invisible(dev); if (threaded) { list_for_each_entry(napi, &dev->napi_list, dev_list) { if (!napi->thread) { err = napi_kthread_create(napi); if (err) { - threaded = false; + threaded = NETDEV_NAPI_THREADED_DISABLED; break; } } } } - dev->threaded = threaded; + WRITE_ONCE(dev->threaded, threaded); - /* Make sure kthread is created before THREADED bit - * is set. - */ - smp_mb__before_atomic(); + /* The error should not occur as the kthreads are already created. */ + list_for_each_entry(napi, &dev->napi_list, dev_list) + WARN_ON_ONCE(napi_set_threaded(napi, threaded)); - /* Setting/unsetting threaded mode on a napi might not immediately - * take effect, if the current napi instance is actively being - * polled. In this case, the switch between threaded mode and - * softirq mode will happen in the next round of napi_schedule(). - * This should not cause hiccups/stalls to the live traffic. + /* Override the config for all NAPIs even if currently not listed */ + for (i = 0; i < dev->num_napi_configs; i++) + dev->napi_config[i].threaded = threaded; + + return err; +} + +/** + * netif_threaded_enable() - enable threaded NAPIs + * @dev: net_device instance + * + * Enable threaded mode for the NAPI instances of the device. This may be useful + * for devices where multiple NAPI instances get scheduled by a single + * interrupt. Threaded NAPI allows moving the NAPI processing to cores other + * than the core where IRQ is mapped. + * + * This function should be called before @dev is registered. 
+ */ +void netif_threaded_enable(struct net_device *dev) +{ + WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED)); +} +EXPORT_SYMBOL(netif_threaded_enable); + +/** + * netif_queue_set_napi - Associate queue with the napi + * @dev: device to which NAPI and queue belong + * @queue_index: Index of queue + * @type: queue type as RX or TX + * @napi: NAPI context, pass NULL to clear previously set NAPI + * + * Set queue with its corresponding napi context. This should be done after + * registering the NAPI handler for the queue-vector and the queues have been + * mapped to the corresponding interrupt vector. + */ +void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, + enum netdev_queue_type type, struct napi_struct *napi) +{ + struct netdev_rx_queue *rxq; + struct netdev_queue *txq; + + if (WARN_ON_ONCE(napi && !napi->dev)) + return; + netdev_ops_assert_locked_or_invisible(dev); + + switch (type) { + case NETDEV_QUEUE_TYPE_RX: + rxq = __netif_get_rx_queue(dev, queue_index); + rxq->napi = napi; + return; + case NETDEV_QUEUE_TYPE_TX: + txq = netdev_get_tx_queue(dev, queue_index); + txq->napi = napi; + return; + default: + return; + } +} +EXPORT_SYMBOL(netif_queue_set_napi); + +static void +netif_napi_irq_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + struct napi_struct *napi = + container_of(notify, struct napi_struct, notify); +#ifdef CONFIG_RFS_ACCEL + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + int err; +#endif + + if (napi->config && napi->dev->irq_affinity_auto) + cpumask_copy(&napi->config->affinity_mask, mask); + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); + if (err) + netdev_warn(napi->dev, "RMAP update failed (%d)\n", + err); + } +#endif +} + +#ifdef CONFIG_RFS_ACCEL +static void netif_napi_affinity_release(struct kref *ref) +{ + struct napi_struct *napi = + container_of(ref, struct napi_struct, notify.kref); + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + + netdev_assert_locked(napi->dev); + WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, + &napi->state)); + + if (!napi->dev->rx_cpu_rmap_auto) + return; + rmap->obj[napi->napi_rmap_idx] = NULL; + napi->napi_rmap_idx = -1; + cpu_rmap_put(rmap); +} + +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + if (dev->rx_cpu_rmap_auto) + return 0; + + dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs); + if (!dev->rx_cpu_rmap) + return -ENOMEM; + + dev->rx_cpu_rmap_auto = true; + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); + +static void netif_del_cpu_rmap(struct net_device *dev) +{ + struct cpu_rmap *rmap = dev->rx_cpu_rmap; + + if (!dev->rx_cpu_rmap_auto) + return; + + /* Free the rmap */ + cpu_rmap_put(rmap); + dev->rx_cpu_rmap = NULL; + dev->rx_cpu_rmap_auto = false; +} + +#else +static void netif_napi_affinity_release(struct kref *ref) +{ +} + +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); + +static void netif_del_cpu_rmap(struct net_device *dev) +{ +} +#endif + +void netif_set_affinity_auto(struct net_device *dev) +{ + unsigned int i, maxqs, numa; + + maxqs = max(dev->num_tx_queues, dev->num_rx_queues); + numa = dev_to_node(&dev->dev); + + for (i = 0; i < maxqs; i++) + cpumask_set_cpu(cpumask_local_spread(i, numa), + &dev->napi_config[i].affinity_mask); + + dev->irq_affinity_auto = true; +} +EXPORT_SYMBOL(netif_set_affinity_auto); + +void netif_napi_set_irq_locked(struct 
napi_struct *napi, int irq) +{ + int rc; + + netdev_assert_locked_or_invisible(napi->dev); + + if (napi->irq == irq) + return; + + /* Remove existing resources */ + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + + napi->irq = irq; + if (irq < 0 || + (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto)) + return; + + /* Abort for buggy drivers */ + if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config)) + return; + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi); + if (rc < 0) + return; + + cpu_rmap_get(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = rc; + } +#endif + + /* Use core IRQ notifier */ + napi->notify.notify = netif_napi_irq_notify; + napi->notify.release = netif_napi_affinity_release; + rc = irq_set_affinity_notifier(irq, &napi->notify); + if (rc) { + netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", + rc); + goto put_rmap; + } + + set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); + return; + +put_rmap: +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL; + cpu_rmap_put(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = -1; + } +#endif + napi->notify.notify = NULL; + napi->notify.release = NULL; +} +EXPORT_SYMBOL(netif_napi_set_irq_locked); + +static void napi_restore_config(struct napi_struct *n) +{ + n->defer_hard_irqs = n->config->defer_hard_irqs; + n->gro_flush_timeout = n->config->gro_flush_timeout; + n->irq_suspend_timeout = n->config->irq_suspend_timeout; + + if (n->dev->irq_affinity_auto && + test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state)) + irq_set_affinity(n->irq, &n->config->affinity_mask); + + /* a NAPI ID might be stored in the config, if so use it. if not, use + * napi_hash_add to generate one for us. */ - list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (threaded) - set_bit(NAPI_STATE_THREADED, &napi->state); + if (n->config->napi_id) { + napi_hash_add_with_id(n, n->config->napi_id); + } else { + napi_hash_add(n); + n->config->napi_id = n->napi_id; + } + + WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded)); +} + +static void napi_save_config(struct napi_struct *n) +{ + n->config->defer_hard_irqs = n->defer_hard_irqs; + n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->irq_suspend_timeout = n->irq_suspend_timeout; + napi_hash_del(n); +} + +/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will + * inherit an existing ID try to insert it at the right position. + */ +static void +netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) +{ + unsigned int new_id, pos_id; + struct list_head *higher; + struct napi_struct *pos; + + new_id = UINT_MAX; + if (napi->config && napi->config->napi_id) + new_id = napi->config->napi_id; + + higher = &dev->napi_list; + list_for_each_entry(pos, &dev->napi_list, dev_list) { + if (napi_id_valid(pos->napi_id)) + pos_id = pos->napi_id; + else if (pos->config) + pos_id = pos->config->napi_id; else - clear_bit(NAPI_STATE_THREADED, &napi->state); + pos_id = UINT_MAX; + + if (pos_id <= new_id) + break; + higher = &pos->dev_list; } + list_add_rcu(&napi->dev_list, higher); /* adds after higher */ +} - return err; +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. 
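Editor's note: netif_napi_dev_list_add() above keeps dev->napi_list ordered by descending NAPI ID so netlink dumps come out sorted; a NAPI inheriting an ID from its config is spliced in at the right position rather than at the head. A plain singly-linked sketch of that descending insert, with hypothetical types:

#include <stdio.h>
#include <stdlib.h>

struct ent { struct ent *next; unsigned int id; };

/* Keep the list sorted in descending ID order: walk past larger IDs,
 * then splice the new entry in before the first smaller-or-equal one.
 */
static void insert_sorted(struct ent **head, struct ent *e)
{
	struct ent **pp = head;

	while (*pp && (*pp)->id > e->id)
		pp = &(*pp)->next;
	e->next = *pp;
	*pp = e;
}

int main(void)
{
	struct ent *head = NULL, *e;
	unsigned int ids[] = { 3, 7, 5 };
	int i;

	for (i = 0; i < 3; i++) {
		e = malloc(sizeof(*e));
		if (!e)
			return 1;
		e->id = ids[i];
		insert_sorted(&head, e);
	}
	for (e = head; e; e = e->next)
		printf("%u ", e->id);	/* prints: 7 5 3 */
	putchar('\n');
	return 0;
}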
+ * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +static void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); } -EXPORT_SYMBOL(dev_set_threaded); -void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) +void netif_napi_add_weight_locked(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) { + netdev_assert_locked(dev); if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; INIT_LIST_HEAD(&napi->poll_list); INIT_HLIST_NODE(&napi->napi_hash_node); - hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - napi->timer.function = napi_watchdog; - init_gro_hash(napi); + hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + gro_init(&napi->gro); napi->skb = NULL; - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, @@ -6376,25 +7511,36 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, #ifdef CONFIG_NETPOLL napi->poll_owner = -1; #endif + napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); - list_add_rcu(&napi->dev_list, &dev->napi_list); - napi_hash_add(napi); + netif_napi_dev_list_add(dev, napi); + + /* default settings from sysfs are applied to all NAPIs. any per-NAPI + * configuration will be loaded in napi_enable + */ + napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); + napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); + napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that * threaded mode will not be enabled in napi_enable(). */ - if (dev->threaded && napi_kthread_create(napi)) - dev->threaded = 0; + if (napi_get_threaded_config(dev, napi)) + if (napi_kthread_create(napi)) + dev->threaded = NETDEV_NAPI_THREADED_DISABLED; + netif_napi_set_irq_locked(napi, -1); } -EXPORT_SYMBOL(netif_napi_add_weight); +EXPORT_SYMBOL(netif_napi_add_weight_locked); -void napi_disable(struct napi_struct *n) +void napi_disable_locked(struct napi_struct *n) { unsigned long val, new; might_sleep(); + netdev_assert_locked(n->dev); + set_bit(NAPI_STATE_DISABLE, &n->state); val = READ_ONCE(n->state); @@ -6405,26 +7551,47 @@ void napi_disable(struct napi_struct *n) } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; - new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); + new &= ~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL | + NAPIF_STATE_PREFER_BUSY_POLL); } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); + if (n->config) + napi_save_config(n); + else + napi_hash_del(n); + clear_bit(NAPI_STATE_DISABLE, &n->state); } -EXPORT_SYMBOL(napi_disable); +EXPORT_SYMBOL(napi_disable_locked); /** - * napi_enable - enable NAPI scheduling - * @n: NAPI context + * napi_disable() - prevent NAPI from scheduling + * @n: NAPI context * - * Resume NAPI from being scheduled on this context. - * Must be paired with napi_disable. + * Stop NAPI from being scheduled on this context. 
+ * Waits till any outstanding processing completes. + * Takes netdev_lock() for associated net_device. */ -void napi_enable(struct napi_struct *n) +void napi_disable(struct napi_struct *n) +{ + netdev_lock(n->dev); + napi_disable_locked(n); + netdev_unlock(n->dev); +} +EXPORT_SYMBOL(napi_disable); + +void napi_enable_locked(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); + if (n->config) + napi_restore_config(n); + else + napi_hash_add(n); + do { BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); @@ -6433,40 +7600,54 @@ void napi_enable(struct napi_struct *n) new |= NAPIF_STATE_THREADED; } while (!try_cmpxchg(&n->state, &val, new)); } -EXPORT_SYMBOL(napi_enable); +EXPORT_SYMBOL(napi_enable_locked); -static void flush_gro_hash(struct napi_struct *napi) +/** + * napi_enable() - enable NAPI scheduling + * @n: NAPI context + * + * Enable scheduling of a NAPI instance. + * Must be paired with napi_disable(). + * Takes netdev_lock() for associated net_device. + */ +void napi_enable(struct napi_struct *n) { - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - struct sk_buff *skb, *n; - - list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) - kfree_skb(skb); - napi->gro_hash[i].count = 0; - } + netdev_lock(n->dev); + napi_enable_locked(n); + netdev_unlock(n->dev); } +EXPORT_SYMBOL(napi_enable); /* Must be called in process context */ -void __netif_napi_del(struct napi_struct *napi) +void __netif_napi_del_locked(struct napi_struct *napi) { + netdev_assert_locked(napi->dev); + if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; - napi_hash_del(napi); + /* Make sure NAPI is disabled (or was never enabled). */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + + if (napi->config) { + napi->index = -1; + napi->config = NULL; + } + list_del_rcu(&napi->dev_list); napi_free_frags(napi); - flush_gro_hash(napi); - napi->gro_bitmask = 0; + gro_cleanup(&napi->gro); if (napi->thread) { kthread_stop(napi->thread); napi->thread = NULL; } } -EXPORT_SYMBOL(__netif_napi_del); +EXPORT_SYMBOL(__netif_napi_del_locked); static int __napi_poll(struct napi_struct *n, bool *repoll) { @@ -6481,9 +7662,11 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) * accidentally calling ->poll() when NAPI is not scheduled. */ work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) { + if (napi_is_scheduled(n)) { work = n->poll(n, weight); trace_napi_poll(n, work, weight); + + xdp_do_check_flushed(n); } if (unlikely(work > weight)) @@ -6516,14 +7699,8 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) return work; } - if (n->gro_bitmask) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(n, HZ >= 1000); - } - - gro_normal_list(n); + /* Flush too old packets. If HZ < 1000, flush all packets */ + gro_flush_normal(&n->gro, HZ >= 1000); /* Some drivers may have called napi_schedule * prior to exhausting their budget. 
@@ -6551,9 +7728,14 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) work = __napi_poll(n, &do_repoll); - if (do_repoll) + if (do_repoll) { +#if defined(CONFIG_DEBUG_NET) + if (unlikely(!napi_is_scheduled(n))) + pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n", + n->dev->name, n->poll); +#endif list_add_tail(&n->poll_list, repoll); - + } netpoll_poll_unlock(have); return work; @@ -6561,8 +7743,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) static int napi_thread_wait(struct napi_struct *napi) { - bool woken = false; - set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { @@ -6571,15 +7751,13 @@ static int napi_thread_wait(struct napi_struct *napi) * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable(). */ - if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { + if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); - /* woken being true indicates this thread owns this napi. */ - woken = true; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); @@ -6587,63 +7765,93 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static int napi_threaded_poll(void *data) +static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) { - struct napi_struct *napi = data; - void *have; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + struct softnet_data *sd; + unsigned long last_qs = jiffies; - while (!napi_thread_wait(napi)) { - for (;;) { - bool repoll = false; + for (;;) { + bool repoll = false; + void *have; - local_bh_disable(); + local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); - have = netpoll_poll_lock(napi); - __napi_poll(napi, &repoll); - netpoll_poll_unlock(have); + sd = this_cpu_ptr(&softnet_data); + sd->in_napi_threaded_poll = true; - local_bh_enable(); + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); - if (!repoll) - break; + sd->in_napi_threaded_poll = false; + barrier(); + + if (sd_has_rps_ipi_waiting(sd)) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } + skb_defer_free_flush(); + bpf_net_ctx_clear(bpf_net_ctx); + + /* When busy poll is enabled, the old packets are not flushed in + * napi_complete_done. So flush them here. + */ + if (busy_poll) + gro_flush_normal(&napi->gro, HZ >= 1000); + local_bh_enable(); + /* Call cond_resched here to avoid watchdog warnings. 
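Editor's note: napi_thread_wait() no longer tracks a local `woken` flag; the kthread simply sleeps TASK_INTERRUPTIBLE until NAPI_STATE_SCHED_THREADED appears, since that bit alone now encodes thread ownership. Below is a loose pthread analogue of parking a worker until an ownership bit is set, using a condvar in place of the scheduler; it is an illustration of the shape, not of the kernel mechanism.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static unsigned int state;	/* bit 0: "scheduled for the thread" */

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!(state & 1))	/* sleep until we own the work */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	puts("worker: woken, polling");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	pthread_mutex_lock(&lock);
	state |= 1;		/* schedule: hand ownership to worker */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}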
*/ + if (repoll || busy_poll) { + rcu_softirq_qs_periodic(last_qs); cond_resched(); } + + if (!repoll) + break; } - return 0; } -static void skb_defer_free_flush(struct softnet_data *sd) +static int napi_threaded_poll(void *data) { - struct sk_buff *skb, *next; - unsigned long flags; + struct napi_struct *napi = data; + bool want_busy_poll; + bool in_busy_poll; + unsigned long val; - /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ - if (!READ_ONCE(sd->defer_list)) - return; + while (!napi_thread_wait(napi)) { + val = READ_ONCE(napi->state); - spin_lock_irqsave(&sd->defer_lock, flags); - skb = sd->defer_list; - sd->defer_list = NULL; - sd->defer_count = 0; - spin_unlock_irqrestore(&sd->defer_lock, flags); + want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL; + in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL; - while (skb != NULL) { - next = skb->next; - napi_consume_skb(skb, 1); - skb = next; + if (unlikely(val & NAPIF_STATE_DISABLE)) + want_busy_poll = false; + + if (want_busy_poll != in_busy_poll) + assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state, + want_busy_poll); + + napi_threaded_poll_loop(napi, want_busy_poll); } + + return 0; } -static __latent_entropy void net_rx_action(struct softirq_action *h) +static __latent_entropy void net_rx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + - usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); - int budget = READ_ONCE(netdev_budget); + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); +start: + sd->in_net_rx_action = true; local_irq_disable(); list_splice_init(&sd->poll_list, &list); local_irq_enable(); @@ -6651,11 +7859,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) for (;;) { struct napi_struct *n; - skb_defer_free_flush(sd); + skb_defer_free_flush(); if (list_empty(&list)) { - if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) - goto end; + if (list_empty(&repoll)) { + sd->in_net_rx_action = false; + barrier(); + /* We need to check if ____napi_schedule() + * had refilled poll_list while + * sd->in_net_rx_action was true. + */ + if (!list_empty(&sd->poll_list)) + goto start; + if (!sd_has_rps_ipi_waiting(sd)) + goto end; + } break; } @@ -6668,7 +7886,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { - sd->time_squeeze++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1); break; } } @@ -6680,9 +7899,12 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) list_splice(&list, &sd->poll_list); if (!list_empty(&sd->poll_list)) __raise_softirq_irqoff(NET_RX_SOFTIRQ); + else + sd->in_net_rx_action = false; net_rps_action_and_irq_enable(sd); -end:; +end: + bpf_net_ctx_clear(bpf_net_ctx); } struct netdev_adjacent { @@ -8298,30 +9520,31 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; + unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); - dev->flags |= IFF_PROMISC; - dev->promiscuity += inc; - if (dev->promiscuity == 0) { + promiscuity = dev->promiscuity + inc; + if (promiscuity == 0) { /* * Avoid overflow. 
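Editor's note: net_rx_action() spends at most netdev_budget packets and netdev_budget_usecs of wall time per invocation; hitting either limit bumps time_squeeze and defers the remainder to the next softirq round. A standalone sketch of the dual-limit loop follows; the numbers and the fixed per-round work are arbitrary.

#include <stdio.h>
#include <time.h>

static long now_us(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000L + ts.tv_nsec / 1000;
}

/* Poll in rounds until the packet budget or the time budget runs out;
 * anything left over waits for the next invocation.
 */
static void rx_action(int budget, long limit_us, int backlog)
{
	long deadline = now_us() + limit_us;

	while (backlog > 0) {
		int done = backlog < 4 ? backlog : 4;	/* one poll round */

		backlog -= done;
		budget -= done;
		if (budget <= 0 || now_us() >= deadline) {
			printf("squeezed, %d packets deferred\n", backlog);
			return;
		}
	}
	puts("backlog drained within budget");
}

int main(void)
{
	rx_action(300, 2000, 10);	/* drains */
	rx_action(8, 2000, 100);	/* budget squeeze */
	return 0;
}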
* If inc causes overflow, untouch promisc and return error. */ - if (inc < 0) - dev->flags &= ~IFF_PROMISC; - else { - dev->promiscuity -= inc; + if (unlikely(inc > 0)) { netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_PROMISC; + } else { + flags = old_flags | IFF_PROMISC; } - if (dev->flags != old_flags) { - pr_info("device %s %s promiscuous mode\n", - dev->name, - dev->flags & IFF_PROMISC ? "entered" : "left"); + WRITE_ONCE(dev->promiscuity, promiscuity); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); + netdev_info(dev, "%s promiscuous mode\n", + dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { current_uid_gid(&uid, &gid); audit_log(audit_context(), GFP_ATOMIC, @@ -8337,23 +9560,20 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) dev_change_rx_flags(dev, IFF_PROMISC); } - if (notify) + if (notify) { + /* The ops lock is only required to ensure consistent locking + * for `NETDEV_CHANGE` notifiers. This function is sometimes + * called without the lock, even for devices that are ops + * locked, such as in `dev_uc_sync_multiple` when using + * bonding or teaming. + */ + netdev_ops_assert_locked(dev); __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); + } return 0; } -/** - * dev_set_promiscuity - update promiscuity count on a device - * @dev: device - * @inc: modifier - * - * Add or remove promiscuity from a device. While the count in the device - * remains above zero the interface remains promiscuous. Once it hits zero - * the device reverts back to normal filtering operation. A negative inc - * value is used to drop promiscuity on the device. - * Return 0 if successful or a negative errno code on error. - */ -int dev_set_promiscuity(struct net_device *dev, int inc) +int netif_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; @@ -8365,30 +9585,33 @@ int dev_set_promiscuity(struct net_device *dev, int inc) dev_set_rx_mode(dev); return err; } -EXPORT_SYMBOL(dev_set_promiscuity); -static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) +int netif_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; + unsigned int allmulti, flags; ASSERT_RTNL(); - dev->flags |= IFF_ALLMULTI; - dev->allmulti += inc; - if (dev->allmulti == 0) { + allmulti = dev->allmulti + inc; + if (allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ - if (inc < 0) - dev->flags &= ~IFF_ALLMULTI; - else { - dev->allmulti -= inc; + if (unlikely(inc > 0)) { netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_ALLMULTI; + } else { + flags = old_flags | IFF_ALLMULTI; } - if (dev->flags ^ old_flags) { + WRITE_ONCE(dev->allmulti, allmulti); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); + netdev_info(dev, "%s allmulticast mode\n", + dev->flags & IFF_ALLMULTI ? "entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); if (notify) @@ -8398,25 +9621,6 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) return 0; } -/** - * dev_set_allmulti - update allmulti count on a device - * @dev: device - * @inc: modifier - * - * Add or remove reception of all multicast frames to a device. 
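Editor's note: the rewritten __dev_set_promiscuity() computes the new counter up front: `inc` may be negative (a release), and a result of zero is only an error if it was reached with inc > 0, i.e. the unsigned sum wrapped. The flags word is then derived from the counter and published with WRITE_ONCE(). A small sketch of that wraparound test (the print is a simplification; the kernel only logs on an actual flag change):

#include <stdio.h>

static unsigned int promiscuity;

/* inc may be negative. A sum of zero is only an overflow if it was
 * reached by wrapping the unsigned counter upwards (inc > 0).
 */
static int set_promiscuity(int inc)
{
	unsigned int new = promiscuity + (unsigned int)inc;

	if (new == 0 && inc > 0) {
		fprintf(stderr, "counter wrapped, refusing\n");
		return -1;
	}
	promiscuity = new;
	printf("promiscuous mode %s\n", new ? "entered" : "left");
	return 0;
}

int main(void)
{
	set_promiscuity(1);	/* entered */
	set_promiscuity(-1);	/* left */
	return 0;
}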
While the - * count in the device remains above zero the interface remains listening - * to all interfaces. Once it hits zero the device reverts back to normal - * filtering operation. A negative @inc value is used to drop the counter - * when releasing a resource needing all multicasts. - * Return 0 if successful or a negative errno code on error. - */ - -int dev_set_allmulti(struct net_device *dev, int inc) -{ - return __dev_set_allmulti(dev, inc, true); -} -EXPORT_SYMBOL(dev_set_allmulti); - /* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast @@ -8459,21 +9663,21 @@ void dev_set_rx_mode(struct net_device *dev) } /** - * dev_get_flags - get flags reported to userspace - * @dev: device + * netif_get_flags() - get flags reported to userspace + * @dev: device * - * Get the combination of flag bits exported through APIs to userspace. + * Get the combination of flag bits exported through APIs to userspace. */ -unsigned int dev_get_flags(const struct net_device *dev) +unsigned int netif_get_flags(const struct net_device *dev) { unsigned int flags; - flags = (dev->flags & ~(IFF_PROMISC | + flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | - (dev->gflags & (IFF_PROMISC | + (READ_ONCE(dev->gflags) & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { @@ -8487,7 +9691,7 @@ unsigned int dev_get_flags(const struct net_device *dev) return flags; } -EXPORT_SYMBOL(dev_get_flags); +EXPORT_SYMBOL(netif_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) @@ -8532,7 +9736,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; - unsigned int old_flags = dev->flags; + old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; @@ -8549,7 +9753,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; - __dev_set_allmulti(dev, inc, false); + netif_set_allmulti(dev, inc, false); } return ret; @@ -8584,17 +9788,8 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, } } -/** - * dev_change_flags - change device settings - * @dev: device - * @flags: device state flags - * @extack: netlink extended ack - * - * Change settings on device based state flags. The flags are - * in the userspace exported format. 
- */ -int dev_change_flags(struct net_device *dev, unsigned int flags, - struct netlink_ext_ack *extack) +int netif_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; @@ -8607,9 +9802,8 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } -EXPORT_SYMBOL(dev_change_flags); -int __dev_set_mtu(struct net_device *dev, int new_mtu) +int __netif_set_mtu(struct net_device *dev, int new_mtu) { const struct net_device_ops *ops = dev->netdev_ops; @@ -8620,7 +9814,7 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu) WRITE_ONCE(dev->mtu, new_mtu); return 0; } -EXPORT_SYMBOL(__dev_set_mtu); +EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL"); int dev_validate_mtu(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) @@ -8639,18 +9833,22 @@ int dev_validate_mtu(struct net_device *dev, int new_mtu, } /** - * dev_set_mtu_ext - Change maximum transfer unit - * @dev: device - * @new_mtu: new transfer unit - * @extack: netlink extended ack + * netif_set_mtu_ext() - Change maximum transfer unit + * @dev: device + * @new_mtu: new transfer unit + * @extack: netlink extended ack + * + * Change the maximum transfer size of the network device. * - * Change the maximum transfer size of the network device. + * Return: 0 on success, -errno on failure. */ -int dev_set_mtu_ext(struct net_device *dev, int new_mtu, - struct netlink_ext_ack *extack) +int netif_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) { int err, orig_mtu; + netdev_ops_assert_locked(dev); + if (new_mtu == dev->mtu) return 0; @@ -8667,7 +9865,7 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, return err; orig_mtu = dev->mtu; - err = __dev_set_mtu(dev, new_mtu); + err = __netif_set_mtu(dev, new_mtu); if (!err) { err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, @@ -8677,7 +9875,7 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes. 
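Editor's note: netif_set_mtu_ext() keeps the rollback dance visible in the hunk below: write the new MTU, run the NETDEV_CHANGEMTU notifier chain, and on a veto restore the original value and notify again so listeners can unwind their state. A sketch of notify-with-rollback, where notify_mtu_change() is a stand-in for the notifier chain:

#include <stdio.h>

static int mtu = 1500;

/* Stand-in for the notifier chain: pretend some consumer
 * vetoes anything above 9000.
 */
static int notify_mtu_change(int new_mtu)
{
	return new_mtu > 9000 ? -1 : 0;
}

static int set_mtu(int new_mtu)
{
	int orig = mtu;

	mtu = new_mtu;
	if (notify_mtu_change(new_mtu)) {
		/* roll back and re-notify so listeners can undo */
		mtu = orig;
		notify_mtu_change(orig);
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("set 9216: %d, mtu=%d\n", set_mtu(9216), mtu);
	printf("set 9000: %d, mtu=%d\n", set_mtu(9000), mtu);
	return 0;
}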
*/ - __dev_set_mtu(dev, orig_mtu); + __netif_set_mtu(dev, orig_mtu); call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu); } @@ -8685,25 +9883,20 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, return err; } -int dev_set_mtu(struct net_device *dev, int new_mtu) +int netif_set_mtu(struct net_device *dev, int new_mtu) { struct netlink_ext_ack extack; int err; memset(&extack, 0, sizeof(extack)); - err = dev_set_mtu_ext(dev, new_mtu, &extack); + err = netif_set_mtu_ext(dev, new_mtu, &extack); if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } -EXPORT_SYMBOL(dev_set_mtu); +EXPORT_SYMBOL(netif_set_mtu); -/** - * dev_change_tx_queue_len - Change TX queue length of a netdevice - * @dev: device - * @new_len: new tx queue length - */ -int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len) { unsigned int orig_len = dev->tx_queue_len; int res; @@ -8712,7 +9905,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) return -ERANGE; if (new_len != orig_len) { - dev->tx_queue_len = new_len; + WRITE_ONCE(dev->tx_queue_len, new_len); res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); res = notifier_to_errno(res); if (res) @@ -8726,28 +9919,25 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) err_rollback: netdev_err(dev, "refused to change device tx_queue_len\n"); - dev->tx_queue_len = orig_len; + WRITE_ONCE(dev->tx_queue_len, orig_len); return res; } -/** - * dev_set_group - Change group this device belongs to - * @dev: device - * @new_group: group this device should belong to - */ -void dev_set_group(struct net_device *dev, int new_group) +void netif_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } /** - * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. - * @dev: device - * @addr: new address - * @extack: netlink extended ack + * netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR. + * @dev: device + * @addr: new address + * @extack: netlink extended ack + * + * Return: 0 on success, -errno on failure. 
*/ -int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, - struct netlink_ext_ack *extack) +int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr, + struct netlink_ext_ack *extack) { struct netdev_notifier_pre_changeaddr_info info = { .info.dev = dev, @@ -8759,58 +9949,40 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); return notifier_to_errno(rc); } -EXPORT_SYMBOL(dev_pre_changeaddr_notify); +EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL"); -/** - * dev_set_mac_address - Change Media Access Control Address - * @dev: device - * @sa: new address - * @extack: netlink extended ack - * - * Change the hardware (MAC) address of the device - */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, - struct netlink_ext_ack *extack) +int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; - if (sa->sa_family != dev->type) + if (ss->ss_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); - if (err) - return err; - err = ops->ndo_set_mac_address(dev, sa); + err = netif_pre_changeaddr_notify(dev, ss->__data, extack); if (err) return err; + if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) { + err = ops->ndo_set_mac_address(dev, ss); + if (err) + return err; + } dev->addr_assign_type = NET_ADDR_SET; call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } -EXPORT_SYMBOL(dev_set_mac_address); -static DECLARE_RWSEM(dev_addr_sem); +DECLARE_RWSEM(dev_addr_sem); -int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, - struct netlink_ext_ack *extack) +/* "sa" is a true struct sockaddr with limited "sa_data" member. 
*/ +int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { - int ret; - - down_write(&dev_addr_sem); - ret = dev_set_mac_address(dev, sa, extack); - up_write(&dev_addr_sem); - return ret; -} -EXPORT_SYMBOL(dev_set_mac_address_user); - -int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) -{ - size_t size = sizeof(sa->sa_data_min); + size_t size = sizeof(sa->sa_data); struct net_device *dev; int ret = 0; @@ -8834,16 +10006,9 @@ unlock: up_read(&dev_addr_sem); return ret; } -EXPORT_SYMBOL(dev_get_mac_address); +EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL"); -/** - * dev_change_carrier - Change device carrier - * @dev: device - * @new_carrier: new value - * - * Change device carrier - */ -int dev_change_carrier(struct net_device *dev, bool new_carrier) +int netif_change_carrier(struct net_device *dev, bool new_carrier) { const struct net_device_ops *ops = dev->netdev_ops; @@ -8894,16 +10059,17 @@ int dev_get_phys_port_name(struct net_device *dev, } /** - * dev_get_port_parent_id - Get the device's port parent identifier - * @dev: network device - * @ppid: pointer to a storage for the port's parent identifier - * @recurse: allow/disallow recursion to lower devices + * netif_get_port_parent_id() - Get the device's port parent identifier + * @dev: network device + * @ppid: pointer to a storage for the port's parent identifier + * @recurse: allow/disallow recursion to lower devices + * + * Get the devices's port parent identifier. * - * Get the devices's port parent identifier + * Return: 0 on success, -errno on failure. */ -int dev_get_port_parent_id(struct net_device *dev, - struct netdev_phys_item_id *ppid, - bool recurse) +int netif_get_port_parent_id(struct net_device *dev, + struct netdev_phys_item_id *ppid, bool recurse) { const struct net_device_ops *ops = dev->netdev_ops; struct netdev_phys_item_id first = { }; @@ -8922,7 +10088,7 @@ int dev_get_port_parent_id(struct net_device *dev, return err; netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = dev_get_port_parent_id(lower_dev, ppid, true); + err = netif_get_port_parent_id(lower_dev, ppid, true); if (err) break; if (!first.id_len) @@ -8933,7 +10099,7 @@ int dev_get_port_parent_id(struct net_device *dev, return err; } -EXPORT_SYMBOL(dev_get_port_parent_id); +EXPORT_SYMBOL(netif_get_port_parent_id); /** * netdev_port_same_parent_id - Indicate if two network devices have @@ -8946,23 +10112,17 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) struct netdev_phys_item_id a_id = { }; struct netdev_phys_item_id b_id = { }; - if (dev_get_port_parent_id(a, &a_id, true) || - dev_get_port_parent_id(b, &b_id, true)) + if (netif_get_port_parent_id(a, &a_id, true) || + netif_get_port_parent_id(b, &b_id, true)) return false; return netdev_phys_item_id_same(&a_id, &b_id); } EXPORT_SYMBOL(netdev_port_same_parent_id); -/** - * dev_change_proto_down - set carrier according to proto_down. 
- * - * @dev: device - * @proto_down: new value - */ -int dev_change_proto_down(struct net_device *dev, bool proto_down) +int netif_change_proto_down(struct net_device *dev, bool proto_down) { - if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) + if (!dev->change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; @@ -8970,32 +10130,35 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) netif_carrier_off(dev); else netif_carrier_on(dev); - dev->proto_down = proto_down; + WRITE_ONCE(dev->proto_down, proto_down); return 0; } /** - * dev_change_proto_down_reason - proto down reason + * netdev_change_proto_down_reason_locked - proto down reason * * @dev: device * @mask: proto down mask * @value: proto down value */ -void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, - u32 value) +void netdev_change_proto_down_reason_locked(struct net_device *dev, + unsigned long mask, u32 value) { + u32 proto_down_reason; int b; if (!mask) { - dev->proto_down_reason = value; + proto_down_reason = value; } else { + proto_down_reason = dev->proto_down_reason; for_each_set_bit(b, &mask, 32) { if (value & (1 << b)) - dev->proto_down_reason |= BIT(b); + proto_down_reason |= BIT(b); else - dev->proto_down_reason &= ~BIT(b); + proto_down_reason &= ~BIT(b); } } + WRITE_ONCE(dev->proto_down_reason, proto_down_reason); } struct bpf_xdp_link { @@ -9056,6 +10219,40 @@ u8 dev_xdp_prog_count(struct net_device *dev) } EXPORT_SYMBOL_GPL(dev_xdp_prog_count); +u8 dev_xdp_sb_prog_count(struct net_device *dev) +{ + u8 count = 0; + int i; + + for (i = 0; i < __MAX_XDP_MODE; i++) + if (dev->xdp_state[i].prog && + !dev->xdp_state[i].prog->aux->xdp_has_frags) + count++; + return count; +} + +int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +{ + if (!dev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + bpf->command == XDP_SETUP_PROG && + bpf->prog && !bpf->prog->aux->xdp_has_frags) { + NL_SET_ERR_MSG(bpf->extack, + "unable to propagate XDP to device using tcp-data-split"); + return -EBUSY; + } + + if (dev_get_min_mp_channel_count(dev)) { + NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); + return -EBUSY; + } + + return dev->netdev_ops->ndo_bpf(dev, bpf); +} +EXPORT_SYMBOL_GPL(netif_xdp_propagate); + u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); @@ -9084,6 +10281,19 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, struct netdev_bpf xdp; int err; + netdev_ops_assert_locked(dev); + + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + prog && !prog->aux->xdp_has_frags) { + NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); + return -EBUSY; + } + + if (dev_get_min_mp_channel_count(dev)) { + NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); + return -EBUSY; + } + memset(&xdp, 0, sizeof(xdp)); xdp.command = mode == XDP_MODE_HW ? 
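/* The two -EBUSY checks above encode one invariant: an XDP program
 * built without frags (multi-buffer) support is refused while
 * tcp-data-split is enabled or while any RX queue has a memory
 * provider bound, since either can hand it non-linear buffers.
 */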
XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; @@ -9224,8 +10434,16 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } - if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { - NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); + if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { + NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); + return -EINVAL; + } + if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { + NL_SET_ERR_MSG(extack, "Program bound to different device"); + return -EINVAL; + } + if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { + NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { @@ -9297,7 +10515,9 @@ static void bpf_xdp_link_release(struct bpf_link *link) * already NULL, in which case link was already auto-detached */ if (xdp_link->dev) { + netdev_lock_ops(xdp_link->dev); WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + netdev_unlock_ops(xdp_link->dev); xdp_link->dev = NULL; } @@ -9379,10 +10599,12 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, goto out_unlock; } + netdev_lock_ops(xdp_link->dev); mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, xdp_link->flags, new_prog); + netdev_unlock_ops(xdp_link->dev); if (err) goto out_unlock; @@ -9407,6 +10629,7 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; + struct netlink_ext_ack extack = {}; struct bpf_xdp_link *link; struct net_device *dev; int err, fd; @@ -9424,7 +10647,8 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } - bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog, + attr->link_create.attach_type); link->dev = dev; link->flags = attr->link_create.flags; @@ -9434,12 +10658,15 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } - err = dev_xdp_attach_link(dev, NULL, link); + netdev_lock_ops(dev); + err = dev_xdp_attach_link(dev, &extack, link); + netdev_unlock_ops(dev); rtnl_unlock(); if (err) { link->dev = NULL; bpf_link_cleanup(&link_primer); + trace_bpf_xdp_link_attach_failed(extack._msg); goto out_put_dev; } @@ -9502,34 +10729,74 @@ err_out: return err; } +u32 dev_get_min_mp_channel_count(const struct net_device *dev) +{ + int i; + + netdev_ops_assert_locked(dev); + + for (i = dev->real_num_rx_queues - 1; i >= 0; i--) + if (dev->_rx[i].mp_params.mp_priv) + /* The channel count is the idx plus 1. */ + return i + 1; + + return 0; +} + /** - * dev_new_index - allocate an ifindex - * @net: the applicable net namespace + * dev_index_reserve() - allocate an ifindex in a namespace + * @net: the applicable net namespace + * @ifindex: requested ifindex, pass %0 to get one allocated + * + * Allocate a ifindex for a new device. Caller must either use the ifindex + * to store the device (via list_netdevice()) or call dev_index_release() + * to give the index up. 
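+ *
+ * Typical usage (sketch): dev_index_reserve(net, 0) hands back the next
+ * free index from xa_alloc_cyclic() and parks a NULL entry in
+ * net->dev_by_index until list_netdevice() stores the real pointer.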
* - * Returns a suitable unique value for a new device interface - * number. The caller must hold the rtnl semaphore or the - * dev_base_lock to be sure it remains unique. + * Return: a suitable unique value for a new device interface number or -errno. */ -static int dev_new_index(struct net *net) +static int dev_index_reserve(struct net *net, u32 ifindex) { - int ifindex = net->ifindex; + int err; - for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(net, ifindex)) - return net->ifindex = ifindex; + if (ifindex > INT_MAX) { + DEBUG_NET_WARN_ON_ONCE(1); + return -EINVAL; } + + if (!ifindex) + err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, + xa_limit_31b, &net->ifindex, GFP_KERNEL); + else + err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); + if (err < 0) + return err; + + return ifindex; +} + +static void dev_index_release(struct net *net, int ifindex) +{ + /* Expect only unused indexes, unlist_netdevice() removes the used */ + WARN_ON(xa_erase(&net->dev_by_index, ifindex)); +} + +static bool from_cleanup_net(void) +{ +#ifdef CONFIG_NET_NS + return current == READ_ONCE(cleanup_net_task); +#else + return false; +#endif } /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); +atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); - atomic_inc(&dev_net(dev)->dev_unreg_count); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -9564,6 +10831,7 @@ static void netdev_sync_lower_features(struct net_device *upper, if (!(features & feature) && (lower->features & feature)) { netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", &feature, lower->name); + netdev_lock_ops(lower); lower->wanted_features &= ~feature; __netdev_update_features(lower); @@ -9572,10 +10840,20 @@ static void netdev_sync_lower_features(struct net_device *upper, &feature, lower->name); else netdev_features_change(lower); + netdev_unlock_ops(lower); } } } +static bool netdev_has_ip_or_hw_csum(netdev_features_t features) +{ + netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + bool ip_csum = (features & ip_csum_mask) == ip_csum_mask; + bool hw_csum = features & NETIF_F_HW_CSUM; + + return ip_csum || hw_csum; +} + static netdev_features_t netdev_fix_features(struct net_device *dev, netdev_features_t features) { @@ -9657,15 +10935,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_LRO; } - if (features & NETIF_F_HW_TLS_TX) { - bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == - (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); - bool hw_csum = features & NETIF_F_HW_CSUM; - - if (!ip_csum && !hw_csum) { - netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); - features &= ~NETIF_F_HW_TLS_TX; - } + if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; } if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { @@ -9673,6 +10945,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_HW_TLS_RX; } + if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) { + netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n"); + features &= ~NETIF_F_GSO_UDP_L4; + } + return features; } @@ -9684,6 
+10961,7 @@ int __netdev_update_features(struct net_device *dev) int err = -1; ASSERT_RTNL(); + netdev_ops_assert_locked(dev); features = netdev_get_wanted_features(dev); @@ -9736,12 +11014,14 @@ sync_lower: * *before* calling udp_tunnel_get_rx_info, * but *after* calling udp_tunnel_drop_rx_info. */ + udp_tunnel_nic_lock(dev); if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { dev->features = features; udp_tunnel_get_rx_info(dev); } else { udp_tunnel_drop_rx_info(dev); } + udp_tunnel_nic_unlock(dev); } if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { @@ -9929,6 +11209,65 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); +static int netdev_do_alloc_pcpu_stats(struct net_device *dev) +{ + void __percpu *v; + + /* Drivers implementing ndo_get_peer_dev must support tstat + * accounting, so that skb_do_redirect() can bump the dev's + * RX stats upon network namespace switch. + */ + if (dev->netdev_ops->ndo_get_peer_dev && + dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) + return -EOPNOTSUPP; + + switch (dev->pcpu_stat_type) { + case NETDEV_PCPU_STAT_NONE: + return 0; + case NETDEV_PCPU_STAT_LSTATS: + v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); + break; + case NETDEV_PCPU_STAT_TSTATS: + v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + break; + case NETDEV_PCPU_STAT_DSTATS: + v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); + break; + default: + return -EINVAL; + } + + return v ? 0 : -ENOMEM; +} + +static void netdev_do_free_pcpu_stats(struct net_device *dev) +{ + switch (dev->pcpu_stat_type) { + case NETDEV_PCPU_STAT_NONE: + return; + case NETDEV_PCPU_STAT_LSTATS: + free_percpu(dev->lstats); + break; + case NETDEV_PCPU_STAT_TSTATS: + free_percpu(dev->tstats); + break; + case NETDEV_PCPU_STAT_DSTATS: + free_percpu(dev->dstats); + break; + } +} + +static void netdev_free_phy_link_topology(struct net_device *dev) +{ + struct phy_link_topology *topo = dev->link_topo; + + if (IS_ENABLED(CONFIG_PHYLIB) && topo) { + xa_destroy(&topo->phys); + kfree(topo); + dev->link_topo = NULL; + } +} + /** * register_netdevice() - register a network device * @dev: device to register @@ -9958,6 +11297,10 @@ int register_netdevice(struct net_device *dev) if (ret) return ret; + /* rss ctx ID 0 is reserved for the default context, start from 1 */ + xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1); + mutex_init(&dev->ethtool->rss_lock); + spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); @@ -9989,12 +11332,15 @@ int register_netdevice(struct net_device *dev) goto err_uninit; } - ret = -EBUSY; - if (!dev->ifindex) - dev->ifindex = dev_new_index(net); - else if (__dev_get_by_index(net, dev->ifindex)) + ret = netdev_do_alloc_pcpu_stats(dev); + if (ret) goto err_uninit; + ret = dev_index_reserve(net, dev->ifindex); + if (ret < 0) + goto err_free_pcpu; + dev->ifindex = ret; + /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ @@ -10040,16 +11386,20 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) - goto err_uninit; + goto err_ifindex_release; ret = netdev_register_kobject(dev); - write_lock(&dev_base_lock); - dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; - write_unlock(&dev_base_lock); + + netdev_lock(dev); + WRITE_ONCE(dev->reg_state, ret ? 
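/* reg_state has lockless READ_ONCE() readers, so each transition is
 * published with WRITE_ONCE() while holding the instance lock taken by
 * netdev_lock() just above.
 */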
NETREG_UNREGISTERED : NETREG_REGISTERED); + netdev_unlock(dev); + if (ret) goto err_uninit_notify; + netdev_lock_ops(dev); __netdev_update_features(dev); + netdev_unlock_ops(dev); /* * Default initial state at registry is that the @@ -10075,7 +11425,9 @@ int register_netdevice(struct net_device *dev) memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ + netdev_lock_ops(dev); ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); + netdev_unlock_ops(dev); ret = notifier_to_errno(ret); if (ret) { /* Expect explicit free_netdev() on failure */ @@ -10087,8 +11439,7 @@ int register_netdevice(struct net_device *dev) * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: @@ -10096,6 +11447,10 @@ out: err_uninit_notify: call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); +err_ifindex_release: + dev_index_release(net, dev->ifindex); +err_free_pcpu: + netdev_do_free_pcpu_stats(dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -10107,49 +11462,26 @@ err_free_name: } EXPORT_SYMBOL(register_netdevice); -/** - * init_dummy_netdev - init a dummy network device for NAPI - * @dev: device to init - * - * This takes a network device structure and initialize the minimum - * amount of fields so it can be used to schedule NAPI polls without - * registering a full blown interface. This is to be used by drivers - * that need to tie several hardware interfaces to a single NAPI - * poll scheduler due to HW limitations. +/* Initialize the core of a dummy net device. + * The setup steps dummy netdevs need which normal netdevs get by going + * through register_netdevice(). */ -int init_dummy_netdev(struct net_device *dev) +static void init_dummy_netdev(struct net_device *dev) { - /* Clear everything. Note we don't initialize spinlocks - * are they aren't supposed to be taken by any of the - * NAPI code and this dummy netdev is supposed to be - * only ever used for NAPI polls - */ - memset(dev, 0, sizeof(struct net_device)); - /* make sure we BUG if trying to hit standard * register/unregister code path */ dev->reg_state = NETREG_DUMMY; - /* NAPI wants this */ - INIT_LIST_HEAD(&dev->napi_list); - /* a dummy interface is started by default */ set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); - /* napi_busy_loop stats accounting wants this */ - dev_net_set(dev, &init_net); - /* Note : We dont allocate pcpu_refcnt for dummy devices, * because users of this 'device' dont need to change * its refcount. 
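 * (Dummy netdevs are now obtained via alloc_netdev_dummy(), added
 * further below, instead of callers embedding and zeroing a struct
 * net_device themselves.)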
*/ - - return 0; } -EXPORT_SYMBOL_GPL(init_dummy_netdev); - /** * register_netdev - register a network device @@ -10166,12 +11498,16 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev); */ int register_netdev(struct net_device *dev) { + struct net *net = dev_net(dev); int err; - if (rtnl_lock_killable()) + if (rtnl_net_lock_killable(net)) return -EINTR; + err = register_netdevice(dev); - rtnl_unlock(); + + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(register_netdev); @@ -10248,8 +11584,9 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list) rebroadcast_time = jiffies; } + rcu_barrier(); + if (!wait) { - rcu_barrier(); wait = WAIT_REFS_MIN_MSECS; } else { msleep(wait); @@ -10301,15 +11638,15 @@ void netdev_run_todo(void) { struct net_device *dev, *tmp; struct list_head list; + int cnt; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; list_replace_init(&net_unlink_list, &unlink_list); while (!list_empty(&unlink_list)) { - struct net_device *dev = list_first_entry(&unlink_list, - struct net_device, - unlink_list); + dev = list_first_entry(&unlink_list, struct net_device, + unlink_list); list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } @@ -10331,12 +11668,13 @@ void netdev_run_todo(void) continue; } - write_lock(&dev_base_lock); - dev->reg_state = NETREG_UNREGISTERED; - write_unlock(&dev_base_lock); - linkwatch_forget_dev(dev); + netdev_lock(dev); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); + netdev_unlock(dev); + linkwatch_sync_dev(dev); } + cnt = 0; while (!list_empty(&list)) { dev = netdev_wait_allrefs_any(&list); list_del(&dev->todo_list); @@ -10348,17 +11686,67 @@ void netdev_run_todo(void) WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); + netdev_do_free_pcpu_stats(dev); if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) free_netdev(dev); - if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) - wake_up(&netdev_unregistering_wq); + cnt++; /* Free network device */ kobject_put(&dev->dev.kobj); } + if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count)) + wake_up(&netdev_unregistering_wq); +} + +/* Collate per-cpu network dstats statistics + * + * Read per-cpu network statistics from dev->dstats and populate the related + * fields in @s. + */ +static void dev_fetch_dstats(struct rtnl_link_stats64 *s, + const struct pcpu_dstats __percpu *dstats) +{ + int cpu; + + for_each_possible_cpu(cpu) { + u64 rx_packets, rx_bytes, rx_drops; + u64 tx_packets, tx_bytes, tx_drops; + const struct pcpu_dstats *stats; + unsigned int start; + + stats = per_cpu_ptr(dstats, cpu); + do { + start = u64_stats_fetch_begin(&stats->syncp); + rx_packets = u64_stats_read(&stats->rx_packets); + rx_bytes = u64_stats_read(&stats->rx_bytes); + rx_drops = u64_stats_read(&stats->rx_drops); + tx_packets = u64_stats_read(&stats->tx_packets); + tx_bytes = u64_stats_read(&stats->tx_bytes); + tx_drops = u64_stats_read(&stats->tx_drops); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + s->rx_packets += rx_packets; + s->rx_bytes += rx_bytes; + s->rx_dropped += rx_drops; + s->tx_packets += tx_packets; + s->tx_bytes += tx_bytes; + s->tx_dropped += tx_drops; + } +} + +/* ndo_get_stats64 implementation for dtstats-based accounting. + * + * Populate @s from dev->stats and dev->dstats. This is used internally by the + * core for NETDEV_PCPU_STAT_DSTAT-type stats collection. 
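+ *
+ * The matching writer side in a driver fast path looks roughly like
+ * (sketch, assuming a NETDEV_PCPU_STAT_DSTATS device):
+ *
+ *	struct pcpu_dstats *ds = this_cpu_ptr(dev->dstats);
+ *
+ *	u64_stats_update_begin(&ds->syncp);
+ *	u64_stats_inc(&ds->rx_packets);
+ *	u64_stats_add(&ds->rx_bytes, skb->len);
+ *	u64_stats_update_end(&ds->syncp);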
+ */ +static void dev_get_dstats64(const struct net_device *dev, + struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_dstats(s, dev->dstats); } /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has @@ -10375,14 +11763,15 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) - dst[i] = atomic_long_read(&src[i]); + dst[i] = (unsigned long)atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); } EXPORT_SYMBOL(netdev_stats_to_stats64); -struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev) +static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc( + struct net_device *dev) { struct net_device_core_stats __percpu *p; @@ -10395,7 +11784,23 @@ struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device /* This READ_ONCE() pairs with the cmpxchg() above */ return READ_ONCE(dev->core_stats); } -EXPORT_SYMBOL(netdev_core_stats_alloc); + +noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset) +{ + /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ + struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); + unsigned long __percpu *field; + + if (unlikely(!p)) { + p = netdev_core_stats_alloc(dev); + if (!p) + return; + } + + field = (unsigned long __percpu *)((void __percpu *)p + offset); + this_cpu_inc(*field); +} +EXPORT_SYMBOL_GPL(netdev_core_stats_inc); /** * dev_get_stats - get network device statistics @@ -10413,11 +11818,29 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; const struct net_device_core_stats __percpu *p; + /* + * IPv{4,6} and udp tunnels share common stat helpers and use + * different stat type (NETDEV_PCPU_STAT_TSTATS vs + * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent. 
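+ * Concretely: rx/tx bytes and packets must sit at identical offsets in
+ * pcpu_sw_netstats and pcpu_dstats so a shared helper can poke either
+ * layout; the BUILD_BUG_ON()s below turn any drift into a build error.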
+ */ + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) != + offsetof(struct pcpu_dstats, rx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) != + offsetof(struct pcpu_dstats, rx_packets)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) != + offsetof(struct pcpu_dstats, tx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) != + offsetof(struct pcpu_dstats, tx_packets)); + if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); + } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { + dev_get_tstats64(dev, storage); + } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) { + dev_get_dstats64(dev, storage); } else { netdev_stats_to_stats64(storage, &dev->stats); } @@ -10501,7 +11924,7 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) return NULL; netdev_init_one_queue(dev, queue, NULL); RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); - queue->qdisc_sleeping = &noop_qdisc; + RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc); rcu_assign_pointer(dev->ingress_queue, queue); #endif return queue; @@ -10528,18 +11951,13 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) { WARN_ON(dev->reg_state == NETREG_REGISTERED); - dev->gro_flush_timeout = 20000; - dev->napi_defer_hard_irqs = 1; + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + netdev_set_gro_flush_timeout(dev, 20000); + netdev_set_defer_hard_irqs(dev, 1); + } } EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); -void netdev_freemem(struct net_device *dev) -{ - char *addr = (char *)dev - dev->padded; - - kvfree(addr); -} - /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -10559,8 +11977,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned int txqs, unsigned int rxqs) { struct net_device *dev; - unsigned int alloc_size; - struct net_device *p; + size_t napi_config_sz; + unsigned int maxqs; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -10574,23 +11992,16 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } - alloc_size = sizeof(struct net_device); - if (sizeof_priv) { - /* ensure 32-byte alignment of private area */ - alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); - alloc_size += sizeof_priv; - } - /* ensure 32-byte alignment of whole construct */ - alloc_size += NETDEV_ALIGN - 1; + maxqs = max(txqs, rxqs); - p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); - if (!p) + dev = kvzalloc(struct_size(dev, priv, sizeof_priv), + GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); + if (!dev) return NULL; - dev = PTR_ALIGN(p, NETDEV_ALIGN); - dev->padded = (char *)dev - (char *)p; + dev->priv_len = sizeof_priv; - ref_tracker_dir_init(&dev->refcnt_tracker, 128); + ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev"); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) @@ -10609,8 +12020,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; + dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; + dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; + dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; @@ -10632,6 
+12046,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif + + mutex_init(&dev->lock); + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); @@ -10649,8 +12066,22 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; + dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT); + if (!dev->ethtool) + goto free_all; - strcpy(dev->name, name); + dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); + if (!dev->cfg) + goto free_all; + dev->cfg_pending = dev->cfg; + + dev->num_napi_configs = maxqs; + napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); + dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); + if (!dev->napi_config) + goto free_all; + + strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) @@ -10669,11 +12100,27 @@ free_pcpu: free_percpu(dev->pcpu_refcnt); free_dev: #endif - netdev_freemem(dev); + kvfree(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); +static void netdev_napi_exit(struct net_device *dev) +{ + if (!list_empty(&dev->napi_list)) { + struct napi_struct *p, *n; + + netdev_lock(dev); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) + __netif_napi_del_locked(p); + netdev_unlock(dev); + + synchronize_net(); + } + + kvfree(dev->napi_config); +} + /** * free_netdev - free network device * @dev: device @@ -10685,8 +12132,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs); */ void free_netdev(struct net_device *dev) { - struct napi_struct *p, *n; - might_sleep(); /* When called immediately after register_netdevice() failed the unwind @@ -10699,6 +12144,9 @@ void free_netdev(struct net_device *dev) return; } + WARN_ON(dev->cfg != dev->cfg_pending); + kfree(dev->cfg); + kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); @@ -10707,8 +12155,9 @@ void free_netdev(struct net_device *dev) /* Flush device addresses */ dev_addr_flush(dev); - list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) - netif_napi_del(p); + netdev_napi_exit(dev); + + netif_del_cpu_rmap(dev); ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT @@ -10720,14 +12169,19 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; + netdev_free_phy_link_topology(dev); + + mutex_destroy(&dev->lock); + /* Compatibility with error handling in drivers */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - netdev_freemem(dev); + if (dev->reg_state == NETREG_UNINITIALIZED || + dev->reg_state == NETREG_DUMMY) { + kvfree(dev); return; } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); - dev->reg_state = NETREG_RELEASED; + WRITE_ONCE(dev->reg_state, NETREG_RELEASED); /* will free via device release */ put_device(&dev->dev); @@ -10735,6 +12189,19 @@ void free_netdev(struct net_device *dev) EXPORT_SYMBOL(free_netdev); /** + * alloc_netdev_dummy - Allocate and initialize a dummy net device. + * @sizeof_priv: size of private data to allocate space for + * + * Return: the allocated net_device on success, NULL otherwise + */ +struct net_device *alloc_netdev_dummy(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, + init_dummy_netdev); +} +EXPORT_SYMBOL_GPL(alloc_netdev_dummy); + +/** * synchronize_net - Synchronize with packet receive processing * * Wait for packets currently being received to be done. 
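 * (In the body below, callers running from cleanup_net()'s worker or
 * under RTNL get synchronize_rcu_expedited(), so bulk device teardown
 * does not sleep through full grace periods one device at a time.)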
@@ -10743,13 +12210,28 @@ EXPORT_SYMBOL(free_netdev); void synchronize_net(void) { might_sleep(); - if (rtnl_is_locked()) + if (from_cleanup_net() || rtnl_is_locked()) synchronize_rcu_expedited(); else synchronize_rcu(); } EXPORT_SYMBOL(synchronize_net); +static void netdev_rss_contexts_free(struct net_device *dev) +{ + struct ethtool_rxfh_context *ctx; + unsigned long context; + + mutex_lock(&dev->ethtool->rss_lock); + xa_for_each(&dev->ethtool->rss_ctx, context, ctx) { + xa_erase(&dev->ethtool->rss_ctx, context); + dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL); + kfree(ctx); + } + xa_destroy(&dev->ethtool->rss_ctx); + mutex_unlock(&dev->ethtool->rss_lock); +} + /** * unregister_netdevice_queue - remove device from the kernel * @dev: device @@ -10778,11 +12260,54 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); +static void dev_memory_provider_uninstall(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct netdev_rx_queue *rxq = &dev->_rx[i]; + struct pp_memory_provider_params *p = &rxq->mp_params; + + if (p->mp_ops && p->mp_ops->uninstall) + p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); + } +} + +/* devices must be UP and netdev_lock()'d */ +static void netif_close_many_and_unlock(struct list_head *close_head) +{ + struct net_device *dev, *tmp; + + netif_close_many(close_head, false); + + /* ... now unlock them */ + list_for_each_entry_safe(dev, tmp, close_head, close_list) { + netdev_unlock(dev); + list_del_init(&dev->close_list); + } +} + +static void netif_close_many_and_unlock_cond(struct list_head *close_head) +{ +#ifdef CONFIG_LOCKDEP + /* We can only track up to MAX_LOCK_DEPTH locks per task. + * + * Reserve half the available slots for additional locks possibly + * taken by notifiers and (soft)irqs. + */ + unsigned int limit = MAX_LOCK_DEPTH / 2; + + if (lockdep_depth(current) > limit) + netif_close_many_and_unlock(close_head); +#endif +} + void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { struct net_device *dev, *tmp; LIST_HEAD(close_head); + int cnt = 0; BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -10807,17 +12332,30 @@ void unregister_netdevice_many_notify(struct list_head *head, BUG_ON(dev->reg_state != NETREG_REGISTERED); } - /* If device is running, close it first. */ - list_for_each_entry(dev, head, unreg_list) - list_add_tail(&dev->close_list, &close_head); - dev_close_many(&close_head, true); + /* If device is running, close it first. Start with ops locked... */ + list_for_each_entry(dev, head, unreg_list) { + if (!(dev->flags & IFF_UP)) + continue; + if (netdev_need_ops_lock(dev)) { + list_add_tail(&dev->close_list, &close_head); + netdev_lock(dev); + } + netif_close_many_and_unlock_cond(&close_head); + } + netif_close_many_and_unlock(&close_head); + /* ... now go over the rest. */ + list_for_each_entry(dev, head, unreg_list) { + if (!netdev_need_ops_lock(dev)) + list_add_tail(&dev->close_list, &close_head); + } + netif_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. 
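 * (netif_close_many_and_unlock_cond() above is a lockdep safety valve:
 * each netdev_lock() adds one held lock and lockdep tracks at most
 * MAX_LOCK_DEPTH per task, so the close batch is flushed early once
 * half of that budget is consumed.)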
*/ - write_lock(&dev_base_lock); - unlist_netdevice(dev, false); - dev->reg_state = NETREG_UNREGISTERING; - write_unlock(&dev_base_lock); + unlist_netdevice(dev); + netdev_lock(dev); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); + netdev_unlock(dev); } flush_all_backlogs(); @@ -10827,9 +12365,13 @@ void unregister_netdevice_many_notify(struct list_head *head, struct sk_buff *skb = NULL; /* Shutdown queueing discipline. */ + netdev_lock_ops(dev); dev_shutdown(dev); - + dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); + dev_memory_provider_uninstall(dev); + netdev_unlock_ops(dev); + bpf_dev_bound_netdev_unregister(dev); netdev_offload_xstats_disable_all(dev); @@ -10838,11 +12380,10 @@ void unregister_netdevice_many_notify(struct list_head *head, */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, - portid, nlmsg_seq(nlh)); + portid, nlh); /* * Flush the unicast and multicast chains @@ -10853,11 +12394,17 @@ void unregister_netdevice_many_notify(struct list_head *head, netdev_name_node_alt_flush(dev); netdev_name_node_free(dev->name_node); + netdev_rss_contexts_free(dev); + call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); + mutex_destroy(&dev->ethtool->rss_lock); + + net_shaper_flush_netdev(dev); + if (skb) rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); @@ -10878,7 +12425,9 @@ void unregister_netdevice_many_notify(struct list_head *head, list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); net_set_todo(dev); + cnt++; } + atomic_add(cnt, &dev_unreg_count); list_del(head); } @@ -10888,7 +12437,7 @@ void unregister_netdevice_many_notify(struct list_head *head, * @head: list of devices * * Note: As most callers use a stack allocated list_head, - * we force a list_del() to make sure stack wont be corrupted later. + * we force a list_del() to make sure stack won't be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { @@ -10909,44 +12458,35 @@ EXPORT_SYMBOL(unregister_netdevice_many); */ void unregister_netdev(struct net_device *dev) { - rtnl_lock(); + rtnl_net_dev_lock(dev); unregister_netdevice(dev); - rtnl_unlock(); + rtnl_net_dev_unlock(dev); } EXPORT_SYMBOL(unregister_netdev); -/** - * __dev_change_net_namespace - move device to different nethost namespace - * @dev: device - * @net: network namespace - * @pat: If not NULL name pattern to try if the current device name - * is already taken in the destination network namespace. - * @new_ifindex: If not zero, specifies device index in the target - * namespace. - * - * This function shuts down a device interface and moves it - * to a new network namespace. On success 0 is returned, on - * a failure a netagive errno code is returned. - * - * Callers must hold the rtnl semaphore. - */ - int __dev_change_net_namespace(struct net_device *dev, struct net *net, - const char *pat, int new_ifindex) + const char *pat, int new_ifindex, + struct netlink_ext_ack *extack) { + struct netdev_name_node *name_node; struct net *net_old = dev_net(dev); + char new_name[IFNAMSIZ] = {}; int err, new_nsid; ASSERT_RTNL(); /* Don't allow namespace local devices to be moved. 
*/ err = -EINVAL; - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_immutable) { + NL_SET_ERR_MSG(extack, "The interface netns is immutable"); goto out; + } - /* Ensure the device has been registrered */ - if (dev->reg_state != NETREG_REGISTERED) + /* Ensure the device has been registered */ + if (dev->reg_state != NETREG_REGISTERED) { + NL_SET_ERR_MSG(extack, "The interface isn't registered"); goto out; + } /* Get out if there is nothing todo */ err = 0; @@ -10959,32 +12499,73 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, err = -EEXIST; if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ - if (!pat) + if (!pat) { + NL_SET_ERR_MSG(extack, + "An interface with the same name exists in the target netns"); goto out; - err = dev_get_valid_name(net, dev, pat); - if (err < 0) + } + err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "Unable to use '%s' for the new interface name in the target netns", + pat); + goto out; + } + } + /* Check that none of the altnames conflicts. */ + err = -EEXIST; + netdev_for_each_altname(dev, name_node) { + if (netdev_name_in_use(net, name_node->name)) { + NL_SET_ERR_MSG_FMT(extack, + "An interface with the altname %s exists in the target netns", + name_node->name); goto out; + } } /* Check that new_ifindex isn't used yet. */ - err = -EBUSY; - if (new_ifindex && __dev_get_by_index(net, new_ifindex)) - goto out; + if (new_ifindex) { + err = dev_index_reserve(net, new_ifindex); + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "The ifindex %d is not available in the target netns", + new_ifindex); + goto out; + } + } else { + /* If there is an ifindex conflict assign a new one */ + err = dev_index_reserve(net, dev->ifindex); + if (err == -EBUSY) + err = dev_index_reserve(net, 0); + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Unable to allocate a new ifindex in the target netns"); + goto out; + } + new_ifindex = err; + } /* * And now a mini version of register_netdevice unregister_netdevice. */ + netdev_lock_ops(dev); /* If device is running close it first. */ - dev_close(dev); - + netif_close(dev); /* And unlink it from device chain */ - unlist_netdevice(dev, true); + unlist_netdevice(dev); + + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->moving_ns = true; + netdev_unlock(dev); synchronize_net(); /* Shutdown queueing discipline. */ + netdev_lock_ops(dev); dev_shutdown(dev); + netdev_unlock_ops(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. 
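 * (Note the ifindex handling above: the current index is reserved in
 * the target netns first, and only when that reservation returns
 * -EBUSY is a fresh one allocated via dev_index_reserve(net, 0).)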
@@ -10997,13 +12578,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, rcu_barrier(); new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); - /* If there is an ifindex conflict assign a new one */ - if (!new_ifindex) { - if (__dev_get_by_index(net, dev->ifindex)) - new_ifindex = dev_new_index(net); - else - new_ifindex = dev->ifindex; - } rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); @@ -11022,28 +12596,44 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ + netdev_lock(dev); dev_net_set(dev, net); + netdev_unlock(dev); dev->ifindex = new_ifindex; - /* Send a netdev-add uevent to the new namespace */ - kobject_uevent(&dev->dev.kobj, KOBJ_ADD); - netdev_adjacent_add_links(dev); + if (new_name[0]) { + /* Rename the netdev to prepared name */ + write_seqlock_bh(&netdev_rename_lock); + strscpy(dev->name, new_name, IFNAMSIZ); + write_sequnlock_bh(&netdev_rename_lock); + } /* Fixup kobjects */ + dev_set_uevent_suppress(&dev->dev, 1); err = device_rename(&dev->dev, dev->name); + dev_set_uevent_suppress(&dev->dev, 0); WARN_ON(err); + /* Send a netdev-add uevent to the new namespace */ + kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + netdev_adjacent_add_links(dev); + /* Adapt owner in case owning user namespace of target network * namespace is different from the original one. */ err = netdev_change_owner(dev, net_old, net); WARN_ON(err); + netdev_lock(dev); + dev->moving_ns = false; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); + /* Add the device back in the hashes */ list_netdevice(dev); - /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); + netdev_unlock_ops(dev); /* * Prevent userspace races by waiting until the network @@ -11056,7 +12646,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, out: return err; } -EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { @@ -11096,7 +12685,7 @@ static int dev_cpu_dead(unsigned int oldcpu) list_del_init(&napi->poll_list); if (napi->poll == process_backlog) - napi->state = 0; + napi->state &= NAPIF_STATE_THREADED; else ____napi_schedule(sd, napi); } @@ -11104,21 +12693,23 @@ static int dev_cpu_dead(unsigned int oldcpu) raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + if (!use_backlog_threads()) { #ifdef CONFIG_RPS - remsd = oldsd->rps_ipi_list; - oldsd->rps_ipi_list = NULL; + remsd = oldsd->rps_ipi_list; + oldsd->rps_ipi_list = NULL; #endif - /* send out pending IPI's on offline CPU */ - net_rps_send_ipi(remsd); + /* send out pending IPI's on offline CPU */ + net_rps_send_ipi(remsd); + } /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } return 0; @@ -11152,6 +12743,94 @@ netdev_features_t netdev_increment_features(netdev_features_t all, } EXPORT_SYMBOL(netdev_increment_features); +/** + * netdev_compute_master_upper_features - compute feature from lowers + * @dev: the upper device + * @update_header: whether to update upper device's header_len/headroom/tailroom + * + * Recompute the upper device's feature based on all lower devices. 
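+ *
+ * A master driver would typically call this after enslaving or freeing
+ * a lower, e.g. (sketch for a bond-like upper):
+ *
+ *	netdev_compute_master_upper_features(bond_dev, true);
+ *
+ * so vlan_features, hw_enc_features and the TSO limits are re-derived
+ * from the remaining lowers.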
+ */ +void netdev_compute_master_upper_features(struct net_device *dev, bool update_header) +{ + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; + netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES; + netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES; + netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES; + netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES; + netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES; + unsigned short max_header_len = ETH_HLEN; + unsigned int tso_max_size = TSO_MAX_SIZE; + unsigned short max_headroom = 0; + unsigned short max_tailroom = 0; + u16 tso_max_segs = TSO_MAX_SEGS; + struct net_device *lower_dev; + struct list_head *iter; + + mpls_features = netdev_base_features(mpls_features); + vlan_features = netdev_base_features(vlan_features); + enc_features = netdev_base_features(enc_features); + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + gso_partial_features = netdev_increment_features(gso_partial_features, + lower_dev->gso_partial_features, + MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES); + + vlan_features = netdev_increment_features(vlan_features, + lower_dev->vlan_features, + MASTER_UPPER_DEV_VLAN_FEATURES); + + enc_features = netdev_increment_features(enc_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_ENC_FEATURES); + + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + xfrm_features = netdev_increment_features(xfrm_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_XFRM_FEATURES); + + mpls_features = netdev_increment_features(mpls_features, + lower_dev->mpls_features, + MASTER_UPPER_DEV_MPLS_FEATURES); + + dst_release_flag &= lower_dev->priv_flags; + + if (update_header) { + max_header_len = max(max_header_len, lower_dev->hard_header_len); + max_headroom = max(max_headroom, lower_dev->needed_headroom); + max_tailroom = max(max_tailroom, lower_dev->needed_tailroom); + } + + tso_max_size = min(tso_max_size, lower_dev->tso_max_size); + tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs); + } + + dev->gso_partial_features = gso_partial_features; + dev->vlan_features = vlan_features; + dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + dev->hw_enc_features |= xfrm_features; + dev->mpls_features = mpls_features; + + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && + dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + dev->priv_flags |= IFF_XMIT_DST_RELEASE; + + if (update_header) { + dev->hard_header_len = max_header_len; + dev->needed_headroom = max_headroom; + dev->needed_tailroom = max_tailroom; + } + + netif_set_tso_max_segs(dev, tso_max_segs); + netif_set_tso_max_size(dev, tso_max_size); + + netdev_change_features(dev); +} +EXPORT_SYMBOL(netdev_compute_master_upper_features); + static struct hlist_head * __net_init netdev_create_hash(void) { int i; @@ -11169,7 +12848,7 @@ static struct hlist_head * __net_init netdev_create_hash(void) static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > - 8 * sizeof_field(struct napi_struct, gro_bitmask)); + BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask)); INIT_LIST_HEAD(&net->dev_base_head); @@ -11181,6 +12860,8 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); + 
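+	/* XA_FLAGS_ALLOC1 starts ID allocation at 1, keeping ifindex 0
+	 * reserved as "no interface".
+	 */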
RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); return 0; @@ -11278,6 +12959,7 @@ static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); + xa_destroy(&net->dev_by_index); if (net != &init_net) WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } @@ -11289,6 +12971,7 @@ static struct pernet_operations __net_initdata netdev_net_ops = { static void __net_exit default_device_exit_net(struct net *net) { + struct netdev_name_node *name_node, *tmp; struct net_device *dev, *aux; /* * Push all migratable network devices back to the @@ -11300,7 +12983,7 @@ static void __net_exit default_device_exit_net(struct net *net) char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_immutable) continue; /* Leave virtual devices for the generic cleanup */ @@ -11311,6 +12994,11 @@ static void __net_exit default_device_exit_net(struct net *net) snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (netdev_name_in_use(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); + + netdev_for_each_altname_safe(dev, name_node, tmp) + if (netdev_name_in_use(&init_net, name_node->name)) + __netdev_name_node_alt_destroy(name_node); + err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", @@ -11353,6 +13041,61 @@ static struct pernet_operations __net_initdata default_device_ops = { .exit_batch = default_device_exit_batch, }; +static void __init net_dev_struct_check(void) +{ + /* TX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq); +#ifdef CONFIG_XPS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps); +#endif +#ifdef CONFIG_NETFILTER_EGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress); +#endif +#ifdef CONFIG_NET_XGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress); +#endif + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160); + + /* TXRX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, 
net_device_read_txrx, features); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46); + + /* RX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net); +#ifdef CONFIG_NETPOLL + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo); +#endif +#ifdef CONFIG_NET_XGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); +#endif + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92); +} + /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not @@ -11360,6 +13103,67 @@ static struct pernet_operations __net_initdata default_device_ops = { * */ +/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */ +#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE) + +static int net_page_pool_create(int cpuid) +{ +#if IS_ENABLED(CONFIG_PAGE_POOL) + struct page_pool_params page_pool_params = { + .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, + .flags = PP_FLAG_SYSTEM_POOL, + .nid = cpu_to_mem(cpuid), + }; + struct page_pool *pp_ptr; + int err; + + pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); + if (IS_ERR(pp_ptr)) + return -ENOMEM; + + err = xdp_reg_page_pool(pp_ptr); + if (err) { + page_pool_destroy(pp_ptr); + return err; + } + + per_cpu(system_page_pool.pool, cpuid) = pp_ptr; +#endif + return 0; +} + +static int backlog_napi_should_run(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); +} + +static void run_backlog_napi(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + + napi_threaded_poll_loop(&sd->backlog, false); +} + +static void backlog_napi_setup(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + napi->thread = this_cpu_read(backlog_napi); + set_bit(NAPI_STATE_THREADED, &napi->state); +} + +static struct smp_hotplug_thread backlog_threads = { + .store = &backlog_napi, + .thread_should_run = backlog_napi_should_run, + .thread_fn = run_backlog_napi, + .thread_comm = "backlog_napi/%u", + .setup = backlog_napi_setup, +}; + /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. 
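 * (The per-CPU system page pool above is sized as 1 MiB worth of
 * pages: (1 << 20) / PAGE_SIZE, i.e. 256 pages with 4 KiB pages and
 * proportionally fewer with larger page sizes.)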
@@ -11370,13 +13174,14 @@ static int __init net_dev_init(void) BUG_ON(!dev_boot_phase); + net_dev_struct_check(); + if (dev_proc_init()) goto out; if (netdev_kobject_init()) goto out; - INIT_LIST_HEAD(&ptype_all); for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); @@ -11387,12 +13192,13 @@ static int __init net_dev_init(void) * Initialise the packet receive queues. */ + flush_backlogs_fallback = flush_backlogs_alloc(); + if (!flush_backlogs_fallback) + goto out; + for_each_possible_cpu(i) { - struct work_struct *flush = per_cpu_ptr(&flush_works, i); struct softnet_data *sd = &per_cpu(softnet_data, i); - INIT_WORK(flush, flush_backlog); - skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); #ifdef CONFIG_XFRM_OFFLOAD @@ -11405,12 +13211,22 @@ static int __init net_dev_init(void) sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); - spin_lock_init(&sd->defer_lock); - init_gro_hash(&sd->backlog); + gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; + INIT_LIST_HEAD(&sd->backlog.poll_list); + + if (net_page_pool_create(i)) + goto out; } + net_hotdata.skb_defer_nodes = + __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids, + __alignof__(struct skb_defer_node)); + if (!net_hotdata.skb_defer_nodes) + goto out; + if (use_backlog_threads()) + smpboot_register_percpu_thread(&backlog_threads); dev_boot_phase = 0; @@ -11436,7 +13252,25 @@ static int __init net_dev_init(void) NULL, dev_cpu_dead); WARN_ON(rc < 0); rc = 0; + + /* avoid static key IPIs to isolated CPUs */ + if (housekeeping_enabled(HK_TYPE_MISC)) + net_enable_timestamp(); out: + if (rc < 0) { + for_each_possible_cpu(i) { + struct page_pool *pp_ptr; + + pp_ptr = per_cpu(system_page_pool.pool, i); + if (!pp_ptr) + continue; + + xdp_unreg_page_pool(pp_ptr); + page_pool_destroy(pp_ptr); + per_cpu(system_page_pool.pool, i) = NULL; + } + } + return rc; } diff --git a/net/core/dev.h b/net/core/dev.h index 814ed5b7b960..da18536cbd35 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -2,19 +2,22 @@ #ifndef _NET_CORE_DEV_H #define _NET_CORE_DEV_H +#include <linux/cleanup.h> #include <linux/types.h> +#include <linux/rwsem.h> +#include <linux/netdevice.h> +#include <net/netdev_lock.h> struct net; -struct net_device; -struct netdev_bpf; -struct netdev_phys_item_id; struct netlink_ext_ack; +struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { - u64 count; - unsigned int num_buckets; + struct rcu_head rcu; + unsigned int count; + u8 log_buckets; unsigned int history_head; u16 history[FLOW_LIMIT_HISTORY]; u8 buckets[]; @@ -22,6 +25,37 @@ struct sd_flow_limit { extern int netdev_flow_limit_table_len; +struct napi_struct * +netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); +struct net_device *dev_get_by_napi_id(unsigned int napi_id); + +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); +struct net_device * +netdev_xa_find_lock(struct net *net, struct net_device *dev, + unsigned long *index); + +DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T)); + +#define for_each_netdev_lock_scoped(net, var_name, ifindex) \ + for (struct net_device *var_name __free(netdev_unlock) = NULL; \ + (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \ + ifindex++) + +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex); +struct 
net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index); + +DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *, + if (_T) netdev_unlock_ops_compat(_T)); + +#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \ + for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \ + (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \ + &ifindex)); \ + ifindex++) + #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); #else @@ -29,23 +63,30 @@ int __init dev_proc_init(void); #endif void linkwatch_init_dev(struct net_device *dev); -void linkwatch_forget_dev(struct net_device *dev); void linkwatch_run_queue(void); void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); +#if IS_ENABLED(CONFIG_NET_SHAPER) +void net_shaper_flush_netdev(struct net_device *dev); +void net_shaper_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq); +#else +static inline void net_shaper_flush_netdev(struct net_device *dev) {} +static inline void net_shaper_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq) {} +#endif + /* sysctls not referred to from outside net/core/ */ -extern int netdev_budget; -extern unsigned int netdev_budget_usecs; -extern unsigned int sysctl_skb_defer_max; -extern int netdev_tstamp_prequeue; extern int netdev_unregister_timeout_secs; extern int weight_p; extern int dev_weight_rx_bias; extern int dev_weight_tx_bias; +extern struct rw_semaphore dev_addr_sem; + /* rtnl helpers */ extern struct list_head net_todo_list; void netdev_run_todo(void); @@ -56,34 +97,46 @@ struct netdev_name_node { struct list_head list; struct net_device *dev; const char *name; + struct rcu_head rcu; }; int netdev_get_name(struct net *net, char *name, int ifindex); +int netif_change_name(struct net_device *dev, const char *newname); int dev_change_name(struct net_device *dev, const char *newname); +#define netdev_for_each_altname(dev, namenode) \ + list_for_each_entry((namenode), &(dev)->name_node->list, list) +#define netdev_for_each_altname_safe(dev, namenode, next) \ + list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \ + list) + int netdev_name_node_alt_create(struct net_device *dev, const char *name); int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); int dev_validate_mtu(struct net_device *dev, int mtu, struct netlink_ext_ack *extack); -int dev_set_mtu_ext(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); +int netif_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); +int netif_change_proto_down(struct net_device *dev, bool proto_down); int dev_change_proto_down(struct net_device *dev, bool proto_down); -void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, - u32 value); +void netdev_change_proto_down_reason_locked(struct net_device *dev, + unsigned long mask, u32 value); typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); +int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len); int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len); +void netif_set_group(struct 
net_device *dev, int new_group); void dev_set_group(struct net_device *dev, int new_group); +int netif_change_carrier(struct net_device *dev, bool new_carrier); int dev_change_carrier(struct net_device *dev, bool new_carrier); void __dev_set_rx_mode(struct net_device *dev); @@ -95,11 +148,27 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh); +static inline void netif_set_up(struct net_device *dev, bool value) +{ + if (value) + dev->flags |= IFF_UP; + else + dev->flags &= ~IFF_UP; + + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->up = value; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); +} + static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { /* dev->gso_max_size is read locklessly from sk_setup_caps() */ WRITE_ONCE(dev->gso_max_size, size); + if (size <= GSO_LEGACY_MAX_SIZE) + WRITE_ONCE(dev->gso_ipv4_max_size, size); } static inline void netif_set_gso_max_segs(struct net_device *dev, @@ -114,6 +183,224 @@ static inline void netif_set_gro_max_size(struct net_device *dev, { /* This pairs with the READ_ONCE() in skb_gro_receive() */ WRITE_ONCE(dev->gro_max_size, size); + if (size <= GRO_LEGACY_MAX_SIZE) + WRITE_ONCE(dev->gro_ipv4_max_size, size); } +static inline void netif_set_gso_ipv4_max_size(struct net_device *dev, + unsigned int size) +{ + /* dev->gso_ipv4_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_ipv4_max_size, size); +} + +static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_ipv4_max_size, size); } +/** + * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs + * @n: napi struct to get the defer_hard_irqs field from + * + * Return: the per-NAPI value of the defer_hard_irqs field. + */ +static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n) +{ + return READ_ONCE(n->defer_hard_irqs); +} + +/** + * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi + * @n: napi_struct to set the defer_hard_irqs field + * @defer: the value the field should be set to + */ +static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer) +{ + WRITE_ONCE(n->defer_hard_irqs, defer); +} + +/** + * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev + * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set + * @defer: the defer_hard_irqs value to set + */ +static inline void netdev_set_defer_hard_irqs(struct net_device *netdev, + u32 defer) +{ + unsigned int count = max(netdev->num_rx_queues, + netdev->num_tx_queues); + struct napi_struct *napi; + int i; + + WRITE_ONCE(netdev->napi_defer_hard_irqs, defer); + list_for_each_entry(napi, &netdev->napi_list, dev_list) + napi_set_defer_hard_irqs(napi, defer); + + for (i = 0; i < count; i++) + netdev->napi_config[i].defer_hard_irqs = defer; +} + +/** + * napi_get_gro_flush_timeout - get the gro_flush_timeout + * @n: napi struct to get the gro_flush_timeout from + * + * Return: the per-NAPI value of the gro_flush_timeout field. 
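 *
 * Illustrative pairing (annotation, not part of this patch): readers
 * fetch the value locklessly,
 *
 *	unsigned long timeout = napi_get_gro_flush_timeout(napi);
 *
 * while configuration paths update it through the
 * netdev_set_gro_flush_timeout() helper defined below.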
+ */ +static inline unsigned long +napi_get_gro_flush_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->gro_flush_timeout); +} + +/** + * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi + * @n: napi struct to set the gro_flush_timeout + * @timeout: timeout value to set + * + * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout + */ +static inline void napi_set_gro_flush_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->gro_flush_timeout, timeout); +} + +/** + * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs + * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set + * @timeout: the timeout value to set + */ +static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, + unsigned long timeout) +{ + unsigned int count = max(netdev->num_rx_queues, + netdev->num_tx_queues); + struct napi_struct *napi; + int i; + + WRITE_ONCE(netdev->gro_flush_timeout, timeout); + list_for_each_entry(napi, &netdev->napi_list, dev_list) + napi_set_gro_flush_timeout(napi, timeout); + + for (i = 0; i < count; i++) + netdev->napi_config[i].gro_flush_timeout = timeout; +} + +/** + * napi_get_irq_suspend_timeout - get the irq_suspend_timeout + * @n: napi struct to get the irq_suspend_timeout from + * + * Return: the per-NAPI value of the irq_suspend_timeout field. + */ +static inline unsigned long +napi_get_irq_suspend_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->irq_suspend_timeout); +} + +/** + * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi + * @n: napi struct to set the irq_suspend_timeout + * @timeout: timeout value to set + * + * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout + */ +static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->irq_suspend_timeout, timeout); +} + +static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) +{ + if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state)) + return NETDEV_NAPI_THREADED_BUSY_POLL; + + if (test_bit(NAPI_STATE_THREADED, &n->state)) + return NETDEV_NAPI_THREADED_ENABLED; + + return NETDEV_NAPI_THREADED_DISABLED; +} + +static inline enum netdev_napi_threaded +napi_get_threaded_config(struct net_device *dev, struct napi_struct *n) +{ + if (n->config) + return n->config->threaded; + return dev->threaded; +} + +int napi_set_threaded(struct napi_struct *n, + enum netdev_napi_threaded threaded); + +int netif_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded); + +int rps_cpumask_housekeeping(struct cpumask *mask); + +#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) +void xdp_do_check_flushed(struct napi_struct *napi); +#else +static inline void xdp_do_check_flushed(struct napi_struct *napi) { } +#endif + +/* Best effort check that NAPI is not idle (can't be scheduled to run) */ +static inline void napi_assert_will_not_race(const struct napi_struct *napi) +{ + /* uninitialized instance, can't race */ + if (!napi->poll_list.next) + return; + + /* SCHED bit is set on disabled instances */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + WARN_ON(READ_ONCE(napi->list_owner) != -1); +} + +void kick_defer_list_purge(unsigned int cpu); + +#define XMIT_RECURSION_LIMIT 8 + +#ifndef CONFIG_PREEMPT_RT +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void 
dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} +#else +static inline bool dev_xmit_recursion(void) +{ + return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + current->net_xmit.recursion++; +} + +static inline void dev_xmit_recursion_dec(void) +{ + current->net_xmit.recursion--; +} +#endif + +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); +int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg); +int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg); + #endif diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index baa63dee2829..76c91f224886 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -242,9 +242,9 @@ static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, __hw_addr_del_entry(from_list, ha, false, false); } -static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len) +int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; @@ -260,9 +260,10 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, } return err; } +EXPORT_SYMBOL(__hw_addr_sync_multiple); /* This function only works where there is a strict 1-1 relationship - * between source and destionation of they synch. If you ever need to + * between source and destination of the sync. If you ever need to * sync addresses to more than 1 destination, you need to use * __hw_addr_sync_multiple(). 
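 *
 * Illustrative multi-destination use (annotation, not part of this
 * patch), enabled by the EXPORT_SYMBOL above; "lower" and "team_dev"
 * are hypothetical net_device pointers in a team/bonding style driver:
 *
 *	err = __hw_addr_sync_multiple(&lower->uc, &team_dev->uc,
 *				      lower->addr_len);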
*/ @@ -299,8 +300,8 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, EXPORT_SYMBOL(__hw_addr_unsync); /** - * __hw_addr_sync_dev - Synchonize device's multicast list - * @list: address list to syncronize + * __hw_addr_sync_dev - Synchronize device's multicast list + * @list: address list to synchronize * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed @@ -602,7 +603,7 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, ASSERT_RTNL(); - err = dev_pre_changeaddr_notify(dev, addr, NULL); + err = netif_pre_changeaddr_notify(dev, addr, NULL); if (err) return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c index 90e7e3811ae7..8e1dba825e94 100644 --- a/net/core/dev_addr_lists_test.c +++ b/net/core/dev_addr_lists_test.c @@ -49,7 +49,6 @@ static int dev_addr_test_init(struct kunit *test) KUNIT_FAIL(test, "Can't register netdev %d", err); } - rtnl_lock(); return 0; } @@ -57,7 +56,6 @@ static void dev_addr_test_exit(struct kunit *test) { struct net_device *netdev = test->priv; - rtnl_unlock(); unregister_netdev(netdev); free_netdev(netdev); } @@ -67,6 +65,7 @@ static void dev_addr_test_basic(struct kunit *test) struct net_device *netdev = test->priv; u8 addr[ETH_ALEN]; + rtnl_lock(); KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr); memset(addr, 2, sizeof(addr)); @@ -76,6 +75,7 @@ static void dev_addr_test_basic(struct kunit *test) memset(addr, 3, sizeof(addr)); dev_addr_set(netdev, addr); KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr)); + rtnl_unlock(); } static void dev_addr_test_sync_one(struct kunit *test) @@ -86,6 +86,7 @@ static void dev_addr_test_sync_one(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); memset(addr, 1, sizeof(addr)); eth_hw_addr_set(netdev, addr); @@ -103,6 +104,7 @@ static void dev_addr_test_sync_one(struct kunit *test) * considered synced and we overwrite in place. */ KUNIT_EXPECT_EQ(test, 0, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_add_del(struct kunit *test) @@ -114,6 +116,7 @@ static void dev_addr_test_add_del(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); for (i = 1; i < 4; i++) { memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, @@ -143,6 +146,7 @@ static void dev_addr_test_add_del(struct kunit *test) __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, dev_addr_test_unsync); KUNIT_EXPECT_EQ(test, 1, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_del_main(struct kunit *test) @@ -150,6 +154,7 @@ static void dev_addr_test_del_main(struct kunit *test) struct net_device *netdev = test->priv; u8 addr[ETH_ALEN]; + rtnl_lock(); memset(addr, 1, sizeof(addr)); eth_hw_addr_set(netdev, addr); @@ -161,6 +166,7 @@ static void dev_addr_test_del_main(struct kunit *test) NETDEV_HW_ADDR_T_LAN)); KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, NETDEV_HW_ADDR_T_LAN)); + rtnl_unlock(); } static void dev_addr_test_add_set(struct kunit *test) @@ -172,6 +178,7 @@ static void dev_addr_test_add_set(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); /* There is no external API like dev_addr_add_excl(), * so shuffle the tree a little bit and exploit aliasing. 
*/ @@ -191,6 +198,7 @@ static void dev_addr_test_add_set(struct kunit *test) __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, dev_addr_test_unsync); KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_add_excl(struct kunit *test) @@ -199,6 +207,7 @@ static void dev_addr_test_add_excl(struct kunit *test) u8 addr[ETH_ALEN]; int i; + rtnl_lock(); for (i = 0; i < 10; i++) { memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr)); @@ -213,6 +222,7 @@ static void dev_addr_test_add_excl(struct kunit *test) memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); } + rtnl_unlock(); } static struct kunit_case dev_addr_test_cases[] = { @@ -233,4 +243,5 @@ static struct kunit_suite dev_addr_test_suite = { }; kunit_test_suite(dev_addr_test_suite); +MODULE_DESCRIPTION("KUnit tests for struct netdev_hw_addr_list"); MODULE_LICENSE("GPL"); diff --git a/net/core/dev_api.c b/net/core/dev_api.c new file mode 100644 index 000000000000..f28852078aa6 --- /dev/null +++ b/net/core/dev_api.c @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/netdevice.h> +#include <net/netdev_lock.h> + +#include "dev.h" + +/** + * dev_change_name() - change name of a device + * @dev: device + * @newname: name (or format string) must be at least IFNAMSIZ + * + * Change name of a device, can pass format strings "eth%d". + * for wildcarding. + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_name(struct net_device *dev, const char *newname) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_name(dev, newname); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_set_alias() - change ifalias of a device + * @dev: device + * @alias: name up to IFALIASZ + * @len: limit of bytes to copy from info + * + * Set ifalias for a device. + * + * Return: 0 on success, -errno on failure. + */ +int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_alias(dev, alias, len); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_alias); + +/** + * dev_change_flags() - change device settings + * @dev: device + * @flags: device state flags + * @extack: netlink extended ack + * + * Change settings on device based state flags. The flags are + * in the userspace exported format. + * + * Return: 0 on success, -errno on failure. 
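 *
 * Illustrative call (annotation, not part of this patch): raising
 * IFF_UP while preserving the other flags, under RTNL:
 *
 *	unsigned int flags = netif_get_flags(dev) | IFF_UP;
 *	int err = dev_change_flags(dev, flags, NULL);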
+ */ +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_flags(dev, flags, extack); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_change_flags); + +/** + * dev_set_group() - change group this device belongs to + * @dev: device + * @new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ + netdev_lock_ops(dev); + netif_set_group(dev, new_group); + netdev_unlock_ops(dev); +} + +int dev_set_mac_address_user(struct net_device *dev, + struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) +{ + int ret; + + down_write(&dev_addr_sem); + netdev_lock_ops(dev); + ret = netif_set_mac_address(dev, ss, extack); + netdev_unlock_ops(dev); + up_write(&dev_addr_sem); + + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address_user); + +/** + * dev_change_net_namespace() - move device to a different network namespace + * @dev: device + * @net: network namespace + * @pat: If not NULL name pattern to try if the current device name + * is already taken in the destination network namespace. + * + * This function shuts down a device interface and moves it + * to a new network namespace. On success 0 is returned, on + * a failure a negative errno code is returned. + * + * Callers must hold the rtnl semaphore. + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat) +{ + return __dev_change_net_namespace(dev, net, pat, 0, NULL); +} +EXPORT_SYMBOL_GPL(dev_change_net_namespace); + +/** + * dev_change_carrier() - change device carrier + * @dev: device + * @new_carrier: new value + * + * Change device carrier + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_carrier(struct net_device *dev, bool new_carrier) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_carrier(dev, new_carrier); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_change_tx_queue_len() - change TX queue length of a netdevice + * @dev: device + * @new_len: new tx queue length + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_tx_queue_len(dev, new_len); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_change_proto_down() - set carrier according to proto_down + * @dev: device + * @proto_down: new value + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_proto_down(struct net_device *dev, bool proto_down) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_proto_down(dev, proto_down); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_open() - prepare an interface for use + * @dev: device to open + * @extack: netlink extended ack + * + * Takes a device from down to up state. The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. + * + * Return: 0 on success, -errno on failure. 
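 *
 * Illustrative call site (annotation, not part of this patch), in the
 * style of stacked drivers bringing up a lower device under RTNL:
 *
 *	err = dev_open(lower_dev, extack);
 *	if (err)
 *		return err;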
+ */ +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_open(dev, extack); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_open); + +/** + * dev_close() - shutdown an interface + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. + */ +void dev_close(struct net_device *dev) +{ + netdev_lock_ops(dev); + netif_close(dev); + netdev_unlock_ops(dev); +} +EXPORT_SYMBOL(dev_close); + +int dev_eth_ioctl(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int ret = -ENODEV; + + if (!ops->ndo_eth_ioctl) + return -EOPNOTSUPP; + + netdev_lock_ops(dev); + if (netif_device_present(dev)) + ret = ops->ndo_eth_ioctl(dev, ifr, cmd); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_eth_ioctl); + +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_mtu(dev, new_mtu); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_mtu); + +/** + * dev_disable_lro() - disable Large Receive Offload on a device + * @dev: device + * + * Disable Large Receive Offload (LRO) on a net device. Must be + * called under RTNL. This is needed if received packets may be + * forwarded to another interface. + */ +void dev_disable_lro(struct net_device *dev) +{ + netdev_lock_ops(dev); + netif_disable_lro(dev); + netdev_unlock_ops(dev); +} +EXPORT_SYMBOL(dev_disable_lro); + +/** + * dev_set_promiscuity() - update promiscuity count on a device + * @dev: device + * @inc: modifier + * + * Add or remove promiscuity from a device. While the count in the device + * remains above zero the interface remains promiscuous. Once it hits zero + * the device reverts back to normal filtering operation. A negative inc + * value is used to drop promiscuity on the device. + * Return 0 if successful or a negative errno code on error. + */ +int dev_set_promiscuity(struct net_device *dev, int inc) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_promiscuity(dev, inc); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_promiscuity); + +/** + * dev_set_allmulti() - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * + * Return: 0 on success, -errno on failure. + */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_allmulti(dev, inc, true); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_allmulti); + +/** + * dev_set_mac_address() - change Media Access Control Address + * @dev: device + * @ss: new address + * @extack: netlink extended ack + * + * Change the hardware (MAC) address of the device + * + * Return: 0 on success, -errno on failure. 
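 *
 * Annotation, not part of this patch: this helper serves kernel-internal
 * callers; userspace-driven changes go through dev_set_mac_address_user()
 * above, which additionally serializes on dev_addr_sem ahead of the
 * per-device ops lock:
 *
 *	down_write(&dev_addr_sem);
 *	netdev_lock_ops(dev);
 *	ret = netif_set_mac_address(dev, ss, extack);
 *	netdev_unlock_ops(dev);
 *	up_write(&dev_addr_sem);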
+ */ +int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_mac_address(dev, ss, extack); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address); + +int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_xdp_propagate(dev, bpf); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL_GPL(dev_xdp_propagate); + +/** + * netdev_state_change() - device changes state + * @dev: device to cause notification + * + * Called to indicate a device has changed state. This function calls + * the notifier chains for netdev_chain and sends a NEWLINK message + * to the routing socket. + */ +void netdev_state_change(struct net_device *dev) +{ + netdev_lock_ops(dev); + netif_state_change(dev); + netdev_unlock_ops(dev); +} +EXPORT_SYMBOL(netdev_state_change); + +int dev_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded) +{ + int ret; + + netdev_lock(dev); + ret = netif_set_threaded(dev, threaded); + netdev_unlock(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_threaded); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 5cdbfbf9a7dc..53a53357cfef 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -5,9 +5,12 @@ #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> +#include <linux/phylib_stubs.h> +#include <linux/ptp_clock_kernel.h> #include <linux/wireless.h> #include <linux/if_bridge.h> -#include <net/dsa.h> +#include <net/dsa_stubs.h> +#include <net/netdev_lock.h> #include <net/wext.h> #include "dev.h" @@ -63,7 +66,7 @@ int dev_ifconf(struct net *net, struct ifconf __user *uifc) } /* Loop over the interfaces, and write an info block for each. 
*/ - rtnl_lock(); + rtnl_net_lock(net); for_each_netdev(net, dev) { if (!pos) done = inet_gifconf(dev, NULL, 0, size); @@ -71,12 +74,12 @@ int dev_ifconf(struct net *net, struct ifconf __user *uifc) done = inet_gifconf(dev, pos + total, len - total, size); if (done < 0) { - rtnl_unlock(); + rtnl_net_unlock(net); return -EFAULT; } total += done; } - rtnl_unlock(); + rtnl_net_unlock(net); return put_user(total, &uifc->ifc_len); } @@ -108,7 +111,7 @@ static int dev_getifmap(struct net_device *dev, struct ifreq *ifr) return 0; } -static int dev_setifmap(struct net_device *dev, struct ifreq *ifr) +static int netif_setifmap(struct net_device *dev, struct ifreq *ifr) { struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map; @@ -144,7 +147,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm switch (cmd) { case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = (short) dev_get_flags(dev); + ifr->ifr_flags = (short)netif_get_flags(dev); return 0; case SIOCGIFMETRIC: /* Get the metric on the interface @@ -183,22 +186,18 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm return err; } -static int net_hwtstamp_validate(struct ifreq *ifr) +int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) { - struct hwtstamp_config cfg; enum hwtstamp_tx_types tx_type; enum hwtstamp_rx_filters rx_filter; int tx_type_valid = 0; int rx_filter_valid = 0; - if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) - return -EFAULT; - - if (cfg.flags & ~HWTSTAMP_FLAG_MASK) + if (cfg->flags & ~HWTSTAMP_FLAG_MASK) return -EINVAL; - tx_type = cfg.tx_type; - rx_filter = cfg.rx_filter; + tx_type = cfg->tx_type; + rx_filter = cfg->rx_filter; switch (tx_type) { case HWTSTAMP_TX_OFF: @@ -242,25 +241,271 @@ static int net_hwtstamp_validate(struct ifreq *ifr) return 0; } -static int dev_eth_ioctl(struct net_device *dev, - struct ifreq *ifr, unsigned int cmd) +/** + * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC + * or of attached phylib PHY + * @dev: Network device + * @cfg: Timestamping configuration structure + * + * Helper for calling the default hardware provider timestamping. + * + * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), but + * phydev->mii_ts has both hwtstamp_get() and hwtstamp_set() methods. So this + * will return -EOPNOTSUPP for phylib only if hwtstamp_get() is not + * implemented for now, which is still more accurate than letting the netdev + * handle the GET request. 
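 *
 * Illustrative caller (annotation, not part of this patch), mirroring
 * dev_get_hwtstamp() below, which takes the per-device ops lock around
 * the call:
 *
 *	netdev_lock_ops(dev);
 *	err = dev_get_hwtstamp_phylib(dev, &kernel_cfg);
 *	netdev_unlock_ops(dev);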
+ */ +int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) +{ + struct hwtstamp_provider *hwprov; + + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + cfg->qualifier = hwprov->desc.qualifier; + if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && + hwprov->phydev) + return phy_hwtstamp_get(hwprov->phydev, cfg); + + if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); + + return -EOPNOTSUPP; + } + + if (phy_is_default_hwtstamp(dev->phydev)) + return phy_hwtstamp_get(dev->phydev, cfg); + + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); +} + +static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) { const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config kernel_cfg = {}; + struct hwtstamp_config cfg; int err; - err = dsa_ndo_eth_ioctl(dev, ifr, cmd); - if (err == 0 || err != -EOPNOTSUPP) + if (!ops->ndo_hwtstamp_get) + return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */ + + if (!netif_device_present(dev)) + return -ENODEV; + + kernel_cfg.ifr = ifr; + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, &kernel_cfg); + netdev_unlock_ops(dev); + if (err) return err; - if (ops->ndo_eth_ioctl) { - if (netif_device_present(dev)) - err = ops->ndo_eth_ioctl(dev, ifr, cmd); - else - err = -ENODEV; + /* If the request was resolved through an unconverted driver, omit + * the copy_to_user(), since the implementation has already done that + */ + if (!kernel_cfg.copied_to_user) { + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; } - return err; + return 0; +} + +/** + * dev_set_hwtstamp_phylib() - Change hardware timestamping of NIC + * or of attached phylib PHY + * @dev: Network device + * @cfg: Timestamping configuration structure + * @extack: Netlink extended ack message structure, for error reporting + * + * Helper for enforcing a common policy that phylib timestamping, if available, + * should take precedence in front of hardware timestamping provided by the + * netdev. If the netdev driver needs to perform specific actions even for PHY + * timestamping to work properly (a switch port must trap the timestamped + * frames and not forward them), it must set dev->see_all_hwtstamp_requests. + */ +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config old_cfg = {}; + struct hwtstamp_provider *hwprov; + struct phy_device *phydev; + bool changed = false; + bool phy_ts; + int err; + + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && + hwprov->phydev) { + phy_ts = true; + phydev = hwprov->phydev; + } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) { + phy_ts = false; + } else { + return -EOPNOTSUPP; + } + + cfg->qualifier = hwprov->desc.qualifier; + } else { + phy_ts = phy_is_default_hwtstamp(dev->phydev); + if (phy_ts) + phydev = dev->phydev; + } + + cfg->source = phy_ts ? 
HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; + + if (phy_ts && dev->see_all_hwtstamp_requests) { + err = ops->ndo_hwtstamp_get(dev, &old_cfg); + if (err) + return err; + } + + if (!phy_ts || dev->see_all_hwtstamp_requests) { + err = ops->ndo_hwtstamp_set(dev, cfg, extack); + if (err) { + if (extack->_msg) + netdev_err(dev, "%s\n", extack->_msg); + return err; + } + } + + if (phy_ts && dev->see_all_hwtstamp_requests) + changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); + + if (phy_ts) { + err = phy_hwtstamp_set(phydev, cfg, extack); + if (err) { + if (changed) + ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); + return err; + } + } + + return 0; +} + +static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config kernel_cfg = {}; + struct netlink_ext_ack extack = {}; + struct hwtstamp_config cfg; + int err; + + if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) + return -EFAULT; + + hwtstamp_config_to_kernel(&kernel_cfg, &cfg); + kernel_cfg.ifr = ifr; + + err = net_hwtstamp_validate(&kernel_cfg); + if (err) + return err; + + err = dsa_conduit_hwtstamp_validate(dev, &kernel_cfg, &extack); + if (err) { + if (extack._msg) + netdev_err(dev, "%s\n", extack._msg); + return err; + } + + if (!ops->ndo_hwtstamp_set) + return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */ + + if (!netif_device_present(dev)) + return -ENODEV; + + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack); + netdev_unlock_ops(dev); + if (err) + return err; + + /* The driver may have modified the configuration, so copy the + * updated version of it back to user space + */ + if (!kernel_cfg.copied_to_user) { + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + } + + return 0; +} + +static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, + struct kernel_hwtstamp_config *kernel_cfg) +{ + struct ifreq ifrr; + int err; + + if (!kernel_cfg->ifr) + return -EINVAL; + + strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); + ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; + + err = dev_eth_ioctl(dev, &ifrr, cmd); + if (err) + return err; + + kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru; + kernel_cfg->copied_to_user = true; + + return 0; +} + +int generic_hwtstamp_get_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!netif_device_present(dev)) + return -ENODEV; + + if (ops->ndo_hwtstamp_get) { + int err; + + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, kernel_cfg); + netdev_unlock_ops(dev); + + return err; + } + + /* Legacy path: unconverted lower driver */ + return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); } +EXPORT_SYMBOL(generic_hwtstamp_get_lower); + +int generic_hwtstamp_set_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!netif_device_present(dev)) + return -ENODEV; + + if (ops->ndo_hwtstamp_set) { + int err; + + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + netdev_unlock_ops(dev); + + return err; + } + + /* Legacy path: unconverted lower driver */ + return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); +} +EXPORT_SYMBOL(generic_hwtstamp_set_lower); static int dev_siocbond(struct net_device *dev, struct ifreq *ifr, 
unsigned int cmd) @@ -268,10 +513,14 @@ static int dev_siocbond(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocbond) { + int ret = -ENODEV; + + netdev_lock_ops(dev); if (netif_device_present(dev)) - return ops->ndo_siocbond(dev, ifr, cmd); - else - return -ENODEV; + ret = ops->ndo_siocbond(dev, ifr, cmd); + netdev_unlock_ops(dev); + + return ret; } return -EOPNOTSUPP; @@ -283,10 +532,14 @@ static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr, const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocdevprivate) { + int ret = -ENODEV; + + netdev_lock_ops(dev); if (netif_device_present(dev)) - return ops->ndo_siocdevprivate(dev, ifr, data, cmd); - else - return -ENODEV; + ret = ops->ndo_siocdevprivate(dev, ifr, data, cmd); + netdev_unlock_ops(dev); + + return ret; } return -EOPNOTSUPP; @@ -297,17 +550,21 @@ static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs) const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocwandev) { + int ret = -ENODEV; + + netdev_lock_ops(dev); if (netif_device_present(dev)) - return ops->ndo_siocwandev(dev, ifs); - else - return -ENODEV; + ret = ops->ndo_siocwandev(dev, ifs); + netdev_unlock_ops(dev); + + return ret; } return -EOPNOTSUPP; } /* - * Perform the SIOCxIFxxx calls, inside rtnl_lock() + * Perform the SIOCxIFxxx calls, inside rtnl_net_lock() */ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, unsigned int cmd) @@ -315,7 +572,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); const struct net_device_ops *ops; - netdevice_tracker dev_tracker; if (!dev) return -ENODEV; @@ -334,21 +590,28 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return dev_set_mtu(dev, ifr->ifr_mtu); case SIOCSIFHWADDR: - if (dev->addr_len > sizeof(struct sockaddr)) + if (dev->addr_len > sizeof(ifr->ifr_hwaddr)) return -EINVAL; - return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL); + return dev_set_mac_address_user(dev, + (struct sockaddr_storage *)&ifr->ifr_hwaddr, + NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof(ifr->ifr_hwaddr.sa_data_min), + min(sizeof(ifr->ifr_hwaddr.sa_data), (size_t)dev->addr_len)); + netdev_lock_ops(dev); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + netdev_unlock_ops(dev); return 0; case SIOCSIFMAP: - return dev_setifmap(dev, ifr); + netdev_lock_ops(dev); + err = netif_setifmap(dev, ifr); + netdev_unlock_ops(dev); + return err; case SIOCADDMULTI: if (!ops->ndo_set_rx_mode || @@ -356,7 +619,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_lock_ops(dev); + err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_unlock_ops(dev); + return err; case SIOCDELMULTI: if (!ops->ndo_set_rx_mode || @@ -364,7 +630,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_lock_ops(dev); + err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_unlock_ops(dev); + return err; case SIOCSIFTXQLEN: if (ifr->ifr_qlen < 0) @@ -378,49 +647,31 @@ static int 
dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, case SIOCWANDEV: return dev_siocwandev(dev, &ifr->ifr_settings); - case SIOCBRADDIF: - case SIOCBRDELIF: - if (!netif_device_present(dev)) - return -ENODEV; - if (!netif_is_bridge_master(dev)) - return -EOPNOTSUPP; - netdev_hold(dev, &dev_tracker, GFP_KERNEL); - rtnl_unlock(); - err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); - netdev_put(dev, &dev_tracker); - rtnl_lock(); - return err; + case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15: + return dev_siocdevprivate(dev, ifr, data, cmd); case SIOCSHWTSTAMP: - err = net_hwtstamp_validate(ifr); - if (err) - return err; - fallthrough; + return dev_set_hwtstamp(dev, ifr); - /* - * Unknown or private ioctl - */ - default: - if (cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) - return dev_siocdevprivate(dev, ifr, data, cmd); - - if (cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCSHWTSTAMP || - cmd == SIOCGHWTSTAMP) { - err = dev_eth_ioctl(dev, ifr, cmd); - } else if (cmd == SIOCBONDENSLAVE || - cmd == SIOCBONDRELEASE || - cmd == SIOCBONDSETHWADDR || - cmd == SIOCBONDSLAVEINFOQUERY || - cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE) { - err = dev_siocbond(dev, ifr, cmd); - } else - err = -EINVAL; + case SIOCGHWTSTAMP: + return dev_get_hwtstamp(dev, ifr); + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSMIIREG: + return dev_eth_ioctl(dev, ifr, cmd); + + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + case SIOCBONDCHANGEACTIVE: + return dev_siocbond(dev, ifr, cmd); + + /* Unknown ioctl */ + default: + err = -EINVAL; } return err; } @@ -462,6 +713,7 @@ EXPORT_SYMBOL(dev_load); * @net: the applicable net namespace * @cmd: command to issue * @ifr: pointer to a struct ifreq in user space + * @data: data exchanged with userspace * @need_copyout: whether or not copy_to_user() should be called * * Issue ioctl functions to devices. 
This is normally called by the @@ -494,7 +746,8 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, switch (cmd) { case SIOCGIFHWADDR: dev_load(net, ifr->ifr_name); - ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name); + ret = netif_get_mac_address(&ifr->ifr_hwaddr, net, + ifr->ifr_name); if (colon) *colon = ':'; return ret; @@ -538,9 +791,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, dev_load(net, ifr->ifr_name); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - rtnl_lock(); + + rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); - rtnl_unlock(); + rtnl_net_unlock(net); + if (colon) *colon = ':'; return ret; @@ -575,8 +830,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: case SIOCSHWTSTAMP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -584,9 +837,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: dev_load(net, ifr->ifr_name); - rtnl_lock(); + + rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); - rtnl_unlock(); + rtnl_net_unlock(net); + if (need_copyout) *need_copyout = false; return ret; @@ -609,9 +864,10 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15)) { dev_load(net, ifr->ifr_name); - rtnl_lock(); + + rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); - rtnl_unlock(); + rtnl_net_unlock(net); return ret; } return -ENOTTY; diff --git a/net/core/devlink.c b/net/core/devlink.c deleted file mode 100644 index 032d6d0a5ce6..000000000000 --- a/net/core/devlink.c +++ /dev/null @@ -1,13029 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/core/devlink.c - Network physical/parent device Netlink interface - * - * Heavily inspired by net/wireless/ - * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
- * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> - */ - -#include <linux/etherdevice.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/gfp.h> -#include <linux/device.h> -#include <linux/list.h> -#include <linux/netdevice.h> -#include <linux/spinlock.h> -#include <linux/refcount.h> -#include <linux/workqueue.h> -#include <linux/u64_stats_sync.h> -#include <linux/timekeeping.h> -#include <rdma/ib_verbs.h> -#include <net/netlink.h> -#include <net/genetlink.h> -#include <net/rtnetlink.h> -#include <net/net_namespace.h> -#include <net/sock.h> -#include <net/devlink.h> -#define CREATE_TRACE_POINTS -#include <trace/events/devlink.h> - -#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ - (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) - -struct devlink_dev_stats { - u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; - u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; -}; - -struct devlink { - u32 index; - struct xarray ports; - struct list_head rate_list; - struct list_head sb_list; - struct list_head dpipe_table_list; - struct list_head resource_list; - struct list_head param_list; - struct list_head region_list; - struct list_head reporter_list; - struct mutex reporters_lock; /* protects reporter_list */ - struct devlink_dpipe_headers *dpipe_headers; - struct list_head trap_list; - struct list_head trap_group_list; - struct list_head trap_policer_list; - struct list_head linecard_list; - struct mutex linecards_lock; /* protects linecard_list */ - const struct devlink_ops *ops; - u64 features; - struct xarray snapshot_ids; - struct devlink_dev_stats stats; - struct device *dev; - possible_net_t _net; - /* Serializes access to devlink instance specific objects such as - * port, sb, dpipe, resource, params, region, traps and more. 
- */ - struct mutex lock; - struct lock_class_key lock_key; - u8 reload_failed:1; - refcount_t refcount; - struct completion comp; - struct rcu_head rcu; - struct notifier_block netdevice_nb; - char priv[] __aligned(NETDEV_ALIGN); -}; - -struct devlink_linecard_ops; -struct devlink_linecard_type; - -struct devlink_linecard { - struct list_head list; - struct devlink *devlink; - unsigned int index; - refcount_t refcount; - const struct devlink_linecard_ops *ops; - void *priv; - enum devlink_linecard_state state; - struct mutex state_lock; /* Protects state */ - const char *type; - struct devlink_linecard_type *types; - unsigned int types_count; - struct devlink *nested_devlink; -}; - -/** - * struct devlink_resource - devlink resource - * @name: name of the resource - * @id: id, per devlink instance - * @size: size of the resource - * @size_new: updated size of the resource, reload is needed - * @size_valid: valid in case the total size of the resource is valid - * including its children - * @parent: parent resource - * @size_params: size parameters - * @list: parent list - * @resource_list: list of child resources - * @occ_get: occupancy getter callback - * @occ_get_priv: occupancy getter callback priv - */ -struct devlink_resource { - const char *name; - u64 id; - u64 size; - u64 size_new; - bool size_valid; - struct devlink_resource *parent; - struct devlink_resource_size_params size_params; - struct list_head list; - struct list_head resource_list; - devlink_resource_occ_get_t *occ_get; - void *occ_get_priv; -}; - -void *devlink_priv(struct devlink *devlink) -{ - return &devlink->priv; -} -EXPORT_SYMBOL_GPL(devlink_priv); - -struct devlink *priv_to_devlink(void *priv) -{ - return container_of(priv, struct devlink, priv); -} -EXPORT_SYMBOL_GPL(priv_to_devlink); - -struct device *devlink_to_dev(const struct devlink *devlink) -{ - return devlink->dev; -} -EXPORT_SYMBOL_GPL(devlink_to_dev); - -static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = { - { - .name = "destination mac", - .id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC, - .bitwidth = 48, - }, -}; - -struct devlink_dpipe_header devlink_dpipe_header_ethernet = { - .name = "ethernet", - .id = DEVLINK_DPIPE_HEADER_ETHERNET, - .fields = devlink_dpipe_fields_ethernet, - .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet), - .global = true, -}; -EXPORT_SYMBOL_GPL(devlink_dpipe_header_ethernet); - -static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = { - { - .name = "destination ip", - .id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP, - .bitwidth = 32, - }, -}; - -struct devlink_dpipe_header devlink_dpipe_header_ipv4 = { - .name = "ipv4", - .id = DEVLINK_DPIPE_HEADER_IPV4, - .fields = devlink_dpipe_fields_ipv4, - .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4), - .global = true, -}; -EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv4); - -static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = { - { - .name = "destination ip", - .id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP, - .bitwidth = 128, - }, -}; - -struct devlink_dpipe_header devlink_dpipe_header_ipv6 = { - .name = "ipv6", - .id = DEVLINK_DPIPE_HEADER_IPV6, - .fields = devlink_dpipe_fields_ipv6, - .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6), - .global = true, -}; -EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv6); - -EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); -EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); -EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); - -#define DEVLINK_PORT_FN_CAPS_VALID_MASK \ - (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1) - -static 
const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { - [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, - [DEVLINK_PORT_FN_ATTR_STATE] = - NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, - DEVLINK_PORT_FN_STATE_ACTIVE), - [DEVLINK_PORT_FN_ATTR_CAPS] = - NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), -}; - -static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = { - [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG }, -}; - -static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); -#define DEVLINK_REGISTERED XA_MARK_1 -#define DEVLINK_UNREGISTERING XA_MARK_2 - -/* devlink instances are open to the access from the user space after - * devlink_register() call. Such logical barrier allows us to have certain - * expectations related to locking. - * - * Before *_register() - we are in initialization stage and no parallel - * access possible to the devlink instance. All drivers perform that phase - * by implicitly holding device_lock. - * - * After *_register() - users and driver can access devlink instance at - * the same time. - */ -#define ASSERT_DEVLINK_REGISTERED(d) \ - WARN_ON_ONCE(!xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) -#define ASSERT_DEVLINK_NOT_REGISTERED(d) \ - WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) - -struct net *devlink_net(const struct devlink *devlink) -{ - return read_pnet(&devlink->_net); -} -EXPORT_SYMBOL_GPL(devlink_net); - -static void __devlink_put_rcu(struct rcu_head *head) -{ - struct devlink *devlink = container_of(head, struct devlink, rcu); - - complete(&devlink->comp); -} - -void devlink_put(struct devlink *devlink) -{ - if (refcount_dec_and_test(&devlink->refcount)) - /* Make sure unregister operation that may await the completion - * is unblocked only after all users are after the end of - * RCU grace period. - */ - call_rcu(&devlink->rcu, __devlink_put_rcu); -} - -struct devlink *__must_check devlink_try_get(struct devlink *devlink) -{ - if (refcount_inc_not_zero(&devlink->refcount)) - return devlink; - return NULL; -} - -void devl_assert_locked(struct devlink *devlink) -{ - lockdep_assert_held(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_assert_locked); - -#ifdef CONFIG_LOCKDEP -/* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */ -bool devl_lock_is_held(struct devlink *devlink) -{ - return lockdep_is_held(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_lock_is_held); -#endif - -void devl_lock(struct devlink *devlink) -{ - mutex_lock(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_lock); - -int devl_trylock(struct devlink *devlink) -{ - return mutex_trylock(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_trylock); - -void devl_unlock(struct devlink *devlink) -{ - mutex_unlock(&devlink->lock); -} -EXPORT_SYMBOL_GPL(devl_unlock); - -static struct devlink * -devlinks_xa_find_get(struct net *net, unsigned long *indexp, xa_mark_t filter, - void * (*xa_find_fn)(struct xarray *, unsigned long *, - unsigned long, xa_mark_t)) -{ - struct devlink *devlink; - - rcu_read_lock(); -retry: - devlink = xa_find_fn(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); - if (!devlink) - goto unlock; - - /* In case devlink_unregister() was already called and "unregistering" - * mark was set, do not allow to get a devlink reference here. - * This prevents live-lock of devlink_unregister() wait for completion. 
- */ - if (xa_get_mark(&devlinks, *indexp, DEVLINK_UNREGISTERING)) - goto retry; - - /* For a possible retry, the xa_find_after() should be always used */ - xa_find_fn = xa_find_after; - if (!devlink_try_get(devlink)) - goto retry; - if (!net_eq(devlink_net(devlink), net)) { - devlink_put(devlink); - goto retry; - } -unlock: - rcu_read_unlock(); - return devlink; -} - -static struct devlink *devlinks_xa_find_get_first(struct net *net, - unsigned long *indexp, - xa_mark_t filter) -{ - return devlinks_xa_find_get(net, indexp, filter, xa_find); -} - -static struct devlink *devlinks_xa_find_get_next(struct net *net, - unsigned long *indexp, - xa_mark_t filter) -{ - return devlinks_xa_find_get(net, indexp, filter, xa_find_after); -} - -/* Iterate over devlink pointers which were possible to get reference to. - * devlink_put() needs to be called for each iterated devlink pointer - * in loop body in order to release the reference. - */ -#define devlinks_xa_for_each_get(net, index, devlink, filter) \ - for (index = 0, \ - devlink = devlinks_xa_find_get_first(net, &index, filter); \ - devlink; devlink = devlinks_xa_find_get_next(net, &index, filter)) - -#define devlinks_xa_for_each_registered_get(net, index, devlink) \ - devlinks_xa_for_each_get(net, index, devlink, DEVLINK_REGISTERED) - -static struct devlink *devlink_get_from_attrs(struct net *net, - struct nlattr **attrs) -{ - struct devlink *devlink; - unsigned long index; - char *busname; - char *devname; - - if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME]) - return ERR_PTR(-EINVAL); - - busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); - devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); - - devlinks_xa_for_each_registered_get(net, index, devlink) { - if (strcmp(devlink->dev->bus->name, busname) == 0 && - strcmp(dev_name(devlink->dev), devname) == 0) - return devlink; - devlink_put(devlink); - } - - return ERR_PTR(-ENODEV); -} - -#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ - WARN_ON_ONCE(!(devlink_port)->registered) -#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ - WARN_ON_ONCE((devlink_port)->registered) -#define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \ - WARN_ON_ONCE(!(devlink_port)->initialized) - -static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, - unsigned int port_index) -{ - return xa_load(&devlink->ports, port_index); -} - -static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, - struct nlattr **attrs) -{ - if (attrs[DEVLINK_ATTR_PORT_INDEX]) { - u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); - struct devlink_port *devlink_port; - - devlink_port = devlink_port_get_by_index(devlink, port_index); - if (!devlink_port) - return ERR_PTR(-ENODEV); - return devlink_port; - } - return ERR_PTR(-EINVAL); -} - -static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - return devlink_port_get_from_attrs(devlink, info->attrs); -} - -static inline bool -devlink_rate_is_leaf(struct devlink_rate *devlink_rate) -{ - return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; -} - -static inline bool -devlink_rate_is_node(struct devlink_rate *devlink_rate) -{ - return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; -} - -static struct devlink_rate * -devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) -{ - struct devlink_rate *devlink_rate; - struct devlink_port *devlink_port; - - devlink_port = devlink_port_get_from_attrs(devlink, info->attrs); - if 
(IS_ERR(devlink_port)) - return ERR_CAST(devlink_port); - devlink_rate = devlink_port->devlink_rate; - return devlink_rate ?: ERR_PTR(-ENODEV); -} - -static struct devlink_rate * -devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) -{ - static struct devlink_rate *devlink_rate; - - list_for_each_entry(devlink_rate, &devlink->rate_list, list) { - if (devlink_rate_is_node(devlink_rate) && - !strcmp(node_name, devlink_rate->name)) - return devlink_rate; - } - return ERR_PTR(-ENODEV); -} - -static struct devlink_rate * -devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) -{ - const char *rate_node_name; - size_t len; - - if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME]) - return ERR_PTR(-EINVAL); - rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]); - len = strlen(rate_node_name); - /* Name cannot be empty or decimal number */ - if (!len || strspn(rate_node_name, "0123456789") == len) - return ERR_PTR(-EINVAL); - - return devlink_rate_node_get_by_name(devlink, rate_node_name); -} - -static struct devlink_rate * -devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) -{ - return devlink_rate_node_get_from_attrs(devlink, info->attrs); -} - -static struct devlink_rate * -devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) -{ - struct nlattr **attrs = info->attrs; - - if (attrs[DEVLINK_ATTR_PORT_INDEX]) - return devlink_rate_leaf_get_from_info(devlink, info); - else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME]) - return devlink_rate_node_get_from_info(devlink, info); - else - return ERR_PTR(-EINVAL); -} - -static struct devlink_linecard * -devlink_linecard_get_by_index(struct devlink *devlink, - unsigned int linecard_index) -{ - struct devlink_linecard *devlink_linecard; - - list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) { - if (devlink_linecard->index == linecard_index) - return devlink_linecard; - } - return NULL; -} - -static bool devlink_linecard_index_exists(struct devlink *devlink, - unsigned int linecard_index) -{ - return devlink_linecard_get_by_index(devlink, linecard_index); -} - -static struct devlink_linecard * -devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) -{ - if (attrs[DEVLINK_ATTR_LINECARD_INDEX]) { - u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]); - struct devlink_linecard *linecard; - - mutex_lock(&devlink->linecards_lock); - linecard = devlink_linecard_get_by_index(devlink, linecard_index); - if (linecard) - refcount_inc(&linecard->refcount); - mutex_unlock(&devlink->linecards_lock); - if (!linecard) - return ERR_PTR(-ENODEV); - return linecard; - } - return ERR_PTR(-EINVAL); -} - -static struct devlink_linecard * -devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) -{ - return devlink_linecard_get_from_attrs(devlink, info->attrs); -} - -static void devlink_linecard_put(struct devlink_linecard *linecard) -{ - if (refcount_dec_and_test(&linecard->refcount)) { - mutex_destroy(&linecard->state_lock); - kfree(linecard); - } -} - -struct devlink_sb { - struct list_head list; - unsigned int index; - u32 size; - u16 ingress_pools_count; - u16 egress_pools_count; - u16 ingress_tc_count; - u16 egress_tc_count; -}; - -static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb) -{ - return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count; -} - -static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink, - unsigned int sb_index) -{ - struct devlink_sb 
*devlink_sb; - - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - if (devlink_sb->index == sb_index) - return devlink_sb; - } - return NULL; -} - -static bool devlink_sb_index_exists(struct devlink *devlink, - unsigned int sb_index) -{ - return devlink_sb_get_by_index(devlink, sb_index); -} - -static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink, - struct nlattr **attrs) -{ - if (attrs[DEVLINK_ATTR_SB_INDEX]) { - u32 sb_index = nla_get_u32(attrs[DEVLINK_ATTR_SB_INDEX]); - struct devlink_sb *devlink_sb; - - devlink_sb = devlink_sb_get_by_index(devlink, sb_index); - if (!devlink_sb) - return ERR_PTR(-ENODEV); - return devlink_sb; - } - return ERR_PTR(-EINVAL); -} - -static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - return devlink_sb_get_from_attrs(devlink, info->attrs); -} - -static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb, - struct nlattr **attrs, - u16 *p_pool_index) -{ - u16 val; - - if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX]) - return -EINVAL; - - val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]); - if (val >= devlink_sb_pool_count(devlink_sb)) - return -EINVAL; - *p_pool_index = val; - return 0; -} - -static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb, - struct genl_info *info, - u16 *p_pool_index) -{ - return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs, - p_pool_index); -} - -static int -devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs, - enum devlink_sb_pool_type *p_pool_type) -{ - u8 val; - - if (!attrs[DEVLINK_ATTR_SB_POOL_TYPE]) - return -EINVAL; - - val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_TYPE]); - if (val != DEVLINK_SB_POOL_TYPE_INGRESS && - val != DEVLINK_SB_POOL_TYPE_EGRESS) - return -EINVAL; - *p_pool_type = val; - return 0; -} - -static int -devlink_sb_pool_type_get_from_info(struct genl_info *info, - enum devlink_sb_pool_type *p_pool_type) -{ - return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type); -} - -static int -devlink_sb_th_type_get_from_attrs(struct nlattr **attrs, - enum devlink_sb_threshold_type *p_th_type) -{ - u8 val; - - if (!attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]) - return -EINVAL; - - val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]); - if (val != DEVLINK_SB_THRESHOLD_TYPE_STATIC && - val != DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC) - return -EINVAL; - *p_th_type = val; - return 0; -} - -static int -devlink_sb_th_type_get_from_info(struct genl_info *info, - enum devlink_sb_threshold_type *p_th_type) -{ - return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type); -} - -static int -devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb, - struct nlattr **attrs, - enum devlink_sb_pool_type pool_type, - u16 *p_tc_index) -{ - u16 val; - - if (!attrs[DEVLINK_ATTR_SB_TC_INDEX]) - return -EINVAL; - - val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]); - if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS && - val >= devlink_sb->ingress_tc_count) - return -EINVAL; - if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS && - val >= devlink_sb->egress_tc_count) - return -EINVAL; - *p_tc_index = val; - return 0; -} - -static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, - u32 cap, bool is_enable) -{ - caps->selector |= cap; - if (is_enable) - caps->value |= cap; -} - -static int devlink_port_fn_roce_fill(const struct devlink_ops *ops, - struct devlink_port *devlink_port, - struct nla_bitfield32 *caps, - struct netlink_ext_ack *extack) -{ - bool is_enable; - int 
err; - - if (!ops->port_fn_roce_get) - return 0; - - err = ops->port_fn_roce_get(devlink_port, &is_enable, extack); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - - devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable); - return 0; -} - -static int devlink_port_fn_migratable_fill(const struct devlink_ops *ops, - struct devlink_port *devlink_port, - struct nla_bitfield32 *caps, - struct netlink_ext_ack *extack) -{ - bool is_enable; - int err; - - if (!ops->port_fn_migratable_get || - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) - return 0; - - err = ops->port_fn_migratable_get(devlink_port, &is_enable, extack); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - - devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable); - return 0; -} - -static int devlink_port_fn_caps_fill(const struct devlink_ops *ops, - struct devlink_port *devlink_port, - struct sk_buff *msg, - struct netlink_ext_ack *extack, - bool *msg_updated) -{ - struct nla_bitfield32 caps = {}; - int err; - - err = devlink_port_fn_roce_fill(ops, devlink_port, &caps, extack); - if (err) - return err; - - err = devlink_port_fn_migratable_fill(ops, devlink_port, &caps, extack); - if (err) - return err; - - if (!caps.selector) - return 0; - err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, - caps.selector); - if (err) - return err; - - *msg_updated = true; - return 0; -} - -static int -devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, - struct genl_info *info, - enum devlink_sb_pool_type pool_type, - u16 *p_tc_index) -{ - return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs, - pool_type, p_tc_index); -} - -struct devlink_region { - struct devlink *devlink; - struct devlink_port *port; - struct list_head list; - union { - const struct devlink_region_ops *ops; - const struct devlink_port_region_ops *port_ops; - }; - struct mutex snapshot_lock; /* protects snapshot_list, - * max_snapshots and cur_snapshots - * consistency. 
- */ - struct list_head snapshot_list; - u32 max_snapshots; - u32 cur_snapshots; - u64 size; -}; - -struct devlink_snapshot { - struct list_head list; - struct devlink_region *region; - u8 *data; - u32 id; -}; - -static struct devlink_region * -devlink_region_get_by_name(struct devlink *devlink, const char *region_name) -{ - struct devlink_region *region; - - list_for_each_entry(region, &devlink->region_list, list) - if (!strcmp(region->ops->name, region_name)) - return region; - - return NULL; -} - -static struct devlink_region * -devlink_port_region_get_by_name(struct devlink_port *port, - const char *region_name) -{ - struct devlink_region *region; - - list_for_each_entry(region, &port->region_list, list) - if (!strcmp(region->ops->name, region_name)) - return region; - - return NULL; -} - -static struct devlink_snapshot * -devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) -{ - struct devlink_snapshot *snapshot; - - list_for_each_entry(snapshot, ®ion->snapshot_list, list) - if (snapshot->id == id) - return snapshot; - - return NULL; -} - -#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) -#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) -#define DEVLINK_NL_FLAG_NEED_RATE BIT(2) -#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) -#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) - -static int devlink_nl_pre_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) -{ - struct devlink_linecard *linecard; - struct devlink_port *devlink_port; - struct devlink *devlink; - int err; - - devlink = devlink_get_from_attrs(genl_info_net(info), info->attrs); - if (IS_ERR(devlink)) - return PTR_ERR(devlink); - devl_lock(devlink); - info->user_ptr[0] = devlink; - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { - devlink_port = devlink_port_get_from_info(devlink, info); - if (IS_ERR(devlink_port)) { - err = PTR_ERR(devlink_port); - goto unlock; - } - info->user_ptr[1] = devlink_port; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { - devlink_port = devlink_port_get_from_info(devlink, info); - if (!IS_ERR(devlink_port)) - info->user_ptr[1] = devlink_port; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { - struct devlink_rate *devlink_rate; - - devlink_rate = devlink_rate_get_from_info(devlink, info); - if (IS_ERR(devlink_rate)) { - err = PTR_ERR(devlink_rate); - goto unlock; - } - info->user_ptr[1] = devlink_rate; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) { - struct devlink_rate *rate_node; - - rate_node = devlink_rate_node_get_from_info(devlink, info); - if (IS_ERR(rate_node)) { - err = PTR_ERR(rate_node); - goto unlock; - } - info->user_ptr[1] = rate_node; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { - linecard = devlink_linecard_get_from_info(devlink, info); - if (IS_ERR(linecard)) { - err = PTR_ERR(linecard); - goto unlock; - } - info->user_ptr[1] = linecard; - } - return 0; - -unlock: - devl_unlock(devlink); - devlink_put(devlink); - return err; -} - -static void devlink_nl_post_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) -{ - struct devlink_linecard *linecard; - struct devlink *devlink; - - devlink = info->user_ptr[0]; - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { - linecard = info->user_ptr[1]; - devlink_linecard_put(linecard); - } - devl_unlock(devlink); - devlink_put(devlink); -} - -static struct genl_family devlink_nl_family; - -enum devlink_multicast_groups { - DEVLINK_MCGRP_CONFIG, -}; - -static const 
struct genl_multicast_group devlink_nl_mcgrps[] = { - [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, -}; - -static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) -{ - if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name)) - return -EMSGSIZE; - if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev))) - return -EMSGSIZE; - return 0; -} - -static int devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink) -{ - struct nlattr *nested_attr; - - nested_attr = nla_nest_start(msg, DEVLINK_ATTR_NESTED_DEVLINK); - if (!nested_attr) - return -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - nla_nest_end(msg, nested_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(msg, nested_attr); - return -EMSGSIZE; -} - -int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) -{ - if (devlink_nl_put_handle(msg, devlink_port->devlink)) - return -EMSGSIZE; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) - return -EMSGSIZE; - return 0; -} - -size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) -{ - struct devlink *devlink = devlink_port->devlink; - - return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ - + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ - + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ -} - -struct devlink_reload_combination { - enum devlink_reload_action action; - enum devlink_reload_limit limit; -}; - -static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = { - { - /* can't reinitialize driver with no down time */ - .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - .limit = DEVLINK_RELOAD_LIMIT_NO_RESET, - }, -}; - -static bool -devlink_reload_combination_is_invalid(enum devlink_reload_action action, - enum devlink_reload_limit limit) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) - if (devlink_reload_invalid_combinations[i].action == action && - devlink_reload_invalid_combinations[i].limit == limit) - return true; - return false; -} - -static bool -devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action) -{ - return test_bit(action, &devlink->ops->reload_actions); -} - -static bool -devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit) -{ - return test_bit(limit, &devlink->ops->reload_limits); -} - -static int devlink_reload_stat_put(struct sk_buff *msg, - enum devlink_reload_limit limit, u32 value) -{ - struct nlattr *reload_stats_entry; - - reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY); - if (!reload_stats_entry) - return -EMSGSIZE; - - if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || - nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value)) - goto nla_put_failure; - nla_nest_end(msg, reload_stats_entry); - return 0; - -nla_put_failure: - nla_nest_cancel(msg, reload_stats_entry); - return -EMSGSIZE; -} - -static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote) -{ - struct nlattr *reload_stats_attr, *act_info, *act_stats; - int i, j, stat_idx; - u32 value; - - if (!is_remote) - reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); - else - reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS); - - if (!reload_stats_attr) - return -EMSGSIZE; - - for (i = 0; i <= 
DEVLINK_RELOAD_ACTION_MAX; i++) { - if ((!is_remote && - !devlink_reload_action_is_supported(devlink, i)) || - i == DEVLINK_RELOAD_ACTION_UNSPEC) - continue; - act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO); - if (!act_info) - goto nla_put_failure; - - if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i)) - goto action_info_nest_cancel; - act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS); - if (!act_stats) - goto action_info_nest_cancel; - - for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { - /* Remote stats are shown even if not locally supported. - * Stats of actions with unspecified limit are shown - * though drivers don't need to register unspecified - * limit. - */ - if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && - !devlink_reload_limit_is_supported(devlink, j)) || - devlink_reload_combination_is_invalid(i, j)) - continue; - - stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i; - if (!is_remote) - value = devlink->stats.reload_stats[stat_idx]; - else - value = devlink->stats.remote_reload_stats[stat_idx]; - if (devlink_reload_stat_put(msg, j, value)) - goto action_stats_nest_cancel; - } - nla_nest_end(msg, act_stats); - nla_nest_end(msg, act_info); - } - nla_nest_end(msg, reload_stats_attr); - return 0; - -action_stats_nest_cancel: - nla_nest_cancel(msg, act_stats); -action_info_nest_cancel: - nla_nest_cancel(msg, act_info); -nla_put_failure: - nla_nest_cancel(msg, reload_stats_attr); - return -EMSGSIZE; -} - -static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - struct nlattr *dev_stats; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed)) - goto nla_put_failure; - - dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS); - if (!dev_stats) - goto nla_put_failure; - - if (devlink_reload_stats_put(msg, devlink, false)) - goto dev_stats_nest_cancel; - if (devlink_reload_stats_put(msg, devlink, true)) - goto dev_stats_nest_cancel; - - nla_nest_end(msg, dev_stats); - genlmsg_end(msg, hdr); - return 0; - -dev_stats_nest_cancel: - nla_nest_cancel(msg, dev_stats); -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) -{ - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); - WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static int devlink_nl_port_attrs_put(struct sk_buff *msg, - struct devlink_port *devlink_port) -{ - struct devlink_port_attrs *attrs = &devlink_port->attrs; - - if (!devlink_port->attrs_set) - return 0; - if (attrs->lanes) { - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) - return -EMSGSIZE; - } - if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) - return -EMSGSIZE; - if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) - return -EMSGSIZE; - switch (devlink_port->attrs.flavour) { - case DEVLINK_PORT_FLAVOUR_PCI_PF: - if (nla_put_u32(msg, 
DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, - attrs->pci_pf.controller) || - nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) - return -EMSGSIZE; - if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) - return -EMSGSIZE; - break; - case DEVLINK_PORT_FLAVOUR_PCI_VF: - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, - attrs->pci_vf.controller) || - nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || - nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) - return -EMSGSIZE; - if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) - return -EMSGSIZE; - break; - case DEVLINK_PORT_FLAVOUR_PCI_SF: - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, - attrs->pci_sf.controller) || - nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, - attrs->pci_sf.pf) || - nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, - attrs->pci_sf.sf)) - return -EMSGSIZE; - break; - case DEVLINK_PORT_FLAVOUR_PHYSICAL: - case DEVLINK_PORT_FLAVOUR_CPU: - case DEVLINK_PORT_FLAVOUR_DSA: - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, - attrs->phys.port_number)) - return -EMSGSIZE; - if (!attrs->split) - return 0; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, - attrs->phys.port_number)) - return -EMSGSIZE; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, - attrs->phys.split_subport_number)) - return -EMSGSIZE; - break; - default: - break; - } - return 0; -} - -static int devlink_port_fn_hw_addr_fill(const struct devlink_ops *ops, - struct devlink_port *port, - struct sk_buff *msg, - struct netlink_ext_ack *extack, - bool *msg_updated) -{ - u8 hw_addr[MAX_ADDR_LEN]; - int hw_addr_len; - int err; - - if (!ops->port_function_hw_addr_get) - return 0; - - err = ops->port_function_hw_addr_get(port, hw_addr, &hw_addr_len, - extack); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); - if (err) - return err; - *msg_updated = true; - return 0; -} - -static int devlink_nl_rate_fill(struct sk_buff *msg, - struct devlink_rate *devlink_rate, - enum devlink_command cmd, u32 portid, u32 seq, - int flags, struct netlink_ext_ack *extack) -{ - struct devlink *devlink = devlink_rate->devlink; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type)) - goto nla_put_failure; - - if (devlink_rate_is_leaf(devlink_rate)) { - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, - devlink_rate->devlink_port->index)) - goto nla_put_failure; - } else if (devlink_rate_is_node(devlink_rate)) { - if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME, - devlink_rate->name)) - goto nla_put_failure; - } - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE, - devlink_rate->tx_share, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_MAX, - devlink_rate->tx_max, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, - devlink_rate->tx_priority)) - goto nla_put_failure; - - if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT, - devlink_rate->tx_weight)) - goto nla_put_failure; - - if (devlink_rate->parent) - if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, - devlink_rate->parent->name)) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - 
-nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static bool -devlink_port_fn_state_valid(enum devlink_port_fn_state state) -{ - return state == DEVLINK_PORT_FN_STATE_INACTIVE || - state == DEVLINK_PORT_FN_STATE_ACTIVE; -} - -static bool -devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) -{ - return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED || - opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; -} - -static int devlink_port_fn_state_fill(const struct devlink_ops *ops, - struct devlink_port *port, - struct sk_buff *msg, - struct netlink_ext_ack *extack, - bool *msg_updated) -{ - enum devlink_port_fn_opstate opstate; - enum devlink_port_fn_state state; - int err; - - if (!ops->port_fn_state_get) - return 0; - - err = ops->port_fn_state_get(port, &state, &opstate, extack); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - if (!devlink_port_fn_state_valid(state)) { - WARN_ON_ONCE(1); - NL_SET_ERR_MSG_MOD(extack, "Invalid state read from driver"); - return -EINVAL; - } - if (!devlink_port_fn_opstate_valid(opstate)) { - WARN_ON_ONCE(1); - NL_SET_ERR_MSG_MOD(extack, - "Invalid operational state read from driver"); - return -EINVAL; - } - if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || - nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate)) - return -EMSGSIZE; - *msg_updated = true; - return 0; -} - -static int -devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, - struct netlink_ext_ack *extack) -{ - const struct devlink_ops *ops = devlink_port->devlink->ops; - - return ops->port_fn_migratable_set(devlink_port, enable, extack); -} - -static int -devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, - struct netlink_ext_ack *extack) -{ - const struct devlink_ops *ops = devlink_port->devlink->ops; - - return ops->port_fn_roce_set(devlink_port, enable, extack); -} - -static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, - const struct nlattr *attr, - struct netlink_ext_ack *extack) -{ - struct nla_bitfield32 caps; - u32 caps_value; - int err; - - caps = nla_get_bitfield32(attr); - caps_value = caps.value & caps.selector; - if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) { - err = devlink_port_fn_roce_set(devlink_port, - caps_value & DEVLINK_PORT_FN_CAP_ROCE, - extack); - if (err) - return err; - } - if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { - err = devlink_port_fn_mig_set(devlink_port, caps_value & - DEVLINK_PORT_FN_CAP_MIGRATABLE, - extack); - if (err) - return err; - } - return 0; -} - -static int -devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, - struct netlink_ext_ack *extack) -{ - const struct devlink_ops *ops; - struct nlattr *function_attr; - bool msg_updated = false; - int err; - - function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); - if (!function_attr) - return -EMSGSIZE; - - ops = port->devlink->ops; - err = devlink_port_fn_hw_addr_fill(ops, port, msg, extack, - &msg_updated); - if (err) - goto out; - err = devlink_port_fn_caps_fill(ops, port, msg, extack, - &msg_updated); - if (err) - goto out; - err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated); -out: - if (err || !msg_updated) - nla_nest_cancel(msg, function_attr); - else - nla_nest_end(msg, function_attr); - return err; -} - -static int devlink_nl_port_fill(struct sk_buff *msg, - struct devlink_port *devlink_port, - enum devlink_command cmd, u32 portid, u32 seq, - int flags, struct netlink_ext_ack *extack) -{ - struct devlink 
*devlink = devlink_port->devlink; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) - goto nla_put_failure; - - spin_lock_bh(&devlink_port->type_lock); - if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) - goto nla_put_failure_type_locked; - if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && - nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, - devlink_port->desired_type)) - goto nla_put_failure_type_locked; - if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { - if (devlink_port->type_eth.netdev && - (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, - devlink_port->type_eth.ifindex) || - nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, - devlink_port->type_eth.ifname))) - goto nla_put_failure_type_locked; - } - if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { - struct ib_device *ibdev = devlink_port->type_ib.ibdev; - - if (ibdev && - nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, - ibdev->name)) - goto nla_put_failure_type_locked; - } - spin_unlock_bh(&devlink_port->type_lock); - if (devlink_nl_port_attrs_put(msg, devlink_port)) - goto nla_put_failure; - if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) - goto nla_put_failure; - if (devlink_port->linecard && - nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, - devlink_port->linecard->index)) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure_type_locked: - spin_unlock_bh(&devlink_port->type_lock); -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_port_notify(struct devlink_port *devlink_port, - enum devlink_command cmd) -{ - struct devlink *devlink = devlink_port->devlink; - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); - - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, - 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static void devlink_rate_notify(struct devlink_rate *devlink_rate, - enum devlink_command cmd) -{ - struct devlink *devlink = devlink_rate->devlink; - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); - - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, - 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink_rate *devlink_rate; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(devlink_rate, &devlink->rate_list, list) { - enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; - u32 id = NETLINK_CB(cb->skb).portid; - - if (idx < start) { - idx++; - 
continue; - } - err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, NULL); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; - } - devl_unlock(devlink); - devlink_put(devlink); - } -out: - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - -static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_rate *devlink_rate = info->user_ptr[1]; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW, - info->snd_portid, info->snd_seq, 0, - info->extack); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static bool -devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, - struct devlink_rate *parent) -{ - while (parent) { - if (parent == devlink_rate) - return true; - parent = parent->parent; - } - return false; -} - -static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (idx < start) { - idx++; - devlink_put(devlink); - continue; - } - - devl_lock(devlink); - err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); - devl_unlock(devlink); - devlink_put(devlink); - - if (err) - goto out; - idx++; - } -out: - cb->args[0] = idx; - return msg->len; -} - -static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, - info->snd_portid, info->snd_seq, 0, - info->extack); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - struct devlink_port *devlink_port; - unsigned long index, port_index; - int start = cb->args[0]; - int idx = 0; - int err; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - xa_for_each(&devlink->ports, port_index, devlink_port) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_port_fill(msg, devlink_port, - DEVLINK_CMD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->extack); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; - } - devl_unlock(devlink); - devlink_put(devlink); - } -out: - cb->args[0] = idx; - return msg->len; -} - -static int devlink_port_type_set(struct devlink_port *devlink_port, - enum devlink_port_type port_type) - -{ - int err; - - if 
(!devlink_port->devlink->ops->port_type_set) - return -EOPNOTSUPP; - - if (port_type == devlink_port->type) - return 0; - - err = devlink_port->devlink->ops->port_type_set(devlink_port, - port_type); - if (err) - return err; - - devlink_port->desired_type = port_type; - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); - return 0; -} - -static int devlink_port_function_hw_addr_set(struct devlink_port *port, - const struct nlattr *attr, - struct netlink_ext_ack *extack) -{ - const struct devlink_ops *ops = port->devlink->ops; - const u8 *hw_addr; - int hw_addr_len; - - hw_addr = nla_data(attr); - hw_addr_len = nla_len(attr); - if (hw_addr_len > MAX_ADDR_LEN) { - NL_SET_ERR_MSG_MOD(extack, "Port function hardware address too long"); - return -EINVAL; - } - if (port->type == DEVLINK_PORT_TYPE_ETH) { - if (hw_addr_len != ETH_ALEN) { - NL_SET_ERR_MSG_MOD(extack, "Address must be 6 bytes for Ethernet device"); - return -EINVAL; - } - if (!is_unicast_ether_addr(hw_addr)) { - NL_SET_ERR_MSG_MOD(extack, "Non-unicast hardware address unsupported"); - return -EINVAL; - } - } - - return ops->port_function_hw_addr_set(port, hw_addr, hw_addr_len, - extack); -} - -static int devlink_port_fn_state_set(struct devlink_port *port, - const struct nlattr *attr, - struct netlink_ext_ack *extack) -{ - enum devlink_port_fn_state state; - const struct devlink_ops *ops; - - state = nla_get_u8(attr); - ops = port->devlink->ops; - return ops->port_fn_state_set(port, state, extack); -} - -static int devlink_port_function_validate(struct devlink_port *devlink_port, - struct nlattr **tb, - struct netlink_ext_ack *extack) -{ - const struct devlink_ops *ops = devlink_port->devlink->ops; - struct nlattr *attr; - - if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && - !ops->port_function_hw_addr_set) { - NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], - "Port doesn't support function attributes"); - return -EOPNOTSUPP; - } - if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) { - NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], - "Function does not support state setting"); - return -EOPNOTSUPP; - } - attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; - if (attr) { - struct nla_bitfield32 caps; - - caps = nla_get_bitfield32(attr); - if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE && - !ops->port_fn_roce_set) { - NL_SET_ERR_MSG_ATTR(extack, attr, - "Port doesn't support RoCE function attribute"); - return -EOPNOTSUPP; - } - if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { - if (!ops->port_fn_migratable_set) { - NL_SET_ERR_MSG_ATTR(extack, attr, - "Port doesn't support migratable function attribute"); - return -EOPNOTSUPP; - } - if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { - NL_SET_ERR_MSG_ATTR(extack, attr, - "migratable function attribute supported for VFs only"); - return -EOPNOTSUPP; - } - } - } - return 0; -} - -static int devlink_port_function_set(struct devlink_port *port, - const struct nlattr *attr, - struct netlink_ext_ack *extack) -{ - struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; - int err; - - err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr, - devlink_function_nl_policy, extack); - if (err < 0) { - NL_SET_ERR_MSG_MOD(extack, "Fail to parse port function attributes"); - return err; - } - - err = devlink_port_function_validate(port, tb, extack); - if (err) - return err; - - attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; - if (attr) { - err = devlink_port_function_hw_addr_set(port, attr, extack); - if (err) - return err; - } - - 
attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; - if (attr) { - err = devlink_port_fn_caps_set(port, attr, extack); - if (err) - return err; - } - - /* Keep this as the last function attribute set, so that when - * multiple port function attributes are set along with state, - * Those can be applied first before activating the state. - */ - attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; - if (attr) - err = devlink_port_fn_state_set(port, attr, extack); - - if (!err) - devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); - return err; -} - -static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - int err; - - if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { - enum devlink_port_type port_type; - - port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); - err = devlink_port_type_set(devlink_port, port_type); - if (err) - return err; - } - - if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) { - struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; - struct netlink_ext_ack *extack = info->extack; - - err = devlink_port_function_set(devlink_port, attr, extack); - if (err) - return err; - } - - return 0; -} - -static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = info->user_ptr[0]; - u32 count; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT)) - return -EINVAL; - if (!devlink->ops->port_split) - return -EOPNOTSUPP; - - count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); - - if (!devlink_port->attrs.splittable) { - /* Split ports cannot be split. */ - if (devlink_port->attrs.split) - NL_SET_ERR_MSG_MOD(info->extack, "Port cannot be split further"); - else - NL_SET_ERR_MSG_MOD(info->extack, "Port cannot be split"); - return -EINVAL; - } - - if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) { - NL_SET_ERR_MSG_MOD(info->extack, "Invalid split count"); - return -EINVAL; - } - - return devlink->ops->port_split(devlink, devlink_port, count, - info->extack); -} - -static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = info->user_ptr[0]; - - if (!devlink->ops->port_unsplit) - return -EOPNOTSUPP; - return devlink->ops->port_unsplit(devlink, devlink_port, info->extack); -} - -static int devlink_port_new_notify(struct devlink *devlink, - unsigned int port_index, - struct genl_info *info) -{ - struct devlink_port *devlink_port; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - lockdep_assert_held(&devlink->lock); - devlink_port = devlink_port_get_by_index(devlink, port_index); - if (!devlink_port) { - err = -ENODEV; - goto out; - } - - err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, - info->snd_portid, info->snd_seq, 0, NULL); - if (err) - goto out; - - return genlmsg_reply(msg, info); - -out: - nlmsg_free(msg); - return err; -} - -static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink_port_new_attrs new_attrs = {}; - struct devlink *devlink = info->user_ptr[0]; - unsigned int new_port_index; - int err; - - if (!devlink->ops->port_new || !devlink->ops->port_del) - return -EOPNOTSUPP; - - if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || - 
!info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { - NL_SET_ERR_MSG_MOD(extack, "Port flavour or PCI PF are not specified"); - return -EINVAL; - } - new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); - new_attrs.pfnum = - nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]); - - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { - /* Port index of the new port being created by driver. */ - new_attrs.port_index = - nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - new_attrs.port_index_valid = true; - } - if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) { - new_attrs.controller = - nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]); - new_attrs.controller_valid = true; - } - if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF && - info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { - new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); - new_attrs.sfnum_valid = true; - } - - err = devlink->ops->port_new(devlink, &new_attrs, extack, - &new_port_index); - if (err) - return err; - - err = devlink_port_new_notify(devlink, new_port_index, info); - if (err && err != -ENODEV) { - /* Fail to send the response; destroy newly created port. */ - devlink->ops->port_del(devlink, new_port_index, extack); - } - return err; -} - -static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - unsigned int port_index; - - if (!devlink->ops->port_del) - return -EOPNOTSUPP; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_INDEX)) { - NL_SET_ERR_MSG_MOD(extack, "Port index is not specified"); - return -EINVAL; - } - port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - - return devlink->ops->port_del(devlink, port_index, extack); -} - -static int -devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, - struct genl_info *info, - struct nlattr *nla_parent) -{ - struct devlink *devlink = devlink_rate->devlink; - const char *parent_name = nla_data(nla_parent); - const struct devlink_ops *ops = devlink->ops; - size_t len = strlen(parent_name); - struct devlink_rate *parent; - int err = -EOPNOTSUPP; - - parent = devlink_rate->parent; - - if (parent && !len) { - if (devlink_rate_is_leaf(devlink_rate)) - err = ops->rate_leaf_parent_set(devlink_rate, NULL, - devlink_rate->priv, NULL, - info->extack); - else if (devlink_rate_is_node(devlink_rate)) - err = ops->rate_node_parent_set(devlink_rate, NULL, - devlink_rate->priv, NULL, - info->extack); - if (err) - return err; - - refcount_dec(&parent->refcnt); - devlink_rate->parent = NULL; - } else if (len) { - parent = devlink_rate_node_get_by_name(devlink, parent_name); - if (IS_ERR(parent)) - return -ENODEV; - - if (parent == devlink_rate) { - NL_SET_ERR_MSG_MOD(info->extack, "Parent to self is not allowed"); - return -EINVAL; - } - - if (devlink_rate_is_node(devlink_rate) && - devlink_rate_is_parent_node(devlink_rate, parent->parent)) { - NL_SET_ERR_MSG_MOD(info->extack, "Node is already a parent of parent node."); - return -EEXIST; - } - - if (devlink_rate_is_leaf(devlink_rate)) - err = ops->rate_leaf_parent_set(devlink_rate, parent, - devlink_rate->priv, parent->priv, - info->extack); - else if (devlink_rate_is_node(devlink_rate)) - err = ops->rate_node_parent_set(devlink_rate, parent, - devlink_rate->priv, parent->priv, - info->extack); - if (err) - return err; - - if (devlink_rate->parent) - /* we're reassigning to other parent in this case */ - 
refcount_dec(&devlink_rate->parent->refcnt); - - refcount_inc(&parent->refcnt); - devlink_rate->parent = parent; - } - - return 0; -} - -static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, - const struct devlink_ops *ops, - struct genl_info *info) -{ - struct nlattr *nla_parent, **attrs = info->attrs; - int err = -EOPNOTSUPP; - u32 priority; - u32 weight; - u64 rate; - - if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { - rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]); - if (devlink_rate_is_leaf(devlink_rate)) - err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, - rate, info->extack); - else if (devlink_rate_is_node(devlink_rate)) - err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv, - rate, info->extack); - if (err) - return err; - devlink_rate->tx_share = rate; - } - - if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) { - rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]); - if (devlink_rate_is_leaf(devlink_rate)) - err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, - rate, info->extack); - else if (devlink_rate_is_node(devlink_rate)) - err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv, - rate, info->extack); - if (err) - return err; - devlink_rate->tx_max = rate; - } - - if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) { - priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]); - if (devlink_rate_is_leaf(devlink_rate)) - err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv, - priority, info->extack); - else if (devlink_rate_is_node(devlink_rate)) - err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv, - priority, info->extack); - - if (err) - return err; - devlink_rate->tx_priority = priority; - } - - if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) { - weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]); - if (devlink_rate_is_leaf(devlink_rate)) - err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv, - weight, info->extack); - else if (devlink_rate_is_node(devlink_rate)) - err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv, - weight, info->extack); - - if (err) - return err; - devlink_rate->tx_weight = weight; - } - - nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; - if (nla_parent) { - err = devlink_nl_rate_parent_node_set(devlink_rate, info, - nla_parent); - if (err) - return err; - } - - return 0; -} - -static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, - struct genl_info *info, - enum devlink_rate_type type) -{ - struct nlattr **attrs = info->attrs; - - if (type == DEVLINK_RATE_TYPE_LEAF) { - if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { - NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the leafs"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { - NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the leafs"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && - !ops->rate_leaf_parent_set) { - NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { - NL_SET_ERR_MSG_ATTR(info->extack, - attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], - "TX priority set isn't supported for the leafs"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) { - NL_SET_ERR_MSG_ATTR(info->extack, - attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], - "TX weight set isn't supported for the 
leafs"); - return false; - } - } else if (type == DEVLINK_RATE_TYPE_NODE) { - if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { - NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { - NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the nodes"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && - !ops->rate_node_parent_set) { - NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { - NL_SET_ERR_MSG_ATTR(info->extack, - attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], - "TX priority set isn't supported for the nodes"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) { - NL_SET_ERR_MSG_ATTR(info->extack, - attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], - "TX weight set isn't supported for the nodes"); - return false; - } - } else { - WARN(1, "Unknown type of rate object"); - return false; - } - - return true; -} - -static int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_rate *devlink_rate = info->user_ptr[1]; - struct devlink *devlink = devlink_rate->devlink; - const struct devlink_ops *ops = devlink->ops; - int err; - - if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) - return -EOPNOTSUPP; - - err = devlink_nl_rate_set(devlink_rate, ops, info); - - if (!err) - devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); - return err; -} - -static int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_rate *rate_node; - const struct devlink_ops *ops; - int err; - - ops = devlink->ops; - if (!ops || !ops->rate_node_new || !ops->rate_node_del) { - NL_SET_ERR_MSG_MOD(info->extack, "Rate nodes aren't supported"); - return -EOPNOTSUPP; - } - - if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE)) - return -EOPNOTSUPP; - - rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs); - if (!IS_ERR(rate_node)) - return -EEXIST; - else if (rate_node == ERR_PTR(-EINVAL)) - return -EINVAL; - - rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); - if (!rate_node) - return -ENOMEM; - - rate_node->devlink = devlink; - rate_node->type = DEVLINK_RATE_TYPE_NODE; - rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL); - if (!rate_node->name) { - err = -ENOMEM; - goto err_strdup; - } - - err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack); - if (err) - goto err_node_new; - - err = devlink_nl_rate_set(rate_node, ops, info); - if (err) - goto err_rate_set; - - refcount_set(&rate_node->refcnt, 1); - list_add(&rate_node->list, &devlink->rate_list); - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); - return 0; - -err_rate_set: - ops->rate_node_del(rate_node, rate_node->priv, info->extack); -err_node_new: - kfree(rate_node->name); -err_strdup: - kfree(rate_node); - return err; -} - -static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_rate *rate_node = info->user_ptr[1]; - struct devlink *devlink = rate_node->devlink; - const struct devlink_ops *ops = devlink->ops; - int err; - - if (refcount_read(&rate_node->refcnt) > 1) { - NL_SET_ERR_MSG_MOD(info->extack, "Node has children. 
Cannot delete node."); - return -EBUSY; - } - - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); - err = ops->rate_node_del(rate_node, rate_node->priv, info->extack); - if (rate_node->parent) - refcount_dec(&rate_node->parent->refcnt); - list_del(&rate_node->list); - kfree(rate_node->name); - kfree(rate_node); - return err; -} - -struct devlink_linecard_type { - const char *type; - const void *priv; -}; - -static int devlink_nl_linecard_fill(struct sk_buff *msg, - struct devlink *devlink, - struct devlink_linecard *linecard, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, - struct netlink_ext_ack *extack) -{ - struct devlink_linecard_type *linecard_type; - struct nlattr *attr; - void *hdr; - int i; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) - goto nla_put_failure; - if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state)) - goto nla_put_failure; - if (linecard->type && - nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type)) - goto nla_put_failure; - - if (linecard->types_count) { - attr = nla_nest_start(msg, - DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES); - if (!attr) - goto nla_put_failure; - for (i = 0; i < linecard->types_count; i++) { - linecard_type = &linecard->types[i]; - if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, - linecard_type->type)) { - nla_nest_cancel(msg, attr); - goto nla_put_failure; - } - } - nla_nest_end(msg, attr); - } - - if (linecard->nested_devlink && - devlink_nl_put_nested_handle(msg, linecard->nested_devlink)) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_linecard_notify(struct devlink_linecard *linecard, - enum devlink_command cmd) -{ - struct devlink *devlink = linecard->devlink; - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW && - cmd != DEVLINK_CMD_LINECARD_DEL); - - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0, - NULL); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_linecard *linecard = info->user_ptr[1]; - struct devlink *devlink = linecard->devlink; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_NEW, - info->snd_portid, info->snd_seq, 0, - info->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink_linecard *linecard; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - mutex_lock(&devlink->linecards_lock); - list_for_each_entry(linecard, &devlink->linecard_list, 
list) { - if (idx < start) { - idx++; - continue; - } - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, - cb->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - mutex_unlock(&devlink->linecards_lock); - devlink_put(devlink); - goto out; - } - idx++; - } - mutex_unlock(&devlink->linecards_lock); - devlink_put(devlink); - } -out: - cb->args[0] = idx; - return msg->len; -} - -static struct devlink_linecard_type * -devlink_linecard_type_lookup(struct devlink_linecard *linecard, - const char *type) -{ - struct devlink_linecard_type *linecard_type; - int i; - - for (i = 0; i < linecard->types_count; i++) { - linecard_type = &linecard->types[i]; - if (!strcmp(type, linecard_type->type)) - return linecard_type; - } - return NULL; -} - -static int devlink_linecard_type_set(struct devlink_linecard *linecard, - const char *type, - struct netlink_ext_ack *extack) -{ - const struct devlink_linecard_ops *ops = linecard->ops; - struct devlink_linecard_type *linecard_type; - int err; - - mutex_lock(&linecard->state_lock); - if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { - NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned"); - err = -EBUSY; - goto out; - } - if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { - NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned"); - err = -EBUSY; - goto out; - } - - linecard_type = devlink_linecard_type_lookup(linecard, type); - if (!linecard_type) { - NL_SET_ERR_MSG_MOD(extack, "Unsupported line card type provided"); - err = -EINVAL; - goto out; - } - - if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED && - linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { - NL_SET_ERR_MSG_MOD(extack, "Line card already provisioned"); - err = -EBUSY; - /* Check if the line card is provisioned in the same - * way the user asks. In case it is, make the operation - * to return success. - */ - if (ops->same_provision && - ops->same_provision(linecard, linecard->priv, - linecard_type->type, - linecard_type->priv)) - err = 0; - goto out; - } - - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING; - linecard->type = linecard_type->type; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - err = ops->provision(linecard, linecard->priv, linecard_type->type, - linecard_type->priv, extack); - if (err) { - /* Provisioning failed. Assume the linecard is unprovisioned - * for future operations. 
- */ - mutex_lock(&linecard->state_lock); - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - } - return err; - -out: - mutex_unlock(&linecard->state_lock); - return err; -} - -static int devlink_linecard_type_unset(struct devlink_linecard *linecard, - struct netlink_ext_ack *extack) -{ - int err; - - mutex_lock(&linecard->state_lock); - if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { - NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned"); - err = -EBUSY; - goto out; - } - if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { - NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned"); - err = -EBUSY; - goto out; - } - if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - err = 0; - goto out; - } - - if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { - NL_SET_ERR_MSG_MOD(extack, "Line card is not provisioned"); - err = 0; - goto out; - } - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - err = linecard->ops->unprovision(linecard, linecard->priv, - extack); - if (err) { - /* Unprovisioning failed. Assume the linecard is unprovisioned - * for future operations. - */ - mutex_lock(&linecard->state_lock); - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - } - return err; - -out: - mutex_unlock(&linecard->state_lock); - return err; -} - -static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_linecard *linecard = info->user_ptr[1]; - struct netlink_ext_ack *extack = info->extack; - int err; - - if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { - const char *type; - - type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]); - if (strcmp(type, "")) { - err = devlink_linecard_type_set(linecard, type, extack); - if (err) - return err; - } else { - err = devlink_linecard_type_unset(linecard, extack); - if (err) - return err; - } - } - - return 0; -} - -static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, - struct devlink_sb *devlink_sb, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT, - devlink_sb->ingress_pools_count)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT, - devlink_sb->egress_pools_count)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT, - devlink_sb->ingress_tc_count)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT, - devlink_sb->egress_tc_count)) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - 
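Editor's note: the hunks above and below delete dozens of devlink_nl_*_fill() helpers that all follow the same netlink construction idiom: open a generic-netlink header, emit flat and nested attributes, and on any failure cancel whatever was partially written so a half-built message is never sent. The following is a minimal sketch of that idiom for reference only; it is not part of the patch. example_nl_family and the EXAMPLE_ATTR_* ids are hypothetical, while the genetlink/nlattr calls themselves (genlmsg_put(), nla_put_u32(), nla_nest_start(), nla_nest_cancel(), genlmsg_cancel()) are real kernel APIs.

#include <net/genetlink.h>

/* Hypothetical family and attribute ids, for illustration only. */
static struct genl_family example_nl_family;
#define EXAMPLE_ATTR_INDEX	1
#define EXAMPLE_ATTR_NESTED	2
#define EXAMPLE_ATTR_VALUE	3

static int example_nl_fill(struct sk_buff *msg, u32 portid, u32 seq,
			   int flags, u8 cmd, u32 index, u32 value)
{
	struct nlattr *nest;
	void *hdr;

	/* Reserve the genetlink header; fails when the skb is full. */
	hdr = genlmsg_put(msg, portid, seq, &example_nl_family, flags, cmd);
	if (!hdr)
		return -EMSGSIZE;

	/* Every nla_put_*() may run out of skb tailroom, hence the checks. */
	if (nla_put_u32(msg, EXAMPLE_ATTR_INDEX, index))
		goto nla_put_failure;

	nest = nla_nest_start(msg, EXAMPLE_ATTR_NESTED);
	if (!nest)
		goto nla_put_failure;
	if (nla_put_u32(msg, EXAMPLE_ATTR_VALUE, value))
		goto nest_cancel;
	nla_nest_end(msg, nest);

	genlmsg_end(msg, hdr);
	return 0;

nest_cancel:
	nla_nest_cancel(msg, nest);	/* drop the partially built nest */
nla_put_failure:
	genlmsg_cancel(msg, hdr);	/* trim the message back to empty */
	return -EMSGSIZE;
}

Note how the removed dumpit handlers treat -EMSGSIZE from such fill functions as "dump buffer full" rather than a hard error: they store the resume position in cb->args[0] and return msg->len, so netlink re-invokes the dump and the object is retried in the next buffer.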
-static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb; - struct sk_buff *msg; - int err; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_sb_fill(msg, devlink, devlink_sb, - DEVLINK_CMD_SB_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_sb_fill(msg, devlink, devlink_sb, - DEVLINK_CMD_SB_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; - } - devl_unlock(devlink); - devlink_put(devlink); - } -out: - cb->args[0] = idx; - return msg->len; -} - -static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, - struct devlink_sb *devlink_sb, - u16 pool_index, enum devlink_command cmd, - u32 portid, u32 seq, int flags) -{ - struct devlink_sb_pool_info pool_info; - void *hdr; - int err; - - err = devlink->ops->sb_pool_get(devlink, devlink_sb->index, - pool_index, &pool_info); - if (err) - return err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) - goto nla_put_failure; - if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size)) - goto nla_put_failure; - if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE, - pool_info.threshold_type)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_CELL_SIZE, - pool_info.cell_size)) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb; - struct sk_buff *msg; - u16 pool_index; - int err; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - err = devlink_sb_pool_index_get_from_info(devlink_sb, info, - &pool_index); - if (err) - return err; - - if (!devlink->ops->sb_pool_get) - return -EOPNOTSUPP; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index, - DEVLINK_CMD_SB_POOL_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, - struct devlink *devlink, - struct 
devlink_sb *devlink_sb, - u32 portid, u32 seq) -{ - u16 pool_count = devlink_sb_pool_count(devlink_sb); - u16 pool_index; - int err; - - for (pool_index = 0; pool_index < pool_count; pool_index++) { - if (*p_idx < start) { - (*p_idx)++; - continue; - } - err = devlink_nl_sb_pool_fill(msg, devlink, - devlink_sb, - pool_index, - DEVLINK_CMD_SB_POOL_NEW, - portid, seq, NLM_F_MULTI); - if (err) - return err; - (*p_idx)++; - } - return 0; -} - -static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (!devlink->ops->sb_pool_get) - goto retry; - - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_pool_get_dumpit(msg, start, &idx, devlink, - devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - } - devl_unlock(devlink); -retry: - devlink_put(devlink); - } -out: - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - -static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, - u16 pool_index, u32 size, - enum devlink_sb_threshold_type threshold_type, - struct netlink_ext_ack *extack) - -{ - const struct devlink_ops *ops = devlink->ops; - - if (ops->sb_pool_set) - return ops->sb_pool_set(devlink, sb_index, pool_index, - size, threshold_type, extack); - return -EOPNOTSUPP; -} - -static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - enum devlink_sb_threshold_type threshold_type; - struct devlink_sb *devlink_sb; - u16 pool_index; - u32 size; - int err; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - err = devlink_sb_pool_index_get_from_info(devlink_sb, info, - &pool_index); - if (err) - return err; - - err = devlink_sb_th_type_get_from_info(info, &threshold_type); - if (err) - return err; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_POOL_SIZE)) - return -EINVAL; - - size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]); - return devlink_sb_pool_set(devlink, devlink_sb->index, - pool_index, size, threshold_type, - info->extack); -} - -static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, - struct devlink *devlink, - struct devlink_port *devlink_port, - struct devlink_sb *devlink_sb, - u16 pool_index, - enum devlink_command cmd, - u32 portid, u32 seq, int flags) -{ - const struct devlink_ops *ops = devlink->ops; - u32 threshold; - void *hdr; - int err; - - err = ops->sb_port_pool_get(devlink_port, devlink_sb->index, - pool_index, &threshold); - if (err) - return err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold)) - goto nla_put_failure; - - if (ops->sb_occ_port_pool_get) { - u32 cur; - u32 max; 
- - err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index, - pool_index, &cur, &max); - if (err && err != -EOPNOTSUPP) - goto sb_occ_get_failure; - if (!err) { - if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max)) - goto nla_put_failure; - } - } - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - err = -EMSGSIZE; -sb_occ_get_failure: - genlmsg_cancel(msg, hdr); - return err; -} - -static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = devlink_port->devlink; - struct devlink_sb *devlink_sb; - struct sk_buff *msg; - u16 pool_index; - int err; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - err = devlink_sb_pool_index_get_from_info(devlink_sb, info, - &pool_index); - if (err) - return err; - - if (!devlink->ops->sb_port_pool_get) - return -EOPNOTSUPP; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port, - devlink_sb, pool_index, - DEVLINK_CMD_SB_PORT_POOL_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, - struct devlink *devlink, - struct devlink_sb *devlink_sb, - u32 portid, u32 seq) -{ - struct devlink_port *devlink_port; - u16 pool_count = devlink_sb_pool_count(devlink_sb); - unsigned long port_index; - u16 pool_index; - int err; - - xa_for_each(&devlink->ports, port_index, devlink_port) { - for (pool_index = 0; pool_index < pool_count; pool_index++) { - if (*p_idx < start) { - (*p_idx)++; - continue; - } - err = devlink_nl_sb_port_pool_fill(msg, devlink, - devlink_port, - devlink_sb, - pool_index, - DEVLINK_CMD_SB_PORT_POOL_NEW, - portid, seq, - NLM_F_MULTI); - if (err) - return err; - (*p_idx)++; - } - } - return 0; -} - -static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (!devlink->ops->sb_port_pool_get) - goto retry; - - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_port_pool_get_dumpit(msg, start, &idx, - devlink, devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - } - devl_unlock(devlink); -retry: - devlink_put(devlink); - } -out: - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - -static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, - unsigned int sb_index, u16 pool_index, - u32 threshold, - struct netlink_ext_ack *extack) - -{ - const struct devlink_ops *ops = devlink_port->devlink->ops; - - if (ops->sb_port_pool_set) - return ops->sb_port_pool_set(devlink_port, sb_index, - pool_index, threshold, extack); - return -EOPNOTSUPP; -} - -static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = 
info->user_ptr[0]; - struct devlink_sb *devlink_sb; - u16 pool_index; - u32 threshold; - int err; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - err = devlink_sb_pool_index_get_from_info(devlink_sb, info, - &pool_index); - if (err) - return err; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD)) - return -EINVAL; - - threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); - return devlink_sb_port_pool_set(devlink_port, devlink_sb->index, - pool_index, threshold, info->extack); -} - -static int -devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink, - struct devlink_port *devlink_port, - struct devlink_sb *devlink_sb, u16 tc_index, - enum devlink_sb_pool_type pool_type, - enum devlink_command cmd, - u32 portid, u32 seq, int flags) -{ - const struct devlink_ops *ops = devlink->ops; - u16 pool_index; - u32 threshold; - void *hdr; - int err; - - err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index, - tc_index, pool_type, - &pool_index, &threshold); - if (err) - return err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index)) - goto nla_put_failure; - if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold)) - goto nla_put_failure; - - if (ops->sb_occ_tc_port_bind_get) { - u32 cur; - u32 max; - - err = ops->sb_occ_tc_port_bind_get(devlink_port, - devlink_sb->index, - tc_index, pool_type, - &cur, &max); - if (err && err != -EOPNOTSUPP) - return err; - if (!err) { - if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max)) - goto nla_put_failure; - } - } - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = devlink_port->devlink; - struct devlink_sb *devlink_sb; - struct sk_buff *msg; - enum devlink_sb_pool_type pool_type; - u16 tc_index; - int err; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - err = devlink_sb_pool_type_get_from_info(info, &pool_type); - if (err) - return err; - - err = devlink_sb_tc_index_get_from_info(devlink_sb, info, - pool_type, &tc_index); - if (err) - return err; - - if (!devlink->ops->sb_tc_pool_bind_get) - return -EOPNOTSUPP; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port, - devlink_sb, tc_index, pool_type, - DEVLINK_CMD_SB_TC_POOL_BIND_NEW, - info->snd_portid, - info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, - int start, int *p_idx, - struct devlink *devlink, - struct devlink_sb *devlink_sb, - u32 portid, u32 
seq) -{ - struct devlink_port *devlink_port; - unsigned long port_index; - u16 tc_index; - int err; - - xa_for_each(&devlink->ports, port_index, devlink_port) { - for (tc_index = 0; - tc_index < devlink_sb->ingress_tc_count; tc_index++) { - if (*p_idx < start) { - (*p_idx)++; - continue; - } - err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, - devlink_port, - devlink_sb, - tc_index, - DEVLINK_SB_POOL_TYPE_INGRESS, - DEVLINK_CMD_SB_TC_POOL_BIND_NEW, - portid, seq, - NLM_F_MULTI); - if (err) - return err; - (*p_idx)++; - } - for (tc_index = 0; - tc_index < devlink_sb->egress_tc_count; tc_index++) { - if (*p_idx < start) { - (*p_idx)++; - continue; - } - err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, - devlink_port, - devlink_sb, - tc_index, - DEVLINK_SB_POOL_TYPE_EGRESS, - DEVLINK_CMD_SB_TC_POOL_BIND_NEW, - portid, seq, - NLM_F_MULTI); - if (err) - return err; - (*p_idx)++; - } - } - return 0; -} - -static int -devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - struct devlink_sb *devlink_sb; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (!devlink->ops->sb_tc_pool_bind_get) - goto retry; - - devl_lock(devlink); - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx, - devlink, - devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - } - devl_unlock(devlink); -retry: - devlink_put(devlink); - } -out: - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - -static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, - unsigned int sb_index, u16 tc_index, - enum devlink_sb_pool_type pool_type, - u16 pool_index, u32 threshold, - struct netlink_ext_ack *extack) - -{ - const struct devlink_ops *ops = devlink_port->devlink->ops; - - if (ops->sb_tc_pool_bind_set) - return ops->sb_tc_pool_bind_set(devlink_port, sb_index, - tc_index, pool_type, - pool_index, threshold, extack); - return -EOPNOTSUPP; -} - -static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = info->user_ptr[0]; - enum devlink_sb_pool_type pool_type; - struct devlink_sb *devlink_sb; - u16 tc_index; - u16 pool_index; - u32 threshold; - int err; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - err = devlink_sb_pool_type_get_from_info(info, &pool_type); - if (err) - return err; - - err = devlink_sb_tc_index_get_from_info(devlink_sb, info, - pool_type, &tc_index); - if (err) - return err; - - err = devlink_sb_pool_index_get_from_info(devlink_sb, info, - &pool_index); - if (err) - return err; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD)) - return -EINVAL; - - threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); - return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index, - tc_index, pool_type, - pool_index, threshold, info->extack); -} - -static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - const struct devlink_ops *ops = devlink->ops; - struct devlink_sb *devlink_sb; - - devlink_sb = 
devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - if (ops->sb_occ_snapshot) - return ops->sb_occ_snapshot(devlink, devlink_sb->index); - return -EOPNOTSUPP; -} - -static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - const struct devlink_ops *ops = devlink->ops; - struct devlink_sb *devlink_sb; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - if (ops->sb_occ_max_clear) - return ops->sb_occ_max_clear(devlink, devlink_sb->index); - return -EOPNOTSUPP; -} - -static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - const struct devlink_ops *ops = devlink->ops; - enum devlink_eswitch_encap_mode encap_mode; - u8 inline_mode; - void *hdr; - int err = 0; - u16 mode; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - err = devlink_nl_put_handle(msg, devlink); - if (err) - goto nla_put_failure; - - if (ops->eswitch_mode_get) { - err = ops->eswitch_mode_get(devlink, &mode); - if (err) - goto nla_put_failure; - err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); - if (err) - goto nla_put_failure; - } - - if (ops->eswitch_inline_mode_get) { - err = ops->eswitch_inline_mode_get(devlink, &inline_mode); - if (err) - goto nla_put_failure; - err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, - inline_mode); - if (err) - goto nla_put_failure; - } - - if (ops->eswitch_encap_mode_get) { - err = ops->eswitch_encap_mode_get(devlink, &encap_mode); - if (err) - goto nla_put_failure; - err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode); - if (err) - goto nla_put_failure; - } - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return err; -} - -static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET, - info->snd_portid, info->snd_seq, 0); - - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, - struct netlink_ext_ack *extack) -{ - struct devlink_rate *devlink_rate; - - list_for_each_entry(devlink_rate, &devlink->rate_list, list) - if (devlink_rate_is_node(devlink_rate)) { - NL_SET_ERR_MSG_MOD(extack, "Rate node(s) exists."); - return -EBUSY; - } - return 0; -} - -static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - const struct devlink_ops *ops = devlink->ops; - enum devlink_eswitch_encap_mode encap_mode; - u8 inline_mode; - int err = 0; - u16 mode; - - if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) { - if (!ops->eswitch_mode_set) - return -EOPNOTSUPP; - mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); - err = devlink_rate_nodes_check(devlink, mode, info->extack); - if (err) - return err; - err = ops->eswitch_mode_set(devlink, mode, info->extack); - if (err) - return err; - } - - if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) { - if (!ops->eswitch_inline_mode_set) - return -EOPNOTSUPP; - inline_mode = nla_get_u8( - 
info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); - err = ops->eswitch_inline_mode_set(devlink, inline_mode, - info->extack); - if (err) - return err; - } - - if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) { - if (!ops->eswitch_encap_mode_set) - return -EOPNOTSUPP; - encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]); - err = ops->eswitch_encap_mode_set(devlink, encap_mode, - info->extack); - if (err) - return err; - } - - return 0; -} - -int devlink_dpipe_match_put(struct sk_buff *skb, - struct devlink_dpipe_match *match) -{ - struct devlink_dpipe_header *header = match->header; - struct devlink_dpipe_field *field = &header->fields[match->field_id]; - struct nlattr *match_attr; - - match_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_MATCH); - if (!match_attr) - return -EMSGSIZE; - - if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_MATCH_TYPE, match->type) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, match->header_index) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || - nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) - goto nla_put_failure; - - nla_nest_end(skb, match_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, match_attr); - return -EMSGSIZE; -} -EXPORT_SYMBOL_GPL(devlink_dpipe_match_put); - -static int devlink_dpipe_matches_put(struct devlink_dpipe_table *table, - struct sk_buff *skb) -{ - struct nlattr *matches_attr; - - matches_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_TABLE_MATCHES); - if (!matches_attr) - return -EMSGSIZE; - - if (table->table_ops->matches_dump(table->priv, skb)) - goto nla_put_failure; - - nla_nest_end(skb, matches_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, matches_attr); - return -EMSGSIZE; -} - -int devlink_dpipe_action_put(struct sk_buff *skb, - struct devlink_dpipe_action *action) -{ - struct devlink_dpipe_header *header = action->header; - struct devlink_dpipe_field *field = &header->fields[action->field_id]; - struct nlattr *action_attr; - - action_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ACTION); - if (!action_attr) - return -EMSGSIZE; - - if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_ACTION_TYPE, action->type) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, action->header_index) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || - nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) - goto nla_put_failure; - - nla_nest_end(skb, action_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, action_attr); - return -EMSGSIZE; -} -EXPORT_SYMBOL_GPL(devlink_dpipe_action_put); - -static int devlink_dpipe_actions_put(struct devlink_dpipe_table *table, - struct sk_buff *skb) -{ - struct nlattr *actions_attr; - - actions_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_TABLE_ACTIONS); - if (!actions_attr) - return -EMSGSIZE; - - if (table->table_ops->actions_dump(table->priv, skb)) - goto nla_put_failure; - - nla_nest_end(skb, actions_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, actions_attr); - return -EMSGSIZE; -} - -static int devlink_dpipe_table_put(struct sk_buff *skb, - struct devlink_dpipe_table *table) -{ - struct nlattr *table_attr; - u64 table_size; - - table_size = table->table_ops->size_get(table->priv); - table_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLE); - if (!table_attr) - return -EMSGSIZE; - - if (nla_put_string(skb, 
DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table_size, - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED, - table->counters_enabled)) - goto nla_put_failure; - - if (table->resource_valid) { - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, - table->resource_id, DEVLINK_ATTR_PAD) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS, - table->resource_units, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - } - if (devlink_dpipe_matches_put(table, skb)) - goto nla_put_failure; - - if (devlink_dpipe_actions_put(table, skb)) - goto nla_put_failure; - - nla_nest_end(skb, table_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, table_attr); - return -EMSGSIZE; -} - -static int devlink_dpipe_send_and_alloc_skb(struct sk_buff **pskb, - struct genl_info *info) -{ - int err; - - if (*pskb) { - err = genlmsg_reply(*pskb, info); - if (err) - return err; - } - *pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!*pskb) - return -ENOMEM; - return 0; -} - -static int devlink_dpipe_tables_fill(struct genl_info *info, - enum devlink_command cmd, int flags, - struct list_head *dpipe_tables, - const char *table_name) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_dpipe_table *table; - struct nlattr *tables_attr; - struct sk_buff *skb = NULL; - struct nlmsghdr *nlh; - bool incomplete; - void *hdr; - int i; - int err; - - table = list_first_entry(dpipe_tables, - struct devlink_dpipe_table, list); -start_again: - err = devlink_dpipe_send_and_alloc_skb(&skb, info); - if (err) - return err; - - hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, - &devlink_nl_family, NLM_F_MULTI, cmd); - if (!hdr) { - nlmsg_free(skb); - return -EMSGSIZE; - } - - if (devlink_nl_put_handle(skb, devlink)) - goto nla_put_failure; - tables_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLES); - if (!tables_attr) - goto nla_put_failure; - - i = 0; - incomplete = false; - list_for_each_entry_from(table, dpipe_tables, list) { - if (!table_name) { - err = devlink_dpipe_table_put(skb, table); - if (err) { - if (!i) - goto err_table_put; - incomplete = true; - break; - } - } else { - if (!strcmp(table->name, table_name)) { - err = devlink_dpipe_table_put(skb, table); - if (err) - break; - } - } - i++; - } - - nla_nest_end(skb, tables_attr); - genlmsg_end(skb, hdr); - if (incomplete) - goto start_again; - -send_done: - nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, - NLMSG_DONE, 0, flags | NLM_F_MULTI); - if (!nlh) { - err = devlink_dpipe_send_and_alloc_skb(&skb, info); - if (err) - return err; - goto send_done; - } - - return genlmsg_reply(skb, info); - -nla_put_failure: - err = -EMSGSIZE; -err_table_put: - nlmsg_free(skb); - return err; -} - -static int devlink_nl_cmd_dpipe_table_get(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - const char *table_name = NULL; - - if (info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]) - table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); - - return devlink_dpipe_tables_fill(info, DEVLINK_CMD_DPIPE_TABLE_GET, 0, - &devlink->dpipe_table_list, - table_name); -} - -static int devlink_dpipe_value_put(struct sk_buff *skb, - struct devlink_dpipe_value *value) -{ - if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE, - value->value_size, value->value)) - return -EMSGSIZE; - if (value->mask) - if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE_MASK, - value->value_size, 
value->mask)) - return -EMSGSIZE; - if (value->mapping_valid) - if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_VALUE_MAPPING, - value->mapping_value)) - return -EMSGSIZE; - return 0; -} - -static int devlink_dpipe_action_value_put(struct sk_buff *skb, - struct devlink_dpipe_value *value) -{ - if (!value->action) - return -EINVAL; - if (devlink_dpipe_action_put(skb, value->action)) - return -EMSGSIZE; - if (devlink_dpipe_value_put(skb, value)) - return -EMSGSIZE; - return 0; -} - -static int devlink_dpipe_action_values_put(struct sk_buff *skb, - struct devlink_dpipe_value *values, - unsigned int values_count) -{ - struct nlattr *action_attr; - int i; - int err; - - for (i = 0; i < values_count; i++) { - action_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_ACTION_VALUE); - if (!action_attr) - return -EMSGSIZE; - err = devlink_dpipe_action_value_put(skb, &values[i]); - if (err) - goto err_action_value_put; - nla_nest_end(skb, action_attr); - } - return 0; - -err_action_value_put: - nla_nest_cancel(skb, action_attr); - return err; -} - -static int devlink_dpipe_match_value_put(struct sk_buff *skb, - struct devlink_dpipe_value *value) -{ - if (!value->match) - return -EINVAL; - if (devlink_dpipe_match_put(skb, value->match)) - return -EMSGSIZE; - if (devlink_dpipe_value_put(skb, value)) - return -EMSGSIZE; - return 0; -} - -static int devlink_dpipe_match_values_put(struct sk_buff *skb, - struct devlink_dpipe_value *values, - unsigned int values_count) -{ - struct nlattr *match_attr; - int i; - int err; - - for (i = 0; i < values_count; i++) { - match_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_MATCH_VALUE); - if (!match_attr) - return -EMSGSIZE; - err = devlink_dpipe_match_value_put(skb, &values[i]); - if (err) - goto err_match_value_put; - nla_nest_end(skb, match_attr); - } - return 0; - -err_match_value_put: - nla_nest_cancel(skb, match_attr); - return err; -} - -static int devlink_dpipe_entry_put(struct sk_buff *skb, - struct devlink_dpipe_entry *entry) -{ - struct nlattr *entry_attr, *matches_attr, *actions_attr; - int err; - - entry_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ENTRY); - if (!entry_attr) - return -EMSGSIZE; - - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index, - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (entry->counter_valid) - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER, - entry->counter, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - matches_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES); - if (!matches_attr) - goto nla_put_failure; - - err = devlink_dpipe_match_values_put(skb, entry->match_values, - entry->match_values_count); - if (err) { - nla_nest_cancel(skb, matches_attr); - goto err_match_values_put; - } - nla_nest_end(skb, matches_attr); - - actions_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES); - if (!actions_attr) - goto nla_put_failure; - - err = devlink_dpipe_action_values_put(skb, entry->action_values, - entry->action_values_count); - if (err) { - nla_nest_cancel(skb, actions_attr); - goto err_action_values_put; - } - nla_nest_end(skb, actions_attr); - - nla_nest_end(skb, entry_attr); - return 0; - -nla_put_failure: - err = -EMSGSIZE; -err_match_values_put: -err_action_values_put: - nla_nest_cancel(skb, entry_attr); - return err; -} - -static struct devlink_dpipe_table * -devlink_dpipe_table_find(struct list_head *dpipe_tables, - const char *table_name, struct devlink *devlink) -{ - struct devlink_dpipe_table *table; - 
list_for_each_entry_rcu(table, dpipe_tables, list, - lockdep_is_held(&devlink->lock)) { - if (!strcmp(table->name, table_name)) - return table; - } - return NULL; -} - -int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) -{ - struct devlink *devlink; - int err; - - err = devlink_dpipe_send_and_alloc_skb(&dump_ctx->skb, - dump_ctx->info); - if (err) - return err; - - dump_ctx->hdr = genlmsg_put(dump_ctx->skb, - dump_ctx->info->snd_portid, - dump_ctx->info->snd_seq, - &devlink_nl_family, NLM_F_MULTI, - dump_ctx->cmd); - if (!dump_ctx->hdr) - goto nla_put_failure; - - devlink = dump_ctx->info->user_ptr[0]; - if (devlink_nl_put_handle(dump_ctx->skb, devlink)) - goto nla_put_failure; - dump_ctx->nest = nla_nest_start_noflag(dump_ctx->skb, - DEVLINK_ATTR_DPIPE_ENTRIES); - if (!dump_ctx->nest) - goto nla_put_failure; - return 0; - -nla_put_failure: - nlmsg_free(dump_ctx->skb); - return -EMSGSIZE; -} -EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_prepare); - -int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, - struct devlink_dpipe_entry *entry) -{ - return devlink_dpipe_entry_put(dump_ctx->skb, entry); -} -EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_append); - -int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx) -{ - nla_nest_end(dump_ctx->skb, dump_ctx->nest); - genlmsg_end(dump_ctx->skb, dump_ctx->hdr); - return 0; -} -EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_close); - -void devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry) - -{ - unsigned int value_count, value_index; - struct devlink_dpipe_value *value; - - value = entry->action_values; - value_count = entry->action_values_count; - for (value_index = 0; value_index < value_count; value_index++) { - kfree(value[value_index].value); - kfree(value[value_index].mask); - } - - value = entry->match_values; - value_count = entry->match_values_count; - for (value_index = 0; value_index < value_count; value_index++) { - kfree(value[value_index].value); - kfree(value[value_index].mask); - } -} -EXPORT_SYMBOL_GPL(devlink_dpipe_entry_clear); - -static int devlink_dpipe_entries_fill(struct genl_info *info, - enum devlink_command cmd, int flags, - struct devlink_dpipe_table *table) -{ - struct devlink_dpipe_dump_ctx dump_ctx; - struct nlmsghdr *nlh; - int err; - - dump_ctx.skb = NULL; - dump_ctx.cmd = cmd; - dump_ctx.info = info; - - err = table->table_ops->entries_dump(table->priv, - table->counters_enabled, - &dump_ctx); - if (err) - return err; - -send_done: - nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq, - NLMSG_DONE, 0, flags | NLM_F_MULTI); - if (!nlh) { - err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info); - if (err) - return err; - goto send_done; - } - return genlmsg_reply(dump_ctx.skb, info); -} - -static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_dpipe_table *table; - const char *table_name; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME)) - return -EINVAL; - - table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); - table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name, devlink); - if (!table) - return -EINVAL; - - if (!table->table_ops->entries_dump) - return -EINVAL; - - return devlink_dpipe_entries_fill(info, DEVLINK_CMD_DPIPE_ENTRIES_GET, - 0, table); -} - -static int devlink_dpipe_fields_put(struct sk_buff *skb, - const struct devlink_dpipe_header *header) -{ - struct 
devlink_dpipe_field *field; - struct nlattr *field_attr; - int i; - - for (i = 0; i < header->fields_count; i++) { - field = &header->fields[i]; - field_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_FIELD); - if (!field_attr) - return -EMSGSIZE; - if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_FIELD_NAME, field->name) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, field->bitwidth) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, field->mapping_type)) - goto nla_put_failure; - nla_nest_end(skb, field_attr); - } - return 0; - -nla_put_failure: - nla_nest_cancel(skb, field_attr); - return -EMSGSIZE; -} - -static int devlink_dpipe_header_put(struct sk_buff *skb, - struct devlink_dpipe_header *header) -{ - struct nlattr *fields_attr, *header_attr; - int err; - - header_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADER); - if (!header_attr) - return -EMSGSIZE; - - if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_HEADER_NAME, header->name) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || - nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) - goto nla_put_failure; - - fields_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_HEADER_FIELDS); - if (!fields_attr) - goto nla_put_failure; - - err = devlink_dpipe_fields_put(skb, header); - if (err) { - nla_nest_cancel(skb, fields_attr); - goto nla_put_failure; - } - nla_nest_end(skb, fields_attr); - nla_nest_end(skb, header_attr); - return 0; - -nla_put_failure: - err = -EMSGSIZE; - nla_nest_cancel(skb, header_attr); - return err; -} - -static int devlink_dpipe_headers_fill(struct genl_info *info, - enum devlink_command cmd, int flags, - struct devlink_dpipe_headers * - dpipe_headers) -{ - struct devlink *devlink = info->user_ptr[0]; - struct nlattr *headers_attr; - struct sk_buff *skb = NULL; - struct nlmsghdr *nlh; - void *hdr; - int i, j; - int err; - - i = 0; -start_again: - err = devlink_dpipe_send_and_alloc_skb(&skb, info); - if (err) - return err; - - hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, - &devlink_nl_family, NLM_F_MULTI, cmd); - if (!hdr) { - nlmsg_free(skb); - return -EMSGSIZE; - } - - if (devlink_nl_put_handle(skb, devlink)) - goto nla_put_failure; - headers_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADERS); - if (!headers_attr) - goto nla_put_failure; - - j = 0; - for (; i < dpipe_headers->headers_count; i++) { - err = devlink_dpipe_header_put(skb, dpipe_headers->headers[i]); - if (err) { - if (!j) - goto err_table_put; - break; - } - j++; - } - nla_nest_end(skb, headers_attr); - genlmsg_end(skb, hdr); - if (i != dpipe_headers->headers_count) - goto start_again; - -send_done: - nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, - NLMSG_DONE, 0, flags | NLM_F_MULTI); - if (!nlh) { - err = devlink_dpipe_send_and_alloc_skb(&skb, info); - if (err) - return err; - goto send_done; - } - return genlmsg_reply(skb, info); - -nla_put_failure: - err = -EMSGSIZE; -err_table_put: - nlmsg_free(skb); - return err; -} - -static int devlink_nl_cmd_dpipe_headers_get(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - - if (!devlink->dpipe_headers) - return -EOPNOTSUPP; - return devlink_dpipe_headers_fill(info, DEVLINK_CMD_DPIPE_HEADERS_GET, - 0, devlink->dpipe_headers); -} - -static int devlink_dpipe_table_counters_set(struct devlink *devlink, - const char *table_name, - bool enable) -{ - struct devlink_dpipe_table *table; - - table = 
devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name, devlink); - if (!table) - return -EINVAL; - - if (table->counter_control_extern) - return -EOPNOTSUPP; - - if (!(table->counters_enabled ^ enable)) - return 0; - - table->counters_enabled = enable; - if (table->table_ops->counters_set_update) - table->table_ops->counters_set_update(table->priv, enable); - return 0; -} - -static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - const char *table_name; - bool counters_enable; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME) || - GENL_REQ_ATTR_CHECK(info, - DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED)) - return -EINVAL; - - table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); - counters_enable = !!nla_get_u8(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]); - - return devlink_dpipe_table_counters_set(devlink, table_name, - counters_enable); -} - -static struct devlink_resource * -devlink_resource_find(struct devlink *devlink, - struct devlink_resource *resource, u64 resource_id) -{ - struct list_head *resource_list; - - if (resource) - resource_list = &resource->resource_list; - else - resource_list = &devlink->resource_list; - - list_for_each_entry(resource, resource_list, list) { - struct devlink_resource *child_resource; - - if (resource->id == resource_id) - return resource; - - child_resource = devlink_resource_find(devlink, resource, - resource_id); - if (child_resource) - return child_resource; - } - return NULL; -} - -static void -devlink_resource_validate_children(struct devlink_resource *resource) -{ - struct devlink_resource *child_resource; - bool size_valid = true; - u64 parts_size = 0; - - if (list_empty(&resource->resource_list)) - goto out; - - list_for_each_entry(child_resource, &resource->resource_list, list) - parts_size += child_resource->size_new; - - if (parts_size > resource->size_new) - size_valid = false; -out: - resource->size_valid = size_valid; -} - -static int -devlink_resource_validate_size(struct devlink_resource *resource, u64 size, - struct netlink_ext_ack *extack) -{ - u64 remainder; - int err = 0; - - if (size > resource->size_params.size_max) { - NL_SET_ERR_MSG_MOD(extack, "Size larger than maximum"); - err = -EINVAL; - } - - if (size < resource->size_params.size_min) { - NL_SET_ERR_MSG_MOD(extack, "Size smaller than minimum"); - err = -EINVAL; - } - - div64_u64_rem(size, resource->size_params.size_granularity, &remainder); - if (remainder) { - NL_SET_ERR_MSG_MOD(extack, "Wrong granularity"); - err = -EINVAL; - } - - return err; -} - -static int devlink_nl_cmd_resource_set(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_resource *resource; - u64 resource_id; - u64 size; - int err; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) || - GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE)) - return -EINVAL; - resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); - - resource = devlink_resource_find(devlink, NULL, resource_id); - if (!resource) - return -EINVAL; - - size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); - err = devlink_resource_validate_size(resource, size, info->extack); - if (err) - return err; - - resource->size_new = size; - devlink_resource_validate_children(resource); - if (resource->parent) - devlink_resource_validate_children(resource->parent); - return 0; -} - -static int 
-devlink_resource_size_params_put(struct devlink_resource *resource, - struct sk_buff *skb) -{ - struct devlink_resource_size_params *size_params; - - size_params = &resource->size_params; - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, - size_params->size_granularity, DEVLINK_ATTR_PAD) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, - size_params->size_max, DEVLINK_ATTR_PAD) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, - size_params->size_min, DEVLINK_ATTR_PAD) || - nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit)) - return -EMSGSIZE; - return 0; -} - -static int devlink_resource_occ_put(struct devlink_resource *resource, - struct sk_buff *skb) -{ - if (!resource->occ_get) - return 0; - return nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, - resource->occ_get(resource->occ_get_priv), - DEVLINK_ATTR_PAD); -} - -static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, - struct devlink_resource *resource) -{ - struct devlink_resource *child_resource; - struct nlattr *child_resource_attr; - struct nlattr *resource_attr; - - resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE); - if (!resource_attr) - return -EMSGSIZE; - - if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size, - DEVLINK_ATTR_PAD) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id, - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (resource->size != resource->size_new && - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, - resource->size_new, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (devlink_resource_occ_put(resource, skb)) - goto nla_put_failure; - if (devlink_resource_size_params_put(resource, skb)) - goto nla_put_failure; - if (list_empty(&resource->resource_list)) - goto out; - - if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID, - resource->size_valid)) - goto nla_put_failure; - - child_resource_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_RESOURCE_LIST); - if (!child_resource_attr) - goto nla_put_failure; - - list_for_each_entry(child_resource, &resource->resource_list, list) { - if (devlink_resource_put(devlink, skb, child_resource)) - goto resource_put_failure; - } - - nla_nest_end(skb, child_resource_attr); -out: - nla_nest_end(skb, resource_attr); - return 0; - -resource_put_failure: - nla_nest_cancel(skb, child_resource_attr); -nla_put_failure: - nla_nest_cancel(skb, resource_attr); - return -EMSGSIZE; -} - -static int devlink_resource_fill(struct genl_info *info, - enum devlink_command cmd, int flags) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_resource *resource; - struct nlattr *resources_attr; - struct sk_buff *skb = NULL; - struct nlmsghdr *nlh; - bool incomplete; - void *hdr; - int i; - int err; - - resource = list_first_entry(&devlink->resource_list, - struct devlink_resource, list); -start_again: - err = devlink_dpipe_send_and_alloc_skb(&skb, info); - if (err) - return err; - - hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, - &devlink_nl_family, NLM_F_MULTI, cmd); - if (!hdr) { - nlmsg_free(skb); - return -EMSGSIZE; - } - - if (devlink_nl_put_handle(skb, devlink)) - goto nla_put_failure; - - resources_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_RESOURCE_LIST); - if (!resources_attr) - goto nla_put_failure; - - incomplete = false; - i = 0; - list_for_each_entry_from(resource, &devlink->resource_list, list) { - err = devlink_resource_put(devlink, skb, resource); - 
if (err) { - if (!i) - goto err_resource_put; - incomplete = true; - break; - } - i++; - } - nla_nest_end(skb, resources_attr); - genlmsg_end(skb, hdr); - if (incomplete) - goto start_again; -send_done: - nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, - NLMSG_DONE, 0, flags | NLM_F_MULTI); - if (!nlh) { - err = devlink_dpipe_send_and_alloc_skb(&skb, info); - if (err) - return err; - goto send_done; - } - return genlmsg_reply(skb, info); - -nla_put_failure: - err = -EMSGSIZE; -err_resource_put: - nlmsg_free(skb); - return err; -} - -static int devlink_nl_cmd_resource_dump(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - - if (list_empty(&devlink->resource_list)) - return -EOPNOTSUPP; - - return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); -} - -static int -devlink_resources_validate(struct devlink *devlink, - struct devlink_resource *resource, - struct genl_info *info) -{ - struct list_head *resource_list; - int err = 0; - - if (resource) - resource_list = &resource->resource_list; - else - resource_list = &devlink->resource_list; - - list_for_each_entry(resource, resource_list, list) { - if (!resource->size_valid) - return -EINVAL; - err = devlink_resources_validate(devlink, resource, info); - if (err) - return err; - } - return err; -} - -static struct net *devlink_netns_get(struct sk_buff *skb, - struct genl_info *info) -{ - struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID]; - struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD]; - struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID]; - struct net *net; - - if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { - NL_SET_ERR_MSG_MOD(info->extack, "multiple netns identifying attributes specified"); - return ERR_PTR(-EINVAL); - } - - if (netns_pid_attr) { - net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr)); - } else if (netns_fd_attr) { - net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr)); - } else if (netns_id_attr) { - net = get_net_ns_by_id(sock_net(skb->sk), - nla_get_u32(netns_id_attr)); - if (!net) - net = ERR_PTR(-EINVAL); - } else { - WARN_ON(1); - net = ERR_PTR(-EINVAL); - } - if (IS_ERR(net)) { - NL_SET_ERR_MSG_MOD(info->extack, "Unknown network namespace"); - return ERR_PTR(-EINVAL); - } - if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { - put_net(net); - return ERR_PTR(-EPERM); - } - return net; -} - -static void devlink_param_notify(struct devlink *devlink, - unsigned int port_index, - struct devlink_param_item *param_item, - enum devlink_command cmd); - -static void devlink_ns_change_notify(struct devlink *devlink, - struct net *dest_net, struct net *curr_net, - bool new) -{ - struct devlink_param_item *param_item; - enum devlink_command cmd; - - /* Userspace needs to be notified about devlink objects - * removed from the original and entering the new network namespace. - * The rest of the devlink objects are re-created during - * the reload process, so the notifications are generated separately. - */ - - if (!dest_net || net_eq(dest_net, curr_net)) - return; - - if (new) - devlink_notify(devlink, DEVLINK_CMD_NEW); - - cmd = new ? 
DEVLINK_CMD_PARAM_NEW : DEVLINK_CMD_PARAM_DEL; - list_for_each_entry(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, cmd); - - if (!new) - devlink_notify(devlink, DEVLINK_CMD_DEL); -} - -static bool devlink_reload_supported(const struct devlink_ops *ops) -{ - return ops->reload_down && ops->reload_up; -} - -static void devlink_reload_failed_set(struct devlink *devlink, - bool reload_failed) -{ - if (devlink->reload_failed == reload_failed) - return; - devlink->reload_failed = reload_failed; - devlink_notify(devlink, DEVLINK_CMD_NEW); -} - -bool devlink_is_reload_failed(const struct devlink *devlink) -{ - return devlink->reload_failed; -} -EXPORT_SYMBOL_GPL(devlink_is_reload_failed); - -static void -__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats, - enum devlink_reload_limit limit, u32 actions_performed) -{ - unsigned long actions = actions_performed; - int stat_idx; - int action; - - for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) { - stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action; - reload_stats[stat_idx]++; - } - devlink_notify(devlink, DEVLINK_CMD_NEW); -} - -static void -devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit, - u32 actions_performed) -{ - __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit, - actions_performed); -} - -/** - * devlink_remote_reload_actions_performed - Update devlink on reload actions - * performed which are not a direct result of a devlink reload call. - * - * This should be called by a driver after performing reload actions that were not - * the result of a devlink reload call, for example when fw_activate was performed as - * a result of a devlink reload that triggered fw_activate on another host. - * The motivation for this function is to keep track of the reload actions performed - * on this device, whether or not they resulted from a direct devlink reload call. 
- * - * @devlink: devlink - * @limit: reload limit - * @actions_performed: bitmask of actions performed - */ -void devlink_remote_reload_actions_performed(struct devlink *devlink, - enum devlink_reload_limit limit, - u32 actions_performed) -{ - if (WARN_ON(!actions_performed || - actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || - actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) || - limit > DEVLINK_RELOAD_LIMIT_MAX)) - return; - - __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit, - actions_performed); -} -EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed); - -static int devlink_reload(struct devlink *devlink, struct net *dest_net, - enum devlink_reload_action action, enum devlink_reload_limit limit, - u32 *actions_performed, struct netlink_ext_ack *extack) -{ - u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; - struct net *curr_net; - int err; - - memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, - sizeof(remote_reload_stats)); - - curr_net = devlink_net(devlink); - devlink_ns_change_notify(devlink, dest_net, curr_net, false); - err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); - if (err) - return err; - - if (dest_net && !net_eq(dest_net, curr_net)) { - move_netdevice_notifier_net(curr_net, dest_net, - &devlink->netdevice_nb); - write_pnet(&devlink->_net, dest_net); - } - - err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); - devlink_reload_failed_set(devlink, !!err); - if (err) - return err; - - devlink_ns_change_notify(devlink, dest_net, curr_net, true); - WARN_ON(!(*actions_performed & BIT(action))); - /* Catch driver on updating the remote action within devlink reload */ - WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, - sizeof(remote_reload_stats))); - devlink_reload_stats_update(devlink, limit, *actions_performed); - return 0; -} - -static int -devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed, - enum devlink_command cmd, struct genl_info *info) -{ - struct sk_buff *msg; - void *hdr; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd); - if (!hdr) - goto free_msg; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed, - actions_performed)) - goto nla_put_failure; - genlmsg_end(msg, hdr); - - return genlmsg_reply(msg, info); - -nla_put_failure: - genlmsg_cancel(msg, hdr); -free_msg: - nlmsg_free(msg); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - enum devlink_reload_action action; - enum devlink_reload_limit limit; - struct net *dest_net = NULL; - u32 actions_performed; - int err; - - if (!(devlink->features & DEVLINK_F_RELOAD)) - return -EOPNOTSUPP; - - err = devlink_resources_validate(devlink, NULL, info); - if (err) { - NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); - return err; - } - - if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) - action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]); - else - action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT; - - if (!devlink_reload_action_is_supported(devlink, action)) { - NL_SET_ERR_MSG_MOD(info->extack, - "Requested reload action is not supported by the driver"); - return -EOPNOTSUPP; - } - - 
limit = DEVLINK_RELOAD_LIMIT_UNSPEC; - if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) { - struct nla_bitfield32 limits; - u32 limits_selected; - - limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]); - limits_selected = limits.value & limits.selector; - if (!limits_selected) { - NL_SET_ERR_MSG_MOD(info->extack, "Invalid limit selected"); - return -EINVAL; - } - for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++) - if (limits_selected & BIT(limit)) - break; - /* UAPI enables multiselection, but currently it is not used */ - if (limits_selected != BIT(limit)) { - NL_SET_ERR_MSG_MOD(info->extack, - "Multiselection of limit is not supported"); - return -EOPNOTSUPP; - } - if (!devlink_reload_limit_is_supported(devlink, limit)) { - NL_SET_ERR_MSG_MOD(info->extack, - "Requested limit is not supported by the driver"); - return -EOPNOTSUPP; - } - if (devlink_reload_combination_is_invalid(action, limit)) { - NL_SET_ERR_MSG_MOD(info->extack, - "Requested limit is invalid for this action"); - return -EINVAL; - } - } - if (info->attrs[DEVLINK_ATTR_NETNS_PID] || - info->attrs[DEVLINK_ATTR_NETNS_FD] || - info->attrs[DEVLINK_ATTR_NETNS_ID]) { - dest_net = devlink_netns_get(skb, info); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); - } - - err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack); - - if (dest_net) - put_net(dest_net); - - if (err) - return err; - /* For backward compatibility generate reply only if attributes used by user */ - if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) - return 0; - - return devlink_nl_reload_actions_performed_snd(devlink, actions_performed, - DEVLINK_CMD_RELOAD, info); -} - -static int devlink_nl_flash_update_fill(struct sk_buff *msg, - struct devlink *devlink, - enum devlink_command cmd, - struct devlink_flash_notify *params) -{ - void *hdr; - - hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS) - goto out; - - if (params->status_msg && - nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, - params->status_msg)) - goto nla_put_failure; - if (params->component && - nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, - params->component)) - goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, - params->done, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, - params->total, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, - params->timeout, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - -out: - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void __devlink_flash_update_notify(struct devlink *devlink, - enum devlink_command cmd, - struct devlink_flash_notify *params) -{ - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && - cmd != DEVLINK_CMD_FLASH_UPDATE_END && - cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); - - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_flash_update_fill(msg, devlink, cmd, params); - if (err) - goto out_free_msg; - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, 
DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
-	return;
-
-out_free_msg:
-	nlmsg_free(msg);
-}
-
-static void devlink_flash_update_begin_notify(struct devlink *devlink)
-{
-	struct devlink_flash_notify params = {};
-
-	__devlink_flash_update_notify(devlink,
-				      DEVLINK_CMD_FLASH_UPDATE,
-				      &params);
-}
-
-static void devlink_flash_update_end_notify(struct devlink *devlink)
-{
-	struct devlink_flash_notify params = {};
-
-	__devlink_flash_update_notify(devlink,
-				      DEVLINK_CMD_FLASH_UPDATE_END,
-				      &params);
-}
-
-void devlink_flash_update_status_notify(struct devlink *devlink,
-					const char *status_msg,
-					const char *component,
-					unsigned long done,
-					unsigned long total)
-{
-	struct devlink_flash_notify params = {
-		.status_msg = status_msg,
-		.component = component,
-		.done = done,
-		.total = total,
-	};
-
-	__devlink_flash_update_notify(devlink,
-				      DEVLINK_CMD_FLASH_UPDATE_STATUS,
-				      &params);
-}
-EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
-
-void devlink_flash_update_timeout_notify(struct devlink *devlink,
-					 const char *status_msg,
-					 const char *component,
-					 unsigned long timeout)
-{
-	struct devlink_flash_notify params = {
-		.status_msg = status_msg,
-		.component = component,
-		.timeout = timeout,
-	};
-
-	__devlink_flash_update_notify(devlink,
-				      DEVLINK_CMD_FLASH_UPDATE_STATUS,
-				      &params);
-}
-EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify);
-
-struct devlink_info_req {
-	struct sk_buff *msg;
-	void (*version_cb)(const char *version_name,
-			   enum devlink_info_version_type version_type,
-			   void *version_cb_priv);
-	void *version_cb_priv;
-};
-
-struct devlink_flash_component_lookup_ctx {
-	const char *lookup_name;
-	bool lookup_name_found;
-};
-
-static void
-devlink_flash_component_lookup_cb(const char *version_name,
-				  enum devlink_info_version_type version_type,
-				  void *version_cb_priv)
-{
-	struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv;
-
-	if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT ||
-	    lookup_ctx->lookup_name_found)
-		return;
-
-	lookup_ctx->lookup_name_found =
-		!strcmp(lookup_ctx->lookup_name, version_name);
-}
-
-static int devlink_flash_component_get(struct devlink *devlink,
-				       struct nlattr *nla_component,
-				       const char **p_component,
-				       struct netlink_ext_ack *extack)
-{
-	struct devlink_flash_component_lookup_ctx lookup_ctx = {};
-	struct devlink_info_req req = {};
-	const char *component;
-	int ret;
-
-	if (!nla_component)
-		return 0;
-
-	component = nla_data(nla_component);
-
-	if (!devlink->ops->info_get) {
-		NL_SET_ERR_MSG_ATTR(extack, nla_component,
-				    "component update is not supported by this device");
-		return -EOPNOTSUPP;
-	}
-
-	lookup_ctx.lookup_name = component;
-	req.version_cb = devlink_flash_component_lookup_cb;
-	req.version_cb_priv = &lookup_ctx;
-
-	ret = devlink->ops->info_get(devlink, &req, NULL);
-	if (ret)
-		return ret;
-
-	if (!lookup_ctx.lookup_name_found) {
-		NL_SET_ERR_MSG_ATTR(extack, nla_component,
-				    "selected component is not supported by this device");
-		return -EINVAL;
-	}
-	*p_component = component;
-	return 0;
-}
-
-static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
-				       struct genl_info *info)
-{
-	struct nlattr *nla_overwrite_mask, *nla_file_name;
-	struct devlink_flash_update_params params = {};
-	struct devlink *devlink = info->user_ptr[0];
-	const char *file_name;
-	u32 supported_params;
-	int ret;
-
-	if (!devlink->ops->flash_update)
-		return -EOPNOTSUPP;
-
-	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME))
-		return -EINVAL;
-
-	ret = devlink_flash_component_get(devlink,
-					  info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT],
-					  &params.component, info->extack);
-	if (ret)
-		return ret;
-
-	supported_params = devlink->ops->supported_flash_update_params;
-
-	nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK];
-	if (nla_overwrite_mask) {
-		struct nla_bitfield32 sections;
-
-		if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) {
-			NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask,
-					    "overwrite settings are not supported by this device");
-			return -EOPNOTSUPP;
-		}
-		sections = nla_get_bitfield32(nla_overwrite_mask);
-		params.overwrite_mask = sections.value & sections.selector;
-	}
-
-	nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME];
-	file_name = nla_data(nla_file_name);
-	ret = request_firmware(&params.fw, file_name, devlink->dev);
-	if (ret) {
-		NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, "failed to locate the requested firmware file");
-		return ret;
-	}
-
-	devlink_flash_update_begin_notify(devlink);
-	ret = devlink->ops->flash_update(devlink, &params, info->extack);
-	devlink_flash_update_end_notify(devlink);
-
-	release_firmware(params.fw);
-
-	return ret;
-}
-
-static int
-devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink,
-			  u32 portid, u32 seq, int flags,
-			  struct netlink_ext_ack *extack)
-{
-	struct nlattr *selftests;
-	void *hdr;
-	int err;
-	int i;
-
-	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags,
-			  DEVLINK_CMD_SELFTESTS_GET);
-	if (!hdr)
-		return -EMSGSIZE;
-
-	err = -EMSGSIZE;
-	if (devlink_nl_put_handle(msg, devlink))
-		goto err_cancel_msg;
-
-	selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
-	if (!selftests)
-		goto err_cancel_msg;
-
-	for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
-	     i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) {
-		if (devlink->ops->selftest_check(devlink, i, extack)) {
-			err = nla_put_flag(msg, i);
-			if (err)
-				goto err_cancel_msg;
-		}
-	}
-
-	nla_nest_end(msg, selftests);
-	genlmsg_end(msg, hdr);
-	return 0;
-
-err_cancel_msg:
-	genlmsg_cancel(msg, hdr);
-	return err;
-}
-
-static int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb,
-					     struct genl_info *info)
-{
-	struct devlink *devlink = info->user_ptr[0];
-	struct sk_buff *msg;
-	int err;
-
-	if (!devlink->ops->selftest_check)
-		return -EOPNOTSUPP;
-
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
-		return -ENOMEM;
-
-	err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid,
-					info->snd_seq, 0, info->extack);
-	if (err) {
-		nlmsg_free(msg);
-		return err;
-	}
-
-	return genlmsg_reply(msg, info);
-}
-
-static int devlink_nl_cmd_selftests_get_dumpit(struct sk_buff *msg,
-					       struct netlink_callback *cb)
-{
-	struct devlink *devlink;
-	int start = cb->args[0];
-	unsigned long index;
-	int idx = 0;
-	int err = 0;
-
-	devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
-		if (idx < start || !devlink->ops->selftest_check)
-			goto inc;
-
-		devl_lock(devlink);
-		err = devlink_nl_selftests_fill(msg, devlink,
-						NETLINK_CB(cb->skb).portid,
-						cb->nlh->nlmsg_seq, NLM_F_MULTI,
-						cb->extack);
-		devl_unlock(devlink);
-		if (err) {
-			devlink_put(devlink);
-			break;
-		}
-inc:
-		idx++;
-		devlink_put(devlink);
-	}
-
-	if (err != -EMSGSIZE)
-		return err;
-
-	cb->args[0] = idx;
-	return msg->len;
-}
-
-static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id,
-				       enum devlink_selftest_status test_status)
-{
-	struct nlattr *result_attr;
-
-	result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT);
-	if (!result_attr)
-		return -EMSGSIZE;
-
-	if
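[Illustration: devlink_nl_cmd_flash_update() above brackets the driver's potentially long flash operation with begin/end multicast notifications, with status/timeout updates possible in between; the end notification is sent whether or not the update succeeded. A hedged standalone sketch of that bracketing pattern -- the notify_*/do_flash helpers are invented stand-ins, not devlink API:

	#include <stdio.h>

	/* Invented stand-ins for the devlink multicast notifications. */
	static void notify_begin(void)                 { puts("FLASH_UPDATE"); }
	static void notify_status(int done, int total) { printf("STATUS %d/%d\n", done, total); }
	static void notify_end(void)                   { puts("FLASH_UPDATE_END"); }

	/* Hypothetical driver op reporting progress as it goes. */
	static int do_flash(void)
	{
		for (int done = 0; done <= 4; done++)
			notify_status(done, 4);
		return 0;
	}

	int main(void)
	{
		int ret;

		notify_begin();		/* begin/end bracket the whole operation */
		ret = do_flash();
		notify_end();		/* sent even on failure, as above */
		return ret;
	}
]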
(nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) || - nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS, - test_status)) - goto nla_put_failure; - - nla_nest_end(skb, result_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, result_attr); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_selftests_run(struct sk_buff *skb, - struct genl_info *info) -{ - struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1]; - struct devlink *devlink = info->user_ptr[0]; - struct nlattr *attrs, *selftests; - struct sk_buff *msg; - void *hdr; - int err; - int i; - - if (!devlink->ops->selftest_run || !devlink->ops->selftest_check) - return -EOPNOTSUPP; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS)) - return -EINVAL; - - attrs = info->attrs[DEVLINK_ATTR_SELFTESTS]; - - err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs, - devlink_selftest_nl_policy, info->extack); - if (err < 0) - return err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = -EMSGSIZE; - hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, - &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN); - if (!hdr) - goto free_msg; - - if (devlink_nl_put_handle(msg, devlink)) - goto genlmsg_cancel; - - selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); - if (!selftests) - goto genlmsg_cancel; - - for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; - i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { - enum devlink_selftest_status test_status; - - if (nla_get_flag(tb[i])) { - if (!devlink->ops->selftest_check(devlink, i, - info->extack)) { - if (devlink_selftest_result_put(msg, i, - DEVLINK_SELFTEST_STATUS_SKIP)) - goto selftests_nest_cancel; - continue; - } - - test_status = devlink->ops->selftest_run(devlink, i, - info->extack); - if (devlink_selftest_result_put(msg, i, test_status)) - goto selftests_nest_cancel; - } - } - - nla_nest_end(msg, selftests); - genlmsg_end(msg, hdr); - return genlmsg_reply(msg, info); - -selftests_nest_cancel: - nla_nest_cancel(msg, selftests); -genlmsg_cancel: - genlmsg_cancel(msg, hdr); -free_msg: - nlmsg_free(msg); - return err; -} - -static const struct devlink_param devlink_param_generic[] = { - { - .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, - .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME, - .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS, - .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME, - .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, - .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME, - .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, - .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, - .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, - .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME, - .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, - .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME, - .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, - .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, - .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, - .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, - .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, - .name 
= DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME,
-		.type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE,
-	},
-	{
-		.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
-		.name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME,
-		.type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE,
-	},
-	{
-		.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET,
-		.name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME,
-		.type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE,
-	},
-	{
-		.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
-		.name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME,
-		.type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE,
-	},
-	{
-		.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA,
-		.name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME,
-		.type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE,
-	},
-	{
-		.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET,
-		.name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME,
-		.type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE,
-	},
-	{
-		.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP,
-		.name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME,
-		.type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE,
-	},
-	{
-		.id = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE,
-		.name = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME,
-		.type = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE,
-	},
-	{
-		.id = DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE,
-		.name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME,
-		.type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE,
-	},
-};
-
-static int devlink_param_generic_verify(const struct devlink_param *param)
-{
-	/* verify it match generic parameter by id and name */
-	if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX)
-		return -EINVAL;
-	if (strcmp(param->name, devlink_param_generic[param->id].name))
-		return -ENOENT;
-
-	WARN_ON(param->type != devlink_param_generic[param->id].type);
-
-	return 0;
-}
-
-static int devlink_param_driver_verify(const struct devlink_param *param)
-{
-	int i;
-
-	if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX)
-		return -EINVAL;
-	/* verify no such name in generic params */
-	for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++)
-		if (!strcmp(param->name, devlink_param_generic[i].name))
-			return -EEXIST;
-
-	return 0;
-}
-
-static struct devlink_param_item *
-devlink_param_find_by_name(struct list_head *param_list,
-			   const char *param_name)
-{
-	struct devlink_param_item *param_item;
-
-	list_for_each_entry(param_item, param_list, list)
-		if (!strcmp(param_item->param->name, param_name))
-			return param_item;
-	return NULL;
-}
-
-static struct devlink_param_item *
-devlink_param_find_by_id(struct list_head *param_list, u32 param_id)
-{
-	struct devlink_param_item *param_item;
-
-	list_for_each_entry(param_item, param_list, list)
-		if (param_item->param->id == param_id)
-			return param_item;
-	return NULL;
-}
-
-static bool
-devlink_param_cmode_is_supported(const struct devlink_param *param,
-				 enum devlink_param_cmode cmode)
-{
-	return test_bit(cmode, &param->supported_cmodes);
-}
-
-static int devlink_param_get(struct devlink *devlink,
-			     const struct devlink_param *param,
-			     struct devlink_param_gset_ctx *ctx)
-{
-	if (!param->get || devlink->reload_failed)
-		return -EOPNOTSUPP;
-	return param->get(devlink, param->id, ctx);
-}
-
-static int devlink_param_set(struct devlink *devlink,
-			     const struct devlink_param *param,
-			     struct devlink_param_gset_ctx *ctx)
-{
-	if (!param->set || devlink->reload_failed)
-		return -EOPNOTSUPP;
-	return param->set(devlink, param->id, ctx);
-}
-
-static int
-devlink_param_type_to_nla_type(enum devlink_param_type param_type)
-{
-	switch (param_type) {
-	case DEVLINK_PARAM_TYPE_U8:
-		return NLA_U8;
-
case DEVLINK_PARAM_TYPE_U16: - return NLA_U16; - case DEVLINK_PARAM_TYPE_U32: - return NLA_U32; - case DEVLINK_PARAM_TYPE_STRING: - return NLA_STRING; - case DEVLINK_PARAM_TYPE_BOOL: - return NLA_FLAG; - default: - return -EINVAL; - } -} - -static int -devlink_nl_param_value_fill_one(struct sk_buff *msg, - enum devlink_param_type type, - enum devlink_param_cmode cmode, - union devlink_param_value val) -{ - struct nlattr *param_value_attr; - - param_value_attr = nla_nest_start_noflag(msg, - DEVLINK_ATTR_PARAM_VALUE); - if (!param_value_attr) - goto nla_put_failure; - - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) - goto value_nest_cancel; - - switch (type) { - case DEVLINK_PARAM_TYPE_U8: - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8)) - goto value_nest_cancel; - break; - case DEVLINK_PARAM_TYPE_U16: - if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16)) - goto value_nest_cancel; - break; - case DEVLINK_PARAM_TYPE_U32: - if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) - goto value_nest_cancel; - break; - case DEVLINK_PARAM_TYPE_STRING: - if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, - val.vstr)) - goto value_nest_cancel; - break; - case DEVLINK_PARAM_TYPE_BOOL: - if (val.vbool && - nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA)) - goto value_nest_cancel; - break; - } - - nla_nest_end(msg, param_value_attr); - return 0; - -value_nest_cancel: - nla_nest_cancel(msg, param_value_attr); -nla_put_failure: - return -EMSGSIZE; -} - -static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, - unsigned int port_index, - struct devlink_param_item *param_item, - enum devlink_command cmd, - u32 portid, u32 seq, int flags) -{ - union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; - bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {}; - const struct devlink_param *param = param_item->param; - struct devlink_param_gset_ctx ctx; - struct nlattr *param_values_list; - struct nlattr *param_attr; - int nla_type; - void *hdr; - int err; - int i; - - /* Get value from driver part to driverinit configuration mode */ - for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { - if (!devlink_param_cmode_is_supported(param, i)) - continue; - if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) { - if (!param_item->driverinit_value_valid) - return -EOPNOTSUPP; - param_value[i] = param_item->driverinit_value; - } else { - ctx.cmode = i; - err = devlink_param_get(devlink, param, &ctx); - if (err) - return err; - param_value[i] = ctx.val; - } - param_value_set[i] = true; - } - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto genlmsg_cancel; - - if (cmd == DEVLINK_CMD_PORT_PARAM_GET || - cmd == DEVLINK_CMD_PORT_PARAM_NEW || - cmd == DEVLINK_CMD_PORT_PARAM_DEL) - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index)) - goto genlmsg_cancel; - - param_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM); - if (!param_attr) - goto genlmsg_cancel; - if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name)) - goto param_nest_cancel; - if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) - goto param_nest_cancel; - - nla_type = devlink_param_type_to_nla_type(param->type); - if (nla_type < 0) - goto param_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type)) - goto param_nest_cancel; - - param_values_list = nla_nest_start_noflag(msg, - DEVLINK_ATTR_PARAM_VALUES_LIST); - if (!param_values_list) - goto 
param_nest_cancel; - - for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { - if (!param_value_set[i]) - continue; - err = devlink_nl_param_value_fill_one(msg, param->type, - i, param_value[i]); - if (err) - goto values_list_nest_cancel; - } - - nla_nest_end(msg, param_values_list); - nla_nest_end(msg, param_attr); - genlmsg_end(msg, hdr); - return 0; - -values_list_nest_cancel: - nla_nest_end(msg, param_values_list); -param_nest_cancel: - nla_nest_cancel(msg, param_attr); -genlmsg_cancel: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_param_notify(struct devlink *devlink, - unsigned int port_index, - struct devlink_param_item *param_item, - enum devlink_command cmd) -{ - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && - cmd != DEVLINK_CMD_PORT_PARAM_NEW && - cmd != DEVLINK_CMD_PORT_PARAM_DEL); - ASSERT_DEVLINK_REGISTERED(devlink); - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd, - 0, 0, 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink_param_item *param_item; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(param_item, &devlink->param_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_param_fill(msg, devlink, 0, param_item, - DEVLINK_CMD_PARAM_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; - } - devl_unlock(devlink); - devlink_put(devlink); - } -out: - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - -static int -devlink_param_type_get_from_info(struct genl_info *info, - enum devlink_param_type *param_type) -{ - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE)) - return -EINVAL; - - switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { - case NLA_U8: - *param_type = DEVLINK_PARAM_TYPE_U8; - break; - case NLA_U16: - *param_type = DEVLINK_PARAM_TYPE_U16; - break; - case NLA_U32: - *param_type = DEVLINK_PARAM_TYPE_U32; - break; - case NLA_STRING: - *param_type = DEVLINK_PARAM_TYPE_STRING; - break; - case NLA_FLAG: - *param_type = DEVLINK_PARAM_TYPE_BOOL; - break; - default: - return -EINVAL; - } - - return 0; -} - -static int -devlink_param_value_get_from_info(const struct devlink_param *param, - struct genl_info *info, - union devlink_param_value *value) -{ - struct nlattr *param_data; - int len; - - param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]; - - if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data) - return -EINVAL; - - switch (param->type) { - case DEVLINK_PARAM_TYPE_U8: - if (nla_len(param_data) != sizeof(u8)) - return -EINVAL; - value->vu8 = nla_get_u8(param_data); - break; - case DEVLINK_PARAM_TYPE_U16: - if (nla_len(param_data) != sizeof(u16)) - return -EINVAL; - value->vu16 = nla_get_u16(param_data); - break; - case DEVLINK_PARAM_TYPE_U32: - if (nla_len(param_data) != sizeof(u32)) - return -EINVAL; - value->vu32 = nla_get_u32(param_data); - break; - 
case DEVLINK_PARAM_TYPE_STRING:
-		len = strnlen(nla_data(param_data), nla_len(param_data));
-		if (len == nla_len(param_data) ||
-		    len >= __DEVLINK_PARAM_MAX_STRING_VALUE)
-			return -EINVAL;
-		strcpy(value->vstr, nla_data(param_data));
-		break;
-	case DEVLINK_PARAM_TYPE_BOOL:
-		if (param_data && nla_len(param_data))
-			return -EINVAL;
-		value->vbool = nla_get_flag(param_data);
-		break;
-	}
-	return 0;
-}
-
-static struct devlink_param_item *
-devlink_param_get_from_info(struct list_head *param_list,
-			    struct genl_info *info)
-{
-	char *param_name;
-
-	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME))
-		return NULL;
-
-	param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]);
-	return devlink_param_find_by_name(param_list, param_name);
-}
-
-static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
-					 struct genl_info *info)
-{
-	struct devlink *devlink = info->user_ptr[0];
-	struct devlink_param_item *param_item;
-	struct sk_buff *msg;
-	int err;
-
-	param_item = devlink_param_get_from_info(&devlink->param_list, info);
-	if (!param_item)
-		return -EINVAL;
-
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
-		return -ENOMEM;
-
-	err = devlink_nl_param_fill(msg, devlink, 0, param_item,
-				    DEVLINK_CMD_PARAM_GET,
-				    info->snd_portid, info->snd_seq, 0);
-	if (err) {
-		nlmsg_free(msg);
-		return err;
-	}
-
-	return genlmsg_reply(msg, info);
-}
-
-static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
-					   unsigned int port_index,
-					   struct list_head *param_list,
-					   struct genl_info *info,
-					   enum devlink_command cmd)
-{
-	enum devlink_param_type param_type;
-	struct devlink_param_gset_ctx ctx;
-	enum devlink_param_cmode cmode;
-	struct devlink_param_item *param_item;
-	const struct devlink_param *param;
-	union devlink_param_value value;
-	int err = 0;
-
-	param_item = devlink_param_get_from_info(param_list, info);
-	if (!param_item)
-		return -EINVAL;
-	param = param_item->param;
-	err = devlink_param_type_get_from_info(info, &param_type);
-	if (err)
-		return err;
-	if (param_type != param->type)
-		return -EINVAL;
-	err = devlink_param_value_get_from_info(param, info, &value);
-	if (err)
-		return err;
-	if (param->validate) {
-		err = param->validate(devlink, param->id, value, info->extack);
-		if (err)
-			return err;
-	}
-
-	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE))
-		return -EINVAL;
-	cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]);
-	if (!devlink_param_cmode_is_supported(param, cmode))
-		return -EOPNOTSUPP;
-
-	if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) {
-		if (param->type == DEVLINK_PARAM_TYPE_STRING)
-			strcpy(param_item->driverinit_value.vstr, value.vstr);
-		else
-			param_item->driverinit_value = value;
-		param_item->driverinit_value_valid = true;
-	} else {
-		if (!param->set)
-			return -EOPNOTSUPP;
-		ctx.val = value;
-		ctx.cmode = cmode;
-		err = devlink_param_set(devlink, param, &ctx);
-		if (err)
-			return err;
-	}
-
-	devlink_param_notify(devlink, port_index, param_item, cmd);
-	return 0;
-}
-
-static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
-					 struct genl_info *info)
-{
-	struct devlink *devlink = info->user_ptr[0];
-
-	return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->param_list,
-					       info, DEVLINK_CMD_PARAM_NEW);
-}
-
-static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
-						struct netlink_callback *cb)
-{
-	NL_SET_ERR_MSG_MOD(cb->extack, "Port params are not supported");
-	return msg->len;
-}
-
-static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb,
-					      struct genl_info *info)
-{
-	NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported");
-	return -EINVAL;
-}
-
-static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb,
-					      struct genl_info *info)
-{
-	NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported");
-	return -EINVAL;
-}
-
-static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
-					     struct devlink *devlink,
-					     struct devlink_snapshot *snapshot)
-{
-	struct nlattr *snap_attr;
-	int err;
-
-	snap_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOT);
-	if (!snap_attr)
-		return -EINVAL;
-
-	err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id);
-	if (err)
-		goto nla_put_failure;
-
-	nla_nest_end(msg, snap_attr);
-	return 0;
-
-nla_put_failure:
-	nla_nest_cancel(msg, snap_attr);
-	return err;
-}
-
-static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg,
-					      struct devlink *devlink,
-					      struct devlink_region *region)
-{
-	struct devlink_snapshot *snapshot;
-	struct nlattr *snapshots_attr;
-	int err;
-
-	snapshots_attr = nla_nest_start_noflag(msg,
-					       DEVLINK_ATTR_REGION_SNAPSHOTS);
-	if (!snapshots_attr)
-		return -EINVAL;
-
-	list_for_each_entry(snapshot, &region->snapshot_list, list) {
-		err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot);
-		if (err)
-			goto nla_put_failure;
-	}
-
-	nla_nest_end(msg, snapshots_attr);
-	return 0;
-
-nla_put_failure:
-	nla_nest_cancel(msg, snapshots_attr);
-	return err;
-}
-
-static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
-				  enum devlink_command cmd, u32 portid,
-				  u32 seq, int flags,
-				  struct devlink_region *region)
-{
-	void *hdr;
-	int err;
-
-	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
-	if (!hdr)
-		return -EMSGSIZE;
-
-	err = devlink_nl_put_handle(msg, devlink);
-	if (err)
-		goto nla_put_failure;
-
-	if (region->port) {
-		err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
-				  region->port->index);
-		if (err)
-			goto nla_put_failure;
-	}
-
-	err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name);
-	if (err)
-		goto nla_put_failure;
-
-	err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
-				region->size,
-				DEVLINK_ATTR_PAD);
-	if (err)
-		goto nla_put_failure;
-
-	err = nla_put_u32(msg, DEVLINK_ATTR_REGION_MAX_SNAPSHOTS,
-			  region->max_snapshots);
-	if (err)
-		goto nla_put_failure;
-
-	err = devlink_nl_region_snapshots_id_put(msg, devlink, region);
-	if (err)
-		goto nla_put_failure;
-
-	genlmsg_end(msg, hdr);
-	return 0;
-
-nla_put_failure:
-	genlmsg_cancel(msg, hdr);
-	return err;
-}
-
-static struct sk_buff *
-devlink_nl_region_notify_build(struct devlink_region *region,
-			       struct devlink_snapshot *snapshot,
-			       enum devlink_command cmd, u32 portid, u32 seq)
-{
-	struct devlink *devlink = region->devlink;
-	struct sk_buff *msg;
-	void *hdr;
-	int err;
-
-
-	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
-		return ERR_PTR(-ENOMEM);
-
-	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd);
-	if (!hdr) {
-		err = -EMSGSIZE;
-		goto out_free_msg;
-	}
-
-	err = devlink_nl_put_handle(msg, devlink);
-	if (err)
-		goto out_cancel_msg;
-
-	if (region->port) {
-		err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
-				  region->port->index);
-		if (err)
-			goto out_cancel_msg;
-	}
-
-	err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
-			     region->ops->name);
-	if (err)
-		goto out_cancel_msg;
-
-	if (snapshot) {
-		err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID,
-				  snapshot->id);
-		if (err)
-			goto out_cancel_msg;
-	} else {
-		err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
-
region->size, DEVLINK_ATTR_PAD); - if (err) - goto out_cancel_msg; - } - genlmsg_end(msg, hdr); - - return msg; - -out_cancel_msg: - genlmsg_cancel(msg, hdr); -out_free_msg: - nlmsg_free(msg); - return ERR_PTR(err); -} - -static void devlink_nl_region_notify(struct devlink_region *region, - struct devlink_snapshot *snapshot, - enum devlink_command cmd) -{ - struct devlink *devlink = region->devlink; - struct sk_buff *msg; - - WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); - if (IS_ERR(msg)) - return; - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, - 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -/** - * __devlink_snapshot_id_increment - Increment number of snapshots using an id - * @devlink: devlink instance - * @id: the snapshot id - * - * Track when a new snapshot begins using an id. Load the count for the - * given id from the snapshot xarray, increment it, and store it back. - * - * Called when a new snapshot is created with the given id. - * - * The id *must* have been previously allocated by - * devlink_region_snapshot_id_get(). - * - * Returns 0 on success, or an error on failure. - */ -static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id) -{ - unsigned long count; - void *p; - int err; - - xa_lock(&devlink->snapshot_ids); - p = xa_load(&devlink->snapshot_ids, id); - if (WARN_ON(!p)) { - err = -EINVAL; - goto unlock; - } - - if (WARN_ON(!xa_is_value(p))) { - err = -EINVAL; - goto unlock; - } - - count = xa_to_value(p); - count++; - - err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), - GFP_ATOMIC)); -unlock: - xa_unlock(&devlink->snapshot_ids); - return err; -} - -/** - * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id - * @devlink: devlink instance - * @id: the snapshot id - * - * Track when a snapshot is deleted and stops using an id. Load the count - * for the given id from the snapshot xarray, decrement it, and store it - * back. - * - * If the count reaches zero, erase this id from the xarray, freeing it - * up for future re-use by devlink_region_snapshot_id_get(). - * - * Called when a snapshot using the given id is deleted, and when the - * initial allocator of the id is finished using it. - */ -static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id) -{ - unsigned long count; - void *p; - - xa_lock(&devlink->snapshot_ids); - p = xa_load(&devlink->snapshot_ids, id); - if (WARN_ON(!p)) - goto unlock; - - if (WARN_ON(!xa_is_value(p))) - goto unlock; - - count = xa_to_value(p); - - if (count > 1) { - count--; - __xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), - GFP_ATOMIC); - } else { - /* If this was the last user, we can erase this id */ - __xa_erase(&devlink->snapshot_ids, id); - } -unlock: - xa_unlock(&devlink->snapshot_ids); -} - -/** - * __devlink_snapshot_id_insert - Insert a specific snapshot ID - * @devlink: devlink instance - * @id: the snapshot id - * - * Mark the given snapshot id as used by inserting a zero value into the - * snapshot xarray. - * - * This must be called while holding the devlink instance lock. Unlike - * devlink_snapshot_id_get, the initial reference count is zero, not one. - * It is expected that the id will immediately be used before - * releasing the devlink instance lock. 
- *
- * Returns zero on success, or an error code if the snapshot id could not
- * be inserted.
- */
-static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id)
-{
-	int err;
-
-	xa_lock(&devlink->snapshot_ids);
-	if (xa_load(&devlink->snapshot_ids, id)) {
-		xa_unlock(&devlink->snapshot_ids);
-		return -EEXIST;
-	}
-	err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(0),
-				GFP_ATOMIC));
-	xa_unlock(&devlink->snapshot_ids);
-	return err;
-}
-
-/**
- * __devlink_region_snapshot_id_get - get snapshot ID
- * @devlink: devlink instance
- * @id: storage to return snapshot id
- *
- * Allocates a new snapshot id. Returns zero on success, or a negative
- * error on failure. Must be called while holding the devlink instance
- * lock.
- *
- * Snapshot IDs are tracked using an xarray which stores the number of
- * users of the snapshot id.
- *
- * Note that the caller of this function counts as a 'user', in order to
- * avoid race conditions. The caller must release its hold on the
- * snapshot by using devlink_region_snapshot_id_put.
- */
-static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
-{
-	return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1),
-			xa_limit_32b, GFP_KERNEL);
-}
-
-/**
- * __devlink_region_snapshot_create - create a new snapshot
- * This will add a new snapshot of a region. The snapshot
- * will be stored on the region struct and can be accessed
- * from devlink. This is useful for future analyses of snapshots.
- * Multiple snapshots can be created on a region.
- * The @snapshot_id should be obtained using the getter function.
- *
- * Must be called only while holding the region snapshot lock.
- *
- * @region: devlink region of the snapshot
- * @data: snapshot data
- * @snapshot_id: snapshot id to be created
- */
-static int
-__devlink_region_snapshot_create(struct devlink_region *region,
-				 u8 *data, u32 snapshot_id)
-{
-	struct devlink *devlink = region->devlink;
-	struct devlink_snapshot *snapshot;
-	int err;
-
-	lockdep_assert_held(&region->snapshot_lock);
-
-	/* check if region can hold one more snapshot */
-	if (region->cur_snapshots == region->max_snapshots)
-		return -ENOSPC;
-
-	if (devlink_region_snapshot_get_by_id(region, snapshot_id))
-		return -EEXIST;
-
-	snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
-	if (!snapshot)
-		return -ENOMEM;
-
-	err = __devlink_snapshot_id_increment(devlink, snapshot_id);
-	if (err)
-		goto err_snapshot_id_increment;
-
-	snapshot->id = snapshot_id;
-	snapshot->region = region;
-	snapshot->data = data;
-
-	list_add_tail(&snapshot->list, &region->snapshot_list);
-
-	region->cur_snapshots++;
-
-	devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
-	return 0;
-
-err_snapshot_id_increment:
-	kfree(snapshot);
-	return err;
-}
-
-static void devlink_region_snapshot_del(struct devlink_region *region,
-					struct devlink_snapshot *snapshot)
-{
-	struct devlink *devlink = region->devlink;
-
-	lockdep_assert_held(&region->snapshot_lock);
-
-	devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
-	region->cur_snapshots--;
-	list_del(&snapshot->list);
-	region->ops->destructor(snapshot->data);
-	__devlink_snapshot_id_decrement(devlink, snapshot->id);
-	kfree(snapshot);
-}
-
-static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
-					  struct genl_info *info)
-{
-	struct devlink *devlink = info->user_ptr[0];
-	struct devlink_port *port = NULL;
-	struct devlink_region *region;
-	const char *region_name;
-	struct sk_buff *msg;
-	unsigned int index;
-	int err;
-
-	if
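[Illustration: the kernel-doc above describes snapshot ids as xarray entries whose stored value is a use count -- the _get() helper allocates an id with count 1, _insert() reserves a caller-chosen id at count 0, and increment/decrement adjust the count, erasing the id once it drops to zero. A minimal userspace model of those semantics; a plain array stands in for the xarray and all names are invented:

	#include <stdio.h>

	#define MAX_IDS 8
	static long count[MAX_IDS];
	static int  used[MAX_IDS];

	static int id_get(unsigned *id)		/* cf. __devlink_region_snapshot_id_get() */
	{
		for (unsigned i = 0; i < MAX_IDS; i++)
			if (!used[i]) {
				used[i] = 1;
				count[i] = 1;	/* the caller holds a reference */
				*id = i;
				return 0;
			}
		return -1;
	}

	static int id_insert(unsigned id)	/* cf. __devlink_snapshot_id_insert() */
	{
		if (used[id])
			return -1;		/* -EEXIST */
		used[id] = 1;
		count[id] = 0;			/* note: zero initial refcount */
		return 0;
	}

	static void id_increment(unsigned id) { count[id]++; }

	static void id_decrement(unsigned id)	/* erase the id on the last put */
	{
		if (--count[id] <= 0)
			used[id] = 0;
	}

	int main(void)
	{
		unsigned id;

		id_get(&id);		/* count == 1 */
		id_increment(id);	/* snapshot created, count == 2 */
		id_decrement(id);	/* allocator done, count == 1 */
		id_decrement(id);	/* snapshot deleted, id freed */
		printf("id %u free: %s\n", id, used[id] ? "no" : "yes");
		return 0;
	}
]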
(GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) - return -EINVAL; - - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { - index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - - port = devlink_port_get_by_index(devlink, index); - if (!port) - return -ENODEV; - } - - region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); - if (port) - region = devlink_port_region_get_by_name(port, region_name); - else - region = devlink_region_get_by_name(devlink, region_name); - - if (!region) - return -EINVAL; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, - info->snd_portid, info->snd_seq, 0, - region); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg, - struct netlink_callback *cb, - struct devlink_port *port, - int *idx, - int start) -{ - struct devlink_region *region; - int err = 0; - - list_for_each_entry(region, &port->region_list, list) { - if (*idx < start) { - (*idx)++; - continue; - } - err = devlink_nl_region_fill(msg, port->devlink, - DEVLINK_CMD_REGION_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, region); - if (err) - goto out; - (*idx)++; - } - -out: - return err; -} - -static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg, - struct netlink_callback *cb, - struct devlink *devlink, - int *idx, - int start) -{ - struct devlink_region *region; - struct devlink_port *port; - unsigned long port_index; - int err = 0; - - devl_lock(devlink); - list_for_each_entry(region, &devlink->region_list, list) { - if (*idx < start) { - (*idx)++; - continue; - } - err = devlink_nl_region_fill(msg, devlink, - DEVLINK_CMD_REGION_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, region); - if (err) - goto out; - (*idx)++; - } - - xa_for_each(&devlink->ports, port_index, port) { - err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, idx, - start); - if (err) - goto out; - } - -out: - devl_unlock(devlink); - return err; -} - -static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink, - &idx, start); - devlink_put(devlink); - if (err) - goto out; - } -out: - cb->args[0] = idx; - return msg->len; -} - -static int devlink_nl_cmd_region_del(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_snapshot *snapshot; - struct devlink_port *port = NULL; - struct devlink_region *region; - const char *region_name; - unsigned int index; - u32 snapshot_id; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) || - GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID)) - return -EINVAL; - - region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); - snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); - - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { - index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - - port = devlink_port_get_by_index(devlink, index); - if (!port) - return -ENODEV; - } - - if (port) - region = devlink_port_region_get_by_name(port, region_name); - else - region = devlink_region_get_by_name(devlink, region_name); - - if 
(!region)
-		return -EINVAL;
-
-	mutex_lock(&region->snapshot_lock);
-	snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
-	if (!snapshot) {
-		mutex_unlock(&region->snapshot_lock);
-		return -EINVAL;
-	}
-
-	devlink_region_snapshot_del(region, snapshot);
-	mutex_unlock(&region->snapshot_lock);
-	return 0;
-}
-
-static int
-devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
-{
-	struct devlink *devlink = info->user_ptr[0];
-	struct devlink_snapshot *snapshot;
-	struct devlink_port *port = NULL;
-	struct nlattr *snapshot_id_attr;
-	struct devlink_region *region;
-	const char *region_name;
-	unsigned int index;
-	u32 snapshot_id;
-	u8 *data;
-	int err;
-
-	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) {
-		NL_SET_ERR_MSG_MOD(info->extack, "No region name provided");
-		return -EINVAL;
-	}
-
-	region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
-
-	if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
-		index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
-
-		port = devlink_port_get_by_index(devlink, index);
-		if (!port)
-			return -ENODEV;
-	}
-
-	if (port)
-		region = devlink_port_region_get_by_name(port, region_name);
-	else
-		region = devlink_region_get_by_name(devlink, region_name);
-
-	if (!region) {
-		NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not exist");
-		return -EINVAL;
-	}
-
-	if (!region->ops->snapshot) {
-		NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not support taking an immediate snapshot");
-		return -EOPNOTSUPP;
-	}
-
-	mutex_lock(&region->snapshot_lock);
-
-	if (region->cur_snapshots == region->max_snapshots) {
-		NL_SET_ERR_MSG_MOD(info->extack, "The region has reached the maximum number of stored snapshots");
-		err = -ENOSPC;
-		goto unlock;
-	}
-
-	snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID];
-	if (snapshot_id_attr) {
-		snapshot_id = nla_get_u32(snapshot_id_attr);
-
-		if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
-			NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use");
-			err = -EEXIST;
-			goto unlock;
-		}
-
-		err = __devlink_snapshot_id_insert(devlink, snapshot_id);
-		if (err)
-			goto unlock;
-	} else {
-		err = __devlink_region_snapshot_id_get(devlink, &snapshot_id);
-		if (err) {
-			NL_SET_ERR_MSG_MOD(info->extack, "Failed to allocate a new snapshot id");
-			goto unlock;
-		}
-	}
-
-	if (port)
-		err = region->port_ops->snapshot(port, region->port_ops,
-						 info->extack, &data);
-	else
-		err = region->ops->snapshot(devlink, region->ops,
-					    info->extack, &data);
-	if (err)
-		goto err_snapshot_capture;
-
-	err = __devlink_region_snapshot_create(region, data, snapshot_id);
-	if (err)
-		goto err_snapshot_create;
-
-	if (!snapshot_id_attr) {
-		struct sk_buff *msg;
-
-		snapshot = devlink_region_snapshot_get_by_id(region,
-							     snapshot_id);
-		if (WARN_ON(!snapshot)) {
-			err = -EINVAL;
-			goto unlock;
-		}
-
-		msg = devlink_nl_region_notify_build(region, snapshot,
-						     DEVLINK_CMD_REGION_NEW,
-						     info->snd_portid,
-						     info->snd_seq);
-		err = PTR_ERR_OR_ZERO(msg);
-		if (err)
-			goto err_notify;
-
-		err = genlmsg_reply(msg, info);
-		if (err)
-			goto err_notify;
-	}
-
-	mutex_unlock(&region->snapshot_lock);
-	return 0;
-
-err_snapshot_create:
-	region->ops->destructor(data);
-err_snapshot_capture:
-	__devlink_snapshot_id_decrement(devlink, snapshot_id);
-	mutex_unlock(&region->snapshot_lock);
-	return err;
-
-err_notify:
-	devlink_region_snapshot_del(region, snapshot);
-unlock:
-	mutex_unlock(&region->snapshot_lock);
-	return err;
-}
-
-static int devlink_nl_cmd_region_read_chunk_fill(struct
sk_buff *msg, - u8 *chunk, u32 chunk_size, - u64 addr) -{ - struct nlattr *chunk_attr; - int err; - - chunk_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_CHUNK); - if (!chunk_attr) - return -EINVAL; - - err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk); - if (err) - goto nla_put_failure; - - err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr, - DEVLINK_ATTR_PAD); - if (err) - goto nla_put_failure; - - nla_nest_end(msg, chunk_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(msg, chunk_attr); - return err; -} - -#define DEVLINK_REGION_READ_CHUNK_SIZE 256 - -typedef int devlink_chunk_fill_t(void *cb_priv, u8 *chunk, u32 chunk_size, - u64 curr_offset, - struct netlink_ext_ack *extack); - -static int -devlink_nl_region_read_fill(struct sk_buff *skb, devlink_chunk_fill_t *cb, - void *cb_priv, u64 start_offset, u64 end_offset, - u64 *new_offset, struct netlink_ext_ack *extack) -{ - u64 curr_offset = start_offset; - int err = 0; - u8 *data; - - /* Allocate and re-use a single buffer */ - data = kmalloc(DEVLINK_REGION_READ_CHUNK_SIZE, GFP_KERNEL); - if (!data) - return -ENOMEM; - - *new_offset = start_offset; - - while (curr_offset < end_offset) { - u32 data_size; - - data_size = min_t(u32, end_offset - curr_offset, - DEVLINK_REGION_READ_CHUNK_SIZE); - - err = cb(cb_priv, data, data_size, curr_offset, extack); - if (err) - break; - - err = devlink_nl_cmd_region_read_chunk_fill(skb, data, data_size, curr_offset); - if (err) - break; - - curr_offset += data_size; - } - *new_offset = curr_offset; - - kfree(data); - - return err; -} - -static int -devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size, - u64 curr_offset, - struct netlink_ext_ack __always_unused *extack) -{ - struct devlink_snapshot *snapshot = cb_priv; - - memcpy(chunk, &snapshot->data[curr_offset], chunk_size); - - return 0; -} - -static int -devlink_region_port_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, - u64 curr_offset, struct netlink_ext_ack *extack) -{ - struct devlink_region *region = cb_priv; - - return region->port_ops->read(region->port, region->port_ops, extack, - curr_offset, chunk_size, chunk); -} - -static int -devlink_region_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, - u64 curr_offset, struct netlink_ext_ack *extack) -{ - struct devlink_region *region = cb_priv; - - return region->ops->read(region->devlink, region->ops, extack, - curr_offset, chunk_size, chunk); -} - -static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) -{ - const struct genl_dumpit_info *info = genl_dumpit_info(cb); - struct nlattr *chunks_attr, *region_attr, *snapshot_attr; - u64 ret_offset, start_offset, end_offset = U64_MAX; - struct nlattr **attrs = info->attrs; - struct devlink_port *port = NULL; - devlink_chunk_fill_t *region_cb; - struct devlink_region *region; - const char *region_name; - struct devlink *devlink; - unsigned int index; - void *region_cb_priv; - void *hdr; - int err; - - start_offset = *((u64 *)&cb->args[0]); - - devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); - if (IS_ERR(devlink)) - return PTR_ERR(devlink); - - devl_lock(devlink); - - if (!attrs[DEVLINK_ATTR_REGION_NAME]) { - NL_SET_ERR_MSG(cb->extack, "No region name provided"); - err = -EINVAL; - goto out_unlock; - } - - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { - index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - - port = devlink_port_get_by_index(devlink, index); - if (!port) { - err = -ENODEV; - goto out_unlock; - } 
- } - - region_attr = attrs[DEVLINK_ATTR_REGION_NAME]; - region_name = nla_data(region_attr); - - if (port) - region = devlink_port_region_get_by_name(port, region_name); - else - region = devlink_region_get_by_name(devlink, region_name); - - if (!region) { - NL_SET_ERR_MSG_ATTR(cb->extack, region_attr, "Requested region does not exist"); - err = -EINVAL; - goto out_unlock; - } - - snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; - if (!snapshot_attr) { - if (!nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { - NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); - err = -EINVAL; - goto out_unlock; - } - - if (!region->ops->read) { - NL_SET_ERR_MSG(cb->extack, "Requested region does not support direct read"); - err = -EOPNOTSUPP; - goto out_unlock; - } - - if (port) - region_cb = &devlink_region_port_direct_fill; - else - region_cb = &devlink_region_direct_fill; - region_cb_priv = region; - } else { - struct devlink_snapshot *snapshot; - u32 snapshot_id; - - if (nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { - NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Direct region read does not use snapshot"); - err = -EINVAL; - goto out_unlock; - } - - snapshot_id = nla_get_u32(snapshot_attr); - snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); - if (!snapshot) { - NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); - err = -EINVAL; - goto out_unlock; - } - region_cb = &devlink_region_snapshot_fill; - region_cb_priv = snapshot; - } - - if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && - attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { - if (!start_offset) - start_offset = - nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - - end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); - } - - if (end_offset > region->size) - end_offset = region->size; - - /* return 0 if there is no further data to read */ - if (start_offset == end_offset) { - err = 0; - goto out_unlock; - } - - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, - DEVLINK_CMD_REGION_READ); - if (!hdr) { - err = -EMSGSIZE; - goto out_unlock; - } - - err = devlink_nl_put_handle(skb, devlink); - if (err) - goto nla_put_failure; - - if (region->port) { - err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX, - region->port->index); - if (err) - goto nla_put_failure; - } - - err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name); - if (err) - goto nla_put_failure; - - chunks_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_REGION_CHUNKS); - if (!chunks_attr) { - err = -EMSGSIZE; - goto nla_put_failure; - } - - err = devlink_nl_region_read_fill(skb, region_cb, region_cb_priv, - start_offset, end_offset, &ret_offset, - cb->extack); - - if (err && err != -EMSGSIZE) - goto nla_put_failure; - - /* Check if there was any progress done to prevent infinite loop */ - if (ret_offset == start_offset) { - err = -EINVAL; - goto nla_put_failure; - } - - *((u64 *)&cb->args[0]) = ret_offset; - - nla_nest_end(skb, chunks_attr); - genlmsg_end(skb, hdr); - devl_unlock(devlink); - devlink_put(devlink); - return skb->len; - -nla_put_failure: - genlmsg_cancel(skb, hdr); -out_unlock: - devl_unlock(devlink); - devlink_put(devlink); - return err; -} - -int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) -{ - if (!req->msg) - return 0; - return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn); -} 
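[Illustration: the region-read dump above streams data in DEVLINK_REGION_READ_CHUNK_SIZE pieces and parks the resume offset in cb->args[0], so the next dump callback continues where the previous message filled up; it also errors out if a pass makes no progress. A standalone sketch of that resumable chunking -- buffer sizes and helper names are invented:

	#include <stdio.h>
	#include <string.h>

	#define CHUNK_SIZE 256		/* DEVLINK_REGION_READ_CHUNK_SIZE */
	#define MSG_BUDGET 600		/* pretend one netlink message fits ~600 bytes */

	/* Emit chunks from *offset until the region ends or the message is
	 * full; the caller re-invokes with the saved offset (cb->args[0]). */
	static int fill_msg(const char *region, size_t size, size_t *offset)
	{
		size_t budget = MSG_BUDGET;

		while (*offset < size) {
			size_t n = size - *offset;

			if (n > CHUNK_SIZE)
				n = CHUNK_SIZE;
			if (n > budget)
				return -1;	/* -EMSGSIZE: resume later */
			printf("chunk @%zu len %zu\n", *offset, n);
			*offset += n;
			budget -= n;
		}
		return 0;			/* done */
	}

	int main(void)
	{
		char region[1000];
		size_t offset = 0;		/* plays the role of cb->args[0] */

		memset(region, 0xab, sizeof(region));
		while (fill_msg(region, sizeof(region), &offset) == -1)
			puts("-- new message --");
		return 0;
	}
]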
-EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); - -int devlink_info_board_serial_number_put(struct devlink_info_req *req, - const char *bsn) -{ - if (!req->msg) - return 0; - return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, - bsn); -} -EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); - -static int devlink_info_version_put(struct devlink_info_req *req, int attr, - const char *version_name, - const char *version_value, - enum devlink_info_version_type version_type) -{ - struct nlattr *nest; - int err; - - if (req->version_cb) - req->version_cb(version_name, version_type, - req->version_cb_priv); - - if (!req->msg) - return 0; - - nest = nla_nest_start_noflag(req->msg, attr); - if (!nest) - return -EMSGSIZE; - - err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME, - version_name); - if (err) - goto nla_put_failure; - - err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE, - version_value); - if (err) - goto nla_put_failure; - - nla_nest_end(req->msg, nest); - - return 0; - -nla_put_failure: - nla_nest_cancel(req->msg, nest); - return err; -} - -int devlink_info_version_fixed_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED, - version_name, version_value, - DEVLINK_INFO_VERSION_TYPE_NONE); -} -EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put); - -int devlink_info_version_stored_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, - version_name, version_value, - DEVLINK_INFO_VERSION_TYPE_NONE); -} -EXPORT_SYMBOL_GPL(devlink_info_version_stored_put); - -int devlink_info_version_stored_put_ext(struct devlink_info_req *req, - const char *version_name, - const char *version_value, - enum devlink_info_version_type version_type) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, - version_name, version_value, - version_type); -} -EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext); - -int devlink_info_version_running_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, - version_name, version_value, - DEVLINK_INFO_VERSION_TYPE_NONE); -} -EXPORT_SYMBOL_GPL(devlink_info_version_running_put); - -int devlink_info_version_running_put_ext(struct devlink_info_req *req, - const char *version_name, - const char *version_value, - enum devlink_info_version_type version_type) -{ - return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, - version_name, version_value, - version_type); -} -EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext); - -static int devlink_nl_driver_info_get(struct device_driver *drv, - struct devlink_info_req *req) -{ - if (!drv) - return 0; - - if (drv->name[0]) - return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, - drv->name); - - return 0; -} - -static int -devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, struct netlink_ext_ack *extack) -{ - struct device *dev = devlink_to_dev(devlink); - struct devlink_info_req req = {}; - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - err = -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto err_cancel_msg; - - req.msg = msg; - if 
(devlink->ops->info_get) { - err = devlink->ops->info_get(devlink, &req, extack); - if (err) - goto err_cancel_msg; - } - - err = devlink_nl_driver_info_get(dev->driver, &req); - if (err) - goto err_cancel_msg; - - genlmsg_end(msg, hdr); - return 0; - -err_cancel_msg: - genlmsg_cancel(msg, hdr); - return err; -} - -static int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, - info->snd_portid, info->snd_seq, 0, - info->extack); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (idx < start) - goto inc; - - devl_lock(devlink); - err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->extack); - devl_unlock(devlink); - if (err == -EOPNOTSUPP) - err = 0; - else if (err) { - devlink_put(devlink); - break; - } -inc: - idx++; - devlink_put(devlink); - } - - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - -struct devlink_fmsg_item { - struct list_head list; - int attrtype; - u8 nla_type; - u16 len; - int value[]; -}; - -struct devlink_fmsg { - struct list_head item_list; - bool putting_binary; /* This flag forces enclosing of binary data - * in an array brackets. It forces using - * of designated API: - * devlink_fmsg_binary_pair_nest_start() - * devlink_fmsg_binary_pair_nest_end() - */ -}; - -static struct devlink_fmsg *devlink_fmsg_alloc(void) -{ - struct devlink_fmsg *fmsg; - - fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL); - if (!fmsg) - return NULL; - - INIT_LIST_HEAD(&fmsg->item_list); - - return fmsg; -} - -static void devlink_fmsg_free(struct devlink_fmsg *fmsg) -{ - struct devlink_fmsg_item *item, *tmp; - - list_for_each_entry_safe(item, tmp, &fmsg->item_list, list) { - list_del(&item->list); - kfree(item); - } - kfree(fmsg); -} - -static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, - int attrtype) -{ - struct devlink_fmsg_item *item; - - item = kzalloc(sizeof(*item), GFP_KERNEL); - if (!item) - return -ENOMEM; - - item->attrtype = attrtype; - list_add_tail(&item->list, &fmsg->item_list); - - return 0; -} - -int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start); - -static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END); -} - -int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_nest_end(fmsg); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end); - -#define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN) - -static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name) -{ - struct devlink_fmsg_item *item; - - if (fmsg->putting_binary) - return -EINVAL; - - if (strlen(name) + 1 > 
DEVLINK_FMSG_MAX_SIZE) - return -EMSGSIZE; - - item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL); - if (!item) - return -ENOMEM; - - item->nla_type = NLA_NUL_STRING; - item->len = strlen(name) + 1; - item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME; - memcpy(&item->value, name, item->len); - list_add_tail(&item->list, &fmsg->item_list); - - return 0; -} - -int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) -{ - int err; - - if (fmsg->putting_binary) - return -EINVAL; - - err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START); - if (err) - return err; - - err = devlink_fmsg_put_name(fmsg, name); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start); - -int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_nest_end(fmsg); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end); - -int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, - const char *name) -{ - int err; - - if (fmsg->putting_binary) - return -EINVAL; - - err = devlink_fmsg_pair_nest_start(fmsg, name); - if (err) - return err; - - err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start); - -int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) -{ - int err; - - if (fmsg->putting_binary) - return -EINVAL; - - err = devlink_fmsg_nest_end(fmsg); - if (err) - return err; - - err = devlink_fmsg_nest_end(fmsg); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end); - -int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg, - const char *name) -{ - int err; - - err = devlink_fmsg_arr_pair_nest_start(fmsg, name); - if (err) - return err; - - fmsg->putting_binary = true; - return err; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start); - -int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg) -{ - if (!fmsg->putting_binary) - return -EINVAL; - - fmsg->putting_binary = false; - return devlink_fmsg_arr_pair_nest_end(fmsg); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end); - -static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg, - const void *value, u16 value_len, - u8 value_nla_type) -{ - struct devlink_fmsg_item *item; - - if (value_len > DEVLINK_FMSG_MAX_SIZE) - return -EMSGSIZE; - - item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL); - if (!item) - return -ENOMEM; - - item->nla_type = value_nla_type; - item->len = value_len; - item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; - memcpy(&item->value, value, item->len); - list_add_tail(&item->list, &fmsg->item_list); - - return 0; -} - -static int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG); -} - -static int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8); -} - -int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); - -static int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_put_value(fmsg, &value, 
sizeof(value), NLA_U64); -} - -int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) -{ - if (fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, - NLA_NUL_STRING); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); - -int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len) -{ - if (!fmsg->putting_binary) - return -EINVAL; - - return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); -} -EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); - -int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, - bool value) -{ - int err; - - err = devlink_fmsg_pair_nest_start(fmsg, name); - if (err) - return err; - - err = devlink_fmsg_bool_put(fmsg, value); - if (err) - return err; - - err = devlink_fmsg_pair_nest_end(fmsg); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put); - -int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, - u8 value) -{ - int err; - - err = devlink_fmsg_pair_nest_start(fmsg, name); - if (err) - return err; - - err = devlink_fmsg_u8_put(fmsg, value); - if (err) - return err; - - err = devlink_fmsg_pair_nest_end(fmsg); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put); - -int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, - u32 value) -{ - int err; - - err = devlink_fmsg_pair_nest_start(fmsg, name); - if (err) - return err; - - err = devlink_fmsg_u32_put(fmsg, value); - if (err) - return err; - - err = devlink_fmsg_pair_nest_end(fmsg); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put); - -int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, - u64 value) -{ - int err; - - err = devlink_fmsg_pair_nest_start(fmsg, name); - if (err) - return err; - - err = devlink_fmsg_u64_put(fmsg, value); - if (err) - return err; - - err = devlink_fmsg_pair_nest_end(fmsg); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put); - -int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, - const char *value) -{ - int err; - - err = devlink_fmsg_pair_nest_start(fmsg, name); - if (err) - return err; - - err = devlink_fmsg_string_put(fmsg, value); - if (err) - return err; - - err = devlink_fmsg_pair_nest_end(fmsg); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); - -int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u32 value_len) -{ - u32 data_size; - int end_err; - u32 offset; - int err; - - err = devlink_fmsg_binary_pair_nest_start(fmsg, name); - if (err) - return err; - - for (offset = 0; offset < value_len; offset += data_size) { - data_size = value_len - offset; - if (data_size > DEVLINK_FMSG_MAX_SIZE) - data_size = DEVLINK_FMSG_MAX_SIZE; - err = devlink_fmsg_binary_put(fmsg, value + offset, data_size); - if (err) - break; - /* Exit from loop with a break (instead of - * return) to make sure putting_binary is turned off in - * devlink_fmsg_binary_pair_nest_end - */ - } - - end_err = devlink_fmsg_binary_pair_nest_end(fmsg); - if (end_err) - err = end_err; - - return err; -} -EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); - -static int -devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb) -{ - switch (msg->nla_type) { - case NLA_FLAG: - case NLA_U8: - case NLA_U32: - case NLA_U64: - case NLA_NUL_STRING: - case NLA_BINARY: - 
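/* every logged value becomes a TYPE/DATA attribute pair; this u8 tells
 * userspace how to decode the OBJ_VALUE_DATA attribute that follows
 */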
return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, - msg->nla_type); - default: - return -EINVAL; - } -} - -static int -devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb) -{ - int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; - u8 tmp; - - switch (msg->nla_type) { - case NLA_FLAG: - /* Always provide flag data, regardless of its value */ - tmp = *(bool *) msg->value; - - return nla_put_u8(skb, attrtype, tmp); - case NLA_U8: - return nla_put_u8(skb, attrtype, *(u8 *) msg->value); - case NLA_U32: - return nla_put_u32(skb, attrtype, *(u32 *) msg->value); - case NLA_U64: - return nla_put_u64_64bit(skb, attrtype, *(u64 *) msg->value, - DEVLINK_ATTR_PAD); - case NLA_NUL_STRING: - return nla_put_string(skb, attrtype, (char *) &msg->value); - case NLA_BINARY: - return nla_put(skb, attrtype, msg->len, (void *) &msg->value); - default: - return -EINVAL; - } -} - -static int -devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, - int *start) -{ - struct devlink_fmsg_item *item; - struct nlattr *fmsg_nlattr; - int i = 0; - int err; - - fmsg_nlattr = nla_nest_start_noflag(skb, DEVLINK_ATTR_FMSG); - if (!fmsg_nlattr) - return -EMSGSIZE; - - list_for_each_entry(item, &fmsg->item_list, list) { - if (i < *start) { - i++; - continue; - } - - switch (item->attrtype) { - case DEVLINK_ATTR_FMSG_OBJ_NEST_START: - case DEVLINK_ATTR_FMSG_PAIR_NEST_START: - case DEVLINK_ATTR_FMSG_ARR_NEST_START: - case DEVLINK_ATTR_FMSG_NEST_END: - err = nla_put_flag(skb, item->attrtype); - break; - case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA: - err = devlink_fmsg_item_fill_type(item, skb); - if (err) - break; - err = devlink_fmsg_item_fill_data(item, skb); - break; - case DEVLINK_ATTR_FMSG_OBJ_NAME: - err = nla_put_string(skb, item->attrtype, - (char *) &item->value); - break; - default: - err = -EINVAL; - break; - } - if (!err) - *start = ++i; - else - break; - } - - nla_nest_end(skb, fmsg_nlattr); - return err; -} - -static int devlink_fmsg_snd(struct devlink_fmsg *fmsg, - struct genl_info *info, - enum devlink_command cmd, int flags) -{ - struct nlmsghdr *nlh; - struct sk_buff *skb; - bool last = false; - int index = 0; - void *hdr; - int err; - - while (!last) { - int tmp_index = index; - - skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb) - return -ENOMEM; - - hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, - &devlink_nl_family, flags | NLM_F_MULTI, cmd); - if (!hdr) { - err = -EMSGSIZE; - goto nla_put_failure; - } - - err = devlink_fmsg_prepare_skb(fmsg, skb, &index); - if (!err) - last = true; - else if (err != -EMSGSIZE || tmp_index == index) - goto nla_put_failure; - - genlmsg_end(skb, hdr); - err = genlmsg_reply(skb, info); - if (err) - return err; - } - - skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb) - return -ENOMEM; - nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, - NLMSG_DONE, 0, flags | NLM_F_MULTI); - if (!nlh) { - err = -EMSGSIZE; - goto nla_put_failure; - } - - return genlmsg_reply(skb, info); - -nla_put_failure: - nlmsg_free(skb); - return err; -} - -static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, - struct netlink_callback *cb, - enum devlink_command cmd) -{ - int index = cb->args[0]; - int tmp_index = index; - void *hdr; - int err; - - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd); - if (!hdr) { - err = -EMSGSIZE; - goto nla_put_failure; - } - - err = devlink_fmsg_prepare_skb(fmsg, skb, &index); - if ((err && 
err != -EMSGSIZE) || tmp_index == index) - goto nla_put_failure; - - cb->args[0] = index; - genlmsg_end(skb, hdr); - return skb->len; - -nla_put_failure: - genlmsg_cancel(skb, hdr); - return err; -} - -struct devlink_health_reporter { - struct list_head list; - void *priv; - const struct devlink_health_reporter_ops *ops; - struct devlink *devlink; - struct devlink_port *devlink_port; - struct devlink_fmsg *dump_fmsg; - struct mutex dump_lock; /* lock parallel read/write from dump buffers */ - u64 graceful_period; - bool auto_recover; - bool auto_dump; - u8 health_state; - u64 dump_ts; - u64 dump_real_ts; - u64 error_count; - u64 recovery_count; - u64 last_recovery_ts; - refcount_t refcount; -}; - -void * -devlink_health_reporter_priv(struct devlink_health_reporter *reporter) -{ - return reporter->priv; -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); - -static struct devlink_health_reporter * -__devlink_health_reporter_find_by_name(struct list_head *reporter_list, - struct mutex *list_lock, - const char *reporter_name) -{ - struct devlink_health_reporter *reporter; - - lockdep_assert_held(list_lock); - list_for_each_entry(reporter, reporter_list, list) - if (!strcmp(reporter->ops->name, reporter_name)) - return reporter; - return NULL; -} - -static struct devlink_health_reporter * -devlink_health_reporter_find_by_name(struct devlink *devlink, - const char *reporter_name) -{ - return __devlink_health_reporter_find_by_name(&devlink->reporter_list, - &devlink->reporters_lock, - reporter_name); -} - -static struct devlink_health_reporter * -devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, - const char *reporter_name) -{ - return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list, - &devlink_port->reporters_lock, - reporter_name); -} - -static struct devlink_health_reporter * -__devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) -{ - struct devlink_health_reporter *reporter; - - if (WARN_ON(graceful_period && !ops->recover)) - return ERR_PTR(-EINVAL); - - reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); - if (!reporter) - return ERR_PTR(-ENOMEM); - - reporter->priv = priv; - reporter->ops = ops; - reporter->devlink = devlink; - reporter->graceful_period = graceful_period; - reporter->auto_recover = !!ops->recover; - reporter->auto_dump = !!ops->dump; - mutex_init(&reporter->dump_lock); - refcount_set(&reporter->refcount, 1); - return reporter; -} - -/** - * devlink_port_health_reporter_create - create devlink health reporter for - * specified port instance - * - * @port: devlink_port which should contain the new reporter - * @ops: ops - * @graceful_period: to avoid recovery loops, in msecs - * @priv: priv - */ -struct devlink_health_reporter * -devlink_port_health_reporter_create(struct devlink_port *port, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) -{ - struct devlink_health_reporter *reporter; - - mutex_lock(&port->reporters_lock); - if (__devlink_health_reporter_find_by_name(&port->reporter_list, - &port->reporters_lock, ops->name)) { - reporter = ERR_PTR(-EEXIST); - goto unlock; - } - - reporter = __devlink_health_reporter_create(port->devlink, ops, - graceful_period, priv); - if (IS_ERR(reporter)) - goto unlock; - - reporter->devlink_port = port; - list_add_tail(&reporter->list, &port->reporter_list); -unlock: - mutex_unlock(&port->reporters_lock); - return reporter; -} 
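/* For reference, a minimal sketch of how a driver consumes the per-port
 * reporter API whose implementation is moving out of this file. The foo_*
 * names are hypothetical; the devlink calls and the ops fields (.name,
 * .recover, .diagnose) match the signatures visible in this hunk.
 */
#include <linux/err.h>
#include <net/devlink.h>

struct foo_port {
	struct devlink_port dl_port;	/* registered elsewhere */
	struct devlink_health_reporter *tx_reporter;
	u32 tx_timeouts;
};

static int foo_tx_recover(struct devlink_health_reporter *reporter,
			  void *priv_ctx, struct netlink_ext_ack *extack)
{
	struct foo_port *port = devlink_health_reporter_priv(reporter);

	/* Reset the TX path here; returning 0 lets the core mark the
	 * reporter healthy again and bump recovery_count.
	 */
	port->tx_timeouts = 0;
	return 0;
}

static int foo_tx_diagnose(struct devlink_health_reporter *reporter,
			   struct devlink_fmsg *fmsg,
			   struct netlink_ext_ack *extack)
{
	struct foo_port *port = devlink_health_reporter_priv(reporter);

	return devlink_fmsg_u32_pair_put(fmsg, "tx_timeouts",
					 port->tx_timeouts);
}

static const struct devlink_health_reporter_ops foo_tx_reporter_ops = {
	.name = "tx",
	.recover = foo_tx_recover,
	.diagnose = foo_tx_diagnose,
};

static int foo_port_health_init(struct foo_port *port)
{
	/* 100 msec graceful period to suppress recovery loops, enforced
	 * by devlink_health_report() further down.
	 */
	port->tx_reporter =
		devlink_port_health_reporter_create(&port->dl_port,
						    &foo_tx_reporter_ops,
						    100, port);
	return PTR_ERR_OR_ZERO(port->tx_reporter);
}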
-EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); - -/** - * devlink_health_reporter_create - create devlink health reporter - * - * @devlink: devlink - * @ops: ops - * @graceful_period: to avoid recovery loops, in msecs - * @priv: priv - */ -struct devlink_health_reporter * -devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) -{ - struct devlink_health_reporter *reporter; - - mutex_lock(&devlink->reporters_lock); - if (devlink_health_reporter_find_by_name(devlink, ops->name)) { - reporter = ERR_PTR(-EEXIST); - goto unlock; - } - - reporter = __devlink_health_reporter_create(devlink, ops, - graceful_period, priv); - if (IS_ERR(reporter)) - goto unlock; - - list_add_tail(&reporter->list, &devlink->reporter_list); -unlock: - mutex_unlock(&devlink->reporters_lock); - return reporter; -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_create); - -static void -devlink_health_reporter_free(struct devlink_health_reporter *reporter) -{ - mutex_destroy(&reporter->dump_lock); - if (reporter->dump_fmsg) - devlink_fmsg_free(reporter->dump_fmsg); - kfree(reporter); -} - -static void -devlink_health_reporter_put(struct devlink_health_reporter *reporter) -{ - if (refcount_dec_and_test(&reporter->refcount)) - devlink_health_reporter_free(reporter); -} - -static void -__devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ - list_del(&reporter->list); - devlink_health_reporter_put(reporter); -} - -/** - * devlink_health_reporter_destroy - destroy devlink health reporter - * - * @reporter: devlink health reporter to destroy - */ -void -devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ - struct mutex *lock = &reporter->devlink->reporters_lock; - - mutex_lock(lock); - __devlink_health_reporter_destroy(reporter); - mutex_unlock(lock); -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); - -/** - * devlink_port_health_reporter_destroy - destroy devlink port health reporter - * - * @reporter: devlink health reporter to destroy - */ -void -devlink_port_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ - struct mutex *lock = &reporter->devlink_port->reporters_lock; - - mutex_lock(lock); - __devlink_health_reporter_destroy(reporter); - mutex_unlock(lock); -} -EXPORT_SYMBOL_GPL(devlink_port_health_reporter_destroy); - -static int -devlink_nl_health_reporter_fill(struct sk_buff *msg, - struct devlink_health_reporter *reporter, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - struct devlink *devlink = reporter->devlink; - struct nlattr *reporter_attr; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto genlmsg_cancel; - - if (reporter->devlink_port) { - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, reporter->devlink_port->index)) - goto genlmsg_cancel; - } - reporter_attr = nla_nest_start_noflag(msg, - DEVLINK_ATTR_HEALTH_REPORTER); - if (!reporter_attr) - goto genlmsg_cancel; - if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, - reporter->ops->name)) - goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, - reporter->health_state)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, - reporter->error_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, - 
reporter->recovery_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->ops->recover && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, - reporter->graceful_period, - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->ops->recover && - nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, - reporter->auto_recover)) - goto reporter_nest_cancel; - if (reporter->dump_fmsg && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, - jiffies_to_msecs(reporter->dump_ts), - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->dump_fmsg && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, - reporter->dump_real_ts, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->ops->dump && - nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, - reporter->auto_dump)) - goto reporter_nest_cancel; - - nla_nest_end(msg, reporter_attr); - genlmsg_end(msg, hdr); - return 0; - -reporter_nest_cancel: - nla_nest_end(msg, reporter_attr); -genlmsg_cancel: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_recover_notify(struct devlink_health_reporter *reporter, - enum devlink_command cmd) -{ - struct devlink *devlink = reporter->devlink; - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); - WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, - 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -void -devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) -{ - reporter->recovery_count++; - reporter->last_recovery_ts = jiffies; -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); - -static int -devlink_health_reporter_recover(struct devlink_health_reporter *reporter, - void *priv_ctx, struct netlink_ext_ack *extack) -{ - int err; - - if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) - return 0; - - if (!reporter->ops->recover) - return -EOPNOTSUPP; - - err = reporter->ops->recover(reporter, priv_ctx, extack); - if (err) - return err; - - devlink_health_reporter_recovery_done(reporter); - reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; - devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); - - return 0; -} - -static void -devlink_health_dump_clear(struct devlink_health_reporter *reporter) -{ - if (!reporter->dump_fmsg) - return; - devlink_fmsg_free(reporter->dump_fmsg); - reporter->dump_fmsg = NULL; -} - -static int devlink_health_do_dump(struct devlink_health_reporter *reporter, - void *priv_ctx, - struct netlink_ext_ack *extack) -{ - int err; - - if (!reporter->ops->dump) - return 0; - - if (reporter->dump_fmsg) - return 0; - - reporter->dump_fmsg = devlink_fmsg_alloc(); - if (!reporter->dump_fmsg) { - err = -ENOMEM; - return err; - } - - err = devlink_fmsg_obj_nest_start(reporter->dump_fmsg); - if (err) - goto dump_err; - - err = reporter->ops->dump(reporter, reporter->dump_fmsg, - priv_ctx, extack); - if (err) - goto dump_err; - - err = devlink_fmsg_obj_nest_end(reporter->dump_fmsg); - if (err) - goto dump_err; - - reporter->dump_ts = jiffies; - reporter->dump_real_ts = ktime_get_real_ns(); - - return 0; - -dump_err: - devlink_health_dump_clear(reporter); - return err; -} - -int 
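/* driver entry point on error detection: log and count the error,
 * optionally auto-dump, then auto-recover unless a recovery already ran
 * within the graceful period
 */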
devlink_health_report(struct devlink_health_reporter *reporter, - const char *msg, void *priv_ctx) -{ - enum devlink_health_reporter_state prev_health_state; - struct devlink *devlink = reporter->devlink; - unsigned long recover_ts_threshold; - int ret; - - /* write a log message of the current error */ - WARN_ON(!msg); - trace_devlink_health_report(devlink, reporter->ops->name, msg); - reporter->error_count++; - prev_health_state = reporter->health_state; - reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; - devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); - - /* abort if the previous error wasn't recovered */ - recover_ts_threshold = reporter->last_recovery_ts + - msecs_to_jiffies(reporter->graceful_period); - if (reporter->auto_recover && - (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY || - (reporter->last_recovery_ts && reporter->recovery_count && - time_is_after_jiffies(recover_ts_threshold)))) { - trace_devlink_health_recover_aborted(devlink, - reporter->ops->name, - reporter->health_state, - jiffies - - reporter->last_recovery_ts); - return -ECANCELED; - } - - if (reporter->auto_dump) { - mutex_lock(&reporter->dump_lock); - /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx, NULL); - mutex_unlock(&reporter->dump_lock); - } - - if (!reporter->auto_recover) - return 0; - - devl_lock(devlink); - ret = devlink_health_reporter_recover(reporter, priv_ctx, NULL); - devl_unlock(devlink); - - return ret; -} -EXPORT_SYMBOL_GPL(devlink_health_report); - -static struct devlink_health_reporter * -devlink_health_reporter_get_from_attrs(struct devlink *devlink, - struct nlattr **attrs) -{ - struct devlink_health_reporter *reporter; - struct devlink_port *devlink_port; - char *reporter_name; - - if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) - return NULL; - - reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); - devlink_port = devlink_port_get_from_attrs(devlink, attrs); - if (IS_ERR(devlink_port)) { - mutex_lock(&devlink->reporters_lock); - reporter = devlink_health_reporter_find_by_name(devlink, reporter_name); - if (reporter) - refcount_inc(&reporter->refcount); - mutex_unlock(&devlink->reporters_lock); - } else { - mutex_lock(&devlink_port->reporters_lock); - reporter = devlink_port_health_reporter_find_by_name(devlink_port, reporter_name); - if (reporter) - refcount_inc(&reporter->refcount); - mutex_unlock(&devlink_port->reporters_lock); - } - - return reporter; -} - -static struct devlink_health_reporter * -devlink_health_reporter_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - return devlink_health_reporter_get_from_attrs(devlink, info->attrs); -} - -static struct devlink_health_reporter * -devlink_health_reporter_get_from_cb(struct netlink_callback *cb) -{ - const struct genl_dumpit_info *info = genl_dumpit_info(cb); - struct devlink_health_reporter *reporter; - struct nlattr **attrs = info->attrs; - struct devlink *devlink; - - devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); - if (IS_ERR(devlink)) - return NULL; - - reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); - devlink_put(devlink); - return reporter; -} - -void -devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, - enum devlink_health_reporter_state state) -{ - if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && - state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) - return; - - if (reporter->health_state == state) - return; - - 
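/* the state really changed: record it, trace it and multicast the update */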
reporter->health_state = state; - trace_devlink_health_reporter_state_update(reporter->devlink, - reporter->ops->name, state); - devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); - -static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - struct sk_buff *msg; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - err = -ENOMEM; - goto out; - } - - err = devlink_nl_health_reporter_fill(msg, reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - info->snd_portid, info->snd_seq, - 0); - if (err) { - nlmsg_free(msg); - goto out; - } - - err = genlmsg_reply(msg, info); -out: - devlink_health_reporter_put(reporter); - return err; -} - -static int -devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink_health_reporter *reporter; - unsigned long index, port_index; - struct devlink_port *port; - struct devlink *devlink; - int start = cb->args[0]; - int idx = 0; - int err; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - mutex_lock(&devlink->reporters_lock); - list_for_each_entry(reporter, &devlink->reporter_list, - list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_health_reporter_fill( - msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - mutex_unlock(&devlink->reporters_lock); - devlink_put(devlink); - goto out; - } - idx++; - } - mutex_unlock(&devlink->reporters_lock); - devlink_put(devlink); - } - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - xa_for_each(&devlink->ports, port_index, port) { - mutex_lock(&port->reporters_lock); - list_for_each_entry(reporter, &port->reporter_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_health_reporter_fill( - msg, reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) { - mutex_unlock(&port->reporters_lock); - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; - } - mutex_unlock(&port->reporters_lock); - } - devl_unlock(devlink); - devlink_put(devlink); - } -out: - cb->args[0] = idx; - return msg->len; -} - -static int -devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->recover && - (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) { - err = -EOPNOTSUPP; - goto out; - } - if (!reporter->ops->dump && - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) { - err = -EOPNOTSUPP; - goto out; - } - - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) - reporter->graceful_period = - nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); - - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) - reporter->auto_recover = - nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); - - if 
(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) - reporter->auto_dump = - nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); - - devlink_health_reporter_put(reporter); - return 0; -out: - devlink_health_reporter_put(reporter); - return err; -} - -static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - err = devlink_health_reporter_recover(reporter, NULL, info->extack); - - devlink_health_reporter_put(reporter); - return err; -} - -static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - struct devlink_fmsg *fmsg; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->diagnose) { - devlink_health_reporter_put(reporter); - return -EOPNOTSUPP; - } - - fmsg = devlink_fmsg_alloc(); - if (!fmsg) { - devlink_health_reporter_put(reporter); - return -ENOMEM; - } - - err = devlink_fmsg_obj_nest_start(fmsg); - if (err) - goto out; - - err = reporter->ops->diagnose(reporter, fmsg, info->extack); - if (err) - goto out; - - err = devlink_fmsg_obj_nest_end(fmsg); - if (err) - goto out; - - err = devlink_fmsg_snd(fmsg, info, - DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0); - -out: - devlink_fmsg_free(fmsg); - devlink_health_reporter_put(reporter); - return err; -} - -static int -devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) -{ - struct devlink_health_reporter *reporter; - u64 start = cb->args[0]; - int err; - - reporter = devlink_health_reporter_get_from_cb(cb); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->dump) { - err = -EOPNOTSUPP; - goto out; - } - mutex_lock(&reporter->dump_lock); - if (!start) { - err = devlink_health_do_dump(reporter, NULL, cb->extack); - if (err) - goto unlock; - cb->args[1] = reporter->dump_ts; - } - if (!reporter->dump_fmsg || cb->args[1] != reporter->dump_ts) { - NL_SET_ERR_MSG_MOD(cb->extack, "Dump trampled, please retry"); - err = -EAGAIN; - goto unlock; - } - - err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb, - DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET); -unlock: - mutex_unlock(&reporter->dump_lock); -out: - devlink_health_reporter_put(reporter); - return err; -} - -static int -devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->dump) { - devlink_health_reporter_put(reporter); - return -EOPNOTSUPP; - } - - mutex_lock(&reporter->dump_lock); - devlink_health_dump_clear(reporter); - mutex_unlock(&reporter->dump_lock); - devlink_health_reporter_put(reporter); - return 0; -} - -static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->test) { - devlink_health_reporter_put(reporter); - return 
-EOPNOTSUPP; - } - - err = reporter->ops->test(reporter, info->extack); - - devlink_health_reporter_put(reporter); - return err; -} - -struct devlink_stats { - u64_stats_t rx_bytes; - u64_stats_t rx_packets; - struct u64_stats_sync syncp; -}; - -/** - * struct devlink_trap_policer_item - Packet trap policer attributes. - * @policer: Immutable packet trap policer attributes. - * @rate: Rate in packets / sec. - * @burst: Burst size in packets. - * @list: trap_policer_list member. - * - * Describes packet trap policer attributes. Created by devlink during trap - * policer registration. - */ -struct devlink_trap_policer_item { - const struct devlink_trap_policer *policer; - u64 rate; - u64 burst; - struct list_head list; -}; - -/** - * struct devlink_trap_group_item - Packet trap group attributes. - * @group: Immutable packet trap group attributes. - * @policer_item: Associated policer item. Can be NULL. - * @list: trap_group_list member. - * @stats: Trap group statistics. - * - * Describes packet trap group attributes. Created by devlink during trap - * group registration. - */ -struct devlink_trap_group_item { - const struct devlink_trap_group *group; - struct devlink_trap_policer_item *policer_item; - struct list_head list; - struct devlink_stats __percpu *stats; -}; - -/** - * struct devlink_trap_item - Packet trap attributes. - * @trap: Immutable packet trap attributes. - * @group_item: Associated group item. - * @list: trap_list member. - * @action: Trap action. - * @stats: Trap statistics. - * @priv: Driver private information. - * - * Describes both mutable and immutable packet trap attributes. Created by - * devlink during trap registration and used for all trap related operations. - */ -struct devlink_trap_item { - const struct devlink_trap *trap; - struct devlink_trap_group_item *group_item; - struct list_head list; - enum devlink_trap_action action; - struct devlink_stats __percpu *stats; - void *priv; -}; - -static struct devlink_trap_policer_item * -devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id) -{ - struct devlink_trap_policer_item *policer_item; - - list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { - if (policer_item->policer->id == id) - return policer_item; - } - - return NULL; -} - -static struct devlink_trap_item * -devlink_trap_item_lookup(struct devlink *devlink, const char *name) -{ - struct devlink_trap_item *trap_item; - - list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (!strcmp(trap_item->trap->name, name)) - return trap_item; - } - - return NULL; -} - -static struct devlink_trap_item * -devlink_trap_item_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - struct nlattr *attr; - - if (!info->attrs[DEVLINK_ATTR_TRAP_NAME]) - return NULL; - attr = info->attrs[DEVLINK_ATTR_TRAP_NAME]; - - return devlink_trap_item_lookup(devlink, nla_data(attr)); -} - -static int -devlink_trap_action_get_from_info(struct genl_info *info, - enum devlink_trap_action *p_trap_action) -{ - u8 val; - - val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); - switch (val) { - case DEVLINK_TRAP_ACTION_DROP: - case DEVLINK_TRAP_ACTION_TRAP: - case DEVLINK_TRAP_ACTION_MIRROR: - *p_trap_action = val; - break; - default: - return -EINVAL; - } - - return 0; -} - -static int devlink_trap_metadata_put(struct sk_buff *msg, - const struct devlink_trap *trap) -{ - struct nlattr *attr; - - attr = nla_nest_start(msg, DEVLINK_ATTR_TRAP_METADATA); - if (!attr) - return -EMSGSIZE; - - if ((trap->metadata_cap & 
DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) && - nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT)) - goto nla_put_failure; - if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) && - nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE)) - goto nla_put_failure; - - nla_nest_end(msg, attr); - - return 0; - -nla_put_failure: - nla_nest_cancel(msg, attr); - return -EMSGSIZE; -} - -static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, - struct devlink_stats *stats) -{ - int i; - - memset(stats, 0, sizeof(*stats)); - for_each_possible_cpu(i) { - struct devlink_stats *cpu_stats; - u64 rx_packets, rx_bytes; - unsigned int start; - - cpu_stats = per_cpu_ptr(trap_stats, i); - do { - start = u64_stats_fetch_begin(&cpu_stats->syncp); - rx_packets = u64_stats_read(&cpu_stats->rx_packets); - rx_bytes = u64_stats_read(&cpu_stats->rx_bytes); - } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); - - u64_stats_add(&stats->rx_packets, rx_packets); - u64_stats_add(&stats->rx_bytes, rx_bytes); - } -} - -static int -devlink_trap_group_stats_put(struct sk_buff *msg, - struct devlink_stats __percpu *trap_stats) -{ - struct devlink_stats stats; - struct nlattr *attr; - - devlink_trap_stats_read(trap_stats, &stats); - - attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); - if (!attr) - return -EMSGSIZE; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, - u64_stats_read(&stats.rx_packets), - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, - u64_stats_read(&stats.rx_bytes), - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - nla_nest_end(msg, attr); - - return 0; - -nla_put_failure: - nla_nest_cancel(msg, attr); - return -EMSGSIZE; -} - -static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink, - const struct devlink_trap_item *trap_item) -{ - struct devlink_stats stats; - struct nlattr *attr; - u64 drops = 0; - int err; - - if (devlink->ops->trap_drop_counter_get) { - err = devlink->ops->trap_drop_counter_get(devlink, - trap_item->trap, - &drops); - if (err) - return err; - } - - devlink_trap_stats_read(trap_item->stats, &stats); - - attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); - if (!attr) - return -EMSGSIZE; - - if (devlink->ops->trap_drop_counter_get && - nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, - u64_stats_read(&stats.rx_packets), - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, - u64_stats_read(&stats.rx_bytes), - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - nla_nest_end(msg, attr); - - return 0; - -nla_put_failure: - nla_nest_cancel(msg, attr); - return -EMSGSIZE; -} - -static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, - const struct devlink_trap_item *trap_item, - enum devlink_command cmd, u32 portid, u32 seq, - int flags) -{ - struct devlink_trap_group_item *group_item = trap_item->group_item; - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, - group_item->group->name)) - goto nla_put_failure; - - if (nla_put_string(msg, DEVLINK_ATTR_TRAP_NAME, trap_item->trap->name)) - goto nla_put_failure; - - if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_TYPE, 
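/* drop, exception or control */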
trap_item->trap->type)) - goto nla_put_failure; - - if (trap_item->trap->generic && - nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) - goto nla_put_failure; - - if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_ACTION, trap_item->action)) - goto nla_put_failure; - - err = devlink_trap_metadata_put(msg, trap_item->trap); - if (err) - goto nla_put_failure; - - err = devlink_trap_stats_put(msg, devlink, trap_item); - if (err) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_trap_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_trap_item *trap_item; - struct sk_buff *msg; - int err; - - if (list_empty(&devlink->trap_list)) - return -EOPNOTSUPP; - - trap_item = devlink_trap_item_get_from_info(devlink, info); - if (!trap_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap"); - return -ENOENT; - } - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_trap_fill(msg, devlink, trap_item, - DEVLINK_CMD_TRAP_NEW, info->snd_portid, - info->snd_seq, 0); - if (err) - goto err_trap_fill; - - return genlmsg_reply(msg, info); - -err_trap_fill: - nlmsg_free(msg); - return err; -} - -static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink_trap_item *trap_item; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_trap_fill(msg, devlink, trap_item, - DEVLINK_CMD_TRAP_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; - } - devl_unlock(devlink); - devlink_put(devlink); - } -out: - cb->args[0] = idx; - return msg->len; -} - -static int __devlink_trap_action_set(struct devlink *devlink, - struct devlink_trap_item *trap_item, - enum devlink_trap_action trap_action, - struct netlink_ext_ack *extack) -{ - int err; - - if (trap_item->action != trap_action && - trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) { - NL_SET_ERR_MSG_MOD(extack, "Cannot change action of non-drop traps. 
Skipping"); - return 0; - } - - err = devlink->ops->trap_action_set(devlink, trap_item->trap, - trap_action, extack); - if (err) - return err; - - trap_item->action = trap_action; - - return 0; -} - -static int devlink_trap_action_set(struct devlink *devlink, - struct devlink_trap_item *trap_item, - struct genl_info *info) -{ - enum devlink_trap_action trap_action; - int err; - - if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) - return 0; - - err = devlink_trap_action_get_from_info(info, &trap_action); - if (err) { - NL_SET_ERR_MSG_MOD(info->extack, "Invalid trap action"); - return -EINVAL; - } - - return __devlink_trap_action_set(devlink, trap_item, trap_action, - info->extack); -} - -static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_trap_item *trap_item; - - if (list_empty(&devlink->trap_list)) - return -EOPNOTSUPP; - - trap_item = devlink_trap_item_get_from_info(devlink, info); - if (!trap_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap"); - return -ENOENT; - } - - return devlink_trap_action_set(devlink, trap_item, info); -} - -static struct devlink_trap_group_item * -devlink_trap_group_item_lookup(struct devlink *devlink, const char *name) -{ - struct devlink_trap_group_item *group_item; - - list_for_each_entry(group_item, &devlink->trap_group_list, list) { - if (!strcmp(group_item->group->name, name)) - return group_item; - } - - return NULL; -} - -static struct devlink_trap_group_item * -devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id) -{ - struct devlink_trap_group_item *group_item; - - list_for_each_entry(group_item, &devlink->trap_group_list, list) { - if (group_item->group->id == id) - return group_item; - } - - return NULL; -} - -static struct devlink_trap_group_item * -devlink_trap_group_item_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - char *name; - - if (!info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]) - return NULL; - name = nla_data(info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]); - - return devlink_trap_group_item_lookup(devlink, name); -} - -static int -devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, - const struct devlink_trap_group_item *group_item, - enum devlink_command cmd, u32 portid, u32 seq, - int flags) -{ - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, - group_item->group->name)) - goto nla_put_failure; - - if (group_item->group->generic && - nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) - goto nla_put_failure; - - if (group_item->policer_item && - nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, - group_item->policer_item->policer->id)) - goto nla_put_failure; - - err = devlink_trap_group_stats_put(msg, group_item->stats); - if (err) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_trap_group_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_trap_group_item *group_item; - struct sk_buff *msg; - int err; - - if (list_empty(&devlink->trap_group_list)) - return -EOPNOTSUPP; - - group_item = 
devlink_trap_group_item_get_from_info(devlink, info); - if (!group_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap group"); - return -ENOENT; - } - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_trap_group_fill(msg, devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) - goto err_trap_group_fill; - - return genlmsg_reply(msg, info); - -err_trap_group_fill: - nlmsg_free(msg); - return err; -} - -static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - enum devlink_command cmd = DEVLINK_CMD_TRAP_GROUP_NEW; - struct devlink_trap_group_item *group_item; - u32 portid = NETLINK_CB(cb->skb).portid; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(group_item, &devlink->trap_group_list, - list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_trap_group_fill(msg, devlink, - group_item, cmd, - portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; - } - devl_unlock(devlink); - devlink_put(devlink); - } -out: - cb->args[0] = idx; - return msg->len; -} - -static int -__devlink_trap_group_action_set(struct devlink *devlink, - struct devlink_trap_group_item *group_item, - enum devlink_trap_action trap_action, - struct netlink_ext_ack *extack) -{ - const char *group_name = group_item->group->name; - struct devlink_trap_item *trap_item; - int err; - - if (devlink->ops->trap_group_action_set) { - err = devlink->ops->trap_group_action_set(devlink, group_item->group, - trap_action, extack); - if (err) - return err; - - list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (strcmp(trap_item->group_item->group->name, group_name)) - continue; - if (trap_item->action != trap_action && - trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) - continue; - trap_item->action = trap_action; - } - - return 0; - } - - list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (strcmp(trap_item->group_item->group->name, group_name)) - continue; - err = __devlink_trap_action_set(devlink, trap_item, - trap_action, extack); - if (err) - return err; - } - - return 0; -} - -static int -devlink_trap_group_action_set(struct devlink *devlink, - struct devlink_trap_group_item *group_item, - struct genl_info *info, bool *p_modified) -{ - enum devlink_trap_action trap_action; - int err; - - if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) - return 0; - - err = devlink_trap_action_get_from_info(info, &trap_action); - if (err) { - NL_SET_ERR_MSG_MOD(info->extack, "Invalid trap action"); - return -EINVAL; - } - - err = __devlink_trap_group_action_set(devlink, group_item, trap_action, - info->extack); - if (err) - return err; - - *p_modified = true; - - return 0; -} - -static int devlink_trap_group_set(struct devlink *devlink, - struct devlink_trap_group_item *group_item, - struct genl_info *info) -{ - struct devlink_trap_policer_item *policer_item; - struct netlink_ext_ack *extack = info->extack; - const struct devlink_trap_policer *policer; - struct nlattr **attrs = info->attrs; - int err; - - if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) - return 0; - - if (!devlink->ops->trap_group_set) - return -EOPNOTSUPP; - - policer_item = group_item->policer_item; - if 
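/* a policer ID of 0 unbinds the group: the lookup yields NULL and a
 * NULL policer is passed to the driver below
 */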
(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) { - u32 policer_id; - - policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); - policer_item = devlink_trap_policer_item_lookup(devlink, - policer_id); - if (policer_id && !policer_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); - return -ENOENT; - } - } - policer = policer_item ? policer_item->policer : NULL; - - err = devlink->ops->trap_group_set(devlink, group_item->group, policer, - extack); - if (err) - return err; - - group_item->policer_item = policer_item; - - return 0; -} - -static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_trap_group_item *group_item; - bool modified = false; - int err; - - if (list_empty(&devlink->trap_group_list)) - return -EOPNOTSUPP; - - group_item = devlink_trap_group_item_get_from_info(devlink, info); - if (!group_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap group"); - return -ENOENT; - } - - err = devlink_trap_group_action_set(devlink, group_item, info, - &modified); - if (err) - return err; - - err = devlink_trap_group_set(devlink, group_item, info); - if (err) - goto err_trap_group_set; - - return 0; - -err_trap_group_set: - if (modified) - NL_SET_ERR_MSG_MOD(extack, "Trap group set failed, but some changes were committed already"); - return err; -} - -static struct devlink_trap_policer_item * -devlink_trap_policer_item_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - u32 id; - - if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) - return NULL; - id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); - - return devlink_trap_policer_item_lookup(devlink, id); -} - -static int -devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink, - const struct devlink_trap_policer *policer) -{ - struct nlattr *attr; - u64 drops; - int err; - - if (!devlink->ops->trap_policer_counter_get) - return 0; - - err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops); - if (err) - return err; - - attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); - if (!attr) - return -EMSGSIZE; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - nla_nest_end(msg, attr); - - return 0; - -nla_put_failure: - nla_nest_cancel(msg, attr); - return -EMSGSIZE; -} - -static int -devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink, - const struct devlink_trap_policer_item *policer_item, - enum devlink_command cmd, u32 portid, u32 seq, - int flags) -{ - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, - policer_item->policer->id)) - goto nla_put_failure; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_RATE, - policer_item->rate, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_BURST, - policer_item->burst, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - err = devlink_trap_policer_stats_put(msg, devlink, - policer_item->policer); - if (err) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_trap_policer_get_doit(struct sk_buff *skb, 
- struct genl_info *info) -{ - struct devlink_trap_policer_item *policer_item; - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - if (list_empty(&devlink->trap_policer_list)) - return -EOPNOTSUPP; - - policer_item = devlink_trap_policer_item_get_from_info(devlink, info); - if (!policer_item) { - NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); - return -ENOENT; - } - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) - goto err_trap_policer_fill; - - return genlmsg_reply(msg, info); - -err_trap_policer_fill: - nlmsg_free(msg); - return err; -} - -static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - enum devlink_command cmd = DEVLINK_CMD_TRAP_POLICER_NEW; - struct devlink_trap_policer_item *policer_item; - u32 portid = NETLINK_CB(cb->skb).portid; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(policer_item, &devlink->trap_policer_list, - list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_trap_policer_fill(msg, devlink, - policer_item, cmd, - portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; - } - devl_unlock(devlink); - devlink_put(devlink); - } -out: - cb->args[0] = idx; - return msg->len; -} - -static int -devlink_trap_policer_set(struct devlink *devlink, - struct devlink_trap_policer_item *policer_item, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct nlattr **attrs = info->attrs; - u64 rate, burst; - int err; - - rate = policer_item->rate; - burst = policer_item->burst; - - if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]) - rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]); - - if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]) - burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]); - - if (rate < policer_item->policer->min_rate) { - NL_SET_ERR_MSG_MOD(extack, "Policer rate lower than limit"); - return -EINVAL; - } - - if (rate > policer_item->policer->max_rate) { - NL_SET_ERR_MSG_MOD(extack, "Policer rate higher than limit"); - return -EINVAL; - } - - if (burst < policer_item->policer->min_burst) { - NL_SET_ERR_MSG_MOD(extack, "Policer burst size lower than limit"); - return -EINVAL; - } - - if (burst > policer_item->policer->max_burst) { - NL_SET_ERR_MSG_MOD(extack, "Policer burst size higher than limit"); - return -EINVAL; - } - - err = devlink->ops->trap_policer_set(devlink, policer_item->policer, - rate, burst, info->extack); - if (err) - return err; - - policer_item->rate = rate; - policer_item->burst = burst; - - return 0; -} - -static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_trap_policer_item *policer_item; - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - - if (list_empty(&devlink->trap_policer_list)) - return -EOPNOTSUPP; - - if (!devlink->ops->trap_policer_set) - return -EOPNOTSUPP; - - policer_item = devlink_trap_policer_item_get_from_info(devlink, info); - if (!policer_item) { - NL_SET_ERR_MSG_MOD(extack, "Device 
did not register this trap policer"); - return -ENOENT; - } - - return devlink_trap_policer_set(devlink, policer_item, info); -} - -static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { - [DEVLINK_ATTR_UNSPEC] = { .strict_start_type = - DEVLINK_ATTR_TRAP_POLICER_ID }, - [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_PORT_TYPE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_PORT_TYPE_AUTO, - DEVLINK_PORT_TYPE_IB), - [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 }, - [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, - [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, - [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_ESWITCH_MODE_LEGACY, - DEVLINK_ESWITCH_MODE_SWITCHDEV), - [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, - [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, - [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, - [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, - [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 }, - [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 }, - [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, - [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, - [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] = - NLA_POLICY_BITFIELD32(DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS), - [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, - [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, - [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, - [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, - [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, - [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, - [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - DEVLINK_RELOAD_ACTION_MAX), - [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK), - [DEVLINK_ATTR_PORT_FLAVOUR] = { .type = NLA_U16 }, - [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, - [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, - [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, - [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 }, - [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 }, - [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, - 
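/* rate-node attributes; like the rest of this policy, validated by
 * genetlink before the doit/dumpit handlers run
 */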
[DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, - [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, - [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, - [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, - [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG }, -}; - -static const struct genl_small_ops devlink_nl_ops[] = { - { - .cmd = DEVLINK_CMD_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_get_doit, - .dumpit = devlink_nl_cmd_get_dumpit, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_PORT_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_get_doit, - .dumpit = devlink_nl_cmd_port_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_PORT_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_RATE_GET, - .doit = devlink_nl_cmd_rate_get_doit, - .dumpit = devlink_nl_cmd_rate_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_RATE_SET, - .doit = devlink_nl_cmd_rate_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, - }, - { - .cmd = DEVLINK_CMD_RATE_NEW, - .doit = devlink_nl_cmd_rate_new_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_RATE_DEL, - .doit = devlink_nl_cmd_rate_del_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_RATE_NODE, - }, - { - .cmd = DEVLINK_CMD_PORT_SPLIT, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_split_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_PORT_UNSPLIT, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_unsplit_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_PORT_NEW, - .doit = devlink_nl_cmd_port_new_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_PORT_DEL, - .doit = devlink_nl_cmd_port_del_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_LINECARD_GET, - .doit = devlink_nl_cmd_linecard_get_doit, - .dumpit = devlink_nl_cmd_linecard_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_LINECARD_SET, - .doit = devlink_nl_cmd_linecard_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, - }, - { - .cmd = DEVLINK_CMD_SB_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_get_doit, - .dumpit = devlink_nl_cmd_sb_get_dumpit, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_SB_POOL_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_pool_get_doit, - .dumpit = devlink_nl_cmd_sb_pool_get_dumpit, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_SB_POOL_SET, - .validate = GENL_DONT_VALIDATE_STRICT | 
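/* pre-strict-validation commands keep relaxed parsing for backwards
 * compatibility
 */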
GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_pool_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_port_pool_get_doit, - .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_SB_PORT_POOL_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_port_pool_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit, - .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_occ_snapshot_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_occ_max_clear_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_ESWITCH_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_eswitch_get_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_ESWITCH_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_eswitch_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_dpipe_table_get, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_dpipe_entries_get, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_dpipe_headers_get, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_dpipe_table_counters_set, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_RESOURCE_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_resource_set, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_RESOURCE_DUMP, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_resource_dump, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_RELOAD, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_reload, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_PARAM_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_param_get_doit, - .dumpit = devlink_nl_cmd_param_get_dumpit, - /* can be retrieved by unprivileged users */ - }, 
- { - .cmd = DEVLINK_CMD_PARAM_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_param_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_PORT_PARAM_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_param_get_doit, - .dumpit = devlink_nl_cmd_port_param_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_PORT_PARAM_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_param_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_REGION_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_region_get_doit, - .dumpit = devlink_nl_cmd_region_get_dumpit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_REGION_NEW, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_region_new, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_REGION_DEL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_region_del, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_REGION_READ, - .validate = GENL_DONT_VALIDATE_STRICT | - GENL_DONT_VALIDATE_DUMP_STRICT, - .dumpit = devlink_nl_cmd_region_read_dumpit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_INFO_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_info_get_doit, - .dumpit = devlink_nl_cmd_info_get_dumpit, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_get_doit, - .dumpit = devlink_nl_cmd_health_reporter_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_recover_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_diagnose_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - .validate = GENL_DONT_VALIDATE_STRICT | - GENL_DONT_VALIDATE_DUMP_STRICT, - .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_dump_clear_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_test_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = 
DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_FLASH_UPDATE, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_flash_update, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_TRAP_GET, - .doit = devlink_nl_cmd_trap_get_doit, - .dumpit = devlink_nl_cmd_trap_get_dumpit, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_TRAP_SET, - .doit = devlink_nl_cmd_trap_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_TRAP_GROUP_GET, - .doit = devlink_nl_cmd_trap_group_get_doit, - .dumpit = devlink_nl_cmd_trap_group_get_dumpit, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_TRAP_GROUP_SET, - .doit = devlink_nl_cmd_trap_group_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_TRAP_POLICER_GET, - .doit = devlink_nl_cmd_trap_policer_get_doit, - .dumpit = devlink_nl_cmd_trap_policer_get_dumpit, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_TRAP_POLICER_SET, - .doit = devlink_nl_cmd_trap_policer_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_SELFTESTS_GET, - .doit = devlink_nl_cmd_selftests_get_doit, - .dumpit = devlink_nl_cmd_selftests_get_dumpit - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_SELFTESTS_RUN, - .doit = devlink_nl_cmd_selftests_run, - .flags = GENL_ADMIN_PERM, - }, -}; - -static struct genl_family devlink_nl_family __ro_after_init = { - .name = DEVLINK_GENL_NAME, - .version = DEVLINK_GENL_VERSION, - .maxattr = DEVLINK_ATTR_MAX, - .policy = devlink_nl_policy, - .netnsok = true, - .parallel_ops = true, - .pre_doit = devlink_nl_pre_doit, - .post_doit = devlink_nl_post_doit, - .module = THIS_MODULE, - .small_ops = devlink_nl_ops, - .n_small_ops = ARRAY_SIZE(devlink_nl_ops), - .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, - .mcgrps = devlink_nl_mcgrps, - .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), -}; - -static bool devlink_reload_actions_valid(const struct devlink_ops *ops) -{ - const struct devlink_reload_combination *comb; - int i; - - if (!devlink_reload_supported(ops)) { - if (WARN_ON(ops->reload_actions)) - return false; - return true; - } - - if (WARN_ON(!ops->reload_actions || - ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || - ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX))) - return false; - - if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) || - ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX))) - return false; - - for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) { - comb = &devlink_reload_invalid_combinations[i]; - if (ops->reload_actions == BIT(comb->action) && - ops->reload_limits == BIT(comb->limit)) - return false; - } - return true; -} - -/** - * devlink_set_features - Set devlink supported features - * - * @devlink: devlink - * @features: devlink support features - * - * This interface allows us to set reload ops separately from - * the devlink_alloc.
- */ -void devlink_set_features(struct devlink *devlink, u64 features) -{ - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - WARN_ON(features & DEVLINK_F_RELOAD && - !devlink_reload_supported(devlink->ops)); - devlink->features = features; -} -EXPORT_SYMBOL_GPL(devlink_set_features); - -static int devlink_netdevice_event(struct notifier_block *nb, - unsigned long event, void *ptr); - -/** - * devlink_alloc_ns - Allocate new devlink instance resources - * in specific namespace - * - * @ops: ops - * @priv_size: size of user private data - * @net: net namespace - * @dev: parent device - * - * Allocate new devlink instance resources, including devlink index - * and name. - */ -struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, - size_t priv_size, struct net *net, - struct device *dev) -{ - struct devlink *devlink; - static u32 last_id; - int ret; - - WARN_ON(!ops || !dev); - if (!devlink_reload_actions_valid(ops)) - return NULL; - - devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); - if (!devlink) - return NULL; - - ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, - &last_id, GFP_KERNEL); - if (ret < 0) - goto err_xa_alloc; - - devlink->netdevice_nb.notifier_call = devlink_netdevice_event; - ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb); - if (ret) - goto err_register_netdevice_notifier; - - devlink->dev = dev; - devlink->ops = ops; - xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); - xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); - write_pnet(&devlink->_net, net); - INIT_LIST_HEAD(&devlink->rate_list); - INIT_LIST_HEAD(&devlink->linecard_list); - INIT_LIST_HEAD(&devlink->sb_list); - INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); - INIT_LIST_HEAD(&devlink->resource_list); - INIT_LIST_HEAD(&devlink->param_list); - INIT_LIST_HEAD(&devlink->region_list); - INIT_LIST_HEAD(&devlink->reporter_list); - INIT_LIST_HEAD(&devlink->trap_list); - INIT_LIST_HEAD(&devlink->trap_group_list); - INIT_LIST_HEAD(&devlink->trap_policer_list); - lockdep_register_key(&devlink->lock_key); - mutex_init(&devlink->lock); - lockdep_set_class(&devlink->lock, &devlink->lock_key); - mutex_init(&devlink->reporters_lock); - mutex_init(&devlink->linecards_lock); - refcount_set(&devlink->refcount, 1); - init_completion(&devlink->comp); - - return devlink; - -err_register_netdevice_notifier: - xa_erase(&devlinks, devlink->index); -err_xa_alloc: - kfree(devlink); - return NULL; -} -EXPORT_SYMBOL_GPL(devlink_alloc_ns); - -static void -devlink_trap_policer_notify(struct devlink *devlink, - const struct devlink_trap_policer_item *policer_item, - enum devlink_command cmd); -static void -devlink_trap_group_notify(struct devlink *devlink, - const struct devlink_trap_group_item *group_item, - enum devlink_command cmd); -static void devlink_trap_notify(struct devlink *devlink, - const struct devlink_trap_item *trap_item, - enum devlink_command cmd); - -static void devlink_notify_register(struct devlink *devlink) -{ - struct devlink_trap_policer_item *policer_item; - struct devlink_trap_group_item *group_item; - struct devlink_param_item *param_item; - struct devlink_trap_item *trap_item; - struct devlink_port *devlink_port; - struct devlink_linecard *linecard; - struct devlink_rate *rate_node; - struct devlink_region *region; - unsigned long port_index; - - devlink_notify(devlink, DEVLINK_CMD_NEW); - list_for_each_entry(linecard, &devlink->linecard_list, list) - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - - xa_for_each(&devlink->ports, port_index, 
devlink_port) - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); - - list_for_each_entry(policer_item, &devlink->trap_policer_list, list) - devlink_trap_policer_notify(devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_NEW); - - list_for_each_entry(group_item, &devlink->trap_group_list, list) - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_NEW); - - list_for_each_entry(trap_item, &devlink->trap_list, list) - devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); - - list_for_each_entry(rate_node, &devlink->rate_list, list) - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); - - list_for_each_entry(region, &devlink->region_list, list) - devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); - - list_for_each_entry(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_NEW); -} - -static void devlink_notify_unregister(struct devlink *devlink) -{ - struct devlink_trap_policer_item *policer_item; - struct devlink_trap_group_item *group_item; - struct devlink_param_item *param_item; - struct devlink_trap_item *trap_item; - struct devlink_port *devlink_port; - struct devlink_rate *rate_node; - struct devlink_region *region; - unsigned long port_index; - - list_for_each_entry_reverse(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_DEL); - - list_for_each_entry_reverse(region, &devlink->region_list, list) - devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); - - list_for_each_entry_reverse(rate_node, &devlink->rate_list, list) - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); - - list_for_each_entry_reverse(trap_item, &devlink->trap_list, list) - devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); - - list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list) - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_DEL); - list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list, - list) - devlink_trap_policer_notify(devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_DEL); - - xa_for_each(&devlink->ports, port_index, devlink_port) - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); - devlink_notify(devlink, DEVLINK_CMD_DEL); -} - -/** - * devlink_register - Register devlink instance - * - * @devlink: devlink - */ -void devlink_register(struct devlink *devlink) -{ - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - /* Make sure that we are in .probe() routine */ - - xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); - devlink_notify_register(devlink); -} -EXPORT_SYMBOL_GPL(devlink_register); - -/** - * devlink_unregister - Unregister devlink instance - * - * @devlink: devlink - */ -void devlink_unregister(struct devlink *devlink) -{ - ASSERT_DEVLINK_REGISTERED(devlink); - /* Make sure that we are in .remove() routine */ - - xa_set_mark(&devlinks, devlink->index, DEVLINK_UNREGISTERING); - devlink_put(devlink); - wait_for_completion(&devlink->comp); - - devlink_notify_unregister(devlink); - xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); - xa_clear_mark(&devlinks, devlink->index, DEVLINK_UNREGISTERING); -} -EXPORT_SYMBOL_GPL(devlink_unregister); - -/** - * devlink_free - Free devlink instance resources - * - * @devlink: devlink - */ -void devlink_free(struct devlink *devlink) -{ - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - mutex_destroy(&devlink->linecards_lock); - mutex_destroy(&devlink->reporters_lock); - mutex_destroy(&devlink->lock); - 
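Taken together, devlink_alloc_ns()/devlink_alloc(), devlink_register(), devlink_unregister() and devlink_free() form the instance lifecycle that the notify helpers above serve. A minimal sketch of a driver's probe/remove glue, assuming hypothetical my_* names and the devlink_alloc() wrapper that defaults to &init_net:

	#include <net/devlink.h>

	/* Sketch only; my_priv and the empty ops table are hypothetical. */
	static const struct devlink_ops my_devlink_ops = {
		/* no reload callbacks, so DEVLINK_F_RELOAD must not be set */
	};

	struct my_priv {
		unsigned int port_count;
	};

	static int my_probe(struct device *dev)
	{
		struct devlink *devlink;

		devlink = devlink_alloc(&my_devlink_ops, sizeof(struct my_priv), dev);
		if (!devlink)
			return -ENOMEM;

		/* ... set up ports, params, resources on the instance ... */

		devlink_register(devlink);	/* last step, still in .probe() */
		return 0;
	}

	static void my_remove(struct devlink *devlink)
	{
		devlink_unregister(devlink);	/* first step in .remove() */
		/* ... tear down ports, params, resources ... */
		devlink_free(devlink);
	}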
lockdep_unregister_key(&devlink->lock_key); - WARN_ON(!list_empty(&devlink->trap_policer_list)); - WARN_ON(!list_empty(&devlink->trap_group_list)); - WARN_ON(!list_empty(&devlink->trap_list)); - WARN_ON(!list_empty(&devlink->reporter_list)); - WARN_ON(!list_empty(&devlink->region_list)); - WARN_ON(!list_empty(&devlink->param_list)); - WARN_ON(!list_empty(&devlink->resource_list)); - WARN_ON(!list_empty(&devlink->dpipe_table_list)); - WARN_ON(!list_empty(&devlink->sb_list)); - WARN_ON(!list_empty(&devlink->rate_list)); - WARN_ON(!list_empty(&devlink->linecard_list)); - WARN_ON(!xa_empty(&devlink->ports)); - - xa_destroy(&devlink->snapshot_ids); - xa_destroy(&devlink->ports); - - WARN_ON_ONCE(unregister_netdevice_notifier_net(devlink_net(devlink), - &devlink->netdevice_nb)); - - xa_erase(&devlinks, devlink->index); - - kfree(devlink); -} -EXPORT_SYMBOL_GPL(devlink_free); - -static void devlink_port_type_warn(struct work_struct *work) -{ - WARN(true, "Type was not set for devlink port."); -} - -static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) -{ - /* Ignore CPU and DSA flavours. */ - return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA && - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED; -} - -#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) - -static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) -{ - if (!devlink_port_type_should_warn(devlink_port)) - return; - /* Schedule a work to WARN in case driver does not set port - * type within timeout. - */ - schedule_delayed_work(&devlink_port->type_warn_dw, - DEVLINK_PORT_TYPE_WARN_TIMEOUT); -} - -static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) -{ - if (!devlink_port_type_should_warn(devlink_port)) - return; - cancel_delayed_work_sync(&devlink_port->type_warn_dw); -} - -/** - * devlink_port_init() - Init devlink port - * - * @devlink: devlink - * @devlink_port: devlink port - * - * Initialize essential stuff that is needed for functions - * that may be called before devlink port registration. - * Call to this function is optional and not needed - * in case the driver does not use such functions. - */ -void devlink_port_init(struct devlink *devlink, - struct devlink_port *devlink_port) -{ - if (devlink_port->initialized) - return; - devlink_port->devlink = devlink; - INIT_LIST_HEAD(&devlink_port->region_list); - devlink_port->initialized = true; -} -EXPORT_SYMBOL_GPL(devlink_port_init); - -/** - * devlink_port_fini() - Deinitialize devlink port - * - * @devlink_port: devlink port - * - * Deinitialize essential stuff that is in use for functions - * that may be called after devlink port unregistration. - * Call to this function is optional and not needed - * in case the driver does not use such functions. - */ -void devlink_port_fini(struct devlink_port *devlink_port) -{ - WARN_ON(!list_empty(&devlink_port->region_list)); -} -EXPORT_SYMBOL_GPL(devlink_port_fini); - -/** - * devl_port_register() - Register devlink port - * - * @devlink: devlink - * @devlink_port: devlink port - * @port_index: driver-specific numerical identifier of the port - * - * Register devlink port with provided port index. User can use - * any indexing, even hw-related one. devlink_port structure - * is convenient to be embedded inside user driver private structure. - * Note that the caller should take care of zeroing the devlink_port - * structure.
- */ -int devl_port_register(struct devlink *devlink, - struct devlink_port *devlink_port, - unsigned int port_index) -{ - int err; - - devl_assert_locked(devlink); - - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - - devlink_port_init(devlink, devlink_port); - devlink_port->registered = true; - devlink_port->index = port_index; - spin_lock_init(&devlink_port->type_lock); - INIT_LIST_HEAD(&devlink_port->reporter_list); - mutex_init(&devlink_port->reporters_lock); - err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); - if (err) { - mutex_destroy(&devlink_port->reporters_lock); - return err; - } - - INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); - devlink_port_type_warn_schedule(devlink_port); - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); - return 0; -} -EXPORT_SYMBOL_GPL(devl_port_register); - -/** - * devlink_port_register - Register devlink port - * - * @devlink: devlink - * @devlink_port: devlink port - * @port_index: driver-specific numerical identifier of the port - * - * Register devlink port with provided port index. User can use - * any indexing, even hw-related one. devlink_port structure - * is convenient to be embedded inside user driver private structure. - * Note that the caller should take care of zeroing the devlink_port - * structure. - * - * Context: Takes and releases devlink->lock <mutex>. - */ -int devlink_port_register(struct devlink *devlink, - struct devlink_port *devlink_port, - unsigned int port_index) -{ - int err; - - devl_lock(devlink); - err = devl_port_register(devlink, devlink_port, port_index); - devl_unlock(devlink); - return err; -} -EXPORT_SYMBOL_GPL(devlink_port_register); - -/** - * devl_port_unregister() - Unregister devlink port - * - * @devlink_port: devlink port - */ -void devl_port_unregister(struct devlink_port *devlink_port) -{ - lockdep_assert_held(&devlink_port->devlink->lock); - WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET); - - devlink_port_type_warn_cancel(devlink_port); - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); - xa_erase(&devlink_port->devlink->ports, devlink_port->index); - WARN_ON(!list_empty(&devlink_port->reporter_list)); - mutex_destroy(&devlink_port->reporters_lock); - devlink_port->registered = false; -} -EXPORT_SYMBOL_GPL(devl_port_unregister); - -/** - * devlink_port_unregister - Unregister devlink port - * - * @devlink_port: devlink port - * - * Context: Takes and releases devlink->lock <mutex>. - */ -void devlink_port_unregister(struct devlink_port *devlink_port) -{ - struct devlink *devlink = devlink_port->devlink; - - devl_lock(devlink); - devl_port_unregister(devlink_port); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_port_unregister); - -static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, - struct net_device *netdev) -{ - const struct net_device_ops *ops = netdev->netdev_ops; - - /* If driver registers devlink port, it should set devlink port - * attributes accordingly so the compat functions are called - * and the original ops are not used. - */ - if (ops->ndo_get_phys_port_name) { - /* Some drivers use the same set of ndos for netdevs - * that have devlink_port registered and also for - * those that don't. Make sure that ndo_get_phys_port_name - * returns -EOPNOTSUPP here in case it is defined. - * Warn if not.
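Note the locking split above: devl_port_register() asserts that the caller holds the instance lock, while devlink_port_register() is the wrapper that takes it. A hedged sketch of the locked variant, with the port embedded in a hypothetical zeroed driver structure:

	struct my_port {
		struct devlink_port dl_port;	/* kzalloc()ed, so already zeroed */
	};

	static int my_port_add(struct devlink *devlink, struct my_port *p,
			       unsigned int index)
	{
		int err;

		devl_lock(devlink);
		err = devl_port_register(devlink, &p->dl_port, index);
		devl_unlock(devlink);
		return err;
	}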
- */ - char name[IFNAMSIZ]; - int err; - - err = ops->ndo_get_phys_port_name(netdev, name, sizeof(name)); - WARN_ON(err != -EOPNOTSUPP); - } - if (ops->ndo_get_port_parent_id) { - /* Some drivers use the same set of ndos for netdevs - * that have devlink_port registered and also for - * those that don't. Make sure that ndo_get_port_parent_id - * returns -EOPNOTSUPP here in case it is defined. - * Warn if not. - */ - struct netdev_phys_item_id ppid; - int err; - - err = ops->ndo_get_port_parent_id(netdev, &ppid); - WARN_ON(err != -EOPNOTSUPP); - } -} - -static void __devlink_port_type_set(struct devlink_port *devlink_port, - enum devlink_port_type type, - void *type_dev) -{ - struct net_device *netdev = type_dev; - - ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); - - if (type == DEVLINK_PORT_TYPE_NOTSET) { - devlink_port_type_warn_schedule(devlink_port); - } else { - devlink_port_type_warn_cancel(devlink_port); - if (type == DEVLINK_PORT_TYPE_ETH && netdev) - devlink_port_type_netdev_checks(devlink_port, netdev); - } - - spin_lock_bh(&devlink_port->type_lock); - devlink_port->type = type; - switch (type) { - case DEVLINK_PORT_TYPE_ETH: - devlink_port->type_eth.netdev = netdev; - if (netdev) { - ASSERT_RTNL(); - devlink_port->type_eth.ifindex = netdev->ifindex; - BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) != - sizeof(netdev->name)); - strcpy(devlink_port->type_eth.ifname, netdev->name); - } - break; - case DEVLINK_PORT_TYPE_IB: - devlink_port->type_ib.ibdev = type_dev; - break; - default: - break; - } - spin_unlock_bh(&devlink_port->type_lock); - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); -} - -/** - * devlink_port_type_eth_set - Set port type to Ethernet - * - * @devlink_port: devlink port - * - * If driver is calling this, most likely it is doing something wrong. - */ -void devlink_port_type_eth_set(struct devlink_port *devlink_port) -{ - dev_warn(devlink_port->devlink->dev, - "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", - devlink_port->index); - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); -} -EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); - -/** - * devlink_port_type_ib_set - Set port type to InfiniBand - * - * @devlink_port: devlink port - * @ibdev: related IB device - */ -void devlink_port_type_ib_set(struct devlink_port *devlink_port, - struct ib_device *ibdev) -{ - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev); -} -EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); - -/** - * devlink_port_type_clear - Clear port type - * - * @devlink_port: devlink port - * - * If driver is calling this for clearing Ethernet type, most likely - * it is doing something wrong.
- */ -void devlink_port_type_clear(struct devlink_port *devlink_port) -{ - if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) - dev_warn(devlink_port->devlink->dev, - "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", - devlink_port->index); - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); -} -EXPORT_SYMBOL_GPL(devlink_port_type_clear); - -static int devlink_netdevice_event(struct notifier_block *nb, - unsigned long event, void *ptr) -{ - struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct devlink_port *devlink_port = netdev->devlink_port; - struct devlink *devlink; - - devlink = container_of(nb, struct devlink, netdevice_nb); - - if (!devlink_port || devlink_port->devlink != devlink) - return NOTIFY_OK; - - switch (event) { - case NETDEV_POST_INIT: - /* Set the type but not netdev pointer. It is going to be set - * later on by NETDEV_REGISTER event. Happens once during - * netdevice register - */ - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, - NULL); - break; - case NETDEV_REGISTER: - case NETDEV_CHANGENAME: - /* Set the netdev on top of previously set type. Note this - * event happens also during net namespace change so here - * we take into account netdev pointer appearing in this - * namespace. - */ - __devlink_port_type_set(devlink_port, devlink_port->type, - netdev); - break; - case NETDEV_UNREGISTER: - /* Clear netdev pointer, but not the type. This event happens - * also during net namespace change so we need to clear - * pointer to netdev that is going to another net namespace. - */ - __devlink_port_type_set(devlink_port, devlink_port->type, - NULL); - break; - case NETDEV_PRE_UNINIT: - /* Clear the type and the netdev pointer. Happens once during - * netdevice unregister.
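The notifier above keys entirely off netdev->devlink_port, so for Ethernet ports a driver never calls the type setters directly; it points the netdev at its registered port before register_netdev() and lets the NETDEV_POST_INIT/NETDEV_REGISTER events do the rest. A sketch, assuming the SET_NETDEV_DEVLINK_PORT() helper and a hypothetical my_netdev_priv:

	static int my_netdev_create(struct devlink_port *dl_port)
	{
		struct net_device *netdev;

		netdev = alloc_etherdev(sizeof(struct my_netdev_priv));
		if (!netdev)
			return -ENOMEM;

		/* Must happen before register_netdev(); the devlink netdevice
		 * notifier then sets the ETH type and tracks ifindex/ifname.
		 */
		SET_NETDEV_DEVLINK_PORT(netdev, dl_port);

		return register_netdev(netdev);
	}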
- */ - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, - NULL); - break; - } - - return NOTIFY_OK; -} - -static int __devlink_port_attrs_set(struct devlink_port *devlink_port, - enum devlink_port_flavour flavour) -{ - struct devlink_port_attrs *attrs = &devlink_port->attrs; - - devlink_port->attrs_set = true; - attrs->flavour = flavour; - if (attrs->switch_id.id_len) { - devlink_port->switch_port = true; - if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN)) - attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN; - } else { - devlink_port->switch_port = false; - } - return 0; -} - -/** - * devlink_port_attrs_set - Set port attributes - * - * @devlink_port: devlink port - * @attrs: devlink port attrs - */ -void devlink_port_attrs_set(struct devlink_port *devlink_port, - struct devlink_port_attrs *attrs) -{ - int ret; - - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - - devlink_port->attrs = *attrs; - ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); - if (ret) - return; - WARN_ON(attrs->splittable && attrs->split); -} -EXPORT_SYMBOL_GPL(devlink_port_attrs_set); - -/** - * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes - * - * @devlink_port: devlink port - * @controller: associated controller number for the devlink port instance - * @pf: associated PF for the devlink port instance - * @external: indicates if the port is for an external controller - */ -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, - u16 pf, bool external) -{ - struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; - - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_PF); - if (ret) - return; - attrs->pci_pf.controller = controller; - attrs->pci_pf.pf = pf; - attrs->pci_pf.external = external; -} -EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); - -/** - * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes - * - * @devlink_port: devlink port - * @controller: associated controller number for the devlink port instance - * @pf: associated PF for the devlink port instance - * @vf: associated VF of a PF for the devlink port instance - * @external: indicates if the port is for an external controller - */ -void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, - u16 pf, u16 vf, bool external) -{ - struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; - - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_VF); - if (ret) - return; - attrs->pci_vf.controller = controller; - attrs->pci_vf.pf = pf; - attrs->pci_vf.vf = vf; - attrs->pci_vf.external = external; -} -EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); - -/** - * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes - * - * @devlink_port: devlink port - * @controller: associated controller number for the devlink port instance - * @pf: associated PF for the devlink port instance - * @sf: associated SF of a PF for the devlink port instance - * @external: indicates if the port is for an external controller - */ -void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, - u16 pf, u32 sf, bool external) -{ - struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; - - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_SF); - if (ret) - return; - 
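All of these attrs setters must run on a not-yet-registered port, as the assertions enforce, and the chosen flavour later drives the phys_port_name format below. A sketch for a subport of a split physical port (values hypothetical):

	static void my_port_attrs_init(struct devlink_port *dl_port)
	{
		struct devlink_port_attrs attrs = {};

		attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
		attrs.phys.port_number = 1;
		attrs.split = true;		/* a subport... */
		attrs.phys.split_subport_number = 2;
		attrs.splittable = false;	/* ...cannot be split further */
		devlink_port_attrs_set(dl_port, &attrs);
		/* renders as phys_port_name "p1s2" via the helper further down */
	}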
attrs->pci_sf.controller = controller; - attrs->pci_sf.pf = pf; - attrs->pci_sf.sf = sf; - attrs->pci_sf.external = external; -} -EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); - -/** - * devl_rate_node_create - create devlink rate node - * @devlink: devlink instance - * @priv: driver private data - * @node_name: name of the resulting node - * @parent: parent devlink_rate struct - * - * Create devlink rate object of type node - */ -struct devlink_rate * -devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, - struct devlink_rate *parent) -{ - struct devlink_rate *rate_node; - - rate_node = devlink_rate_node_get_by_name(devlink, node_name); - if (!IS_ERR(rate_node)) - return ERR_PTR(-EEXIST); - - rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); - if (!rate_node) - return ERR_PTR(-ENOMEM); - - if (parent) { - rate_node->parent = parent; - refcount_inc(&rate_node->parent->refcnt); - } - - rate_node->type = DEVLINK_RATE_TYPE_NODE; - rate_node->devlink = devlink; - rate_node->priv = priv; - - rate_node->name = kstrdup(node_name, GFP_KERNEL); - if (!rate_node->name) { - kfree(rate_node); - return ERR_PTR(-ENOMEM); - } - - refcount_set(&rate_node->refcnt, 1); - list_add(&rate_node->list, &devlink->rate_list); - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); - return rate_node; -} -EXPORT_SYMBOL_GPL(devl_rate_node_create); - -/** - * devl_rate_leaf_create - create devlink rate leaf - * @devlink_port: devlink port object to create rate object on - * @priv: driver private data - * @parent: parent devlink_rate struct - * - * Create devlink rate object of type leaf on provided @devlink_port. - */ -int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, - struct devlink_rate *parent) -{ - struct devlink *devlink = devlink_port->devlink; - struct devlink_rate *devlink_rate; - - devl_assert_locked(devlink_port->devlink); - - if (WARN_ON(devlink_port->devlink_rate)) - return -EBUSY; - - devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); - if (!devlink_rate) - return -ENOMEM; - - if (parent) { - devlink_rate->parent = parent; - refcount_inc(&devlink_rate->parent->refcnt); - } - - devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; - devlink_rate->devlink = devlink; - devlink_rate->devlink_port = devlink_port; - devlink_rate->priv = priv; - list_add_tail(&devlink_rate->list, &devlink->rate_list); - devlink_port->devlink_rate = devlink_rate; - devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); - - return 0; -} -EXPORT_SYMBOL_GPL(devl_rate_leaf_create); - -/** - * devl_rate_leaf_destroy - destroy devlink rate leaf - * - * @devlink_port: devlink port linked to the rate object - * - * Destroy the devlink rate object of type leaf on provided @devlink_port. - */ -void devl_rate_leaf_destroy(struct devlink_port *devlink_port) -{ - struct devlink_rate *devlink_rate = devlink_port->devlink_rate; - - devl_assert_locked(devlink_port->devlink); - if (!devlink_rate) - return; - - devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); - if (devlink_rate->parent) - refcount_dec(&devlink_rate->parent->refcnt); - list_del(&devlink_rate->list); - devlink_port->devlink_rate = NULL; - kfree(devlink_rate); -} -EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); - -/** - * devl_rate_nodes_destroy - destroy all devlink rate nodes on device - * @devlink: devlink instance - * - * Unset parent for all rate objects and destroy all rate nodes - * on specified device. 
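Rate nodes are looked up by name and refcount their parents, so building the tx scheduling tree is just create-node-then-attach-leaves. A hedged sketch with the instance lock held, as the devl_ prefix requires, and hypothetical driver priv:

	static int my_rate_tree_init(struct devlink *devlink,
				     struct devlink_port *dl_port, void *priv)
	{
		struct devlink_rate *node;

		devl_assert_locked(devlink);

		node = devl_rate_node_create(devlink, priv, "group0", NULL);
		if (IS_ERR(node))
			return PTR_ERR(node);

		/* leaf object for the port, parented under "group0" */
		return devl_rate_leaf_create(dl_port, priv, node);
	}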
- */ -void devl_rate_nodes_destroy(struct devlink *devlink) -{ - struct devlink_rate *devlink_rate, *tmp; - const struct devlink_ops *ops = devlink->ops; - - devl_assert_locked(devlink); - - list_for_each_entry(devlink_rate, &devlink->rate_list, list) { - if (!devlink_rate->parent) - continue; - - refcount_dec(&devlink_rate->parent->refcnt); - if (devlink_rate_is_leaf(devlink_rate)) - ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, - NULL, NULL); - else if (devlink_rate_is_node(devlink_rate)) - ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, - NULL, NULL); - } - list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { - if (devlink_rate_is_node(devlink_rate)) { - ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); - list_del(&devlink_rate->list); - kfree(devlink_rate->name); - kfree(devlink_rate); - } - } -} -EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy); - -/** - * devlink_port_linecard_set - Link port with a linecard - * - * @devlink_port: devlink port - * @linecard: devlink linecard - */ -void devlink_port_linecard_set(struct devlink_port *devlink_port, - struct devlink_linecard *linecard) -{ - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - - devlink_port->linecard = linecard; -} -EXPORT_SYMBOL_GPL(devlink_port_linecard_set); - -static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, - char *name, size_t len) -{ - struct devlink_port_attrs *attrs = &devlink_port->attrs; - int n = 0; - - if (!devlink_port->attrs_set) - return -EOPNOTSUPP; - - switch (attrs->flavour) { - case DEVLINK_PORT_FLAVOUR_PHYSICAL: - if (devlink_port->linecard) - n = snprintf(name, len, "l%u", - devlink_port->linecard->index); - if (n < len) - n += snprintf(name + n, len - n, "p%u", - attrs->phys.port_number); - if (n < len && attrs->split) - n += snprintf(name + n, len - n, "s%u", - attrs->phys.split_subport_number); - break; - case DEVLINK_PORT_FLAVOUR_CPU: - case DEVLINK_PORT_FLAVOUR_DSA: - case DEVLINK_PORT_FLAVOUR_UNUSED: - /* As CPU and DSA ports do not have a netdevice associated, - * this case should never happen.
- */ - WARN_ON(1); - return -EINVAL; - case DEVLINK_PORT_FLAVOUR_PCI_PF: - if (attrs->pci_pf.external) { - n = snprintf(name, len, "c%u", attrs->pci_pf.controller); - if (n >= len) - return -EINVAL; - len -= n; - name += n; - } - n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); - break; - case DEVLINK_PORT_FLAVOUR_PCI_VF: - if (attrs->pci_vf.external) { - n = snprintf(name, len, "c%u", attrs->pci_vf.controller); - if (n >= len) - return -EINVAL; - len -= n; - name += n; - } - n = snprintf(name, len, "pf%uvf%u", - attrs->pci_vf.pf, attrs->pci_vf.vf); - break; - case DEVLINK_PORT_FLAVOUR_PCI_SF: - if (attrs->pci_sf.external) { - n = snprintf(name, len, "c%u", attrs->pci_sf.controller); - if (n >= len) - return -EINVAL; - len -= n; - name += n; - } - n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, - attrs->pci_sf.sf); - break; - case DEVLINK_PORT_FLAVOUR_VIRTUAL: - return -EOPNOTSUPP; - } - - if (n >= len) - return -EINVAL; - - return 0; -} - -static int devlink_linecard_types_init(struct devlink_linecard *linecard) -{ - struct devlink_linecard_type *linecard_type; - unsigned int count; - int i; - - count = linecard->ops->types_count(linecard, linecard->priv); - linecard->types = kmalloc_array(count, sizeof(*linecard_type), - GFP_KERNEL); - if (!linecard->types) - return -ENOMEM; - linecard->types_count = count; - - for (i = 0; i < count; i++) { - linecard_type = &linecard->types[i]; - linecard->ops->types_get(linecard, linecard->priv, i, - &linecard_type->type, - &linecard_type->priv); - } - return 0; -} - -static void devlink_linecard_types_fini(struct devlink_linecard *linecard) -{ - kfree(linecard->types); -} - -/** - * devlink_linecard_create - Create devlink linecard - * - * @devlink: devlink - * @linecard_index: driver-specific numerical identifier of the linecard - * @ops: linecard ops - * @priv: user priv pointer - * - * Create devlink linecard instance with provided linecard index. - * Caller can use any indexing, even hw-related one. - * - * Return: Line card structure or an ERR_PTR() encoded error code.
- */ -struct devlink_linecard * -devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, - const struct devlink_linecard_ops *ops, void *priv) -{ - struct devlink_linecard *linecard; - int err; - - if (WARN_ON(!ops || !ops->provision || !ops->unprovision || - !ops->types_count || !ops->types_get)) - return ERR_PTR(-EINVAL); - - mutex_lock(&devlink->linecards_lock); - if (devlink_linecard_index_exists(devlink, linecard_index)) { - mutex_unlock(&devlink->linecards_lock); - return ERR_PTR(-EEXIST); - } - - linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); - if (!linecard) { - mutex_unlock(&devlink->linecards_lock); - return ERR_PTR(-ENOMEM); - } - - linecard->devlink = devlink; - linecard->index = linecard_index; - linecard->ops = ops; - linecard->priv = priv; - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - mutex_init(&linecard->state_lock); - - err = devlink_linecard_types_init(linecard); - if (err) { - mutex_destroy(&linecard->state_lock); - kfree(linecard); - mutex_unlock(&devlink->linecards_lock); - return ERR_PTR(err); - } - - list_add_tail(&linecard->list, &devlink->linecard_list); - refcount_set(&linecard->refcount, 1); - mutex_unlock(&devlink->linecards_lock); - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - return linecard; -} -EXPORT_SYMBOL_GPL(devlink_linecard_create); - -/** - * devlink_linecard_destroy - Destroy devlink linecard - * - * @linecard: devlink linecard - */ -void devlink_linecard_destroy(struct devlink_linecard *linecard) -{ - struct devlink *devlink = linecard->devlink; - - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); - mutex_lock(&devlink->linecards_lock); - list_del(&linecard->list); - devlink_linecard_types_fini(linecard); - mutex_unlock(&devlink->linecards_lock); - devlink_linecard_put(linecard); -} -EXPORT_SYMBOL_GPL(devlink_linecard_destroy); - -/** - * devlink_linecard_provision_set - Set provisioning on linecard - * - * @linecard: devlink linecard - * @type: linecard type - * - * This is either called directly from the provision() op call or - * as a result of the provision() op call asynchronously. - */ -void devlink_linecard_provision_set(struct devlink_linecard *linecard, - const char *type) -{ - mutex_lock(&linecard->state_lock); - WARN_ON(linecard->type && strcmp(linecard->type, type)); - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; - linecard->type = type; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); - -/** - * devlink_linecard_provision_clear - Clear provisioning on linecard - * - * @linecard: devlink linecard - * - * This is either called directly from the unprovision() op call or - * as a result of the unprovision() op call asynchronously. - */ -void devlink_linecard_provision_clear(struct devlink_linecard *linecard) -{ - mutex_lock(&linecard->state_lock); - WARN_ON(linecard->nested_devlink); - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear); - -/** - * devlink_linecard_provision_fail - Fail provisioning on linecard - * - * @linecard: devlink linecard - * - * This is either called directly from the provision() op call or - * as a result of the provision() op call asynchronously. 
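Provisioning is deliberately asynchronous: devlink_linecard_create() only checks that the four mandatory callbacks exist, and the driver reports the outcome later through devlink_linecard_provision_set()/_fail() from its own completion context. A sketch with hypothetical callbacks and type string:

	static const struct devlink_linecard_ops my_lc_ops = {
		.provision	= my_lc_provision,	/* hypothetical callbacks */
		.unprovision	= my_lc_unprovision,
		.types_count	= my_lc_types_count,
		.types_get	= my_lc_types_get,
	};

	linecard = devlink_linecard_create(devlink, lc_index, &my_lc_ops, lc_priv);
	if (IS_ERR(linecard))
		return PTR_ERR(linecard);

	/* later, from the asynchronous provision completion handler: */
	devlink_linecard_provision_set(linecard, "16x100G");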
- */ -void devlink_linecard_provision_fail(struct devlink_linecard *linecard) -{ - mutex_lock(&linecard->state_lock); - WARN_ON(linecard->nested_devlink); - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail); - -/** - * devlink_linecard_activate - Set linecard active - * - * @linecard: devlink linecard - */ -void devlink_linecard_activate(struct devlink_linecard *linecard) -{ - mutex_lock(&linecard->state_lock); - WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED); - linecard->state = DEVLINK_LINECARD_STATE_ACTIVE; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_activate); - -/** - * devlink_linecard_deactivate - Set linecard inactive - * - * @linecard: devlink linecard - */ -void devlink_linecard_deactivate(struct devlink_linecard *linecard) -{ - mutex_lock(&linecard->state_lock); - switch (linecard->state) { - case DEVLINK_LINECARD_STATE_ACTIVE: - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - break; - case DEVLINK_LINECARD_STATE_UNPROVISIONING: - /* Line card is being deactivated as part - * of unprovisioning flow. - */ - break; - default: - WARN_ON(1); - break; - } - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_deactivate); - -/** - * devlink_linecard_nested_dl_set - Attach/detach nested devlink - * instance to linecard. - * - * @linecard: devlink linecard - * @nested_devlink: devlink instance to attach or NULL to detach - */ -void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, - struct devlink *nested_devlink) -{ - mutex_lock(&linecard->state_lock); - linecard->nested_devlink = nested_devlink; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set); - -int devl_sb_register(struct devlink *devlink, unsigned int sb_index, - u32 size, u16 ingress_pools_count, - u16 egress_pools_count, u16 ingress_tc_count, - u16 egress_tc_count) -{ - struct devlink_sb *devlink_sb; - - lockdep_assert_held(&devlink->lock); - - if (devlink_sb_index_exists(devlink, sb_index)) - return -EEXIST; - - devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL); - if (!devlink_sb) - return -ENOMEM; - devlink_sb->index = sb_index; - devlink_sb->size = size; - devlink_sb->ingress_pools_count = ingress_pools_count; - devlink_sb->egress_pools_count = egress_pools_count; - devlink_sb->ingress_tc_count = ingress_tc_count; - devlink_sb->egress_tc_count = egress_tc_count; - list_add_tail(&devlink_sb->list, &devlink->sb_list); - return 0; -} -EXPORT_SYMBOL_GPL(devl_sb_register); - -int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, - u32 size, u16 ingress_pools_count, - u16 egress_pools_count, u16 ingress_tc_count, - u16 egress_tc_count) -{ - int err; - - devl_lock(devlink); - err = devl_sb_register(devlink, sb_index, size, ingress_pools_count, - egress_pools_count, ingress_tc_count, - egress_tc_count); - devl_unlock(devlink); - return err; -} -EXPORT_SYMBOL_GPL(devlink_sb_register); - -void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index) -{ - struct devlink_sb *devlink_sb; - - lockdep_assert_held(&devlink->lock); - - devlink_sb = devlink_sb_get_by_index(devlink, sb_index); - WARN_ON(!devlink_sb); - 
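devl_sb_register()/devlink_sb_register() only record the shared-buffer geometry; the actual pool and threshold plumbing lives in the sb_* devlink_ops callbacks. A sketch registering one buffer with made-up sizing:

	/* one 16 MB shared buffer, 2+2 pools, 8+8 TCs; values are hypothetical */
	err = devlink_sb_register(devlink, 0 /* sb_index */,
				  16 * 1024 * 1024,	/* size in bytes */
				  2, 2,			/* ingress/egress pools */
				  8, 8);		/* ingress/egress TCs */
	if (err)
		return err;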
list_del(&devlink_sb->list); - kfree(devlink_sb); -} -EXPORT_SYMBOL_GPL(devl_sb_unregister); - -void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) -{ - devl_lock(devlink); - devl_sb_unregister(devlink, sb_index); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_sb_unregister); - -/** - * devl_dpipe_headers_register - register dpipe headers - * - * @devlink: devlink - * @dpipe_headers: dpipe header array - * - * Register the headers supported by hardware. - */ -void devl_dpipe_headers_register(struct devlink *devlink, - struct devlink_dpipe_headers *dpipe_headers) -{ - lockdep_assert_held(&devlink->lock); - - devlink->dpipe_headers = dpipe_headers; -} -EXPORT_SYMBOL_GPL(devl_dpipe_headers_register); - -/** - * devl_dpipe_headers_unregister - unregister dpipe headers - * - * @devlink: devlink - * - * Unregister the headers supported by hardware. - */ -void devl_dpipe_headers_unregister(struct devlink *devlink) -{ - lockdep_assert_held(&devlink->lock); - - devlink->dpipe_headers = NULL; -} -EXPORT_SYMBOL_GPL(devl_dpipe_headers_unregister); - -/** - * devlink_dpipe_table_counter_enabled - check if counter allocation is - * required - * @devlink: devlink - * @table_name: table name - * - * Used by driver to check if counter allocation is required. - * After counter allocation is turned on the table entries - * are updated to include counter statistics. - * - * From that point on, the driver must respect the counter - * state so that each entry added to the table is added - * with a counter. - */ -bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, - const char *table_name) -{ - struct devlink_dpipe_table *table; - bool enabled; - - rcu_read_lock(); - table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name, devlink); - enabled = false; - if (table) - enabled = table->counters_enabled; - rcu_read_unlock(); - return enabled; -} -EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled); - -/** - * devl_dpipe_table_register - register dpipe table - * - * @devlink: devlink - * @table_name: table name - * @table_ops: table ops - * @priv: priv - * @counter_control_extern: external control for counters - */ -int devl_dpipe_table_register(struct devlink *devlink, - const char *table_name, - struct devlink_dpipe_table_ops *table_ops, - void *priv, bool counter_control_extern) -{ - struct devlink_dpipe_table *table; - - lockdep_assert_held(&devlink->lock); - - if (WARN_ON(!table_ops->size_get)) - return -EINVAL; - - if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name, - devlink)) - return -EEXIST; - - table = kzalloc(sizeof(*table), GFP_KERNEL); - if (!table) - return -ENOMEM; - - table->name = table_name; - table->table_ops = table_ops; - table->priv = priv; - table->counter_control_extern = counter_control_extern; - - list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); - - return 0; -} -EXPORT_SYMBOL_GPL(devl_dpipe_table_register); - -/** - * devl_dpipe_table_unregister - unregister dpipe table - * - * @devlink: devlink - * @table_name: table name - */ -void devl_dpipe_table_unregister(struct devlink *devlink, - const char *table_name) -{ - struct devlink_dpipe_table *table; - - lockdep_assert_held(&devlink->lock); - - table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name, devlink); - if (!table) - return; - list_del_rcu(&table->list); - kfree_rcu(table, rcu); -} -EXPORT_SYMBOL_GPL(devl_dpipe_table_unregister); - -/** - * devl_resource_register - devlink resource register - * - * @devlink: devlink - *
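For dpipe tables only size_get is mandatory, as the WARN_ON above shows; passing counter_control_extern as true hands counter control to the driver instead of devlink. A hedged sketch with hypothetical ops, called with the instance lock held:

	static struct devlink_dpipe_table_ops my_table_ops = {
		.size_get	     = my_table_size_get,	/* required */
		.entries_dump	     = my_table_entries_dump,	/* hypothetical */
		.counters_set_update = my_table_counters_update,
	};

	err = devl_dpipe_table_register(devlink, "my_host_table",
					&my_table_ops, priv,
					false /* counters under devlink control */);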
@resource_name: resource's name - * @resource_size: resource's size - * @resource_id: resource's id - * @parent_resource_id: resource's parent id - * @size_params: size parameters - * - * Generic resources should reuse the same names across drivers. - * Please see the generic resources list at: - * Documentation/networking/devlink/devlink-resource.rst - */ -int devl_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params) -{ - struct devlink_resource *resource; - struct list_head *resource_list; - bool top_hierarchy; - - lockdep_assert_held(&devlink->lock); - - top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP; - - resource = devlink_resource_find(devlink, NULL, resource_id); - if (resource) - return -EINVAL; - - resource = kzalloc(sizeof(*resource), GFP_KERNEL); - if (!resource) - return -ENOMEM; - - if (top_hierarchy) { - resource_list = &devlink->resource_list; - } else { - struct devlink_resource *parent_resource; - - parent_resource = devlink_resource_find(devlink, NULL, - parent_resource_id); - if (parent_resource) { - resource_list = &parent_resource->resource_list; - resource->parent = parent_resource; - } else { - kfree(resource); - return -EINVAL; - } - } - - resource->name = resource_name; - resource->size = resource_size; - resource->size_new = resource_size; - resource->id = resource_id; - resource->size_valid = true; - memcpy(&resource->size_params, size_params, - sizeof(resource->size_params)); - INIT_LIST_HEAD(&resource->resource_list); - list_add_tail(&resource->list, resource_list); - - return 0; -} -EXPORT_SYMBOL_GPL(devl_resource_register); - -/** - * devlink_resource_register - devlink resource register - * - * @devlink: devlink - * @resource_name: resource's name - * @resource_size: resource's size - * @resource_id: resource's id - * @parent_resource_id: resource's parent id - * @size_params: size parameters - * - * Generic resources should reuse the same names across drivers. - * Please see the generic resources list at: - * Documentation/networking/devlink/devlink-resource.rst - * - * Context: Takes and releases devlink->lock <mutex>.
- */ -int devlink_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params) -{ - int err; - - devl_lock(devlink); - err = devl_resource_register(devlink, resource_name, resource_size, - resource_id, parent_resource_id, size_params); - devl_unlock(devlink); - return err; -} -EXPORT_SYMBOL_GPL(devlink_resource_register); - -static void devlink_resource_unregister(struct devlink *devlink, - struct devlink_resource *resource) -{ - struct devlink_resource *tmp, *child_resource; - - list_for_each_entry_safe(child_resource, tmp, &resource->resource_list, - list) { - devlink_resource_unregister(devlink, child_resource); - list_del(&child_resource->list); - kfree(child_resource); - } -} - -/** - * devl_resources_unregister - free all resources - * - * @devlink: devlink - */ -void devl_resources_unregister(struct devlink *devlink) -{ - struct devlink_resource *tmp, *child_resource; - - lockdep_assert_held(&devlink->lock); - - list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list, - list) { - devlink_resource_unregister(devlink, child_resource); - list_del(&child_resource->list); - kfree(child_resource); - } -} -EXPORT_SYMBOL_GPL(devl_resources_unregister); - -/** - * devlink_resources_unregister - free all resources - * - * @devlink: devlink - * - * Context: Takes and releases devlink->lock <mutex>. - */ -void devlink_resources_unregister(struct devlink *devlink) -{ - devl_lock(devlink); - devl_resources_unregister(devlink); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_resources_unregister); - -/** - * devl_resource_size_get - get and update size - * - * @devlink: devlink - * @resource_id: the requested resource id - * @p_resource_size: ptr to update - */ -int devl_resource_size_get(struct devlink *devlink, - u64 resource_id, - u64 *p_resource_size) -{ - struct devlink_resource *resource; - - lockdep_assert_held(&devlink->lock); - - resource = devlink_resource_find(devlink, NULL, resource_id); - if (!resource) - return -EINVAL; - *p_resource_size = resource->size_new; - resource->size = resource->size_new; - return 0; -} -EXPORT_SYMBOL_GPL(devl_resource_size_get); - -/** - * devl_dpipe_table_resource_set - set the resource id - * - * @devlink: devlink - * @table_name: table name - * @resource_id: resource id - * @resource_units: number of resource's units consumed per table's entry - */ -int devl_dpipe_table_resource_set(struct devlink *devlink, - const char *table_name, u64 resource_id, - u64 resource_units) -{ - struct devlink_dpipe_table *table; - - table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name, devlink); - if (!table) - return -EINVAL; - - table->resource_id = resource_id; - table->resource_units = resource_units; - table->resource_valid = true; - return 0; -} -EXPORT_SYMBOL_GPL(devl_dpipe_table_resource_set); - -/** - * devl_resource_occ_get_register - register occupancy getter - * - * @devlink: devlink - * @resource_id: resource id - * @occ_get: occupancy getter callback - * @occ_get_priv: occupancy getter callback priv - */ -void devl_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv) -{ - struct devlink_resource *resource; - - lockdep_assert_held(&devlink->lock); - - resource = devlink_resource_find(devlink, NULL, resource_id); - if (WARN_ON(!resource)) - return; - WARN_ON(resource->occ_get); - - resource->occ_get = occ_get; -
resource->occ_get_priv = occ_get_priv; -} -EXPORT_SYMBOL_GPL(devl_resource_occ_get_register); - -/** - * devlink_resource_occ_get_register - register occupancy getter - * - * @devlink: devlink - * @resource_id: resource id - * @occ_get: occupancy getter callback - * @occ_get_priv: occupancy getter callback priv - * - * Context: Takes and releases devlink->lock <mutex>. - */ -void devlink_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv) -{ - devl_lock(devlink); - devl_resource_occ_get_register(devlink, resource_id, - occ_get, occ_get_priv); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register); - -/** - * devl_resource_occ_get_unregister - unregister occupancy getter - * - * @devlink: devlink - * @resource_id: resource id - */ -void devl_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id) -{ - struct devlink_resource *resource; - - lockdep_assert_held(&devlink->lock); - - resource = devlink_resource_find(devlink, NULL, resource_id); - if (WARN_ON(!resource)) - return; - WARN_ON(!resource->occ_get); - - resource->occ_get = NULL; - resource->occ_get_priv = NULL; -} -EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister); - -/** - * devlink_resource_occ_get_unregister - unregister occupancy getter - * - * @devlink: devlink - * @resource_id: resource id - * - * Context: Takes and releases devlink->lock <mutex>. - */ -void devlink_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id) -{ - devl_lock(devlink); - devl_resource_occ_get_unregister(devlink, resource_id); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); - -static int devlink_param_verify(const struct devlink_param *param) -{ - if (!param || !param->name || !param->supported_cmodes) - return -EINVAL; - if (param->generic) - return devlink_param_generic_verify(param); - else - return devlink_param_driver_verify(param); -} - -/** - * devlink_params_register - register configuration parameters - * - * @devlink: devlink - * @params: configuration parameters array - * @params_count: number of parameters provided - * - * Register the configuration parameters supported by the driver.
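A typical resource flow combines the helpers above: register the resource with its size parameters, attach an occupancy getter, and re-read the (possibly user-requested) size after reload via devl_resource_size_get(). A sketch with a hypothetical id and callback, under the instance lock:

	#define MY_RESOURCE_KVD	1	/* hypothetical driver-local id */

	static u64 my_kvd_occ_get(void *priv)
	{
		return 0;	/* would report current hardware occupancy */
	}

	static int my_resources_init(struct devlink *devlink, void *priv)
	{
		struct devlink_resource_size_params params;
		int err;

		devlink_resource_size_params_init(&params, 0, 1 << 20, 1024,
						  DEVLINK_RESOURCE_UNIT_ENTRY);
		err = devl_resource_register(devlink, "kvd", 1 << 20,
					     MY_RESOURCE_KVD,
					     DEVLINK_RESOURCE_ID_PARENT_TOP,
					     &params);
		if (err)
			return err;
		devl_resource_occ_get_register(devlink, MY_RESOURCE_KVD,
					       my_kvd_occ_get, priv);
		return 0;
	}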
- */ -int devlink_params_register(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) -{ - const struct devlink_param *param = params; - int i, err; - - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - for (i = 0; i < params_count; i++, param++) { - err = devlink_param_register(devlink, param); - if (err) - goto rollback; - } - return 0; - -rollback: - if (!i) - return err; - - for (param--; i > 0; i--, param--) - devlink_param_unregister(devlink, param); - return err; -} -EXPORT_SYMBOL_GPL(devlink_params_register); - -/** - * devlink_params_unregister - unregister configuration parameters - * @devlink: devlink - * @params: configuration parameters to unregister - * @params_count: number of parameters provided - */ -void devlink_params_unregister(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) -{ - const struct devlink_param *param = params; - int i; - - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - for (i = 0; i < params_count; i++, param++) - devlink_param_unregister(devlink, param); -} -EXPORT_SYMBOL_GPL(devlink_params_unregister); - -/** - * devlink_param_register - register one configuration parameter - * - * @devlink: devlink - * @param: one configuration parameter - * - * Register the configuration parameter supported by the driver. - * Return: returns 0 on successful registration or error code otherwise. - */ -int devlink_param_register(struct devlink *devlink, - const struct devlink_param *param) -{ - struct devlink_param_item *param_item; - - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - WARN_ON(devlink_param_verify(param)); - WARN_ON(devlink_param_find_by_name(&devlink->param_list, param->name)); - - if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) - WARN_ON(param->get || param->set); - else - WARN_ON(!param->get || !param->set); - - param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); - if (!param_item) - return -ENOMEM; - - param_item->param = param; - - list_add_tail(¶m_item->list, &devlink->param_list); - return 0; -} -EXPORT_SYMBOL_GPL(devlink_param_register); - -/** - * devlink_param_unregister - unregister one configuration parameter - * @devlink: devlink - * @param: configuration parameter to unregister - */ -void devlink_param_unregister(struct devlink *devlink, - const struct devlink_param *param) -{ - struct devlink_param_item *param_item; - - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - param_item = - devlink_param_find_by_name(&devlink->param_list, param->name); - WARN_ON(!param_item); - list_del(¶m_item->list); - kfree(param_item); -} -EXPORT_SYMBOL_GPL(devlink_param_unregister); - -/** - * devlink_param_driverinit_value_get - get configuration parameter - * value for driver initializing - * - * @devlink: devlink - * @param_id: parameter ID - * @init_val: value of parameter in driverinit configuration mode - * - * This function should be used by the driver to get driverinit - * configuration for initialization after reload command. 
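Pairing the two driverinit helpers documented above: the driver seeds a default before devlink_register(), then re-reads the (possibly user-overridden) value when reinitializing after a reload. A hedged sketch continuing the hypothetical queue_count parameter:

static void foo_params_publish(struct devlink *devlink)
{
	union devlink_param_value val;

	val.vu32 = 32;	/* built-in default */
	devlink_param_driverinit_value_set(devlink,
					   FOO_DEVLINK_PARAM_ID_QUEUE_COUNT,
					   val);
}

static u32 foo_queue_count(struct devlink *devlink)
{
	union devlink_param_value val;

	/* -EOPNOTSUPP if reload is unsupported or the param is not
	 * driverinit-capable; fall back to the built-in default then.
	 */
	if (devlink_param_driverinit_value_get(devlink,
					       FOO_DEVLINK_PARAM_ID_QUEUE_COUNT,
					       &val))
		return 32;
	return val.vu32;
}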
- */ -int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, - union devlink_param_value *init_val) -{ - struct devlink_param_item *param_item; - - if (!devlink_reload_supported(devlink->ops)) - return -EOPNOTSUPP; - - param_item = devlink_param_find_by_id(&devlink->param_list, param_id); - if (!param_item) - return -EINVAL; - - if (!param_item->driverinit_value_valid || - !devlink_param_cmode_is_supported(param_item->param, - DEVLINK_PARAM_CMODE_DRIVERINIT)) - return -EOPNOTSUPP; - - if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) - strcpy(init_val->vstr, param_item->driverinit_value.vstr); - else - *init_val = param_item->driverinit_value; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); - -/** - * devlink_param_driverinit_value_set - set value of configuration - * parameter for driverinit - * configuration mode - * - * @devlink: devlink - * @param_id: parameter ID - * @init_val: value of parameter to set for driverinit configuration mode - * - * This function should be used by the driver to set driverinit - * configuration mode default value. - */ -int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, - union devlink_param_value init_val) -{ - struct devlink_param_item *param_item; - - ASSERT_DEVLINK_NOT_REGISTERED(devlink); - - param_item = devlink_param_find_by_id(&devlink->param_list, param_id); - if (!param_item) - return -EINVAL; - - if (!devlink_param_cmode_is_supported(param_item->param, - DEVLINK_PARAM_CMODE_DRIVERINIT)) - return -EOPNOTSUPP; - - if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) - strcpy(param_item->driverinit_value.vstr, init_val.vstr); - else - param_item->driverinit_value = init_val; - param_item->driverinit_value_valid = true; - return 0; -} -EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); - -/** - * devlink_param_value_changed - notify devlink on a parameter's value - * change. Should be called by the driver - * right after the change. - * - * @devlink: devlink - * @param_id: parameter ID - * - * This function should be used by the driver to notify devlink on value - * change, excluding driverinit configuration mode. 
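devlink_param_value_changed(), whose body follows, is the counterpart for runtime and permanent cmodes: when the device or firmware changes a value behind devlink's back, the driver must notify so cached user-space values stay coherent. A one-line sketch with a hypothetical runtime-capable parameter id:

/* e.g. from a firmware event handler; not for driverinit cmode */
devlink_param_value_changed(devlink, FOO_DEVLINK_PARAM_ID_FW_POLICY);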
- * For driverinit configuration mode driver should use the function - */ -void devlink_param_value_changed(struct devlink *devlink, u32 param_id) -{ - struct devlink_param_item *param_item; - - param_item = devlink_param_find_by_id(&devlink->param_list, param_id); - WARN_ON(!param_item); - - devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); -} -EXPORT_SYMBOL_GPL(devlink_param_value_changed); - -/** - * devl_region_create - create a new address region - * - * @devlink: devlink - * @ops: region operations and name - * @region_max_snapshots: Maximum supported number of snapshots for region - * @region_size: size of region - */ -struct devlink_region *devl_region_create(struct devlink *devlink, - const struct devlink_region_ops *ops, - u32 region_max_snapshots, - u64 region_size) -{ - struct devlink_region *region; - - devl_assert_locked(devlink); - - if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) - return ERR_PTR(-EINVAL); - - if (devlink_region_get_by_name(devlink, ops->name)) - return ERR_PTR(-EEXIST); - - region = kzalloc(sizeof(*region), GFP_KERNEL); - if (!region) - return ERR_PTR(-ENOMEM); - - region->devlink = devlink; - region->max_snapshots = region_max_snapshots; - region->ops = ops; - region->size = region_size; - INIT_LIST_HEAD(®ion->snapshot_list); - mutex_init(®ion->snapshot_lock); - list_add_tail(®ion->list, &devlink->region_list); - devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); - - return region; -} -EXPORT_SYMBOL_GPL(devl_region_create); - -/** - * devlink_region_create - create a new address region - * - * @devlink: devlink - * @ops: region operations and name - * @region_max_snapshots: Maximum supported number of snapshots for region - * @region_size: size of region - * - * Context: Takes and release devlink->lock <mutex>. - */ -struct devlink_region * -devlink_region_create(struct devlink *devlink, - const struct devlink_region_ops *ops, - u32 region_max_snapshots, u64 region_size) -{ - struct devlink_region *region; - - devl_lock(devlink); - region = devl_region_create(devlink, ops, region_max_snapshots, - region_size); - devl_unlock(devlink); - return region; -} -EXPORT_SYMBOL_GPL(devlink_region_create); - -/** - * devlink_port_region_create - create a new address region for a port - * - * @port: devlink port - * @ops: region operations and name - * @region_max_snapshots: Maximum supported number of snapshots for region - * @region_size: size of region - * - * Context: Takes and release devlink->lock <mutex>. 
- */ -struct devlink_region * -devlink_port_region_create(struct devlink_port *port, - const struct devlink_port_region_ops *ops, - u32 region_max_snapshots, u64 region_size) -{ - struct devlink *devlink = port->devlink; - struct devlink_region *region; - int err = 0; - - ASSERT_DEVLINK_PORT_INITIALIZED(port); - - if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) - return ERR_PTR(-EINVAL); - - devl_lock(devlink); - - if (devlink_port_region_get_by_name(port, ops->name)) { - err = -EEXIST; - goto unlock; - } - - region = kzalloc(sizeof(*region), GFP_KERNEL); - if (!region) { - err = -ENOMEM; - goto unlock; - } - - region->devlink = devlink; - region->port = port; - region->max_snapshots = region_max_snapshots; - region->port_ops = ops; - region->size = region_size; - INIT_LIST_HEAD(®ion->snapshot_list); - mutex_init(®ion->snapshot_lock); - list_add_tail(®ion->list, &port->region_list); - devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); - - devl_unlock(devlink); - return region; - -unlock: - devl_unlock(devlink); - return ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(devlink_port_region_create); - -/** - * devl_region_destroy - destroy address region - * - * @region: devlink region to destroy - */ -void devl_region_destroy(struct devlink_region *region) -{ - struct devlink *devlink = region->devlink; - struct devlink_snapshot *snapshot, *ts; - - devl_assert_locked(devlink); - - /* Free all snapshots of region */ - mutex_lock(®ion->snapshot_lock); - list_for_each_entry_safe(snapshot, ts, ®ion->snapshot_list, list) - devlink_region_snapshot_del(region, snapshot); - mutex_unlock(®ion->snapshot_lock); - - list_del(®ion->list); - mutex_destroy(®ion->snapshot_lock); - - devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); - kfree(region); -} -EXPORT_SYMBOL_GPL(devl_region_destroy); - -/** - * devlink_region_destroy - destroy address region - * - * @region: devlink region to destroy - * - * Context: Takes and release devlink->lock <mutex>. - */ -void devlink_region_destroy(struct devlink_region *region) -{ - struct devlink *devlink = region->devlink; - - devl_lock(devlink); - devl_region_destroy(region); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_region_destroy); - -/** - * devlink_region_snapshot_id_get - get snapshot ID - * - * This callback should be called when adding a new snapshot, - * Driver should use the same id for multiple snapshots taken - * on multiple regions at the same time/by the same trigger. - * - * The caller of this function must use devlink_region_snapshot_id_put - * when finished creating regions using this id. - * - * Returns zero on success, or a negative error code on failure. - * - * @devlink: devlink - * @id: storage to return id - */ -int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) -{ - return __devlink_region_snapshot_id_get(devlink, id); -} -EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get); - -/** - * devlink_region_snapshot_id_put - put snapshot ID reference - * - * This should be called by a driver after finishing creating snapshots - * with an id. Doing so ensures that the ID can later be released in the - * event that all snapshots using it have been destroyed. - * - * @devlink: devlink - * @id: id to release reference on - */ -void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id) -{ - __devlink_snapshot_id_decrement(devlink, id); -} -EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put); - -/** - * devlink_region_snapshot_create - create a new snapshot - * This will add a new snapshot of a region. 
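Taken together with devl_region_create() and the snapshot-id helpers above, the workflow condenses to the sketch below. The foo_* names are hypothetical, and the snapshot data must be allocated to match ops->destructor (here kfree), since ownership passes to devlink on success:

static const struct devlink_region_ops foo_region_ops = {
	.name	    = "foo-ring",
	.destructor = kfree,	/* required: devl_region_create() rejects NULL */
};

static int foo_capture(struct devlink *devlink,
		       struct devlink_region *region, u8 *data)
{
	u32 snapshot_id;
	int err;

	err = devlink_region_snapshot_id_get(devlink, &snapshot_id);
	if (err)
		return err;

	err = devlink_region_snapshot_create(region, data, snapshot_id);

	/* drop the id reference either way; live snapshots keep it pinned */
	devlink_region_snapshot_id_put(devlink, snapshot_id);
	return err;
}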
The snapshot - * will be stored on the region struct and can be accessed - * from devlink. This is useful for future analyses of snapshots. - * Multiple snapshots can be created on a region. - * The @snapshot_id should be obtained using the getter function. - * - * @region: devlink region of the snapshot - * @data: snapshot data - * @snapshot_id: snapshot id to be created - */ -int devlink_region_snapshot_create(struct devlink_region *region, - u8 *data, u32 snapshot_id) -{ - int err; - - mutex_lock(®ion->snapshot_lock); - err = __devlink_region_snapshot_create(region, data, snapshot_id); - mutex_unlock(®ion->snapshot_lock); - return err; -} -EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); - -#define DEVLINK_TRAP(_id, _type) \ - { \ - .type = DEVLINK_TRAP_TYPE_##_type, \ - .id = DEVLINK_TRAP_GENERIC_ID_##_id, \ - .name = DEVLINK_TRAP_GENERIC_NAME_##_id, \ - } - -static const struct devlink_trap devlink_trap_generic[] = { - DEVLINK_TRAP(SMAC_MC, DROP), - DEVLINK_TRAP(VLAN_TAG_MISMATCH, DROP), - DEVLINK_TRAP(INGRESS_VLAN_FILTER, DROP), - DEVLINK_TRAP(INGRESS_STP_FILTER, DROP), - DEVLINK_TRAP(EMPTY_TX_LIST, DROP), - DEVLINK_TRAP(PORT_LOOPBACK_FILTER, DROP), - DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP), - DEVLINK_TRAP(TTL_ERROR, EXCEPTION), - DEVLINK_TRAP(TAIL_DROP, DROP), - DEVLINK_TRAP(NON_IP_PACKET, DROP), - DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP), - DEVLINK_TRAP(DIP_LB, DROP), - DEVLINK_TRAP(SIP_MC, DROP), - DEVLINK_TRAP(SIP_LB, DROP), - DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP), - DEVLINK_TRAP(IPV4_SIP_BC, DROP), - DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP), - DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP), - DEVLINK_TRAP(MTU_ERROR, EXCEPTION), - DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION), - DEVLINK_TRAP(RPF, EXCEPTION), - DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), - DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), - DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), - DEVLINK_TRAP(NON_ROUTABLE, DROP), - DEVLINK_TRAP(DECAP_ERROR, EXCEPTION), - DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), - DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP), - DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), - DEVLINK_TRAP(STP, CONTROL), - DEVLINK_TRAP(LACP, CONTROL), - DEVLINK_TRAP(LLDP, CONTROL), - DEVLINK_TRAP(IGMP_QUERY, CONTROL), - DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL), - DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL), - DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL), - DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL), - DEVLINK_TRAP(MLD_QUERY, CONTROL), - DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), - DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), - DEVLINK_TRAP(MLD_V1_DONE, CONTROL), - DEVLINK_TRAP(IPV4_DHCP, CONTROL), - DEVLINK_TRAP(IPV6_DHCP, CONTROL), - DEVLINK_TRAP(ARP_REQUEST, CONTROL), - DEVLINK_TRAP(ARP_RESPONSE, CONTROL), - DEVLINK_TRAP(ARP_OVERLAY, CONTROL), - DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL), - DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL), - DEVLINK_TRAP(IPV4_BFD, CONTROL), - DEVLINK_TRAP(IPV6_BFD, CONTROL), - DEVLINK_TRAP(IPV4_OSPF, CONTROL), - DEVLINK_TRAP(IPV6_OSPF, CONTROL), - DEVLINK_TRAP(IPV4_BGP, CONTROL), - DEVLINK_TRAP(IPV6_BGP, CONTROL), - DEVLINK_TRAP(IPV4_VRRP, CONTROL), - DEVLINK_TRAP(IPV6_VRRP, CONTROL), - DEVLINK_TRAP(IPV4_PIM, CONTROL), - DEVLINK_TRAP(IPV6_PIM, CONTROL), - DEVLINK_TRAP(UC_LB, CONTROL), - DEVLINK_TRAP(LOCAL_ROUTE, CONTROL), - DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL), - DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL), - DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL), - DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL), - DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL), - DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL), - DEVLINK_TRAP(IPV6_REDIRECT, 
CONTROL), - DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL), - DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), - DEVLINK_TRAP(PTP_EVENT, CONTROL), - DEVLINK_TRAP(PTP_GENERAL, CONTROL), - DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), - DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), - DEVLINK_TRAP(EARLY_DROP, DROP), - DEVLINK_TRAP(VXLAN_PARSING, DROP), - DEVLINK_TRAP(LLC_SNAP_PARSING, DROP), - DEVLINK_TRAP(VLAN_PARSING, DROP), - DEVLINK_TRAP(PPPOE_PPP_PARSING, DROP), - DEVLINK_TRAP(MPLS_PARSING, DROP), - DEVLINK_TRAP(ARP_PARSING, DROP), - DEVLINK_TRAP(IP_1_PARSING, DROP), - DEVLINK_TRAP(IP_N_PARSING, DROP), - DEVLINK_TRAP(GRE_PARSING, DROP), - DEVLINK_TRAP(UDP_PARSING, DROP), - DEVLINK_TRAP(TCP_PARSING, DROP), - DEVLINK_TRAP(IPSEC_PARSING, DROP), - DEVLINK_TRAP(SCTP_PARSING, DROP), - DEVLINK_TRAP(DCCP_PARSING, DROP), - DEVLINK_TRAP(GTP_PARSING, DROP), - DEVLINK_TRAP(ESP_PARSING, DROP), - DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), - DEVLINK_TRAP(DMAC_FILTER, DROP), - DEVLINK_TRAP(EAPOL, CONTROL), - DEVLINK_TRAP(LOCKED_PORT, DROP), -}; - -#define DEVLINK_TRAP_GROUP(_id) \ - { \ - .id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \ - .name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \ - } - -static const struct devlink_trap_group devlink_trap_group_generic[] = { - DEVLINK_TRAP_GROUP(L2_DROPS), - DEVLINK_TRAP_GROUP(L3_DROPS), - DEVLINK_TRAP_GROUP(L3_EXCEPTIONS), - DEVLINK_TRAP_GROUP(BUFFER_DROPS), - DEVLINK_TRAP_GROUP(TUNNEL_DROPS), - DEVLINK_TRAP_GROUP(ACL_DROPS), - DEVLINK_TRAP_GROUP(STP), - DEVLINK_TRAP_GROUP(LACP), - DEVLINK_TRAP_GROUP(LLDP), - DEVLINK_TRAP_GROUP(MC_SNOOPING), - DEVLINK_TRAP_GROUP(DHCP), - DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY), - DEVLINK_TRAP_GROUP(BFD), - DEVLINK_TRAP_GROUP(OSPF), - DEVLINK_TRAP_GROUP(BGP), - DEVLINK_TRAP_GROUP(VRRP), - DEVLINK_TRAP_GROUP(PIM), - DEVLINK_TRAP_GROUP(UC_LB), - DEVLINK_TRAP_GROUP(LOCAL_DELIVERY), - DEVLINK_TRAP_GROUP(EXTERNAL_DELIVERY), - DEVLINK_TRAP_GROUP(IPV6), - DEVLINK_TRAP_GROUP(PTP_EVENT), - DEVLINK_TRAP_GROUP(PTP_GENERAL), - DEVLINK_TRAP_GROUP(ACL_SAMPLE), - DEVLINK_TRAP_GROUP(ACL_TRAP), - DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS), - DEVLINK_TRAP_GROUP(EAPOL), -}; - -static int devlink_trap_generic_verify(const struct devlink_trap *trap) -{ - if (trap->id > DEVLINK_TRAP_GENERIC_ID_MAX) - return -EINVAL; - - if (strcmp(trap->name, devlink_trap_generic[trap->id].name)) - return -EINVAL; - - if (trap->type != devlink_trap_generic[trap->id].type) - return -EINVAL; - - return 0; -} - -static int devlink_trap_driver_verify(const struct devlink_trap *trap) -{ - int i; - - if (trap->id <= DEVLINK_TRAP_GENERIC_ID_MAX) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(devlink_trap_generic); i++) { - if (!strcmp(trap->name, devlink_trap_generic[i].name)) - return -EEXIST; - } - - return 0; -} - -static int devlink_trap_verify(const struct devlink_trap *trap) -{ - if (!trap || !trap->name) - return -EINVAL; - - if (trap->generic) - return devlink_trap_generic_verify(trap); - else - return devlink_trap_driver_verify(trap); -} - -static int -devlink_trap_group_generic_verify(const struct devlink_trap_group *group) -{ - if (group->id > DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) - return -EINVAL; - - if (strcmp(group->name, devlink_trap_group_generic[group->id].name)) - return -EINVAL; - - return 0; -} - -static int -devlink_trap_group_driver_verify(const struct devlink_trap_group *group) -{ - int i; - - if (group->id <= DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(devlink_trap_group_generic); i++) { - if (!strcmp(group->name, 
devlink_trap_group_generic[i].name)) - return -EEXIST; - } - - return 0; -} - -static int devlink_trap_group_verify(const struct devlink_trap_group *group) -{ - if (group->generic) - return devlink_trap_group_generic_verify(group); - else - return devlink_trap_group_driver_verify(group); -} - -static void -devlink_trap_group_notify(struct devlink *devlink, - const struct devlink_trap_group_item *group_item, - enum devlink_command cmd) -{ - struct sk_buff *msg; - int err; - - WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW && - cmd != DEVLINK_CMD_TRAP_GROUP_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_trap_group_fill(msg, devlink, group_item, cmd, 0, 0, - 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static int -devlink_trap_item_group_link(struct devlink *devlink, - struct devlink_trap_item *trap_item) -{ - u16 group_id = trap_item->trap->init_group_id; - struct devlink_trap_group_item *group_item; - - group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id); - if (WARN_ON_ONCE(!group_item)) - return -EINVAL; - - trap_item->group_item = group_item; - - return 0; -} - -static void devlink_trap_notify(struct devlink *devlink, - const struct devlink_trap_item *trap_item, - enum devlink_command cmd) -{ - struct sk_buff *msg; - int err; - - WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW && - cmd != DEVLINK_CMD_TRAP_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_trap_fill(msg, devlink, trap_item, cmd, 0, 0, 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static int -devlink_trap_register(struct devlink *devlink, - const struct devlink_trap *trap, void *priv) -{ - struct devlink_trap_item *trap_item; - int err; - - if (devlink_trap_item_lookup(devlink, trap->name)) - return -EEXIST; - - trap_item = kzalloc(sizeof(*trap_item), GFP_KERNEL); - if (!trap_item) - return -ENOMEM; - - trap_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); - if (!trap_item->stats) { - err = -ENOMEM; - goto err_stats_alloc; - } - - trap_item->trap = trap; - trap_item->action = trap->init_action; - trap_item->priv = priv; - - err = devlink_trap_item_group_link(devlink, trap_item); - if (err) - goto err_group_link; - - err = devlink->ops->trap_init(devlink, trap, trap_item); - if (err) - goto err_trap_init; - - list_add_tail(&trap_item->list, &devlink->trap_list); - devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); - - return 0; - -err_trap_init: -err_group_link: - free_percpu(trap_item->stats); -err_stats_alloc: - kfree(trap_item); - return err; -} - -static void devlink_trap_unregister(struct devlink *devlink, - const struct devlink_trap *trap) -{ - struct devlink_trap_item *trap_item; - - trap_item = devlink_trap_item_lookup(devlink, trap->name); - if (WARN_ON_ONCE(!trap_item)) - return; - - devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); - list_del(&trap_item->list); - if (devlink->ops->trap_fini) - devlink->ops->trap_fini(devlink, trap, trap_item); - free_percpu(trap_item->stats); - kfree(trap_item); -} - -static void devlink_trap_disable(struct devlink *devlink, - const 
struct devlink_trap *trap) -{ - struct devlink_trap_item *trap_item; - - trap_item = devlink_trap_item_lookup(devlink, trap->name); - if (WARN_ON_ONCE(!trap_item)) - return; - - devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP, - NULL); - trap_item->action = DEVLINK_TRAP_ACTION_DROP; -} - -/** - * devl_traps_register - Register packet traps with devlink. - * @devlink: devlink. - * @traps: Packet traps. - * @traps_count: Count of provided packet traps. - * @priv: Driver private information. - * - * Return: Non-zero value on failure. - */ -int devl_traps_register(struct devlink *devlink, - const struct devlink_trap *traps, - size_t traps_count, void *priv) -{ - int i, err; - - if (!devlink->ops->trap_init || !devlink->ops->trap_action_set) - return -EINVAL; - - devl_assert_locked(devlink); - for (i = 0; i < traps_count; i++) { - const struct devlink_trap *trap = &traps[i]; - - err = devlink_trap_verify(trap); - if (err) - goto err_trap_verify; - - err = devlink_trap_register(devlink, trap, priv); - if (err) - goto err_trap_register; - } - - return 0; - -err_trap_register: -err_trap_verify: - for (i--; i >= 0; i--) - devlink_trap_unregister(devlink, &traps[i]); - return err; -} -EXPORT_SYMBOL_GPL(devl_traps_register); - -/** - * devlink_traps_register - Register packet traps with devlink. - * @devlink: devlink. - * @traps: Packet traps. - * @traps_count: Count of provided packet traps. - * @priv: Driver private information. - * - * Context: Takes and release devlink->lock <mutex>. - * - * Return: Non-zero value on failure. - */ -int devlink_traps_register(struct devlink *devlink, - const struct devlink_trap *traps, - size_t traps_count, void *priv) -{ - int err; - - devl_lock(devlink); - err = devl_traps_register(devlink, traps, traps_count, priv); - devl_unlock(devlink); - return err; -} -EXPORT_SYMBOL_GPL(devlink_traps_register); - -/** - * devl_traps_unregister - Unregister packet traps from devlink. - * @devlink: devlink. - * @traps: Packet traps. - * @traps_count: Count of provided packet traps. - */ -void devl_traps_unregister(struct devlink *devlink, - const struct devlink_trap *traps, - size_t traps_count) -{ - int i; - - devl_assert_locked(devlink); - /* Make sure we do not have any packets in-flight while unregistering - * traps by disabling all of them and waiting for a grace period. - */ - for (i = traps_count - 1; i >= 0; i--) - devlink_trap_disable(devlink, &traps[i]); - synchronize_rcu(); - for (i = traps_count - 1; i >= 0; i--) - devlink_trap_unregister(devlink, &traps[i]); -} -EXPORT_SYMBOL_GPL(devl_traps_unregister); - -/** - * devlink_traps_unregister - Unregister packet traps from devlink. - * @devlink: devlink. - * @traps: Packet traps. - * @traps_count: Count of provided packet traps. - * - * Context: Takes and release devlink->lock <mutex>. 
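devl_traps_register() above requires both ops->trap_init and ops->trap_action_set, verifies generic trap ids and names against devlink_trap_generic[], and unwinds already-registered traps on failure. On the driver side, generic traps and groups are conventionally declared with the DEVLINK_TRAP_GENERIC()/DEVLINK_TRAP_GROUP_GENERIC() helpers (defined in include/net/devlink.h, outside this diff); a hedged sketch with hypothetical foo_* arrays:

static const struct devlink_trap_group foo_trap_groups[] = {
	DEVLINK_TRAP_GROUP_GENERIC(L2_DROPS, 0 /* no policer */),
};

static const struct devlink_trap foo_traps[] = {
	DEVLINK_TRAP_GENERIC(DROP, DROP, INGRESS_VLAN_FILTER,
			     DEVLINK_TRAP_GROUP_GENERIC_ID_L2_DROPS,
			     DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT),
};

/* groups must exist first: trap registration resolves
 * trap->init_group_id via devlink_trap_item_group_link()
 */
static int foo_traps_init(struct devlink *devlink, void *priv)
{
	int err;

	devl_lock(devlink);
	err = devl_trap_groups_register(devlink, foo_trap_groups,
					ARRAY_SIZE(foo_trap_groups));
	if (!err)
		err = devl_traps_register(devlink, foo_traps,
					  ARRAY_SIZE(foo_traps), priv);
	devl_unlock(devlink);
	return err;
}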
- */ -void devlink_traps_unregister(struct devlink *devlink, - const struct devlink_trap *traps, - size_t traps_count) -{ - devl_lock(devlink); - devl_traps_unregister(devlink, traps, traps_count); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_traps_unregister); - -static void -devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats, - size_t skb_len) -{ - struct devlink_stats *stats; - - stats = this_cpu_ptr(trap_stats); - u64_stats_update_begin(&stats->syncp); - u64_stats_add(&stats->rx_bytes, skb_len); - u64_stats_inc(&stats->rx_packets); - u64_stats_update_end(&stats->syncp); -} - -static void -devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata, - const struct devlink_trap_item *trap_item, - struct devlink_port *in_devlink_port, - const struct flow_action_cookie *fa_cookie) -{ - metadata->trap_name = trap_item->trap->name; - metadata->trap_group_name = trap_item->group_item->group->name; - metadata->fa_cookie = fa_cookie; - metadata->trap_type = trap_item->trap->type; - - spin_lock(&in_devlink_port->type_lock); - if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH) - metadata->input_dev = in_devlink_port->type_eth.netdev; - spin_unlock(&in_devlink_port->type_lock); -} - -/** - * devlink_trap_report - Report trapped packet to drop monitor. - * @devlink: devlink. - * @skb: Trapped packet. - * @trap_ctx: Trap context. - * @in_devlink_port: Input devlink port. - * @fa_cookie: Flow action cookie. Could be NULL. - */ -void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, - void *trap_ctx, struct devlink_port *in_devlink_port, - const struct flow_action_cookie *fa_cookie) - -{ - struct devlink_trap_item *trap_item = trap_ctx; - - devlink_trap_stats_update(trap_item->stats, skb->len); - devlink_trap_stats_update(trap_item->group_item->stats, skb->len); - - if (trace_devlink_trap_report_enabled()) { - struct devlink_trap_metadata metadata = {}; - - devlink_trap_report_metadata_set(&metadata, trap_item, - in_devlink_port, fa_cookie); - trace_devlink_trap_report(devlink, skb, &metadata); - } -} -EXPORT_SYMBOL_GPL(devlink_trap_report); - -/** - * devlink_trap_ctx_priv - Trap context to driver private information. - * @trap_ctx: Trap context. - * - * Return: Driver private information passed during registration. 
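devlink_trap_report() and devlink_trap_ctx_priv() together define the per-packet driver contract: the trap_ctx cookie devlink handed to ops->trap_init() flows back with the trapped packet, the priv recovered from it is the pointer originally given to devl_traps_register(), and the report call bumps per-CPU trap and group stats, building drop-monitor metadata only when the tracepoint is live. A hedged RX-path sketch; the foo_* names are hypothetical:

static void foo_rx_trapped(struct devlink *devlink,
			   struct devlink_port *in_port,
			   struct sk_buff *skb, void *trap_ctx)
{
	/* priv cookie originally passed to devl_traps_register() */
	struct foo_trap_priv *priv = devlink_trap_ctx_priv(trap_ctx);

	devlink_trap_report(devlink, skb, trap_ctx, in_port,
			    NULL /* no flow action cookie */);

	/* hypothetical: account to the right port, then free the skb */
	foo_trap_stats_inc(priv, skb->len);
	consume_skb(skb);
}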
- */ -void *devlink_trap_ctx_priv(void *trap_ctx) -{ - struct devlink_trap_item *trap_item = trap_ctx; - - return trap_item->priv; -} -EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv); - -static int -devlink_trap_group_item_policer_link(struct devlink *devlink, - struct devlink_trap_group_item *group_item) -{ - u32 policer_id = group_item->group->init_policer_id; - struct devlink_trap_policer_item *policer_item; - - if (policer_id == 0) - return 0; - - policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); - if (WARN_ON_ONCE(!policer_item)) - return -EINVAL; - - group_item->policer_item = policer_item; - - return 0; -} - -static int -devlink_trap_group_register(struct devlink *devlink, - const struct devlink_trap_group *group) -{ - struct devlink_trap_group_item *group_item; - int err; - - if (devlink_trap_group_item_lookup(devlink, group->name)) - return -EEXIST; - - group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); - if (!group_item) - return -ENOMEM; - - group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); - if (!group_item->stats) { - err = -ENOMEM; - goto err_stats_alloc; - } - - group_item->group = group; - - err = devlink_trap_group_item_policer_link(devlink, group_item); - if (err) - goto err_policer_link; - - if (devlink->ops->trap_group_init) { - err = devlink->ops->trap_group_init(devlink, group); - if (err) - goto err_group_init; - } - - list_add_tail(&group_item->list, &devlink->trap_group_list); - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_NEW); - - return 0; - -err_group_init: -err_policer_link: - free_percpu(group_item->stats); -err_stats_alloc: - kfree(group_item); - return err; -} - -static void -devlink_trap_group_unregister(struct devlink *devlink, - const struct devlink_trap_group *group) -{ - struct devlink_trap_group_item *group_item; - - group_item = devlink_trap_group_item_lookup(devlink, group->name); - if (WARN_ON_ONCE(!group_item)) - return; - - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_DEL); - list_del(&group_item->list); - free_percpu(group_item->stats); - kfree(group_item); -} - -/** - * devl_trap_groups_register - Register packet trap groups with devlink. - * @devlink: devlink. - * @groups: Packet trap groups. - * @groups_count: Count of provided packet trap groups. - * - * Return: Non-zero value on failure. - */ -int devl_trap_groups_register(struct devlink *devlink, - const struct devlink_trap_group *groups, - size_t groups_count) -{ - int i, err; - - devl_assert_locked(devlink); - for (i = 0; i < groups_count; i++) { - const struct devlink_trap_group *group = &groups[i]; - - err = devlink_trap_group_verify(group); - if (err) - goto err_trap_group_verify; - - err = devlink_trap_group_register(devlink, group); - if (err) - goto err_trap_group_register; - } - - return 0; - -err_trap_group_register: -err_trap_group_verify: - for (i--; i >= 0; i--) - devlink_trap_group_unregister(devlink, &groups[i]); - return err; -} -EXPORT_SYMBOL_GPL(devl_trap_groups_register); - -/** - * devlink_trap_groups_register - Register packet trap groups with devlink. - * @devlink: devlink. - * @groups: Packet trap groups. - * @groups_count: Count of provided packet trap groups. - * - * Context: Takes and release devlink->lock <mutex>. - * - * Return: Non-zero value on failure. 
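devlink_trap_group_item_policer_link() above resolves a group's init_policer_id (0 meaning unpolicered) against policers that must already be registered, which pins the required driver-side ordering: policers, then groups, then traps, with teardown in reverse. A hedged sketch of a policed group, using the DEVLINK_TRAP_POLICER() helper from include/net/devlink.h (outside this diff):

static const struct devlink_trap_policer foo_trap_policers[] = {
	/* id 1, init rate 1000 pps, init burst 128 pkts, then the
	 * max/min rate and max/min burst bounds checked by
	 * devl_trap_policers_register()
	 */
	DEVLINK_TRAP_POLICER(1, 1000, 128, 8000, 1, 1024, 1),
};

static const struct devlink_trap_group foo_policed_groups[] = {
	/* init_policer_id matches foo_trap_policers[0].id */
	DEVLINK_TRAP_GROUP_GENERIC(L3_DROPS, 1),
};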
- */ -int devlink_trap_groups_register(struct devlink *devlink, - const struct devlink_trap_group *groups, - size_t groups_count) -{ - int err; - - devl_lock(devlink); - err = devl_trap_groups_register(devlink, groups, groups_count); - devl_unlock(devlink); - return err; -} -EXPORT_SYMBOL_GPL(devlink_trap_groups_register); - -/** - * devl_trap_groups_unregister - Unregister packet trap groups from devlink. - * @devlink: devlink. - * @groups: Packet trap groups. - * @groups_count: Count of provided packet trap groups. - */ -void devl_trap_groups_unregister(struct devlink *devlink, - const struct devlink_trap_group *groups, - size_t groups_count) -{ - int i; - - devl_assert_locked(devlink); - for (i = groups_count - 1; i >= 0; i--) - devlink_trap_group_unregister(devlink, &groups[i]); -} -EXPORT_SYMBOL_GPL(devl_trap_groups_unregister); - -/** - * devlink_trap_groups_unregister - Unregister packet trap groups from devlink. - * @devlink: devlink. - * @groups: Packet trap groups. - * @groups_count: Count of provided packet trap groups. - * - * Context: Takes and release devlink->lock <mutex>. - */ -void devlink_trap_groups_unregister(struct devlink *devlink, - const struct devlink_trap_group *groups, - size_t groups_count) -{ - devl_lock(devlink); - devl_trap_groups_unregister(devlink, groups, groups_count); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister); - -static void -devlink_trap_policer_notify(struct devlink *devlink, - const struct devlink_trap_policer_item *policer_item, - enum devlink_command cmd) -{ - struct sk_buff *msg; - int err; - - WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW && - cmd != DEVLINK_CMD_TRAP_POLICER_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0, - 0, 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static int -devlink_trap_policer_register(struct devlink *devlink, - const struct devlink_trap_policer *policer) -{ - struct devlink_trap_policer_item *policer_item; - int err; - - if (devlink_trap_policer_item_lookup(devlink, policer->id)) - return -EEXIST; - - policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL); - if (!policer_item) - return -ENOMEM; - - policer_item->policer = policer; - policer_item->rate = policer->init_rate; - policer_item->burst = policer->init_burst; - - if (devlink->ops->trap_policer_init) { - err = devlink->ops->trap_policer_init(devlink, policer); - if (err) - goto err_policer_init; - } - - list_add_tail(&policer_item->list, &devlink->trap_policer_list); - devlink_trap_policer_notify(devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_NEW); - - return 0; - -err_policer_init: - kfree(policer_item); - return err; -} - -static void -devlink_trap_policer_unregister(struct devlink *devlink, - const struct devlink_trap_policer *policer) -{ - struct devlink_trap_policer_item *policer_item; - - policer_item = devlink_trap_policer_item_lookup(devlink, policer->id); - if (WARN_ON_ONCE(!policer_item)) - return; - - devlink_trap_policer_notify(devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_DEL); - list_del(&policer_item->list); - if (devlink->ops->trap_policer_fini) - devlink->ops->trap_policer_fini(devlink, policer); - kfree(policer_item); -} - -/** - * devl_trap_policers_register - Register packet trap policers 
with devlink. - * @devlink: devlink. - * @policers: Packet trap policers. - * @policers_count: Count of provided packet trap policers. - * - * Return: Non-zero value on failure. - */ -int -devl_trap_policers_register(struct devlink *devlink, - const struct devlink_trap_policer *policers, - size_t policers_count) -{ - int i, err; - - devl_assert_locked(devlink); - for (i = 0; i < policers_count; i++) { - const struct devlink_trap_policer *policer = &policers[i]; - - if (WARN_ON(policer->id == 0 || - policer->max_rate < policer->min_rate || - policer->max_burst < policer->min_burst)) { - err = -EINVAL; - goto err_trap_policer_verify; - } - - err = devlink_trap_policer_register(devlink, policer); - if (err) - goto err_trap_policer_register; - } - return 0; - -err_trap_policer_register: -err_trap_policer_verify: - for (i--; i >= 0; i--) - devlink_trap_policer_unregister(devlink, &policers[i]); - return err; -} -EXPORT_SYMBOL_GPL(devl_trap_policers_register); - -/** - * devl_trap_policers_unregister - Unregister packet trap policers from devlink. - * @devlink: devlink. - * @policers: Packet trap policers. - * @policers_count: Count of provided packet trap policers. - */ -void -devl_trap_policers_unregister(struct devlink *devlink, - const struct devlink_trap_policer *policers, - size_t policers_count) -{ - int i; - - devl_assert_locked(devlink); - for (i = policers_count - 1; i >= 0; i--) - devlink_trap_policer_unregister(devlink, &policers[i]); -} -EXPORT_SYMBOL_GPL(devl_trap_policers_unregister); - -static void __devlink_compat_running_version(struct devlink *devlink, - char *buf, size_t len) -{ - struct devlink_info_req req = {}; - const struct nlattr *nlattr; - struct sk_buff *msg; - int rem, err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - req.msg = msg; - err = devlink->ops->info_get(devlink, &req, NULL); - if (err) - goto free_msg; - - nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) { - const struct nlattr *kv; - int rem_kv; - - if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING) - continue; - - nla_for_each_nested(kv, nlattr, rem_kv) { - if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE) - continue; - - strlcat(buf, nla_data(kv), len); - strlcat(buf, " ", len); - } - } -free_msg: - nlmsg_free(msg); -} - -void devlink_compat_running_version(struct devlink *devlink, - char *buf, size_t len) -{ - if (!devlink->ops->info_get) - return; - - devl_lock(devlink); - __devlink_compat_running_version(devlink, buf, len); - devl_unlock(devlink); -} - -int devlink_compat_flash_update(struct devlink *devlink, const char *file_name) -{ - struct devlink_flash_update_params params = {}; - int ret; - - if (!devlink->ops->flash_update) - return -EOPNOTSUPP; - - ret = request_firmware(¶ms.fw, file_name, devlink->dev); - if (ret) - return ret; - - devl_lock(devlink); - devlink_flash_update_begin_notify(devlink); - ret = devlink->ops->flash_update(devlink, ¶ms, NULL); - devlink_flash_update_end_notify(devlink); - devl_unlock(devlink); - - release_firmware(params.fw); - - return ret; -} - -int devlink_compat_phys_port_name_get(struct net_device *dev, - char *name, size_t len) -{ - struct devlink_port *devlink_port; - - /* RTNL mutex is held here which ensures that devlink_port - * instance cannot disappear in the middle. No need to take - * any devlink lock as only permanent values are accessed. 
- */ - ASSERT_RTNL(); - - devlink_port = dev->devlink_port; - if (!devlink_port) - return -EOPNOTSUPP; - - return __devlink_port_phys_port_name_get(devlink_port, name, len); -} - -int devlink_compat_switch_id_get(struct net_device *dev, - struct netdev_phys_item_id *ppid) -{ - struct devlink_port *devlink_port; - - /* Caller must hold RTNL mutex or reference to dev, which ensures that - * devlink_port instance cannot disappear in the middle. No need to take - * any devlink lock as only permanent values are accessed. - */ - devlink_port = dev->devlink_port; - if (!devlink_port || !devlink_port->switch_port) - return -EOPNOTSUPP; - - memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid)); - - return 0; -} - -static void __net_exit devlink_pernet_pre_exit(struct net *net) -{ - struct devlink *devlink; - u32 actions_performed; - unsigned long index; - int err; - - /* In case network namespace is getting destroyed, reload - * all devlink instances from this namespace into init_net. - */ - devlinks_xa_for_each_registered_get(net, index, devlink) { - WARN_ON(!(devlink->features & DEVLINK_F_RELOAD)); - mutex_lock(&devlink->lock); - err = devlink_reload(devlink, &init_net, - DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - DEVLINK_RELOAD_LIMIT_UNSPEC, - &actions_performed, NULL); - mutex_unlock(&devlink->lock); - if (err && err != -EOPNOTSUPP) - pr_warn("Failed to reload devlink instance into init_net\n"); - devlink_put(devlink); - } -} - -static struct pernet_operations devlink_pernet_ops __net_initdata = { - .pre_exit = devlink_pernet_pre_exit, -}; - -static int __init devlink_init(void) -{ - int err; - - err = genl_register_family(&devlink_nl_family); - if (err) - goto out; - err = register_pernet_subsys(&devlink_pernet_ops); - -out: - WARN_ON(err); - return err; -} - -subsys_initcall(devlink_init); diff --git a/net/core/devmem.c b/net/core/devmem.c new file mode 100644 index 000000000000..ec4217d6c0b4 --- /dev/null +++ b/net/core/devmem.c @@ -0,0 +1,522 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Devmem TCP + * + * Authors: Mina Almasry <almasrymina@google.com> + * Willem de Bruijn <willemdebruijn.kernel@gmail.com> + * Kaiyuan Zhang <kaiyuanz@google.com + */ + +#include <linux/dma-buf.h> +#include <linux/genalloc.h> +#include <linux/mm.h> +#include <linux/netdevice.h> +#include <linux/types.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <trace/events/page_pool.h> + +#include "devmem.h" +#include "mp_dmabuf_devmem.h" +#include "page_pool_priv.h" + +/* Device memory support */ + +static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); + +static const struct memory_provider_ops dmabuf_devmem_ops; + +bool net_is_devmem_iov(struct net_iov *niov) +{ + return niov->type == NET_IOV_DMABUF; +} + +static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, + struct gen_pool_chunk *chunk, + void *not_used) +{ + struct dmabuf_genpool_chunk_owner *owner = chunk->owner; + + kvfree(owner->area.niovs); + kfree(owner); +} + +static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) +{ + struct dmabuf_genpool_chunk_owner *owner; + + owner = net_devmem_iov_to_chunk_owner(niov); + return owner->base_dma_addr + + ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); +} + +void __net_devmem_dmabuf_binding_free(struct work_struct *wq) +{ + struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), 
unbind_w); + + size_t size, avail; + + gen_pool_for_each_chunk(binding->chunk_pool, + net_devmem_dmabuf_free_chunk_owner, NULL); + + size = gen_pool_size(binding->chunk_pool); + avail = gen_pool_avail(binding->chunk_pool); + + if (!WARN(size != avail, "can't destroy genpool. size=%zu, avail=%zu", + size, avail)) + gen_pool_destroy(binding->chunk_pool); + + dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, + binding->direction); + dma_buf_detach(binding->dmabuf, binding->attachment); + dma_buf_put(binding->dmabuf); + xa_destroy(&binding->bound_rxqs); + kvfree(binding->tx_vec); + kfree(binding); +} + +struct net_iov * +net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + struct dmabuf_genpool_chunk_owner *owner; + unsigned long dma_addr; + struct net_iov *niov; + ssize_t offset; + ssize_t index; + + dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE, + (void **)&owner); + if (!dma_addr) + return NULL; + + offset = dma_addr - owner->base_dma_addr; + index = offset / PAGE_SIZE; + niov = &owner->area.niovs[index]; + + niov->desc.pp_magic = 0; + niov->desc.pp = NULL; + atomic_long_set(&niov->desc.pp_ref_count, 0); + + return niov; +} + +void net_devmem_free_dmabuf(struct net_iov *niov) +{ + struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov); + unsigned long dma_addr = net_devmem_get_dma_addr(niov); + + if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr, + PAGE_SIZE))) + return; + + gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE); +} + +void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + struct netdev_rx_queue *rxq; + unsigned long xa_idx; + unsigned int rxq_idx; + + xa_erase(&net_devmem_dmabuf_bindings, binding->id); + + /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the + * erase. 
+ */ + synchronize_net(); + + if (binding->list.next) + list_del(&binding->list); + + xa_for_each(&binding->bound_rxqs, xa_idx, rxq) { + const struct pp_memory_provider_params mp_params = { + .mp_priv = binding, + .mp_ops = &dmabuf_devmem_ops, + }; + + rxq_idx = get_netdev_rx_queue_index(rxq); + + __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); + } + + net_devmem_dmabuf_binding_put(binding); +} + +int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack) +{ + struct pp_memory_provider_params mp_params = { + .mp_priv = binding, + .mp_ops = &dmabuf_devmem_ops, + }; + struct netdev_rx_queue *rxq; + u32 xa_idx; + int err; + + err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack); + if (err) + return err; + + rxq = __netif_get_rx_queue(dev, rxq_idx); + err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b, + GFP_KERNEL); + if (err) + goto err_close_rxq; + + return 0; + +err_close_rxq: + __net_mp_close_rxq(dev, rxq_idx, &mp_params); + return err; +} + +struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netdev_nl_sock *priv, + struct netlink_ext_ack *extack) +{ + struct net_devmem_dmabuf_binding *binding; + static u32 id_alloc_next; + struct scatterlist *sg; + struct dma_buf *dmabuf; + unsigned int sg_idx, i; + unsigned long virtual; + int err; + + if (!dma_dev) { + NL_SET_ERR_MSG(extack, "Device doesn't support DMA"); + return ERR_PTR(-EOPNOTSUPP); + } + + dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR(dmabuf)) + return ERR_CAST(dmabuf); + + binding = kzalloc_node(sizeof(*binding), GFP_KERNEL, + dev_to_node(&dev->dev)); + if (!binding) { + err = -ENOMEM; + goto err_put_dmabuf; + } + + binding->dev = dev; + xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); + + refcount_set(&binding->ref, 1); + + mutex_init(&binding->lock); + + binding->dmabuf = dmabuf; + binding->direction = direction; + + binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev); + if (IS_ERR(binding->attachment)) { + err = PTR_ERR(binding->attachment); + NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); + goto err_free_binding; + } + + binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, + direction); + if (IS_ERR(binding->sgt)) { + err = PTR_ERR(binding->sgt); + NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); + goto err_detach; + } + + if (direction == DMA_TO_DEVICE) { + binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE, + sizeof(struct net_iov *), + GFP_KERNEL); + if (!binding->tx_vec) { + err = -ENOMEM; + goto err_unmap; + } + } + + /* For simplicity we expect to make PAGE_SIZE allocations, but the + * binding can be much more flexible than that. We may be able to + * allocate MTU sized chunks here. Leave that for future work... 
+ */ + binding->chunk_pool = gen_pool_create(PAGE_SHIFT, + dev_to_node(&dev->dev)); + if (!binding->chunk_pool) { + err = -ENOMEM; + goto err_tx_vec; + } + + virtual = 0; + for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) { + dma_addr_t dma_addr = sg_dma_address(sg); + struct dmabuf_genpool_chunk_owner *owner; + size_t len = sg_dma_len(sg); + struct net_iov *niov; + + owner = kzalloc_node(sizeof(*owner), GFP_KERNEL, + dev_to_node(&dev->dev)); + if (!owner) { + err = -ENOMEM; + goto err_free_chunks; + } + + owner->area.base_virtual = virtual; + owner->base_dma_addr = dma_addr; + owner->area.num_niovs = len / PAGE_SIZE; + owner->binding = binding; + + err = gen_pool_add_owner(binding->chunk_pool, dma_addr, + dma_addr, len, dev_to_node(&dev->dev), + owner); + if (err) { + kfree(owner); + err = -EINVAL; + goto err_free_chunks; + } + + owner->area.niovs = kvmalloc_array(owner->area.num_niovs, + sizeof(*owner->area.niovs), + GFP_KERNEL); + if (!owner->area.niovs) { + err = -ENOMEM; + goto err_free_chunks; + } + + for (i = 0; i < owner->area.num_niovs; i++) { + niov = &owner->area.niovs[i]; + niov->type = NET_IOV_DMABUF; + niov->owner = &owner->area; + page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), + net_devmem_get_dma_addr(niov)); + if (direction == DMA_TO_DEVICE) + binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov; + } + + virtual += len; + } + + err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, + binding, xa_limit_32b, &id_alloc_next, + GFP_KERNEL); + if (err < 0) + goto err_free_chunks; + + list_add(&binding->list, &priv->bindings); + + return binding; + +err_free_chunks: + gen_pool_for_each_chunk(binding->chunk_pool, + net_devmem_dmabuf_free_chunk_owner, NULL); + gen_pool_destroy(binding->chunk_pool); +err_tx_vec: + kvfree(binding->tx_vec); +err_unmap: + dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, + direction); +err_detach: + dma_buf_detach(dmabuf, binding->attachment); +err_free_binding: + kfree(binding); +err_put_dmabuf: + dma_buf_put(dmabuf); + return ERR_PTR(err); +} + +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + struct net_devmem_dmabuf_binding *binding; + + rcu_read_lock(); + binding = xa_load(&net_devmem_dmabuf_bindings, id); + if (binding) { + if (!net_devmem_dmabuf_binding_get(binding)) + binding = NULL; + } + rcu_read_unlock(); + + return binding; +} + +void net_devmem_get_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov)); +} + +void net_devmem_put_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov)); +} + +struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, + unsigned int dmabuf_id) +{ + struct net_devmem_dmabuf_binding *binding; + struct net_device *dst_dev; + struct dst_entry *dst; + int err = 0; + + binding = net_devmem_lookup_dmabuf(dmabuf_id); + if (!binding || !binding->tx_vec) { + err = -EINVAL; + goto out_err; + } + + rcu_read_lock(); + dst = __sk_dst_get(sk); + /* If dst is NULL (route expired), attempt to rebuild it. */ + if (unlikely(!dst)) { + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) { + err = -EHOSTUNREACH; + goto out_unlock; + } + dst = __sk_dst_get(sk); + if (unlikely(!dst)) { + err = -ENODEV; + goto out_unlock; + } + } + + /* The dma-addrs in this binding are only reachable to the corresponding + * net_device. 
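That reachability check closes out net_devmem_get_binding() just below; the lookup feeding it, net_devmem_lookup_dmabuf() above, is the classic RCU plus refcount_inc_not_zero() idiom. xa_load() under rcu_read_lock() can race with unbind, so a binding is returned only if its refcount could still be raised from nonzero, and net_devmem_unbind_dmabuf() calls synchronize_net() after xa_erase() so no reader can be mid-acquire once teardown proceeds. Condensed, using only the structures in this diff:

struct net_devmem_dmabuf_binding *b;

rcu_read_lock();
b = xa_load(&net_devmem_dmabuf_bindings, id);
if (b && !net_devmem_dmabuf_binding_get(b))	/* refcount already 0 */
	b = NULL;				/* unbind in flight */
rcu_read_unlock();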
+ */ + dst_dev = dst_dev_rcu(dst); + if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) { + err = -ENODEV; + goto out_unlock; + } + + rcu_read_unlock(); + return binding; + +out_unlock: + rcu_read_unlock(); +out_err: + if (binding) + net_devmem_dmabuf_binding_put(binding); + + return ERR_PTR(err); +} + +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, + size_t virt_addr, size_t *off, size_t *size) +{ + if (virt_addr >= binding->dmabuf->size) + return NULL; + + *off = virt_addr % PAGE_SIZE; + *size = PAGE_SIZE - *off; + + return binding->tx_vec[virt_addr / PAGE_SIZE]; +} + +/*** "Dmabuf devmem memory provider" ***/ + +int mp_dmabuf_devmem_init(struct page_pool *pool) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + + if (!binding) + return -EINVAL; + + /* dma-buf dma addresses do not need and should not be used with + * dma_sync_for_cpu/device. Force disable dma_sync. + */ + pool->dma_sync = false; + pool->dma_sync_for_cpu = false; + + if (pool->p.order != 0) + return -E2BIG; + + net_devmem_dmabuf_binding_get(binding); + return 0; +} + +netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + struct net_iov *niov; + netmem_ref netmem; + + niov = net_devmem_alloc_dmabuf(binding); + if (!niov) + return 0; + + netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); + return netmem; +} + +void mp_dmabuf_devmem_destroy(struct page_pool *pool) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + + net_devmem_dmabuf_binding_put(binding); +} + +bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) +{ + long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem)); + + if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + return false; + + if (WARN_ON_ONCE(refcount != 1)) + return false; + + page_pool_clear_pp_info(netmem); + + net_devmem_free_dmabuf(netmem_to_net_iov(netmem)); + + /* We don't want the page pool put_page()ing our net_iovs. */ + return false; +} + +static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + const struct net_devmem_dmabuf_binding *binding = mp_priv; + int type = rxq ? 
NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF; + + return nla_put_u32(rsp, type, binding->id); +} + +static void mp_dmabuf_devmem_uninstall(void *mp_priv, + struct netdev_rx_queue *rxq) +{ + struct net_devmem_dmabuf_binding *binding = mp_priv; + struct netdev_rx_queue *bound_rxq; + unsigned long xa_idx; + + xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) { + if (bound_rxq == rxq) { + xa_erase(&binding->bound_rxqs, xa_idx); + if (xa_empty(&binding->bound_rxqs)) { + mutex_lock(&binding->lock); + binding->dev = NULL; + mutex_unlock(&binding->lock); + } + break; + } + } +} + +static const struct memory_provider_ops dmabuf_devmem_ops = { + .init = mp_dmabuf_devmem_init, + .destroy = mp_dmabuf_devmem_destroy, + .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, + .release_netmem = mp_dmabuf_devmem_release_page, + .nl_fill = mp_dmabuf_devmem_nl_fill, + .uninstall = mp_dmabuf_devmem_uninstall, +}; diff --git a/net/core/devmem.h b/net/core/devmem.h new file mode 100644 index 000000000000..0b43a648cd2e --- /dev/null +++ b/net/core/devmem.h @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Device memory TCP support + * + * Authors: Mina Almasry <almasrymina@google.com> + * Willem de Bruijn <willemb@google.com> + * Kaiyuan Zhang <kaiyuanz@google.com> + * + */ +#ifndef _NET_DEVMEM_H +#define _NET_DEVMEM_H + +#include <net/netmem.h> +#include <net/netdev_netlink.h> + +struct netlink_ext_ack; + +struct net_devmem_dmabuf_binding { + struct dma_buf *dmabuf; + struct dma_buf_attachment *attachment; + struct sg_table *sgt; + struct net_device *dev; + struct gen_pool *chunk_pool; + /* Protect dev */ + struct mutex lock; + + /* The user holds a ref (via the netlink API) for as long as they want + * the binding to remain alive. Each page pool using this binding holds + * a ref to keep the binding alive. The page_pool does not release the + * ref until all the net_iovs allocated from this binding are released + * back to the page_pool. + * + * The binding undos itself and unmaps the underlying dmabuf once all + * those refs are dropped and the binding is no longer desired or in + * use. + * + * net_devmem_get_net_iov() on dmabuf net_iovs will increment this + * reference, making sure that the binding remains alive until all the + * net_iovs are no longer used. net_iovs allocated from this binding + * that are stuck in the TX path for any reason (such as awaiting + * retransmits) hold a reference to the binding until the skb holding + * them is freed. + */ + refcount_t ref; + + /* The list of bindings currently active. Used for netlink to notify us + * of the user dropping the bind. + */ + struct list_head list; + + /* rxq's this binding is active on. */ + struct xarray bound_rxqs; + + /* ID of this binding. Globally unique to all bindings currently + * active. + */ + u32 id; + + /* DMA direction, FROM_DEVICE for Rx binding, TO_DEVICE for Tx. */ + enum dma_data_direction direction; + + /* Array of net_iov pointers for this binding, sorted by virtual + * address. This array is convenient to map the virtual addresses to + * net_iovs in the TX path. + */ + struct net_iov **tx_vec; + + struct work_struct unbind_w; +}; + +#if defined(CONFIG_NET_DEVMEM) +/* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist + * entry from the dmabuf is inserted into the genpool as a chunk, and needs + * this owner struct to keep track of some metadata necessary to create + * allocations from this chunk. 
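Concretely, that metadata is two bases (base_virtual and base_dma_addr) plus the niov array, and every translation is linear in the page index: for the niov at index i within a chunk, dma = base_dma_addr + (i << PAGE_SHIFT) and virt = base_virtual + (i << PAGE_SHIFT), and the TX path inverts the latter via binding->tx_vec[virt / PAGE_SIZE], exactly as net_devmem_get_niov_at() does. A worked sketch assuming 4 KiB pages and made-up addresses:

/* chunk: base_virtual = 0x200000, base_dma_addr = 0x8000000, len = 64 KiB
 * => 16 niovs; the niov backing user offset 0x204a10 resolves as:
 */
size_t virt = 0x204a10;
struct net_iov *niov = binding->tx_vec[virt / PAGE_SIZE]; /* index 0x204 */
size_t off  = virt % PAGE_SIZE;		/* 0xa10 within the page */
size_t size = PAGE_SIZE - off;		/* 0x5f0 usable bytes    */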
+ */ +struct dmabuf_genpool_chunk_owner { + struct net_iov_area area; + struct net_devmem_dmabuf_binding *binding; + + /* dma_addr of the start of the chunk. */ + dma_addr_t base_dma_addr; +}; + +void __net_devmem_dmabuf_binding_free(struct work_struct *wq); +struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netdev_nl_sock *priv, + struct netlink_ext_ack *extack); +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id); +void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); +int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack); + +static inline struct dmabuf_genpool_chunk_owner * +net_devmem_iov_to_chunk_owner(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct dmabuf_genpool_chunk_owner, area); +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_iov_binding(const struct net_iov *niov) +{ + return net_devmem_iov_to_chunk_owner(niov)->binding; +} + +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) +{ + return net_devmem_iov_binding(niov)->id; +} + +static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return owner->base_virtual + + ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); +} + +static inline bool +net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) +{ + return refcount_inc_not_zero(&binding->ref); +} + +static inline void +net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ + if (!refcount_dec_and_test(&binding->ref)) + return; + + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); +} + +void net_devmem_get_net_iov(struct net_iov *niov); +void net_devmem_put_net_iov(struct net_iov *niov); + +struct net_iov * +net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); +void net_devmem_free_dmabuf(struct net_iov *ppiov); + +bool net_is_devmem_iov(struct net_iov *niov); +struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size); + +#else +struct net_devmem_dmabuf_binding; + +static inline void +net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline void net_devmem_get_net_iov(struct net_iov *niov) +{ +} + +static inline void net_devmem_put_net_iov(struct net_iov *niov) +{ +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, + struct netdev_nl_sock *priv, + struct netlink_ext_ack *extack) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + return NULL; +} + +static inline void +net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline int +net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack) + +{ + return -EOPNOTSUPP; +} + +static inline struct net_iov * +net_devmem_alloc_dmabuf(struct 
net_devmem_dmabuf_binding *binding) +{ + return NULL; +} + +static inline void net_devmem_free_dmabuf(struct net_iov *ppiov) +{ +} + +static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) +{ + return 0; +} + +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) +{ + return 0; +} + +static inline bool net_is_devmem_iov(struct net_iov *niov) +{ + return false; +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size) +{ + return NULL; +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_iov_binding(const struct net_iov *niov) +{ + return NULL; +} +#endif + +#endif /* _NET_DEVMEM_H */ diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 5a782d1d8fd3..60d31c2feed3 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -21,6 +21,7 @@ #include <linux/workqueue.h> #include <linux/netlink.h> #include <linux/net_dropmon.h> +#include <linux/bitfield.h> #include <linux/percpu.h> #include <linux/timer.h> #include <linux/bitops.h> @@ -29,13 +30,14 @@ #include <net/genetlink.h> #include <net/netevent.h> #include <net/flow_offload.h> +#include <net/dropreason.h> #include <net/devlink.h> #include <trace/events/skb.h> #include <trace/events/napi.h> #include <trace/events/devlink.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #define TRACE_ON 1 #define TRACE_OFF 0 @@ -72,7 +74,7 @@ struct net_dm_hw_entries { }; struct per_cpu_dm_data { - spinlock_t lock; /* Protects 'skb', 'hw_entries' and + raw_spinlock_t lock; /* Protects 'skb', 'hw_entries' and * 'send_timer' */ union { @@ -107,7 +109,8 @@ static u32 net_dm_queue_len = 1000; struct net_dm_alert_ops { void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, void *location, - enum skb_drop_reason reason); + enum skb_drop_reason reason, + struct sock *rx_sk); void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, int work, int budget); void (*work_item_func)(struct work_struct *work); @@ -166,9 +169,9 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) err: mod_timer(&data->send_timer, jiffies + HZ / 10); out: - spin_lock_irqsave(&data->lock, flags); + raw_spin_lock_irqsave(&data->lock, flags); swap(data->skb, skb); - spin_unlock_irqrestore(&data->lock, flags); + raw_spin_unlock_irqrestore(&data->lock, flags); if (skb) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; @@ -181,7 +184,7 @@ out: } static const struct genl_multicast_group dropmon_mcgrps[] = { - { .name = "events", }, + { .name = "events", .flags = GENL_MCAST_CAP_SYS_ADMIN, }, }; static void send_dm_alert(struct work_struct *work) @@ -205,7 +208,7 @@ static void send_dm_alert(struct work_struct *work) */ static void sched_send_work(struct timer_list *t) { - struct per_cpu_dm_data *data = from_timer(data, t, send_timer); + struct per_cpu_dm_data *data = timer_container_of(data, t, send_timer); schedule_work(&data->dm_alert_work); } @@ -223,7 +226,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) local_irq_save(flags); data = this_cpu_ptr(&dm_cpu_data); - spin_lock(&data->lock); + raw_spin_lock(&data->lock); dskb = data->skb; if (!dskb) @@ -257,12 +260,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location) } out: - spin_unlock_irqrestore(&data->lock, flags); + 
raw_spin_unlock_irqrestore(&data->lock, flags); } static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, - enum skb_drop_reason reason) + enum skb_drop_reason reason, + struct sock *rx_sk) { trace_drop_common(skb, location); } @@ -312,9 +316,9 @@ net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) mod_timer(&hw_data->send_timer, jiffies + HZ / 10); } - spin_lock_irqsave(&hw_data->lock, flags); + raw_spin_lock_irqsave(&hw_data->lock, flags); swap(hw_data->hw_entries, hw_entries); - spin_unlock_irqrestore(&hw_data->lock, flags); + raw_spin_unlock_irqrestore(&hw_data->lock, flags); return hw_entries; } @@ -446,7 +450,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, return; hw_data = this_cpu_ptr(&dm_hw_cpu_data); - spin_lock_irqsave(&hw_data->lock, flags); + raw_spin_lock_irqsave(&hw_data->lock, flags); hw_entries = hw_data->hw_entries; if (!hw_entries) @@ -475,7 +479,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, } out: - spin_unlock_irqrestore(&hw_data->lock, flags); + raw_spin_unlock_irqrestore(&hw_data->lock, flags); } static const struct net_dm_alert_ops net_dm_alert_summary_ops = { @@ -489,7 +493,8 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = { static void net_dm_packet_trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, - enum skb_drop_reason reason) + enum skb_drop_reason reason, + struct sock *rx_sk) { ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *data; @@ -504,8 +509,6 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, if (!nskb) return; - if (unlikely(reason >= SKB_DROP_REASON_MAX || reason <= 0)) - reason = SKB_DROP_REASON_NOT_SPECIFIED; cb = NET_DM_SKB_CB(nskb); cb->reason = reason; cb->pc = location; @@ -552,9 +555,9 @@ static size_t net_dm_in_port_size(void) } #define NET_DM_MAX_SYMBOL_LEN 40 +#define NET_DM_MAX_REASON_LEN 50 -static size_t net_dm_packet_report_size(size_t payload_len, - enum skb_drop_reason reason) +static size_t net_dm_packet_report_size(size_t payload_len) { size_t size; @@ -576,7 +579,7 @@ static size_t net_dm_packet_report_size(size_t payload_len, /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_REASON */ - nla_total_size(strlen(drop_reasons[reason]) + 1) + + nla_total_size(NET_DM_MAX_REASON_LEN + 1) + /* NET_DM_ATTR_PAYLOAD */ nla_total_size(payload_len); } @@ -610,6 +613,8 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb); + const struct drop_reason_list *list = NULL; + unsigned int subsys, subsys_reason; char buf[NET_DM_MAX_SYMBOL_LEN]; struct nlattr *attr; void *hdr; @@ -627,9 +632,24 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, NET_DM_ATTR_PAD)) goto nla_put_failure; + rcu_read_lock(); + subsys = u32_get_bits(cb->reason, SKB_DROP_REASON_SUBSYS_MASK); + if (subsys < SKB_DROP_REASON_SUBSYS_NUM) + list = rcu_dereference(drop_reasons_by_subsys[subsys]); + subsys_reason = cb->reason & ~SKB_DROP_REASON_SUBSYS_MASK; + if (!list || + subsys_reason >= list->n_reasons || + !list->reasons[subsys_reason] || + strlen(list->reasons[subsys_reason]) > NET_DM_MAX_REASON_LEN) { + list = rcu_dereference(drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_CORE]); + subsys_reason = SKB_DROP_REASON_NOT_SPECIFIED; + } if (nla_put_string(msg, NET_DM_ATTR_REASON, - drop_reasons[cb->reason])) + list->reasons[subsys_reason])) { + rcu_read_unlock(); goto 
nla_put_failure; + } + rcu_read_unlock(); snprintf(buf, sizeof(buf), "%pS", cb->pc); if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) @@ -687,9 +707,7 @@ static void net_dm_packet_report(struct sk_buff *skb) if (net_dm_trunc_len) payload_len = min_t(size_t, net_dm_trunc_len, payload_len); - msg = nlmsg_new(net_dm_packet_report_size(payload_len, - NET_DM_SKB_CB(skb)->reason), - GFP_KERNEL); + msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL); if (!msg) goto out; @@ -1070,7 +1088,7 @@ err_module_put: struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; - del_timer_sync(&hw_data->send_timer); + timer_delete_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; @@ -1104,7 +1122,7 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; - del_timer_sync(&hw_data->send_timer); + timer_delete_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; @@ -1165,7 +1183,7 @@ err_module_put: struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; - del_timer_sync(&data->send_timer); + timer_delete_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); @@ -1193,7 +1211,7 @@ static void net_dm_trace_off_set(void) struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; - del_timer_sync(&data->send_timer); + timer_delete_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); @@ -1604,11 +1622,13 @@ static const struct genl_small_ops dropmon_ops[] = { .cmd = NET_DM_CMD_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, + .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_STOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, + .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_CONFIG_GET, @@ -1656,7 +1676,7 @@ static struct notifier_block dropmon_net_notifier = { static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data) { - spin_lock_init(&data->lock); + raw_spin_lock_init(&data->lock); skb_queue_head_init(&data->drop_queue); u64_stats_init(&data->stats.syncp); } @@ -1714,30 +1734,30 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family(&net_drop_monitor_family); - if (rc) { - pr_err("Could not create drop monitor netlink family\n"); - return rc; + for_each_possible_cpu(cpu) { + net_dm_cpu_data_init(cpu); + net_dm_hw_cpu_data_init(cpu); } - WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); + return rc; + } + + rc = genl_register_family(&net_drop_monitor_family); + if (rc) { + pr_err("Could not create drop monitor netlink family\n"); goto out_unreg; } + WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = 0; - for_each_possible_cpu(cpu) { - net_dm_cpu_data_init(cpu); - net_dm_hw_cpu_data_init(cpu); - } - goto out; out_unreg: - genl_unregister_family(&net_drop_monitor_family); + WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); out: return rc; 
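/*
 * Ordering note: the per-CPU state and the netdevice notifier are set up
 * before genl_register_family() publishes the family, since userspace can
 * issue NET_DM_CMD_START the moment the family becomes visible and the
 * command handlers touch both. The error path therefore only needs to
 * unwind the notifier registration.
 */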
} @@ -1746,19 +1766,18 @@ static void exit_net_drop_monitor(void) { int cpu; - BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); - /* * Because of the module_get/put we do in the trace state change path * we are guaranteed not to have any current users when we get here */ + BUG_ON(genl_unregister_family(&net_drop_monitor_family)); + + BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); for_each_possible_cpu(cpu) { net_dm_hw_cpu_data_fini(cpu); net_dm_cpu_data_fini(cpu); } - - BUG_ON(genl_unregister_family(&net_drop_monitor_family)); } module_init(init_net_drop_monitor); diff --git a/net/core/dst.c b/net/core/dst.c index 6d2dd03dafa8..e9d35f49c9e7 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -45,7 +45,7 @@ const struct dst_metrics dst_default_metrics = { EXPORT_SYMBOL(dst_default_metrics); void dst_init(struct dst_entry *dst, struct dst_ops *ops, - struct net_device *dev, int initial_ref, int initial_obsolete, + struct net_device *dev, int initial_obsolete, unsigned short flags) { dst->dev = dev; @@ -66,7 +66,8 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->tclassid = 0; #endif dst->lwtstate = NULL; - atomic_set(&dst->__refcnt, initial_ref); + rcuref_init(&dst->__rcuref, 1); + INIT_LIST_HEAD(&dst->rt_uncached); dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; @@ -76,30 +77,26 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, EXPORT_SYMBOL(dst_init); void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, unsigned short flags) + int initial_obsolete, unsigned short flags) { struct dst_entry *dst; if (ops->gc && !(flags & DST_NOCOUNT) && - dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) { - pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); - return NULL; - } - } + dst_entries_get_fast(ops) > ops->gc_thresh) + ops->gc(ops); dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) return NULL; - dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags); + dst_init(dst, ops, dev, initial_obsolete, flags); return dst; } EXPORT_SYMBOL(dst_alloc); -struct dst_entry *dst_destroy(struct dst_entry * dst) +static void dst_destroy(struct dst_entry *dst) { struct dst_entry *child = NULL; @@ -112,9 +109,6 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) child = xdst->child; } #endif - if (!(dst->flags & DST_NOCOUNT)) - dst_entries_add(dst->ops, -1); - if (dst->ops->destroy) dst->ops->destroy(dst); netdev_put(dst->dev, &dst->dev_tracker); @@ -129,15 +123,13 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) dst = child; if (dst) dst_release_immediate(dst); - return NULL; } -EXPORT_SYMBOL(dst_destroy); static void dst_destroy_rcu(struct rcu_head *head) { struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst = dst_destroy(dst); + dst_destroy(dst); } /* Operations to mark dst as DEAD and clean up the net device referenced @@ -153,43 +145,45 @@ void dst_dev_put(struct dst_entry *dst) { struct net_device *dev = dst->dev; - dst->obsolete = DST_OBSOLETE_DEAD; + WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD); if (dst->ops->ifdown) - dst->ops->ifdown(dst, dev, true); - dst->input = dst_discard; - dst->output = dst_discard_out; - dst->dev = blackhole_netdev; + dst->ops->ifdown(dst, dev); + WRITE_ONCE(dst->input, dst_discard); + WRITE_ONCE(dst->output, dst_discard_out); + rcu_assign_pointer(dst->dev_rcu, blackhole_netdev); netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, 
GFP_ATOMIC); } EXPORT_SYMBOL(dst_dev_put); +static void dst_count_dec(struct dst_entry *dst) +{ + if (!(dst->flags & DST_NOCOUNT)) + dst_entries_add(dst->ops, -1); +} + void dst_release(struct dst_entry *dst) { - if (dst) { - int newrefcnt; - - newrefcnt = atomic_dec_return(&dst->__refcnt); - if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) - net_warn_ratelimited("%s: dst:%p refcnt:%d\n", - __func__, dst, newrefcnt); - if (!newrefcnt) - call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); + if (dst && rcuref_put(&dst->__rcuref)) { +#ifdef CONFIG_DST_CACHE + if (dst->flags & DST_METADATA) { + struct metadata_dst *md_dst = (struct metadata_dst *)dst; + + if (md_dst->type == METADATA_IP_TUNNEL) + dst_cache_reset_now(&md_dst->u.tun_info.dst_cache); + } +#endif + dst_count_dec(dst); + call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } } EXPORT_SYMBOL(dst_release); void dst_release_immediate(struct dst_entry *dst) { - if (dst) { - int newrefcnt; - - newrefcnt = atomic_dec_return(&dst->__refcnt); - if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) - net_warn_ratelimited("%s: dst:%p refcnt:%d\n", - __func__, dst, newrefcnt); - if (!newrefcnt) - dst_destroy(dst); + if (dst && rcuref_put(&dst->__rcuref)) { + dst_count_dec(dst); + dst_destroy(dst); } } EXPORT_SYMBOL(dst_release_immediate); @@ -269,7 +263,7 @@ unsigned int dst_blackhole_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - return mtu ? : dst->dev->mtu; + return mtu ? : dst_dev(dst)->mtu; } EXPORT_SYMBOL_GPL(dst_blackhole_mtu); @@ -289,7 +283,7 @@ static void __metadata_dst_init(struct metadata_dst *md_dst, struct dst_entry *dst; dst = &md_dst->dst; - dst_init(dst, &dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, + dst_init(dst, &dst_blackhole_ops, NULL, DST_OBSOLETE_NONE, DST_METADATA | DST_NOCOUNT); memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); md_dst->type = type; @@ -300,7 +294,8 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, { struct metadata_dst *md_dst; - md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + md_dst = kmalloc(struct_size(md_dst, u.tun_info.options, optslen), + flags); if (!md_dst) return NULL; @@ -328,7 +323,8 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) int cpu; struct metadata_dst __percpu *md_dst; - md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, + md_dst = __alloc_percpu_gfp(struct_size(md_dst, u.tun_info.options, + optslen), __alignof__(struct metadata_dst), flags); if (!md_dst) return NULL; diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 0ccfd5fa5cb9..9ab4902324e1 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -17,6 +17,7 @@ struct dst_cache_pcpu { unsigned long refresh_ts; struct dst_entry *dst; + local_lock_t bh_lock; u32 cookie; union { struct in_addr in_saddr; @@ -27,6 +28,7 @@ struct dst_cache_pcpu { static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, struct dst_entry *dst, u32 cookie) { + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst_release(dst_cache->dst); if (dst) dst_hold(dst); @@ -40,6 +42,7 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, { struct dst_entry *dst; + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst = idst->dst; if (!dst) goto fail; @@ -47,8 +50,9 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, /* the cache already hold a dst reference; it can't go away */ dst_hold(dst); - if (unlikely(!time_after(idst->refresh_ts, 
dst_cache->reset_ts) || - (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { + if (unlikely(!time_after(idst->refresh_ts, + READ_ONCE(dst_cache->reset_ts)) || + (READ_ONCE(dst->obsolete) && !dst->ops->check(dst, idst->cookie)))) { dst_cache_per_cpu_dst_set(idst, NULL, 0); dst_release(dst); goto fail; @@ -62,10 +66,15 @@ fail: struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) { + struct dst_entry *dst; + if (!dst_cache->cache) return NULL; - return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_lock_nested_bh(&dst_cache->cache->bh_lock); + dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst; } EXPORT_SYMBOL_GPL(dst_cache_get); @@ -77,13 +86,17 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) if (!dst_cache->cache) return NULL; + local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; + } *saddr = idst->in_saddr.s_addr; - return container_of(dst, struct rtable, dst); + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); @@ -95,9 +108,11 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, if (!dst_cache->cache) return; + local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, 0); idst->in_saddr.s_addr = saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip4); @@ -110,10 +125,13 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, if (!dst_cache->cache) return; + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); - dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, - rt6_get_cookie((struct rt6_info *)dst)); + dst_cache_per_cpu_dst_set(idst, dst, + rt6_get_cookie(dst_rt6_info(dst))); idst->in6_saddr = *saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); @@ -126,12 +144,17 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, if (!dst_cache->cache) return NULL; + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; + } *saddr = idst->in6_saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst; } EXPORT_SYMBOL_GPL(dst_cache_get_ip6); @@ -139,10 +162,14 @@ EXPORT_SYMBOL_GPL(dst_cache_get_ip6); int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) { + unsigned int i; + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, gfp | __GFP_ZERO); if (!dst_cache->cache) return -ENOMEM; + for_each_possible_cpu(i) + local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock); dst_cache_reset(dst_cache); return 0; @@ -170,7 +197,7 @@ void dst_cache_reset_now(struct dst_cache *dst_cache) if (!dst_cache->cache) return; - dst_cache->reset_ts = jiffies; + dst_cache_reset(dst_cache); for_each_possible_cpu(i) { struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i); struct dst_entry *dst = idst->dst; diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index fc96259807b6..5cdca49b1d7c 100644 --- a/net/core/fib_notifier.c +++ 
b/net/core/fib_notifier.c @@ -43,7 +43,6 @@ static unsigned int fib_seq_sum(struct net *net) struct fib_notifier_ops *ops; unsigned int fib_seq = 0; - rtnl_lock(); rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) @@ -52,7 +51,6 @@ static unsigned int fib_seq_sum(struct net *net) module_put(ops->owner); } rcu_read_unlock(); - rtnl_unlock(); return fib_seq; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 75282222e0b4..8ca634964e36 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -11,6 +11,7 @@ #include <linux/list.h> #include <linux/module.h> #include <net/net_namespace.h> +#include <net/inet_dscp.h> #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> @@ -36,8 +37,8 @@ static const struct fib_kuid_range fib_kuid_range_unset = { bool fib_rule_matchall(const struct fib_rule *rule) { - if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || - rule->flags) + if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) || + rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; @@ -53,7 +54,7 @@ bool fib_rule_matchall(const struct fib_rule *rule) EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, - u32 pref, u32 table, u32 flags) + u32 pref, u32 table) { struct fib_rule *r; @@ -65,7 +66,6 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; - r->flags = flags; r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; @@ -73,7 +73,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; - /* The lock is not required here, the list in unreacheable + /* The lock is not required here, the list is unreachable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); return 0; @@ -101,7 +101,8 @@ static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); -static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) +static struct fib_rules_ops *lookup_rules_ops(const struct net *net, + int family) { struct fib_rules_ops *ops; @@ -256,16 +257,36 @@ static int nla_put_port_range(struct sk_buff *skb, int attrtype, return nla_put(skb, attrtype, sizeof(*range), range); } +static bool fib_rule_iif_match(const struct fib_rule *rule, int iifindex, + const struct flowi *fl) +{ + u8 iif_is_l3_master = READ_ONCE(rule->iif_is_l3_master); + + return iif_is_l3_master ? l3mdev_fib_rule_iif_match(fl, iifindex) : + fl->flowi_iif == iifindex; +} + +static bool fib_rule_oif_match(const struct fib_rule *rule, int oifindex, + const struct flowi *fl) +{ + u8 oif_is_l3_master = READ_ONCE(rule->oif_is_l3_master); + + return oif_is_l3_master ?
l3mdev_fib_rule_oif_match(fl, oifindex) : + fl->flowi_oif == oifindex; +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { - int ret = 0; + int iifindex, oifindex, ret = 0; - if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) + iifindex = READ_ONCE(rule->iifindex); + if (iifindex && !fib_rule_iif_match(rule, iifindex, fl)) goto out; - if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) + oifindex = READ_ONCE(rule->oifindex); + if (oifindex && !fib_rule_oif_match(rule, oifindex, fl)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) @@ -370,7 +391,10 @@ static int call_fib_rule_notifiers(struct net *net, .rule = rule, }; - ops->fib_rules_seq++; + ASSERT_RTNL_NET(net); + + /* Paired with READ_ONCE() in fib_rules_seq() */ + WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1); return call_fib_notifiers(net, event_type, &info.info); } @@ -397,17 +421,16 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, } EXPORT_SYMBOL_GPL(fib_rules_dump); -unsigned int fib_rules_seq_read(struct net *net, int family) +unsigned int fib_rules_seq_read(const struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; - ASSERT_RTNL(); - ops = lookup_rules_ops(net, family); if (!ops) return 0; - fib_rules_seq = ops->fib_rules_seq; + /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */ + fib_rules_seq = READ_ONCE(ops->fib_rules_seq); rules_ops_put(ops); return fib_rules_seq; @@ -457,9 +480,6 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, if (rule->tun_id && r->tun_id != rule->tun_id) continue; - if (r->fr_net != rule->fr_net) - continue; - if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; @@ -479,11 +499,17 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, &rule->sport_range)) continue; + if (rule->sport_mask && r->sport_mask != rule->sport_mask) + continue; + if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; + if (rule->dport_mask && r->dport_mask != rule->dport_mask) + continue; + if (!ops->compare(r, frh, tb)) continue; return r; @@ -513,14 +539,40 @@ static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, } #endif -static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, +static int fib_nl2rule_port_mask(const struct nlattr *mask_attr, + const struct fib_rule_port_range *range, + u16 *port_mask, + struct netlink_ext_ack *extack) +{ + if (!fib_rule_port_range_valid(range)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, + "Cannot specify port mask without port value"); + return -EINVAL; + } + + if (fib_rule_port_is_range(range)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, + "Cannot specify port mask for port range"); + return -EINVAL; + } + + if (range->start & ~nla_get_u16(mask_attr)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask"); + return -EINVAL; + } + + *port_mask = nla_get_u16(mask_attr); + + return 0; +} + +static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, struct nlattr *tb[], struct fib_rule **rule, bool *user_priority) { - struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rule *nlrule = NULL; int err = -EINVAL; @@ -552,31 +604,18 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[FRA_PRIORITY]) { nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); 
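/* Note: the else-branch removed just below moves to fib_nl2rule_rtnl();
 * fib_default_rule_pref() derives a default preference from the existing
 * ops->rules_list, so it still requires RTNL, whereas the pure attribute
 * parsing kept in fib_nl2rule() no longer does.
 */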
*user_priority = true; - } else { - nlrule->pref = fib_default_rule_pref(ops); } - nlrule->proto = tb[FRA_PROTOCOL] ? - nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; + nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC); if (tb[FRA_IIFNAME]) { - struct net_device *dev; - nlrule->iifindex = -1; nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, nlrule->iifname); - if (dev) - nlrule->iifindex = dev->ifindex; } if (tb[FRA_OIFNAME]) { - struct net_device *dev; - nlrule->oifindex = -1; nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, nlrule->oifname); - if (dev) - nlrule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { @@ -594,7 +633,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[FRA_TUN_ID]) nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); - err = -EINVAL; if (tb[FRA_L3MDEV] && fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) goto errout_free; @@ -619,11 +657,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, } nlrule->target = nla_get_u32(tb[FRA_GOTO]); - /* Backward jumps are prohibited to avoid endless loops */ - if (nlrule->target <= nlrule->pref) { - NL_SET_ERR_MSG(extack, "Backward goto not supported"); - goto errout_free; - } } else if (nlrule->action == FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; @@ -662,6 +695,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } + if (!fib_rule_port_is_range(&nlrule->sport_range)) + nlrule->sport_mask = U16_MAX; + } + + if (tb[FRA_SPORT_MASK]) { + err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK], + &nlrule->sport_range, + &nlrule->sport_mask, extack); + if (err) + goto errout_free; } if (tb[FRA_DPORT_RANGE]) { @@ -671,6 +714,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } + if (!fib_rule_port_is_range(&nlrule->dport_range)) + nlrule->dport_mask = U16_MAX; + } + + if (tb[FRA_DPORT_MASK]) { + err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK], + &nlrule->dport_range, + &nlrule->dport_mask, extack); + if (err) + goto errout_free; } *rule = nlrule; @@ -683,6 +736,43 @@ errout: return err; } +static int fib_nl2rule_rtnl(struct fib_rule *nlrule, + struct fib_rules_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + if (!tb[FRA_PRIORITY]) + nlrule->pref = fib_default_rule_pref(ops); + + /* Backward jumps are prohibited to avoid endless loops */ + if (tb[FRA_GOTO] && nlrule->target <= nlrule->pref) { + NL_SET_ERR_MSG(extack, "Backward goto not supported"); + return -EINVAL; + } + + if (tb[FRA_IIFNAME]) { + struct net_device *dev; + + dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname); + if (dev) { + nlrule->iifindex = dev->ifindex; + nlrule->iif_is_l3_master = netif_is_l3_master(dev); + } + } + + if (tb[FRA_OIFNAME]) { + struct net_device *dev; + + dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname); + if (dev) { + nlrule->oifindex = dev->ifindex; + nlrule->oif_is_l3_master = netif_is_l3_master(dev); + } + } + + return 0; +} + static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule) { @@ -719,9 +809,6 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, if (r->tun_id != rule->tun_id) continue; - if (r->fr_net != rule->fr_net) - continue; - if (r->l3mdev != rule->l3mdev) 
continue; @@ -739,10 +826,16 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, &rule->sport_range)) continue; + if (r->sport_mask != rule->sport_mask) + continue; + if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; + if (r->dport_mask != rule->dport_mask) + continue; + if (!ops->compare(r, frh, tb)) continue; return 1; @@ -768,21 +861,27 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_PROTOCOL] = { .type = NLA_U8 }, [FRA_IP_PROTO] = { .type = NLA_U8 }, [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, - [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) } + [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, + [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), + [FRA_FLOWLABEL] = { .type = NLA_BE32 }, + [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 }, + [FRA_SPORT_MASK] = { .type = NLA_U16 }, + [FRA_DPORT_MASK] = { .type = NLA_U16 }, + [FRA_DSCP_MASK] = NLA_POLICY_MASK(NLA_U8, INET_DSCP_MASK >> 2), }; -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held) { - struct net *net = sock_net(skb->sk); - struct fib_rule_hdr *frh = nlmsg_data(nlh); - struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *last = NULL; - struct nlattr *tb[FRA_MAX + 1]; int err = -EINVAL, unresolved = 0; + struct fib_rules_ops *ops = NULL; + struct nlattr *tb[FRA_MAX + 1]; bool user_priority = false; + struct fib_rule_hdr *frh; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } @@ -801,10 +900,17 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); + err = fib_nl2rule(net, nlh, extack, ops, tb, &rule, &user_priority); if (err) goto errout; + if (!rtnl_held) + rtnl_net_lock(net); + + err = fib_nl2rule_rtnl(rule, ops, tb, extack); + if (err) + goto errout_free; + if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; @@ -866,31 +972,45 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (rule->tun_id) ip_tunnel_need_metadata(); + fib_rule_get(rule); + + if (!rtnl_held) + rtnl_net_unlock(net); + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); + fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: + if (!rtnl_held) + rtnl_net_unlock(net); kfree(rule); errout: rules_ops_put(ops); return err; } -EXPORT_SYMBOL_GPL(fib_nl_newrule); +EXPORT_SYMBOL_GPL(fib_newrule); -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); - struct fib_rule_hdr *frh = nlmsg_data(nlh); + return fib_newrule(sock_net(skb->sk), skb, nlh, extack, false); +} + +int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held) +{ + struct fib_rule *rule = NULL, *nlrule = NULL; struct fib_rules_ops *ops = NULL; - struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; - int err = -EINVAL; bool user_priority = false; + struct fib_rule_hdr *frh; + int err = 
-EINVAL; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } @@ -909,25 +1029,32 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); + err = fib_nl2rule(net, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) goto errout; + if (!rtnl_held) + rtnl_net_lock(net); + + err = fib_nl2rule_rtnl(nlrule, ops, tb, extack); + if (err) + goto errout_free; + rule = rule_find(ops, frh, tb, nlrule, user_priority); if (!rule) { err = -ENOENT; - goto errout; + goto errout_free; } if (rule->flags & FIB_RULE_PERMANENT) { err = -EPERM; - goto errout; + goto errout_free; } if (ops->delete) { err = ops->delete(rule); if (err) - goto errout; + goto errout_free; } if (rule->tun_id) @@ -949,7 +1076,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { - struct fib_rule *n; + struct fib_rule *n, *r; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) @@ -963,22 +1090,33 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } } - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, - NULL); - notify_rule_change(RTM_DELRULE, rule, ops, nlh, - NETLINK_CB(skb).portid); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); + + if (!rtnl_held) + rtnl_net_unlock(net); + + notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); kfree(nlrule); return 0; -errout: +errout_free: + if (!rtnl_held) + rtnl_net_unlock(net); kfree(nlrule); +errout: rules_ops_put(ops); return err; } -EXPORT_SYMBOL_GPL(fib_nl_delrule); +EXPORT_SYMBOL_GPL(fib_delrule); + +static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return fib_delrule(sock_net(skb->sk), skb, nlh, extack, false); +} static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) @@ -997,7 +1135,9 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ - + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ + + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */ + + nla_total_size(2) /* FRA_SPORT_MASK */ + + nla_total_size(2); /* FRA_DPORT_MASK */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -1038,14 +1178,14 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; - if (rule->iifindex == -1) + if (READ_ONCE(rule->iifindex) == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; - if (rule->oifindex == -1) + if (READ_ONCE(rule->oifindex) == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } @@ -1065,8 +1205,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || + (rule->sport_mask && nla_put_u16(skb, FRA_SPORT_MASK, + 
rule->sport_mask)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || + (rule->dport_mask && nla_put_u16(skb, FRA_DPORT_MASK, + rule->dport_mask)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; @@ -1118,12 +1262,12 @@ static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, { struct fib_rule_hdr *frh; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } - frh = nlmsg_data(nlh); if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, @@ -1144,10 +1288,10 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; - int idx = 0, family; + int err, idx = 0, family; if (cb->strict_check) { - int err = fib_valid_dumprule_req(nlh, cb->extack); + err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; @@ -1160,17 +1304,17 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) if (ops == NULL) return -EAFNOSUPPORT; - dump_rules(skb, cb, ops); - - return skb->len; + return dump_rules(skb, cb, ops); } + err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; - if (dump_rules(skb, cb, ops) < 0) + err = dump_rules(skb, cb, ops); + if (err < 0) break; cb->args[1] = 0; @@ -1180,7 +1324,7 @@ skip: rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static void notify_rule_change(int event, struct fib_rule *rule, @@ -1207,8 +1351,7 @@ static void notify_rule_change(int event, struct fib_rule *rule, rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); return; errout: - if (err < 0) - rtnl_set_sk_err(net, ops->nlgroup, err); + rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) @@ -1217,11 +1360,17 @@ static void attach_rules(struct list_head *rules, struct net_device *dev) list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && - strcmp(dev->name, rule->iifname) == 0) - rule->iifindex = dev->ifindex; + strcmp(dev->name, rule->iifname) == 0) { + WRITE_ONCE(rule->iifindex, dev->ifindex); + WRITE_ONCE(rule->iif_is_l3_master, + netif_is_l3_master(dev)); + } if (rule->oifindex == -1 && - strcmp(dev->name, rule->oifname) == 0) - rule->oifindex = dev->ifindex; + strcmp(dev->name, rule->oifname) == 0) { + WRITE_ONCE(rule->oifindex, dev->ifindex); + WRITE_ONCE(rule->oif_is_l3_master, + netif_is_l3_master(dev)); + } } } @@ -1230,10 +1379,14 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) struct fib_rule *rule; list_for_each_entry(rule, rules, list) { - if (rule->iifindex == dev->ifindex) - rule->iifindex = -1; - if (rule->oifindex == dev->ifindex) - rule->oifindex = -1; + if (rule->iifindex == dev->ifindex) { + WRITE_ONCE(rule->iifindex, -1); + WRITE_ONCE(rule->iif_is_l3_master, false); + } + if (rule->oifindex == dev->ifindex) { + WRITE_ONCE(rule->oifindex, -1); + WRITE_ONCE(rule->oif_is_l3_master, false); + } } } @@ -1290,12 +1443,20 @@ static struct pernet_operations fib_rules_net_ops = { .exit = fib_rules_net_exit, }; +static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst 
= { + {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, +}; + static int __init fib_rules_init(void) { int err; - rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); + + rtnl_register_many(fib_rules_rtnl_msg_handlers); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) @@ -1310,9 +1471,7 @@ static int __init fib_rules_init(void) fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: - rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); - rtnl_unregister(PF_UNSPEC, RTM_DELRULE); - rtnl_unregister(PF_UNSPEC, RTM_GETRULE); + rtnl_unregister_many(fib_rules_rtnl_msg_handlers); return err; } diff --git a/net/core/filter.c b/net/core/filter.c index 43cc1fe58a2c..616e0520a0bb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -42,7 +42,7 @@ #include <linux/errno.h> #include <linux/timer.h> #include <linux/uaccess.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/filter.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> @@ -81,9 +81,18 @@ #include <net/xdp.h> #include <net/mptcp.h> #include <net/netfilter/nf_conntrack_bpf.h> +#include <net/netkit.h> +#include <linux/un.h> +#include <net/xdp_sock_drv.h> +#include <net/inet_dscp.h> + +#include "dev.h" + +/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ +static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id); +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { @@ -113,6 +122,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * @sk: sock associated with &sk_buff * @skb: buffer to filter * @cap: limit on how short the eBPF program may trim the packet + * @reason: record drop reason on errors (negative return value) * * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller @@ -121,7 +131,8 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * be accepted or -EPERM if the packet should be tossed. * */ -int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) +int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, + unsigned int cap, enum skb_drop_reason *reason) { int err; struct sk_filter *filter; @@ -133,15 +144,20 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); + *reason = SKB_DROP_REASON_PFMEMALLOC; return -ENOMEM; } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); - if (err) + if (err) { + *reason = SKB_DROP_REASON_SOCKET_FILTER; return err; + } err = security_sock_rcv_skb(sk, skb); - if (err) + if (err) { + *reason = SKB_DROP_REASON_SECURITY_HOOK; return err; + } rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); @@ -153,6 +169,8 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) pkt_len = bpf_prog_run_save_cb(filter->prog, skb); skb->sk = save_sk; err = pkt_len ? 
pskb_trim(skb, max(cap, pkt_len)) : -EPERM; + if (err) + *reason = SKB_DROP_REASON_SOCKET_FILTER; } rcu_read_unlock(); @@ -199,7 +217,7 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; nla = (struct nlattr *) &skb->data[a]; - if (nla->nla_len > skb->len - a) + if (!nla_ok(nla, skb->len - a)) return 0; nla = nla_find_nested(nla, x); @@ -209,24 +227,36 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } +static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset) +{ + if (likely(offset >= 0)) + return offset; + + if (offset >= SKF_NET_OFF) + return offset - SKF_NET_OFF + skb_network_offset(skb); + + if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb)) + return offset - SKF_LL_OFF + skb_mac_offset(skb); + + return INT_MIN; +} + BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - u8 tmp, *ptr; + u8 tmp; const int len = sizeof(tmp); - if (offset >= 0) { - if (headlen - offset >= len) - return *(u8 *)(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return tmp; - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return *(u8 *)ptr; - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, @@ -239,21 +269,19 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - __be16 tmp, *ptr; + __be16 tmp; const int len = sizeof(tmp); - if (offset >= 0) { - if (headlen - offset >= len) - return get_unaligned_be16(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return be16_to_cpu(tmp); - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return get_unaligned_be16(ptr); - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, @@ -266,21 +294,19 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - __be32 tmp, *ptr; + __be32 tmp; const int len = sizeof(tmp); - if (likely(offset >= 0)) { - if (headlen - offset >= len) - return get_unaligned_be32(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return be32_to_cpu(tmp); - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return get_unaligned_be32(ptr); - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, 
@@ -773,7 +799,7 @@ jmp_rest: BPF_EMIT_JMP; break; - /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ + /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: { struct sock_filter tmp = { .code = BPF_LD | BPF_ABS | BPF_B, @@ -799,7 +825,7 @@ jmp_rest: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; } - /* RET_K is remaped into 2 insns. RET_A case doesn't need an + /* RET_K is remapped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ case BPF_RET | BPF_A: @@ -1215,8 +1241,8 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) */ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { + int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); u32 filter_size = bpf_prog_size(fp->prog->len); - int optmem_max = READ_ONCE(sysctl_optmem_max); /* same check as in sock_kmalloc() */ if (filter_size <= optmem_max && @@ -1257,8 +1283,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) * so we need to keep the user BPF around until the 2nd * pass. At this time, the user BPF is stored in fp->insns. */ - old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), - GFP_KERNEL | __GFP_NOWARN); + old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter), + GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; @@ -1546,12 +1572,13 @@ EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); - int err; + int err, optmem_max; if (IS_ERR(prog)) return PTR_ERR(prog); - if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + if (bpf_prog_size(prog->len) > optmem_max) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); @@ -1590,7 +1617,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog; - int err; + int err, optmem_max; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; @@ -1618,7 +1645,8 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ - if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) { + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + if (bpf_prog_size(prog->len) > optmem_max) { err = -ENOMEM; goto err_prog_put; } @@ -1643,18 +1671,14 @@ void sk_reuseport_prog_free(struct bpf_prog *prog) bpf_prog_destroy(prog); } -struct bpf_scratchpad { - union { - __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; - u8 buff[MAX_BPF_STACK]; - }; -}; - -static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); - static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { +#ifdef CONFIG_DEBUG_NET + /* Avoid a splat in pskb_may_pull_reason() */ + if (write_len > INT_MAX) + return -EINVAL; +#endif return skb_ensure_writable(skb, write_len); } @@ -1721,6 +1745,12 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg5_type = ARG_ANYTHING, }; +int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, + u32 len, u64 flags) +{ + return ____bpf_skb_store_bytes(skb, offset, from, len, flags); +} + BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, void *, to, u32, len) { @@ -1751,6 +1781,11 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_SIZE, }; +int __bpf_skb_load_bytes(const struct sk_buff 
*skb, u32 offset, void *to, u32 len) +{ + return ____bpf_skb_load_bytes(skb, offset, to, len); +} + BPF_CALL_4(bpf_flow_dissector_load_bytes, const struct bpf_flow_dissector *, ctx, u32, offset, void *, to, u32, len) @@ -1942,10 +1977,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; + bool is_ipv6 = flags & BPF_F_IPV6; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | - BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; @@ -1961,7 +1997,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, if (unlikely(from != 0)) return -EINVAL; - inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); @@ -1992,10 +2028,6 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = { BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, __be32 *, to, u32, to_size, __wsum, seed) { - struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); - u32 diff_size = from_size + to_size; - int i, j = 0; - /* This is quite flexible, some examples: * * from_size == 0, to_size > 0, seed := csum --> pushing data @@ -2004,16 +2036,19 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, * * Even for diffing, from_size and to_size don't need to be equal. */ - if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || - diff_size > sizeof(sp->diff))) - return -EINVAL; - for (i = 0; i < from_size / sizeof(__be32); i++, j++) - sp->diff[j] = ~from[i]; - for (i = 0; i < to_size / sizeof(__be32); i++, j++) - sp->diff[j] = to[i]; + __wsum ret = seed; - return csum_partial(sp->diff, diff_size, seed); + if (from_size && to_size) + ret = csum_sub(csum_partial(to, to_size, ret), + csum_partial(from, from_size, 0)); + else if (to_size) + ret = csum_partial(to, to_size, ret); + + else if (from_size) + ret = ~csum_partial(from, from_size, ~ret); + + return csum_from32to16((__force unsigned int)ret); } static const struct bpf_func_proto bpf_csum_diff_proto = { @@ -2111,6 +2146,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) } skb->dev = dev; + skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb)); skb_clear_tstamp(skb); dev_xmit_recursion_inc(); @@ -2193,10 +2229,10 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, return -ENOMEM; } - rcu_read_lock_bh(); + rcu_read_lock(); if (!nh) { dst = skb_dst(skb); - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); } else { nexthop = &nh->ipv6_nh; @@ -2206,13 +2242,15 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, int ret; sock_confirm_neigh(skb, neigh); + local_bh_disable(); dev_xmit_recursion_inc(); ret = neigh_output(neigh, skb, false); dev_xmit_recursion_dec(); - rcu_read_unlock_bh(); + local_bh_enable(); + rcu_read_unlock(); return ret; } - rcu_read_unlock_bh(); + rcu_read_unlock(); if (dst) IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: @@ -2243,6 +2281,7 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, if (IS_ERR(dst)) goto out_drop; + skb_dst_drop(skb); skb_dst_set(skb, dst); } else if 
(nh->nh_family != AF_INET6) { goto out_drop; @@ -2250,12 +2289,12 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; @@ -2291,10 +2330,9 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, return -ENOMEM; } - rcu_read_lock_bh(); + rcu_read_lock(); if (!nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); + struct rtable *rt = skb_rtable(skb); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { @@ -2303,7 +2341,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, } else if (nh->nh_family == AF_INET) { neigh = ip_neigh_gw4(dev, nh->ipv4_nh); } else { - rcu_read_unlock_bh(); + rcu_read_unlock(); goto out_drop; } @@ -2311,13 +2349,15 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, int ret; sock_confirm_neigh(skb, neigh); + local_bh_disable(); dev_xmit_recursion_inc(); ret = neigh_output(neigh, skb, is_v6gw); dev_xmit_recursion_dec(); - rcu_read_unlock_bh(); + local_bh_enable(); + rcu_read_unlock(); return ret; } - rcu_read_unlock_bh(); + rcu_read_unlock(); out_drop: kfree_skb(skb); return -ENETDOWN; @@ -2334,7 +2374,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_dscp = ip4h_dscp(ip4h), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, @@ -2350,17 +2390,18 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, goto out_drop; } + skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); } err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; @@ -2400,9 +2441,9 @@ out: /* Internal, non-exposed redirect flags. */ enum { - BPF_F_NEIGH = (1ULL << 1), - BPF_F_PEER = (1ULL << 2), - BPF_F_NEXTHOP = (1ULL << 3), + BPF_F_NEIGH = (1ULL << 16), + BPF_F_PEER = (1ULL << 17), + BPF_F_NEXTHOP = (1ULL << 18), #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; @@ -2412,9 +2453,18 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) struct sk_buff *clone; int ret; + BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS); + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; + /* BPF test infra's convert___skb_to_skb() can create type-less + * GSO packets. gso_features_check() will detect this as a bad + * offload. However, lets not leak them out in the first place. 
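+ *
+ * An editorial sketch of what is rejected (hypothetical values): such
+ * an skb has a nonzero gso_size, so skb_is_gso() is true, yet gso_type
+ * carries no SKB_GSO_* bit:
+ *
+ *	skb_shinfo(skb)->gso_size == 1400
+ *	skb_shinfo(skb)->gso_type == 0
+ *
+ * With no type bit there is no matching gso_segment() offload to split
+ * the packet, so it must not escape to a real device.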
+ */ + if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type)) + return -EBADMSG; + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; @@ -2446,12 +2496,19 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; -DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); -EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); +static struct net_device *skb_get_peer_dev(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (likely(ops->ndo_get_peer_dev)) + return INDIRECT_CALL_1(ops->ndo_get_peer_dev, + netkit_peer_dev, dev); + return NULL; +} int skb_do_redirect(struct sk_buff *skb) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; @@ -2462,17 +2519,16 @@ int skb_do_redirect(struct sk_buff *skb) if (unlikely(!dev)) goto out_drop; if (flags & BPF_F_PEER) { - const struct net_device_ops *ops = dev->netdev_ops; - - if (unlikely(!ops->ndo_get_peer_dev || - !skb_at_tc_ingress(skb))) + if (unlikely(!skb_at_tc_ingress(skb))) goto out_drop; - dev = ops->ndo_get_peer_dev(dev); + dev = skb_get_peer_dev(dev); if (unlikely(!dev || !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; + dev_sw_netstats_rx_add(dev, skb->len); + skb_scrub_packet(skb, false); return -EAGAIN; } return flags & BPF_F_NEIGH ? @@ -2486,7 +2542,7 @@ out_drop: BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return TC_ACT_SHOT; @@ -2507,7 +2563,7 @@ static const struct bpf_func_proto bpf_redirect_proto = { BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return TC_ACT_SHOT; @@ -2529,7 +2585,7 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, int, plen, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; @@ -2574,6 +2630,20 @@ BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) return 0; } +static void sk_msg_reset_curr(struct sk_msg *msg) +{ + if (!msg->sg.size) { + msg->sg.curr = msg->sg.start; + msg->sg.copybreak = 0; + } else { + u32 i = msg->sg.end; + + sk_msg_iter_var_prev(i); + msg->sg.curr = i; + msg->sg.copybreak = msg->sg.data[i].length; + } +} + static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .func = bpf_msg_cork_bytes, .gpl_only = false, @@ -2693,6 +2763,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; out: + sk_msg_reset_curr(msg); msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; msg->data_end = msg->data + bytes; return 0; @@ -2733,7 +2804,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); } while (i != msg->sg.end); - if (start >= offset + l) + if (start > offset + l) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); @@ -2758,6 +2829,8 @@ 
BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, raw = page_address(page); + if (i == msg->sg.end) + sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); front = start - offset; back = psge->length - front; @@ -2774,7 +2847,13 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, } put_page(sg_page(psge)); - } else if (start - offset) { + new = i; + goto place_new; + } + + if (start - offset) { + if (i == msg->sg.end) + sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); @@ -2785,39 +2864,44 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); sg_unmark_end(psge); sg_unmark_end(&rsge); - sk_msg_iter_next(msg, end); } /* Slot(s) to place newly allocated data */ + sk_msg_iter_next(msg, end); new = i; + sk_msg_iter_var_next(i); + + if (i == msg->sg.end) { + if (!rsge.length) + goto place_new; + sk_msg_iter_next(msg, end); + goto place_new; + } /* Shift one or two slots as needed */ - if (!copy) { - sge = sk_msg_elem_cpy(msg, i); + sge = sk_msg_elem_cpy(msg, new); + sg_unmark_end(&sge); + nsge = sk_msg_elem_cpy(msg, i); + if (rsge.length) { sk_msg_iter_var_next(i); - sg_unmark_end(&sge); + nnsge = sk_msg_elem_cpy(msg, i); sk_msg_iter_next(msg, end); + } - nsge = sk_msg_elem_cpy(msg, i); + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sge = nsge; + sk_msg_iter_var_next(i); if (rsge.length) { - sk_msg_iter_var_next(i); + nsge = nnsge; nnsge = sk_msg_elem_cpy(msg, i); - } - - while (i != msg->sg.end) { - msg->sg.data[i] = sge; - sge = nsge; - sk_msg_iter_var_next(i); - if (rsge.length) { - nsge = nnsge; - nnsge = sk_msg_elem_cpy(msg, i); - } else { - nsge = sk_msg_elem_cpy(msg, i); - } + } else { + nsge = sk_msg_elem_cpy(msg, i); } } +place_new: /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; @@ -2829,6 +2913,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, msg->sg.data[new] = rsge; } + sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } @@ -2845,8 +2930,10 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { static void sk_msg_shift_left(struct sk_msg *msg, int i) { + struct scatterlist *sge = sk_msg_elem(msg, i); int prev; + put_page(sg_page(sge)); do { prev = i; sk_msg_iter_var_next(i); @@ -2883,6 +2970,9 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, if (unlikely(flags)) return -EINVAL; + if (unlikely(len == 0)) + return 0; + /* First find the starting scatterlist element */ i = msg->sg.start; do { @@ -2895,7 +2985,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, } while (i != msg->sg.end); /* Bounds checks: start and pop must be inside message */ - if (start >= offset + l || last >= msg->sg.size) + if (start >= offset + l || last > msg->sg.size) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); @@ -2919,17 +3009,17 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, * * Then if B is non-zero AND there is no space allocate space and * compact A, B regions into page. If there is space shift ring to - * the rigth free'ing the next element in ring to place B, leaving + * the right free'ing the next element in ring to place B, leaving * A untouched except to reduce length. 
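[Annotation] The reworked push path here, together with sk_msg_reset_curr(), keeps msg->sg.curr and the data pointers consistent after ring slots are shifted. A minimal sk_msg sketch of the caller side; the header value is illustrative and map attachment is elided:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sk_msg")
int msg_prepend_hdr(struct sk_msg_md *msg)
{
        __u64 hdr = 0xfeedfacecafef00dULL; /* example header value */

        /* Insert 8 bytes of room at offset 0; slots may be shifted. */
        if (bpf_msg_push_data(msg, 0, sizeof(hdr), 0))
                return SK_DROP;
        /* Linearize [0, 8) so data/data_end cover the new region. */
        if (bpf_msg_pull_data(msg, 0, sizeof(hdr), 0))
                return SK_DROP;
        if (msg->data + sizeof(hdr) > msg->data_end)
                return SK_DROP;
        __builtin_memcpy(msg->data, &hdr, sizeof(hdr));
        return SK_PASS;
}

char _license[] SEC("license") = "GPL";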
*/ if (start != offset) { struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); - int a = start; + int a = start - offset; int b = sge->length - pop - a; sk_msg_iter_var_next(i); - if (pop < sge->length - a) { + if (b > 0) { if (space) { sge->length = a; sk_msg_shift_right(msg, i); @@ -2948,7 +3038,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, if (unlikely(!page)) return -ENOMEM; - sge->length = a; orig = sg_page(sge); from = sg_virt(sge); to = page_address(page); @@ -2958,7 +3047,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, put_page(orig); } pop = 0; - } else if (pop >= sge->length - a) { + } else { pop -= (sge->length - a); sge->length = a; } @@ -2992,11 +3081,11 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, pop -= sge->length; sk_msg_shift_left(msg, i); } - sk_msg_iter_var_next(i); } sk_mem_uncharge(msg->sk, len - pop); msg->sg.size -= (len - pop); + sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } @@ -3128,6 +3217,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, bpf_push_mac_rcsum(skb); ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); + skb_reset_mac_len(skb); bpf_compute_data_pointers(skb); return ret; @@ -3161,13 +3251,20 @@ static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto) +{ + skb->protocol = htons(proto); + if (skb_valid_dst(skb)) + skb_dst_drop(skb); +} + static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { - /* Caller already did skb_cow() with len as headroom, + /* Caller already did skb_cow() with meta_len+len as headroom, * so no need to do it here. */ skb_push(skb, len); - memmove(skb->data, skb->data + len, off); + skb_postpush_data_move(skb, len, off); memset(skb->data + off, 0, len); /* No skb_postpush_rcsum(skb, skb->data + off, len) @@ -3191,7 +3288,7 @@ static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) old_data = skb->data; __skb_pull(skb, len); skb_postpull_rcsum(skb, old_data + off, len); - memmove(skb->data, old_data, off); + skb_postpull_data_move(skb, len, off); return 0; } @@ -3236,10 +3333,11 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + const u8 meta_len = skb_metadata_len(skb); u32 off = skb_mac_header_len(skb); int ret; - ret = skb_cow(skb, len_diff); + ret = skb_cow(skb, meta_len + len_diff); if (unlikely(ret < 0)) return ret; @@ -3257,7 +3355,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) } } - skb->protocol = htons(ETH_P_IPV6); + bpf_skb_change_protocol(skb, ETH_P_IPV6); skb_clear_hash(skb); return 0; @@ -3287,7 +3385,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) } } - skb->protocol = htons(ETH_P_IP); + bpf_skb_change_protocol(skb, ETH_P_IP); skb_clear_hash(skb); return 0; @@ -3381,13 +3479,17 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) +#define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ - BPF_ADJ_ROOM_ENCAP_L2_MASK)) 
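[Annotation] bpf_skb_change_protocol() above centralizes the skb->protocol flip and drops a now-stale dst, and the 4-to-6 path reserves metadata headroom before growing the header. A sketch of the helper contract from the program side, assuming a NAT64-style TC hook; the actual header rewriting is elided:

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int to_ipv6(struct __sk_buff *skb)
{
        if (skb->protocol != bpf_htons(ETH_P_IP))
                return TC_ACT_OK;

        /* Grows the packet by sizeof(ipv6hdr) - sizeof(iphdr) = 20
         * bytes and flips skb->protocol; a cached dst is dropped since
         * the old route no longer matches the new address family.
         */
        if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
                return TC_ACT_SHOT;

        /* ... build the IPv6 header at the new offsets here ... */
        return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";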
+ BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ + BPF_F_ADJ_ROOM_DECAP_L3_MASK) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) @@ -3395,6 +3497,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; + const u8 meta_len = skb_metadata_len(skb); unsigned int gso_type = SKB_GSO_DODGY; int ret; @@ -3405,7 +3508,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, return -ENOTSUPP; } - ret = skb_cow_head(skb, len_diff); + ret = skb_cow_head(skb, meta_len + len_diff); if (unlikely(ret < 0)) return ret; @@ -3474,22 +3577,29 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) - skb->protocol = htons(ETH_P_IPV6); + bpf_skb_change_protocol(skb, ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) - skb->protocol = htons(ETH_P_IP); + bpf_skb_change_protocol(skb, ETH_P_IP); } if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* Due to header grow, MSS needs to be downgraded. */ - if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) - skb_decrease_gso_size(shinfo, len_diff); - /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; + + /* Due to header growth, MSS needs to be downgraded. + * There is a BUG_ON() when segmenting the frag_list with + * head_frag true, so linearize the skb after downgrading + * the MSS. + */ + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { + skb_decrease_gso_size(shinfo, len_diff); + if (shinfo->frag_list) + return skb_linearize(skb); + } } return 0; @@ -3501,6 +3611,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_DECAP_L3_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; @@ -3519,6 +3630,14 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, if (unlikely(ret < 0)) return ret; + /* Match skb->protocol to new outer l3 protocol */ + if (skb->protocol == htons(ETH_P_IP) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + bpf_skb_change_protocol(skb, ETH_P_IPV6); + else if (skb->protocol == htons(ETH_P_IPV6) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) + bpf_skb_change_protocol(skb, ETH_P_IP); + if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -3608,6 +3727,22 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOTSUPP; } + if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + if (!shrink) + return -EINVAL; + + switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: + len_min = sizeof(struct iphdr); + break; + case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: + len_min = sizeof(struct ipv6hdr); + break; + default: + return -EINVAL; + } + } + len_cur = skb->len - skb_network_offset(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || @@ -3636,13 +3771,22 @@ static const struct bpf_func_proto bpf_skb_adjust_room_proto = { static u32 __bpf_skb_min_len(const struct sk_buff *skb) { - u32 min_len = skb_network_offset(skb); + int offset = skb_network_offset(skb); + u32 min_len = 0; - if (skb_transport_header_was_set(skb)) - min_len = skb_transport_offset(skb); - if (skb->ip_summed == 
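[Annotation] With BPF_F_ADJ_ROOM_DECAP_L3_* accepted by the shrink path above, a program can strip an outer L3 header and have skb->protocol corrected in one call. A minimal sketch, assuming an outer IPv4 header without options carrying IPv6 (6in4); the MAC mode removes bytes between the L2 and L3 headers:

#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int decap_6in4(struct __sk_buff *skb)
{
        /* Remove the 20-byte outer iphdr; DECAP_L3_IPV6 tells the
         * kernel the inner packet is IPv6, so skb->protocol flips from
         * ETH_P_IP to ETH_P_IPV6 and len_min is checked against
         * sizeof(struct ipv6hdr).
         */
        if (bpf_skb_adjust_room(skb, -(__s32)sizeof(struct iphdr),
                                BPF_ADJ_ROOM_MAC,
                                BPF_F_ADJ_ROOM_DECAP_L3_IPV6))
                return TC_ACT_SHOT;
        return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";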
CHECKSUM_PARTIAL) - min_len = skb_checksum_start_offset(skb) + - skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + if (skb_transport_header_was_set(skb)) { + offset = skb_transport_offset(skb); + if (offset > 0) + min_len = offset; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) { + offset = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + } return min_len; } @@ -3738,15 +3882,17 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = { static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { + const u8 meta_len = skb_metadata_len(skb); u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; - if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || + if (unlikely(flags || (int)head_room < 0 || + (!skb_is_gso(skb) && new_len > max_len) || new_len < skb->len)) return -EINVAL; - ret = skb_cow(skb, head_room); + ret = skb_cow(skb, meta_len + head_room); if (likely(!ret)) { /* Idea for this helper is that we currently only * allow to expand on mac header. This means that @@ -3758,6 +3904,7 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, * for redirection into L2 device. */ __skb_push(skb, head_room); + skb_postpush_data_move(skb, head_room, 0); memset(skb->data, 0, head_room); skb_reset_mac_header(skb); skb_reset_mac_len(skb); @@ -3799,7 +3946,7 @@ static const struct bpf_func_proto sk_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) +BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) { return xdp_get_buff_len(xdp); } @@ -3854,8 +4001,8 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; -static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, - void *buf, unsigned long len, bool flush) +void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, + void *buf, unsigned long len, bool flush) { unsigned long ptr_len, ptr_off = 0; skb_frag_t *next_frag, *end_frag; @@ -3901,22 +4048,23 @@ static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, } } -static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) +void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { - struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); u32 size = xdp->data_end - xdp->data; + struct skb_shared_info *sinfo; void *addr = xdp->data; int i; if (unlikely(offset > 0xffff || len > 0xffff)) return ERR_PTR(-EFAULT); - if (offset + len > xdp_get_buff_len(xdp)) + if (unlikely(offset + len > xdp_get_buff_len(xdp))) return ERR_PTR(-EINVAL); - if (offset < size) /* linear area */ + if (likely(offset < size)) /* linear area */ goto out; + sinfo = xdp_get_shared_info_from_buff(xdp); offset -= size; for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ u32 frag_size = skb_frag_size(&sinfo->frags[i]); @@ -3959,6 +4107,11 @@ static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { .arg4_type = ARG_CONST_SIZE, }; +int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) +{ + return ____bpf_xdp_load_bytes(xdp, offset, buf, len); +} + BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { @@ -3986,6 +4139,11 @@ static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { .arg4_type = ARG_CONST_SIZE, }; +int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) +{ + return 
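[Annotation] bpf_xdp_pointer() and bpf_xdp_copy_buf() above let the byte-access helpers work whether the requested range sits in the linear area or in the frags of a multi-buffer frame. A sketch of the program-side usage; offset and length are example values:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int parse_deep(struct xdp_md *xdp)
{
        __u8 hdr[64]; /* example: header 200 bytes into the frame */

        /* Copies across the linear/frag boundary, unlike direct
         * xdp->data pointer access which is limited to the head.
         */
        if (bpf_xdp_load_bytes(xdp, 200, hdr, sizeof(hdr)))
                return XDP_DROP;        /* range exceeds frame length */
        /* ... inspect hdr[], optionally patch it back ... */
        if (bpf_xdp_store_bytes(xdp, 200, hdr, sizeof(hdr)))
                return XDP_DROP;
        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";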
____bpf_xdp_store_bytes(xdp, offset, buf, len); +} + static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); @@ -4003,10 +4161,54 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); skb_frag_size_add(frag, offset); sinfo->xdp_frags_size += offset; + if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) + xsk_buff_get_tail(xdp)->data_end += offset; return 0; } +static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, + bool tail, bool release) +{ + struct xdp_buff *zc_frag = tail ? xsk_buff_get_tail(xdp) : + xsk_buff_get_head(xdp); + + if (release) { + xsk_buff_del_frag(zc_frag); + } else { + if (tail) + zc_frag->data_end -= shrink; + else + zc_frag->data += shrink; + } + + return zc_frag; +} + +static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, + int shrink, bool tail) +{ + enum xdp_mem_type mem_type = xdp->rxq->mem.type; + bool release = skb_frag_size(frag) == shrink; + netmem_ref netmem = skb_frag_netmem(frag); + struct xdp_buff *zc_frag = NULL; + + if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { + netmem = 0; + zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release); + } + + if (release) { + __xdp_return(netmem, mem_type, false, zc_frag); + } else { + if (!tail) + skb_frag_off_add(frag, shrink); + skb_frag_size_sub(frag, shrink); + } + + return release; +} + static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); @@ -4021,23 +4223,15 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) len_free += shrink; offset -= shrink; - - if (skb_frag_size(frag) == shrink) { - struct page *page = skb_frag_page(frag); - - __xdp_return(page_address(page), &xdp->rxq->mem, - false, NULL); + if (bpf_xdp_shrink_data(xdp, frag, shrink, true)) n_frags_free++; - } else { - skb_frag_size_sub(frag, shrink); - break; - } } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free; if (unlikely(!sinfo->nr_frags)) { xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); xdp->data_end -= offset; } @@ -4060,12 +4254,6 @@ BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) if (unlikely(data_end > data_hard_end)) return -EINVAL; - /* ALL drivers MUST init xdp->frame_sz, chicken check below */ - if (unlikely(xdp->frame_sz > PAGE_SIZE)) { - WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); - return -EINVAL; - } - if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; @@ -4128,9 +4316,13 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { * bpf_redirect_info to actually enqueue the frame into a map type-specific * bulk queue structure. * - * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(), - * which will flush all the different bulk queues, thus completing the - * redirect. + * 3. Before exiting its NAPI poll loop, the driver will call + * xdp_do_flush(), which will flush all the different bulk queues, + * thus completing the redirect. Note that xdp_do_flush() must be + * called before napi_complete_done() in the driver, as the + * XDP_REDIRECT logic relies on being inside a single NAPI instance + * through to the xdp_do_flush() call for RCU protection of all + * in-kernel data structures. 
*/ /* * Pointers to the map entries will be kept around for this whole sequence of @@ -4147,36 +4339,50 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { */ void xdp_do_flush(void) { - __dev_flush(); - __cpu_map_flush(); - __xsk_map_flush(); + struct list_head *lh_map, *lh_dev, *lh_xsk; + + bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); + if (lh_dev) + __dev_flush(lh_dev); + if (lh_map) + __cpu_map_flush(lh_map); + if (lh_xsk) + __xsk_map_flush(lh_xsk); } EXPORT_SYMBOL_GPL(xdp_do_flush); -void bpf_clear_redirect_map(struct bpf_map *map) +#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) +void xdp_do_check_flushed(struct napi_struct *napi) { - struct bpf_redirect_info *ri; - int cpu; + struct list_head *lh_map, *lh_dev, *lh_xsk; + bool missed = false; - for_each_possible_cpu(cpu) { - ri = per_cpu_ptr(&bpf_redirect_info, cpu); - /* Avoid polluting remote cacheline due to writes if - * not needed. Once we pass this test, we need the - * cmpxchg() to make sure it hasn't been changed in - * the meantime by remote CPU. - */ - if (unlikely(READ_ONCE(ri->map) == map)) - cmpxchg(&ri->map, map, NULL); + bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); + if (lh_dev) { + __dev_flush(lh_dev); + missed = true; + } + if (lh_map) { + __cpu_map_flush(lh_map); + missed = true; + } + if (lh_xsk) { + __xsk_map_flush(lh_xsk); + missed = true; } + + WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n", + napi->poll); } +#endif DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp) { + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net_device *master, *slave; - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); @@ -4196,9 +4402,9 @@ u32 xdp_master_redirect(struct xdp_buff *xdp) EXPORT_SYMBOL_GPL(xdp_master_redirect); static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, - struct net_device *dev, + const struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; @@ -4219,18 +4425,20 @@ err: return err; } -static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, - struct net_device *dev, - struct xdp_frame *xdpf, - struct bpf_prog *xdp_prog) +static __always_inline int +__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + u32 flags = ri->flags; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (unlikely(!xdpf)) { @@ -4242,11 +4450,20 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - map = READ_ONCE(ri->map); - if (unlikely(map)) { + if (unlikely(flags & BPF_F_BROADCAST)) { + map = READ_ONCE(ri->map); + + /* The map pointer is cleared when the map is being torn + * down by dev_map_free() + */ + if (unlikely(!map)) { + err = -ENOENT; + break; + } + WRITE_ONCE(ri->map, NULL); err = dev_map_enqueue_multi(xdpf, dev, map, - ri->flags & 
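[Annotation] The ordering requirement documented above falls on the driver, not the BPF program. A hypothetical NAPI poll handler showing the required sequence; the mydrv_* names and types are invented for illustration:

static int mydrv_napi_poll(struct napi_struct *napi, int budget)
{
        struct mydrv_ring *ring = container_of(napi, struct mydrv_ring,
                                               napi);
        int work_done = mydrv_clean_rx(ring, budget); /* may run XDP */

        /* Flush DEVMAP/CPUMAP/XSKMAP bulk queues while still inside
         * this NAPI instance; after napi_complete_done() the RCU
         * protection the redirect bookkeeping relies on is gone, and
         * CONFIG_DEBUG_NET's xdp_do_check_flushed() will warn.
         */
        xdp_do_flush();

        if (work_done < budget && napi_complete_done(napi, work_done))
                mydrv_enable_irq(ring);

        return work_done;
}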
BPF_F_EXCLUDE_INGRESS); + flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_enqueue(fwd, xdpf, dev); } @@ -4280,19 +4497,11 @@ err: } int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; - /* XDP_REDIRECT is not fully supported yet for xdp frags since - * not all XDP capable drivers can map non-linear xdp_frame in - * ndo_xdp_xmit. - */ - if (unlikely(xdp_buff_has_frags(xdp) && - map_type != BPF_MAP_TYPE_CPUMAP)) - return -EOPNOTSUPP; - if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); @@ -4302,9 +4511,10 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, - struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) @@ -4317,11 +4527,11 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, - void *fwd, - enum bpf_map_type map_type, u32 map_id) + const struct bpf_prog *xdp_prog, + void *fwd, enum bpf_map_type map_type, + u32 map_id, u32 flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; int err; @@ -4329,11 +4539,20 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - map = READ_ONCE(ri->map); - if (unlikely(map)) { + if (unlikely(flags & BPF_F_BROADCAST)) { + map = READ_ONCE(ri->map); + + /* The map pointer is cleared when the map is being torn + * down by dev_map_free() + */ + if (unlikely(!map)) { + err = -ENOENT; + break; + } + WRITE_ONCE(ri->map, NULL); err = dev_map_redirect_multi(dev, skb, xdp_prog, map, - ri->flags & BPF_F_EXCLUDE_INGRESS); + flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_generic_redirect(fwd, skb, xdp_prog); } @@ -4364,15 +4583,18 @@ err: } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct xdp_buff *xdp, struct bpf_prog *xdp_prog) + struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + u32 flags = ri->flags; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { @@ -4392,7 +4614,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, return 0; } - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id); + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags); err: _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; @@ -4400,7 +4622,7 @@ err: BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { - struct 
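[Annotation] The READ_ONCE(ri->map) dance above only runs when BPF_F_BROADCAST was passed; a NULL read means dev_map_free() tore the map down mid-redirect. From the program side the broadcast path looks like this minimal sketch; the map name and sizing are illustrative:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_DEVMAP);
        __uint(max_entries, 32);
        __type(key, __u32);
        __type(value, __u32);   /* egress ifindex per slot */
} flood_ports SEC(".maps");

SEC("xdp")
int xdp_flood(struct xdp_md *xdp)
{
        /* Key is ignored with BPF_F_BROADCAST: the frame is cloned to
         * every devmap entry except the ingress device.
         */
        return bpf_redirect_map(&flood_ports, 0,
                                BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
}

char _license[] SEC("license") = "GPL";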
bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return XDP_ABORTED; @@ -4537,7 +4759,7 @@ set_compat: to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; if (flags & BPF_F_TUNINFO_FLAGS) - to->tunnel_flags = info->key.tun_flags; + to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags); else to->tunnel_ext = 0; @@ -4580,7 +4802,7 @@ BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) int err; if (unlikely(!info || - !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { + !ip_tunnel_is_options_present(info->key.tun_flags))) { err = -ENOENT; goto err_clear; } @@ -4618,7 +4840,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | - BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER))) + BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER | + BPF_F_NO_TUNNEL_KEY))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -4649,13 +4872,15 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, memset(info, 0, sizeof(*info)); info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; - if (flags & BPF_F_DONT_FRAGMENT) - info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; - if (flags & BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; - if (flags & BPF_F_SEQ_NUMBER) - info->key.tun_flags |= TUNNEL_SEQ; + __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); + __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, + flags & BPF_F_DONT_FRAGMENT); + __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, + !(flags & BPF_F_ZERO_CSUM_TX)); + __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, + flags & BPF_F_SEQ_NUMBER); + __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, + !(flags & BPF_F_NO_TUNNEL_KEY)); info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -4693,13 +4918,15 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); + IP_TUNNEL_DECLARE_FLAGS(present) = { }; if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) return -EINVAL; if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; - ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); + ip_tunnel_set_options_present(present); + ip_tunnel_info_opts_set(info, from, size, present); return 0; } @@ -4944,7 +5171,7 @@ const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { .func = bpf_get_socket_ptr_cookie, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL, }; BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) @@ -4966,6 +5193,17 @@ static u64 __bpf_get_netns_cookie(struct sock *sk) return net->net_cookie; } +BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) +{ + return __bpf_get_netns_cookie(skb && skb->sk ? 
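[Annotation] The tunnel-key setter now maps each BPF_F_* flag onto an IP_TUNNEL_*_BIT via __assign_bit(), including the new BPF_F_NO_TUNNEL_KEY which clears IP_TUNNEL_KEY_BIT. A sketch of the caller on a collect_md tunnel device; addresses and IDs are example values:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int set_tunnel(struct __sk_buff *skb)
{
        struct bpf_tunnel_key key = {
                .remote_ipv4 = 0xac100164,      /* 172.16.1.100, example */
                .tunnel_id   = 42,
                .tunnel_ttl  = 64,
        };

        /* DONT_FRAGMENT sets IP_TUNNEL_DONT_FRAGMENT_BIT,
         * ZERO_CSUM_TX clears IP_TUNNEL_CSUM_BIT; adding
         * BPF_F_NO_TUNNEL_KEY here would clear IP_TUNNEL_KEY_BIT.
         */
        if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
                                   BPF_F_DONT_FRAGMENT |
                                   BPF_F_ZERO_CSUM_TX))
                return TC_ACT_SHOT;
        return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";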
skb->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_proto = { + .func = bpf_get_netns_cookie, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) { return __bpf_get_netns_cookie(ctx); @@ -5032,6 +5270,25 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt) +{ + u32 sk_bpf_cb_flags; + + if (getopt) { + *(u32 *)optval = sk->sk_bpf_cb_flags; + return 0; + } + + sk_bpf_cb_flags = *(u32 *)optval; + + if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK) + return -EINVAL; + + sk->sk_bpf_cb_flags = sk_bpf_cb_flags; + + return 0; +} + static int sol_socket_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) @@ -5048,6 +5305,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname, case SO_MAX_PACING_RATE: case SO_BINDTOIFINDEX: case SO_TXREHASH: + case SK_BPF_CB_FLAGS: if (*optlen != sizeof(int)) return -EINVAL; break; @@ -5057,6 +5315,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname, return -EINVAL; } + if (optname == SK_BPF_CB_FLAGS) + return sk_bpf_set_get_cb_flags(sk, optval, getopt); + if (getopt) { if (optname == SO_BINDTODEVICE) return -EINVAL; @@ -5069,6 +5330,38 @@ static int sol_socket_sockopt(struct sock *sk, int optname, KERNEL_SOCKPTR(optval), *optlen); } +static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname, + char *optval, int optlen) +{ + if (optlen != sizeof(int)) + return -EINVAL; + + switch (optname) { + case TCP_BPF_SOCK_OPS_CB_FLAGS: { + int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags; + + memcpy(optval, &cb_flags, optlen); + break; + } + case TCP_BPF_RTO_MIN: { + int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min); + + memcpy(optval, &rto_min_us, optlen); + break; + } + case TCP_BPF_DELACK_MAX: { + int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max); + + memcpy(optval, &delack_max_us, optlen); + break; + } + default: + return -EINVAL; + } + + return 0; +} + static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, char *optval, int optlen) { @@ -5108,6 +5401,11 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, return -EINVAL; inet_csk(sk)->icsk_rto_min = timeout; break; + case TCP_BPF_SOCK_OPS_CB_FLAGS: + if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS)) + return -EINVAL; + tp->bpf_sock_ops_cb_flags = val; + break; default: return -EINVAL; } @@ -5172,7 +5470,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { - if (sk->sk_prot->setsockopt != tcp_setsockopt) + if (sk->sk_protocol != IPPROTO_TCP) return -EINVAL; switch (optname) { @@ -5187,6 +5485,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, case TCP_USER_TIMEOUT: case TCP_NOTSENT_LOWAT: case TCP_SAVE_SYN: + case TCP_RTO_MAX_MS: if (*optlen != sizeof(int)) return -EINVAL; break; @@ -5198,7 +5497,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, break; default: if (getopt) - return -EINVAL; + return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen); return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); } @@ -5294,6 +5593,11 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; } +static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock) +{ + return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB; +} + static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int 
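[Annotation] bpf_sol_tcp_getsockopt() above makes the TCP_BPF_* values readable, where previously only the set direction was wired up. A sockops sketch reading the effective minimum RTO back; SOL_TCP is defined locally since BPF objects do not pull in libc headers:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_TCP
#define SOL_TCP 6
#endif

SEC("sockops")
int sockops_show_rto(struct bpf_sock_ops *skops)
{
        int rto_min_us = 0;

        /* Established callbacks are within the locked-sock op range
         * gated by is_locked_tcp_sock_ops().
         */
        if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB &&
            !bpf_getsockopt(skops, SOL_TCP, TCP_BPF_RTO_MIN,
                            &rto_min_us, sizeof(rto_min_us)))
                bpf_printk("rto_min: %d us", rto_min_us);
        return 1;
}

char _license[] SEC("license") = "GPL";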
optlen) { @@ -5441,9 +5745,83 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk, + char *optval, int optlen, + bool getopt) +{ + int val; + + if (optlen != sizeof(int)) + return -EINVAL; + + if (!sk_has_account(sk)) + return -EOPNOTSUPP; + + if (getopt) { + *(int *)optval = sk->sk_bypass_prot_mem; + return 0; + } + + val = *(int *)optval; + if (val < 0 || val > 1) + return -EINVAL; + + sk->sk_bypass_prot_mem = val; + return 0; +} + +BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) + return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false); + + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = { + .func = bpf_sock_create_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) { + int err = sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true); + + if (err) + memset(optval, 0, optlen); + + return err; + } + + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = { + .func = bpf_sock_create_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } @@ -5529,6 +5907,9 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { int ret, copy_len = 0; @@ -5571,6 +5952,9 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; @@ -5606,7 +5990,7 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, return err; if (((struct sockaddr_in *)addr)->sin_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; - return __inet_bind(sk, addr, addr_len, flags); + return __inet_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) @@ -5616,7 +6000,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded 
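[Annotation] The two wrappers above are only handed out for BPF_CGROUP_INET_SOCK_CREATE (see the expected_attach_type switch further down), so SK_BPF_BYPASS_PROT_MEM, a constant introduced by this series, can only be toggled while the socket is being created. A sketch of the caller; SOL_SOCKET is defined locally for the BPF object:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_SOCKET
#define SOL_SOCKET 1
#endif

SEC("cgroup/sock_create")
int sock_create_bypass(struct bpf_sock *sk)
{
        int one = 1;

        /* Opt this socket out of protocol memory accounting; the
         * kernel returns -EOPNOTSUPP for protocols without accounting.
         */
        bpf_setsockopt(sk, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM,
                       &one, sizeof(one));
        return 1;       /* allow socket creation */
}

char _license[] SEC("license") = "GPL";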
*/ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); + return ipv6_bpf_stub->inet6_bind(sk, (struct sockaddr_unsized *)addr, + addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ @@ -5689,12 +6074,8 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { #endif #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) -static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, - const struct neighbour *neigh, - const struct net_device *dev, u32 mtu) +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu) { - memcpy(params->dmac, neigh->ha, ETH_ALEN); - memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; if (mtu) @@ -5733,7 +6114,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } - fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; + fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; @@ -5748,13 +6129,22 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; struct fib_table *tb; + if (flags & BPF_FIB_LOOKUP_TBID) { + tbid = params->tbid; + /* zero out for vlan output */ + params->tbid = 0; + } + tb = fib_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { - fl4.flowi4_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl4.flowi4_mark = params->mark; + else + fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); @@ -5799,27 +6189,38 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, params->rt_metric = res.fi->fib_priority; params->ifindex = dev->ifindex; + if (flags & BPF_FIB_LOOKUP_SRC) + params->ipv4_src = fib_result_prefsrc(net, &res); + /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here */ if (likely(nhc->nhc_gw_family != AF_INET6)) { if (nhc->nhc_gw_family) params->ipv4_dst = nhc->nhc_gw.ipv4; - - neigh = __ipv4_neigh_lookup_noref(dev, - (__force u32)params->ipv4_dst); } else { struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst; params->family = AF_INET6; *dst = nhc->nhc_gw.ipv6; - neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); } - if (!neigh) + if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) + goto set_fwd_params; + + if (likely(nhc->nhc_gw_family != AF_INET6)) + neigh = __ipv4_neigh_lookup_noref(dev, + (__force u32)params->ipv4_dst); + else + neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst); + + if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); - return bpf_fib_set_fwd_params(params, neigh, dev, mtu); +set_fwd_params: + return bpf_fib_set_fwd_params(params, mtu); } #endif @@ -5847,7 +6248,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, return -ENODEV; idev = __in6_dev_get_safely(dev); - if (unlikely(!idev || !idev->cnf.forwarding)) + if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { @@ -5873,6 +6274,12 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; struct fib6_table *tb; + if (flags & BPF_FIB_LOOKUP_TBID) { + tbid = params->tbid; + /* zero out for vlan output */ + params->tbid = 0; + } + tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; @@ -5880,7 +6287,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { - fl6.flowi6_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl6.flowi6_mark = params->mark; + else + fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); @@ -5927,24 +6337,46 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, params->rt_metric = res.f6i->fib6_metric; params->ifindex = dev->ifindex; + if (flags & BPF_FIB_LOOKUP_SRC) { + if (res.f6i->fib6_prefsrc.plen) { + *src = res.f6i->fib6_prefsrc.addr; + } else { + err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev, + &fl6.daddr, 0, + src); + if (err) + return BPF_FIB_LKUP_RET_NO_SRC_ADDR; + } + } + + if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) + goto set_fwd_params; + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. */ neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); - if (!neigh) + if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); - return bpf_fib_set_fwd_params(params, neigh, dev, mtu); +set_fwd_params: + return bpf_fib_set_fwd_params(params, mtu); } #endif +#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ + BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ + BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) + BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) { if (plen < sizeof(*params)) return -EINVAL; - if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) + if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; switch (params->family) { @@ -5982,7 +6414,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, if (plen < sizeof(*params)) return -EINVAL; - if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) + if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; if (params->tot_len) @@ -6044,12 +6476,10 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, { int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; struct net_device *dev = skb->dev; - int skb_len, dev_len; - int mtu; + int mtu, dev_len, skb_len; if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) return -EINVAL; - if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len))) return -EINVAL; @@ -6058,7 +6488,6 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, return -ENODEV; mtu = READ_ONCE(dev->mtu); - dev_len = mtu + dev->hard_header_len; /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ @@ -6076,15 +6505,15 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, */ if (skb_is_gso(skb)) { ret = BPF_MTU_CHK_RET_SUCCESS; - - if (flags & BPF_MTU_CHK_SEGS && - !skb_gso_validate_network_len(skb, mtu)) - ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + if (flags & BPF_MTU_CHK_SEGS) { + if (!skb_transport_header_was_set(skb)) + return -EINVAL; + if (!skb_gso_validate_network_len(skb, mtu)) + ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + } } out: - /* BPF verifier guarantees valid pointer */ *mtu_len = mtu; - return ret; } @@ -6105,8 +6534,6 @@ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, return 
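[Annotation] BPF_FIB_LOOKUP_MASK above now admits five opt-in behaviours on both the v4 and v6 paths. A sketch of an XDP forwarder that wants only the route decision and resolves neighbours itself, hence SKIP_NEIGH; header parsing is elided and AF_INET is defined locally:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef AF_INET
#define AF_INET 2
#endif

SEC("xdp")
int xdp_route_only(struct xdp_md *xdp)
{
        struct bpf_fib_lookup params = {};
        long rc;

        params.family  = AF_INET;
        params.ifindex = xdp->ingress_ifindex;
        /* ... fill ipv4_src/ipv4_dst/tot_len from the parsed header;
         * setting BPF_FIB_LOOKUP_TBID/_MARK would also require
         * params.tbid or params.mark here ...
         */

        rc = bpf_fib_lookup(xdp, &params, sizeof(params),
                            BPF_FIB_LOOKUP_SKIP_NEIGH);
        if (rc == BPF_FIB_LKUP_RET_SUCCESS)
                return bpf_redirect(params.ifindex, 0);
        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";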
-ENODEV; mtu = READ_ONCE(dev->mtu); - - /* Add L2-header as dev MTU is L3 size */ dev_len = mtu + dev->hard_header_len; /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ @@ -6117,9 +6544,7 @@ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, if (xdp_len > dev_len) ret = BPF_MTU_CHK_RET_FRAG_NEEDED; - /* BPF verifier guarantees valid pointer */ *mtu_len = mtu; - return ret; } @@ -6129,7 +6554,8 @@ static const struct bpf_func_proto bpf_skb_check_mtu_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_INT, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, + .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6140,7 +6566,8 @@ static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_INT, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, + .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6249,6 +6676,7 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, void *srh_tlvs, *srh_end, *ptr; int srhoff = 0; + lockdep_assert_held(&srh_state->bh_lock); if (srh == NULL) return -EINVAL; @@ -6305,6 +6733,7 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, int hdroff = 0; int err; + lockdep_assert_held(&srh_state->bh_lock); switch (action) { case SEG6_LOCAL_ACTION_END_X: if (!seg6_bpf_has_valid_srh(skb)) @@ -6381,6 +6810,7 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, int srhoff = 0; int ret; + lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return -EINVAL; @@ -6434,7 +6864,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; bool refcounted = false; struct sock *sk = NULL; @@ -6443,7 +6872,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, hinfo, NULL, 0, + sk = __inet_lookup(net, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); @@ -6457,7 +6886,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, hinfo, NULL, 0, + sk = __inet6_lookup(net, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); @@ -6483,12 +6912,11 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, static struct sock * __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, - u64 flags) + u64 flags, int sdif) { struct sock *sk = NULL; struct net *net; u8 family; - int sdif; if (len == sizeof(tuple->ipv4)) family = AF_INET; @@ -6500,10 +6928,12 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; - if (family == AF_INET) - sdif = inet_sdif(skb); - else - sdif = inet6_sdif(skb); + if (sdif < 0) { + if (family == AF_INET) + sdif = inet_sdif(skb); + else + sdif = inet6_sdif(skb); + } if ((s32)netns_id < 0) { net = 
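[Annotation] After the cleanup above, both MTU helpers unconditionally write the device MTU back through mtu_len, which is why the argument proto changes to a fixed-size, aligned, writable u32. A caller-side sketch; the 50-byte growth is an example encapsulation size:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int mtu_guard(struct __sk_buff *skb)
{
        __u32 mtu_len = 0;      /* out: filled with the device MTU */

        /* ifindex 0 means "use skb->dev"; len_diff models the bytes an
         * upcoming encap would add.
         */
        if (bpf_check_mtu(skb, 0, &mtu_len, 50, 0))
                return TC_ACT_SHOT;     /* would exceed the MTU */
        return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";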
caller_net; @@ -6523,10 +6953,11 @@ out: static struct sock * __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, - u64 flags) + u64 flags, int sdif) { struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, - ifindex, proto, netns_id, flags); + ifindex, proto, netns_id, flags, + sdif); if (sk) { struct sock *sk2 = sk_to_full_sk(sk); @@ -6534,8 +6965,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ - if (!sk_fullsock(sk2)) - sk2 = NULL; if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ @@ -6566,7 +6995,7 @@ bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, } return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, - netns_id, flags); + netns_id, flags, -1); } static struct sock * @@ -6582,8 +7011,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ - if (!sk_fullsock(sk2)) - sk2 = NULL; if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ @@ -6612,7 +7039,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6631,7 +7058,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6650,7 +7077,79 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + struct net_device *dev = skb->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { + .func = bpf_tc_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + struct net_device *dev = skb->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto 
bpf_tc_sk_lookup_tcp_proto = { + .func = bpf_tc_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + struct net_device *dev = skb->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_UDP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { + .func = bpf_tc_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6672,12 +7171,13 @@ static const struct bpf_func_proto bpf_sk_release_proto = { BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { - struct net *caller_net = dev_net(ctx->rxq->dev); - int ifindex = ctx->rxq->dev->ifindex; + struct net_device *dev = ctx->rxq->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, - flags); + flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { @@ -6687,7 +7187,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6695,12 +7195,13 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { - struct net *caller_net = dev_net(ctx->rxq->dev); - int ifindex = ctx->rxq->dev->ifindex; + struct net_device *dev = ctx->rxq->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, - flags); + flags, sdif); } static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { @@ -6710,7 +7211,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6718,12 +7219,13 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { - struct net *caller_net = dev_net(ctx->rxq->dev); - int ifindex = ctx->rxq->dev->ifindex; + struct net_device *dev = ctx->rxq->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, - 
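[Annotation] The tc-specific lookup variants above exist so the sdif can be taken from skb->dev instead of being re-derived per packet; programs keep calling the same bpf_sk_lookup_tcp() interface. A minimal sketch with tuple parsing elided:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int find_sk(struct __sk_buff *skb)
{
        struct bpf_sock_tuple tuple = {};
        struct bpf_sock *sk;

        /* ... fill tuple.ipv4 from the parsed packet headers ... */
        sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                               BPF_F_CURRENT_NETNS, 0);
        if (!sk)
                return TC_ACT_OK;
        /* ... inspect the socket here ... */
        bpf_sk_release(sk);     /* required: the ref is verifier-tracked */
        return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";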
flags); + flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { @@ -6733,7 +7235,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6743,7 +7245,8 @@ BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, { return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, - IPPROTO_TCP, netns_id, flags); + IPPROTO_TCP, netns_id, flags, + -1); } static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { @@ -6752,7 +7255,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6762,7 +7265,7 @@ BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, - netns_id, flags); + netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { @@ -6771,7 +7274,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6781,7 +7284,7 @@ BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_UDP, - netns_id, flags); + netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { @@ -6790,7 +7293,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6844,8 +7347,7 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, FIELD)); \ } while (0) - if (insn > insn_buf) - return insn - insn_buf; + BTF_TYPE_EMIT(struct bpf_tcp_sock); switch (si->off) { case offsetof(struct bpf_tcp_sock, rtt_min): @@ -6957,7 +7459,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) { sk = sk_to_full_sk(sk); - if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) + if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) return (unsigned long)sk; return (unsigned long)NULL; @@ -7025,6 +7527,8 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_sock, FIELD)); \ } while (0) + BTF_TYPE_EMIT(struct bpf_xdp_sock); + switch (si->off) { case offsetof(struct bpf_xdp_sock, queue_id): BPF_XDP_SOCK_GET(queue_id); @@ -7045,7 +7549,6 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES - u32 cookie; int ret; if (unlikely(!sk || th_len < sizeof(*th))) @@ -7067,8 +7570,6 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (tcp_synq_no_recent_overflow(sk)) 
return -ENOENT; - cookie = ntohl(th->ack_seq) - 1; - /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). */ @@ -7077,7 +7578,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; - ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); + ret = __cookie_v4_check((struct iphdr *)iph, th); break; #if IS_BUILTIN(CONFIG_IPV6) @@ -7088,7 +7589,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (sk->sk_family != AF_INET6) return -EINVAL; - ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); + ret = __cookie_v6_check((struct ipv6hdr *)iph, th); break; #endif /* CONFIG_IPV6 */ @@ -7194,8 +7695,8 @@ BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) return -EOPNOTSUPP; if (unlikely(dev_net(skb->dev) != sock_net(sk))) return -ENETUNREACH; - if (unlikely(sk_fullsock(sk) && sk->sk_reuseport)) - return -ESOCKTNOSUPPORT; + if (sk_unhashed(sk)) + return -EOPNOTSUPP; if (sk_is_refcounted(sk) && unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) return -ENOENT; @@ -7268,6 +7769,9 @@ BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, u8 search_kind, search_len, copy_len, magic_len; int ret; + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + /* 2 byte is the minimal option len except TCPOPT_NOP and * TCPOPT_EOL which are useless for the bpf prog to learn * and this helper disallow loading them also. @@ -7333,7 +7837,7 @@ static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -7451,17 +7955,21 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, return -EOPNOTSUPP; switch (tstamp_type) { - case BPF_SKB_TSTAMP_DELIVERY_MONO: + case BPF_SKB_CLOCK_REALTIME: + skb->tstamp = tstamp; + skb->tstamp_type = SKB_CLOCK_REALTIME; + break; + case BPF_SKB_CLOCK_MONOTONIC: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; - skb->mono_delivery_time = 1; + skb->tstamp_type = SKB_CLOCK_MONOTONIC; break; - case BPF_SKB_TSTAMP_UNSPEC: - if (tstamp) + case BPF_SKB_CLOCK_TAI: + if (!tstamp) return -EINVAL; - skb->tstamp = 0; - skb->mono_delivery_time = 0; + skb->tstamp = tstamp; + skb->tstamp_type = SKB_CLOCK_TAI; break; default: return -EINVAL; @@ -7503,7 +8011,7 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct iphdr), .arg2_type = ARG_PTR_TO_MEM, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, @@ -7535,15 +8043,13 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct ipv6hdr), .arg2_type = ARG_PTR_TO_MEM, - .arg3_type = ARG_CONST_SIZE, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th) { - u32 cookie = ntohl(th->ack_seq) - 1; - - if (__cookie_v4_check(iph, th, cookie) > 0) + if (__cookie_v4_check(iph, th) > 0) return 0; return -EACCES; @@ -7564,9 +8070,7 @@ BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th) { #if 
IS_BUILTIN(CONFIG_IPV6) - u32 cookie = ntohl(th->ack_seq) - 1; - - if (__cookie_v6_check(iph, th, cookie) > 0) + if (__cookie_v6_check(iph, th) > 0) return 0; return -EACCES; @@ -7589,42 +8093,37 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { #endif /* CONFIG_INET */ -bool bpf_helper_changes_pkt_data(void *func) -{ - if (func == bpf_skb_vlan_push || - func == bpf_skb_vlan_pop || - func == bpf_skb_store_bytes || - func == bpf_skb_change_proto || - func == bpf_skb_change_head || - func == sk_skb_change_head || - func == bpf_skb_change_tail || - func == sk_skb_change_tail || - func == bpf_skb_adjust_room || - func == sk_skb_adjust_room || - func == bpf_skb_pull_data || - func == sk_skb_pull_data || - func == bpf_clone_redirect || - func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data || - func == bpf_msg_push_data || - func == bpf_msg_pop_data || - func == bpf_xdp_adjust_tail || -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) - func == bpf_lwt_seg6_store_bytes || - func == bpf_lwt_seg6_adjust_srh || - func == bpf_lwt_seg6_action || -#endif -#ifdef CONFIG_INET - func == bpf_sock_ops_store_hdr_opt || -#endif - func == bpf_lwt_in_push_encap || - func == bpf_lwt_xmit_push_encap) +bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_clone_redirect: + case BPF_FUNC_l3_csum_replace: + case BPF_FUNC_l4_csum_replace: + case BPF_FUNC_lwt_push_encap: + case BPF_FUNC_lwt_seg6_action: + case BPF_FUNC_lwt_seg6_adjust_srh: + case BPF_FUNC_lwt_seg6_store_bytes: + case BPF_FUNC_msg_pop_data: + case BPF_FUNC_msg_pull_data: + case BPF_FUNC_msg_push_data: + case BPF_FUNC_skb_adjust_room: + case BPF_FUNC_skb_change_head: + case BPF_FUNC_skb_change_proto: + case BPF_FUNC_skb_change_tail: + case BPF_FUNC_skb_pull_data: + case BPF_FUNC_skb_store_bytes: + case BPF_FUNC_skb_vlan_pop: + case BPF_FUNC_skb_vlan_push: + case BPF_FUNC_store_hdr_opt: + case BPF_FUNC_xdp_adjust_head: + case BPF_FUNC_xdp_adjust_meta: + case BPF_FUNC_xdp_adjust_tail: + /* tail-called program could call any of the above */ + case BPF_FUNC_tail_call: return true; - - return false; + default: + return false; + } } const struct bpf_func_proto bpf_event_output_data_proto __weak; @@ -7639,10 +8138,6 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; @@ -7654,8 +8149,22 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + return &bpf_sock_create_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + return &bpf_sock_create_getsockopt_proto; + default: + return NULL; + } default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -7668,10 +8177,6 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return 
func_proto; - switch (func_id) { case BPF_FUNC_bind: switch (prog->expected_attach_type) { @@ -7707,14 +8212,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_setsockopt_proto; default: return NULL; @@ -7725,20 +8235,25 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_getsockopt_proto; default: return NULL; } default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -7752,12 +8267,14 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -7883,6 +8400,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_under_cgroup_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: @@ -7911,9 +8430,9 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: - return &bpf_sk_lookup_tcp_proto; + return &bpf_tc_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: - return &bpf_sk_lookup_udp_proto; + return &bpf_tc_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_tcp_sock: @@ -7921,7 +8440,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skc_lookup_tcp: - return &bpf_skc_lookup_tcp_proto; + return &bpf_tc_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce: @@ -7944,7 +8463,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8003,13 +8522,13 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif 
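The new BPF_CGROUP_UNIX_* attach types above extend the cgroup sock_addr hooks to AF_UNIX sockets, and pair with the bpf_sock_addr_set_sun_path() kfunc added further down in this patch. A minimal sketch of a program using both, assuming the declarations used by the kernel selftests (bpf_cast_to_kern_ctx() and the libbpf __ksym convention); the socket path and program name are illustrative:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

void *bpf_cast_to_kern_ctx(void *obj) __ksym;
int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
			       const __u8 *sun_path, __u32 sun_path__sz) __ksym;

/* Abstract socket name; leading NUL byte per AF_UNIX convention. */
const __u8 proxy_path[] = "\0proxy-socket";

SEC("cgroup/connect_unix")
int redirect_unix(struct bpf_sock_addr *ctx)
{
	struct bpf_sock_addr_kern *sa_kern = bpf_cast_to_kern_ctx(ctx);

	/* Fails with -EINVAL for a non-AF_UNIX socket or a zero/oversized
	 * path, in which case the original address is left untouched. */
	bpf_sock_addr_set_sun_path(sa_kern, proxy_path, sizeof(proxy_path) - 1);

	return 1; /* allow the connect() to proceed */
}

char _license[] SEC("license") = "GPL";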
default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The * kfuncs are defined in two different modules, and we want to be able - * to use them interchangably with the same BTF type ID. Because modules + * to use them interchangeably with the same BTF type ID. Because modules * can't de-duplicate BTF IDs between each other, we need the type to be * referenced in the vmlinux BTF or the verifier will get confused about * the different types. So we add this dummy type reference which will @@ -8064,7 +8583,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8091,28 +8610,14 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pop_data_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sk_msg_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8156,7 +8661,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_lookup_tcp_proto; #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8167,7 +8672,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8194,7 +8699,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8283,13 +8788,16 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, data_end): + if (info->is_ldsx || size != size_default) + return false; + break; case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): - case bpf_ctx_range(struct __sk_buff, data): - case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, data_end): if 
(size != size_default) return false; break; @@ -8303,7 +8811,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != sizeof(__u64)) return false; break; - case offsetof(struct __sk_buff, sk): + case bpf_ctx_range_ptr(struct __sk_buff, sk): if (type == BPF_WRITE || size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; @@ -8369,7 +8877,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; } @@ -8381,7 +8889,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; default: @@ -8684,23 +9192,18 @@ EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + int off, int size); EXPORT_SYMBOL_GPL(nfct_btf_struct_access); static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag) + int off, int size) { int ret = -EACCES; - if (atype == BPF_READ) - return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); - mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) - ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); + ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; @@ -8731,13 +9234,21 @@ static bool xdp_is_valid_access(int off, int size, } if (type == BPF_WRITE) { - if (bpf_prog_is_dev_bound(prog->aux)) { + if (bpf_prog_is_offloaded(prog->aux)) { switch (off) { case offsetof(struct xdp_md, rx_queue_index): return __is_valid_xdp_access(off, size); } } return false; + } else { + switch (off) { + case offsetof(struct xdp_md, data_meta): + case offsetof(struct xdp_md, data): + case offsetof(struct xdp_md, data_end): + if (info->is_ldsx) + return false; + } } switch (off) { @@ -8755,7 +9266,8 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act) +void bpf_warn_invalid_xdp_action(const struct net_device *dev, + const struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; @@ -8767,17 +9279,13 @@ EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static int xdp_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag) + int off, int size) { int ret = -EACCES; - if (atype == BPF_READ) - return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); - mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) - ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); + ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; @@ -8795,8 +9303,8 @@ static bool sock_addr_is_valid_access(int off, int size, if (off % size != 0) return false; - /* Disallow access to IPv6 fields from IPv4 
contex and vise - * versa. + /* Disallow access to fields not belonging to the attach type's address + * family. */ switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): @@ -8881,20 +9389,24 @@ static bool sock_addr_is_valid_access(int off, int size, return false; } break; - case offsetof(struct bpf_sock_addr, sk): + case bpf_ctx_range_ptr(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; - default: - if (type == BPF_READ) { - if (size != size_default) - return false; - } else { + case bpf_ctx_range(struct bpf_sock_addr, user_family): + case bpf_ctx_range(struct bpf_sock_addr, family): + case bpf_ctx_range(struct bpf_sock_addr, type): + case bpf_ctx_range(struct bpf_sock_addr, protocol): + if (type != BPF_READ) return false; - } + if (size != size_default) + return false; + break; + default: + return false; } return true; @@ -8931,17 +9443,17 @@ static bool sock_ops_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; - case offsetof(struct bpf_sock_ops, sk): + case bpf_ctx_range_ptr(struct bpf_sock_ops, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; - case offsetof(struct bpf_sock_ops, skb_data): + case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; - case offsetof(struct bpf_sock_ops, skb_data_end): + case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; @@ -8950,7 +9462,7 @@ static bool sock_ops_is_valid_access(int off, int size, bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); - case offsetof(struct bpf_sock_ops, skb_hwtstamp): + case bpf_ctx_range(struct bpf_sock_ops, skb_hwtstamp): if (size != sizeof(__u64)) return false; break; @@ -9020,17 +9532,17 @@ static bool sk_msg_is_valid_access(int off, int size, return false; switch (off) { - case offsetof(struct sk_msg_md, data): + case bpf_ctx_range_ptr(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; if (size != sizeof(__u64)) return false; break; - case offsetof(struct sk_msg_md, data_end): + case bpf_ctx_range_ptr(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; if (size != sizeof(__u64)) return false; break; - case offsetof(struct sk_msg_md, sk): + case bpf_ctx_range_ptr(struct sk_msg_md, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; @@ -9062,17 +9574,20 @@ static bool flow_dissector_is_valid_access(int off, int size, if (off < 0 || off >= sizeof(struct __sk_buff)) return false; + if (off % size != 0) + return false; + if (type == BPF_WRITE) return false; switch (off) { case bpf_ctx_range(struct __sk_buff, data): - if (size != size_default) + if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET; return true; case bpf_ctx_range(struct __sk_buff, data_end): - if (size != size_default) + if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET_END; return true; @@ -9123,16 +9638,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; - /* AX is needed because src_reg and dst_reg could be the same */ - __u8 tmp_reg = BPF_REG_AX; - - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - PKT_VLAN_PRESENT_OFFSET); - *insn++ = 
BPF_JMP32_IMM(BPF_JSET, tmp_reg, - SKB_MONO_DELIVERY_TIME_MASK, 2); - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); - *insn++ = BPF_JMP_A(1); - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); + BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); + BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); + BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); + BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); + *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); +#else + BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); +#endif return insn; } @@ -9165,7 +9681,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, read skb->tstamp as is if tstamp_type_access is true. @@ -9174,12 +9690,13 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); - /* skb->tc_at_ingress && skb->mono_delivery_time, + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); + /* check if the ingress mask bit is set */ + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); + *insn++ = BPF_JMP_A(4); + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); + *insn++ = BPF_JMP_A(2); + /* skb->tc_at_ingress && skb->tstamp_type, * read 0 as the (rcv) timestamp. */ *insn++ = BPF_MOV64_IMM(value_reg, 0); @@ -9199,33 +9716,37 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the - * mono_delivery_time bit also. + * skb->tstamp_type bit also.
*/ if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /* Writing __sk_buff->tstamp as ingress, goto <clear> */ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); /* goto <store> */ *insn++ = BPF_JMP_A(2); - /* <clear>: mono_delivery_time */ - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, PKT_VLAN_PRESENT_OFFSET); + /* <clear>: skb->tstamp_type */ + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); + *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); } #endif /* <store>: skb->tstamp = tstamp */ - *insn++ = BPF_STX_MEM(BPF_DW, skb_reg, value_reg, - offsetof(struct sk_buff, tstamp)); + *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM, + skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm); return insn; } +#define BPF_EMIT_STORE(size, si, off) \ + BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM, \ + (si)->dst_reg, (si)->src_reg, (off), (si)->imm) + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -9255,9 +9776,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, priority): if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, priority, 4, - target_size)); + *insn++ = BPF_EMIT_STORE(BPF_W, si, + bpf_target_off(struct sk_buff, priority, 4, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, priority, 4, @@ -9288,9 +9809,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, mark): if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, mark, 4, - target_size)); + *insn++ = BPF_EMIT_STORE(BPF_W, si, + bpf_target_off(struct sk_buff, mark, 4, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, mark, 4, @@ -9309,11 +9830,16 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, queue_mapping): if (type == BPF_WRITE) { - *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); - *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - queue_mapping, - 2, target_size)); + u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); + + if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { + *insn++ = BPF_JMP_A(0); /* noop */ + break; + } + + if (BPF_CLASS(si->code) == BPF_STX) + *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); + *insn++ = BPF_EMIT_STORE(BPF_H, si, offset); } else { *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, @@ -9349,8 +9875,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, - si->src_reg, off); + *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); @@ -9365,8 +9890,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, off += offsetof(struct qdisc_skb_cb, tc_classid); 
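The bpf_skb_set_tstamp() rework above replaces the old BPF_SKB_TSTAMP_* values with BPF_SKB_CLOCK_{REALTIME,MONOTONIC,TAI}, and the instruction-rewriting helpers now track the two-bit skb->tstamp_type instead of the single mono_delivery_time flag. A sketch of a tc egress program exercising the new TAI clock type, assuming standard libbpf headers; the TC_ACT_* constants are defined locally since they are preprocessor macros and not visible through vmlinux.h:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define TC_ACT_OK	0
#define TC_ACT_SHOT	2

SEC("tc")
int stamp_tai(struct __sk_buff *skb)
{
	__u64 now = bpf_ktime_get_tai_ns();

	/* Per the switch above: MONOTONIC and TAI reject a zero
	 * timestamp with -EINVAL, while REALTIME accepts any value. */
	if (bpf_skb_set_tstamp(skb, now, BPF_SKB_CLOCK_TAI))
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";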
*target_size = 2; if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, - si->src_reg, off); + *insn++ = BPF_EMIT_STORE(BPF_H, si, off); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, off); @@ -9399,9 +9923,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, tc_index, 2, - target_size)); + *insn++ = BPF_EMIT_STORE(BPF_H, si, + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, tc_index, 2, @@ -9602,8 +10126,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, sk_bound_dev_if)); + *insn++ = BPF_EMIT_STORE(BPF_W, si, + offsetof(struct sock, sk_bound_dev_if)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); @@ -9613,8 +10137,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, sk_mark)); + *insn++ = BPF_EMIT_STORE(BPF_W, si, + offsetof(struct sock, sk_mark)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_mark)); @@ -9624,8 +10148,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, sk_priority)); + *insn++ = BPF_EMIT_STORE(BPF_W, si, + offsetof(struct sock, sk_priority)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_priority)); @@ -9890,10 +10414,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(S, TF)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ - *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg, \ + *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \ + tmp_reg, si->src_reg, \ bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ - + OFF); \ + + OFF, \ + si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ offsetof(S, TF)); \ } while (0) @@ -9910,10 +10436,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, } \ } while (0) -#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ - S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) - static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -10031,10 +10553,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ - is_fullsock), \ + is_locked_tcp_sock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ + is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ @@ -10119,18 +10641,20 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct 
bpf_sock_ops_kern, \ - is_fullsock), \ + is_locked_tcp_sock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ + is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ - reg, si->src_reg, \ - offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \ + BPF_MEM | BPF_CLASS(si->code), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD), \ + si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ @@ -10144,9 +10668,6 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) - if (insn > insn_buf) - return insn - insn_buf; - switch (si->off) { case offsetof(struct bpf_sock_ops, op): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, @@ -10165,8 +10686,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, off -= offsetof(struct bpf_sock_ops, replylong[0]); off += offsetof(struct bpf_sock_ops_kern, replylong[0]); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - off); + *insn++ = BPF_EMIT_STORE(BPF_W, si, off); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); @@ -10523,8 +11043,7 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, off += offsetof(struct sk_buff, cb); off += offsetof(struct sk_skb_cb, data); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, - si->src_reg, off); + *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); @@ -10765,7 +11284,6 @@ const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { }; const struct bpf_prog_ops lwt_seg6local_prog_ops = { - .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops cg_sock_verifier_ops = { @@ -10928,6 +11446,7 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; struct sock_reuseport *reuse; struct sock *selected_sk; + int err; selected_sk = map->ops->map_lookup_elem(map, key); if (!selected_sk) @@ -10935,10 +11454,6 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { - /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ - if (sk_is_refcounted(selected_sk)) - sock_put(selected_sk); - /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. @@ -10946,24 +11461,33 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, * Other maps (e.g. sock_map) do not provide this guarantee and * the sk may never be in the reuseport group to begin with. */ - return is_sockarray ? -ENOENT : -EINVAL; + err = is_sockarray ? -ENOENT : -EINVAL; + goto error; } if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { struct sock *sk = reuse_kern->sk; - if (sk->sk_protocol != selected_sk->sk_protocol) - return -EPROTOTYPE; - else if (sk->sk_family != selected_sk->sk_family) - return -EAFNOSUPPORT; - - /* Catch all. Likely bound to a different sockaddr. 
*/ - return -EBADFD; + if (sk->sk_protocol != selected_sk->sk_protocol) { + err = -EPROTOTYPE; + } else if (sk->sk_family != selected_sk->sk_family) { + err = -EAFNOSUPPORT; + } else { + /* Catch all. Likely bound to a different sockaddr. */ + err = -EBADFD; + } + goto error; } reuse_kern->selected_sk = selected_sk; return 0; +error: + /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ + if (sk_is_refcounted(selected_sk)) + sock_put(selected_sk); + + return err; } static const struct bpf_func_proto sk_select_reuseport_proto = { @@ -11028,7 +11552,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -11210,7 +11734,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -11227,7 +11751,7 @@ static bool sk_lookup_is_valid_access(int off, int size, return false; switch (off) { - case offsetof(struct bpf_sk_lookup, sk): + case bpf_ctx_range_ptr(struct bpf_sk_lookup, sk): info->reg_type = PTR_TO_SOCKET_OR_NULL; return size == sizeof(__u64); @@ -11544,7 +12068,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = { }; static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id) +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func; @@ -11573,11 +12097,482 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } - if (!perfmon_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; return func; } + +/** + * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area. + * @skb: socket buffer carrying the metadata + * @offset: offset into the metadata area, must be <= skb_metadata_len() + */ +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; +} + +int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, + const void *from, u32 len, u64 flags) +{ + if (unlikely(flags)) + return -EINVAL; + if (unlikely(bpf_try_make_writable(skb, 0))) + return -EFAULT; + + memmove(bpf_skb_meta_pointer(skb, offset), from, len); + return 0; +} + +__bpf_kfunc_start_defs(); +__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct sk_buff *skb = (struct sk_buff *)s; + + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); + + return 0; +} + +/** + * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area. + * @skb_: socket buffer carrying the metadata + * @flags: future use, must be zero + * @ptr__uninit: dynptr to initialize + * + * Set up a dynptr for access to the metadata area earlier allocated from the + * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to + * &__sk_buff->data_meta. 
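As a usage sketch for this kfunc (its return values continue below): a tc program can read values that an earlier XDP stage stashed via bpf_xdp_adjust_meta(). The __ksym declaration mirrors the definition in this patch and the error codes follow bpf_dynptr_read(); the program itself is illustrative:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
			     struct bpf_dynptr *ptr__uninit) __ksym;

SEC("tc")
int read_meta(struct __sk_buff *skb)
{
	struct bpf_dynptr meta;
	__u32 mark;

	if (bpf_dynptr_from_skb_meta(skb, 0, &meta))
		return 0; /* TC_ACT_OK */

	/* bpf_dynptr_read() fails with -E2BIG when fewer than
	 * sizeof(mark) bytes of metadata are present. */
	if (bpf_dynptr_read(&mark, sizeof(mark), &meta, 0, 0))
		return 0;

	skb->mark = mark;
	return 0;
}

char _license[] SEC("license") = "GPL";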
+ * + * Return: + * * %0 - dynptr ready to use + * * %-EINVAL - invalid flags, dynptr set to null + */ +__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct sk_buff *skb = (struct sk_buff *)skb_; + + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); + + return 0; +} + +__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct xdp_buff *xdp = (struct xdp_buff *)x; + + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); + + return 0; +} + +__bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, + const u8 *sun_path, u32 sun_path__sz) +{ + struct sockaddr_un *un; + + if (sa_kern->sk->sk_family != AF_UNIX) + return -EINVAL; + + /* We do not allow changing the address to unnamed or larger than the + * maximum allowed address size for a unix sockaddr. + */ + if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX) + return -EINVAL; + + un = (struct sockaddr_un *)sa_kern->uaddr; + memcpy(un->sun_path, sun_path, sun_path__sz); + sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz; + + return 0; +} + +__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, + struct bpf_tcp_req_attrs *attrs, int attrs__sz) +{ +#if IS_ENABLED(CONFIG_SYN_COOKIES) + struct sk_buff *skb = (struct sk_buff *)s; + const struct request_sock_ops *ops; + struct inet_request_sock *ireq; + struct tcp_request_sock *treq; + struct request_sock *req; + struct net *net; + __u16 min_mss; + u32 tsoff = 0; + + if (attrs__sz != sizeof(*attrs) || + attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2]) + return -EINVAL; + + if (!skb_at_tc_ingress(skb)) + return -EINVAL; + + net = dev_net(skb->dev); + if (net != sock_net(sk)) + return -ENETUNREACH; + + switch (skb->protocol) { + case htons(ETH_P_IP): + ops = &tcp_request_sock_ops; + min_mss = 536; + break; +#if IS_BUILTIN(CONFIG_IPV6) + case htons(ETH_P_IPV6): + ops = &tcp6_request_sock_ops; + min_mss = IPV6_MIN_MTU - 60; + break; +#endif + default: + return -EINVAL; + } + + if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN || + sk_is_mptcp(sk)) + return -EINVAL; + + if (attrs->mss < min_mss) + return -EINVAL; + + if (attrs->wscale_ok) { + if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) + return -EINVAL; + + if (attrs->snd_wscale > TCP_MAX_WSCALE || + attrs->rcv_wscale > TCP_MAX_WSCALE) + return -EINVAL; + } + + if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) + return -EINVAL; + + if (attrs->tstamp_ok) { + if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) + return -EINVAL; + + tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns()); + } + + req = inet_reqsk_alloc(ops, sk, false); + if (!req) + return -ENOMEM; + + ireq = inet_rsk(req); + treq = tcp_rsk(req); + + req->rsk_listener = sk; + req->syncookie = 1; + req->mss = attrs->mss; + req->ts_recent = attrs->rcv_tsval; + + ireq->snd_wscale = attrs->snd_wscale; + ireq->rcv_wscale = attrs->rcv_wscale; + ireq->tstamp_ok = !!attrs->tstamp_ok; + ireq->sack_ok = !!attrs->sack_ok; + ireq->wscale_ok = !!attrs->wscale_ok; + ireq->ecn_ok = !!attrs->ecn_ok; + + treq->req_usec_ts = 
!!attrs->usec_ts_ok; + treq->ts_off = tsoff; + + skb_orphan(skb); + skb->sk = req_to_sk(req); + skb->destructor = sock_pfree; + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, + u64 flags) +{ + struct sk_buff *skb; + + if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) + return -EOPNOTSUPP; + + if (flags) + return -EINVAL; + + skb = skops->skb; + skb_shinfo(skb)->tx_flags |= SKBTX_BPF; + TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF; + skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + + return 0; +} + +/** + * bpf_xdp_pull_data() - Pull in non-linear xdp data. + * @x: &xdp_md associated with the XDP buffer + * @len: length of data to be made directly accessible in the linear part + * + * Pull in data in case the XDP buffer associated with @x is non-linear and + * not all @len are in the linear data area. + * + * Direct packet access allows reading and writing linear XDP data through + * packet pointers (i.e., &xdp_md->data + offsets). The amount of data which + * ends up in the linear part of the xdp_buff depends on the NIC and its + * configuration. When a frag-capable XDP program wants to directly access + * headers that may be in the non-linear area, call this kfunc to make sure + * the data is available in the linear area. Alternatively, use dynptr or + * bpf_xdp_{load,store}_bytes() to access data without pulling. + * + * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate + * headers in the non-linear data area. + * + * A call to this kfunc may reduce headroom. If there is not enough tailroom + * in the linear data area, metadata and data will be shifted down. + * + * A call to this kfunc is susceptible to change the buffer geometry. + * Therefore, at load time, all checks on pointers previously done by the + * verifier are invalidated and must be performed again, if the kfunc is used + * in combination with direct packet access. + * + * Return: + * * %0 - success + * * %-EINVAL - invalid len + */ +__bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len) +{ + struct xdp_buff *xdp = (struct xdp_buff *)x; + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i, delta, shift, headroom, tailroom, n_frags_free = 0; + void *data_hard_end = xdp_data_hard_end(xdp); + int data_len = xdp->data_end - xdp->data; + void *start; + + if (len <= data_len) + return 0; + + if (unlikely(len > xdp_get_buff_len(xdp))) + return -EINVAL; + + start = xdp_data_meta_unsupported(xdp) ? 
xdp->data : xdp->data_meta; + + headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame); + tailroom = data_hard_end - xdp->data_end; + + delta = len - data_len; + if (unlikely(delta > tailroom + headroom)) + return -EINVAL; + + shift = delta - tailroom; + if (shift > 0) { + memmove(start - shift, start, xdp->data_end - start); + + xdp->data_meta -= shift; + xdp->data -= shift; + xdp->data_end -= shift; + } + + for (i = 0; i < sinfo->nr_frags && delta; i++) { + skb_frag_t *frag = &sinfo->frags[i]; + u32 shrink = min_t(u32, delta, skb_frag_size(frag)); + + memcpy(xdp->data_end, skb_frag_address(frag), shrink); + + xdp->data_end += shrink; + sinfo->xdp_frags_size -= shrink; + delta -= shrink; + if (bpf_xdp_shrink_data(xdp, frag, shrink, false)) + n_frags_free++; + } + + if (unlikely(n_frags_free)) { + memmove(sinfo->frags, sinfo->frags + n_frags_free, + (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t)); + + sinfo->nr_frags -= n_frags_free; + + if (!sinfo->nr_frags) { + xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); + } + } + + return 0; +} + +__bpf_kfunc_end_defs(); + +int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + int err; + + err = bpf_dynptr_from_skb(skb, flags, ptr__uninit); + if (err) + return err; + + bpf_dynptr_set_rdonly(ptr); + + return 0; +} + +BTF_KFUNCS_START(bpf_kfunc_check_set_skb) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb) + +BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) + +BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) +BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) +BTF_ID_FLAGS(func, bpf_xdp_pull_data) +BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) + +BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) +BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) +BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) + +BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) +BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) + +BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) +BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) + +static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_skb, +}; + +static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_skb_meta, +}; + +static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_xdp, +}; + +static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_sock_addr, +}; + +static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_tcp_reqsk, +}; + +static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_sock_ops, +}; + +static int __init bpf_kfunc_init(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, 
&bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + &bpf_kfunc_set_sock_addr); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops); +} +late_initcall(bpf_kfunc_init); + +__bpf_kfunc_start_defs(); + +/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. + * + * The function expects a non-NULL pointer to a socket, and invokes the + * protocol specific socket destroy handlers. + * + * The helper can only be called from BPF contexts that have acquired the socket + * locks. + * + * Parameters: + * @sock: Pointer to socket to be destroyed + * + * Return: + * On error, may return EPROTONOSUPPORT, EINVAL. + * EPROTONOSUPPORT if protocol specific destroy handler is not supported. + * 0 otherwise + */ +__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) +{ + struct sock *sk = (struct sock *)sock; + + /* The locking semantics that allow for synchronous execution of the + * destroy handlers are only supported for TCP and UDP. + * Supporting protocols will need to acquire sock lock in the BPF context + * prior to invoking this kfunc. 
+ */ + if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP && + sk->sk_protocol != IPPROTO_UDP)) + return -EOPNOTSUPP; + + return sk->sk_prot->diag_destroy(sk, ECONNABORTED); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) +BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) + +static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) && + prog->expected_attach_type != BPF_TRACE_ITER) + return -EACCES; + return 0; +} + +static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_sk_iter_kfunc_ids, + .filter = tracing_iter_filter, +}; + +static int init_subsystem(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set); +} +late_initcall(init_subsystem); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 25fb0bbc310f..1b61bb25ba0e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -27,6 +27,7 @@ #include <linux/tcp.h> #include <linux/ptp_classify.h> #include <net/flow_dissector.h> +#include <net/pkt_cls.h> #include <scsi/fc/fc_fcoe.h> #include <uapi/linux/batadv_packet.h> #include <linux/bpf.h> @@ -39,7 +40,7 @@ static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { - flow_dissector->used_keys |= (1 << key_id); + flow_dissector->used_keys |= (1ULL << key_id); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, @@ -105,7 +106,7 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, #endif /* CONFIG_BPF_SYSCALL */ /** - * __skb_flow_get_ports - extract the upper layer ports and return them + * skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset @@ -115,8 +116,8 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ -__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - const void *data, int hlen) +__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); @@ -136,7 +137,7 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, return 0; } -EXPORT_SYMBOL(__skb_flow_get_ports); +EXPORT_SYMBOL(skb_flow_get_ports); static bool icmp_has_id(u8 type) { @@ -204,6 +205,50 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } +static void __skb_flow_dissect_ah(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_ipsec *key_ah; + struct ip_auth_hdr _hdr, *hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_ah = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPSEC, + target_container); + + key_ah->spi = hdr->spi; +} + +static void __skb_flow_dissect_esp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + 
struct flow_dissector_key_ipsec *key_esp; + struct ip_esp_hdr _hdr, *hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_esp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPSEC, + target_container); + + key_esp->spi = hdr->spi; +} + static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, @@ -241,13 +286,23 @@ void skb_flow_dissect_meta(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_META, target_container); meta->ingress_ifindex = skb->skb_iif; +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + if (tc_skb_ext_tc_enabled()) { + struct tc_skb_ext *ext; + + ext = skb_ext_find(skb, TC_SKB_EXT); + if (ext) + meta->l2_miss = ext->l2_miss; + } +#endif } EXPORT_SYMBOL(skb_flow_dissect_meta); static void -skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, - struct flow_dissector *flow_dissector, - void *target_container) +skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type, + u32 ctrl_flags, + struct flow_dissector *flow_dissector, + void *target_container) { struct flow_dissector_key_control *ctrl; @@ -258,6 +313,7 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, FLOW_DISSECTOR_KEY_ENC_CONTROL, target_container); ctrl->addr_type = type; + ctrl->flags = ctrl_flags; } void @@ -313,6 +369,7 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, { struct ip_tunnel_info *info; struct ip_tunnel_key *key; + u32 ctrl_flags = 0; /* A quick check to see if there might be something to do. */ if (!dissector_uses_key(flow_dissector, @@ -337,11 +394,20 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, key = &info->key; + if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) + ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM; + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) + ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT; + if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags)) + ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM; + if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags)) + ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT; + switch (ip_tunnel_info_af(info)) { case AF_INET: - skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS, - flow_dissector, - target_container); + skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS, + ctrl_flags, flow_dissector, + target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { struct flow_dissector_key_ipv4_addrs *ipv4; @@ -354,9 +420,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, } break; case AF_INET6: - skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS, - flow_dissector, - target_container); + skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS, + ctrl_flags, flow_dissector, + target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *ipv6; @@ -368,6 +434,10 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, ipv6->dst = key->u.ipv6.dst; } break; + default: + skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector, + target_container); + break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { @@ -401,17 +471,25 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { struct flow_dissector_key_enc_opts *enc_opt; + 
IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + u32 val; enc_opt = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, target_container); - if (info->options_len) { - enc_opt->len = info->options_len; - ip_tunnel_info_opts_get(enc_opt->data, info); - enc_opt->dst_opt_type = info->key.tun_flags & - TUNNEL_OPTIONS_PRESENT; - } + if (!info->options_len) + return; + + enc_opt->len = info->options_len; + ip_tunnel_info_opts_get(enc_opt->data, info); + + ip_tunnel_set_options_present(flags); + ip_tunnel_flags_and(flags, info->key.tun_flags, flags); + + val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, + IP_TUNNEL_GENEVE_OPT_BIT); + enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0; } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); @@ -548,6 +626,30 @@ __skb_flow_dissect_arp(const struct sk_buff *skb, } static enum flow_dissect_ret +__skb_flow_dissect_cfm(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_cfm *key, *hdr, _hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM)) + return FLOW_DISSECT_RET_OUT_GOOD; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM, + target_container); + + key->mdl_ver = hdr->mdl_ver; + key->opcode = hdr->opcode; + + return FLOW_DISSECT_RET_OUT_GOOD; +} + +static enum flow_dissect_ret __skb_flow_dissect_gre(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, struct flow_dissector *flow_dissector, @@ -751,23 +853,30 @@ __skb_flow_dissect_ports(const struct sk_buff *skb, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { - enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; - struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_ports_range *key_ports_range = NULL; + struct flow_dissector_key_ports *key_ports = NULL; + __be32 ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS; - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); - if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) + if (!key_ports && !key_ports_range) return; - key_ports = skb_flow_dissector_target(flow_dissector, - dissector_ports, - target_container); - key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); + + if (key_ports) + key_ports->ports = ports; + + if (key_ports_range) + key_ports_range->tp.ports = ports; } static void @@ -822,6 +931,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { + struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -866,20 +976,21 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, 
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE, - target_container); - - if (key_ports) { key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) { + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + key_ports_range->tp.src = flow_keys->sport; + key_ports_range->tp.dst = flow_keys->dport; + } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { @@ -1006,21 +1117,22 @@ bool __skb_flow_dissect(const struct net *net, FLOW_DISSECTOR_KEY_BASIC, target_container); + rcu_read_lock(); + if (skb) { if (!net) { if (skb->dev) - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); else if (skb->sk) net = sock_net(skb->sk); } } - WARN_ON_ONCE(!net); + DEBUG_NET_WARN_ON_ONCE(!net); if (net) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; - rcu_read_lock(); run_array = rcu_dereference(init_net.bpf.run_array[type]); if (!run_array) run_array = rcu_dereference(net->bpf.run_array[type]); @@ -1048,17 +1160,17 @@ bool __skb_flow_dissect(const struct net *net, prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); - if (result == BPF_FLOW_DISSECTOR_CONTINUE) - goto dissect_continue; - __skb_flow_bpf_to_target(&flow_keys, flow_dissector, - target_container); - rcu_read_unlock(); - return result == BPF_OK; + if (result != BPF_FLOW_DISSECTOR_CONTINUE) { + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + rcu_read_unlock(); + return result == BPF_OK; + } } -dissect_continue: - rcu_read_unlock(); } + rcu_read_unlock(); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); @@ -1368,7 +1480,7 @@ proto_again: break; } - nhoff += ntohs(hdr->message_length); + nhoff += sizeof(struct ptp_header); fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } @@ -1390,6 +1502,12 @@ proto_again: break; } + case htons(ETH_P_CFM): + fdret = __skb_flow_dissect_cfm(skb, flow_dissector, + target_container, data, + nhoff, hlen); + break; + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; @@ -1531,7 +1649,14 @@ ip_proto_again: __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, data, nhoff, hlen); break; - + case IPPROTO_ESP: + __skb_flow_dissect_esp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + case IPPROTO_AH: + __skb_flow_dissect_ah(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; default: break; } @@ -1693,6 +1818,13 @@ u32 flow_hash_from_keys(struct flow_keys *keys) } EXPORT_SYMBOL(flow_hash_from_keys); +u32 flow_hash_from_keys_seed(struct flow_keys *keys, + const siphash_key_t *keyval) +{ + return __flow_hash_from_keys(keys, keyval); +} +EXPORT_SYMBOL(flow_hash_from_keys_seed); + static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, const siphash_key_t *keyval) @@ -1732,23 +1864,23 @@ EXPORT_SYMBOL(make_flow_keys_digest); static struct flow_dissector flow_keys_dissector_symmetric 
__read_mostly; -u32 __skb_get_hash_symmetric(const struct sk_buff *skb) +u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb) { struct flow_keys keys; __flow_hash_secret_init(); memset(&keys, 0, sizeof(keys)); - __skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric, - &keys, NULL, 0, 0, 0, - FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric, + &keys, NULL, 0, 0, 0, 0); return __flow_hash_from_keys(&keys, &hashrnd); } -EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); +EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net); /** - * __skb_get_hash: calculate a flow hash + * __skb_get_hash_net: calculate a flow hash + * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to calculate flow hash from * * This function calculates a flow hash based on src/dst addresses @@ -1756,18 +1888,24 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. */ -void __skb_get_hash(struct sk_buff *skb) +void __skb_get_hash_net(const struct net *net, struct sk_buff *skb) { struct flow_keys keys; u32 hash; + memset(&keys, 0, sizeof(keys)); + + __skb_flow_dissect(net, skb, &flow_keys_dissector, + &keys, NULL, 0, 0, 0, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + __flow_hash_secret_init(); - hash = ___skb_get_hash(skb, &keys, &hashrnd); + hash = __flow_hash_from_keys(&keys, &hashrnd); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } -EXPORT_SYMBOL(__skb_get_hash); +EXPORT_SYMBOL(__skb_get_hash_net); __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb) diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index acfc1f88ea79..bc5169482710 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -146,6 +146,13 @@ void flow_rule_match_tcp(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_tcp); +void flow_rule_match_ipsec(const struct flow_rule *rule, + struct flow_match_ipsec *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPSEC, out); +} +EXPORT_SYMBOL(flow_rule_match_ipsec); + void flow_rule_match_icmp(const struct flow_rule *rule, struct flow_match_icmp *out) { diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index fae9c4694186..f112156db587 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -75,7 +75,7 @@ static void est_fetch_counters(struct net_rate_estimator *e, static void est_timer(struct timer_list *t) { - struct net_rate_estimator *est = from_timer(est, t, timer); + struct net_rate_estimator *est = timer_container_of(est, t, timer); struct gnet_stats_basic_sync b; u64 b_bytes, b_packets; u64 rate, brate; @@ -90,10 +90,12 @@ static void est_timer(struct timer_list *t) rate = (b_packets - est->last_packets) << (10 - est->intvl_log); rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); + preempt_disable_nested(); write_seqcount_begin(&est->seq); est->avbps += brate; est->avpps += rate; write_seqcount_end(&est->seq); + preempt_enable_nested(); est->last_bytes = b_bytes; est->last_packets = b_packets; @@ -177,7 +179,7 @@ int gen_new_estimator(struct gnet_stats_basic_sync *bstats, spin_lock_bh(lock); old = rcu_dereference_protected(*rate_est, 1); if (old) { - del_timer_sync(&old->timer); + timer_delete_sync(&old->timer); est->avbps = old->avbps; est->avpps = old->avpps; } @@ -206,7 +208,7 @@ void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est) { struct 
net_rate_estimator *est; - est = xchg((__force struct net_rate_estimator **)rate_est, NULL); + est = unrcu_pointer(xchg(rate_est, NULL)); if (est) { timer_shutdown_sync(&est->timer); kfree_rcu(est, rcu); diff --git a/net/core/gro.c b/net/core/gro.c index 506f83d715f8..76f9c3712422 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -1,18 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include <net/psp.h> #include <net/gro.h> #include <net/dst_metadata.h> #include <net/busy_poll.h> #include <trace/events/net.h> +#include <linux/skbuff_ref.h> #define MAX_GRO_SKBS 8 -/* This should be increased if a protocol with a bigger head is added. */ -#define GRO_MAX_HEAD (MAX_HEADER + 128) - static DEFINE_SPINLOCK(offload_lock); -static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); -/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ -int gro_normal_batch __read_mostly = 8; /** * dev_add_offload - register offload handlers @@ -31,7 +27,7 @@ void dev_add_offload(struct packet_offload *po) struct packet_offload *elem; spin_lock(&offload_lock); - list_for_each_entry(elem, &offload_base, list) { + list_for_each_entry(elem, &net_hotdata.offload_base, list) { if (po->priority < elem->priority) break; } @@ -55,7 +51,7 @@ EXPORT_SYMBOL(dev_add_offload); */ static void __dev_remove_offload(struct packet_offload *po) { - struct list_head *head = &offload_base; + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *po1; spin_lock(&offload_lock); @@ -92,63 +88,6 @@ void dev_remove_offload(struct packet_offload *po) } EXPORT_SYMBOL(dev_remove_offload); -/** - * skb_eth_gso_segment - segmentation handler for ethernet protocols. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * @type: Ethernet Protocol ID - */ -struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, - netdev_features_t features, __be16 type) -{ - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { - if (ptype->type == type && ptype->callbacks.gso_segment) { - segs = ptype->callbacks.gso_segment(skb, features); - break; - } - } - rcu_read_unlock(); - - return segs; -} -EXPORT_SYMBOL(skb_eth_gso_segment); - -/** - * skb_mac_gso_segment - mac layer segmentation handler. 
- * @skb: buffer to segment - * @features: features for the output path (see dev->features) - */ -struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; - int vlan_depth = skb->mac_len; - __be16 type = skb_network_protocol(skb, &vlan_depth); - - if (unlikely(!type)) - return ERR_PTR(-EINVAL); - - __skb_pull(skb, vlan_depth); - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { - if (ptype->type == type && ptype->callbacks.gso_segment) { - segs = ptype->callbacks.gso_segment(skb, features); - break; - } - } - rcu_read_unlock(); - - __skb_push(skb, skb->data - skb_mac_header(skb)); - - return segs; -} -EXPORT_SYMBOL(skb_mac_gso_segment); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) { @@ -157,21 +96,27 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); unsigned int delta_truesize; - unsigned int gro_max_size; unsigned int new_truesize; struct sk_buff *lp; int segs; - /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ - gro_max_size = READ_ONCE(p->dev->gro_max_size); + /* Do not splice page pool based packets w/ non-page pool + * packets. This can result in reference count issues as page + * pool pages will not decrement the reference count and will + * instead be immediately returned to the pool or have frag + * count decremented. + */ + if (p->pp_recycle != skb->pp_recycle) + return -ETOOMANYREFS; - if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) + if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) || + NAPI_GRO_CB(skb)->flush)) return -E2BIG; if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { - if (p->protocol != htons(ETH_P_IPV6) || - skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || - ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || + (p->protocol == htons(ETH_P_IPV6) && + skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || p->encapsulation) return -E2BIG; } @@ -228,9 +173,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; - __skb_frag_set_page(frag, page); - skb_frag_off_set(frag, first_offset); - skb_frag_size_set(frag, first_size); + skb_frag_fill_page_desc(frag, page, first_offset, first_size); memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); /* We dont need to clear skbinfo->nr_frags here */ @@ -243,8 +186,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) } merge: - /* sk owenrship - if any - completely transferred to the aggregated packet */ + /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; + skb->sk = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; @@ -280,12 +224,38 @@ done: return 0; } +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) +{ + if (unlikely(p->len + skb->len >= 65536)) + return -E2BIG; + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + + skb_pull(skb, skb_gro_offset(skb)); + + NAPI_GRO_CB(p)->last = skb; + NAPI_GRO_CB(p)->count++; + p->data_len += skb->len; + + /* sk ownership - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; + skb->sk = NULL; + p->truesize += skb->truesize; + p->len += skb->len; + + NAPI_GRO_CB(skb)->same_flow 
= 1; + + return 0; +} -static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) +static void gro_complete(struct gro_node *gro, struct sk_buff *skb) { + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &offload_base; int err = -ENOENT; BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); @@ -314,43 +284,61 @@ static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) } out: - gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); + gro_normal_one(gro, skb, NAPI_GRO_CB(skb)->count); } -static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, - bool flush_old) +static void __gro_flush_chain(struct gro_node *gro, u32 index, bool flush_old) { - struct list_head *head = &napi->gro_hash[index].list; + struct list_head *head = &gro->hash[index].list; struct sk_buff *skb, *p; list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; skb_list_del_init(skb); - napi_gro_complete(napi, skb); - napi->gro_hash[index].count--; + gro_complete(gro, skb); + gro->hash[index].count--; } - if (!napi->gro_hash[index].count) - __clear_bit(index, &napi->gro_bitmask); + if (!gro->hash[index].count) + __clear_bit(index, &gro->bitmask); } -/* napi->gro_hash[].list contains packets ordered by age. +/* + * gro->hash[].list contains packets ordered by age. * youngest packets at the head of it. * Complete skbs in reverse order to reduce latencies. */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) +void __gro_flush(struct gro_node *gro, bool flush_old) { - unsigned long bitmask = napi->gro_bitmask; + unsigned long bitmask = gro->bitmask; unsigned int i, base = ~0U; while ((i = ffs(bitmask)) != 0) { bitmask >>= i; base += i; - __napi_gro_flush_chain(napi, base, flush_old); + __gro_flush_chain(gro, base, flush_old); } } -EXPORT_SYMBOL(napi_gro_flush); +EXPORT_SYMBOL(__gro_flush); + +static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb, + const struct sk_buff *p, + unsigned long diffs) +{ +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct tc_skb_ext *skb_ext; + struct tc_skb_ext *p_ext; + + skb_ext = skb_ext_find(skb, TC_SKB_EXT); + p_ext = skb_ext_find(p, TC_SKB_EXT); + + diffs |= (!!p_ext) ^ (!!skb_ext); + if (!diffs && unlikely(skb_ext)) + diffs |= p_ext->chain ^ skb_ext->chain; +#endif + return diffs; +} static void gro_list_prepare(const struct list_head *head, const struct sk_buff *skb) @@ -362,8 +350,6 @@ static void gro_list_prepare(const struct list_head *head, list_for_each_entry(p, head, list) { unsigned long diffs; - NAPI_GRO_CB(p)->flush = 0; - if (hash != skb_get_hash_raw(p)) { NAPI_GRO_CB(p)->same_flow = 0; continue; @@ -380,29 +366,18 @@ static void gro_list_prepare(const struct list_head *head, skb_mac_header(skb), maclen); - /* in most common scenarions 'slow_gro' is 0 + /* in most common scenarios 'slow_gro' is 0 * otherwise we are already on some slower paths * either skip all the infrequent tests altogether or * avoid trying too hard to skip each of them individually */ if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - struct tc_skb_ext *skb_ext; - struct tc_skb_ext *p_ext; -#endif - diffs |= p->sk != skb->sk; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - skb_ext = 
skb_ext_find(skb, TC_SKB_EXT); - p_ext = skb_ext_find(p, TC_SKB_EXT); - - diffs |= (!!p_ext) ^ (!!skb_ext); - if (!diffs && unlikely(skb_ext)) - diffs |= p_ext->chain ^ skb_ext->chain; -#endif + diffs |= gro_list_prepare_tc_ext(skb, p, diffs); + diffs |= __psp_skb_coalesce_diff(skb, p, diffs); } NAPI_GRO_CB(p)->same_flow = !diffs; @@ -411,14 +386,22 @@ static void gro_list_prepare(const struct list_head *head, static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) { - const struct skb_shared_info *pinfo = skb_shinfo(skb); - const skb_frag_t *frag0 = &pinfo->frags[0]; + const struct skb_shared_info *pinfo; + const skb_frag_t *frag0; + unsigned int headlen; + NAPI_GRO_CB(skb)->network_offset = 0; NAPI_GRO_CB(skb)->data_offset = 0; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; + headlen = skb_headlen(skb); + NAPI_GRO_CB(skb)->frag0 = skb->data; + NAPI_GRO_CB(skb)->frag0_len = headlen; + if (headlen) + return; + + pinfo = skb_shinfo(skb); + frag0 = &pinfo->frags[0]; - if (!skb_headlen(skb) && pinfo->nr_frags && + if (pinfo->nr_frags && skb_frag_page(frag0) && !PageHighMem(skb_frag_page(frag0)) && (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); @@ -449,7 +432,15 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) } } -static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) +static void gro_try_pull_from_frag0(struct sk_buff *skb) +{ + int grow = skb_gro_offset(skb) - skb_headlen(skb); + + if (grow > 0) + gro_pull_from_frag0(skb, grow); +} + +static void gro_flush_oldest(struct gro_node *gro, struct list_head *head) { struct sk_buff *oldest; @@ -465,20 +456,20 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) * SKB to the chain. 
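 * (gro_flush_oldest() runs only once a bucket already holds
 * MAX_GRO_SKBS flows, so evicting the single oldest entry is enough to
 * make room for the newcomer.)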
*/ skb_list_del_init(oldest); - napi_gro_complete(napi, oldest); + gro_complete(gro, oldest); } -static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static enum gro_result dev_gro_receive(struct gro_node *gro, + struct sk_buff *skb) { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - struct gro_list *gro_list = &napi->gro_hash[bucket]; - struct list_head *head = &offload_base; + struct list_head *head = &net_hotdata.offload_base; + struct gro_list *gro_list = &gro->hash[bucket]; struct packet_offload *ptype; __be16 type = skb->protocol; struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; - int grow; if (netif_elide_gro(skb->dev)) goto normal; @@ -501,7 +492,6 @@ found_ptype: sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); - NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->count = 1; if (unlikely(skb_is_gso(skb))) { NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; @@ -538,7 +528,7 @@ found_ptype: if (pp) { skb_list_del_init(pp); - napi_gro_complete(napi, pp); + gro_complete(gro, pp); gro_list->count--; } @@ -549,39 +539,37 @@ found_ptype: goto normal; if (unlikely(gro_list->count >= MAX_GRO_SKBS)) - gro_flush_oldest(napi, &gro_list->list); + gro_flush_oldest(gro, &gro_list->list); else gro_list->count++; + /* Must be called before setting NAPI_GRO_CB(skb)->{age|last} */ + gro_try_pull_from_frag0(skb); NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; if (!skb_is_gso(skb)) skb_shinfo(skb)->gso_size = skb_gro_len(skb); list_add(&skb->list, &gro_list->list); ret = GRO_HELD; - -pull: - grow = skb_gro_offset(skb) - skb_headlen(skb); - if (grow > 0) - gro_pull_from_frag0(skb, grow); ok: if (gro_list->count) { - if (!test_bit(bucket, &napi->gro_bitmask)) - __set_bit(bucket, &napi->gro_bitmask); - } else if (test_bit(bucket, &napi->gro_bitmask)) { - __clear_bit(bucket, &napi->gro_bitmask); + if (!test_bit(bucket, &gro->bitmask)) + __set_bit(bucket, &gro->bitmask); + } else if (test_bit(bucket, &gro->bitmask)) { + __clear_bit(bucket, &gro->bitmask); } return ret; normal: ret = GRO_NORMAL; - goto pull; + gro_try_pull_from_frag0(skb); + goto ok; } struct packet_offload *gro_find_receive_by_type(__be16 type) { - struct list_head *offload_head = &offload_base; + struct list_head *offload_head = &net_hotdata.offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { @@ -595,7 +583,7 @@ EXPORT_SYMBOL(gro_find_receive_by_type); struct packet_offload *gro_find_complete_by_type(__be16 type) { - struct list_head *offload_head = &offload_base; + struct list_head *offload_head = &net_hotdata.offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { @@ -607,13 +595,12 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); -static gro_result_t napi_skb_finish(struct napi_struct *napi, - struct sk_buff *skb, - gro_result_t ret) +static gro_result_t gro_skb_finish(struct gro_node *gro, struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: - gro_normal_one(napi, skb, 1); + gro_normal_one(gro, skb, 1); break; case GRO_MERGED_FREE: @@ -622,7 +609,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi, else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else - __kfree_skb_defer(skb); + __napi_kfree_skb(skb, SKB_CONSUMED); break; case GRO_HELD: @@ -634,24 +621,26 @@ static 
gro_result_t napi_skb_finish(struct napi_struct *napi, return ret; } -gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb) { gro_result_t ret; - skb_mark_napi_id(skb, napi); + __skb_mark_napi_id(skb, gro); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb, 0); - ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = gro_skb_finish(gro, skb, dev_gro_receive(gro, skb)); trace_napi_gro_receive_exit(ret); return ret; } -EXPORT_SYMBOL(napi_gro_receive); +EXPORT_SYMBOL(gro_receive_skb); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { + struct skb_shared_info *shinfo; + if (unlikely(skb->pfmemalloc)) { consume_skb(skb); return; @@ -667,8 +656,13 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; - skb_shinfo(skb)->gso_type = 0; - skb_shinfo(skb)->gso_size = 0; + skb->ip_summed = CHECKSUM_NONE; + + shinfo = skb_shinfo(skb); + shinfo->gso_type = 0; + shinfo->gso_size = 0; + shinfo->hwtstamps.hwtstamp = 0; + if (unlikely(skb->slow_gro)) { skb_orphan(skb); skb_ext_reset(skb); @@ -704,7 +698,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) - gro_normal_one(napi, skb, 1); + gro_normal_one(&napi->gro, skb, 1); break; case GRO_MERGED_FREE: @@ -737,7 +731,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) skb_reset_mac_header(skb); skb_gro_reset_offset(skb, hlen); - if (unlikely(skb_gro_header_hard(skb, hlen))) { + if (unlikely(!skb_gro_may_pull(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { net_warn_ratelimited("%s: dropping impossible skb from %s\n", @@ -747,7 +741,10 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) } } else { eth = (const struct ethhdr *)skb->data; - gro_pull_from_frag0(skb, hlen); + + if (NAPI_GRO_CB(skb)->frag0 != skb->data) + gro_pull_from_frag0(skb, hlen); + NAPI_GRO_CB(skb)->frag0 += hlen; NAPI_GRO_CB(skb)->frag0_len -= hlen; } @@ -770,7 +767,7 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) trace_napi_gro_frags_entry(skb); - ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = napi_frags_finish(napi, skb, dev_gro_receive(&napi->gro, skb)); trace_napi_gro_frags_exit(ret); return ret; @@ -802,3 +799,37 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) return sum; } EXPORT_SYMBOL(__skb_gro_checksum_complete); + +void gro_init(struct gro_node *gro) +{ + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { + INIT_LIST_HEAD(&gro->hash[i].list); + gro->hash[i].count = 0; + } + + gro->bitmask = 0; + gro->cached_napi_id = 0; + + INIT_LIST_HEAD(&gro->rx_list); + gro->rx_count = 0; +} + +void gro_cleanup(struct gro_node *gro) +{ + struct sk_buff *skb, *n; + + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { + list_for_each_entry_safe(skb, n, &gro->hash[i].list, list) + kfree_skb(skb); + + gro->hash[i].count = 0; + } + + gro->bitmask = 0; + gro->cached_napi_id = 0; + + list_for_each_entry_safe(skb, n, &gro->rx_list, list) + kfree_skb(skb); + + gro->rx_count = 0; +} diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index ed5ec5de47f6..a725d21159a6 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -3,15 +3,18 @@ #include <linux/slab.h> #include <linux/netdevice.h> #include <net/gro_cells.h> +#include <net/hotdata.h> struct gro_cell { struct 
sk_buff_head napi_skbs; struct napi_struct napi; + local_lock_t bh_lock; }; int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) { struct net_device *dev = skb->dev; + bool have_bh_lock = false; struct gro_cell *cell; int res; @@ -24,9 +27,11 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) goto unlock; } + local_lock_nested_bh(&gcells->cells->bh_lock); + have_bh_lock = true; cell = this_cpu_ptr(gcells->cells); - if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { + if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) { drop: dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); @@ -41,6 +46,8 @@ drop: res = NET_RX_SUCCESS; unlock: + if (have_bh_lock) + local_unlock_nested_bh(&gcells->cells->bh_lock); rcu_read_unlock(); return res; } @@ -54,7 +61,9 @@ static int gro_cell_poll(struct napi_struct *napi, int budget) int work_done = 0; while (work_done < budget) { + __local_lock_nested_bh(&cell->bh_lock); skb = __skb_dequeue(&cell->napi_skbs); + __local_unlock_nested_bh(&cell->bh_lock); if (!skb) break; napi_gro_receive(napi, skb); @@ -78,6 +87,7 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); __skb_queue_head_init(&cell->napi_skbs); + local_lock_init(&cell->bh_lock); set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); diff --git a/net/core/gso.c b/net/core/gso.c new file mode 100644 index 000000000000..bcd156372f4d --- /dev/null +++ b/net/core/gso.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/skbuff.h> +#include <linux/sctp.h> +#include <net/gso.h> +#include <net/gro.h> + +/** + * skb_eth_gso_segment - segmentation handler for ethernet protocols. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @type: Ethernet Protocol ID + */ +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + return segs; +} +EXPORT_SYMBOL(skb_eth_gso_segment); + +/** + * skb_mac_gso_segment - mac layer segmentation handler. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); + + if (unlikely(!type)) + return ERR_PTR(-EINVAL); + + __skb_pull(skb, vlan_depth); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + __skb_push(skb, skb->data - skb_mac_header(skb)); + + return segs; +} +EXPORT_SYMBOL(skb_mac_gso_segment); +/* openvswitch calls this on rx path, so we need a different check. 
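+ * On the TX path the checksum still has to be initialized unless it is
+ * deferred (CHECKSUM_PARTIAL) or already verified (CHECKSUM_UNNECESSARY);
+ * on the RX path only CHECKSUM_NONE needs it.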
+ */ +static bool skb_needs_check(const struct sk_buff *skb, bool tx_path) +{ + if (tx_path) + return skb->ip_summed != CHECKSUM_PARTIAL && + skb->ip_summed != CHECKSUM_UNNECESSARY; + + return skb->ip_summed == CHECKSUM_NONE; +} + +/** + * __skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @tx_path: whether it is called in TX path + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. + * + * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. + */ +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, + netdev_features_t features, bool tx_path) +{ + struct sk_buff *segs; + + if (unlikely(skb_needs_check(skb, tx_path))) { + int err; + + /* We're going to init ->check field in TCP or UDP header */ + err = skb_cow_head(skb, 0); + if (err < 0) + return ERR_PTR(err); + } + + /* Only report GSO partial support if it will enable us to + * support segmentation on this frame without needing additional + * work. + */ + if (features & NETIF_F_GSO_PARTIAL) { + netdev_features_t partial_features = NETIF_F_GSO_ROBUST; + struct net_device *dev = skb->dev; + + partial_features |= dev->features & dev->gso_partial_features; + if (!skb_gso_ok(skb, features | partial_features)) + features &= ~NETIF_F_GSO_PARTIAL; + } + + BUILD_BUG_ON(SKB_GSO_CB_OFFSET + + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); + SKB_GSO_CB(skb)->encap_level = 0; + + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + segs = skb_mac_gso_segment(skb, features); + + if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) + skb_warn_bad_offload(skb); + + return segs; +} +EXPORT_SYMBOL(__skb_gso_segment); + +/** + * skb_gso_transport_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_transport_seglen is used to determine the real size of the + * individual segments, including Layer4 headers (TCP/UDP). + * + * The MAC/L2 or network (IP, IPv6) headers are not accounted for. + */ +static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int thlen = 0; + + if (skb->encapsulation) { + thlen = skb_inner_transport_header(skb) - + skb_transport_header(skb); + + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + thlen += inner_tcp_hdrlen(skb); + } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + thlen = tcp_hdrlen(skb); + } else if (unlikely(skb_is_gso_sctp(skb))) { + thlen = sizeof(struct sctphdr); + } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { + thlen = sizeof(struct udphdr); + } + /* UFO sets gso_size to the size of the fragmentation + * payload, i.e. the size of the L4 (UDP) header is already + * accounted for. + */ + return thlen + shinfo->gso_size; +} + +/** + * skb_gso_network_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_network_seglen is used to determine the real size of the + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). + * + * The MAC/L2 header is not accounted for. 
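+ *
+ * Hedged usage sketch (assumed caller, not part of this file): an IP
+ * forwarding path can test this length against the egress MTU through
+ * the exported wrapper below before deciding whether to segment:
+ *
+ *	if (skb_is_gso(skb) &&
+ *	    !skb_gso_validate_network_len(skb, dst_mtu(skb_dst(skb))))
+ *		goto send_frag_needed;	// hypothetical label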
+ */ +static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - + skb_network_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_mac_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_mac_seglen is used to determine the real size of the + * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 + * headers (TCP/UDP). + */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS + * + * There are a couple of instances where we have a GSO skb, and we + * want to determine what size it would be after it is segmented. + * + * We might want to check: + * - L3+L4+payload size (e.g. IP forwarding) + * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) + * + * This is a helper to do that correctly considering GSO_BY_FRAGS. + * + * @skb: GSO skb + * + * @seg_len: The segmented length (from skb_gso_*_seglen). In the + * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. + * + * @max_len: The maximum permissible length. + * + * Returns true if the segmented length <= max length. + */ +static inline bool skb_gso_size_check(const struct sk_buff *skb, + unsigned int seg_len, + unsigned int max_len) { + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const struct sk_buff *iter; + + if (shinfo->gso_size != GSO_BY_FRAGS) + return seg_len <= max_len; + + /* Undo this so we can re-use header sizes */ + seg_len -= GSO_BY_FRAGS; + + skb_walk_frags(skb, iter) { + if (seg_len + skb_headlen(iter) > max_len) + return false; + } + + return true; +} + +/** + * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? + * + * @skb: GSO skb + * @mtu: MTU to validate against + * + * skb_gso_validate_network_len validates if a given skb will fit a + * wanted MTU once split. It considers L3 headers, L4 headers, and the + * payload. + */ +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) +{ + return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); + +/** + * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? + * + * @skb: GSO skb + * @len: length to validate against + * + * skb_gso_validate_mac_len validates if a given skb will fit a wanted + * length once split, including L2, L3 and L4 headers and the payload. 
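+ *
+ * Illustrative only (assumed device limit, not from this file):
+ *
+ *	if (skb_is_gso(skb) && !skb_gso_validate_mac_len(skb, max_frame))
+ *		return -E2BIG;	// max_frame: hypothetical L2 frame limit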
+ */ +bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) +{ + return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); + diff --git a/net/core/hotdata.c b/net/core/hotdata.c new file mode 100644 index 000000000000..dddd5c287cf0 --- /dev/null +++ b/net/core/hotdata.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/cache.h> +#include <linux/jiffies.h> +#include <linux/list.h> +#include <net/aligned_data.h> +#include <net/hotdata.h> +#include <net/ip.h> +#include <net/proto_memory.h> + +struct net_hotdata net_hotdata __cacheline_aligned = { + .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), + .gro_normal_batch = 8, + + .netdev_budget = 300, + /* Must be at least 2 jiffies to guarantee 1 jiffy timeout */ + .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ, + + .tstamp_prequeue = 1, + .max_backlog = 1000, + .dev_tx_weight = 64, + .dev_rx_weight = 64, + .sysctl_max_skb_frags = MAX_SKB_FRAGS, + .sysctl_skb_defer_max = 128, + .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE +}; +EXPORT_SYMBOL(net_hotdata); + +struct net_aligned_data net_aligned_data; +EXPORT_IPV6_MOD(net_aligned_data); diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c new file mode 100644 index 000000000000..669b357b73b2 --- /dev/null +++ b/net/core/ieee8021q_helpers.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> + +#include <linux/array_size.h> +#include <linux/printk.h> +#include <linux/types.h> +#include <net/dscp.h> +#include <net/ieee8021q.h> + +/* verify that table covers all 8 traffic types */ +#define TT_MAP_SIZE_OK(tbl) \ + compiletime_assert(ARRAY_SIZE(tbl) == IEEE8021Q_TT_MAX, \ + #tbl " size mismatch") + +/* The following arrays map Traffic Types (TT) to traffic classes (TC) for + * different numbers of queues as shown in the example provided by + * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and + * Table I-1 "Traffic type to traffic class mapping".
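+ * With 8 queues the mapping below is the identity over the traffic-type
+ * order (BK, BE, EE, CA, VI, VO, IC, NC -> 0..7); each smaller table
+ * merges neighbouring traffic types until one class remains for 1 queue.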
+ */ +static const u8 ieee8021q_8queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, + [IEEE8021Q_TT_CA] = 3, + [IEEE8021Q_TT_VI] = 4, + [IEEE8021Q_TT_VO] = 5, + [IEEE8021Q_TT_IC] = 6, + [IEEE8021Q_TT_NC] = 7, +}; + +static const u8 ieee8021q_7queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, + [IEEE8021Q_TT_CA] = 3, + [IEEE8021Q_TT_VI] = 4, [IEEE8021Q_TT_VO] = 4, + [IEEE8021Q_TT_IC] = 5, + [IEEE8021Q_TT_NC] = 6, +}; + +static const u8 ieee8021q_6queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, [IEEE8021Q_TT_CA] = 2, + [IEEE8021Q_TT_VI] = 3, [IEEE8021Q_TT_VO] = 3, + [IEEE8021Q_TT_IC] = 4, + [IEEE8021Q_TT_NC] = 5, +}; + +static const u8 ieee8021q_5queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, + [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, + [IEEE8021Q_TT_IC] = 3, + [IEEE8021Q_TT_NC] = 4, +}; + +static const u8 ieee8021q_4queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, + [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, + [IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3, +}; + +static const u8 ieee8021q_3queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, + [IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2, +}; + +static const u8 ieee8021q_2queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, + [IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1, +}; + +static const u8 ieee8021q_1queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0, + [IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0, +}; + +/** + * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class + * @tt: IEEE 802.1Q Traffic Type + * @num_queues: Number of queues + * + * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based + * on the number of queues configured on the NIC. The mapping is based on the + * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic + * class mapping" and Table I-1 "Traffic type to traffic class mapping". + * + * Return: Traffic Class corresponding to the given Traffic Type or negative + * value in case of error. 
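+ *
+ * Hedged example (assumed caller): with 4 queues, voice traffic lands in
+ * class 2 per Table I-1:
+ *
+ *	int tc = ieee8021q_tt_to_tc(IEEE8021Q_TT_VO, 4);
+ *	// tc == 2; a negative return signals an invalid request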
+ */ +int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues) +{ + if (tt < 0 || tt >= IEEE8021Q_TT_MAX) { + pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt, + IEEE8021Q_TT_MAX); + return -EINVAL; + } + + switch (num_queues) { + case 8: + TT_MAP_SIZE_OK(ieee8021q_8queue_tt_tc_map); + return ieee8021q_8queue_tt_tc_map[tt]; + case 7: + TT_MAP_SIZE_OK(ieee8021q_7queue_tt_tc_map); + return ieee8021q_7queue_tt_tc_map[tt]; + case 6: + TT_MAP_SIZE_OK(ieee8021q_6queue_tt_tc_map); + return ieee8021q_6queue_tt_tc_map[tt]; + case 5: + TT_MAP_SIZE_OK(ieee8021q_5queue_tt_tc_map); + return ieee8021q_5queue_tt_tc_map[tt]; + case 4: + TT_MAP_SIZE_OK(ieee8021q_4queue_tt_tc_map); + return ieee8021q_4queue_tt_tc_map[tt]; + case 3: + TT_MAP_SIZE_OK(ieee8021q_3queue_tt_tc_map); + return ieee8021q_3queue_tt_tc_map[tt]; + case 2: + TT_MAP_SIZE_OK(ieee8021q_2queue_tt_tc_map); + return ieee8021q_2queue_tt_tc_map[tt]; + case 1: + TT_MAP_SIZE_OK(ieee8021q_1queue_tt_tc_map); + return ieee8021q_1queue_tt_tc_map[tt]; + } + + pr_err("Invalid number of queues %d\n", num_queues); + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc); + +/** + * ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type + * @dscp: IETF DSCP value + * + * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT). + * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic + * Type, this function is inspired by the RFC8325 documentation which describes + * the mapping between DSCP and 802.11 User Priority (UP) values. + * + * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value + */ +int ietf_dscp_to_ieee8021q_tt(u8 dscp) +{ + switch (dscp) { + case DSCP_CS0: + /* Comment from RFC8325: + * [RFC4594], Section 4.8, recommends High-Throughput Data be marked + * AF1x (that is, AF11, AF12, and AF13, according to the rules defined + * in [RFC2475]). + * + * By default (as described in Section 2.3), High-Throughput Data will + * map to UP 1 and, thus, to the Background Access Category (AC_BK), + * which is contrary to the intent expressed in [RFC4594]. + * + * Unfortunately, there really is no corresponding fit for the High- + * Throughput Data service class within the constrained 4 Access + * Category [IEEE.802.11-2016] model. If the High-Throughput Data + * service class is assigned to the Best Effort Access Category (AC_BE), + * then it would contend with Low-Latency Data (while [RFC4594] + * recommends a distinction in servicing between these service classes) + * as well as with the default service class; alternatively, if it is + * assigned to the Background Access Category (AC_BK), then it would + * receive a less-than-best-effort service and contend with Low-Priority + * Data (as discussed in Section 4.2.10). + * + * As such, since there is no directly corresponding fit for the High- + * Throughput Data service class within the [IEEE.802.11-2016] model, it + * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby + * admitting it to the Best Effort Access Category (AC_BE). + * + * Note: The above text is from RFC8325, which describes the mapping + * between DSCP and 802.11 User Priority (UP) values. The mapping + * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but + * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q + * Traffic Types BE and BK.
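+ *
+ * Hence CS0 falls through, together with AF1x below, to IEEE8021Q_TT_BE.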
+ */ + case DSCP_AF11: + case DSCP_AF12: + case DSCP_AF13: + return IEEE8021Q_TT_BE; + /* Comment from RFC8325: + * RFC3662 and RFC4594 both recommend Low-Priority Data be marked + * with DSCP CS1. The Low-Priority Data service class loosely + * corresponds to the [IEEE.802.11-2016] Background Access Category + */ + case DSCP_CS1: + return IEEE8021Q_TT_BK; + case DSCP_CS2: + case DSCP_AF21: + case DSCP_AF22: + case DSCP_AF23: + return IEEE8021Q_TT_EE; + case DSCP_CS3: + case DSCP_AF31: + case DSCP_AF32: + case DSCP_AF33: + return IEEE8021Q_TT_CA; + case DSCP_CS4: + case DSCP_AF41: + case DSCP_AF42: + case DSCP_AF43: + return IEEE8021Q_TT_VI; + case DSCP_CS5: + case DSCP_EF: + case DSCP_VOICE_ADMIT: + return IEEE8021Q_TT_VO; + case DSCP_CS6: + return IEEE8021Q_TT_IC; + case DSCP_CS7: + return IEEE8021Q_TT_NC; + } + + return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp); +} +EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index c469d1c4db5d..212cde35affa 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -33,7 +33,7 @@ static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); -static unsigned char default_operstate(const struct net_device *dev) +static unsigned int default_operstate(const struct net_device *dev) { if (netif_testing(dev)) return IF_OPER_TESTING; @@ -42,12 +42,21 @@ static unsigned char default_operstate(const struct net_device *dev) * first check whether lower is indeed the source of its down state. */ if (!netif_carrier_ok(dev)) { - int iflink = dev_get_iflink(dev); struct net_device *peer; + int iflink; + + /* If called from netdev_run_todo()/linkwatch_sync_dev(), + * dev_net(dev) may already be freed, and RTNL is not held. + */ + if (dev->reg_state <= NETREG_REGISTERED) + iflink = dev_get_iflink(dev); + else + iflink = dev->ifindex; if (iflink == dev->ifindex) return IF_OPER_DOWN; + ASSERT_RTNL(); peer = __dev_get_by_index(dev_net(dev), iflink); if (!peer) return IF_OPER_DOWN; @@ -62,16 +71,13 @@ static unsigned char default_operstate(const struct net_device *dev) return IF_OPER_UP; } - static void rfc2863_policy(struct net_device *dev) { - unsigned char operstate = default_operstate(dev); + unsigned int operstate = default_operstate(dev); - if (operstate == dev->operstate) + if (operstate == READ_ONCE(dev->operstate)) return; - write_lock(&dev_base_lock); - switch(dev->link_mode) { case IF_LINK_MODE_TESTING: if (operstate == IF_OPER_UP) @@ -87,9 +93,7 @@ static void rfc2863_policy(struct net_device *dev) break; } - dev->operstate = operstate; - - write_unlock(&dev_base_lock); + WRITE_ONCE(dev->operstate, operstate); } @@ -153,9 +157,9 @@ static void linkwatch_schedule_work(int urgent) * override the existing timer. */ if (test_bit(LW_URGENT, &linkwatch_flags)) - mod_delayed_work(system_wq, &linkwatch_work, 0); + mod_delayed_work(system_dfl_wq, &linkwatch_work, 0); else - schedule_delayed_work(&linkwatch_work, delay); + queue_delayed_work(system_dfl_wq, &linkwatch_work, delay); } @@ -179,7 +183,7 @@ static void linkwatch_do_dev(struct net_device *dev) else dev_deactivate(dev); - netdev_state_change(dev); + netif_state_change(dev); } /* Note: our callers are responsible for calling netdev_tracker_free(). * This is the reason we use __dev_put() instead of dev_put().
@@ -192,7 +196,10 @@ static void __linkwatch_run_queue(int urgent_only) #define MAX_DO_DEV_PER_LOOP 100 int do_dev = MAX_DO_DEV_PER_LOOP; - struct net_device *dev; + /* Use a local list here since we add non-urgent + * events back to the global one when called with + * urgent_only=1. + */ LIST_HEAD(wrk); /* Give urgent case more budget */ @@ -218,6 +225,7 @@ static void __linkwatch_run_queue(int urgent_only) list_splice_init(&lweventlist, &wrk); while (!list_empty(&wrk) && do_dev > 0) { + struct net_device *dev; dev = list_first_entry(&wrk, struct net_device, link_watch_list); list_del_init(&dev->link_watch_list); @@ -232,7 +240,9 @@ static void __linkwatch_run_queue(int urgent_only) */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); spin_unlock_irq(&lweventlist_lock); + netdev_lock_ops(dev); linkwatch_do_dev(dev); + netdev_unlock_ops(dev); do_dev--; spin_lock_irq(&lweventlist_lock); } @@ -245,25 +255,41 @@ static void __linkwatch_run_queue(int urgent_only) spin_unlock_irq(&lweventlist_lock); } -void linkwatch_forget_dev(struct net_device *dev) +static bool linkwatch_clean_dev(struct net_device *dev) { unsigned long flags; - int clean = 0; + bool clean = false; spin_lock_irqsave(&lweventlist_lock, flags); if (!list_empty(&dev->link_watch_list)) { list_del_init(&dev->link_watch_list); - clean = 1; + clean = true; /* We must release netdev tracker under * the spinlock protection. */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); } spin_unlock_irqrestore(&lweventlist_lock, flags); - if (clean) + + return clean; +} + +void __linkwatch_sync_dev(struct net_device *dev) +{ + netdev_ops_assert_locked(dev); + + if (linkwatch_clean_dev(dev)) linkwatch_do_dev(dev); } +void linkwatch_sync_dev(struct net_device *dev) +{ + if (linkwatch_clean_dev(dev)) { + netdev_lock_ops(dev); + linkwatch_do_dev(dev); + netdev_unlock_ops(dev); + } +} /* Must be called with the rtnl semaphore held */ void linkwatch_run_queue(void) diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c new file mode 100644 index 000000000000..9e9fb25314b9 --- /dev/null +++ b/net/core/lock_debug.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright Amazon.com Inc. or its affiliates. 
*/ + +#include <linux/init.h> +#include <linux/netdevice.h> +#include <linux/notifier.h> +#include <linux/rtnetlink.h> +#include <net/net_namespace.h> +#include <net/netdev_lock.h> +#include <net/netns/generic.h> + +int netdev_debug_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + enum netdev_cmd cmd = event; + + /* Keep enum and don't add default to trigger -Werror=switch */ + switch (cmd) { + case NETDEV_XDP_FEAT_CHANGE: + netdev_assert_locked(dev); + fallthrough; + case NETDEV_CHANGE: + case NETDEV_REGISTER: + case NETDEV_UP: + netdev_ops_assert_locked(dev); + fallthrough; + case NETDEV_DOWN: + case NETDEV_REBOOT: + case NETDEV_UNREGISTER: + case NETDEV_CHANGEMTU: + case NETDEV_CHANGEADDR: + case NETDEV_PRE_CHANGEADDR: + case NETDEV_GOING_DOWN: + case NETDEV_FEAT_CHANGE: + case NETDEV_BONDING_FAILOVER: + case NETDEV_PRE_UP: + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_POST_TYPE_CHANGE: + case NETDEV_POST_INIT: + case NETDEV_PRE_UNINIT: + case NETDEV_RELEASE: + case NETDEV_NOTIFY_PEERS: + case NETDEV_JOIN: + case NETDEV_CHANGEUPPER: + case NETDEV_RESEND_IGMP: + case NETDEV_PRECHANGEMTU: + case NETDEV_CHANGEINFODATA: + case NETDEV_BONDING_INFO: + case NETDEV_PRECHANGEUPPER: + case NETDEV_CHANGELOWERSTATE: + case NETDEV_UDP_TUNNEL_PUSH_INFO: + case NETDEV_UDP_TUNNEL_DROP_INFO: + case NETDEV_CHANGE_TX_QUEUE_LEN: + case NETDEV_CVLAN_FILTER_PUSH_INFO: + case NETDEV_CVLAN_FILTER_DROP_INFO: + case NETDEV_SVLAN_FILTER_PUSH_INFO: + case NETDEV_SVLAN_FILTER_DROP_INFO: + case NETDEV_OFFLOAD_XSTATS_ENABLE: + case NETDEV_OFFLOAD_XSTATS_DISABLE: + case NETDEV_OFFLOAD_XSTATS_REPORT_USED: + case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: + ASSERT_RTNL(); + break; + + case NETDEV_CHANGENAME: + ASSERT_RTNL_NET(net); + break; + } + + return NOTIFY_DONE; +} +EXPORT_SYMBOL_NS_GPL(netdev_debug_event, "NETDEV_INTERNAL"); + +static int rtnl_net_debug_net_id; + +static int __net_init rtnl_net_debug_net_init(struct net *net) +{ + struct notifier_block *nb; + + nb = net_generic(net, rtnl_net_debug_net_id); + nb->notifier_call = netdev_debug_event; + + return register_netdevice_notifier_net(net, nb); +} + +static void __net_exit rtnl_net_debug_net_exit(struct net *net) +{ + struct notifier_block *nb; + + nb = net_generic(net, rtnl_net_debug_net_id); + unregister_netdevice_notifier_net(net, nb); +} + +static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = { + .init = rtnl_net_debug_net_init, + .exit = rtnl_net_debug_net_exit, + .id = &rtnl_net_debug_net_id, + .size = sizeof(struct notifier_block), +}; + +static struct notifier_block rtnl_net_debug_block = { + .notifier_call = netdev_debug_event, +}; + +static int __init rtnl_net_debug_init(void) +{ + int ret; + + ret = register_pernet_subsys(&rtnl_net_debug_net_ops); + if (ret) + return ret; + + ret = register_netdevice_notifier(&rtnl_net_debug_block); + if (ret) + unregister_pernet_subsys(&rtnl_net_debug_net_ops); + + return ret; +} + +subsys_initcall(rtnl_net_debug_init); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 8b6b5e72b217..9f40be0c3e71 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -8,8 +8,10 @@ #include <linux/skbuff.h> #include <linux/types.h> #include <linux/bpf.h> +#include <net/flow.h> #include <net/lwtunnel.h> #include <net/gre.h> +#include <net/ip.h> #include <net/ip6_route.h> #include <net/ipv6_stubs.h> @@ -38,13 +40,14 @@ static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) 
static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, struct dst_entry *dst, bool can_redirect) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int ret; - /* Migration disable and BH disable are needed to protect per-cpu - * redirect_info between BPF prog and skb_do_redirect(). + /* Disabling BH is needed to protect per-CPU bpf_redirect_info between + * BPF prog and skb_do_redirect(). */ - migrate_disable(); local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); @@ -60,9 +63,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, ret = BPF_OK; } else { skb_reset_mac_header(skb); - ret = skb_do_redirect(skb); - if (ret == 0) - ret = BPF_REDIRECT; + skb_do_redirect(skb); + ret = BPF_REDIRECT; } break; @@ -78,24 +80,26 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, break; } + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); - migrate_enable(); return ret; } static int bpf_lwt_input_reroute(struct sk_buff *skb) { + enum skb_drop_reason reason; int err = -EINVAL; if (skb->protocol == htons(ETH_P_IP)) { struct net_device *dev = skb_dst(skb)->dev; - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); dev_hold(dev); skb_dst_drop(skb); - err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); + reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + err = reason ? -EINVAL : 0; dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { skb_dst_drop(skb); @@ -205,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); - fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; @@ -255,7 +259,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(err)) - return err; + return net_xmit_errno(err); /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ return LWTUNNEL_XMIT_DONE; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 711cd3b4347a..f9d76d85d04f 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -23,6 +23,8 @@ #include <net/ip6_fib.h> #include <net/rtnh.h> +#include "dev.h" + DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); @@ -158,21 +160,14 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) return ret; } - rcu_read_lock(); - ops = rcu_dereference(lwtun_encaps[encap_type]); - rcu_read_unlock(); + ops = rcu_access_pointer(lwtun_encaps[encap_type]); #ifdef CONFIG_MODULES if (!ops) { const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { - __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); - rtnl_lock(); - - rcu_read_lock(); - ops = rcu_dereference(lwtun_encaps[encap_type]); - rcu_read_unlock(); + ops = rcu_access_pointer(lwtun_encaps[encap_type]); } } #endif @@ -206,8 +201,7 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, } encap_type = nla_get_u16(nla_entype); - if (lwtunnel_valid_encap_type(encap_type, - extack) != 0) + if (lwtunnel_valid_encap_type(encap_type, extack)) return -EOPNOTSUPP; } } @@ -325,82 +319,132 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); int lwtunnel_output(struct net *net, struct sock *sk, struct 
sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; + + local_bh_disable(); - if (!dst) + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || - lwtstate->type > LWTUNNEL_ENCAP_MAX) - return 0; + lwtstate->type > LWTUNNEL_ENCAP_MAX) { + ret = 0; + goto out; + } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->output)) + if (likely(ops && ops->output)) { + dev_xmit_recursion_inc(); ret = ops->output(net, sk, skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; - return ret; + goto out; drop: kfree_skb(skb); +out: + local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_output); int lwtunnel_xmit(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; - if (!dst) + local_bh_disable(); + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || - lwtstate->type > LWTUNNEL_ENCAP_MAX) - return 0; + lwtstate->type > LWTUNNEL_ENCAP_MAX) { + ret = 0; + goto out; + } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->xmit)) + if (likely(ops && ops->xmit)) { + dev_xmit_recursion_inc(); ret = ops->xmit(skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; - return ret; + goto out; drop: kfree_skb(skb); +out: + local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_xmit); int lwtunnel_input(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; - if (!dst) + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || @@ -410,8 +454,11 @@ int lwtunnel_input(struct sk_buff *skb) ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->input)) + if (likely(ops && ops->input)) { + dev_xmit_recursion_inc(); ret = ops->input(skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) diff --git a/net/core/mp_dmabuf_devmem.h b/net/core/mp_dmabuf_devmem.h new file mode 100644 index 000000000000..67cd0dd7319c --- /dev/null +++ b/net/core/mp_dmabuf_devmem.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Dmabuf device memory provider. 
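+ *
+ * (With CONFIG_NET_DEVMEM disabled, the inline stubs at the bottom of
+ * this header keep callers buildable: init returns -EOPNOTSUPP, alloc
+ * returns 0 and release reports false.)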
+ * + * Authors: Mina Almasry <almasrymina@google.com> + * + */ +#ifndef _NET_MP_DMABUF_DEVMEM_H +#define _NET_MP_DMABUF_DEVMEM_H + +#include <net/netmem.h> + +#if defined(CONFIG_NET_DEVMEM) +int mp_dmabuf_devmem_init(struct page_pool *pool); + +netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp); + +void mp_dmabuf_devmem_destroy(struct page_pool *pool); + +bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem); +#else +static inline int mp_dmabuf_devmem_init(struct page_pool *pool) +{ + return -EOPNOTSUPP; +} + +static inline netmem_ref +mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp) +{ + return 0; +} + +static inline void mp_dmabuf_devmem_destroy(struct page_pool *pool) +{ +} + +static inline bool +mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) +{ + return false; +} +#endif + +#endif /* _NET_MP_DMABUF_DEVMEM_H */ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f00a79fc301b..96a3b1a93252 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -14,7 +14,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> -#include <linux/kmemleak.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> @@ -29,6 +28,7 @@ #include <net/neighbour.h> #include <net/arp.h> #include <net/dst.h> +#include <net/ip.h> #include <net/sock.h> #include <net/netevent.h> #include <net/netlink.h> @@ -54,15 +54,34 @@ static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev); +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; #endif +static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family) +{ + int i; + + switch (family) { + default: + DEBUG_NET_WARN_ON_ONCE(1); + fallthrough; /* to avoid panic by null-ptr-deref */ + case AF_INET: + i = NEIGH_ARP_TABLE; + break; + case AF_INET6: + i = NEIGH_ND_TABLE; + break; + } + + return &dev->neighbours[i]; +} + /* - Neighbour hash table buckets are protected with rwlock tbl->lock. + Neighbour hash table buckets are protected with tbl->lock. - All the scans/updates to hash buckets MUST be made under this lock. 
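
The new mp_dmabuf_devmem.h above uses the standard compiled-out-stub idiom: when CONFIG_NET_DEVMEM is unset, static inline stubs return -EOPNOTSUPP, 0, or false with the same signatures as the real functions, so the page pool callers need no #ifdef of their own and the compiler folds the dead branches away. A self-contained sketch of the same idiom; FEATURE_DEVMEM and devmem_init() are stand-in names, not kernel symbols:

#include <errno.h>
#include <stdio.h>

/* Build with -DFEATURE_DEVMEM to compile the real implementation in. */
#ifdef FEATURE_DEVMEM
static int devmem_init(void)
{
        puts("devmem: initialized");
        return 0;
}
#else
/* Stub: identical signature, feature compiled out. */
static inline int devmem_init(void)
{
        return -EOPNOTSUPP;
}
#endif

int main(void)
{
        int err = devmem_init();

        /* The call site is identical either way; no #ifdef needed here. */
        if (err)
                fprintf(stderr, "devmem unavailable: %d\n", err);
        return 0;
}
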
- NOTHING clever should be made under this lock: no callbacks @@ -130,16 +149,17 @@ static void neigh_update_gc_list(struct neighbour *n) { bool on_gc_list, exempt_from_gc; - write_lock_bh(&n->tbl->lock); + spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; - /* remove from the gc list if new state is permanent or if neighbor - * is externally learned; otherwise entry should be on the gc list + /* remove from the gc list if new state is permanent or if neighbor is + * externally learned / validated; otherwise entry should be on the gc + * list */ exempt_from_gc = n->nud_state & NUD_PERMANENT || - n->flags & NTF_EXT_LEARNED; + n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); on_gc_list = !list_empty(&n->gc_list); if (exempt_from_gc && on_gc_list) { @@ -152,14 +172,14 @@ static void neigh_update_gc_list(struct neighbour *n) } out: write_unlock(&n->lock); - write_unlock_bh(&n->tbl->lock); + spin_unlock_bh(&n->tbl->lock); } static void neigh_update_managed_list(struct neighbour *n) { bool on_managed_list, add_to_managed; - write_lock_bh(&n->tbl->lock); + spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; @@ -173,7 +193,7 @@ static void neigh_update_managed_list(struct neighbour *n) list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); - write_unlock_bh(&n->tbl->lock); + spin_unlock_bh(&n->tbl->lock); } static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, @@ -186,6 +206,7 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; + ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? NTF_EXT_VALIDATED : 0; if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) @@ -203,20 +224,24 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, *notify = 1; *managed_update = true; } + if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) { + if (ndm_flags & NTF_EXT_VALIDATED) + neigh->flags |= NTF_EXT_VALIDATED; + else + neigh->flags &= ~NTF_EXT_VALIDATED; + *notify = 1; + *gc_update = true; + } } -static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, - struct neigh_table *tbl) +bool neigh_remove_one(struct neighbour *n) { bool retval = false; write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { - struct neighbour *neigh; - - neigh = rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock)); - rcu_assign_pointer(*np, neigh); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); retval = true; } @@ -226,39 +251,19 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, return retval; } -bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) -{ - struct neigh_hash_table *nht; - void *pkey = ndel->primary_key; - u32 hash_val; - struct neighbour *n; - struct neighbour __rcu **np; - - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd); - hash_val = hash_val >> (32 - nht->hash_shift); - - np = &nht->hash_buckets[hash_val]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock)))) { - if (n == ndel) - return neigh_del(n, np, tbl); - np = &n->next; - } - return false; -} - static int neigh_forced_gc(struct neigh_table *tbl) { - int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2; + int max_clean = 
atomic_read(&tbl->gc_entries) - + READ_ONCE(tbl->gc_thresh2); + u64 tmax = ktime_get_ns() + NSEC_PER_MSEC; unsigned long tref = jiffies - 5 * HZ; struct neighbour *n, *tmp; int shrunk = 0; + int loop = 0; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { if (refcount_read(&n->refcnt) == 1) { @@ -269,27 +274,42 @@ static int neigh_forced_gc(struct neigh_table *tbl) (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || - time_after(tref, n->updated)) + !time_in_range(n->updated, tref, jiffies)) remove = true; write_unlock(&n->lock); - if (remove && neigh_remove_one(n, tbl)) + if (remove && neigh_remove_one(n)) shrunk++; if (shrunk >= max_clean) break; + if (++loop == 16) { + if (ktime_get_ns() > tmax) + goto unlock; + loop = 0; + } } } - tbl->last_flush = jiffies; - - write_unlock_bh(&tbl->lock); + WRITE_ONCE(tbl->last_flush, jiffies); +unlock: + spin_unlock_bh(&tbl->lock); return shrunk; } static void neigh_add_timer(struct neighbour *n, unsigned long when) { + /* Use safe distance from the jiffies - LONG_MAX point while timer + * is running in DELAY/PROBE state but still show to user space + * large times in the past. + */ + unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); + neigh_hold(n); + if (!time_in_range(n->confirmed, mint, jiffies)) + n->confirmed = mint; + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); @@ -300,7 +320,7 @@ static void neigh_add_timer(struct neighbour *n, unsigned long when) static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && - del_timer(&n->timer)) { + timer_delete(&n->timer)) { neigh_release(n); return 1; } @@ -359,78 +379,104 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, } } +static void neigh_flush_one(struct neighbour *n) +{ + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); + + write_lock(&n->lock); + + neigh_del_timer(n); + neigh_mark_dead(n); + + if (refcount_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + * We must destroy neighbour entry, + * but someone still uses it. + * + * The destroy will be delayed until + * the last user releases us, but + * we must kill timers etc. and move + * it to safe state. 
+ */ + __skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; + WRITE_ONCE(n->output, neigh_blackhole); + + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + + neigh_dbg(2, "neigh %p is stray\n", n); + } + + write_unlock(&n->lock); + + neigh_cleanup_and_release(n); +} + static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - int i; + struct hlist_head *dev_head; + struct hlist_node *tmp; + struct neighbour *n; + + dev_head = neigh_get_dev_table(dev, tbl->family); + + hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { + if (skip_perm && + (n->nud_state & NUD_PERMANENT || + n->flags & NTF_EXT_VALIDATED)) + continue; + + neigh_flush_one(n); + } +} + +static void neigh_flush_table(struct neigh_table *tbl) +{ struct neigh_hash_table *nht; + int i; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (i = 0; i < (1 << nht->hash_shift); i++) { + struct hlist_node *tmp; struct neighbour *n; - struct neighbour __rcu **np = &nht->hash_buckets[i]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { - if (dev && n->dev != dev) { - np = &n->next; - continue; - } - if (skip_perm && n->nud_state & NUD_PERMANENT) { - np = &n->next; - continue; - } - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); - write_lock(&n->lock); - neigh_del_timer(n); - neigh_mark_dead(n); - if (refcount_read(&n->refcnt) != 1) { - /* The most unpleasant situation. - We must destroy neighbour entry, - but someone still uses it. - - The destroy will be delayed until - the last user releases us, but - we must kill timers etc. and move - it to safe state. - */ - __skb_queue_purge(&n->arp_queue); - n->arp_queue_len_bytes = 0; - n->output = neigh_blackhole; - if (n->nud_state & NUD_VALID) - n->nud_state = NUD_NOARP; - else - n->nud_state = NUD_NONE; - neigh_dbg(2, "neigh %p is stray\n", n); - } - write_unlock(&n->lock); - neigh_cleanup_and_release(n); - } + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) + neigh_flush_one(n); } } void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, false); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - write_lock_bh(&tbl->lock); - neigh_flush_dev(tbl, dev, skip_perm); - pneigh_ifdown_and_unlock(tbl, dev); + spin_lock_bh(&tbl->lock); + if (likely(dev)) { + neigh_flush_dev(tbl, dev, skip_perm); + } else { + DEBUG_NET_WARN_ON_ONCE(skip_perm); + neigh_flush_table(tbl); + } + spin_unlock_bh(&tbl->lock); + + pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? 
dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) - del_timer_sync(&tbl->proxy_timer); + timer_delete_sync(&tbl->proxy_timer); return 0; } @@ -454,17 +500,17 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, { struct neighbour *n = NULL; unsigned long now = jiffies; - int entries; + int entries, gc_thresh3; if (exempt_from_gc) goto do_alloc; entries = atomic_inc_return(&tbl->gc_entries) - 1; - if (entries >= tbl->gc_thresh3 || - (entries >= tbl->gc_thresh2 && - time_after(now, tbl->last_flush + 5 * HZ))) { - if (!neigh_forced_gc(tbl) && - entries >= tbl->gc_thresh3) { + gc_thresh3 = READ_ONCE(tbl->gc_thresh3); + if (entries >= gc_thresh3 || + (entries >= READ_ONCE(tbl->gc_thresh2) && + time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) { + if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) { net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id); NEIGH_CACHE_STAT_INC(tbl, table_fulls); @@ -512,27 +558,21 @@ static void neigh_get_hash_rnd(u32 *x) static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { - size_t size = (1 << shift) * sizeof(struct neighbour *); + size_t size = (1 << shift) * sizeof(struct hlist_head); + struct hlist_head *hash_heads; struct neigh_hash_table *ret; - struct neighbour __rcu **buckets; int i; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; - if (size <= PAGE_SIZE) { - buckets = kzalloc(size, GFP_ATOMIC); - } else { - buckets = (struct neighbour __rcu **) - __get_free_pages(GFP_ATOMIC | __GFP_ZERO, - get_order(size)); - kmemleak_alloc(buckets, size, 1, GFP_ATOMIC); - } - if (!buckets) { + + hash_heads = kzalloc(size, GFP_ATOMIC); + if (!hash_heads) { kfree(ret); return NULL; } - ret->hash_buckets = buckets; + ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); @@ -544,15 +584,8 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); - size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); - struct neighbour __rcu **buckets = nht->hash_buckets; - if (size <= PAGE_SIZE) { - kfree(buckets); - } else { - kmemleak_free(buckets); - free_pages((unsigned long)buckets, get_order(size)); - } + kfree(nht->hash_heads); kfree(nht); } @@ -571,24 +604,17 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { - struct neighbour *n, *next; + struct hlist_node *tmp; + struct neighbour *n; - for (n = rcu_dereference_protected(old_nht->hash_buckets[i], - lockdep_is_held(&tbl->lock)); - n != NULL; - n = next) { + neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); - next = rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock)); - rcu_assign_pointer(n->next, - rcu_dereference_protected( - new_nht->hash_buckets[hash], - lockdep_is_held(&tbl->lock))); - rcu_assign_pointer(new_nht->hash_buckets[hash], n); + hlist_del_rcu(&n->hash); + hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } } @@ -604,7 +630,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, NEIGH_CACHE_STAT_INC(tbl, lookups); - rcu_read_lock_bh(); + rcu_read_lock(); n = __neigh_lookup_noref(tbl, pkey, dev); if (n) { if (!refcount_inc_not_zero(&n->refcnt)) @@ -612,42 +638,11 @@ struct neighbour *neigh_lookup(struct 
neigh_table *tbl, const void *pkey, NEIGH_CACHE_STAT_INC(tbl, hits); } - rcu_read_unlock_bh(); + rcu_read_unlock(); return n; } EXPORT_SYMBOL(neigh_lookup); -struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, - const void *pkey) -{ - struct neighbour *n; - unsigned int key_len = tbl->key_len; - u32 hash_val; - struct neigh_hash_table *nht; - - NEIGH_CACHE_STAT_INC(tbl, lookups); - - rcu_read_lock_bh(); - nht = rcu_dereference_bh(tbl->nht); - hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift); - - for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); - n != NULL; - n = rcu_dereference_bh(n->next)) { - if (!memcmp(n->primary_key, pkey, key_len) && - net_eq(dev_net(n->dev), net)) { - if (!refcount_inc_not_zero(&n->refcnt)) - n = NULL; - NEIGH_CACHE_STAT_INC(tbl, hits); - break; - } - } - - rcu_read_unlock_bh(); - return n; -} -EXPORT_SYMBOL(neigh_lookup_nodev); - static struct neighbour * ___neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, u32 flags, @@ -692,7 +687,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); @@ -706,11 +701,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, goto out_tbl_unlock; } - for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], - lockdep_is_held(&tbl->lock)); - n1 != NULL; - n1 = rcu_dereference_protected(n1->next, - lockdep_is_held(&tbl->lock))) { + neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) { if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); @@ -726,17 +717,18 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); - rcu_assign_pointer(n->next, - rcu_dereference_protected(nht->hash_buckets[hash_val], - lockdep_is_held(&tbl->lock))); - rcu_assign_pointer(nht->hash_buckets[hash_val], n); - write_unlock_bh(&tbl->lock); + hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); + + hlist_add_head_rcu(&n->dev_list, + neigh_get_dev_table(dev, tbl->family)); + + spin_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); out_neigh_release: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); @@ -747,7 +739,9 @@ out_neigh_release: struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { - return ___neigh_create(tbl, pkey, dev, 0, false, want_ref); + bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); + + return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); } EXPORT_SYMBOL(__neigh_create); @@ -761,54 +755,53 @@ static u32 pneigh_hash(const void *pkey, unsigned int key_len) return hash_val; } -static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, - struct net *net, - const void *pkey, - unsigned int key_len, - struct net_device *dev) +struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev) { + struct pneigh_entry *n; + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); + n = rcu_dereference_check(tbl->phash_buckets[hash_val], + lockdep_is_held(&tbl->phash_lock)); + while 
(n) { if (!memcmp(n->key, pkey, key_len) && net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; - n = n->next; - } - return NULL; -} -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, struct net_device *dev) -{ - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock)); + } - return __pneigh_lookup_1(tbl->phash_buckets[hash_val], - net, pkey, key_len, dev); + return NULL; } -EXPORT_SYMBOL_GPL(__pneigh_lookup); +EXPORT_IPV6_MOD(pneigh_lookup); -struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, - struct net_device *dev, int creat) +int pneigh_create(struct neigh_table *tbl, struct net *net, + const void *pkey, struct net_device *dev, + u32 flags, u8 protocol, bool permanent) { struct pneigh_entry *n; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); - - read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], - net, pkey, key_len, dev); - read_unlock_bh(&tbl->lock); + unsigned int key_len; + u32 hash_val; + int err = 0; - if (n || !creat) - goto out; + mutex_lock(&tbl->phash_lock); - ASSERT_RTNL(); + n = pneigh_lookup(tbl, net, pkey, dev); + if (n) + goto update; + key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); - if (!n) + if (!n) { + err = -ENOBUFS; goto out; + } write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); @@ -818,81 +811,104 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, if (tbl->pconstructor && tbl->pconstructor(n)) { netdev_put(dev, &n->dev_tracker); kfree(n); - n = NULL; + err = -ENOBUFS; goto out; } - write_lock_bh(&tbl->lock); + hash_val = pneigh_hash(pkey, key_len); n->next = tbl->phash_buckets[hash_val]; - tbl->phash_buckets[hash_val] = n; - write_unlock_bh(&tbl->lock); + rcu_assign_pointer(tbl->phash_buckets[hash_val], n); +update: + WRITE_ONCE(n->flags, flags); + n->permanent = permanent; + WRITE_ONCE(n->protocol, protocol); out: - return n; + mutex_unlock(&tbl->phash_lock); + return err; } -EXPORT_SYMBOL(pneigh_lookup); +static void pneigh_destroy(struct rcu_head *rcu) +{ + struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu); + + netdev_put(n->dev, &n->dev_tracker); + kfree(n); +} int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { - struct pneigh_entry *n, **np; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + struct pneigh_entry *n, __rcu **np; + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); + + mutex_lock(&tbl->phash_lock); - write_lock_bh(&tbl->lock); - for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + for (np = &tbl->phash_buckets[hash_val]; + (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { - *np = n->next; - write_unlock_bh(&tbl->lock); + rcu_assign_pointer(*np, n->next); + + mutex_unlock(&tbl->phash_lock); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); return 0; } } - write_unlock_bh(&tbl->lock); + + mutex_unlock(&tbl->phash_lock); return -ENOENT; } -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev) +static void pneigh_ifdown(struct neigh_table *tbl, struct 
net_device *dev, + bool skip_perm) { - struct pneigh_entry *n, **np, *freelist = NULL; + struct pneigh_entry *n, __rcu **np; + LIST_HEAD(head); u32 h; + mutex_lock(&tbl->phash_lock); + for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, 1)) != NULL) { + if (skip_perm && n->permanent) + goto skip; if (!dev || n->dev == dev) { - *np = n->next; - n->next = freelist; - freelist = n; + rcu_assign_pointer(*np, n->next); + list_add(&n->free_node, &head); continue; } +skip: np = &n->next; } } - write_unlock_bh(&tbl->lock); - while ((n = freelist)) { - freelist = n->next; - n->next = NULL; + + mutex_unlock(&tbl->phash_lock); + + while (!list_empty(&head)) { + n = list_first_entry(&head, typeof(*n), free_node); + list_del(&n->free_node); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); } - return -ENOENT; } -static void neigh_parms_destroy(struct neigh_parms *parms); - static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) - neigh_parms_destroy(parms); + kfree(parms); } /* @@ -941,7 +957,7 @@ static void neigh_suspect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is suspected\n", neigh); - neigh->output = neigh->ops->output; + WRITE_ONCE(neigh->output, neigh->ops->output); } /* Neighbour state is OK; @@ -953,20 +969,20 @@ static void neigh_connect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is connected\n", neigh); - neigh->output = neigh->ops->connected_output; + WRITE_ONCE(neigh->output, neigh->ops->connected_output); } static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); + struct neigh_hash_table *nht; + struct hlist_node *tmp; struct neighbour *n; - struct neighbour __rcu **np; unsigned int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); @@ -976,55 +992,53 @@ static void neigh_periodic_work(struct work_struct *work) if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; - tbl->last_rand = jiffies; + + WRITE_ONCE(tbl->last_rand, jiffies); list_for_each_entry(p, &tbl->parms_list, list) - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); } - if (atomic_read(&tbl->entries) < tbl->gc_thresh1) + if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { - np = &nht->hash_buckets[i]; - - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; write_lock(&n->lock); state = n->nud_state; if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || - (n->flags & NTF_EXT_LEARNED)) { + (n->flags & + (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) { write_unlock(&n->lock); - goto next_elt; + continue; } - if (time_before(n->used, n->confirmed)) + if (time_before(n->used, n->confirmed) && + time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || - time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { - *np = n->next; + !time_in_range_open(jiffies, n->used, + n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { + 
hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); - -next_elt: - np = &n->next; } /* * It's fine to release lock here, even if hash table * grows while we are preempted. */ - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); cond_resched(); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } @@ -1035,7 +1049,7 @@ out: */ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) @@ -1090,7 +1104,7 @@ static void neigh_probe(struct neighbour *neigh) static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; - struct neighbour *neigh = from_timer(neigh, t, timer); + struct neighbour *neigh = timer_container_of(neigh, t, timer); unsigned int state; int notify = 0; @@ -1112,13 +1126,13 @@ static void neigh_timer_handler(struct timer_list *t) neigh->used + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is delayed\n", neigh); - neigh->nud_state = NUD_DELAY; + WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_suspect(neigh); next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); } else { neigh_dbg(2, "neigh %p is suspected\n", neigh); - neigh->nud_state = NUD_STALE; + WRITE_ONCE(neigh->nud_state, NUD_STALE); neigh->updated = jiffies; neigh_suspect(neigh); notify = 1; @@ -1128,14 +1142,14 @@ static void neigh_timer_handler(struct timer_list *t) neigh->confirmed + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is now reachable\n", neigh); - neigh->nud_state = NUD_REACHABLE; + WRITE_ONCE(neigh->nud_state, NUD_REACHABLE); neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { neigh_dbg(2, "neigh %p is probed\n", neigh); - neigh->nud_state = NUD_PROBE; + WRITE_ONCE(neigh->nud_state, NUD_PROBE); neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; @@ -1149,9 +1163,15 @@ static void neigh_timer_handler(struct timer_list *t) if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { - neigh->nud_state = NUD_FAILED; + if (neigh->nud_state == NUD_PROBE && + neigh->flags & NTF_EXT_VALIDATED) { + WRITE_ONCE(neigh->nud_state, NUD_STALE); + neigh->updated = jiffies; + } else { + WRITE_ONCE(neigh->nud_state, NUD_FAILED); + neigh_invalidate(neigh); + } notify = 1; - neigh_invalidate(neigh); goto out; } @@ -1198,7 +1218,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); - neigh->nud_state = NUD_INCOMPLETE; + WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); neigh->updated = now; if (!immediate_ok) { next = now + 1; @@ -1210,7 +1230,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, } neigh_add_timer(neigh, next); } else { - neigh->nud_state = NUD_FAILED; + WRITE_ONCE(neigh->nud_state, NUD_FAILED); neigh->updated = jiffies; write_unlock_bh(&neigh->lock); @@ -1220,7 +1240,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh_del_timer(neigh); - neigh->nud_state = 
NUD_DELAY; + WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); @@ -1299,6 +1319,8 @@ static void neigh_update_hhs(struct neighbour *neigh) NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. + NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed + or invalidated. Caller MUST hold reference count on the entry. */ @@ -1332,7 +1354,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, neigh_update_flags(neigh, flags, ¬ify, &gc_update, &managed_update); if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) { new = old & ~NUD_PERMANENT; - neigh->nud_state = new; + WRITE_ONCE(neigh->nud_state, new); err = 0; goto out; } @@ -1341,7 +1363,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); - neigh->nud_state = new; + WRITE_ONCE(neigh->nud_state, new); err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1420,7 +1442,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); - neigh->nud_state = new; + WRITE_ONCE(neigh->nud_state, new); notify = 1; } @@ -1461,12 +1483,13 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, * we can reinject the packet there. */ n2 = NULL; - if (dst && dst->obsolete != DST_OBSOLETE_DEAD) { + if (dst && + READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; } - n1->output(n1, skb); + READ_ONCE(n1->output)(n1, skb); if (n2) neigh_release(n2); rcu_read_unlock(); @@ -1507,7 +1530,7 @@ void __neigh_set_probe_once(struct neighbour *neigh) neigh->updated = jiffies; if (!(neigh->nud_state & NUD_FAILED)) return; - neigh->nud_state = NUD_INCOMPLETE; + WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), @@ -1576,7 +1599,7 @@ out: return rc; out_kfree_skb: rc = -EINVAL; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); goto out; } EXPORT_SYMBOL(neigh_resolve_output); @@ -1600,7 +1623,7 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) err = dev_queue_xmit(skb); else { err = -EINVAL; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); } return err; } @@ -1618,17 +1641,17 @@ static void neigh_managed_work(struct work_struct *work) managed_work.work); struct neighbour *neigh; - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } static void neigh_proxy_process(struct timer_list *t) { - struct neigh_table *tbl = from_timer(tbl, t, proxy_timer); + struct neigh_table *tbl = timer_container_of(tbl, t, proxy_timer); long sched_next = 0; unsigned long now = jiffies; struct sk_buff *skb, *n; @@ -1656,17 +1679,27 @@ static void neigh_proxy_process(struct timer_list *t) } else if (!sched_next || tdif < sched_next) sched_next = tdif; } - del_timer(&tbl->proxy_timer); + timer_delete(&tbl->proxy_timer); if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); 
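
A recurring change in the neighbour.c hunks above is pairing lockless readers with annotated writers: plain stores such as neigh->nud_state = NUD_DELAY become WRITE_ONCE(), matching READ_ONCE() at the RCU-protected readers (READ_ONCE(n1->output)(n1, skb), READ_ONCE(neigh->nud_state)), so the compiler cannot tear, fuse, or re-load the value while a reader races with the writer. Outside the kernel, a relaxed C11 atomic is the closest portable stand-in for those macros; a sketch under that assumption:

#include <stdatomic.h>
#include <stdio.h>

#define NUD_STALE 0x04
#define NUD_DELAY 0x08

/* Field read without a lock; _Atomic with relaxed ordering plays the role
 * of READ_ONCE()/WRITE_ONCE(): one untorn access, no compiler re-reads. */
static _Atomic unsigned char nud_state = NUD_STALE;

static void writer_path(void)
{
        /* WRITE_ONCE(neigh->nud_state, NUD_DELAY) — under neigh->lock in the kernel */
        atomic_store_explicit(&nud_state, NUD_DELAY, memory_order_relaxed);
}

static void reader_path(void)
{
        /* READ_ONCE(neigh->nud_state) — lockless RCU-side reader */
        unsigned char state = atomic_load_explicit(&nud_state,
                                                   memory_order_relaxed);

        printf("observed state 0x%02x\n", state);
}

int main(void)
{
        reader_path();
        writer_path();
        reader_path();
        return 0;
}
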
spin_unlock(&tbl->proxy_queue.lock); } +static unsigned long neigh_proxy_delay(struct neigh_parms *p) +{ + /* If proxy_delay is zero, do not call get_random_u32_below() + * as it is undefined behavior. + */ + unsigned long proxy_delay = NEIGH_VAR(p, PROXY_DELAY); + + return proxy_delay ? + jiffies + get_random_u32_below(proxy_delay) : jiffies; +} + void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { - unsigned long sched_next = jiffies + - get_random_u32_below(NEIGH_VAR(p, PROXY_DELAY)); + unsigned long sched_next = neigh_proxy_delay(p); if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); @@ -1677,7 +1710,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); - if (del_timer(&tbl->proxy_timer)) { + if (timer_delete(&tbl->proxy_timer)) { if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } @@ -1715,8 +1748,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, if (p) { p->tbl = tbl; refcount_set(&p->refcnt, 1); - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); p->qlen = 0; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; @@ -1729,9 +1761,9 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, return NULL; } - write_lock_bh(&tbl->lock); - list_add(&p->list, &tbl->parms.list); - write_unlock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); + list_add_rcu(&p->list, &tbl->parms.list); + spin_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); } @@ -1751,23 +1783,20 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { if (!parms || parms == &tbl->parms) return; - write_lock_bh(&tbl->lock); - list_del(&parms->list); + + spin_lock_bh(&tbl->lock); + list_del_rcu(&parms->list); parms->dead = 1; - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); + netdev_put(parms->dev, &parms->dev_tracker); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); -static void neigh_parms_destroy(struct neigh_parms *parms) -{ - kfree(parms); -} - static struct lock_class_key neigh_table_proxy_queue_class; -static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; +static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { @@ -1781,8 +1810,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); - tbl->parms.reachable_time = - neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); + neigh_set_reach_time(&tbl->parms); tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); @@ -1809,7 +1837,8 @@ void neigh_table_init(int index, struct neigh_table *tbl) else WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); - rwlock_init(&tbl->lock); + spin_lock_init(&tbl->lock); + mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, @@ -1824,17 +1853,23 @@ void neigh_table_init(int index, struct neigh_table *tbl) tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; - neigh_tables[index] = tbl; + rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); +/* + * Only called from ndisc_cleanup(), which means this 
is dead code + * because we no longer can unload IPv6 module. + */ int neigh_table_clear(int index, struct neigh_table *tbl) { - neigh_tables[index] = NULL; + RCU_INIT_POINTER(neigh_tables[index], NULL); + synchronize_rcu(); + /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); - del_timer_sync(&tbl->proxy_timer); + timer_delete_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) @@ -1862,10 +1897,10 @@ static struct neigh_table *neigh_find_table(int family) switch (family) { case AF_INET: - tbl = neigh_tables[NEIGH_ARP_TABLE]; + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); break; case AF_INET6: - tbl = neigh_tables[NEIGH_ND_TABLE]; + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); break; } @@ -1945,10 +1980,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, err = __neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, NETLINK_CB(skb).portid, extack); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); neigh_release(neigh); - neigh_remove_one(neigh, tbl); - write_unlock_bh(&tbl->lock); + neigh_remove_one(neigh); + spin_unlock_bh(&tbl->lock); out: return err; @@ -2020,21 +2055,13 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { - struct pneigh_entry *pn; - - if (ndm_flags & NTF_MANAGED) { + if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } - err = -ENOBUFS; - pn = pneigh_lookup(tbl, net, dst, dev, 1); - if (pn) { - pn->flags = ndm_flags; - if (protocol) - pn->protocol = protocol; - err = 0; - } + err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol, + !!(ndm->ndm_state & NUD_PERMANENT)); goto out; } @@ -2052,7 +2079,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (neigh == NULL) { bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; bool exempt_from_gc = ndm_permanent || - ndm_flags & NTF_EXT_LEARNED; + ndm_flags & (NTF_EXT_LEARNED | + NTF_EXT_VALIDATED); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; @@ -2063,10 +2091,27 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EINVAL; goto out; } + if (ndm_flags & NTF_EXT_VALIDATED) { + u8 state = ndm->ndm_state; + + /* NTF_USE and NTF_MANAGED will result in the neighbor + * being created with an invalid state (NUD_NONE). + */ + if (ndm_flags & (NTF_USE | NTF_MANAGED)) + state = NUD_NONE; + + if (!(state & NUD_VALID)) { + NL_SET_ERR_MSG(extack, + "Cannot create externally validated neighbor with an invalid state"); + err = -EINVAL; + goto out; + } + } neigh = ___neigh_create(tbl, dst, dev, ndm_flags & - (NTF_EXT_LEARNED | NTF_MANAGED), + (NTF_EXT_LEARNED | NTF_MANAGED | + NTF_EXT_VALIDATED), exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); @@ -2078,6 +2123,24 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh_release(neigh); goto out; } + if (ndm_flags & NTF_EXT_VALIDATED) { + u8 state = ndm->ndm_state; + + /* NTF_USE and NTF_MANAGED do not update the existing + * state other than clearing it if it was + * NUD_PERMANENT. 
+ */ + if (ndm_flags & (NTF_USE | NTF_MANAGED)) + state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT; + + if (!(state & NUD_VALID)) { + NL_SET_ERR_MSG(extack, + "Cannot mark neighbor as externally validated with an invalid state"); + err = -EINVAL; + neigh_release(neigh); + goto out; + } + } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) flags &= ~(NEIGH_UPDATE_F_OVERRIDE | @@ -2094,13 +2157,13 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, flags |= NEIGH_UPDATE_F_MANAGED; if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; + if (ndm_flags & NTF_EXT_VALIDATED) + flags |= NEIGH_UPDATE_F_EXT_VALIDATED; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); - if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) { + if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) neigh_event_send(neigh, NULL); - err = 0; - } neigh_release(neigh); out: return err; @@ -2115,7 +2178,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) return -ENOBUFS; if ((parms->dev && - nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || + nla_put_u32(skb, NDTPA_IFINDEX, READ_ONCE(parms->dev->ifindex))) || nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || @@ -2130,7 +2193,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || - nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time, + nla_put_msecs(skb, NDTPA_REACHABLE_TIME, READ_ONCE(parms->reachable_time), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || @@ -2167,22 +2230,21 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); - - read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || - nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD) || - nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) || - nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) || - nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3)) + nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval), + NDTA_PAD) || + nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) || + nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) || + nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3))) goto nla_put_failure; { unsigned long now = jiffies; - long flush_delta = now - tbl->last_flush; - long rand_delta = now - tbl->last_rand; + long flush_delta = now - READ_ONCE(tbl->last_flush); + long rand_delta = now - READ_ONCE(tbl->last_rand); struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, @@ -2190,14 +2252,12 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), - .ndtc_proxy_qlen = tbl->proxy_queue.qlen, + .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; - rcu_read_lock_bh(); - nht = rcu_dereference_bh(tbl->nht); + nht = rcu_dereference(tbl->nht); ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); - rcu_read_unlock_bh(); if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) goto nla_put_failure; @@ -2213,17 
+2273,17 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, struct neigh_statistics *st; st = per_cpu_ptr(tbl->stats, cpu); - ndst.ndts_allocs += st->allocs; - ndst.ndts_destroys += st->destroys; - ndst.ndts_hash_grows += st->hash_grows; - ndst.ndts_res_failed += st->res_failed; - ndst.ndts_lookups += st->lookups; - ndst.ndts_hits += st->hits; - ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast; - ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast; - ndst.ndts_periodic_gc_runs += st->periodic_gc_runs; - ndst.ndts_forced_gc_runs += st->forced_gc_runs; - ndst.ndts_table_fulls += st->table_fulls; + ndst.ndts_allocs += READ_ONCE(st->allocs); + ndst.ndts_destroys += READ_ONCE(st->destroys); + ndst.ndts_hash_grows += READ_ONCE(st->hash_grows); + ndst.ndts_res_failed += READ_ONCE(st->res_failed); + ndst.ndts_lookups += READ_ONCE(st->lookups); + ndst.ndts_hits += READ_ONCE(st->hits); + ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast); + ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast); + ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs); + ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs); + ndst.ndts_table_fulls += READ_ONCE(st->table_fulls); } if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, @@ -2235,12 +2295,10 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, if (neightbl_fill_parms(skb, &tbl->parms) < 0) goto nla_put_failure; - read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; nla_put_failure: - read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } @@ -2259,8 +2317,6 @@ static int neightbl_fill_param_info(struct sk_buff *skb, return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); - - read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; @@ -2269,11 +2325,9 @@ static int neightbl_fill_param_info(struct sk_buff *skb, neightbl_fill_parms(skb, parms) < 0) goto errout; - read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; errout: - read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } @@ -2290,6 +2344,7 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, + [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, @@ -2309,9 +2364,9 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct nlattr *tb[NDTA_MAX + 1]; struct neigh_table *tbl; struct ndtmsg *ndtmsg; - struct nlattr *tb[NDTA_MAX+1]; bool found = false; int err, tidx; @@ -2327,26 +2382,33 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, ndtmsg = nlmsg_data(nlh); + rcu_read_lock(); + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { - tbl = neigh_tables[tidx]; + tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; + if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { found = true; break; } } - if (!found) - return -ENOENT; + if (!found) { + rcu_read_unlock(); + err = -ENOENT; + goto errout; + } /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. 
*/ - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); if (tb[NDTA_PARMS]) { struct nlattr *tbp[NDTPA_MAX+1]; @@ -2409,8 +2471,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, * only be effective after the next time neigh_periodic_work * decides to recompute it (can be multiple minutes) */ - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); break; case NDTPA_GC_STALETIME: NEIGH_VAR_SET(p, GC_STALETIME, @@ -2452,21 +2513,22 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout_tbl_lock; if (tb[NDTA_THRESH1]) - tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]); + WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1])); if (tb[NDTA_THRESH2]) - tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]); + WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2])); if (tb[NDTA_THRESH3]) - tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]); + WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3])); if (tb[NDTA_GC_INTERVAL]) - tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]); + WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL])); err = 0; errout_tbl_lock: - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); + rcu_read_unlock(); errout: return err; } @@ -2476,12 +2538,12 @@ static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, { struct ndtmsg *ndtm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) { + ndtm = nlmsg_payload(nlh, sizeof(*ndtm)); + if (!ndtm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); return -EINVAL; } - ndtm = nlmsg_data(nlh); if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); return -EINVAL; @@ -2513,10 +2575,12 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; + rcu_read_lock(); + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; - tbl = neigh_tables[tidx]; + tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; @@ -2530,7 +2594,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) nidx = 0; p = list_next_entry(&tbl->parms, list); - list_for_each_entry_from(p, &tbl->parms_list, list) { + list_for_each_entry_from_rcu(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; @@ -2550,6 +2614,8 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) neigh_skip = 0; } out: + rcu_read_unlock(); + cb->args[0] = tidx; cb->args[1] = nidx; @@ -2625,13 +2691,15 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; + u8 protocol; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; - neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT; - neigh_flags = pn->flags & NTF_OLD_MASK; + neigh_flags = READ_ONCE(pn->flags); + neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT; + neigh_flags &= NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; @@ -2645,7 +2713,8 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; - if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) + protocol = READ_ONCE(pn->protocol); + if (protocol && nla_put_u8(skb, NDA_PROTOCOL, protocol)) goto nla_put_failure; if (neigh_flags_ext && 
nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; @@ -2671,7 +2740,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) if (!master_idx) return false; - master = dev ? netdev_master_upper_dev_get(dev) : NULL; + master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another * invalid value for ifindex to denote "no master". @@ -2704,7 +2773,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, { struct net *net = sock_net(skb->sk); struct neighbour *n; - int rc, h, s_h = cb->args[1]; + int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; unsigned int flags = NLM_F_MULTI; @@ -2712,37 +2781,31 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; - rcu_read_lock_bh(); - nht = rcu_dereference_bh(tbl->nht); + nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; - for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; - n != NULL; - n = rcu_dereference_bh(n->next)) { + idx = 0; + neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; - if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - flags) < 0) { - rc = -1; + err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags); + if (err < 0) goto out; - } next: idx++; } } - rc = skb->len; out: - rcu_read_unlock_bh(); cb->args[1] = h; cb->args[2] = idx; - return rc; + return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, @@ -2751,43 +2814,38 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); - int rc, h, s_h = cb->args[3]; + int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; - read_lock_bh(&tbl->lock); - for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; - for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { + for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0; + n; + n = rcu_dereference(n->next)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; - if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, flags, tbl) < 0) { - read_unlock_bh(&tbl->lock); - rc = -1; + err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags, tbl); + if (err < 0) goto out; - } next: idx++; } } - read_unlock_bh(&tbl->lock); - rc = skb->len; out: cb->args[3] = h; cb->args[4] = idx; - return rc; - + return err; } static int neigh_valid_dump_req(const struct nlmsghdr *nlh, @@ -2801,12 +2859,12 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, if (strict_check) { struct ndmsg *ndm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); return -EINVAL; } - ndm = 
nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); @@ -2872,11 +2930,13 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); if (err < 0 && cb->strict_check) return err; + err = 0; s_t = cb->args[0]; + rcu_read_lock(); for (t = 0; t < NEIGH_NR_TABLES; t++) { - tbl = neigh_tables[t]; + tbl = rcu_dereference(neigh_tables[t]); if (!tbl) continue; @@ -2892,69 +2952,64 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) break; } + rcu_read_unlock(); cb->args[0] = t; - return skb->len; + return err; } -static int neigh_valid_get_req(const struct nlmsghdr *nlh, - struct neigh_table **tbl, - void **dst, int *dev_idx, u8 *ndm_flags, - struct netlink_ext_ack *extack) +static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) { - struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); + } + + if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) { + NL_SET_ERR_MSG(extack, "No device specified"); + return ERR_PTR(-EINVAL); } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) - return err; - - *ndm_flags = ndm->ndm_flags; - *dev_idx = ndm->ndm_ifindex; - *tbl = neigh_find_table(ndm->ndm_family); - if (*tbl == NULL) { - NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); - return -EAFNOSUPPORT; - } + return ERR_PTR(err); for (i = 0; i <= NDA_MAX; ++i) { - if (!tb[i]) - continue; - switch (i) { case NDA_DST: - if (nla_len(tb[i]) != (int)(*tbl)->key_len) { - NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); - return -EINVAL; + if (!tb[i]) { + NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST); + return ERR_PTR(-EINVAL); } - *dst = nla_data(tb[i]); break; default: + if (!tb[i]) + continue; + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } } - return 0; + return ndm; } static inline size_t neigh_nlmsg_size(void) @@ -2968,27 +3023,6 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(1); /* NDA_PROTOCOL */ } -static int neigh_get_reply(struct net *net, struct neighbour *neigh, - u32 pid, u32 seq) -{ - struct sk_buff *skb; - int err = 0; - - skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); - if (err) { - kfree_skb(skb); - goto errout; - } - - err = rtnl_unicast(skb, net, pid); -errout: - return err; -} - static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) @@ -2997,85 +3031,91 @@ static inline size_t pneigh_nlmsg_size(void) + nla_total_size(1); /* 
NDA_PROTOCOL */ } -static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, - u32 pid, u32 seq, struct neigh_table *tbl) +static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { + struct net *net = sock_net(in_skb->sk); + u32 pid = NETLINK_CB(in_skb).portid; + struct nlattr *tb[NDA_MAX + 1]; + struct net_device *dev = NULL; + u32 seq = nlh->nlmsg_seq; + struct neigh_table *tbl; + struct neighbour *neigh; struct sk_buff *skb; - int err = 0; + struct ndmsg *ndm; + void *dst; + int err; - skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + ndm = neigh_valid_get_req(nlh, tb, extack); + if (IS_ERR(ndm)) + return PTR_ERR(ndm); + + if (ndm->ndm_flags & NTF_PROXY) + skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + else + skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); if (!skb) return -ENOBUFS; - err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); - if (err) { - kfree_skb(skb); - goto errout; - } + rcu_read_lock(); - err = rtnl_unicast(skb, net, pid); -errout: - return err; -} + tbl = neigh_find_table(ndm->ndm_family); + if (!tbl) { + NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); + err = -EAFNOSUPPORT; + goto err_unlock; + } -static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(in_skb->sk); - struct net_device *dev = NULL; - struct neigh_table *tbl = NULL; - struct neighbour *neigh; - void *dst = NULL; - u8 ndm_flags = 0; - int dev_idx = 0; - int err; + if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); + err = -EINVAL; + goto err_unlock; + } - err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, - extack); - if (err < 0) - return err; + dst = nla_data(tb[NDA_DST]); - if (dev_idx) { - dev = __dev_get_by_index(net, dev_idx); + if (ndm->ndm_ifindex) { + dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); - return -ENODEV; + err = -ENODEV; + goto err_unlock; } } - if (!dst) { - NL_SET_ERR_MSG(extack, "Network address not specified"); - return -EINVAL; - } - - if (ndm_flags & NTF_PROXY) { + if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; - pn = pneigh_lookup(tbl, net, dst, dev, 0); + pn = pneigh_lookup(tbl, net, dst, dev); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); - return -ENOENT; + err = -ENOENT; + goto err_unlock; } - return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, tbl); - } - if (!dev) { - NL_SET_ERR_MSG(extack, "No device specified"); - return -EINVAL; - } + err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); + if (err) + goto err_unlock; + } else { + neigh = neigh_lookup(tbl, dst, dev); + if (!neigh) { + NL_SET_ERR_MSG(extack, "Neighbour entry not found"); + err = -ENOENT; + goto err_unlock; + } - neigh = neigh_lookup(tbl, dst, dev); - if (!neigh) { - NL_SET_ERR_MSG(extack, "Neighbour entry not found"); - return -ENOENT; + err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); + neigh_release(neigh); + if (err) + goto err_unlock; } - err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq); - - neigh_release(neigh); + rcu_read_unlock(); + return rtnl_unicast(skb, net, pid); +err_unlock: + rcu_read_unlock(); + kfree_skb(skb); return err; } @@ -3084,20 +3124,18 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void int 
chain; struct neigh_hash_table *nht; - rcu_read_lock_bh(); - nht = rcu_dereference_bh(tbl->nht); + rcu_read_lock(); + nht = rcu_dereference(tbl->nht); - read_lock(&tbl->lock); /* avoid resizes */ + spin_lock_bh(&tbl->lock); /* avoid resizes */ for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; - for (n = rcu_dereference_bh(nht->hash_buckets[chain]); - n != NULL; - n = rcu_dereference_bh(n->next)) + neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } - read_unlock(&tbl->lock); - rcu_read_unlock_bh(); + spin_unlock_bh(&tbl->lock); + rcu_read_unlock(); } EXPORT_SYMBOL(neigh_for_each); @@ -3105,29 +3143,25 @@ EXPORT_SYMBOL(neigh_for_each); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { - int chain; struct neigh_hash_table *nht; + int chain; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { + struct hlist_node *tmp; struct neighbour *n; - struct neighbour __rcu **np; - np = &nht->hash_buckets[chain]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); release = cb(n); if (release) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); - } else - np = &n->next; + } write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); @@ -3140,14 +3174,15 @@ int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; + if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; - tbl = neigh_tables[index]; + rcu_read_lock(); + tbl = rcu_dereference(neigh_tables[index]); if (!tbl) - goto out; - rcu_read_lock_bh(); + goto out_unlock; if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); @@ -3159,11 +3194,12 @@ int neigh_xmit(int index, struct net_device *dev, neigh = __neigh_create(tbl, addr, dev, false); err = PTR_ERR(neigh); if (IS_ERR(neigh)) { - rcu_read_unlock_bh(); + rcu_read_unlock(); goto out_kfree_skb; } - err = neigh->output(neigh, skb); - rcu_read_unlock_bh(); + err = READ_ONCE(neigh->output)(neigh, skb); +out_unlock: + rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { err = dev_hard_header(skb, dev, ntohs(skb->protocol), @@ -3182,43 +3218,53 @@ EXPORT_SYMBOL(neigh_xmit); #ifdef CONFIG_PROC_FS -static struct neighbour *neigh_get_first(struct seq_file *seq) +static struct neighbour *neigh_get_valid(struct seq_file *seq, + struct neighbour *n, + loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); + + if (!net_eq(dev_net(n->dev), net)) + return NULL; + + if (state->neigh_sub_iter) { + loff_t fakep = 0; + void *v; + + v = state->neigh_sub_iter(state, n, pos ? 
pos : &fakep); + if (!v) + return NULL; + if (pos) + return v; + } + + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + return n; + + if (READ_ONCE(n->nud_state) & ~NUD_NOARP) + return n; + + return NULL; +} + +static struct neighbour *neigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; struct neigh_hash_table *nht = state->nht; - struct neighbour *n = NULL; - int bucket; + struct neighbour *n, *tmp; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; - for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) { - n = rcu_dereference_bh(nht->hash_buckets[bucket]); - - while (n) { - if (!net_eq(dev_net(n->dev), net)) - goto next; - if (state->neigh_sub_iter) { - loff_t fakep = 0; - void *v; - v = state->neigh_sub_iter(state, n, &fakep); - if (!v) - goto next; - } - if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) - break; - if (n->nud_state & ~NUD_NOARP) - break; -next: - n = rcu_dereference_bh(n->next); + while (++state->bucket < (1 << nht->hash_shift)) { + neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) { + tmp = neigh_get_valid(seq, n, NULL); + if (tmp) + return tmp; } - - if (n) - break; } - state->bucket = bucket; - return n; + return NULL; } static struct neighbour *neigh_get_next(struct seq_file *seq, @@ -3226,46 +3272,28 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; - struct net *net = seq_file_net(seq); - struct neigh_hash_table *nht = state->nht; + struct neighbour *tmp; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); + if (v) return n; } - n = rcu_dereference_bh(n->next); - while (1) { - while (n) { - if (!net_eq(dev_net(n->dev), net)) - goto next; - if (state->neigh_sub_iter) { - void *v = state->neigh_sub_iter(state, n, pos); - if (v) - return n; - goto next; - } - if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) - break; - - if (n->nud_state & ~NUD_NOARP) - break; -next: - n = rcu_dereference_bh(n->next); + hlist_for_each_entry_continue(n, hash) { + tmp = neigh_get_valid(seq, n, pos); + if (tmp) { + n = tmp; + goto out; } - - if (n) - break; - - if (++state->bucket >= (1 << nht->hash_shift)) - break; - - n = rcu_dereference_bh(nht->hash_buckets[state->bucket]); } + n = neigh_get_first(seq); +out: if (n && pos) --(*pos); + return n; } @@ -3294,9 +3322,10 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { - pn = tbl->phash_buckets[bucket]; + pn = rcu_dereference(tbl->phash_buckets[bucket]); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference(pn->next); if (pn) break; } @@ -3314,15 +3343,17 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct neigh_table *tbl = state->tbl; do { - pn = pn->next; + pn = rcu_dereference(pn->next); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; - pn = tbl->phash_buckets[state->bucket]; + + pn = rcu_dereference(tbl->phash_buckets[state->bucket]); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference(pn->next); if (pn) break; } @@ -3363,17 +3394,17 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) __acquires(tbl->lock) - __acquires(rcu_bh) + __acquires(rcu) { struct neigh_seq_state *state = seq->private; state->tbl = tbl; - state->bucket = 
0; + state->bucket = -1; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); - rcu_read_lock_bh(); - state->nht = rcu_dereference_bh(tbl->nht); - read_lock(&tbl->lock); + rcu_read_lock(); + state->nht = rcu_dereference(tbl->nht); + spin_lock_bh(&tbl->lock); return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } @@ -3408,13 +3439,13 @@ EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) __releases(tbl->lock) - __releases(rcu_bh) + __releases(rcu) { struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; - read_unlock(&tbl->lock); - rcu_read_unlock_bh(); + spin_unlock_bh(&tbl->lock); + rcu_read_unlock(); } EXPORT_SYMBOL(neigh_seq_stop); @@ -3504,10 +3535,12 @@ static const struct seq_operations neigh_stat_seq_ops = { static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { - struct net *net = dev_net(n->dev); struct sk_buff *skb; int err = -ENOBUFS; + struct net *net; + rcu_read_lock(); + net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; @@ -3520,10 +3553,11 @@ static void __neigh_notify(struct neighbour *n, int type, int flags, goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); - return; + goto out; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +out: + rcu_read_unlock(); } void neigh_app_ns(struct neighbour *n) @@ -3535,7 +3569,7 @@ EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); -static int proc_unres_qlen(struct ctl_table *ctl, int write, +static int proc_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; @@ -3570,7 +3604,7 @@ static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, rcu_read_unlock(); } -static void neigh_proc_update(struct ctl_table *ctl, int write) +static void neigh_proc_update(const struct ctl_table *ctl, int write) { struct net_device *dev = ctl->extra1; struct neigh_parms *p = ctl->extra2; @@ -3587,7 +3621,7 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) neigh_copy_dflt_parms(net, p, index); } -static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, +static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3602,7 +3636,7 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } -static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write, +static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; @@ -3618,7 +3652,7 @@ static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int wr return ret; } -int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, +int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -3628,7 +3662,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, } EXPORT_SYMBOL(neigh_proc_dointvec); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, +int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, 
buffer, lenp, ppos); @@ -3638,7 +3672,7 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, } EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); -static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, +static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3648,7 +3682,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, return ret; } -int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, +int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); @@ -3658,7 +3692,7 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, } EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); -static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, +static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3668,7 +3702,7 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, return ret; } -static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, +static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3687,8 +3721,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, * only be effective after the next time neigh_periodic_work * decides to recompute it */ - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); } return ret; } @@ -3725,7 +3758,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; + struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), @@ -3776,7 +3809,6 @@ static struct neigh_sysctl_table { .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, - {}, }, }; @@ -3788,6 +3820,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, const char *dev_name_source; char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; + size_t neigh_vars_size; t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) @@ -3799,11 +3832,11 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, t->neigh_vars[i].extra2 = p; } + neigh_vars_size = ARRAY_SIZE(t->neigh_vars); if (dev) { dev_name_source = dev->name; /* Terminate the table early */ - memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, - sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); + neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; dev_name_source = "default"; @@ -3850,8 +3883,9 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", p_name, dev_name_source); - t->sysctl_header = - register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars); + t->sysctl_header = register_net_sysctl_sz(neigh_parms_net(p), + neigh_path, t->neigh_vars, + neigh_vars_size); if (!t->sysctl_header) goto free; @@ -3878,16 +3912,20 @@ EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ +static 
const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, + {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, + {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, + {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, +}; + static int __init neigh_init(void) { - rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); - - rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, - 0); - rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0); - + rtnl_register_many(neigh_rtnl_msg_handlers); return 0; } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 1ec23bf8b05c..70e0e9a3b650 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -3,52 +3,22 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/wext.h> +#include <net/hotdata.h> #include "dev.h" -#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) - -#define get_bucket(x) ((x) >> BUCKET_SPACE) -#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) -#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) - -static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) +static void *dev_seq_from_index(struct seq_file *seq, loff_t *pos) { - struct net *net = seq_file_net(seq); + unsigned long ifindex = *pos; struct net_device *dev; - struct hlist_head *h; - unsigned int count = 0, offset = get_offset(*pos); - h = &net->dev_index_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, h, index_hlist) { - if (++count == offset) - return dev; + for_each_netdev_dump(seq_file_net(seq), dev, ifindex) { + *pos = dev->ifindex; + return dev; } - return NULL; } -static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) -{ - struct net_device *dev; - unsigned int bucket; - - do { - dev = dev_from_same_bucket(seq, pos); - if (dev) - return dev; - - bucket = get_bucket(*pos) + 1; - *pos = set_bucket_offset(bucket, 1); - } while (bucket < NETDEV_HASHENTRIES); - - return NULL; -} - -/* - * This is invoked by the /proc filesystem handler to display a device - * in detail. 
- */ static void *dev_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { @@ -56,16 +26,13 @@ static void *dev_seq_start(struct seq_file *seq, loff_t *pos) if (!*pos) return SEQ_START_TOKEN; - if (get_bucket(*pos) >= NETDEV_HASHENTRIES) - return NULL; - - return dev_from_bucket(seq, pos); + return dev_seq_from_index(seq, pos); } static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return dev_from_bucket(seq, pos); + return dev_seq_from_index(seq, pos); } static void dev_seq_stop(struct seq_file *seq, void *v) @@ -115,10 +82,14 @@ static int dev_seq_show(struct seq_file *seq, void *v) return 0; } -static u32 softnet_backlog_len(struct softnet_data *sd) +static u32 softnet_input_pkt_queue_len(struct softnet_data *sd) { - return skb_queue_len_lockless(&sd->input_pkt_queue) + - skb_queue_len_lockless(&sd->process_queue); + return skb_queue_len_lockless(&sd->input_pkt_queue); +} + +static u32 softnet_process_queue_len(struct softnet_data *sd) +{ + return skb_queue_len_lockless(&sd->process_queue); } static struct softnet_data *softnet_get_online(loff_t *pos) @@ -152,6 +123,8 @@ static void softnet_seq_stop(struct seq_file *seq, void *v) static int softnet_seq_show(struct seq_file *seq, void *v) { struct softnet_data *sd = v; + u32 input_qlen = softnet_input_pkt_queue_len(sd); + u32 process_qlen = softnet_process_queue_len(sd); unsigned int flow_limit_count = 0; #ifdef CONFIG_NET_FLOW_LIMIT @@ -159,8 +132,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); + /* Pairs with WRITE_ONCE() in skb_flow_limit() */ if (fl) - flow_limit_count = fl->count; + flow_limit_count = READ_ONCE(fl->count); rcu_read_unlock(); #endif @@ -169,12 +143,16 @@ static int softnet_seq_show(struct seq_file *seq, void *v) * mapping the data a specific CPU */ seq_printf(seq, - "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - sd->processed, sd->dropped, sd->time_squeeze, 0, + "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " + "%08x %08x\n", + READ_ONCE(sd->processed), + numa_drop_read(&sd->drop_counters), + READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ - sd->received_rps, flow_limit_count, - softnet_backlog_len(sd), (int)seq->index); + READ_ONCE(sd->received_rps), flow_limit_count, + input_qlen + process_qlen, (int)seq->index, + input_qlen, process_qlen); return 0; } @@ -209,7 +187,13 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos) } } - list_for_each_entry_rcu(pt, &ptype_all, list) { + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) { + if (i == pos) + return pt; + ++i; + } + + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) { if (i == pos) return pt; ++i; @@ -234,6 +218,7 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net *net = seq_file_net(seq); struct net_device *dev; struct packet_type *pt; struct list_head *nxt; @@ -256,15 +241,22 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) goto found; } } - - nxt = ptype_all.next; - goto ptype_all; + nxt = net->ptype_all.next; + goto net_ptype_all; } - if (pt->type == htons(ETH_P_ALL)) { -ptype_all: - if (nxt != &ptype_all) + if (pt->af_packet_net) { +net_ptype_all: + if (nxt != &net->ptype_all && nxt != &net->ptype_specific) goto found; + + if (nxt == &net->ptype_all) { + /* continue with ->ptype_specific if 
it's not empty */ + nxt = net->ptype_specific.next; + if (nxt != &net->ptype_specific) + goto found; + } + hash = 0; nxt = ptype_base[0].next; } else diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ca55dd747d6c..ca878525ad7c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -23,6 +23,9 @@ #include <linux/of.h> #include <linux/of_net.h> #include <linux/cpu.h> +#include <net/netdev_lock.h> +#include <net/netdev_rx_queue.h> +#include <net/rps.h> #include "dev.h" #include "net-sysfs.h" @@ -30,13 +33,95 @@ #ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; static const char fmt_dec[] = "%d\n"; +static const char fmt_uint[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; -/* Caller holds RTNL or dev_base_lock */ +/* Caller holds RTNL, netdev->lock or RCU */ static inline int dev_isalive(const struct net_device *dev) { - return dev->reg_state <= NETREG_REGISTERED; + return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; +} + +/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active, + * when unregistering a net device and accessing associated sysfs files. The + * potential deadlock is as follows: + * + * CPU 0 CPU 1 + * + * rtnl_lock vfs_read + * unregister_netdevice_many kernfs_seq_start + * device_del / kobject_put kernfs_get_active (kn->active++) + * kernfs_drain sysfs_kf_seq_show + * wait_event( rtnl_lock + * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release + * -> waits on CPU 1 to decrease kn->active the rtnl lock. + * + * The historical fix was to use rtnl_trylock with restart_syscall to bail out + * of sysfs operations when the lock couldn't be taken. This fixed the above + * issue as it allowed CPU 1 to bail out of the ABBA situation. + * + * But it came with performance issues, as syscalls were restarted in + * loops whenever there was contention on the rtnl lock, with huge slowdowns in + * specific scenarios (e.g. lots of virtual interfaces created and userspace + * daemons querying their attributes). + * + * The idea below is to bail out of the active kernfs_node protection + * (kn->active) while trying to take the rtnl lock. + * + * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The + * net device is guaranteed to be alive if this returns successfully. + */ +static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr, + struct net_device *ndev) { + struct kernfs_node *kn; + int ret = 0; + + /* First, we hold a reference to the net device as the unregistration + * path might run in parallel. This will ensure the net device and the + * associated sysfs objects won't be freed while we try to take the rtnl + * lock. + */ + dev_hold(ndev); + /* sysfs_break_active_protection was introduced to allow self-removal of + * devices and their associated sysfs files by bailing out of the + * sysfs/kernfs protection. We do this here to allow the unregistration + * path to complete in parallel. The following takes a reference on the + * kobject and the kernfs_node being accessed. + * + * This works because we hold a reference onto the net device and the + * unregistration path will wait for us eventually in netdev_run_todo + * (outside an rtnl lock section). + */ + kn = sysfs_break_active_protection(kobj, attr); + /* We can now try to take the rtnl lock. This can't deadlock us as the + * unregistration path is able to drain sysfs files (kernfs_node) thanks + * to the above dance. + */
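For contrast, the trylock-based idiom that sysfs_rtnl_lock() replaces is worth seeing in one place. A minimal sketch, assuming a generic device attribute (example_store is an invented name; the removed "if (!rtnl_trylock()) return restart_syscall();" pairs later in this diff are the real instances):

	/* Hedged sketch of the historical pattern, not part of this patch */
	static ssize_t example_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf, size_t len)
	{
		if (!rtnl_trylock())
			return restart_syscall();	/* retry from syscall entry */

		/* ... mutate device state under RTNL ... */

		rtnl_unlock();
		return len;
	}

Every restart is a full user/kernel round trip, so under sustained RTNL contention such handlers effectively busy-loop; that is the slowdown the comment above describes, and sleeping in rtnl_lock_interruptible() instead avoids it.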
+ if (rtnl_lock_interruptible()) { + ret = -ERESTARTSYS; + goto unbreak; + } + /* Check that dismantle on the device hasn't started, otherwise deny the + * operation. + */ + if (!dev_isalive(ndev)) { + rtnl_unlock(); + ret = -ENODEV; + goto unbreak; + } + /* We are now sure the device dismantle hasn't started and that it cannot + * start before we exit the locking section, as we hold the rtnl lock. + * There's no need to keep unbreaking the sysfs protection nor to hold + * a net device reference from that point; that was only needed to take + * the rtnl lock. + */ +unbreak: + sysfs_unbreak_active_protection(kn); + dev_put(ndev); + + return ret; } /* use same locking rules as GIF* ioctl's */ @@ -47,10 +132,10 @@ static ssize_t netdev_show(const struct device *dev, struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); + rcu_read_lock(); if (dev_isalive(ndev)) ret = (*format)(ndev, buf); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } @@ -59,7 +144,7 @@ static ssize_t netdev_show(const struct device *dev, #define NETDEVICE_SHOW(field, format_string) \ static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sysfs_emit(buf, format_string, dev->field); \ + return sysfs_emit(buf, format_string, READ_ONCE(dev->field)); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -92,16 +177,46 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, if (ret) goto err; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + goto err; + + ret = (*set)(netdev, new); + if (ret == 0) + ret = len; + + rtnl_unlock(); + err: + return ret; +} + +/* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */ +static ssize_t +netdev_lock_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len, + int (*set)(struct net_device *, unsigned long)) +{ + struct net_device *netdev = to_net_dev(dev); + struct net *net = dev_net(netdev); + unsigned long new; + int ret; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + ret = kstrtoul(buf, 0, &new); + if (ret) + return ret; + + netdev_lock(netdev); if (dev_isalive(netdev)) { ret = (*set)(netdev, new); if (ret == 0) ret = len; } - rtnl_unlock(); - err: + netdev_unlock(netdev); + return ret; } @@ -124,7 +239,7 @@ static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { - return sysfs_emit(buf, fmt_dec, dev->name_assign_type); + return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type)); } static ssize_t name_assign_type_show(struct device *dev, @@ -134,24 +249,28 @@ static ssize_t name_assign_type_show(struct device *dev, struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (ndev->name_assign_type != NET_NAME_UNKNOWN) + if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN) ret = netdev_show(dev, attr, buf, format_name_assign_type); return ret; } static DEVICE_ATTR_RO(name_assign_type); -/* use same locking rules as GIFHWADDR ioctl's */ +/* use same locking rules as GIFHWADDR ioctl's (netif_get_mac_address()) */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); + down_read(&dev_addr_sem); + + rcu_read_lock(); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->dev_addr, 
ndev->addr_len); - read_unlock(&dev_base_lock); + rcu_read_unlock(); + + up_read(&dev_addr_sem); return ret; } static DEVICE_ATTR_RO(address); @@ -160,10 +279,13 @@ static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); + int ret = -EINVAL; + rcu_read_lock(); if (dev_isalive(ndev)) - return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); - return -EINVAL; + ret = sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); + rcu_read_unlock(); + return ret; } static DEVICE_ATTR_RO(broadcast); @@ -180,7 +302,7 @@ static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); /* The check is also done in change_carrier; this helps returning early - * without hitting the trylock/restart in netdev_store. + * without hitting the locking section in netdev_store. */ if (!netdev->netdev_ops->ndo_change_carrier) return -EOPNOTSUPP; @@ -192,11 +314,24 @@ static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); + int ret; - if (netif_running(netdev)) - return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - return -EINVAL; + ret = -EINVAL; + if (netif_running(netdev)) { + /* Synchronize carrier state with link watch, + * see also rtnl_getlink(). + */ + linkwatch_sync_dev(netdev); + + ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); + } + + rtnl_unlock(); + return ret; } static DEVICE_ATTR_RW(carrier); @@ -207,15 +342,17 @@ static ssize_t speed_show(struct device *dev, int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (netif_running(netdev) && netif_device_present(netdev)) { + ret = -EINVAL; + if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) @@ -233,14 +370,16 @@ static ssize_t duplex_show(struct device *dev, int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. 
*/ if (!netdev->ethtool_ops->get_link_ksettings) return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + ret = -EINVAL; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; @@ -306,11 +445,9 @@ static ssize_t operstate_show(struct device *dev, const struct net_device *netdev = to_net_dev(dev); unsigned char operstate; - read_lock(&dev_base_lock); - operstate = netdev->operstate; + operstate = READ_ONCE(netdev->operstate); if (!netif_running(netdev)) operstate = IF_OPER_DOWN; - read_unlock(&dev_base_lock); if (operstate >= ARRAY_SIZE(operstates)) return -EINVAL; /* should not happen */ @@ -390,7 +527,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { - WRITE_ONCE(dev->gro_flush_timeout, val); + netdev_set_gro_flush_timeout(dev, val); return 0; } @@ -401,13 +538,16 @@ static ssize_t gro_flush_timeout_store(struct device *dev, if (!capable(CAP_NET_ADMIN)) return -EPERM; - return netdev_store(dev, attr, buf, len, change_gro_flush_timeout); + return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout); } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) { - WRITE_ONCE(dev->napi_defer_hard_irqs, val); + if (val > S32_MAX) + return -ERANGE; + + netdev_set_defer_hard_irqs(dev, (u32)val); return 0; } @@ -418,9 +558,10 @@ static ssize_t napi_defer_hard_irqs_store(struct device *dev, if (!capable(CAP_NET_ADMIN)) return -EPERM; - return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); + return netdev_lock_store(dev, attr, buf, len, + change_napi_defer_hard_irqs); } -NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint); static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -428,7 +569,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); size_t count = len; - ssize_t ret = 0; + ssize_t ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -437,16 +578,15 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - ret = dev_set_alias(netdev, buf, count); - if (ret < 0) - goto err; - ret = len; - netdev_state_change(netdev); - } + ret = dev_set_alias(netdev, buf, count); + if (ret < 0) + goto err; + ret = len; + netdev_state_change(netdev); err: rtnl_unlock(); @@ -458,7 +598,7 @@ static ssize_t ifalias_show(struct device *dev, { const struct net_device *netdev = to_net_dev(dev); char tmp[IFALIASZ]; - ssize_t ret = 0; + ssize_t ret; ret = dev_get_alias(netdev, tmp, sizeof(tmp)); if (ret > 0) @@ -498,24 +638,17 @@ static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; - - /* The check is also done in dev_get_phys_port_id; this helps returning - * early without hitting the trylock/restart below. 
- */ - if (!netdev->netdev_ops->ndo_get_phys_port_id) - return -EOPNOTSUPP; + struct netdev_phys_item_id ppid; + ssize_t ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid; + ret = dev_get_phys_port_id(netdev, &ppid); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - ret = dev_get_phys_port_id(netdev, &ppid); - if (!ret) - ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - } rtnl_unlock(); return ret; @@ -526,25 +659,17 @@ static ssize_t phys_port_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; + char name[IFNAMSIZ]; + ssize_t ret; - /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the trylock/restart below. - */ - if (!netdev->netdev_ops->ndo_get_phys_port_name && - !netdev->devlink_port) - return -EOPNOTSUPP; - - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - char name[IFNAMSIZ]; + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sysfs_emit(buf, "%s\n", name); - ret = dev_get_phys_port_name(netdev, name, sizeof(name)); - if (!ret) - ret = sysfs_emit(buf, "%s\n", name); - } rtnl_unlock(); return ret; @@ -555,45 +680,70 @@ static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; + struct netdev_phys_item_id ppid = { }; + ssize_t ret; - /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the trylock/restart below. This works - * because recurse is false when calling dev_get_port_parent_id. 
- */ - if (!netdev->netdev_ops->ndo_get_port_parent_id && - !netdev->devlink_port) - return -EOPNOTSUPP; + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = netif_get_port_parent_id(netdev, &ppid, false); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid = { }; - - ret = dev_get_port_parent_id(netdev, &ppid, false); - if (!ret) - ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - } rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(phys_switch_id); +static struct attribute *netdev_phys_attrs[] __ro_after_init = { + &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, + &dev_attr_phys_switch_id.attr, + NULL, +}; + +static umode_t netdev_phys_is_visible(struct kobject *kobj, + struct attribute *attr, int index) +{ + struct device *dev = kobj_to_dev(kobj); + struct net_device *netdev = to_net_dev(dev); + + if (attr == &dev_attr_phys_port_id.attr) { + if (!netdev->netdev_ops->ndo_get_phys_port_id) + return 0; + } else if (attr == &dev_attr_phys_port_name.attr) { + if (!netdev->netdev_ops->ndo_get_phys_port_name && + !netdev->devlink_port) + return 0; + } else if (attr == &dev_attr_phys_switch_id.attr) { + if (!netdev->netdev_ops->ndo_get_port_parent_id && + !netdev->devlink_port) + return 0; + } + + return attr->mode; +} + +static const struct attribute_group netdev_phys_group = { + .attrs = netdev_phys_attrs, + .is_visible = netdev_phys_is_visible, +}; + static ssize_t threaded_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (!rtnl_trylock()) - return restart_syscall(); + rcu_read_lock(); if (dev_isalive(netdev)) - ret = sysfs_emit(buf, fmt_dec, netdev->threaded); + ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded)); + + rcu_read_unlock(); - rtnl_unlock(); return ret; } @@ -607,7 +757,7 @@ static int modify_napi_threaded(struct net_device *dev, unsigned long val) if (val != 0 && val != 1) return -EOPNOTSUPP; - ret = dev_set_threaded(dev, val); + ret = netif_set_threaded(dev, val); return ret; } @@ -616,7 +766,7 @@ static ssize_t threaded_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - return netdev_store(dev, attr, buf, len, modify_napi_threaded); + return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded); } static DEVICE_ATTR_RW(threaded); @@ -646,9 +796,6 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_napi_defer_hard_irqs.attr, - &dev_attr_phys_port_id.attr, - &dev_attr_phys_port_name.attr, - &dev_attr_phys_switch_id.attr, &dev_attr_proto_down.attr, &dev_attr_carrier_up_count.attr, &dev_attr_carrier_down_count.attr, @@ -668,14 +815,14 @@ static ssize_t netstat_show(const struct device *d, WARN_ON(offset > sizeof(struct rtnl_link_stats64) || offset % sizeof(u64) != 0); - read_lock(&dev_base_lock); + rcu_read_lock(); if (dev_isalive(dev)) { struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } @@ -831,42 +978,18 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) return len < PAGE_SIZE ? 
len : -EINVAL; } -static ssize_t store_rps_map(struct netdev_rx_queue *queue, - const char *buf, size_t len) +static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue, + cpumask_var_t mask) { - struct rps_map *old_map, *map; - cpumask_var_t mask; - int err, cpu, i; static DEFINE_MUTEX(rps_map_mutex); - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); - if (err) { - free_cpumask_var(mask); - return err; - } - - if (!cpumask_empty(mask)) { - cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); - cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); - if (cpumask_empty(mask)) { - free_cpumask_var(mask); - return -EINVAL; - } - } + struct rps_map *old_map, *map; + int cpu, i; map = kzalloc(max_t(unsigned int, RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), GFP_KERNEL); - if (!map) { - free_cpumask_var(mask); + if (!map) return -ENOMEM; - } i = 0; for_each_cpu_and(cpu, mask, cpu_online_mask) @@ -893,9 +1016,45 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, if (old_map) kfree_rcu(old_map, rcu); + return 0; +} + +int rps_cpumask_housekeeping(struct cpumask *mask) +{ + if (!cpumask_empty(mask)) { + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); + if (cpumask_empty(mask)) + return -EINVAL; + } + return 0; +} + +static ssize_t store_rps_map(struct netdev_rx_queue *queue, + const char *buf, size_t len) +{ + cpumask_var_t mask; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) + goto out; + + err = rps_cpumask_housekeeping(mask); + if (err) + goto out; + + err = netdev_rx_queue_set_rps_mask(queue, mask); +out: free_cpumask_var(mask); - return len; + return err ? : len; }
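The user-visible input to this path is the documented rps_cpus bitmap, e.g. "echo f > /sys/class/net/eth0/queues/rx-0/rps_cpus" to steer RPS onto CPUs 0-3. A minimal sketch of just the parsing step shared with store_rps_map() above (demo_parse_cpus is an invented name):

	/* Hedged sketch: parse a hex CPU bitmap the way store_rps_map() does */
	static int demo_parse_cpus(const char *buf, size_t len)
	{
		cpumask_var_t mask;
		int err;

		if (!alloc_cpumask_var(&mask, GFP_KERNEL))
			return -ENOMEM;

		/* "f" selects CPUs 0-3, "aa" selects CPUs 1, 3, 5 and 7 */
		err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);

		free_cpumask_var(mask);
		return err;
	}

Splitting the parse, the rps_cpumask_housekeeping() filter and the RCU publication in netdev_rx_queue_set_rps_mask() lets rx_queue_default_mask() further down reuse the publication step without going through sysfs.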
static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, @@ -907,7 +1066,7 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, rcu_read_lock(); flow_table = rcu_dereference(queue->rps_flow_table); if (flow_table) - val = (unsigned long)flow_table->mask + 1; + val = 1UL << flow_table->log; rcu_read_unlock(); return sysfs_emit(buf, "%lu\n", val); @@ -960,9 +1119,11 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, if (!table) return -ENOMEM; - table->mask = mask; - for (count = 0; count <= mask; count++) + table->log = ilog2(mask) + 1; + for (count = 0; count <= mask; count++) { table->flows[count].cpu = RPS_NO_CPU; + table->flows[count].filter = RPS_NO_FILTER; + } } else { table = NULL; } @@ -1026,7 +1187,7 @@ static const void *rx_queue_namespace(const struct kobject *kobj) struct device *dev = &queue->dev->dev; const void *ns = NULL; - if (dev->class && dev->class->ns_type) + if (dev->class && dev->class->namespace) ns = dev->class->namespace(dev); return ns; @@ -1040,20 +1201,56 @@ static void rx_queue_get_ownership(const struct kobject *kobj, net_ns_get_ownership(net, uid, gid); } -static struct kobj_type rx_queue_ktype __ro_after_init = { +static const struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, - .default_groups = rx_queue_default_groups, .namespace = rx_queue_namespace, .get_ownership = rx_queue_get_ownership, }; +static int rx_queue_default_mask(struct net_device *dev, + struct netdev_rx_queue *queue) +{ +#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL) + struct cpumask *rps_default_mask; + int res = 0; + + mutex_lock(&rps_default_mask_mutex); + + rps_default_mask = dev_net(dev)->core.rps_default_mask; + if (rps_default_mask && !cpumask_empty(rps_default_mask)) + res = netdev_rx_queue_set_rps_mask(queue, rps_default_mask); + + mutex_unlock(&rps_default_mask_mutex); + + return res; +#else + return 0; +#endif +} + static int rx_queue_add_kobject(struct net_device *dev, int index) { struct netdev_rx_queue *queue = dev->_rx + index; struct kobject *kobj = &queue->kobj; int error = 0; + /* Rx queues are cleared in rx_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. + * + * If a queue is removed while both a read (or write) operation and + * the re-addition of the same queue are pending (waiting on rtnl_lock), + * it might happen that the re-addition will execute before the read, + * making the initial removal never happen (queue's kobj refcount + * won't drop enough because of the pending read). In such a rare case, + * return to allow the removal operation to complete. + */
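The refcount mechanics behind that race are easiest to see in a standalone sketch (demo_queue and its helpers are invented names; the real rx queues embed their kobject in struct netdev_rx_queue the same way):

	/* Hedged sketch: a kobject's release() only runs at refcount zero */
	struct demo_queue {
		struct kobject kobj;
	};

	static void demo_queue_release(struct kobject *kobj)
	{
		/* A sysfs reader blocked on the rtnl lock still holds a
		 * reference, so removal parks here until the read finishes.
		 */
		kfree(container_of(kobj, struct demo_queue, kobj));
	}

	static const struct kobj_type demo_queue_ktype = {
		.release	= demo_queue_release,
		.sysfs_ops	= &kobj_sysfs_ops,
	};

Because kobject_init_and_add() must not run again before that release has executed, the state_initialized test below refuses the re-add instead of risking a double initialization.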
+ if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed"); + return -EAGAIN; + } + /* Kobject_put later will trigger rx_queue_release call which * decreases dev refcount: Take that reference here */ @@ -1065,16 +1262,27 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = rx_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) - goto err; + goto err_default_groups; } + error = rx_queue_default_mask(dev, queue); + if (error) + goto err_default_groups; + kobject_uevent(kobj, KOBJ_ADD); return error; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; } @@ -1119,12 +1327,14 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) } while (--i >= new_num) { - struct kobject *kobj = &dev->_rx[i].kobj; + struct netdev_rx_queue *queue = &dev->_rx[i]; + struct kobject *kobj = &queue->kobj; - if (!refcount_read(&dev_net(dev)->ns.count)) + if (!check_net(dev_net(dev))) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); + sysfs_remove_groups(kobj, queue->groups); kobject_put(kobj); } @@ -1163,9 +1373,11 @@ static int net_rx_queue_change_owner(struct net_device *dev, int num, */ struct netdev_queue_attribute { struct attribute attr; - ssize_t (*show)(struct netdev_queue *queue, char *buf); - ssize_t (*store)(struct netdev_queue *queue, - const char *buf, size_t len); + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len); }; #define to_netdev_queue_attr(_attr) \ container_of(_attr, struct netdev_queue_attribute, attr) @@ -1182,7 +1394,7 @@ static ssize_t netdev_queue_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - return attribute->show(queue, buf); + return attribute->show(kobj, attr, queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, @@ -1196,7 +1408,7 @@ static ssize_t netdev_queue_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - return attribute->store(queue, buf, count); + return attribute->store(kobj, attr, queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { @@ -1204,7 +1416,8 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { .store = netdev_queue_attr_store, }; -static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) +static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); @@ -1222,18 +1435,18 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue) return i; } -static ssize_t traffic_class_show(struct netdev_queue *queue, - char *buf) +static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; - int num_tc, tc; - int index; + int num_tc, tc, index, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; index = 
get_netdev_queue_index(queue); @@ -1260,24 +1473,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, } #ifdef CONFIG_XPS -static ssize_t tx_maxrate_show(struct netdev_queue *queue, - char *buf) +static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); } -static ssize_t tx_maxrate_store(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { - struct net_device *dev = queue->dev; int err, index = get_netdev_queue_index(queue); + struct net_device *dev = queue->dev; u32 rate = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; /* The check is also done later; this helps returning early without - * hitting the trylock/restart below. + * hitting the locking section below. */ if (!dev->netdev_ops->ndo_set_tx_maxrate) return -EOPNOTSUPP; @@ -1286,18 +1500,23 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue, if (err < 0) return err; - if (!rtnl_trylock()) - return restart_syscall(); + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) + return err; err = -EOPNOTSUPP; + netdev_lock_ops(dev); if (dev->netdev_ops->ndo_set_tx_maxrate) err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); + netdev_unlock_ops(dev); - rtnl_unlock(); if (!err) { queue->tx_maxrate = rate; + rtnl_unlock(); return len; } + + rtnl_unlock(); return err; } @@ -1341,16 +1560,17 @@ static ssize_t bql_set(const char *buf, const size_t count, return count; } -static ssize_t bql_show_hold_time(struct netdev_queue *queue, - char *buf) +static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } -static ssize_t bql_set_hold_time(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct dql *dql = &queue->dql; unsigned int value; @@ -1369,8 +1589,72 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); -static ssize_t bql_show_inflight(struct netdev_queue *queue, - char *buf) +static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) +{ + struct dql *dql = &queue->dql; + + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); +} + +static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) +{ + struct dql *dql = &queue->dql; + unsigned int value; + int err; + + err = kstrtouint(buf, 10, &value); + if (err < 0) + return err; + + value = msecs_to_jiffies(value); + if (value && (value < 4 || value > 4 / 2 * BITS_PER_LONG)) + return -ERANGE; + + if (!dql->stall_thrs && value) + dql->last_reap = jiffies; + /* Force last_reap to be live */ + smp_wmb(); + dql->stall_thrs = value; + + return len; +} + +static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = + __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs); + +static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) +{ + return sysfs_emit(buf, 
"%u\n", READ_ONCE(queue->dql.stall_max)); +} + +static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) +{ + WRITE_ONCE(queue->dql.stall_max, 0); + return len; +} + +static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init = + __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max); + +static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) +{ + struct dql *dql = &queue->dql; + + return sysfs_emit(buf, "%lu\n", dql->stall_cnt); +} + +static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = + __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL); + +static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; @@ -1381,13 +1665,16 @@ static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = __ATTR(inflight, 0444, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ -static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ - char *buf) \ +static ssize_t bql_show_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, char *buf) \ { \ return bql_show(buf, queue->dql.FIELD); \ } \ \ -static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ +static ssize_t bql_set_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ @@ -1407,6 +1694,9 @@ static struct attribute *dql_attrs[] __ro_after_init = { &bql_limit_min_attribute.attr, &bql_hold_time_attribute.attr, &bql_inflight_attribute.attr, + &bql_stall_thrs_attribute.attr, + &bql_stall_cnt_attribute.attr, + &bql_stall_max_attribute.attr, NULL }; @@ -1414,6 +1704,9 @@ static const struct attribute_group dql_group = { .name = "byte_queue_limits", .attrs = dql_attrs, }; +#else +/* Fake declaration, all the code using it should be dead */ +static const struct attribute_group dql_group = {}; #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS @@ -1467,19 +1760,21 @@ out_no_maps: return len < PAGE_SIZE ? len : -EINVAL; } -static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) +static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; - int len, tc; + int len, tc, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; index = get_netdev_queue_index(queue); - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; @@ -1490,18 +1785,21 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) return -EINVAL; } - /* Make sure the subordinate device can't be freed */ - get_device(&dev->dev); + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. 
+ */ + dev_hold(dev); rtnl_unlock(); len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); - put_device(&dev->dev); + dev_put(dev); return len; } -static ssize_t xps_cpus_store(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct net_device *dev = queue->dev; unsigned int index; @@ -1525,9 +1823,10 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } - if (!rtnl_trylock()) { + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { free_cpumask_var(mask); - return restart_syscall(); + return err; } err = netif_set_xps_queue(dev, mask, index); @@ -1541,26 +1840,34 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); -static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) +static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; - int tc; + int tc, ret; index = get_netdev_queue_index(queue); - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, dev); + if (ret) + return ret; tc = netdev_txq_to_tc(dev, index); + + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. + */ + dev_hold(dev); rtnl_unlock(); - if (tc < 0) - return -EINVAL; - return xps_queue_show(dev, index, tc, buf, XPS_RXQS); + ret = tc >= 0 ? xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL; + dev_put(dev); + return ret; } -static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, +static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; @@ -1584,9 +1891,10 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } - if (!rtnl_trylock()) { + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { bitmap_free(mask); - return restart_syscall(); + return err; } cpus_read_lock(); @@ -1629,7 +1937,7 @@ static const void *netdev_queue_namespace(const struct kobject *kobj) struct device *dev = &queue->dev->dev; const void *ns = NULL; - if (dev->class && dev->class->ns_type) + if (dev->class && dev->class->namespace) ns = dev->class->namespace(dev); return ns; @@ -1643,20 +1951,43 @@ static void netdev_queue_get_ownership(const struct kobject *kobj, net_ns_get_ownership(net, uid, gid); } -static struct kobj_type netdev_queue_ktype __ro_after_init = { +static const struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, - .default_groups = netdev_queue_default_groups, .namespace = netdev_queue_namespace, .get_ownership = netdev_queue_get_ownership, }; +static bool netdev_uses_bql(const struct net_device *dev) +{ + if (dev->lltx || (dev->priv_flags & IFF_NO_QUEUE)) + return false; + + return IS_ENABLED(CONFIG_BQL); +} + static int netdev_queue_add_kobject(struct net_device *dev, int index) { struct netdev_queue *queue = dev->_tx + index; struct kobject *kobj = &queue->kobj; int error = 0; + /* Tx queues are cleared in netdev_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. 
+	 *
+	 * If a queue is removed while both a read (or write) operation
+	 * and the re-addition of the same queue are pending (waiting on
+	 * rtnl_lock), it might happen that the re-addition executes
+	 * before the read, so the initial removal never completes (the
+	 * queue's kobj refcount won't drop enough because of the pending
+	 * read). In such a rare case, return to allow the removal
+	 * operation to complete.
+	 */
+	if (unlikely(kobj->state_initialized)) {
+		netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed");
+		return -EAGAIN;
+	}
+
	/* Kobject_put later will trigger netdev_queue_release call
	 * which decreases dev refcount: Take that reference here
	 */
@@ -1668,15 +1999,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
	if (error)
		goto err;

-#ifdef CONFIG_BQL
-	error = sysfs_create_group(kobj, &dql_group);
+	queue->groups = netdev_queue_default_groups;
+	error = sysfs_create_groups(kobj, queue->groups);
	if (error)
		goto err;
-#endif
+
+	if (netdev_uses_bql(dev)) {
+		error = sysfs_create_group(kobj, &dql_group);
+		if (error)
+			goto err_default_groups;
+	}

	kobject_uevent(kobj, KOBJ_ADD);

	return 0;

+err_default_groups:
+	sysfs_remove_groups(kobj, queue->groups);
 err:
	kobject_put(kobj);
	return error;
@@ -1693,9 +2031,9 @@ static int tx_queue_change_owner(struct net_device *ndev, int index,
	if (error)
		return error;

-#ifdef CONFIG_BQL
-	error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid);
-#endif
+	if (netdev_uses_bql(ndev))
+		error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid);
+
	return error;
}
#endif /* CONFIG_SYSFS */
@@ -1725,11 +2063,13 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
	while (--i >= new_num) {
		struct netdev_queue *queue = dev->_tx + i;

-		if (!refcount_read(&dev_net(dev)->ns.count))
+		if (!check_net(dev_net(dev)))
			queue->kobj.uevent_suppress = 1;
-#ifdef CONFIG_BQL
-		sysfs_remove_group(&queue->kobj, &dql_group);
-#endif
+
+		if (netdev_uses_bql(dev))
+			sysfs_remove_group(&queue->kobj, &dql_group);
+
+		sysfs_remove_groups(&queue->kobj, queue->groups);
		kobject_put(&queue->kobj);
	}

@@ -1829,8 +2169,10 @@ static void remove_queue_kobjects(struct net_device *dev)

	net_rx_queue_update_kobjects(dev, real_rx, 0);
	netdev_queue_update_kobjects(dev, real_tx, 0);

+	netdev_lock_ops(dev);
	dev->real_num_rx_queues = 0;
	dev->real_num_tx_queues = 0;
+	netdev_unlock_ops(dev);
#ifdef CONFIG_SYSFS
	kset_unregister(dev->queues_kset);
#endif
@@ -1907,7 +2249,7 @@ static void netdev_release(struct device *d)
	 * device is dead and about to be freed.
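	 * (Editor's note, not part of this patch: the backing memory is
	 * assumed to come from kvzalloc() in alloc_netdev_mqs(), roughly
	 *	dev = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT);
	 * which is why the old netdev_freemem() wrapper below is replaced
	 * by a direct kvfree().)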
*/ kfree(rcu_access_pointer(dev->ifalias)); - netdev_freemem(dev); + kvfree(dev); } static const void *net_namespace(const struct device *d) @@ -1925,7 +2267,7 @@ static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid) net_ns_get_ownership(net, uid, gid); } -static struct class net_class __ro_after_init = { +static const struct class net_class = { .name = "net", .dev_release = netdev_release, .dev_groups = net_class_groups, @@ -1975,7 +2317,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!refcount_read(&dev_net(ndev)->ns.count)) + if (!check_net(dev_net(ndev))) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); @@ -2007,6 +2349,7 @@ int netdev_register_kobject(struct net_device *ndev) groups++; *groups++ = &netstat_group; + *groups++ = &netdev_phys_group; if (wireless_group_needed(ndev)) *groups++ = &wireless_group; diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index 8a5b04c2699a..e938f25e8e86 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -11,4 +11,6 @@ int netdev_queue_update_kobjects(struct net_device *net, int netdev_change_owner(struct net_device *, const struct net *net_old, const struct net *net_new); +extern struct mutex rps_default_mask_mutex; + #endif diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c40cd8dd75c7..f2fa34b1d78d 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -23,7 +23,7 @@ #include <linux/net_dropmon.h> #include <linux/slab.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <asm/bitops.h> #define CREATE_TRACE_POINTS @@ -41,6 +41,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add); EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(br_mdb_full); #endif #if IS_ENABLED(CONFIG_PAGE_POOL) @@ -61,3 +62,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum); + +EXPORT_TRACEPOINT_SYMBOL_GPL(udp_fail_queue_rcv_skb); + +EXPORT_TRACEPOINT_SYMBOL_GPL(sk_data_ready); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 078a0a420c8a..a6e6a964a287 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -19,8 +19,10 @@ #include <linux/net_namespace.h> #include <linux/sched/task.h> #include <linux/uidgid.h> -#include <linux/cookie.h> +#include <linux/proc_fs.h> +#include <linux/nstree.h> +#include <net/aligned_data.h> #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> @@ -55,7 +57,6 @@ static bool init_net_initialized; * outside. 
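 *
 * Editor's illustration (hypothetical names, not part of this patch):
 * a typical writer registers per-netns state like
 *
 *	static unsigned int foo_net_id __read_mostly;
 *
 *	static struct pernet_operations foo_net_ops = {
 *		.init = foo_net_init,
 *		.exit = foo_net_exit,
 *		.id   = &foo_net_id,
 *		.size = sizeof(struct foo_net),
 *	};
 *
 *	register_pernet_subsys(&foo_net_ops);
 *
 * taking this rwsem internally; note that .id and .size must be given
 * together (see the WARN_ON() in register_pernet_operations() below).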
*/ DECLARE_RWSEM(pernet_ops_rwsem); -EXPORT_SYMBOL_GPL(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) @@ -64,16 +65,17 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem); static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; -DEFINE_COOKIE(net_cookie); - static struct net_generic *net_alloc_generic(void) { + unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs); + unsigned int generic_size; struct net_generic *ng; - unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + + generic_size = offsetof(struct net_generic, ptr[gen_ptrs]); ng = kzalloc(generic_size, GFP_KERNEL); if (ng) - ng->s.len = max_gen_ptrs; + ng->s.len = gen_ptrs; return ng; } @@ -121,7 +123,7 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) int err = -ENOMEM; void *data = NULL; - if (ops->id && ops->size) { + if (ops->id) { data = kzalloc(ops->size, GFP_KERNEL); if (!data) goto out; @@ -136,7 +138,7 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) if (!err) return 0; - if (ops->id && ops->size) { + if (ops->id) { ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); ng->ptr[*ops->id] = NULL; @@ -160,16 +162,45 @@ static void ops_pre_exit_list(const struct pernet_operations *ops, } } +static void ops_exit_rtnl_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + const struct pernet_operations *saved_ops = ops; + LIST_HEAD(dev_kill_list); + struct net *net; + + rtnl_lock(); + + list_for_each_entry(net, net_exit_list, exit_list) { + __rtnl_net_lock(net); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) { + if (ops->exit_rtnl) + ops->exit_rtnl(net, &dev_kill_list); + } + + __rtnl_net_unlock(net); + } + + unregister_netdevice_many(&dev_kill_list); + + rtnl_unlock(); +} + static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { - struct net *net; if (ops->exit) { + struct net *net; + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); cond_resched(); } } + if (ops->exit_batch) ops->exit_batch(net_exit_list); } @@ -178,12 +209,63 @@ static void ops_free_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; - if (ops->size && ops->id) { + + if (ops->id) { list_for_each_entry(net, net_exit_list, exit_list) kfree(net_generic(net, *ops->id)); } } +static void ops_undo_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list, + bool expedite_rcu) +{ + const struct pernet_operations *saved_ops; + bool hold_rtnl = false; + + if (!ops) + ops = list_entry(ops_list, typeof(*ops), list); + + saved_ops = ops; + + list_for_each_entry_continue_reverse(ops, ops_list, list) { + hold_rtnl |= !!ops->exit_rtnl; + ops_pre_exit_list(ops, net_exit_list); + } + + /* Another CPU might be rcu-iterating the list, wait for it. + * This needs to be before calling the exit() notifiers, so the + * rcu_barrier() after ops_undo_list() isn't sufficient alone. + * Also the pre_exit() and exit() methods need this barrier. 
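+	 *
+	 * Editor's illustration of the reader being waited for
+	 * (hypothetical, not part of this patch):
+	 *
+	 *	rcu_read_lock();
+	 *	for_each_net_rcu(net)
+	 *		... touch state that ops->exit() will free ...
+	 *	rcu_read_unlock();
+	 *
+	 * The grace period guarantees such a section has finished before
+	 * the exit() handlers run.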
+ */ + if (expedite_rcu) + synchronize_rcu_expedited(); + else + synchronize_rcu(); + + if (hold_rtnl) + ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_exit_list(ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_free_list(ops, net_exit_list); +} + +static void ops_undo_single(struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + LIST_HEAD(ops_list); + + list_add(&ops->list, &ops_list); + ops_undo_list(&ops_list, NULL, net_exit_list, false); + list_del(&ops->list); +} + /* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { @@ -233,13 +315,13 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; - if (refcount_read(&net->ns.count) == 0) + if (!check_net(net)) return NETNSA_NSID_NOT_ASSIGNED; - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); id = __peernet2id(net, peer); if (id >= 0) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); return id; } @@ -249,12 +331,12 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) * just been idr_remove()'d from there in cleanup_net(). */ if (!maybe_get_net(peer)) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); return NETNSA_NSID_NOT_ASSIGNED; } id = alloc_netid(net, peer, -1); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); put_net(peer); if (id < 0) @@ -304,31 +386,62 @@ struct net *get_net_ns_by_id(const struct net *net, int id) } EXPORT_SYMBOL_GPL(get_net_ns_by_id); -/* - * setup_net runs the initializers for the network namespace object. - */ -static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) +static __net_init void preinit_net_sysctl(struct net *net) { - /* Must be called with pernet_ops_rwsem held */ - const struct pernet_operations *ops, *saved_ops; - int error = 0; - LIST_HEAD(net_exit_list); + net->core.sysctl_somaxconn = SOMAXCONN; + /* Limits per socket sk_omem_alloc usage. + * TCP zerocopy regular usage needs 128 KB. + */ + net->core.sysctl_optmem_max = 128 * 1024; + net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; + net->core.sysctl_tstamp_allow_data = 1; + net->core.sysctl_txq_reselection = msecs_to_jiffies(1000); +} - refcount_set(&net->ns.count, 1); - ref_tracker_dir_init(&net->refcnt_tracker, 128); - ref_tracker_dir_init(&net->notrefcnt_tracker, 128); +/* init code that must occur even if setup_net() is not called. */ +static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) +{ + int ret; + + ret = ns_common_init(net); + if (ret) + return ret; refcount_set(&net->passive, 1); + ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt"); + ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt"); + get_random_bytes(&net->hash_mix, sizeof(u32)); - preempt_disable(); - net->net_cookie = gen_cookie_next(&net_cookie); - preempt_enable(); net->dev_base_seq = 1; net->user_ns = user_ns; + idr_init(&net->netns_ids); spin_lock_init(&net->nsid_lock); mutex_init(&net->ipv4.ra_mutex); +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + mutex_init(&net->rtnl_mutex); + lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); +#endif + + INIT_LIST_HEAD(&net->ptype_all); + INIT_LIST_HEAD(&net->ptype_specific); + preinit_net_sysctl(net); + return 0; +} + +/* + * setup_net runs the initializers for the network namespace object. 
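+ *
+ * Editor's sketch of the calling convention (mirrors net_ns_init()
+ * further down; not part of this patch):
+ *
+ *	down_write(&pernet_ops_rwsem);
+ *	if (setup_net(net))
+ *		... fail; ops_undo_list() has already unwound ...
+ *	up_write(&pernet_ops_rwsem);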
+ */ +static __net_init int setup_net(struct net *net) +{ + /* Must be called with pernet_ops_rwsem held */ + const struct pernet_operations *ops; + LIST_HEAD(net_exit_list); + int error = 0; + + net->net_cookie = ns_tree_gen_id(net); + list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); if (error < 0) @@ -337,6 +450,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); up_write(&net_rwsem); + ns_tree_add_raw(net); out: return error; @@ -345,46 +459,11 @@ out_undo: * for the pernet modules whose init functions did not fail. */ list_add(&net->exit_list, &net_exit_list); - saved_ops = ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_pre_exit_list(ops, &net_exit_list); - - synchronize_rcu(); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); - + ops_undo_list(&pernet_list, ops, &net_exit_list, false); rcu_barrier(); goto out; } -static int __net_init net_defaults_init_net(struct net *net) -{ - net->core.sysctl_somaxconn = SOMAXCONN; - net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; - - return 0; -} - -static struct pernet_operations net_defaults_ops = { - .init = net_defaults_init_net, -}; - -static __init int net_defaults_init(void) -{ - if (register_pernet_subsys(&net_defaults_ops)) - panic("Cannot initialize net default settings"); - - return 0; -} - -core_initcall(net_defaults_init); - #ifdef CONFIG_NET_NS static struct ucounts *inc_net_namespaces(struct user_namespace *ns) { @@ -433,7 +512,22 @@ out_free: goto out; } -static void net_free(struct net *net) +static LLIST_HEAD(defer_free_list); + +static void net_complete_free(void) +{ + struct llist_node *kill_list; + struct net *net, *next; + + /* Get the list of namespaces to free from last round. */ + kill_list = llist_del_all(&defer_free_list); + + llist_for_each_entry_safe(net, next, kill_list, defer_free_list) + kmem_cache_free(net_cachep, net); + +} + +void net_passive_dec(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); @@ -441,7 +535,8 @@ static void net_free(struct net *net) /* There should not be any trackers left there. */ ref_tracker_dir_exit(&net->notrefcnt_tracker); - kmem_cache_free(net_cachep, net); + /* Wait for an extra rcu_barrier() before final free. 
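+	 * (Editor's note, not part of this patch: cleanup_net() calls
+	 * rcu_barrier() and then net_complete_free(), so entries parked
+	 * on defer_free_list sit out at least one extra grace period
+	 * before kmem_cache_free() runs.)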
*/ + llist_add(&net->defer_free_list, &defer_free_list); } } @@ -450,10 +545,10 @@ void net_drop_ns(void *p) struct net *net = (struct net *)p; if (net) - net_free(net); + net_passive_dec(net); } -struct net *copy_net_ns(unsigned long flags, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { struct ucounts *ucounts; @@ -472,7 +567,10 @@ struct net *copy_net_ns(unsigned long flags, rv = -ENOMEM; goto dec_ucounts; } - refcount_set(&net->passive, 1); + + rv = preinit_net(net, user_ns); + if (rv < 0) + goto dec_ucounts; net->ucounts = ucounts; get_user_ns(user_ns); @@ -480,17 +578,18 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) goto put_userns; - rv = setup_net(net, user_ns); + rv = setup_net(net); up_read(&pernet_ops_rwsem); if (rv < 0) { put_userns: + ns_common_free(net); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif put_user_ns(user_ns); - net_free(net); + net_passive_dec(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); @@ -537,31 +636,34 @@ static void unhash_nsid(struct net *net, struct net *last) for_each_net(tmp) { int id; - spin_lock_bh(&tmp->nsid_lock); + spin_lock(&tmp->nsid_lock); id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); - spin_unlock_bh(&tmp->nsid_lock); + spin_unlock(&tmp->nsid_lock); if (id >= 0) rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, GFP_KERNEL); if (tmp == last) break; } - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); idr_destroy(&net->netns_ids); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); } static LLIST_HEAD(cleanup_list); +struct task_struct *cleanup_net_task; + static void cleanup_net(struct work_struct *work) { - const struct pernet_operations *ops; - struct net *net, *tmp, *last; struct llist_node *net_kill_list; + struct net *net, *tmp, *last; LIST_HEAD(net_exit_list); + WRITE_ONCE(cleanup_net_task, current); + /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); @@ -569,8 +671,10 @@ static void cleanup_net(struct work_struct *work) /* Don't let anyone else find us. */ down_write(&net_rwsem); - llist_for_each_entry(net, net_kill_list, cleanup_list) + llist_for_each_entry(net, net_kill_list, cleanup_list) { + ns_tree_remove(net); list_del_rcu(&net->list); + } /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer * to a net from net_kill_list (see peernet2id_alloc()). @@ -589,25 +693,7 @@ static void cleanup_net(struct work_struct *work) list_add_tail(&net->exit_list, &net_exit_list); } - /* Run all of the network namespace pre_exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_pre_exit_list(ops, &net_exit_list); - - /* - * Another CPU might be rcu-iterating the list, wait for it. - * This needs to be before calling the exit() notifiers, so - * the rcu_barrier() below isn't sufficient alone. - * Also the pre_exit() and exit() methods need this barrier. 
- */ - synchronize_rcu(); - - /* Run all of the network namespace exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - /* Free the net generic variables */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); + ops_undo_list(&pernet_list, NULL, &net_exit_list, true); up_read(&pernet_ops_rwsem); @@ -616,16 +702,20 @@ static void cleanup_net(struct work_struct *work) */ rcu_barrier(); + net_complete_free(); + /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); + ns_common_free(net); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif put_user_ns(net->user_ns); - net_free(net); + net_passive_dec(net); } + WRITE_ONCE(cleanup_net_task, NULL); } /** @@ -659,32 +749,33 @@ EXPORT_SYMBOL_GPL(__put_net); * get_net_ns - increment the refcount of the network namespace * @ns: common namespace (net) * - * Returns the net's common namespace. + * Returns the net's common namespace or ERR_PTR() if ref is zero. */ struct ns_common *get_net_ns(struct ns_common *ns) { - return &get_net(container_of(ns, struct net, ns))->ns; + struct net *net; + + net = maybe_get_net(container_of(ns, struct net, ns)); + if (net) + return &net->ns; + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns); struct net *get_net_ns_by_fd(int fd) { - struct file *file; - struct ns_common *ns; - struct net *net; + CLASS(fd, f)(fd); - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return ERR_CAST(file); + if (fd_empty(f)) + return ERR_PTR(-EBADF); - ns = get_proc_ns(file_inode(file)); - if (ns->ops == &netns_operations) - net = get_net(container_of(ns, struct net, ns)); - else - net = ERR_PTR(-EINVAL); + if (proc_ns_file(fd_file(f))) { + struct ns_common *ns = get_proc_ns(file_inode(fd_file(f))); + if (ns->ops == &netns_operations) + return get_net(container_of(ns, struct net, ns)); + } - fput(file); - return net; + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns_by_fd); #endif @@ -711,22 +802,37 @@ struct net *get_net_ns_by_pid(pid_t pid) } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); -static __net_init int net_ns_net_init(struct net *net) +#ifdef CONFIG_NET_NS_REFCNT_TRACKER +static void net_ns_net_debugfs(struct net *net) { -#ifdef CONFIG_NET_NS - net->ns.ops = &netns_operations; -#endif - return ns_alloc_inum(&net->ns); + ref_tracker_dir_symlink(&net->refcnt_tracker, "netns-%llx-%u-refcnt", + net->net_cookie, net->ns.inum); + ref_tracker_dir_symlink(&net->notrefcnt_tracker, "netns-%llx-%u-notrefcnt", + net->net_cookie, net->ns.inum); } -static __net_exit void net_ns_net_exit(struct net *net) +static int __init init_net_debugfs(void) { - ns_free_inum(&net->ns); + ref_tracker_dir_debugfs(&init_net.refcnt_tracker); + ref_tracker_dir_debugfs(&init_net.notrefcnt_tracker); + net_ns_net_debugfs(&init_net); + return 0; +} +late_initcall(init_net_debugfs); +#else +static void net_ns_net_debugfs(struct net *net) +{ +} +#endif + +static __net_init int net_ns_net_init(struct net *net) +{ + net_ns_net_debugfs(net); + return 0; } static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, - .exit = net_ns_net_exit, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { @@ -772,9 +878,9 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); if 
(__peernet2id(net, peer) >= 0) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); err = -EEXIST; NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, @@ -783,7 +889,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, } err = alloc_netid(net, peer, nsid); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, nlh, GFP_KERNEL); @@ -1061,7 +1167,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) end: if (net_cb.fillargs.add_ref) put_net(net_cb.tgt_net); - return err < 0 ? err : skb->len; + return err; } static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, @@ -1093,11 +1199,63 @@ out: rtnl_set_sk_err(net, RTNLGRP_NSID, err); } +#ifdef CONFIG_NET_NS +static void __init netns_ipv4_struct_check(void) +{ + /* TX readonly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_early_retrans); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_tso_win_divisor); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_tso_rtt_log); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_autocorking); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_min_snd_mss); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_notsent_lowat); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_limit_output_bytes); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_min_rtt_wlen); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_wmem); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_ip_fwd_use_pmtu); + + /* RX readonly hotpath cache line */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_moderate_rcvbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_rcvbuf_low_rtt); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_ip_early_demux); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_early_demux); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_l3mdev_accept); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_reordering); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_rmem); +} +#endif + +static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid, + .dumpit = rtnl_net_dumpid, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, +}; + void __init net_ns_init(void) { struct net_generic *ng; #ifdef CONFIG_NET_NS + netns_ipv4_struct_check(); net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, SLAB_PANIC|SLAB_ACCOUNT, NULL); @@ -1117,8 +1275,15 @@ void __init net_ns_init(void) #ifdef CONFIG_KEYS init_net.key_domain = &init_net_key_domain; #endif + /* + * This currently cannot fail as the initial network namespace + * has a static inode number. 
+ */ + if (preinit_net(&init_net, &init_user_ns)) + panic("Could not preinitialize the initial network namespace"); + down_write(&pernet_ops_rwsem); - if (setup_net(&init_net, &init_user_ns)) + if (setup_net(&init_net)) panic("Could not setup the initial network namespace"); init_net_initialized = true; @@ -1127,30 +1292,19 @@ void __init net_ns_init(void) if (register_pernet_subsys(&net_ns_ops)) panic("Could not register network namespace subsystems"); - rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, - RTNL_FLAG_DOIT_UNLOCKED); - rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, - RTNL_FLAG_DOIT_UNLOCKED); -} - -static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) -{ - ops_pre_exit_list(ops, net_exit_list); - synchronize_rcu(); - ops_exit_list(ops, net_exit_list); - ops_free_list(ops, net_exit_list); + rtnl_register_many(net_ns_rtnl_msg_handlers); } #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { + LIST_HEAD(net_exit_list); struct net *net; int error; - LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); - if (ops->init || (ops->id && ops->size)) { + if (ops->init || ops->id) { /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ @@ -1166,21 +1320,21 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - free_exit_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { - struct net *net; LIST_HEAD(net_exit_list); + struct net *net; - list_del(&ops->list); /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); - free_exit_list(ops, &net_exit_list); + list_del(&ops->list); + ops_undo_single(ops, &net_exit_list); } #else @@ -1202,8 +1356,9 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) list_del(&ops->list); } else { LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); - free_exit_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); } } @@ -1216,13 +1371,20 @@ static int register_pernet_operations(struct list_head *list, { int error; + if (WARN_ON(!!ops->id ^ !!ops->size)) + return -EINVAL; + if (ops->id) { error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID, GFP_KERNEL); if (error < 0) return error; *ops->id = error; - max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1); + /* This does not require READ_ONCE as writers already hold + * pernet_ops_rwsem. But WRITE_ONCE is needed to protect + * net_alloc_generic. + */ + WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1)); } error = __register_pernet_operations(list, ops); if (error) { @@ -1353,11 +1515,6 @@ static struct ns_common *netns_get(struct task_struct *task) return net ? 
&net->ns : NULL; } -static inline struct net *to_net_ns(struct ns_common *ns) -{ - return container_of(ns, struct net, ns); -} - static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); @@ -1384,7 +1541,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns) const struct proc_ns_operations netns_operations = { .name = "net", - .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, diff --git a/net/core/net_test.c b/net/core/net_test.c new file mode 100644 index 000000000000..9c3a590865d2 --- /dev/null +++ b/net/core/net_test.c @@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <kunit/test.h> + +/* GSO */ + +#include <linux/skbuff.h> + +static const char hdr[] = "abcdefgh"; +#define GSO_TEST_SIZE 1000 + +static void __init_skb(struct sk_buff *skb) +{ + skb_reset_mac_header(skb); + memcpy(skb_mac_header(skb), hdr, sizeof(hdr)); + + /* skb_segment expects skb->data at start of payload */ + skb_pull(skb, sizeof(hdr)); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + + /* proto is arbitrary, as long as not ETH_P_TEB or vlan */ + skb->protocol = htons(ETH_P_ATALK); + skb_shinfo(skb)->gso_size = GSO_TEST_SIZE; +} + +enum gso_test_nr { + GSO_TEST_LINEAR, + GSO_TEST_NO_GSO, + GSO_TEST_FRAGS, + GSO_TEST_FRAGS_PURE, + GSO_TEST_GSO_PARTIAL, + GSO_TEST_FRAG_LIST, + GSO_TEST_FRAG_LIST_PURE, + GSO_TEST_FRAG_LIST_NON_UNIFORM, + GSO_TEST_GSO_BY_FRAGS, +}; + +struct gso_test_case { + enum gso_test_nr id; + const char *name; + + /* input */ + unsigned int linear_len; + unsigned int nr_frags; + const unsigned int *frags; + unsigned int nr_frag_skbs; + const unsigned int *frag_skbs; + + /* output as expected */ + unsigned int nr_segs; + const unsigned int *segs; +}; + +static struct gso_test_case cases[] = { + { + .id = GSO_TEST_NO_GSO, + .name = "no_gso", + .linear_len = GSO_TEST_SIZE, + .nr_segs = 1, + .segs = (const unsigned int[]) { GSO_TEST_SIZE }, + }, + { + .id = GSO_TEST_LINEAR, + .name = "linear", + .linear_len = GSO_TEST_SIZE + GSO_TEST_SIZE + 1, + .nr_segs = 3, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 }, + }, + { + .id = GSO_TEST_FRAGS, + .name = "frags", + .linear_len = GSO_TEST_SIZE, + .nr_frags = 2, + .frags = (const unsigned int[]) { GSO_TEST_SIZE, 1 }, + .nr_segs = 3, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 }, + }, + { + .id = GSO_TEST_FRAGS_PURE, + .name = "frags_pure", + .nr_frags = 3, + .frags = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 }, + .nr_segs = 3, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 }, + }, + { + .id = GSO_TEST_GSO_PARTIAL, + .name = "gso_partial", + .linear_len = GSO_TEST_SIZE, + .nr_frags = 2, + .frags = (const unsigned int[]) { GSO_TEST_SIZE, 3 }, + .nr_segs = 2, + .segs = (const unsigned int[]) { 2 * GSO_TEST_SIZE, 3 }, + }, + { + /* commit 89319d3801d1: frag_list on mss boundaries */ + .id = GSO_TEST_FRAG_LIST, + .name = "frag_list", + .linear_len = GSO_TEST_SIZE, + .nr_frag_skbs = 2, + .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE }, + .nr_segs = 3, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE }, + }, + { + .id = GSO_TEST_FRAG_LIST_PURE, + .name = "frag_list_pure", + .nr_frag_skbs = 2, + .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE }, + .nr_segs = 2, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE }, + }, + { + /* commit 43170c4e0ba7: GRO of frag_list trains */ + .id = 
GSO_TEST_FRAG_LIST_NON_UNIFORM, + .name = "frag_list_non_uniform", + .linear_len = GSO_TEST_SIZE, + .nr_frag_skbs = 4, + .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, 1, GSO_TEST_SIZE, 2 }, + .nr_segs = 4, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE, 3 }, + }, + { + /* commit 3953c46c3ac7 ("sk_buff: allow segmenting based on frag sizes") and + * commit 90017accff61 ("sctp: Add GSO support") + * + * "there will be a cover skb with protocol headers and + * children ones containing the actual segments" + */ + .id = GSO_TEST_GSO_BY_FRAGS, + .name = "gso_by_frags", + .nr_frag_skbs = 4, + .frag_skbs = (const unsigned int[]) { 100, 200, 300, 400 }, + .nr_segs = 4, + .segs = (const unsigned int[]) { 100, 200, 300, 400 }, + }, +}; + +static void gso_test_case_to_desc(struct gso_test_case *t, char *desc) +{ + sprintf(desc, "%s", t->name); +} + +KUNIT_ARRAY_PARAM(gso_test, cases, gso_test_case_to_desc); + +static void gso_test_func(struct kunit *test) +{ + const int shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct sk_buff *skb, *segs, *cur, *next, *last; + const struct gso_test_case *tcase; + netdev_features_t features; + struct page *page; + int i; + + tcase = test->param_value; + + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + skb = build_skb(page_address(page), sizeof(hdr) + tcase->linear_len + shinfo_size); + KUNIT_ASSERT_NOT_NULL(test, skb); + __skb_put(skb, sizeof(hdr) + tcase->linear_len); + + __init_skb(skb); + + if (tcase->nr_frags) { + unsigned int pg_off = 0; + + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + page_ref_add(page, tcase->nr_frags - 1); + + for (i = 0; i < tcase->nr_frags; i++) { + skb_fill_page_desc(skb, i, page, pg_off, tcase->frags[i]); + pg_off += tcase->frags[i]; + } + + KUNIT_ASSERT_LE(test, pg_off, PAGE_SIZE); + + skb->data_len = pg_off; + skb->len += skb->data_len; + skb->truesize += skb->data_len; + } + + if (tcase->frag_skbs) { + unsigned int total_size = 0, total_true_size = 0; + struct sk_buff *frag_skb, *prev = NULL; + + for (i = 0; i < tcase->nr_frag_skbs; i++) { + unsigned int frag_size; + + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + + frag_size = tcase->frag_skbs[i]; + frag_skb = build_skb(page_address(page), + frag_size + shinfo_size); + KUNIT_ASSERT_NOT_NULL(test, frag_skb); + __skb_put(frag_skb, frag_size); + + if (prev) + prev->next = frag_skb; + else + skb_shinfo(skb)->frag_list = frag_skb; + prev = frag_skb; + + total_size += frag_size; + total_true_size += frag_skb->truesize; + } + + skb->len += total_size; + skb->data_len += total_size; + skb->truesize += total_true_size; + + if (tcase->id == GSO_TEST_GSO_BY_FRAGS) + skb_shinfo(skb)->gso_size = GSO_BY_FRAGS; + } + + features = NETIF_F_SG | NETIF_F_HW_CSUM; + if (tcase->id == GSO_TEST_GSO_PARTIAL) + features |= NETIF_F_GSO_PARTIAL; + + /* TODO: this should also work with SG, + * rather than hit BUG_ON(i >= nfrags) + */ + if (tcase->id == GSO_TEST_FRAG_LIST_NON_UNIFORM) + features &= ~NETIF_F_SG; + + segs = skb_segment(skb, features); + if (IS_ERR(segs)) { + KUNIT_FAIL(test, "segs error %pe", segs); + goto free_gso_skb; + } else if (!segs) { + KUNIT_FAIL(test, "no segments"); + goto free_gso_skb; + } + + last = segs->prev; + for (cur = segs, i = 0; cur; cur = next, i++) { + next = cur->next; + + KUNIT_ASSERT_EQ(test, cur->len, sizeof(hdr) + tcase->segs[i]); + + /* segs have skb->data pointing to the mac header */ + KUNIT_ASSERT_PTR_EQ(test, skb_mac_header(cur), 
cur->data); + KUNIT_ASSERT_PTR_EQ(test, skb_network_header(cur), cur->data + sizeof(hdr)); + + /* header was copied to all segs */ + KUNIT_ASSERT_EQ(test, memcmp(skb_mac_header(cur), hdr, sizeof(hdr)), 0); + + /* last seg can be found through segs->prev pointer */ + if (!next) + KUNIT_ASSERT_PTR_EQ(test, cur, last); + + consume_skb(cur); + } + + KUNIT_ASSERT_EQ(test, i, tcase->nr_segs); + +free_gso_skb: + consume_skb(skb); +} + +/* IP tunnel flags */ + +#include <net/ip_tunnels.h> + +struct ip_tunnel_flags_test { + const char *name; + + const u16 *src_bits; + const u16 *exp_bits; + u8 src_num; + u8 exp_num; + + __be16 exp_val; + bool exp_comp; +}; + +#define IP_TUNNEL_FLAGS_TEST(n, src, comp, eval, exp) { \ + .name = (n), \ + .src_bits = (src), \ + .src_num = ARRAY_SIZE(src), \ + .exp_comp = (comp), \ + .exp_val = (eval), \ + .exp_bits = (exp), \ + .exp_num = ARRAY_SIZE(exp), \ +} + +/* These are __be16-compatible and can be compared as is */ +static const u16 ip_tunnel_flags_1[] = { + IP_TUNNEL_KEY_BIT, + IP_TUNNEL_STRICT_BIT, + IP_TUNNEL_ERSPAN_OPT_BIT, +}; + +/* Due to the previous flags design limitation, setting either + * ``IP_TUNNEL_CSUM_BIT`` (on Big Endian) or ``IP_TUNNEL_DONT_FRAGMENT_BIT`` + * (on Little) also sets VTI/ISATAP bit. In the bitmap implementation, they + * correspond to ``BIT(16)``, which is bigger than ``U16_MAX``, but still is + * backward-compatible. + */ +#ifdef __LITTLE_ENDIAN +#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_DONT_FRAGMENT_BIT +#else +#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_CSUM_BIT +#endif + +static const u16 ip_tunnel_flags_2_src[] = { + IP_TUNNEL_CONFLICT_BIT, +}; + +static const u16 ip_tunnel_flags_2_exp[] = { + IP_TUNNEL_CONFLICT_BIT, + IP_TUNNEL_SIT_ISATAP_BIT, +}; + +/* Bits 17 and higher are not compatible with __be16 flags */ +static const u16 ip_tunnel_flags_3_src[] = { + IP_TUNNEL_VXLAN_OPT_BIT, + 17, + 18, + 20, +}; + +static const u16 ip_tunnel_flags_3_exp[] = { + IP_TUNNEL_VXLAN_OPT_BIT, +}; + +static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = { + IP_TUNNEL_FLAGS_TEST("compat", ip_tunnel_flags_1, true, + cpu_to_be16(BIT(IP_TUNNEL_KEY_BIT) | + BIT(IP_TUNNEL_STRICT_BIT) | + BIT(IP_TUNNEL_ERSPAN_OPT_BIT)), + ip_tunnel_flags_1), + IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true, + VTI_ISVTI, ip_tunnel_flags_2_exp), + IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false, + cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)), + ip_tunnel_flags_3_exp), +}; + +static void +ip_tunnel_flags_test_case_to_desc(const struct ip_tunnel_flags_test *t, + char *desc) +{ + strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE); +} +KUNIT_ARRAY_PARAM(ip_tunnel_flags_test, ip_tunnel_flags_test, + ip_tunnel_flags_test_case_to_desc); + +static void ip_tunnel_flags_test_run(struct kunit *test) +{ + const struct ip_tunnel_flags_test *t = test->param_value; + IP_TUNNEL_DECLARE_FLAGS(src) = { }; + IP_TUNNEL_DECLARE_FLAGS(exp) = { }; + IP_TUNNEL_DECLARE_FLAGS(out); + + for (u32 j = 0; j < t->src_num; j++) + __set_bit(t->src_bits[j], src); + for (u32 j = 0; j < t->exp_num; j++) + __set_bit(t->exp_bits[j], exp); + + KUNIT_ASSERT_EQ(test, t->exp_comp, + ip_tunnel_flags_is_be16_compat(src)); + KUNIT_ASSERT_EQ(test, (__force u16)t->exp_val, + (__force u16)ip_tunnel_flags_to_be16(src)); + + ip_tunnel_flags_from_be16(out, t->exp_val); + KUNIT_ASSERT_TRUE(test, __ipt_flag_op(bitmap_equal, exp, out)); +} + +static struct kunit_case net_test_cases[] = { + KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), + KUNIT_CASE_PARAM(ip_tunnel_flags_test_run, + 
ip_tunnel_flags_test_gen_params), + { }, +}; + +static struct kunit_suite net_test_suite = { + .name = "net_core", + .test_cases = net_test_cases, +}; +kunit_test_suite(net_test_suite); + +MODULE_DESCRIPTION("KUnit tests for networking core"); +MODULE_LICENSE("GPL"); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index d6a70aeaa503..dff66d8fb325 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -21,7 +21,9 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state struct cgroup_cls_state *task_cls_state(struct task_struct *p) { return css_cls_state(task_css_check(p, net_cls_cgrp_id, - rcu_read_lock_bh_held())); + rcu_read_lock_held() || + rcu_read_lock_bh_held() || + rcu_read_lock_trace_held())); } EXPORT_SYMBOL_GPL(task_cls_state); @@ -88,6 +90,12 @@ static void update_classid_task(struct task_struct *p, u32 classid) }; unsigned int fd = 0; + /* Only update the leader task, when many threads in this task, + * so it can avoid the useless traversal. + */ + if (p != p->group_leader) + return; + do { task_lock(p); fd = iterate_fd(p->files, fd, update_classid_sock, &ctx); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c new file mode 100644 index 000000000000..ba673e81716f --- /dev/null +++ b/net/core/netdev-genl-gen.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/netdev.yaml */ +/* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "netdev-genl-gen.h" + +#include <uapi/linux/netdev.h> +#include <net/netdev_netlink.h> + +/* Integer value ranges */ +static const struct netlink_range_validation netdev_a_page_pool_id_range = { + .min = 1ULL, + .max = U32_MAX, +}; + +static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = { + .min = 1ULL, + .max = S32_MAX, +}; + +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = { + .max = S32_MAX, +}; + +/* Common nested types */ +const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { + [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), + [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), +}; + +const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = { + [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, }, + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), +}; + +/* NETDEV_CMD_DEV_GET - do */ +static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = { + [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* NETDEV_CMD_PAGE_POOL_GET - do */ +#ifdef CONFIG_PAGE_POOL +static const struct nla_policy netdev_page_pool_get_nl_policy[NETDEV_A_PAGE_POOL_ID + 1] = { + [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), +}; +#endif /* CONFIG_PAGE_POOL */ + +/* NETDEV_CMD_PAGE_POOL_STATS_GET - do */ +#ifdef CONFIG_PAGE_POOL_STATS +static const struct nla_policy netdev_page_pool_stats_get_nl_policy[NETDEV_A_PAGE_POOL_STATS_INFO + 1] = { + [NETDEV_A_PAGE_POOL_STATS_INFO] = NLA_POLICY_NESTED(netdev_page_pool_info_nl_policy), +}; +#endif /* CONFIG_PAGE_POOL_STATS */ + +/* NETDEV_CMD_QUEUE_GET - do */ +static const struct nla_policy netdev_queue_get_do_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = { + 
[NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, }, +}; + +/* NETDEV_CMD_QUEUE_GET - dump */ +static const struct nla_policy netdev_queue_get_dump_nl_policy[NETDEV_A_QUEUE_IFINDEX + 1] = { + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* NETDEV_CMD_NAPI_GET - do */ +static const struct nla_policy netdev_napi_get_do_nl_policy[NETDEV_A_NAPI_ID + 1] = { + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, +}; + +/* NETDEV_CMD_NAPI_GET - dump */ +static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFINDEX + 1] = { + [NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* NETDEV_CMD_QSTATS_GET - dump */ +static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = { + [NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1), +}; + +/* NETDEV_CMD_BIND_RX - do */ +static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, + [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), +}; + +/* NETDEV_CMD_NAPI_SET - do */ +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED + 1] = { + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, + [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), + [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2), +}; + +/* NETDEV_CMD_BIND_TX - do */ +static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, +}; + +/* Ops table for netdev */ +static const struct genl_split_ops netdev_nl_ops[] = { + { + .cmd = NETDEV_CMD_DEV_GET, + .doit = netdev_nl_dev_get_doit, + .policy = netdev_dev_get_nl_policy, + .maxattr = NETDEV_A_DEV_IFINDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_DEV_GET, + .dumpit = netdev_nl_dev_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +#ifdef CONFIG_PAGE_POOL + { + .cmd = NETDEV_CMD_PAGE_POOL_GET, + .doit = netdev_nl_page_pool_get_doit, + .policy = netdev_page_pool_get_nl_policy, + .maxattr = NETDEV_A_PAGE_POOL_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_PAGE_POOL_GET, + .dumpit = netdev_nl_page_pool_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +#endif /* CONFIG_PAGE_POOL */ +#ifdef CONFIG_PAGE_POOL_STATS + { + .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET, + .doit = netdev_nl_page_pool_stats_get_doit, + .policy = netdev_page_pool_stats_get_nl_policy, + .maxattr = NETDEV_A_PAGE_POOL_STATS_INFO, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET, + .dumpit = netdev_nl_page_pool_stats_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +#endif /* CONFIG_PAGE_POOL_STATS */ + { + .cmd = NETDEV_CMD_QUEUE_GET, + .doit = netdev_nl_queue_get_doit, + .policy = netdev_queue_get_do_nl_policy, + .maxattr = NETDEV_A_QUEUE_TYPE, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_QUEUE_GET, + .dumpit = netdev_nl_queue_get_dumpit, + .policy = netdev_queue_get_dump_nl_policy, + .maxattr = NETDEV_A_QUEUE_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = NETDEV_CMD_NAPI_GET, + .doit = netdev_nl_napi_get_doit, + .policy = 
netdev_napi_get_do_nl_policy, + .maxattr = NETDEV_A_NAPI_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_NAPI_GET, + .dumpit = netdev_nl_napi_get_dumpit, + .policy = netdev_napi_get_dump_nl_policy, + .maxattr = NETDEV_A_NAPI_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = NETDEV_CMD_QSTATS_GET, + .dumpit = netdev_nl_qstats_get_dumpit, + .policy = netdev_qstats_get_nl_policy, + .maxattr = NETDEV_A_QSTATS_SCOPE, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = NETDEV_CMD_BIND_RX, + .doit = netdev_nl_bind_rx_doit, + .policy = netdev_bind_rx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_NAPI_SET, + .doit = netdev_nl_napi_set_doit, + .policy = netdev_napi_set_nl_policy, + .maxattr = NETDEV_A_NAPI_THREADED, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_BIND_TX, + .doit = netdev_nl_bind_tx_doit, + .policy = netdev_bind_tx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group netdev_nl_mcgrps[] = { + [NETDEV_NLGRP_MGMT] = { "mgmt", }, + [NETDEV_NLGRP_PAGE_POOL] = { "page-pool", }, +}; + +static void __netdev_nl_sock_priv_init(void *priv) +{ + netdev_nl_sock_priv_init(priv); +} + +static void __netdev_nl_sock_priv_destroy(void *priv) +{ + netdev_nl_sock_priv_destroy(priv); +} + +struct genl_family netdev_nl_family __ro_after_init = { + .name = NETDEV_FAMILY_NAME, + .version = NETDEV_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = netdev_nl_ops, + .n_split_ops = ARRAY_SIZE(netdev_nl_ops), + .mcgrps = netdev_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps), + .sock_priv_size = sizeof(struct netdev_nl_sock), + .sock_priv_init = __netdev_nl_sock_priv_init, + .sock_priv_destroy = __netdev_nl_sock_priv_destroy, +}; diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h new file mode 100644 index 000000000000..cffc08517a41 --- /dev/null +++ b/net/core/netdev-genl-gen.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/netdev.yaml */ +/* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#ifndef _LINUX_NETDEV_GEN_H +#define _LINUX_NETDEV_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/netdev.h> +#include <net/netdev_netlink.h> + +/* Common nested types */ +extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; +extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; + +int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, + struct genl_info *info); +int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_queue_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int 
netdev_nl_qstats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + NETDEV_NLGRP_MGMT, + NETDEV_NLGRP_PAGE_POOL, +}; + +extern struct genl_family netdev_nl_family; + +void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv); +void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv); + +#endif /* _LINUX_NETDEV_GEN_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c new file mode 100644 index 000000000000..470fabbeacd9 --- /dev/null +++ b/net/core/netdev-genl.c @@ -0,0 +1,1203 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/netdevice.h> +#include <linux/notifier.h> +#include <linux/rtnetlink.h> +#include <net/busy_poll.h> +#include <net/net_namespace.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> +#include <net/sock.h> +#include <net/xdp.h> +#include <net/xdp_sock.h> +#include <net/page_pool/memory_provider.h> + +#include "dev.h" +#include "devmem.h" +#include "netdev-genl-gen.h" + +struct netdev_nl_dump_ctx { + unsigned long ifindex; + unsigned int rxq_idx; + unsigned int txq_idx; + unsigned int napi_id; +}; + +static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) +{ + NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx); + + return (struct netdev_nl_dump_ctx *)cb->ctx; +} + +static int +netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info) +{ + u64 xsk_features = 0; + u64 xdp_rx_meta = 0; + void *hdr; + + netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! */ + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + +#define XDP_METADATA_KFUNC(_, flag, __, xmo) \ + if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \ + xdp_rx_meta |= flag; +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + + if (netdev->xsk_tx_metadata_ops) { + if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) + xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; + if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) + xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; + if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time) + xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO; + } + + if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, + netdev->xdp_features, NETDEV_A_DEV_PAD) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, + xdp_rx_meta, NETDEV_A_DEV_PAD) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, + xsk_features, NETDEV_A_DEV_PAD)) + goto err_cancel_msg; + + if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { + if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + netdev->xdp_zc_max_segs)) + goto err_cancel_msg; + } + + genlmsg_end(rsp, hdr); + + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static void +netdev_genl_dev_notify(struct net_device *netdev, int cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + + if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), + NETDEV_NLGRP_MGMT)) + return; + + genl_info_init_ntf(&info, &netdev_nl_family, cmd); + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + if (netdev_nl_dev_fill(netdev, ntf, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf, 
+ 0, NETDEV_NLGRP_MGMT, GFP_KERNEL); +} + +int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device *netdev; + struct sk_buff *rsp; + u32 ifindex; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_free_msg; + } + + err = netdev_nl_dev_fill(netdev, rsp, info); + netdev_unlock(netdev); + + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + struct net *net = sock_net(skb->sk); + int err; + + for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { + err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); + if (err < 0) + return err; + } + + return 0; +} + +static int +netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, + const struct genl_info *info) +{ + unsigned long irq_suspend_timeout; + unsigned long gro_flush_timeout; + u32 napi_defer_hard_irqs; + void *hdr; + pid_t pid; + + if (!napi->dev->up) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) + goto nla_put_failure; + + if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) + goto nla_put_failure; + + if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) + goto nla_put_failure; + + if (nla_put_uint(rsp, NETDEV_A_NAPI_THREADED, + napi_get_threaded(napi))) + goto nla_put_failure; + + if (napi->thread) { + pid = task_pid_nr(napi->thread); + if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) + goto nla_put_failure; + } + + napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi); + if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS, + napi_defer_hard_irqs)) + goto nla_put_failure; + + irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + irq_suspend_timeout)) + goto nla_put_failure; + + gro_flush_timeout = napi_get_gro_flush_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + gro_flush_timeout)) + goto nla_put_failure; + + genlmsg_end(rsp, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct napi_struct *napi; + struct sk_buff *rsp; + u32 napi_id; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) + return -EINVAL; + + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); + if (napi) { + err = netdev_nl_napi_fill_one(rsp, napi, info); + netdev_unlock(napi->dev); + } else { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); + err = -ENOENT; + } + + if (err) { + goto err_free_msg; + } else if (!rsp->len) { + err = -ENOENT; + goto err_free_msg; + } + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + 
struct napi_struct *napi; + unsigned int prev_id; + int err = 0; + + if (!netdev->up) + return err; + + prev_id = UINT_MAX; + list_for_each_entry(napi, &netdev->napi_list, dev_list) { + if (!napi_id_valid(napi->napi_id)) + continue; + + /* Dump continuation below depends on the list being sorted */ + WARN_ON_ONCE(napi->napi_id >= prev_id); + prev_id = napi->napi_id; + + if (ctx->napi_id && napi->napi_id >= ctx->napi_id) + continue; + + err = netdev_nl_napi_fill_one(rsp, napi, info); + if (err) + return err; + ctx->napi_id = napi->napi_id; + } + return err; +} + +int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + u32 ifindex = 0; + int err = 0; + + if (info->attrs[NETDEV_A_NAPI_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); + + if (ifindex) { + netdev = netdev_get_by_index_lock(net, ifindex); + if (netdev) { + err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); + netdev_unlock(netdev); + } else { + err = -ENODEV; + } + } else { + for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { + err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); + if (err < 0) + break; + ctx->napi_id = 0; + } + } + + return err; +} + +static int +netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) +{ + u64 irq_suspend_timeout = 0; + u64 gro_flush_timeout = 0; + u8 threaded = 0; + u32 defer = 0; + + if (info->attrs[NETDEV_A_NAPI_THREADED]) { + int ret; + + threaded = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]); + ret = napi_set_threaded(napi, threaded); + if (ret) + return ret; + } + + if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { + defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); + napi_set_defer_hard_irqs(napi, defer); + } + + if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { + irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); + napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); + } + + if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { + gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); + napi_set_gro_flush_timeout(napi, gro_flush_timeout); + } + + return 0; +} + +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct napi_struct *napi; + unsigned int napi_id; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) + return -EINVAL; + + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); + + napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); + if (napi) { + err = netdev_nl_napi_set_config(napi, info); + netdev_unlock(napi->dev); + } else { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); + err = -ENOENT; + } + + return err; +} + +static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi) +{ + if (napi && napi_id_valid(napi->napi_id)) + return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id); + return 0; +} + +static int +netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, + u32 q_idx, u32 q_type, const struct genl_info *info) +{ + struct pp_memory_provider_params *params; + struct netdev_rx_queue *rxq; + struct netdev_queue *txq; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) || + nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) || + nla_put_u32(rsp, 
NETDEV_A_QUEUE_IFINDEX, netdev->ifindex)) + goto nla_put_failure; + + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + rxq = __netif_get_rx_queue(netdev, q_idx); + if (nla_put_napi_id(rsp, rxq->napi)) + goto nla_put_failure; + + params = &rxq->mp_params; + if (params->mp_ops && + params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) + goto nla_put_failure; +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) + if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) + goto nla_put_failure; +#endif + + break; + case NETDEV_QUEUE_TYPE_TX: + txq = netdev_get_tx_queue(netdev, q_idx); + if (nla_put_napi_id(rsp, txq->napi)) + goto nla_put_failure; +#ifdef CONFIG_XDP_SOCKETS + if (txq->pool) + if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) + goto nla_put_failure; +#endif + break; + } + + genlmsg_end(rsp, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id, + u32 q_type) +{ + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + if (q_id >= netdev->real_num_rx_queues) + return -EINVAL; + return 0; + case NETDEV_QUEUE_TYPE_TX: + if (q_id >= netdev->real_num_tx_queues) + return -EINVAL; + } + return 0; +} + +static int +netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, + u32 q_type, const struct genl_info *info) +{ + int err; + + if (!netdev->up) + return -ENOENT; + + err = netdev_nl_queue_validate(netdev, q_idx, q_type); + if (err) + return err; + + return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info); +} + +int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + u32 q_id, q_type, ifindex; + struct net_device *netdev; + struct sk_buff *rsp; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX)) + return -EINVAL; + + q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]); + q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]); + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info), + ifindex); + if (netdev) { + err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); + netdev_unlock_ops_compat(netdev); + } else { + err = -ENODEV; + } + + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + int err = 0; + + if (!netdev->up) + return err; + + for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx, + NETDEV_QUEUE_TYPE_RX, info); + if (err) + return err; + } + for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx, + NETDEV_QUEUE_TYPE_TX, info); + if (err) + return err; + } + + return err; +} + +int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + u32 ifindex = 0; + int err = 0; + + if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); + + if (ifindex) { 
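+		/* Annotation: an ifindex attribute narrows the dump to one
+		 * device; the else branch below instead walks every netdev
+		 * in the namespace, resuming at ctx->ifindex.
+		 */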
+ netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); + if (netdev) { + err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); + netdev_unlock_ops_compat(netdev); + } else { + err = -ENODEV; + } + } else { + for_each_netdev_lock_ops_compat_scoped(net, netdev, + ctx->ifindex) { + err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); + if (err < 0) + break; + ctx->rxq_idx = 0; + ctx->txq_idx = 0; + } + } + + return err; +} + +#define NETDEV_STAT_NOT_SET (~0ULL) + +static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size) +{ + const u64 *add = _add; + u64 *sum = _sum; + + while (size) { + if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET) + *sum += *add; + sum++; + add++; + size -= 8; + } +} + +static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value) +{ + if (value == NETDEV_STAT_NOT_SET) + return 0; + return nla_put_uint(rsp, attr_id, value); +} + +static int +netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) +{ + if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits)) + return -EMSGSIZE; + return 0; +} + +static int +netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) +{ + if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake)) + return -EMSGSIZE; + return 0; +} + +static int +netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp, + u32 q_type, int i, const struct genl_info *info) +{ + const struct netdev_stat_ops *ops = netdev->stat_ops; + struct netdev_queue_stats_rx rx; + struct netdev_queue_stats_tx tx; + void *hdr; + + hdr = genlmsg_iput(rsp, 
info); + if (!hdr) + return -EMSGSIZE; + if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) || + nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) || + nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i)) + goto nla_put_failure; + + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + memset(&rx, 0xff, sizeof(rx)); + ops->get_queue_stats_rx(netdev, i, &rx); + if (!memchr_inv(&rx, 0xff, sizeof(rx))) + goto nla_cancel; + if (netdev_nl_stats_write_rx(rsp, &rx)) + goto nla_put_failure; + break; + case NETDEV_QUEUE_TYPE_TX: + memset(&tx, 0xff, sizeof(tx)); + ops->get_queue_stats_tx(netdev, i, &tx); + if (!memchr_inv(&tx, 0xff, sizeof(tx))) + goto nla_cancel; + if (netdev_nl_stats_write_tx(rsp, &tx)) + goto nla_put_failure; + break; + } + + genlmsg_end(rsp, hdr); + return 0; + +nla_cancel: + genlmsg_cancel(rsp, hdr); + return 0; +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int +netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + const struct netdev_stat_ops *ops = netdev->stat_ops; + int i, err; + + if (!(netdev->flags & IFF_UP)) + return 0; + + i = ctx->rxq_idx; + while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) { + err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX, + i, info); + if (err) + return err; + ctx->rxq_idx = ++i; + } + i = ctx->txq_idx; + while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { + err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX, + i, info); + if (err) + return err; + ctx->txq_idx = ++i; + } + + ctx->rxq_idx = 0; + ctx->txq_idx = 0; + return 0; +} + +/** + * netdev_stat_queue_sum() - add up queue stats from range of queues + * @netdev: net_device + * @rx_start: index of the first Rx queue to query + * @rx_end: index after the last Rx queue (first *not* to query) + * @rx_sum: output Rx stats, should be already initialized + * @tx_start: index of the first Tx queue to query + * @tx_end: index after the last Tx queue (first *not* to query) + * @tx_sum: output Tx stats, should be already initialized + * + * Add stats from [start, end) range of queue IDs to *x_sum structs. + * The sum structs must be already initialized. Usually this + * helper is invoked from the .get_base_stats callbacks of drivers + * to account for stats of disabled queues. In that case the ranges + * are usually [netdev->real_num_*x_queues, netdev->num_*x_queues). 
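+ *
+ * A minimal sketch of a driver's .get_base_stats callback built on this
+ * helper; the foo_ name is illustrative (not part of this patch) and the
+ * sketch assumes the driver backs every counter, whereas real drivers
+ * zero only the fields they actually maintain:
+ *
+ *	static void foo_get_base_stats(struct net_device *netdev,
+ *				       struct netdev_queue_stats_rx *rx,
+ *				       struct netdev_queue_stats_tx *tx)
+ *	{
+ *		memset(rx, 0, sizeof(*rx));
+ *		memset(tx, 0, sizeof(*tx));
+ *		netdev_stat_queue_sum(netdev,
+ *				      netdev->real_num_rx_queues,
+ *				      netdev->num_rx_queues, rx,
+ *				      netdev->real_num_tx_queues,
+ *				      netdev->num_tx_queues, tx);
+ *	}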
+ */ +void netdev_stat_queue_sum(struct net_device *netdev, + int rx_start, int rx_end, + struct netdev_queue_stats_rx *rx_sum, + int tx_start, int tx_end, + struct netdev_queue_stats_tx *tx_sum) +{ + const struct netdev_stat_ops *ops; + struct netdev_queue_stats_rx rx; + struct netdev_queue_stats_tx tx; + int i; + + ops = netdev->stat_ops; + + for (i = rx_start; i < rx_end; i++) { + memset(&rx, 0xff, sizeof(rx)); + if (ops->get_queue_stats_rx) + ops->get_queue_stats_rx(netdev, i, &rx); + netdev_nl_stats_add(rx_sum, &rx, sizeof(rx)); + } + for (i = tx_start; i < tx_end; i++) { + memset(&tx, 0xff, sizeof(tx)); + if (ops->get_queue_stats_tx) + ops->get_queue_stats_tx(netdev, i, &tx); + netdev_nl_stats_add(tx_sum, &tx, sizeof(tx)); + } +} +EXPORT_SYMBOL(netdev_stat_queue_sum); + +static int +netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info) +{ + struct netdev_queue_stats_rx rx_sum; + struct netdev_queue_stats_tx tx_sum; + void *hdr; + + /* Netdev can't guarantee any complete counters */ + if (!netdev->stat_ops->get_base_stats) + return 0; + + memset(&rx_sum, 0xff, sizeof(rx_sum)); + memset(&tx_sum, 0xff, sizeof(tx_sum)); + + netdev->stat_ops->get_base_stats(netdev, &rx_sum, &tx_sum); + + /* The op was there, but nothing reported, don't bother */ + if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && + !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum))) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) + goto nla_put_failure; + + netdev_stat_queue_sum(netdev, 0, netdev->real_num_rx_queues, &rx_sum, + 0, netdev->real_num_tx_queues, &tx_sum); + + if (netdev_nl_stats_write_rx(rsp, &rx_sum) || + netdev_nl_stats_write_tx(rsp, &tx_sum)) + goto nla_put_failure; + + genlmsg_end(rsp, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int +netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, + struct sk_buff *skb, const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + if (!netdev->stat_ops) + return 0; + + switch (scope) { + case 0: + return netdev_nl_stats_by_netdev(netdev, skb, info); + case NETDEV_QSTATS_SCOPE_QUEUE: + return netdev_nl_stats_by_queue(netdev, skb, info, ctx); + } + + return -EINVAL; /* Should not happen, per netlink policy */ +} + +int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + unsigned int ifindex; + unsigned int scope; + int err = 0; + + scope = 0; + if (info->attrs[NETDEV_A_QSTATS_SCOPE]) + scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); + + ifindex = 0; + if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); + + if (ifindex) { + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); + if (!netdev) { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + return -ENODEV; + } + if (netdev->stat_ops) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + } else { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + err = -EOPNOTSUPP; + } + netdev_unlock_ops_compat(netdev); + return err; + } + + for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + 
info, ctx); + if (err < 0) + break; + } + + return err; +} + +static int netdev_nl_read_rxq_bitmap(struct genl_info *info, + u32 rxq_bitmap_len, + unsigned long *rxq_bitmap) +{ + const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; + struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; + struct nlattr *attr; + int rem, err = 0; + u32 rxq_idx; + + nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, + genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + err = nla_parse_nested(tb, maxtype, attr, + netdev_queue_id_nl_policy, info->extack); + if (err < 0) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || + NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) + return -EINVAL; + + if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); + return -EINVAL; + } + + rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); + if (rxq_idx >= rxq_bitmap_len) { + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]); + return -EINVAL; + } + + bitmap_set(rxq_bitmap, rxq_idx, 1); + } + + return 0; +} + +static struct device * +netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap, + struct netlink_ext_ack *extack) +{ + struct device *dma_dev = NULL; + u32 rxq_idx, prev_rxq_idx; + + for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { + struct device *rxq_dma_dev; + + rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx); + if (dma_dev && rxq_dma_dev != dma_dev) { + NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)", + rxq_idx, prev_rxq_idx); + return ERR_PTR(-EOPNOTSUPP); + } + + dma_dev = rxq_dma_dev; + prev_rxq_idx = rxq_idx; + } + + return dma_dev; +} + +int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net_devmem_dmabuf_binding *binding; + u32 ifindex, dmabuf_fd, rxq_idx; + struct netdev_nl_sock *priv; + struct net_device *netdev; + unsigned long *rxq_bitmap; + struct device *dma_dev; + struct sk_buff *rsp; + int err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + mutex_lock(&priv->lock); + + err = 0; + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_unlock_sock; + } + if (!netif_device_present(netdev)) + err = -ENODEV; + else if (!netdev_need_ops_lock(netdev)) + err = -EOPNOTSUPP; + if (err) { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_DEV_IFINDEX]); + goto err_unlock; + } + + rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL); + if (!rxq_bitmap) { + err = -ENOMEM; + goto err_unlock; + } + + err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues, + rxq_bitmap); + if (err) + goto err_rxq_bitmap; + + dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack); + if (IS_ERR(dma_dev)) { + err = PTR_ERR(dma_dev); + goto err_rxq_bitmap; + } + + binding = net_devmem_bind_dmabuf(netdev, dma_dev, 
DMA_FROM_DEVICE, + dmabuf_fd, priv, info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_rxq_bitmap; + } + + for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { + err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, + info->extack); + if (err) + goto err_unbind; + } + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + err = genlmsg_reply(rsp, info); + if (err) + goto err_unbind; + + bitmap_free(rxq_bitmap); + + netdev_unlock(netdev); + + mutex_unlock(&priv->lock); + + return 0; + +err_unbind: + net_devmem_unbind_dmabuf(binding); +err_rxq_bitmap: + bitmap_free(rxq_bitmap); +err_unlock: + netdev_unlock(netdev); +err_unlock_sock: + mutex_unlock(&priv->lock); +err_genlmsg_free: + nlmsg_free(rsp); + return err; +} + +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net_devmem_dmabuf_binding *binding; + struct netdev_nl_sock *priv; + struct net_device *netdev; + struct device *dma_dev; + u32 ifindex, dmabuf_fd; + struct sk_buff *rsp; + int err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + mutex_lock(&priv->lock); + + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_unlock_sock; + } + + if (!netif_device_present(netdev)) { + err = -ENODEV; + goto err_unlock_netdev; + } + + if (!netdev->netmem_tx) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(info->extack, + "Driver does not support netmem TX"); + goto err_unlock_netdev; + } + + dma_dev = netdev_queue_get_dma_dev(netdev, 0); + binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE, + dmabuf_fd, priv, info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_unlock_netdev; + } + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + netdev_unlock(netdev); + mutex_unlock(&priv->lock); + + return genlmsg_reply(rsp, info); + +err_unlock_netdev: + netdev_unlock(netdev); +err_unlock_sock: + mutex_unlock(&priv->lock); +err_genlmsg_free: + nlmsg_free(rsp); + return err; +} + +void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) +{ + INIT_LIST_HEAD(&priv->bindings); + mutex_init(&priv->lock); +} + +void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv) +{ + struct net_devmem_dmabuf_binding *binding; + struct net_devmem_dmabuf_binding *temp; + netdevice_tracker dev_tracker; + struct net_device *dev; + + mutex_lock(&priv->lock); + list_for_each_entry_safe(binding, temp, &priv->bindings, list) { + mutex_lock(&binding->lock); + dev = binding->dev; + if (!dev) { + mutex_unlock(&binding->lock); + net_devmem_unbind_dmabuf(binding); + continue; + } + netdev_hold(dev, &dev_tracker, GFP_KERNEL); + mutex_unlock(&binding->lock); + + netdev_lock(dev); + net_devmem_unbind_dmabuf(binding); + netdev_unlock(dev); + netdev_put(dev, &dev_tracker); + } + mutex_unlock(&priv->lock); +} + +static int netdev_genl_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = 
netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_REGISTER: + netdev_lock_ops_to_full(netdev); + netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); + netdev_unlock_full_to_ops(netdev); + break; + case NETDEV_UNREGISTER: + netdev_lock(netdev); + netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); + netdev_unlock(netdev); + break; + case NETDEV_XDP_FEAT_CHANGE: + netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block netdev_genl_nb = { + .notifier_call = netdev_genl_netdevice_event, +}; + +static int __init netdev_genl_init(void) +{ + int err; + + err = register_netdevice_notifier(&netdev_genl_nb); + if (err) + return err; + + err = genl_register_family(&netdev_nl_family); + if (err) + goto err_unreg_ntf; + + return 0; + +err_unreg_ntf: + unregister_netdevice_notifier(&netdev_genl_nb); + return err; +} + +subsys_initcall(netdev_genl_init); diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c new file mode 100644 index 000000000000..251f27a8307f --- /dev/null +++ b/net/core/netdev_queues.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <net/netdev_queues.h> + +/** + * netdev_queue_get_dma_dev() - get dma device for zero-copy operations + * @dev: net_device + * @idx: queue index + * + * Get dma device for zero-copy operations to be used for this queue. + * When such device is not available or valid, the function will return NULL. + * + * Return: Device or NULL on error + */ +struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) +{ + const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; + struct device *dma_dev; + + if (queue_ops && queue_ops->ndo_queue_get_dma_dev) + dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx); + else + dma_dev = dev->dev.parent; + + return dma_dev && dma_dev->dma_mask ? 
dma_dev : NULL; +} + diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c new file mode 100644 index 000000000000..c7d9341b7630 --- /dev/null +++ b/net/core/netdev_rx_queue.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/ethtool_netlink.h> +#include <linux/netdevice.h> +#include <net/netdev_lock.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/memory_provider.h> + +#include "page_pool_priv.h" + +/* See also page_pool_is_unreadable() */ +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) +{ + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); + + return !!rxq->mp_params.mp_ops; +} +EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); + +int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) +{ + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); + const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; + void *new_mem, *old_mem; + int err; + + if (!qops || !qops->ndo_queue_stop || !qops->ndo_queue_mem_free || + !qops->ndo_queue_mem_alloc || !qops->ndo_queue_start) + return -EOPNOTSUPP; + + netdev_assert_locked(dev); + + new_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); + if (!new_mem) + return -ENOMEM; + + old_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); + if (!old_mem) { + err = -ENOMEM; + goto err_free_new_mem; + } + + err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); + if (err) + goto err_free_old_mem; + + err = page_pool_check_memory_provider(dev, rxq); + if (err) + goto err_free_new_queue_mem; + + if (netif_running(dev)) { + err = qops->ndo_queue_stop(dev, old_mem, rxq_idx); + if (err) + goto err_free_new_queue_mem; + + err = qops->ndo_queue_start(dev, new_mem, rxq_idx); + if (err) + goto err_start_queue; + } else { + swap(new_mem, old_mem); + } + + qops->ndo_queue_mem_free(dev, old_mem); + + kvfree(old_mem); + kvfree(new_mem); + + return 0; + +err_start_queue: + /* Restarting the queue with old_mem should be successful as we haven't + * changed any of the queue configuration, and there is not much we can + * do to recover from a failure here. + * + * WARN if we fail to recover the old rx queue, and at least free + * old_mem so we don't also leak that. + */ + if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) { + WARN(1, + "Failed to restart old queue in error path. 
RX queue %d may be unhealthy.", + rxq_idx); + qops->ndo_queue_mem_free(dev, old_mem); + } + +err_free_new_queue_mem: + qops->ndo_queue_mem_free(dev, new_mem); + +err_free_old_mem: + kvfree(old_mem); + +err_free_new_mem: + kvfree(new_mem); + + return err; +} +EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); + +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack) +{ + struct netdev_rx_queue *rxq; + int ret; + + if (!netdev_need_ops_lock(dev)) + return -EOPNOTSUPP; + + if (rxq_idx >= dev->real_num_rx_queues) { + NL_SET_ERR_MSG(extack, "rx queue index out of range"); + return -ERANGE; + } + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); + + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { + NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); + return -EINVAL; + } + if (dev->cfg->hds_thresh) { + NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); + return -EINVAL; + } + if (dev_xdp_prog_count(dev)) { + NL_SET_ERR_MSG(extack, "unable to attach custom memory provider to device with XDP program attached"); + return -EEXIST; + } + + rxq = __netif_get_rx_queue(dev, rxq_idx); + if (rxq->mp_params.mp_ops) { + NL_SET_ERR_MSG(extack, "designated queue is already bound to a memory provider"); + return -EEXIST; + } +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) { + NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); + return -EBUSY; + } +#endif + + rxq->mp_params = *p; + ret = netdev_rx_queue_restart(dev, rxq_idx); + if (ret) { + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + } + return ret; +} + +int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + struct pp_memory_provider_params *p) +{ + int ret; + + netdev_lock(dev); + ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL); + netdev_unlock(dev); + return ret; +} + +void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, + const struct pp_memory_provider_params *old_p) +{ + struct netdev_rx_queue *rxq; + int err; + + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) + return; + + rxq = __netif_get_rx_queue(dev, ifq_idx); + + /* Callers holding a netdev ref may get here after we already + * went through shutdown via dev_memory_provider_uninstall(). 
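+	 * Annotation: in that case the unregister path has already cleared
+	 * mp_params, so a NULL mp_ops on a device past NETREG_REGISTERED
+	 * means there is nothing left to undo here.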
+ */ + if (dev->reg_state > NETREG_REGISTERED && + !rxq->mp_params.mp_ops) + return; + + if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || + rxq->mp_params.mp_priv != old_p->mp_priv)) + return; + + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + err = netdev_rx_queue_restart(dev, ifq_idx); + WARN_ON(err && err != -ENETDOWN); +} + +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + netdev_lock(dev); + __net_mp_close_rxq(dev, ifq_idx, old_p); + netdev_unlock(dev); +} diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h new file mode 100644 index 000000000000..23175cb2bd86 --- /dev/null +++ b/net/core/netmem_priv.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NETMEM_PRIV_H +#define __NETMEM_PRIV_H + +static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) +{ + return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; +} + +static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) +{ + netmem_to_nmdesc(netmem)->pp_magic |= pp_magic; +} + +static inline void netmem_clear_pp_magic(netmem_ref netmem) +{ + WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK); + + netmem_to_nmdesc(netmem)->pp_magic = 0; +} + +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; +} + +static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) +{ + netmem_to_nmdesc(netmem)->pp = pool; +} + +static inline void netmem_set_dma_addr(netmem_ref netmem, + unsigned long dma_addr) +{ + netmem_to_nmdesc(netmem)->dma_addr = dma_addr; +} + +static inline unsigned long netmem_get_dma_index(netmem_ref netmem) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return 0; + + magic = netmem_to_nmdesc(netmem)->pp_magic; + + return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; +} + +static inline void netmem_set_dma_index(netmem_ref netmem, + unsigned long id) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return; + + magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); + netmem_to_nmdesc(netmem)->pp_magic = magic; +} +#endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 9be762e1d042..09f72f10813c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -34,7 +34,7 @@ #include <net/addrconf.h> #include <net/ndisc.h> #include <net/ip6_checksum.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <trace/events/napi.h> #include <linux/kconfig.h> @@ -45,11 +45,6 @@ #define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 - -static struct sk_buff_head skb_pool; - -DEFINE_STATIC_SRCU(netpoll_srcu); - #define USEC_PER_POLL 50 #define MAX_SKB_SIZE \ @@ -63,13 +58,6 @@ static void zap_completion_queue(void); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); -#define np_info(np, fmt, ...) \ - pr_info("%s: " fmt, np->name, ##__VA_ARGS__) -#define np_err(np, fmt, ...) \ - pr_err("%s: " fmt, np->name, ##__VA_ARGS__) -#define np_notice(np, fmt, ...) 
\ - pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) - static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) @@ -137,6 +125,20 @@ static void queue_process(struct work_struct *work) } } +static int netif_local_xmit_active(struct net_device *dev) +{ + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + + if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id()) + return 1; + } + + return 0; +} + static void poll_one_napi(struct napi_struct *napi) { int work; @@ -148,7 +150,7 @@ static void poll_one_napi(struct napi_struct *napi) if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) return; - /* We explicilty pass the polling call a budget of 0 to + /* We explicitly pass the polling call a budget of 0 to * indicate that we are clearing the Tx path only. */ work = napi->poll(napi, 0); @@ -183,7 +185,10 @@ void netpoll_poll_dev(struct net_device *dev) if (!ni || down_trylock(&ni->dev_lock)) return; - if (!netif_running(dev)) { + /* Some drivers will take the same locks in poll and xmit, + * we can't poll if local CPU is already in xmit. + */ + if (!netif_running(dev) || netif_local_xmit_active(dev)) { up(&ni->dev_lock); return; } @@ -203,41 +208,36 @@ EXPORT_SYMBOL(netpoll_poll_dev); void netpoll_poll_disable(struct net_device *dev) { struct netpoll_info *ni; - int idx; + might_sleep(); - idx = srcu_read_lock(&netpoll_srcu); - ni = srcu_dereference(dev->npinfo, &netpoll_srcu); + ni = rtnl_dereference(dev->npinfo); if (ni) down(&ni->dev_lock); - srcu_read_unlock(&netpoll_srcu, idx); } -EXPORT_SYMBOL(netpoll_poll_disable); void netpoll_poll_enable(struct net_device *dev) { struct netpoll_info *ni; - rcu_read_lock(); - ni = rcu_dereference(dev->npinfo); + + ni = rtnl_dereference(dev->npinfo); if (ni) up(&ni->dev_lock); - rcu_read_unlock(); } -EXPORT_SYMBOL(netpoll_poll_enable); -static void refill_skbs(void) +static void refill_skbs(struct netpoll *np) { + struct sk_buff_head *skb_pool; struct sk_buff *skb; - unsigned long flags; - spin_lock_irqsave(&skb_pool.lock, flags); - while (skb_pool.qlen < MAX_SKBS) { + skb_pool = &np->skb_pool; + + while (READ_ONCE(skb_pool->qlen) < MAX_SKBS) { skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); if (!skb) break; - __skb_queue_tail(&skb_pool, skb); + skb_queue_tail(skb_pool, skb); } - spin_unlock_irqrestore(&skb_pool.lock, flags); } static void zap_completion_queue(void) @@ -274,12 +274,13 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) struct sk_buff *skb; zap_completion_queue(); - refill_skbs(); repeat: skb = alloc_skb(len, GFP_ATOMIC); - if (!skb) - skb = skb_dequeue(&skb_pool); + if (!skb) { + skb = skb_dequeue(&np->skb_pool); + schedule_work(&np->refill_wq); + } if (!skb) { if (++count < 10) { @@ -299,7 +300,7 @@ static int netpoll_owner_active(struct net_device *dev) struct napi_struct *napi; list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner == smp_processor_id()) + if (READ_ONCE(napi->poll_owner) == smp_processor_id()) return 1; } return 0; @@ -309,6 +310,7 @@ static int netpoll_owner_active(struct net_device *dev) static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; + netdev_tx_t ret = NET_XMIT_DROP; struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. 
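	 * Annotation: the rcu_read_lock() this patch adds below also keeps
	 * npinfo from being freed out from under us for the duration of a
	 * single transmit attempt.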
*/ @@ -317,11 +319,12 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) lockdep_assert_irqs_disabled(); dev = np->dev; + rcu_read_lock(); npinfo = rcu_dereference_bh(dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); - return NET_XMIT_DROP; + goto out; } /* don't get messages out of order, and no recursion */ @@ -360,7 +363,35 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } - return NETDEV_TX_OK; + ret = NETDEV_TX_OK; +out: + rcu_read_unlock(); + return ret; +} + +static void netpoll_udp_checksum(struct netpoll *np, struct sk_buff *skb, + int len) +{ + struct udphdr *udph; + int udp_len; + + udp_len = len + sizeof(struct udphdr); + udph = udp_hdr(skb); + + /* check needs to be set, since it will be consumed in csum_partial */ + udph->check = 0; + if (np->ipv6) + udph->check = csum_ipv6_magic(&np->local_ip.in6, + &np->remote_ip.in6, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + else + udph->check = csum_tcpudp_magic(np->local_ip.ip, + np->remote_ip.ip, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; } netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) @@ -380,228 +411,141 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) } EXPORT_SYMBOL(netpoll_send_skb); -void netpoll_send_udp(struct netpoll *np, const char *msg, int len) +static void push_ipv6(struct netpoll *np, struct sk_buff *skb, int len) { - int total_len, ip_len, udp_len; - struct sk_buff *skb; - struct udphdr *udph; - struct iphdr *iph; - struct ethhdr *eth; - static atomic_t ip_ident; struct ipv6hdr *ip6h; - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - WARN_ON_ONCE(!irqs_disabled()); + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); - udp_len = len + sizeof(*udph); - if (np->ipv6) - ip_len = udp_len + sizeof(*ip6h); - else - ip_len = udp_len + sizeof(*iph); + /* ip6h->version = 6; ip6h->priority = 0; */ + *(unsigned char *)ip6h = 0x60; + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; - total_len = ip_len + LL_RESERVED_SPACE(np->dev); + ip6h->payload_len = htons(sizeof(struct udphdr) + len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = 32; + ip6h->saddr = np->local_ip.in6; + ip6h->daddr = np->remote_ip.in6; - skb = find_skb(np, total_len + np->dev->needed_tailroom, - total_len - len); - if (!skb) - return; + skb->protocol = htons(ETH_P_IPV6); +} - skb_copy_to_linear_data(skb, msg, len); - skb_put(skb, len); +static void push_ipv4(struct netpoll *np, struct sk_buff *skb, int len) +{ + static atomic_t ip_ident; + struct iphdr *iph; + int ip_len; + + ip_len = len + sizeof(struct udphdr) + sizeof(struct iphdr); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + + /* iph->version = 4; iph->ihl = 5; */ + *(unsigned char *)iph = 0x45; + iph->tos = 0; + put_unaligned(htons(ip_len), &iph->tot_len); + iph->id = htons(atomic_inc_return(&ip_ident)); + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + put_unaligned(np->local_ip.ip, &iph->saddr); + put_unaligned(np->remote_ip.ip, &iph->daddr); + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + skb->protocol = htons(ETH_P_IP); +} - skb_push(skb, sizeof(*udph)); +static void push_udp(struct netpoll *np, struct 
sk_buff *skb, int len) +{ + struct udphdr *udph; + int udp_len; + + udp_len = len + sizeof(struct udphdr); + + skb_push(skb, sizeof(struct udphdr)); skb_reset_transport_header(skb); + udph = udp_hdr(skb); udph->source = htons(np->local_port); udph->dest = htons(np->remote_port); udph->len = htons(udp_len); - if (np->ipv6) { - udph->check = 0; - udph->check = csum_ipv6_magic(&np->local_ip.in6, - &np->remote_ip.in6, - udp_len, IPPROTO_UDP, - csum_partial(udph, udp_len, 0)); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - - skb_push(skb, sizeof(*ip6h)); - skb_reset_network_header(skb); - ip6h = ipv6_hdr(skb); - - /* ip6h->version = 6; ip6h->priority = 0; */ - *(unsigned char *)ip6h = 0x60; - ip6h->flow_lbl[0] = 0; - ip6h->flow_lbl[1] = 0; - ip6h->flow_lbl[2] = 0; - - ip6h->payload_len = htons(sizeof(struct udphdr) + len); - ip6h->nexthdr = IPPROTO_UDP; - ip6h->hop_limit = 32; - ip6h->saddr = np->local_ip.in6; - ip6h->daddr = np->remote_ip.in6; - - eth = skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb->protocol = eth->h_proto = htons(ETH_P_IPV6); - } else { - udph->check = 0; - udph->check = csum_tcpudp_magic(np->local_ip.ip, - np->remote_ip.ip, - udp_len, IPPROTO_UDP, - csum_partial(udph, udp_len, 0)); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - - skb_push(skb, sizeof(*iph)); - skb_reset_network_header(skb); - iph = ip_hdr(skb); - - /* iph->version = 4; iph->ihl = 5; */ - *(unsigned char *)iph = 0x45; - iph->tos = 0; - put_unaligned(htons(ip_len), &(iph->tot_len)); - iph->id = htons(atomic_inc_return(&ip_ident)); - iph->frag_off = 0; - iph->ttl = 64; - iph->protocol = IPPROTO_UDP; - iph->check = 0; - put_unaligned(np->local_ip.ip, &(iph->saddr)); - put_unaligned(np->remote_ip.ip, &(iph->daddr)); - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - - eth = skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb->protocol = eth->h_proto = htons(ETH_P_IP); - } + netpoll_udp_checksum(np, skb, len); +} +static void push_eth(struct netpoll *np, struct sk_buff *skb) +{ + struct ethhdr *eth; + + eth = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); ether_addr_copy(eth->h_source, np->dev->dev_addr); ether_addr_copy(eth->h_dest, np->remote_mac); - - skb->dev = np->dev; - - netpoll_send_skb(np, skb); + if (np->ipv6) + eth->h_proto = htons(ETH_P_IPV6); + else + eth->h_proto = htons(ETH_P_IP); } -EXPORT_SYMBOL(netpoll_send_udp); -void netpoll_print_options(struct netpoll *np) +int netpoll_send_udp(struct netpoll *np, const char *msg, int len) { - np_info(np, "local port %d\n", np->local_port); + int total_len, ip_len, udp_len; + struct sk_buff *skb; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!irqs_disabled()); + + udp_len = len + sizeof(struct udphdr); if (np->ipv6) - np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); + ip_len = udp_len + sizeof(struct ipv6hdr); else - np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); - np_info(np, "interface '%s'\n", np->dev_name); - np_info(np, "remote port %d\n", np->remote_port); + ip_len = udp_len + sizeof(struct iphdr); + + total_len = ip_len + LL_RESERVED_SPACE(np->dev); + + skb = find_skb(np, total_len + np->dev->needed_tailroom, + total_len - len); + if (!skb) + return -ENOMEM; + + skb_copy_to_linear_data(skb, msg, len); + skb_put(skb, len); + + push_udp(np, skb, len); if (np->ipv6) - np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); + push_ipv6(np, skb, len); else - np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); - np_info(np, "remote ethernet 
address %pM\n", np->remote_mac); + push_ipv4(np, skb, len); + push_eth(np, skb); + skb->dev = np->dev; + + return (int)netpoll_send_skb(np, skb); } -EXPORT_SYMBOL(netpoll_print_options); +EXPORT_SYMBOL(netpoll_send_udp); + -static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) +static void skb_pool_flush(struct netpoll *np) { - const char *end; + struct sk_buff_head *skb_pool; - if (!strchr(str, ':') && - in4_pton(str, -1, (void *)addr, -1, &end) > 0) { - if (!*end) - return 0; - } - if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { -#if IS_ENABLED(CONFIG_IPV6) - if (!*end) - return 1; -#else - return -1; -#endif - } - return -1; + cancel_work_sync(&np->refill_wq); + skb_pool = &np->skb_pool; + skb_queue_purge_reason(skb_pool, SKB_CONSUMED); } -int netpoll_parse_options(struct netpoll *np, char *opt) +static void refill_skbs_work_handler(struct work_struct *work) { - char *cur=opt, *delim; - int ipv6; - bool ipversion_set = false; + struct netpoll *np = + container_of(work, struct netpoll, refill_wq); - if (*cur != '@') { - if ((delim = strchr(cur, '@')) == NULL) - goto parse_failed; - *delim = 0; - if (kstrtou16(cur, 10, &np->local_port)) - goto parse_failed; - cur = delim; - } - cur++; - - if (*cur != '/') { - ipversion_set = true; - if ((delim = strchr(cur, '/')) == NULL) - goto parse_failed; - *delim = 0; - ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); - if (ipv6 < 0) - goto parse_failed; - else - np->ipv6 = (bool)ipv6; - cur = delim; - } - cur++; - - if (*cur != ',') { - /* parse out dev name */ - if ((delim = strchr(cur, ',')) == NULL) - goto parse_failed; - *delim = 0; - strscpy(np->dev_name, cur, sizeof(np->dev_name)); - cur = delim; - } - cur++; - - if (*cur != '@') { - /* dst port */ - if ((delim = strchr(cur, '@')) == NULL) - goto parse_failed; - *delim = 0; - if (*cur == ' ' || *cur == '\t') - np_info(np, "warning: whitespace is not allowed\n"); - if (kstrtou16(cur, 10, &np->remote_port)) - goto parse_failed; - cur = delim; - } - cur++; - - /* dst ip */ - if ((delim = strchr(cur, '/')) == NULL) - goto parse_failed; - *delim = 0; - ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); - if (ipv6 < 0) - goto parse_failed; - else if (ipversion_set && np->ipv6 != (bool)ipv6) - goto parse_failed; - else - np->ipv6 = (bool)ipv6; - cur = delim + 1; - - if (*cur != 0) { - /* MAC address */ - if (!mac_pton(cur, np->remote_mac)) - goto parse_failed; - } - - netpoll_print_options(np); - - return 0; - - parse_failed: - np_info(np, "couldn't parse config at '%s'!\n", cur); - return -1; + refill_skbs(np); } -EXPORT_SYMBOL(netpoll_parse_options); int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { @@ -609,17 +553,18 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) const struct net_device_ops *ops; int err; - np->dev = ndev; - strscpy(np->dev_name, ndev->name, IFNAMSIZ); + skb_queue_head_init(&np->skb_pool); + INIT_WORK(&np->refill_wq, refill_skbs_work_handler); if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", - np->dev_name); + ndev->name); err = -ENOTSUPP; goto out; } - if (!ndev->npinfo) { + npinfo = rtnl_dereference(ndev->npinfo); + if (!npinfo) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; @@ -632,18 +577,21 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) refcount_set(&npinfo->refcnt, 1); - ops = np->dev->netdev_ops; + ops = ndev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo); + err = 
ops->ndo_netpoll_setup(ndev); if (err) goto free_npinfo; } } else { - npinfo = rtnl_dereference(ndev->npinfo); refcount_inc(&npinfo->refcnt); } - npinfo->netpoll = np; + np->dev = ndev; + strscpy(np->dev_name, ndev->name, IFNAMSIZ); + + /* fill up the skb queue */ + refill_skbs(np); /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); @@ -657,144 +605,187 @@ out: } EXPORT_SYMBOL_GPL(__netpoll_setup); +/* + * Returns a pointer to a string representation of the identifier used + * to select the egress interface for the given netpoll instance. buf + * must be a buffer of length at least MAC_ADDR_STR_LEN + 1. + */ +static char *egress_dev(struct netpoll *np, char *buf) +{ + if (np->dev_name[0]) + return np->dev_name; + + snprintf(buf, MAC_ADDR_STR_LEN, "%pM", np->dev_mac); + return buf; +} + +static void netpoll_wait_carrier(struct netpoll *np, struct net_device *ndev, + unsigned int timeout) +{ + unsigned long atmost; + + atmost = jiffies + timeout * HZ; + while (!netif_carrier_ok(ndev)) { + if (time_after(jiffies, atmost)) { + np_notice(np, "timeout waiting for carrier\n"); + break; + } + msleep(1); + } +} + +/* + * Take the IPv6 from ndev and populate local_ip structure in netpoll + */ +static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) +{ + char buf[MAC_ADDR_STR_LEN + 1]; + int err = -EDESTADDRREQ; + struct inet6_dev *idev; + + if (!IS_ENABLED(CONFIG_IPV6)) { + np_err(np, "IPv6 is not supported %s, aborting\n", + egress_dev(np, buf)); + return -EINVAL; + } + + idev = __in6_dev_get(ndev); + if (idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != + !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) + continue; + /* Got the IP, let's return */ + np->local_ip.in6 = ifp->addr; + err = 0; + break; + } + read_unlock_bh(&idev->lock); + } + if (err) { + np_err(np, "no IPv6 address for %s, aborting\n", + egress_dev(np, buf)); + return err; + } + + np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); + return 0; +} + +/* + * Take the IPv4 from ndev and populate local_ip structure in netpoll + */ +static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev) +{ + char buf[MAC_ADDR_STR_LEN + 1]; + const struct in_ifaddr *ifa; + struct in_device *in_dev; + + in_dev = __in_dev_get_rtnl(ndev); + if (!in_dev) { + np_err(np, "no IP address for %s, aborting\n", + egress_dev(np, buf)); + return -EDESTADDRREQ; + } + + ifa = rtnl_dereference(in_dev->ifa_list); + if (!ifa) { + np_err(np, "no IP address for %s, aborting\n", + egress_dev(np, buf)); + return -EDESTADDRREQ; + } + + np->local_ip.ip = ifa->ifa_local; + np_info(np, "local IP %pI4\n", &np->local_ip.ip); + + return 0; +} + int netpoll_setup(struct netpoll *np) { + struct net *net = current->nsproxy->net_ns; + char buf[MAC_ADDR_STR_LEN + 1]; struct net_device *ndev = NULL; - struct in_device *in_dev; + bool ip_overwritten = false; int err; rtnl_lock(); - if (np->dev_name[0]) { - struct net *net = current->nsproxy->net_ns; + if (np->dev_name[0]) ndev = __dev_get_by_name(net, np->dev_name); - } + else if (is_valid_ether_addr(np->dev_mac)) + ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac); + if (!ndev) { - np_err(np, "%s doesn't exist, aborting\n", np->dev_name); + np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf)); err = -ENODEV; goto unlock; } - dev_hold(ndev); + netdev_hold(ndev, &np->dev_tracker, 
GFP_KERNEL); if (netdev_master_upper_dev_get(ndev)) { - np_err(np, "%s is a slave device, aborting\n", np->dev_name); + np_err(np, "%s is a slave device, aborting\n", + egress_dev(np, buf)); err = -EBUSY; goto put; } if (!netif_running(ndev)) { - unsigned long atmost, atleast; - - np_info(np, "device %s not up yet, forcing it\n", np->dev_name); + np_info(np, "device %s not up yet, forcing it\n", + egress_dev(np, buf)); err = dev_open(ndev, NULL); - if (err) { np_err(np, "failed to open %s\n", ndev->name); goto put; } rtnl_unlock(); - atleast = jiffies + HZ/10; - atmost = jiffies + carrier_timeout * HZ; - while (!netif_carrier_ok(ndev)) { - if (time_after(jiffies, atmost)) { - np_notice(np, "timeout waiting for carrier\n"); - break; - } - msleep(1); - } - - /* If carrier appears to come up instantly, we don't - * trust it and pause so that we don't pump all our - * queued console messages into the bitbucket. - */ - - if (time_before(jiffies, atleast)) { - np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n"); - msleep(4000); - } + netpoll_wait_carrier(np, ndev, carrier_timeout); rtnl_lock(); } if (!np->local_ip.ip) { if (!np->ipv6) { - const struct in_ifaddr *ifa; - - in_dev = __in_dev_get_rtnl(ndev); - if (!in_dev) - goto put_noaddr; - - ifa = rtnl_dereference(in_dev->ifa_list); - if (!ifa) { -put_noaddr: - np_err(np, "no IP address for %s, aborting\n", - np->dev_name); - err = -EDESTADDRREQ; + err = netpoll_take_ipv4(np, ndev); + if (err) goto put; - } - - np->local_ip.ip = ifa->ifa_local; - np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_dev *idev; - - err = -EDESTADDRREQ; - idev = __in6_dev_get(ndev); - if (idev) { - struct inet6_ifaddr *ifp; - - read_lock_bh(&idev->lock); - list_for_each_entry(ifp, &idev->addr_list, if_list) { - if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != - !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) - continue; - np->local_ip.in6 = ifp->addr; - err = 0; - break; - } - read_unlock_bh(&idev->lock); - } - if (err) { - np_err(np, "no IPv6 address for %s, aborting\n", - np->dev_name); + err = netpoll_take_ipv6(np, ndev); + if (err) goto put; - } else - np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); -#else - np_err(np, "IPv6 is not supported %s, aborting\n", - np->dev_name); - err = -EINVAL; - goto put; -#endif } + ip_overwritten = true; } - /* fill up the skb queue */ - refill_skbs(); - err = __netpoll_setup(np, ndev); if (err) - goto put; - netdev_tracker_alloc(ndev, &np->dev_tracker, GFP_KERNEL); + goto flush; rtnl_unlock(); + + /* Make sure all NAPI polls which started before dev->npinfo + * was visible have exited before we start calling NAPI poll. + * NAPI skips locking if dev->npinfo is NULL. 
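+	 * Once synchronize_rcu() returns, any poller that observed a
+	 * NULL dev->npinfo (and therefore skipped the netpoll locking)
+	 * is guaranteed to have completed.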
+ */ + synchronize_rcu(); + return 0; +flush: + skb_pool_flush(np); put: - dev_put(ndev); + DEBUG_NET_WARN_ON_ONCE(np->dev); + if (ip_overwritten) + memset(&np->local_ip, 0, sizeof(np->local_ip)); + netdev_put(ndev, &np->dev_tracker); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(netpoll_setup); -static int __init netpoll_init(void) -{ - skb_queue_head_init(&skb_pool); - return 0; -} -core_initcall(netpoll_init); - static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) { struct netpoll_info *npinfo = @@ -812,7 +803,7 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) kfree(npinfo); } -void __netpoll_cleanup(struct netpoll *np) +static void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; @@ -820,8 +811,10 @@ void __netpoll_cleanup(struct netpoll *np) if (!npinfo) return; - synchronize_srcu(&netpoll_srcu); - + /* At this point, there is a single npinfo instance per netdevice, and + * its refcnt tracks how many netpoll structures are linked to it. We + * only perform npinfo cleanup when the refcnt decrements to zero. + */ if (refcount_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; @@ -831,30 +824,36 @@ void __netpoll_cleanup(struct netpoll *np) RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); - } else - RCU_INIT_POINTER(np->dev->npinfo, NULL); + } + + skb_pool_flush(np); } -EXPORT_SYMBOL_GPL(__netpoll_cleanup); void __netpoll_free(struct netpoll *np) { ASSERT_RTNL(); /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu(); + synchronize_net(); __netpoll_cleanup(np); kfree(np); } EXPORT_SYMBOL_GPL(__netpoll_free); +void do_netpoll_cleanup(struct netpoll *np) +{ + __netpoll_cleanup(np); + netdev_put(np->dev, &np->dev_tracker); + np->dev = NULL; +} +EXPORT_SYMBOL(do_netpoll_cleanup); + void netpoll_cleanup(struct netpoll *np) { rtnl_lock(); if (!np->dev) goto out; - __netpoll_cleanup(np); - netdev_put(np->dev, &np->dev_tracker); - np->dev = NULL; + do_netpoll_cleanup(np); out: rtnl_unlock(); } diff --git a/net/core/of_net.c b/net/core/of_net.c index 55d3fe229269..93ea425b9248 100644 --- a/net/core/of_net.c +++ b/net/core/of_net.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <linux/of_net.h> #include <linux/of_platform.h> +#include <linux/platform_device.h> #include <linux/phy.h> #include <linux/export.h> #include <linux/device.h> diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9b203d8660e4..265a729431bb 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -5,12 +5,16 @@ * Copyright (C) 2016 Red Hat, Inc. 
*/ +#include <linux/error-injection.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/device.h> -#include <net/page_pool.h> +#include <net/netdev_lock.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <net/xdp.h> #include <linux/dma-direction.h> @@ -19,15 +23,25 @@ #include <linux/mm.h> /* for put_page() */ #include <linux/poison.h> #include <linux/ethtool.h> +#include <linux/netdevice.h> #include <trace/events/page_pool.h> +#include "dev.h" +#include "mp_dmabuf_devmem.h" +#include "netmem_priv.h" +#include "page_pool_priv.h" + +DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); + #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) -#define BIAS_MAX LONG_MAX +#define BIAS_MAX (LONG_MAX >> 1) #ifdef CONFIG_PAGE_POOL_STATS +static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats); + /* alloc_stat_inc is intended to be used in softirq context */ #define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) /* recycle_stat_inc is safe to use when preemption is possible. */ @@ -57,7 +71,18 @@ static const char pp_stats[][ETH_GSTRING_LEN] = { "rx_pp_recycle_released_ref", }; -bool page_pool_get_stats(struct page_pool *pool, +/** + * page_pool_get_stats() - fetch page pool stats + * @pool: pool from which page was allocated + * @stats: struct page_pool_stats to fill in + * + * Retrieve statistics about the page_pool. This API is only available + * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. + * A pointer to a caller allocated struct page_pool_stats structure + * is passed to this API which is filled in. The caller can then report + * those stats to the user (perhaps via ethtool, debugfs, etc.). + */ +bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats) { int cpu = 0; @@ -107,9 +132,9 @@ int page_pool_ethtool_stats_get_count(void) } EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); -u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { - struct page_pool_stats *pool_stats = stats; + const struct page_pool_stats *pool_stats = stats; *data++ = pool_stats->alloc_stats.fast; *data++ = pool_stats->alloc_stats.slow; @@ -128,79 +153,189 @@ u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) EXPORT_SYMBOL(page_pool_ethtool_stats_get); #else -#define alloc_stat_inc(pool, __stat) -#define recycle_stat_inc(pool, __stat) -#define recycle_stat_add(pool, __stat, val) +#define alloc_stat_inc(...) do { } while (0) +#define recycle_stat_inc(...) do { } while (0) +#define recycle_stat_add(...) 
do { } while (0) #endif +static bool page_pool_producer_lock(struct page_pool *pool) + __acquires(&pool->ring.producer_lock) +{ + bool in_softirq = in_softirq(); + + if (in_softirq) + spin_lock(&pool->ring.producer_lock); + else + spin_lock_bh(&pool->ring.producer_lock); + + return in_softirq; +} + +static void page_pool_producer_unlock(struct page_pool *pool, + bool in_softirq) + __releases(&pool->ring.producer_lock) +{ + if (in_softirq) + spin_unlock(&pool->ring.producer_lock); + else + spin_unlock_bh(&pool->ring.producer_lock); +} + +static void page_pool_struct_check(void) +{ + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); + CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, + PAGE_POOL_FRAG_GROUP_ALIGN); +} + static int page_pool_init(struct page_pool *pool, - const struct page_pool_params *params) + const struct page_pool_params *params, + int cpuid) { unsigned int ring_qsize = 1024; /* Default */ + struct netdev_rx_queue *rxq; + int err; + + page_pool_struct_check(); - memcpy(&pool->p, params, sizeof(pool->p)); + memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); + memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); + + pool->cpuid = cpuid; + pool->dma_sync_for_cpu = true; /* Validate only known flags were used */ - if (pool->p.flags & ~(PP_FLAG_ALL)) + if (pool->slow.flags & ~PP_FLAG_ALL) return -EINVAL; if (pool->p.pool_size) - ring_qsize = pool->p.pool_size; - - /* Sanity limit mem that can be pinned down */ - if (ring_qsize > 32768) - return -E2BIG; + ring_qsize = min(pool->p.pool_size, 16384); /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, * which is the XDP_TX use-case. */ - if (pool->p.flags & PP_FLAG_DMA_MAP) { + if (pool->slow.flags & PP_FLAG_DMA_MAP) { if ((pool->p.dma_dir != DMA_FROM_DEVICE) && (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; + + pool->dma_map = true; } - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { + if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { /* In order to request DMA-sync-for-device the page * needs to be mapped */ - if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) return -EINVAL; if (!pool->p.max_len) return -EINVAL; + pool->dma_sync = true; + /* pool->p.offset has to be set according to the address * offset used by the DMA engine to start copying rx data */ } - if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT && - pool->p.flags & PP_FLAG_PAGE_FRAG) - return -EINVAL; + pool->has_init_callback = !!pool->slow.init_callback; #ifdef CONFIG_PAGE_POOL_STATS - pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); - if (!pool->recycle_stats) - return -ENOMEM; + if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) { + pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); + if (!pool->recycle_stats) + return -ENOMEM; + } else { + /* For system page pool instance we use a singular stats object + * instead of allocating a separate percpu variable for each + * (also percpu) page pool instance. 
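+		 * Annotation: the shared object is still percpu, so
+		 * increments from different CPUs land in disjoint slots and
+		 * the counters of all system pools are simply aggregated at
+		 * read time.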
+ */ + pool->recycle_stats = &pp_system_recycle_stats; + pool->system = true; + } #endif - if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { +#ifdef CONFIG_PAGE_POOL_STATS + if (!pool->system) + free_percpu(pool->recycle_stats); +#endif return -ENOMEM; + } atomic_set(&pool->pages_state_release_cnt, 0); /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); - if (pool->p.flags & PP_FLAG_DMA_MAP) - get_device(pool->p.dev); + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); + + if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { + netdev_assert_locked(pool->slow.netdev); + rxq = __netif_get_rx_queue(pool->slow.netdev, + pool->slow.queue_idx); + pool->mp_priv = rxq->mp_params.mp_priv; + pool->mp_ops = rxq->mp_params.mp_ops; + } + + if (pool->mp_ops) { + if (!pool->dma_map || !pool->dma_sync) { + err = -EOPNOTSUPP; + goto free_ptr_ring; + } + + if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { + err = -EFAULT; + goto free_ptr_ring; + } + + err = pool->mp_ops->init(pool); + if (err) { + pr_warn("%s() mem-provider init failed %d\n", __func__, + err); + goto free_ptr_ring; + } + + static_branch_inc(&page_pool_mem_providers); + } else if (pool->p.order > MAX_PAGE_ORDER) { + err = -EINVAL; + goto free_ptr_ring; + } return 0; + +free_ptr_ring: + ptr_ring_cleanup(&pool->ring, NULL); + xa_destroy(&pool->dma_mapped); +#ifdef CONFIG_PAGE_POOL_STATS + if (!pool->system) + free_percpu(pool->recycle_stats); +#endif + return err; } -struct page_pool *page_pool_create(const struct page_pool_params *params) +static void page_pool_uninit(struct page_pool *pool) +{ + ptr_ring_cleanup(&pool->ring, NULL); + xa_destroy(&pool->dma_mapped); + +#ifdef CONFIG_PAGE_POOL_STATS + if (!pool->system) + free_percpu(pool->recycle_stats); +#endif +} + +/** + * page_pool_create_percpu() - create a page pool for a given cpu. 
+ * @params: parameters, see struct page_pool_params + * @cpuid: cpu identifier + */ +struct page_pool * +page_pool_create_percpu(const struct page_pool_params *params, int cpuid) { struct page_pool *pool; int err; @@ -209,30 +344,47 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) if (!pool) return ERR_PTR(-ENOMEM); - err = page_pool_init(pool, params); - if (err < 0) { - pr_warn("%s() gave up with errno %d\n", __func__, err); - kfree(pool); - return ERR_PTR(err); - } + err = page_pool_init(pool, params, cpuid); + if (err < 0) + goto err_free; + + err = page_pool_list(pool); + if (err) + goto err_uninit; return pool; + +err_uninit: + page_pool_uninit(pool); +err_free: + pr_warn("%s() gave up with errno %d\n", __func__, err); + kfree(pool); + return ERR_PTR(err); +} +EXPORT_SYMBOL(page_pool_create_percpu); + +/** + * page_pool_create() - create a page pool + * @params: parameters, see struct page_pool_params + */ +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + return page_pool_create_percpu(params, -1); } EXPORT_SYMBOL(page_pool_create); -static void page_pool_return_page(struct page_pool *pool, struct page *page); +static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem); -noinline -static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) +static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; - struct page *page; + netmem_ref netmem; int pref_nid; /* preferred NUMA node */ /* Quicker fallback, avoid locks when ring is empty */ if (__ptr_ring_empty(r)) { alloc_stat_inc(pool, empty); - return NULL; + return 0; } /* Softirq guarantee CPU and thus NUMA node is stable. This, @@ -247,99 +399,166 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) /* Refill alloc array, but only if NUMA match */ do { - page = __ptr_ring_consume(r); - if (unlikely(!page)) + netmem = (__force netmem_ref)__ptr_ring_consume(r); + if (unlikely(!netmem)) break; - if (likely(page_to_nid(page) == pref_nid)) { - pool->alloc.cache[pool->alloc.count++] = page; + if (likely(netmem_is_pref_nid(netmem, pref_nid))) { + pool->alloc.cache[pool->alloc.count++] = netmem; } else { /* NUMA mismatch; * (1) release 1 page to page-allocator and * (2) break out to fallthrough to alloc_pages_node. * This limit stress on page buddy alloactor. */ - page_pool_return_page(pool, page); + page_pool_return_netmem(pool, netmem); alloc_stat_inc(pool, waive); - page = NULL; + netmem = 0; break; } } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ if (likely(pool->alloc.count > 0)) { - page = pool->alloc.cache[--pool->alloc.count]; + netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, refill); } - return page; + return netmem; } /* fast path */ -static struct page *__page_pool_get_cached(struct page_pool *pool) +static netmem_ref __page_pool_get_cached(struct page_pool *pool) { - struct page *page; + netmem_ref netmem; /* Caller MUST guarantee safe non-concurrent access, e.g. 
softirq */ if (likely(pool->alloc.count)) { /* Fast-path */ - page = pool->alloc.cache[--pool->alloc.count]; + netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, fast); } else { - page = page_pool_refill_alloc_cache(pool); + netmem = page_pool_refill_alloc_cache(pool); } - return page; + return netmem; } -static void page_pool_dma_sync_for_device(struct page_pool *pool, - struct page *page, - unsigned int dma_sync_size) +static void __page_pool_dma_sync_for_device(const struct page_pool *pool, + netmem_ref netmem, + u32 dma_sync_size) { - dma_addr_t dma_addr = page_pool_get_dma_addr(page); +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) + dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem); dma_sync_size = min(dma_sync_size, pool->p.max_len); - dma_sync_single_range_for_device(pool->p.dev, dma_addr, - pool->p.offset, dma_sync_size, - pool->p.dma_dir); + __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, + dma_sync_size, pool->p.dma_dir); +#endif } -static bool page_pool_dma_map(struct page_pool *pool, struct page *page) +static __always_inline void +page_pool_dma_sync_for_device(const struct page_pool *pool, + netmem_ref netmem, + u32 dma_sync_size) +{ + if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { + rcu_read_lock(); + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ + if (pool->dma_sync) + __page_pool_dma_sync_for_device(pool, netmem, + dma_sync_size); + rcu_read_unlock(); + } +} + +static int page_pool_register_dma_index(struct page_pool *pool, + netmem_ref netmem, gfp_t gfp) +{ + int err = 0; + u32 id; + + if (unlikely(!PP_DMA_INDEX_BITS)) + goto out; + + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto out; + } + + netmem_set_dma_index(netmem, id); +out: + return err; +} + +static int page_pool_release_dma_index(struct page_pool *pool, + netmem_ref netmem) +{ + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; + + if (unlikely(!PP_DMA_INDEX_BITS)) + return 0; + + id = netmem_get_dma_index(netmem); + if (!id) + return -1; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return -1; + + netmem_set_dma_index(netmem, 0); + + return 0; +} + +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; + int err; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit * into page private data (i.e 32bit cpu with 64bit DMA caps) * This mapping is kept for lifetime of page, until leaving pool. 
*/ - dma = dma_map_page_attrs(pool->p.dev, page, 0, - (PAGE_SIZE << pool->p.order), - pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); + dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0, + (PAGE_SIZE << pool->p.order), pool->p.dma_dir, + DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_WEAK_ORDERING); if (dma_mapping_error(pool->p.dev, dma)) return false; - page_pool_set_dma_addr(page, dma); + if (page_pool_set_dma_addr_netmem(netmem, dma)) { + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); + goto unmap_failed; + } - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + err = page_pool_register_dma_index(pool, netmem, gfp); + if (err) + goto unset_failed; - return true; -} + page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); -static void page_pool_set_pp_info(struct page_pool *pool, - struct page *page) -{ - page->pp = pool; - page->pp_magic |= PP_SIGNATURE; - if (pool->p.init_callback) - pool->p.init_callback(page, pool->p.init_arg); -} + return true; -static void page_pool_clear_pp_info(struct page *page) -{ - page->pp_magic = 0; - page->pp = NULL; +unset_failed: + page_pool_set_dma_addr_netmem(netmem, 0); +unmap_failed: + dma_unmap_page_attrs(pool->p.dev, dma, + PAGE_SIZE << pool->p.order, pool->p.dma_dir, + DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); + return false; } static struct page *__page_pool_alloc_page_order(struct page_pool *pool, @@ -352,94 +571,108 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, if (unlikely(!page)) return NULL; - if ((pool->p.flags & PP_FLAG_DMA_MAP) && - unlikely(!page_pool_dma_map(pool, page))) { + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } alloc_stat_inc(pool, slow_high_order); - page_pool_set_pp_info(pool, page); + page_pool_set_pp_info(pool, page_to_netmem(page)); /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; - trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); + trace_page_pool_state_hold(pool, page_to_netmem(page), + pool->pages_state_hold_cnt); return page; } /* slow path */ -noinline -static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, - gfp_t gfp) +static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool, + gfp_t gfp) { const int bulk = PP_ALLOC_CACHE_REFILL; - unsigned int pp_flags = pool->p.flags; unsigned int pp_order = pool->p.order; - struct page *page; + bool dma_map = pool->dma_map; + netmem_ref netmem; int i, nr_pages; + /* Unconditionally set NOWARN if allocating from NAPI. + * Drivers forget to set it, and OOM reports on packet Rx are useless. 
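+ * E.g. a NAPI Rx refill that passes GFP_ATOMIC to
+ * page_pool_alloc_netmems() gets __GFP_NOWARN behaviour without
+ * having to request it.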
+ */ + if ((gfp & GFP_ATOMIC) == GFP_ATOMIC) + gfp |= __GFP_NOWARN; + /* Don't support bulk alloc for high-order pages */ if (unlikely(pp_order)) - return __page_pool_alloc_page_order(pool, gfp); + return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); /* Unnecessary as alloc cache is empty, but guarantees zero count */ if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; - /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); - nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk, - pool->alloc.cache); + nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, + (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) - return NULL; + return 0; /* Pages have been filled into alloc.cache array, but count is zero and * page element have not been (possibly) DMA mapped. */ for (i = 0; i < nr_pages; i++) { - page = pool->alloc.cache[i]; - if ((pp_flags & PP_FLAG_DMA_MAP) && - unlikely(!page_pool_dma_map(pool, page))) { - put_page(page); + netmem = pool->alloc.cache[i]; + if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { + put_page(netmem_to_page(netmem)); continue; } - page_pool_set_pp_info(pool, page); - pool->alloc.cache[pool->alloc.count++] = page; + page_pool_set_pp_info(pool, netmem); + pool->alloc.cache[pool->alloc.count++] = netmem; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; - trace_page_pool_state_hold(pool, page, + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); } /* Return last page */ if (likely(pool->alloc.count > 0)) { - page = pool->alloc.cache[--pool->alloc.count]; + netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, slow); } else { - page = NULL; + netmem = 0; } /* When page just alloc'ed is should/must have refcnt 1. */ - return page; + return netmem; } /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. 
*/ -struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) { - struct page *page; + netmem_ref netmem; /* Fast-path: Get a page from cache */ - page = __page_pool_get_cached(pool); - if (page) - return page; + netmem = __page_pool_get_cached(pool); + if (netmem) + return netmem; /* Slow-path: cache empty, do real allocation */ - page = __page_pool_alloc_pages_slow(pool, gfp); - return page; + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + netmem = pool->mp_ops->alloc_netmems(pool, gfp); + else + netmem = __page_pool_alloc_netmems_slow(pool, gfp); + return netmem; +} +EXPORT_SYMBOL(page_pool_alloc_netmems); +ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL); + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); } EXPORT_SYMBOL(page_pool_alloc_pages); @@ -448,7 +681,7 @@ EXPORT_SYMBOL(page_pool_alloc_pages); */ #define _distance(a, b) (s32)((a) - (b)) -static s32 page_pool_inflight(struct page_pool *pool) +s32 page_pool_inflight(const struct page_pool *pool, bool strict) { u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); @@ -456,73 +689,106 @@ static s32 page_pool_inflight(struct page_pool *pool) inflight = _distance(hold_cnt, release_cnt); - trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); - WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); + if (strict) { + trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); + WARN(inflight < 0, "Negative(%d) inflight packet-pages", + inflight); + } else { + inflight = max(0, inflight); + } return inflight; } -/* Disconnects a page (from a page_pool). API users can have a need - * to disconnect a page (from a page_pool), to allow it to be used as - * a regular page (that will eventually be returned to the normal - * page-allocator via put_page). - */ -void page_pool_release_page(struct page_pool *pool, struct page *page) +void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) +{ + netmem_set_pp(netmem, pool); + netmem_or_pp_magic(netmem, PP_SIGNATURE); + + /* Ensuring all pages have been split into one fragment initially: + * page_pool_set_pp_info() is only called once for every page when it + * is allocated from the page allocator and page_pool_fragment_page() + * is dirtying the same cache line as the page->pp_magic above, so + * the overhead is negligible. 
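+ * The frag API re-arms this count with BIAS_MAX in
+ * page_pool_alloc_frag_netmem() once a page actually gets split.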
+ */ + page_pool_fragment_netmem(netmem, 1); + if (pool->has_init_callback) + pool->slow.init_callback(netmem, pool->slow.init_arg); +} + +void page_pool_clear_pp_info(netmem_ref netmem) +{ + netmem_clear_pp_magic(netmem); + netmem_set_pp(netmem, NULL); +} + +static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool, + netmem_ref netmem) { dma_addr_t dma; - int count; - if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + if (!pool->dma_map) /* Always account for inflight pages, even if we didn't * map them */ - goto skip_dma_unmap; + return; + + if (page_pool_release_dma_index(pool, netmem)) + return; - dma = page_pool_get_dma_addr(page); + dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, - DMA_ATTR_SKIP_CPU_SYNC); - page_pool_set_dma_addr(page, 0); -skip_dma_unmap: - page_pool_clear_pp_info(page); + DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); + page_pool_set_dma_addr_netmem(netmem, 0); +} + +/* Disconnects a page (from a page_pool). API users can have a need + * to disconnect a page (from a page_pool), to allow it to be used as + * a regular page (that will eventually be returned to the normal + * page-allocator via put_page). + */ +static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem) +{ + int count; + bool put; + + put = true; + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + put = pool->mp_ops->release_netmem(pool, netmem); + else + __page_pool_release_netmem_dma(pool, netmem); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); - trace_page_pool_state_release(pool, page, count); -} -EXPORT_SYMBOL(page_pool_release_page); + trace_page_pool_state_release(pool, netmem, count); -/* Return a page to the page allocator, cleaning up our state */ -static void page_pool_return_page(struct page_pool *pool, struct page *page) -{ - page_pool_release_page(pool, page); - - put_page(page); + if (put) { + page_pool_clear_pp_info(netmem); + put_page(netmem_to_page(netmem)); + } /* An optimization would be to call __free_pages(page, pool->p.order) * knowing page is not part of page-cache (thus avoiding a * __page_cache_release() call). */ } -static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) +static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) { - int ret; - /* BH protection not needed if current is serving softirq */ - if (in_serving_softirq()) - ret = ptr_ring_produce(&pool->ring, page); - else - ret = ptr_ring_produce_bh(&pool->ring, page); + bool in_softirq, ret; - if (!ret) { + /* BH protection not needed if current is softirq */ + in_softirq = page_pool_producer_lock(pool); + ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem); + if (ret) recycle_stat_inc(pool, ring); - return true; - } + page_pool_producer_unlock(pool, in_softirq); - return false; + return ret; } /* Only allow direct recycling in special circumstances, into the @@ -530,7 +796,7 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) * * Caller must provide appropriate safe context. 
*/ -static bool page_pool_recycle_in_cache(struct page *page, +static bool page_pool_recycle_in_cache(netmem_ref netmem, struct page_pool *pool) { if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { @@ -539,21 +805,30 @@ static bool page_pool_recycle_in_cache(struct page *page, } /* Caller MUST have verified/know (page_ref_count(page) == 1) */ - pool->alloc.cache[pool->alloc.count++] = page; + pool->alloc.cache[pool->alloc.count++] = netmem; recycle_stat_inc(pool, cached); return true; } +static bool __page_pool_page_can_be_recycled(netmem_ref netmem) +{ + return netmem_is_net_iov(netmem) || + (page_ref_count(netmem_to_page(netmem)) == 1 && + !page_is_pfmemalloc(netmem_to_page(netmem))); +} + /* If the page refcnt == 1, this will try to recycle the page. - * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for + * If pool->dma_sync is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ -static __always_inline struct page * -__page_pool_put_page(struct page_pool *pool, struct page *page, +static __always_inline netmem_ref +__page_pool_put_page(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { + lockdep_assert_no_hardirq(); + /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. @@ -563,20 +838,18 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * page is NOT reusable when allocated when system is under * some pressure. (page_is_pfmemalloc) */ - if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { + if (likely(__page_pool_page_can_be_recycled(netmem))) { /* Read barrier done in page_ref_count / READ_ONCE */ - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, - dma_sync_size); + page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); - if (allow_direct && in_serving_softirq() && - page_pool_recycle_in_cache(page, pool)) - return NULL; + if (allow_direct && page_pool_recycle_in_cache(netmem, pool)) + return 0; /* Page found as candidate for recycling */ - return page; + return netmem; } + /* Fallback/non-XDP mode: API user have elevated refcnt. * * Many drivers split up the page into fragments, and some @@ -591,184 +864,279 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * will be invoking put_page. */ recycle_stat_inc(pool, released_refcnt); - /* Do not replace this with page_pool_return_page() */ - page_pool_release_page(pool, page); - put_page(page); + page_pool_return_netmem(pool, netmem); - return NULL; + return 0; } -void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, +static bool page_pool_napi_local(const struct page_pool *pool) +{ + const struct napi_struct *napi; + u32 cpuid; + + /* On PREEMPT_RT the softirq can be preempted by the consumer */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; + + if (unlikely(!in_softirq())) + return false; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. 
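+ * E.g. a pool created with the napi field set in struct
+ * page_pool_params (or attached later via
+ * page_pool_enable_direct_recycling()) can recycle straight into
+ * pool->alloc.cache when pages are freed from that NAPI's poll.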
+ */ + cpuid = smp_processor_id(); + if (READ_ONCE(pool->cpuid) == cpuid) + return true; + + napi = READ_ONCE(pool->p.napi); + + return napi && READ_ONCE(napi->list_owner) == cpuid; +} + +void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { - page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); - if (page && !page_pool_recycle_in_ring(pool, page)) { + if (!allow_direct) + allow_direct = page_pool_napi_local(pool); + + netmem = __page_pool_put_page(pool, netmem, dma_sync_size, + allow_direct); + if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { /* Cache full, fallback to free pages */ recycle_stat_inc(pool, ring_full); - page_pool_return_page(pool, page); + page_pool_return_netmem(pool, netmem); } } -EXPORT_SYMBOL(page_pool_put_defragged_page); +EXPORT_SYMBOL(page_pool_put_unrefed_netmem); -/* Caller must not use data area after call, as this function overwrites it */ -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { - int i, bulk_len = 0; - - for (i = 0; i < count; i++) { - struct page *page = virt_to_head_page(data[i]); - - /* It is not the last user for the page frag case */ - if (!page_pool_is_last_frag(pool, page)) - continue; + page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size, + allow_direct); +} +EXPORT_SYMBOL(page_pool_put_unrefed_page); - page = __page_pool_put_page(pool, page, -1, false); - /* Approved for bulk recycling in ptr_ring cache */ - if (page) - data[bulk_len++] = page; - } +static void page_pool_recycle_ring_bulk(struct page_pool *pool, + netmem_ref *bulk, + u32 bulk_len) +{ + bool in_softirq; + u32 i; - if (unlikely(!bulk_len)) - return; + /* Bulk produce into ptr_ring page_pool cache */ + in_softirq = page_pool_producer_lock(pool); - /* Bulk producer into ptr_ring page_pool cache */ - page_pool_ring_lock(pool); for (i = 0; i < bulk_len; i++) { - if (__ptr_ring_produce(&pool->ring, data[i])) { + if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { /* ring full */ recycle_stat_inc(pool, ring_full); break; } } + + page_pool_producer_unlock(pool, in_softirq); recycle_stat_add(pool, ring, i); - page_pool_ring_unlock(pool); - /* Hopefully all pages was return into ptr_ring */ + /* Hopefully all pages were returned into ptr_ring */ if (likely(i == bulk_len)) return; - /* ptr_ring cache full, free remaining pages outside producer lock - * since put_page() with refcnt == 1 can be an expensive operation + /* + * ptr_ring cache is full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation. */ for (; i < bulk_len; i++) - page_pool_return_page(pool, data[i]); + page_pool_return_netmem(pool, bulk[i]); } -EXPORT_SYMBOL(page_pool_put_page_bulk); -static struct page *page_pool_drain_frag(struct page_pool *pool, - struct page *page) +/** + * page_pool_put_netmem_bulk() - release references on multiple netmems + * @data: array holding netmem references + * @count: number of entries in @data + * + * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring + * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk() + * will release leftover netmems to the memory provider. + * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx + * completion loop for the XDP_REDIRECT use case. 
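+ *
+ * A minimal sketch of such a call site (hypothetical driver code,
+ * assuming a helper that collects completed Tx frames)::
+ *
+ *	netmem_ref frames[XDP_BULK_QUEUE_SIZE];
+ *	u32 n;
+ *
+ *	n = my_collect_tx_completions(ring, frames); /* hypothetical */
+ *	if (n)
+ *		page_pool_put_netmem_bulk(frames, n);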
+ * + * Please note the caller must not use data area after running + * page_pool_put_netmem_bulk(), as this function overwrites it. + */ +void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) +{ + u32 bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + netmem_ref netmem = netmem_compound_head(data[i]); + + if (page_pool_unref_and_test(netmem)) + data[bulk_len++] = netmem; + } + + count = bulk_len; + while (count) { + netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; + struct page_pool *pool = NULL; + bool allow_direct; + u32 foreign = 0; + + bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + struct page_pool *netmem_pp; + netmem_ref netmem = data[i]; + + netmem_pp = netmem_get_pp(netmem); + if (unlikely(!pool)) { + pool = netmem_pp; + allow_direct = page_pool_napi_local(pool); + } else if (netmem_pp != pool) { + /* + * If the netmem belongs to a different + * page_pool, save it for another round. + */ + data[foreign++] = netmem; + continue; + } + + netmem = __page_pool_put_page(pool, netmem, -1, + allow_direct); + /* Approved for bulk recycling in ptr_ring cache */ + if (netmem) + bulk[bulk_len++] = netmem; + } + + if (bulk_len) + page_pool_recycle_ring_bulk(pool, bulk, bulk_len); + + count = foreign; + } +} +EXPORT_SYMBOL(page_pool_put_netmem_bulk); + +static netmem_ref page_pool_drain_frag(struct page_pool *pool, + netmem_ref netmem) { long drain_count = BIAS_MAX - pool->frag_users; /* Some user is still using the page frag */ - if (likely(page_pool_defrag_page(page, drain_count))) - return NULL; - - if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, -1); + if (likely(page_pool_unref_netmem(netmem, drain_count))) + return 0; - return page; + if (__page_pool_page_can_be_recycled(netmem)) { + page_pool_dma_sync_for_device(pool, netmem, -1); + return netmem; } - page_pool_return_page(pool, page); - return NULL; + page_pool_return_netmem(pool, netmem); + return 0; } static void page_pool_free_frag(struct page_pool *pool) { long drain_count = BIAS_MAX - pool->frag_users; - struct page *page = pool->frag_page; + netmem_ref netmem = pool->frag_page; - pool->frag_page = NULL; + pool->frag_page = 0; - if (!page || page_pool_defrag_page(page, drain_count)) + if (!netmem || page_pool_unref_netmem(netmem, drain_count)) return; - page_pool_return_page(pool, page); + page_pool_return_netmem(pool, netmem); } -struct page *page_pool_alloc_frag(struct page_pool *pool, - unsigned int *offset, - unsigned int size, gfp_t gfp) +netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, + unsigned int *offset, unsigned int size, + gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; - struct page *page = pool->frag_page; + netmem_ref netmem = pool->frag_page; - if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) || - size > max_size)) - return NULL; + if (WARN_ON(size > max_size)) + return 0; size = ALIGN(size, dma_get_cache_alignment()); *offset = pool->frag_offset; - if (page && *offset + size > max_size) { - page = page_pool_drain_frag(pool, page); - if (page) { + if (netmem && *offset + size > max_size) { + netmem = page_pool_drain_frag(pool, netmem); + if (netmem) { + recycle_stat_inc(pool, cached); alloc_stat_inc(pool, fast); goto frag_reset; } } - if (!page) { - page = page_pool_alloc_pages(pool, gfp); - if (unlikely(!page)) { - pool->frag_page = NULL; - return NULL; + if (!netmem) { + netmem = page_pool_alloc_netmems(pool, gfp); + if (unlikely(!netmem)) { + pool->frag_page = 0; + return 0; } 
- pool->frag_page = page; + pool->frag_page = netmem; frag_reset: pool->frag_users = 1; *offset = 0; pool->frag_offset = size; - page_pool_fragment_page(page, BIAS_MAX); - return page; + page_pool_fragment_netmem(netmem, BIAS_MAX); + return netmem; } pool->frag_users++; pool->frag_offset = *offset + size; - alloc_stat_inc(pool, fast); - return page; + return netmem; +} +EXPORT_SYMBOL(page_pool_alloc_frag_netmem); + +struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, + unsigned int size, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size, + gfp)); } EXPORT_SYMBOL(page_pool_alloc_frag); static void page_pool_empty_ring(struct page_pool *pool) { - struct page *page; + netmem_ref netmem; /* Empty recycle ring */ - while ((page = ptr_ring_consume_bh(&pool->ring))) { + while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { /* Verify the refcnt invariant of cached pages */ - if (!(page_ref_count(page) == 1)) + if (!(netmem_ref_count(netmem) == 1)) pr_crit("%s() page_pool refcnt %d violation\n", - __func__, page_ref_count(page)); + __func__, netmem_ref_count(netmem)); - page_pool_return_page(pool, page); + page_pool_return_netmem(pool, netmem); } } -static void page_pool_free(struct page_pool *pool) +static void __page_pool_destroy(struct page_pool *pool) { if (pool->disconnect) pool->disconnect(pool); - ptr_ring_cleanup(&pool->ring, NULL); + page_pool_unlist(pool); + page_pool_uninit(pool); - if (pool->p.flags & PP_FLAG_DMA_MAP) - put_device(pool->p.dev); + if (pool->mp_ops) { + pool->mp_ops->destroy(pool); + static_branch_dec(&page_pool_mem_providers); + } -#ifdef CONFIG_PAGE_POOL_STATS - free_percpu(pool->recycle_stats); -#endif kfree(pool); } static void page_pool_empty_alloc_cache_once(struct page_pool *pool) { - struct page *page; + netmem_ref netmem; if (pool->destroy_cnt) return; @@ -778,15 +1146,36 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) * call concurrently. */ while (pool->alloc.count) { - page = pool->alloc.cache[--pool->alloc.count]; - page_pool_return_page(pool, page); + netmem = pool->alloc.cache[--pool->alloc.count]; + page_pool_return_netmem(pool, netmem); } } static void page_pool_scrub(struct page_pool *pool) { + unsigned long id; + void *ptr; + page_pool_empty_alloc_cache_once(pool); - pool->destroy_cnt++; + if (!pool->destroy_cnt++ && pool->dma_map) { + if (pool->dma_sync) { + /* Disable page_pool_dma_sync_for_device() */ + pool->dma_sync = false; + + /* Make sure all concurrent returns that may see the old + * value of dma_sync (and thus perform a sync) have + * finished before doing the unmapping below. Skip the + * wait if the device doesn't actually need syncing, or + * if there are no outstanding mapped pages. + */ + if (dma_dev_need_sync(pool->p.dev) && + !xa_empty(&pool->dma_mapped)) + synchronize_net(); + } + + xa_for_each(&pool->dma_mapped, id, ptr) + __page_pool_release_netmem_dma(pool, page_to_netmem((struct page *)ptr)); + } /* No more consumers should exist, but producers could still * be in-flight. @@ -796,12 +1185,16 @@ static void page_pool_scrub(struct page_pool *pool) static int page_pool_release(struct page_pool *pool) { + bool in_softirq; int inflight; page_pool_scrub(pool); - inflight = page_pool_inflight(pool); + inflight = page_pool_inflight(pool, true); + /* Acquire producer lock to make sure producers have exited. 
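+ * A late producer may still be inside page_pool_recycle_in_ring();
+ * taking and dropping the lock acts as a barrier for it.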
*/ + in_softirq = page_pool_producer_lock(pool); + page_pool_producer_unlock(pool, in_softirq); if (!inflight) - page_pool_free(pool); + __page_pool_destroy(pool); return inflight; } @@ -810,18 +1203,27 @@ static void page_pool_release_retry(struct work_struct *wq) { struct delayed_work *dwq = to_delayed_work(wq); struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); + void *netdev; int inflight; inflight = page_pool_release(pool); - if (!inflight) + /* In rare cases, a driver bug may cause inflight to go negative. + * Don't reschedule release if inflight is 0 or negative. + * - If 0, the page_pool has been destroyed + * - if negative, we will never recover + * in both cases no reschedule is necessary. + */ + if (inflight <= 0) return; - /* Periodic warning */ - if (time_after_eq(jiffies, pool->defer_warn)) { + /* Periodic warning for page pools the user can't see */ + netdev = READ_ONCE(pool->slow.netdev); + if (time_after_eq(jiffies, pool->defer_warn) && + (!netdev || netdev == NET_PTR_POISON)) { int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; - pr_warn("%s() stalled pool shutdown %d inflight %d sec\n", - __func__, inflight, sec); + pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", + __func__, pool->user.id, inflight, sec); pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; } @@ -830,13 +1232,60 @@ static void page_pool_release_retry(struct work_struct *wq) } void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), - struct xdp_mem_info *mem) + const struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; pool->xdp_mem_id = mem->id; } +/** + * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI + * @pool: page pool to modify + * @napi: NAPI instance to associate the page pool with + * + * Associate a page pool with a NAPI instance for lockless page recycling. + * This is useful when a new page pool has to be added to a NAPI instance + * without disabling that NAPI instance, to mark the point at which control + * path "hands over" the page pool to the NAPI instance. In most cases driver + * can simply set the @napi field in struct page_pool_params, and does not + * have to call this helper. + * + * The function is idempotent, but does not implement any refcounting. + * Single page_pool_disable_direct_recycling() will disable recycling, + * no matter how many times enable was called. + */ +void page_pool_enable_direct_recycling(struct page_pool *pool, + struct napi_struct *napi) +{ + if (READ_ONCE(pool->p.napi) == napi) + return; + WARN_ON(!napi || pool->p.napi); + + mutex_lock(&page_pools_lock); + WRITE_ONCE(pool->p.napi, napi); + mutex_unlock(&page_pools_lock); +} +EXPORT_SYMBOL(page_pool_enable_direct_recycling); + +void page_pool_disable_direct_recycling(struct page_pool *pool) +{ + /* Disable direct recycling based on pool->cpuid. + * Paired with READ_ONCE() in page_pool_napi_local(). 
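+ * After this store page_pool_napi_local() can no longer match on
+ * cpuid, so frees fall back to the ptr_ring or the page allocator.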
+ */ + WRITE_ONCE(pool->cpuid, -1); + + if (!pool->p.napi) + return; + + napi_assert_will_not_race(pool->p.napi); + + mutex_lock(&page_pools_lock); + WRITE_ONCE(pool->p.napi, NULL); + mutex_unlock(&page_pools_lock); +} +EXPORT_SYMBOL(page_pool_disable_direct_recycling); + void page_pool_destroy(struct page_pool *pool) { if (!pool) @@ -845,11 +1294,13 @@ void page_pool_destroy(struct page_pool *pool) if (!page_pool_put(pool)) return; + page_pool_disable_direct_recycling(pool); page_pool_free_frag(pool); if (!page_pool_release(pool)) return; + page_pool_detached(pool); pool->defer_start = jiffies; pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; @@ -861,44 +1312,43 @@ EXPORT_SYMBOL(page_pool_destroy); /* Caller must provide appropriate safe context, e.g. NAPI. */ void page_pool_update_nid(struct page_pool *pool, int new_nid) { - struct page *page; + netmem_ref netmem; trace_page_pool_update_nid(pool, new_nid); pool->p.nid = new_nid; /* Flush pool alloc cache, as refill will check NUMA node */ while (pool->alloc.count) { - page = pool->alloc.cache[--pool->alloc.count]; - page_pool_return_page(pool, page); + netmem = pool->alloc.cache[--pool->alloc.count]; + page_pool_return_netmem(pool, netmem); } } EXPORT_SYMBOL(page_pool_update_nid); -bool page_pool_return_skb_page(struct page *page) +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) { - struct page_pool *pp; + return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); +} - page = compound_head(page); +/* Associate a niov with a page pool. Should follow with a matching + * net_mp_niov_clear_page_pool() + */ +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. - */ - if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) - return false; + page_pool_set_pp_info(pool, netmem); - pp = page->pp; + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); +} - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. - * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. - */ - page_pool_put_full_page(pp, page, false); +/* Disassociate a niov from a page pool. Should only be used in the + * ->release_netmem() path. 
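+ * Clearing the pp metadata lets a later net_mp_niov_set_page_pool()
+ * re-associate the niov with a (possibly different) pool.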
+ */ +void net_mp_niov_clear_page_pool(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); - return true; + page_pool_clear_pp_info(netmem); } -EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h new file mode 100644 index 000000000000..2fb06d5f6d55 --- /dev/null +++ b/net/core/page_pool_priv.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __PAGE_POOL_PRIV_H +#define __PAGE_POOL_PRIV_H + +#include <net/page_pool/helpers.h> + +#include "netmem_priv.h" + +extern struct mutex page_pools_lock; + +s32 page_pool_inflight(const struct page_pool *pool, bool strict); + +int page_pool_list(struct page_pool *pool); +void page_pool_detached(struct page_pool *pool); +void page_pool_unlist(struct page_pool *pool); + +static inline bool +page_pool_set_dma_addr_netmem(netmem_ref netmem, dma_addr_t addr) +{ + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) { + netmem_set_dma_addr(netmem, addr >> PAGE_SHIFT); + + /* We assume page alignment to shave off bottom bits, + * if this "compression" doesn't work we need to drop. + */ + return addr != (dma_addr_t)netmem_get_dma_addr(netmem) + << PAGE_SHIFT; + } + + netmem_set_dma_addr(netmem, addr); + return false; +} + +static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr); +} + +#if defined(CONFIG_PAGE_POOL) +void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem); +void page_pool_clear_pp_info(netmem_ref netmem); +int page_pool_check_memory_provider(struct net_device *dev, + struct netdev_rx_queue *rxq); +#else +static inline void page_pool_set_pp_info(struct page_pool *pool, + netmem_ref netmem) +{ +} +static inline void page_pool_clear_pp_info(netmem_ref netmem) +{ +} +static inline int page_pool_check_memory_provider(struct net_device *dev, + struct netdev_rx_queue *rxq) +{ + return 0; +} +#endif + +#endif diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c new file mode 100644 index 000000000000..c82a95beceff --- /dev/null +++ b/net/core/page_pool_user.c @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/xarray.h> +#include <net/busy_poll.h> +#include <net/net_debug.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/helpers.h> +#include <net/page_pool/types.h> +#include <net/page_pool/memory_provider.h> +#include <net/sock.h> + +#include "page_pool_priv.h" +#include "netdev-genl-gen.h" + +static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); +/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev, + * pool->user. + * Ordering: inside rtnl_lock + */ +DEFINE_MUTEX(page_pools_lock); + +/* Page pools are only reachable from user space (via netlink) if they are + * linked to a netdev at creation time. 
Following page pool "visibility" + * states are possible: + * - normal + * - user.list: linked to real netdev, netdev: real netdev + * - orphaned - real netdev has disappeared + * - user.list: linked to lo, netdev: lo + * - invisible - either (a) created without netdev linking, (b) unlisted due + * to error, or (c) the entire namespace which owned this pool disappeared + * - user.list: unhashed, netdev: unknown + */ + +typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info); + +static int +netdev_nl_page_pool_get_do(struct genl_info *info, u32 id, pp_nl_fill_cb fill) +{ + struct page_pool *pool; + struct sk_buff *rsp; + int err; + + mutex_lock(&page_pools_lock); + pool = xa_load(&page_pools, id); + if (!pool || hlist_unhashed(&pool->user.list) || + !net_eq(dev_net(pool->slow.netdev), genl_info_net(info))) { + err = -ENOENT; + goto err_unlock; + } + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) { + err = -ENOMEM; + goto err_unlock; + } + + err = fill(rsp, pool, info); + if (err) + goto err_free_msg; + + mutex_unlock(&page_pools_lock); + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); +err_unlock: + mutex_unlock(&page_pools_lock); + return err; +} + +struct page_pool_dump_cb { + unsigned long ifindex; + u32 pp_id; +}; + +static int +netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb, + pp_nl_fill_cb fill) +{ + struct page_pool_dump_cb *state = (void *)cb->ctx; + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + struct page_pool *pool; + int err = 0; + + rtnl_lock(); + mutex_lock(&page_pools_lock); + for_each_netdev_dump(net, netdev, state->ifindex) { + hlist_for_each_entry(pool, &netdev->page_pools, user.list) { + if (state->pp_id && state->pp_id < pool->user.id) + continue; + + state->pp_id = pool->user.id; + err = fill(skb, pool, info); + if (err) + goto out; + } + + state->pp_id = 0; + } +out: + mutex_unlock(&page_pools_lock); + rtnl_unlock(); + + return err; +} + +static int +page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info) +{ +#ifdef CONFIG_PAGE_POOL_STATS + struct page_pool_stats stats = {}; + struct nlattr *nest; + void *hdr; + + if (!page_pool_get_stats(pool, &stats)) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + nest = nla_nest_start(rsp, NETDEV_A_PAGE_POOL_STATS_INFO); + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id) || + (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && + nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, + pool->slow.netdev->ifindex))) + goto err_cancel_nest; + + nla_nest_end(rsp, nest); + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST, + stats.alloc_stats.fast) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW, + stats.alloc_stats.slow) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER, + stats.alloc_stats.slow_high_order) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY, + stats.alloc_stats.empty) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL, + stats.alloc_stats.refill) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE, + stats.alloc_stats.waive) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED, + stats.recycle_stats.cached) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL, + stats.recycle_stats.cache_full) || + nla_put_uint(rsp, 
NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING, + stats.recycle_stats.ring) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL, + stats.recycle_stats.ring_full) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT, + stats.recycle_stats.released_refcnt)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + + return 0; +err_cancel_nest: + nla_nest_cancel(rsp, nest); +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +#else + GENL_SET_ERR_MSG(info, "kernel built without CONFIG_PAGE_POOL_STATS"); + return -EOPNOTSUPP; +#endif +} + +int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *tb[ARRAY_SIZE(netdev_page_pool_info_nl_policy)]; + struct nlattr *nest; + int err; + u32 id; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_STATS_INFO)) + return -EINVAL; + + nest = info->attrs[NETDEV_A_PAGE_POOL_STATS_INFO]; + err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, + netdev_page_pool_info_nl_policy, + info->extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, NETDEV_A_PAGE_POOL_ID)) + return -EINVAL; + if (tb[NETDEV_A_PAGE_POOL_IFINDEX]) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NETDEV_A_PAGE_POOL_IFINDEX], + "selecting by ifindex not supported"); + return -EINVAL; + } + + id = nla_get_uint(tb[NETDEV_A_PAGE_POOL_ID]); + + return netdev_nl_page_pool_get_do(info, id, page_pool_nl_stats_fill); +} + +int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill); +} + +static int +page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info) +{ + size_t inflight, refsz; + unsigned int napi_id; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id)) + goto err_cancel; + + if (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && + nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, + pool->slow.netdev->ifindex)) + goto err_cancel; + + napi_id = pool->p.napi ? 
READ_ONCE(pool->p.napi->napi_id) : 0; + if (napi_id_valid(napi_id) && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id)) + goto err_cancel; + + inflight = page_pool_inflight(pool, false); + refsz = PAGE_SIZE << pool->p.order; + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT, inflight) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, + inflight * refsz)) + goto err_cancel; + if (pool->user.detach_time && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME, + pool->user.detach_time)) + goto err_cancel; + + if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) + goto err_cancel; + + genlmsg_end(rsp, hdr); + + return 0; +err_cancel: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + struct net *net; + + lockdep_assert_held(&page_pools_lock); + + /* 'invisible' page pools don't matter */ + if (hlist_unhashed(&pool->user.list)) + return; + net = dev_net(pool->slow.netdev); + + if (!genl_has_listeners(&netdev_nl_family, net, NETDEV_NLGRP_PAGE_POOL)) + return; + + genl_info_init_ntf(&info, &netdev_nl_family, cmd); + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + if (page_pool_nl_fill(ntf, pool, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&netdev_nl_family, net, ntf, + 0, NETDEV_NLGRP_PAGE_POOL, GFP_KERNEL); +} + +int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + u32 id; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_ID)) + return -EINVAL; + + id = nla_get_uint(info->attrs[NETDEV_A_PAGE_POOL_ID]); + + return netdev_nl_page_pool_get_do(info, id, page_pool_nl_fill); +} + +int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_fill); +} + +int page_pool_list(struct page_pool *pool) +{ + static u32 id_alloc_next; + int err; + + mutex_lock(&page_pools_lock); + err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b, + &id_alloc_next, GFP_KERNEL); + if (err < 0) + goto err_unlock; + + INIT_HLIST_NODE(&pool->user.list); + if (pool->slow.netdev) { + hlist_add_head(&pool->user.list, + &pool->slow.netdev->page_pools); + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); + } + + mutex_unlock(&page_pools_lock); + return 0; + +err_unlock: + mutex_unlock(&page_pools_lock); + return err; +} + +void page_pool_detached(struct page_pool *pool) +{ + mutex_lock(&page_pools_lock); + pool->user.detach_time = ktime_get_boottime_seconds(); + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); + mutex_unlock(&page_pools_lock); +} + +void page_pool_unlist(struct page_pool *pool) +{ + mutex_lock(&page_pools_lock); + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF); + xa_erase(&page_pools, pool->user.id); + if (!hlist_unhashed(&pool->user.list)) + hlist_del(&pool->user.list); + mutex_unlock(&page_pools_lock); +} + +int page_pool_check_memory_provider(struct net_device *dev, + struct netdev_rx_queue *rxq) +{ + void *binding = rxq->mp_params.mp_priv; + struct page_pool *pool; + struct hlist_node *n; + + if (!binding) + return 0; + + mutex_lock(&page_pools_lock); + hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) { + if (pool->mp_priv != binding) + continue; + + if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) { + mutex_unlock(&page_pools_lock); + return 0; + } + } + mutex_unlock(&page_pools_lock); 
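+
+	/* The binding exists, but no page pool on this rx queue uses it. */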
+ return -ENODATA; +} + +static void page_pool_unreg_netdev_wipe(struct net_device *netdev) +{ + struct page_pool *pool; + struct hlist_node *n; + + mutex_lock(&page_pools_lock); + hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) { + hlist_del_init(&pool->user.list); + pool->slow.netdev = NET_PTR_POISON; + } + mutex_unlock(&page_pools_lock); +} + +static void page_pool_unreg_netdev(struct net_device *netdev) +{ + struct page_pool *pool, *last; + struct net_device *lo; + + lo = dev_net(netdev)->loopback_dev; + + mutex_lock(&page_pools_lock); + last = NULL; + hlist_for_each_entry(pool, &netdev->page_pools, user.list) { + pool->slow.netdev = lo; + netdev_nl_page_pool_event(pool, + NETDEV_CMD_PAGE_POOL_CHANGE_NTF); + last = pool; + } + if (last) + hlist_splice_init(&netdev->page_pools, &last->user.list, + &lo->page_pools); + mutex_unlock(&page_pools_lock); +} + +static int +page_pool_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + + if (hlist_empty(&netdev->page_pools)) + return NOTIFY_OK; + + if (netdev->ifindex != LOOPBACK_IFINDEX) + page_pool_unreg_netdev(netdev); + else + page_pool_unreg_netdev_wipe(netdev); + return NOTIFY_OK; +} + +static struct notifier_block page_pool_netdevice_nb = { + .notifier_call = page_pool_netdevice_event, +}; + +static int __init page_pool_user_init(void) +{ + return register_netdevice_notifier(&page_pool_netdevice_nb); +} + +subsys_initcall(page_pool_user_init); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 760238196db1..d41b03fd1f63 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -69,7 +69,7 @@ * * By design there should only be *one* "controlling" process. In practice * multiple write accesses gives unpredictable result. Understood by "write" - * to /proc gives result code thats should be read be the "writer". + * to /proc gives result code that should be read be the "writer". * For practical use this should be no problem. 
* * Note when adding devices to a specific CPU there good idea to also assign @@ -114,6 +114,7 @@ #include <linux/sys.h> #include <linux/types.h> +#include <linux/minmax.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> @@ -158,9 +159,7 @@ #include <net/udp.h> #include <net/ip6_checksum.h> #include <net/addrconf.h> -#ifdef CONFIG_XFRM #include <net/xfrm.h> -#endif #include <net/netns/generic.h> #include <asm/byteorder.h> #include <linux/rcupdate.h> @@ -179,7 +178,7 @@ #define MAX_IMIX_ENTRIES 20 #define IMIX_PRECISION 100 /* Precision of IMIX distribution */ -#define func_enter() pr_debug("entering %s\n", __func__); +#define func_enter() pr_debug("entering %s\n", __func__) #define PKT_FLAGS \ pf(IPV6) /* Interface in IPV6 Mode */ \ @@ -200,6 +199,7 @@ pf(VID_RND) /* Random VLAN ID */ \ pf(SVID_RND) /* Random SVLAN ID */ \ pf(NODE) /* Node memory alloc*/ \ + pf(SHARED) /* Shared SKB */ \ #define pf(flag) flag##_SHIFT, enum pkt_flags { @@ -228,12 +228,12 @@ static char *pkt_flag_names[] = { /* Xmit modes */ #define M_START_XMIT 0 /* Default normal TX */ -#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ +#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ #define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ /* If lock -- protects updating of if_list */ -#define if_lock(t) mutex_lock(&(t->if_lock)); -#define if_unlock(t) mutex_unlock(&(t->if_lock)); +#define if_lock(t) mutex_lock(&(t->if_lock)) +#define if_unlock(t) mutex_unlock(&(t->if_lock)) /* Used to help with determining the pkts on receive */ #define PKTGEN_MAGIC 0xbe9be955 @@ -284,7 +284,8 @@ struct pktgen_dev { int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ + * removal by worker thread + */ struct page *page; u64 delay; /* nano-seconds */ @@ -347,10 +348,12 @@ struct pktgen_dev { __u16 udp_dst_max; /* exclusive, dest UDP port */ /* DSCP + ECN */ - __u8 tos; /* six MSB of (former) IPv4 TOS - are for dscp codepoint */ - __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 - (see RFC 3260, sec. 4) */ + __u8 tos; /* six MSB of (former) IPv4 TOS + * are for dscp codepoint + */ + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 + * (see RFC 3260, sec. 4) + */ /* IMIX */ unsigned int n_imix_entries; @@ -390,12 +393,12 @@ struct pktgen_dev { __u8 hh[14]; /* = { - 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, - - We fill in SRC address later - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x08, 0x00 - }; + * 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + * + * We fill in SRC address later + * 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + * 0x08, 0x00 + * }; */ __u16 pad; /* pad out the hh struct to an even 16 bytes */ @@ -459,7 +462,8 @@ struct pktgen_thread { char result[512]; /* Field for thread to receive "posted" events terminate, - stop ifs etc. */ + * stop ifs etc. + */ u32 control; int cpu; @@ -473,8 +477,7 @@ struct pktgen_thread { #define FIND 0 static const char version[] = - "Packet Generator for packet performance testing. " - "Version: " VERSION "\n"; + "Packet Generator for packet performance testing. 
Version: " VERSION "\n"; static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); @@ -516,21 +519,23 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char data[128]; + size_t max; struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id); if (!capable(CAP_NET_ADMIN)) return -EPERM; - if (count == 0) + if (count < 1) return -EINVAL; - if (count > sizeof(data)) - count = sizeof(data); - - if (copy_from_user(data, buf, count)) + max = min(count, sizeof(data) - 1); + if (copy_from_user(data, buf, max)) return -EFAULT; - data[count - 1] = 0; /* Strip trailing '\n' and terminate string */ + if (data[max - 1] == '\n') + data[max - 1] = 0; /* strip trailing '\n', terminate string */ + else + data[max] = 0; /* terminate string */ if (!strcmp(data, "stop")) pktgen_stop_all_threads(pn); @@ -623,8 +628,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_printf(seq, "%pM\n", pkt_dev->dst_mac); seq_printf(seq, - " udp_src_min: %d udp_src_max: %d" - " udp_dst_min: %d udp_dst_max: %d\n", + " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, pkt_dev->udp_dst_max); @@ -669,19 +673,19 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_puts(seq, " Flags: "); for (i = 0; i < NR_PKT_FLAGS; i++) { - if (i == F_FLOW_SEQ) + if (i == FLOW_SEQ_SHIFT) if (!pkt_dev->cflows) continue; - if (pkt_dev->flags & (1 << i)) + if (pkt_dev->flags & (1 << i)) { seq_printf(seq, "%s ", pkt_flag_names[i]); - else if (i == F_FLOW_SEQ) - seq_puts(seq, "FLOW_RND "); - #ifdef CONFIG_XFRM - if (i == F_IPSEC && pkt_dev->spi) - seq_printf(seq, "spi:%u", pkt_dev->spi); + if (i == IPSEC_SHIFT && pkt_dev->spi) + seq_printf(seq, "spi:%u ", pkt_dev->spi); #endif + } else if (i == FLOW_SEQ_SHIFT) { + seq_puts(seq, "FLOW_RND "); + } } seq_puts(seq, "\n"); @@ -743,34 +747,37 @@ static int pktgen_if_show(struct seq_file *seq, void *v) } -static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, - __u32 *num) +static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen, + __u32 *num) { - int i = 0; + size_t i = 0; + *num = 0; for (; i < maxlen; i++) { int value; char c; - *num <<= 4; + if (get_user(c, &user_buffer[i])) return -EFAULT; value = hex_to_bin(c); - if (value >= 0) + if (value >= 0) { + *num <<= 4; *num |= value; - else + } else { break; + } } return i; } -static int count_trail_chars(const char __user * user_buffer, - unsigned int maxlen) +static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen) { - int i; + size_t i; for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -789,14 +796,15 @@ done: return i; } -static long num_arg(const char __user *user_buffer, unsigned long maxlen, - unsigned long *num) +static ssize_t num_arg(const char __user *user_buffer, size_t maxlen, + unsigned long *num) { - int i; + size_t i; *num = 0; for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; if ((c >= '0') && (c <= '9')) { @@ -808,12 +816,13 @@ static long num_arg(const char __user *user_buffer, unsigned long maxlen, return i; } -static int strn_len(const char __user * user_buffer, unsigned int maxlen) +static ssize_t strn_len(const char __user *user_buffer, size_t maxlen) { - int i; + size_t i; for (i = 0; i < maxlen; i++) { char c; + if 
(get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -822,6 +831,7 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) case '\r': case '\t': case ' ': + case '=': goto done_str; default: break; @@ -837,11 +847,11 @@ done_str: * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example. */ static ssize_t get_imix_entries(const char __user *buffer, + size_t maxlen, struct pktgen_dev *pkt_dev) { - const int max_digits = 10; - int i = 0; - long len; + size_t i = 0, max; + ssize_t len; char c; pkt_dev->n_imix_entries = 0; @@ -850,21 +860,33 @@ static ssize_t get_imix_entries(const char __user *buffer, unsigned long weight; unsigned long size; - len = num_arg(&buffer[i], max_digits, &size); + if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES) + return -E2BIG; + + if (i >= maxlen) + return -EINVAL; + + max = min(10, maxlen - i); + len = num_arg(&buffer[i], max, &size); if (len < 0) return len; i += len; + if (i >= maxlen) + return -EINVAL; if (get_user(c, &buffer[i])) return -EFAULT; /* Check for comma between size_i and weight_i */ if (c != ',') return -EINVAL; i++; + if (i >= maxlen) + return -EINVAL; if (size < 14 + 20 + 8) size = 14 + 20 + 8; - len = num_arg(&buffer[i], max_digits, &weight); + max = min(10, maxlen - i); + len = num_arg(&buffer[i], max, &weight); if (len < 0) return len; if (weight <= 0) @@ -874,42 +896,55 @@ static ssize_t get_imix_entries(const char __user *buffer, pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight; i += len; + pkt_dev->n_imix_entries++; + + if (i >= maxlen) + break; if (get_user(c, &buffer[i])) return -EFAULT; - i++; - pkt_dev->n_imix_entries++; - - if (pkt_dev->n_imix_entries > MAX_IMIX_ENTRIES) - return -E2BIG; } while (c == ' '); return i; } -static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) +static ssize_t get_labels(const char __user *buffer, + size_t maxlen, struct pktgen_dev *pkt_dev) { unsigned int n = 0; + size_t i = 0, max; + ssize_t len; char c; - ssize_t i = 0; - int len; pkt_dev->nr_labels = 0; do { __u32 tmp; - len = hex32_arg(&buffer[i], 8, &tmp); - if (len <= 0) + + if (n >= MAX_MPLS_LABELS) + return -E2BIG; + + if (i >= maxlen) + return -EINVAL; + + max = min(8, maxlen - i); + len = hex32_arg(&buffer[i], max, &tmp); + if (len < 0) return len; + + /* return empty list in case of invalid input or zero value */ + if (len == 0 || tmp == 0) + return maxlen; + pkt_dev->labels[n] = htonl(tmp); if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM) pkt_dev->flags |= F_MPLS_RND; i += len; + n++; + if (i >= maxlen) + break; if (get_user(c, &buffer[i])) return -EFAULT; i++; - n++; - if (n >= MAX_MPLS_LABELS) - return -E2BIG; } while (c == ','); pkt_dev->nr_labels = n; @@ -946,16 +981,16 @@ static __u32 pktgen_read_flag(const char *f, bool *disable) } static ssize_t pktgen_if_write(struct file *file, - const char __user * user_buffer, size_t count, - loff_t * offset) + const char __user *user_buffer, size_t count, + loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_dev *pkt_dev = seq->private; - int i, max, len; + size_t i, max; + ssize_t len; char name[16], valstr[32]; unsigned long value = 0; char *pg_result = NULL; - int tmp = 0; char buf[128]; pg_result = &(pkt_dev->result[0]); @@ -966,16 +1001,16 @@ static ssize_t pktgen_if_write(struct file *file, } max = count; - tmp = count_trail_chars(user_buffer, max); - if (tmp < 0) { + len = count_trail_chars(user_buffer, max); + if (len < 0) { pr_warn("illegal format\n"); - return tmp; + return len; } - i = tmp; 
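All of the parsing helpers above converge on one contract: take a size_t maxlen that the caller has already clamped to the bytes remaining in the user buffer, return the ssize_t number of bytes consumed (or a negative errno), and never scan past maxlen. A minimal userspace sketch of that contract, with plain array reads standing in for get_user() (the MIN macro and all names here are illustrative, not kernel code):

#include <stddef.h>
#include <stdio.h>
#include <sys/types.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Parse at most maxlen decimal digits; return bytes consumed. */
static ssize_t num_arg(const char *buf, size_t maxlen, unsigned long *num)
{
	size_t i;

	*num = 0;
	for (i = 0; i < maxlen; i++) {
		char c = buf[i];

		if (c < '0' || c > '9')
			break;
		*num = *num * 10 + (c - '0');
	}
	return i;
}

int main(void)
{
	const char input[] = "1500 extra";	/* stands in for the user buffer */
	size_t count = sizeof(input) - 1, i = 0;
	size_t max = MIN((size_t)10, count - i);	/* mirror the call sites */
	unsigned long value;
	ssize_t len = num_arg(&input[i], max, &value);

	printf("consumed %zd bytes, value %lu\n", len, value);
	return 0;
}

The same clamp-then-parse shape repeats at every pktgen_if_write() call site below as max = min(<width>, count - i).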
+ i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1003,11 +1038,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "min_pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->min_pkt_size) { @@ -1020,11 +1055,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "max_pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->max_pkt_size) { @@ -1039,11 +1074,11 @@ static ssize_t pktgen_if_write(struct file *file, /* Shortcut for min = max */ if (!strcmp(name, "pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->min_pkt_size) { @@ -1059,43 +1094,43 @@ static ssize_t pktgen_if_write(struct file *file, if (pkt_dev->clone_skb > 0) return -EINVAL; - len = get_imix_entries(&user_buffer[i], pkt_dev); + max = count - i; + len = get_imix_entries(&user_buffer[i], max, pkt_dev); if (len < 0) return len; fill_imix_distribution(pkt_dev); - i += len; return count; } if (!strcmp(name, "debug")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; debug = value; sprintf(pg_result, "OK: debug=%u", debug); return count; } if (!strcmp(name, "frags")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->nfrags = value; sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags); return count; } if (!strcmp(name, "delay")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value == 0x7FFFFFFF) pkt_dev->delay = ULLONG_MAX; else @@ -1106,13 +1141,13 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "rate")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (!value) - return len; + return -EINVAL; pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value; if (debug) pr_info("Delay set at: %llu ns\n", pkt_dev->delay); @@ -1121,13 +1156,13 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "ratep")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (!value) - return len; + return -EINVAL; pkt_dev->delay = NSEC_PER_SEC/value; if (debug) pr_info("Delay set at: %llu ns\n", pkt_dev->delay); @@ -1136,11 +1171,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_src_min")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i 
+= len; if (value != pkt_dev->udp_src_min) { pkt_dev->udp_src_min = value; pkt_dev->cur_udp_src = value; @@ -1149,11 +1184,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_dst_min")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_dst_min) { pkt_dev->udp_dst_min = value; pkt_dev->cur_udp_dst = value; @@ -1162,11 +1197,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_src_max")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_src_max) { pkt_dev->udp_src_max = value; pkt_dev->cur_udp_src = value; @@ -1175,11 +1210,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_dst_max")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_dst_max) { pkt_dev->udp_dst_max = value; pkt_dev->cur_udp_dst = value; @@ -1188,7 +1223,8 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "clone_skb")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; /* clone_skb is not supported for netif_receive xmit_mode and @@ -1197,33 +1233,33 @@ static ssize_t pktgen_if_write(struct file *file, if ((value > 0) && ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) - return -ENOTSUPP; - if (value > 0 && pkt_dev->n_imix_entries > 0) + return -EOPNOTSUPP; + if (value > 0 && (pkt_dev->n_imix_entries > 0 || + !(pkt_dev->flags & F_SHARED))) return -EINVAL; - i += len; pkt_dev->clone_skb = value; sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb); return count; } if (!strcmp(name, "count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->count = value; sprintf(pg_result, "OK: count=%llu", (unsigned long long)pkt_dev->count); return count; } if (!strcmp(name, "src_mac_count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (pkt_dev->src_mac_count != value) { pkt_dev->src_mac_count = value; pkt_dev->cur_src_mac_offset = 0; @@ -1233,11 +1269,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "dst_mac_count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (pkt_dev->dst_mac_count != value) { pkt_dev->dst_mac_count = value; pkt_dev->cur_dst_mac_offset = 0; @@ -1247,27 +1283,30 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "burst")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value > 1) && ((pkt_dev->xmit_mode == M_QUEUE_XMIT) || ((pkt_dev->xmit_mode == M_START_XMIT) && (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))))) - return -ENOTSUPP; + return -EOPNOTSUPP; + + if (value 
> 1 && !(pkt_dev->flags & F_SHARED)) + return -EINVAL; + pkt_dev->burst = value < 1 ? 1 : value; sprintf(pg_result, "OK: burst=%u", pkt_dev->burst); return count; } if (!strcmp(name, "node")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; - if (node_possible(value)) { pkt_dev->node = value; sprintf(pg_result, "OK: node=%d", pkt_dev->node); @@ -1275,29 +1314,29 @@ static ssize_t pktgen_if_write(struct file *file, put_page(pkt_dev->page); pkt_dev->page = NULL; } - } - else + } else { sprintf(pg_result, "ERROR: node not possible"); + } return count; } if (!strcmp(name, "xmit_mode")) { char f[32]; - memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; + memset(f, 0, sizeof(f)); if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; if (strcmp(f, "start_xmit") == 0) { pkt_dev->xmit_mode = M_START_XMIT; } else if (strcmp(f, "netif_receive") == 0) { /* clone_skb set earlier, not supported in this mode */ if (pkt_dev->clone_skb > 0) - return -ENOTSUPP; + return -EOPNOTSUPP; pkt_dev->xmit_mode = M_NETIF_RECEIVE; @@ -1318,46 +1357,62 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "flag")) { + bool disable = false; __u32 flag; char f[32]; - bool disable = false; + char *end; - memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; + memset(f, 0, 32); if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; flag = pktgen_read_flag(f, &disable); - if (flag) { - if (disable) + if (disable) { + /* If "clone_skb", or "burst" parameters are + * configured, it means that the skb still + * needs to be referenced by the pktgen, so + * the skb must be shared. + */ + if (flag == F_SHARED && (pkt_dev->clone_skb || + pkt_dev->burst > 1)) + return -EINVAL; pkt_dev->flags &= ~flag; - else + } else { pkt_dev->flags |= flag; - } else { - sprintf(pg_result, - "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", - f, - "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " - "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, " - "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, " - "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, " - "NO_TIMESTAMP, " -#ifdef CONFIG_XFRM - "IPSEC, " -#endif - "NODE_ALLOC\n"); + } + + sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); return count; } - sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); + + /* Unknown flag */ + end = pkt_dev->result + sizeof(pkt_dev->result); + pg_result += sprintf(pg_result, + "Flag -:%s:- unknown\n" + "Available flags, (prepend ! 
to un-set flag):\n", f); + + for (int n = 0; n < NR_PKT_FLAGS && pg_result < end; n++) { + if (!IS_ENABLED(CONFIG_XFRM) && n == IPSEC_SHIFT) + continue; + pg_result += snprintf(pg_result, end - pg_result, + "%s, ", pkt_flag_names[n]); + } + if (!WARN_ON_ONCE(pg_result >= end)) { + /* Remove the comma and whitespace at the end */ + *(pg_result - 2) = '\0'; + } + return count; } if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1); + max = min(sizeof(pkt_dev->dst_min) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1365,19 +1420,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_min) != 0) { - memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); - strcpy(pkt_dev->dst_min, buf); + strscpy_pad(pkt_dev->dst_min, buf); pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); pkt_dev->cur_daddr = pkt_dev->daddr_min; } if (debug) pr_debug("dst_min set to: %s\n", pkt_dev->dst_min); - i += len; + sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min); return count; } if (!strcmp(name, "dst_max")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1); + max = min(sizeof(pkt_dev->dst_max) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1385,19 +1440,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_max) != 0) { - memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); - strcpy(pkt_dev->dst_max, buf); + strscpy_pad(pkt_dev->dst_max, buf); pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); pkt_dev->cur_daddr = pkt_dev->daddr_max; } if (debug) pr_debug("dst_max set to: %s\n", pkt_dev->dst_max); - i += len; + sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max); return count; } if (!strcmp(name, "dst6")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1415,12 +1470,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6 set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6=%s", buf); return count; } if (!strcmp(name, "dst6_min")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1437,12 +1492,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6_min set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6_min=%s", buf); return count; } if (!strcmp(name, "dst6_max")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1458,12 +1513,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6_max set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6_max=%s", buf); return count; } if (!strcmp(name, "src6")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1481,12 +1536,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("src6 set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: src6=%s", buf); return count; } if (!strcmp(name, "src_min")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1); + max = min(sizeof(pkt_dev->src_min) - 1, count - i); + len = 
strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1494,19 +1549,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_min) != 0) { - memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); - strcpy(pkt_dev->src_min, buf); + strscpy_pad(pkt_dev->src_min, buf); pkt_dev->saddr_min = in_aton(pkt_dev->src_min); pkt_dev->cur_saddr = pkt_dev->saddr_min; } if (debug) pr_debug("src_min set to: %s\n", pkt_dev->src_min); - i += len; + sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min); return count; } if (!strcmp(name, "src_max")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1); + max = min(sizeof(pkt_dev->src_max) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1514,19 +1569,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_max) != 0) { - memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); - strcpy(pkt_dev->src_max, buf); + strscpy_pad(pkt_dev->src_max, buf); pkt_dev->saddr_max = in_aton(pkt_dev->src_max); pkt_dev->cur_saddr = pkt_dev->saddr_max; } if (debug) pr_debug("src_max set to: %s\n", pkt_dev->src_max); - i += len; + sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max); return count; } if (!strcmp(name, "dst_mac")) { - len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + max = min(sizeof(valstr) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1543,7 +1598,8 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "src_mac")) { - len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + max = min(sizeof(valstr) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1567,11 +1623,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "flows")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value > MAX_CFLOWS) value = MAX_CFLOWS; @@ -1581,44 +1637,44 @@ static ssize_t pktgen_if_write(struct file *file, } #ifdef CONFIG_XFRM if (!strcmp(name, "spi")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->spi = value; sprintf(pg_result, "OK: spi=%u", pkt_dev->spi); return count; } #endif if (!strcmp(name, "flowlen")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->lflow = value; sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow); return count; } if (!strcmp(name, "queue_map_min")) { - len = num_arg(&user_buffer[i], 5, &value); + max = min(5, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->queue_map_min = value; sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min); return count; } if (!strcmp(name, "queue_map_max")) { - len = num_arg(&user_buffer[i], 5, &value); + max = min(5, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->queue_map_max = value; sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max); return count; @@ -1627,10 +1683,11 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "mpls")) { unsigned int n, cnt; - len = get_labels(&user_buffer[i], pkt_dev); + max = count - i; 
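The string setters in this region drop the memset() + strcpy() pair for strscpy_pad(), whose two-argument kernel form sizes the destination via sizeof on the array argument. A rough userspace approximation of the contract it provides, copy with truncation, guaranteed NUL termination, zeroed tail (the name and the -1 truncation code are stand-ins; the kernel helper returns -E2BIG):

#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Copy with truncation, always NUL-terminate, zero the tail. */
static ssize_t strscpy_pad_sketch(char *dst, const char *src, size_t size)
{
	size_t len;

	if (size == 0)
		return -1;			/* stand-in for -E2BIG */
	len = strnlen(src, size - 1);
	memcpy(dst, src, len);
	memset(dst + len, 0, size - len);	/* terminator plus padding */
	return src[len] ? -1 : (ssize_t)len;
}

int main(void)
{
	char dst_min[16];

	strscpy_pad_sketch(dst_min, "10.0.0.1", sizeof(dst_min));
	printf("%s\n", dst_min);
	return 0;
}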
+ len = get_labels(&user_buffer[i], max, pkt_dev); if (len < 0) return len; - i += len; + cnt = sprintf(pg_result, "OK: mpls="); for (n = 0; n < pkt_dev->nr_labels; n++) cnt += sprintf(pg_result + cnt, @@ -1648,11 +1705,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_id")) { - len = num_arg(&user_buffer[i], 4, &value); + max = min(4, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value <= 4095) { pkt_dev->vlan_id = value; /* turn on VLAN */ @@ -1675,11 +1732,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_p")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) { pkt_dev->vlan_p = value; sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p); @@ -1690,11 +1747,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_cfi")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) { pkt_dev->vlan_cfi = value; sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi); @@ -1705,11 +1762,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_id")) { - len = num_arg(&user_buffer[i], 4, &value); + max = min(4, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) { pkt_dev->svlan_id = value; /* turn on SVLAN */ @@ -1732,11 +1789,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_p")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) { pkt_dev->svlan_p = value; sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p); @@ -1747,11 +1804,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_cfi")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) { pkt_dev->svlan_cfi = value; sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi); @@ -1762,12 +1819,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "tos")) { - __u32 tmp_value = 0; - len = hex32_arg(&user_buffer[i], 2, &tmp_value); + __u32 tmp_value; + + max = min(2, count - i); + len = hex32_arg(&user_buffer[i], max, &tmp_value); if (len < 0) return len; - i += len; if (len == 2) { pkt_dev->tos = tmp_value; sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos); @@ -1778,12 +1836,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "traffic_class")) { - __u32 tmp_value = 0; - len = hex32_arg(&user_buffer[i], 2, &tmp_value); + __u32 tmp_value; + + max = min(2, count - i); + len = hex32_arg(&user_buffer[i], max, &tmp_value); if (len < 0) return len; - i += len; if (len == 2) { pkt_dev->traffic_class = tmp_value; sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class); @@ -1794,11 +1853,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "skb_priority")) { - len = num_arg(&user_buffer[i], 9, &value); + max = min(9, count - i); + len 
= num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->skb_priority = value; sprintf(pg_result, "OK: skb_priority=%i", pkt_dev->skb_priority); @@ -1853,12 +1912,13 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) } static ssize_t pktgen_thread_write(struct file *file, - const char __user * user_buffer, - size_t count, loff_t * offset) + const char __user *user_buffer, + size_t count, loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_thread *t = seq->private; - int i, max, len, ret; + size_t i, max; + ssize_t len, ret; char name[40]; char *pg_result; @@ -1875,8 +1935,8 @@ static ssize_t pktgen_thread_write(struct file *file, i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1905,15 +1965,17 @@ static ssize_t pktgen_thread_write(struct file *file, if (!strcmp(name, "add_device")) { char f[32]; + memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) { ret = len; goto out; } if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; + mutex_lock(&pktgen_thread_lock); ret = pktgen_add_device(t, f); mutex_unlock(&pktgen_thread_lock); @@ -2264,7 +2326,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) s64 remaining; struct hrtimer_sleeper t; - hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_set_expires(&t.timer, spin_until); remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); @@ -2337,24 +2399,25 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) } -#ifdef CONFIG_XFRM /* If there was already an IPSEC SA, we keep it as is, else * we go look for it ... -*/ + */ #define DUMMY_MARK 0 static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) { +#ifdef CONFIG_XFRM struct xfrm_state *x = pkt_dev->flows[flow].x; struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); + if (!x) { if (pkt_dev->spi) { /* We need as quick as possible to find the right SA - * Searching with minimum criteria to archieve this. + * Searching with minimum criteria to achieve this. 
*/ x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); } else { - /* slow path: we dont already have xfrm_state */ + /* slow path: we don't already have xfrm_state */ x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0, (xfrm_address_t *)&pkt_dev->cur_daddr, (xfrm_address_t *)&pkt_dev->cur_saddr, @@ -2369,16 +2432,16 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) } } -} #endif +} static void set_cur_queue_map(struct pktgen_dev *pkt_dev) { - if (pkt_dev->flags & F_QUEUE_MAP_CPU) pkt_dev->cur_queue_map = smp_processor_id(); else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { t = get_random_u32_inclusive(pkt_dev->queue_map_min, pkt_dev->queue_map_max); @@ -2460,6 +2523,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_MPLS_RND) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | @@ -2504,6 +2568,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) imx = ntohl(pkt_dev->saddr_max); if (imn < imx) { __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) t = get_random_u32_inclusive(imn, imx - 1); else { @@ -2524,6 +2589,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; __be32 s; + if (pkt_dev->flags & F_IPDST_RND) { do { @@ -2548,10 +2614,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->flows[flow].flags |= F_INIT; pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr; -#ifdef CONFIG_XFRM if (pkt_dev->flags & F_IPSEC) get_ipsec_sa(pkt_dev, flow); -#endif pkt_dev->nflows++; } } @@ -2573,6 +2637,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { t = get_random_u32_inclusive(pkt_dev->min_pkt_size, pkt_dev->max_pkt_size - 1); @@ -2639,7 +2704,8 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) if (!x) return 0; /* XXX: we dont support tunnel mode for now until - * we resolve the dst issue */ + * we resolve the dst issue + */ if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0)) return 0; @@ -2674,8 +2740,10 @@ static void free_SAs(struct pktgen_dev *pkt_dev) if (pkt_dev->cflows) { /* let go of the SAs if we have them */ int i; + for (i = 0; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; + if (x) { xfrm_state_put(x); pkt_dev->flows[i].x = NULL; @@ -2690,6 +2758,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, if (pkt_dev->flags & F_IPSEC) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; + if (x) { struct ethhdr *eth; struct iphdr *iph; @@ -2733,6 +2802,7 @@ err: static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; @@ -2772,8 +2842,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, } i = 0; - frag_len = (datalen/frags) < PAGE_SIZE ? 
- (datalen/frags) : PAGE_SIZE; + frag_len = min_t(int, datalen / frags, PAGE_SIZE); while (datalen > 0) { if (unlikely(!pkt_dev->page)) { int node = numa_node_id(); @@ -2785,14 +2854,16 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, break; } get_page(pkt_dev->page); - skb_frag_set_page(skb, i, pkt_dev->page); - skb_frag_off_set(&skb_shinfo(skb)->frags[i], 0); + /*last fragment, fill rest of data*/ if (i == (frags - 1)) - skb_frag_size_set(&skb_shinfo(skb)->frags[i], - (datalen < PAGE_SIZE ? datalen : PAGE_SIZE)); + skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], + pkt_dev->page, 0, + min(datalen, PAGE_SIZE)); else - skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len); + skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], + pkt_dev->page, 0, frag_len); + datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]); skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]); skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]); @@ -2842,7 +2913,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, skb->dev = dev; } } else { - skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); + skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } /* the caller pre-fetches from skb->data and reserves for the mac hdr */ @@ -2923,7 +2994,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->priority = pkt_dev->skb_priority; memcpy(eth, pkt_dev->hh, 12); - *(__be16 *) & eth[12] = protocol; + *(__be16 *)&eth[12] = protocol; /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - @@ -3152,11 +3223,11 @@ static void pktgen_run(struct pktgen_thread *t) set_pkt_overhead(pkt_dev); - strcpy(pkt_dev->result, "Starting"); + strscpy(pkt_dev->result, "Starting"); pkt_dev->running = 1; /* Cranke yeself! */ started++; } else - strcpy(pkt_dev->result, "Error starting"); + strscpy(pkt_dev->result, "Error starting"); } rcu_read_unlock(); if (started) @@ -3415,6 +3486,7 @@ static void pktgen_rem_thread(struct pktgen_thread *t) static void pktgen_resched(struct pktgen_dev *pkt_dev) { ktime_t idle_start = ktime_get(); + schedule(); pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start)); } @@ -3437,12 +3509,24 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev *pkt_dev) { - unsigned int burst = READ_ONCE(pkt_dev->burst); + bool skb_shared = !!(READ_ONCE(pkt_dev->flags) & F_SHARED); struct net_device *odev = pkt_dev->odev; struct netdev_queue *txq; + unsigned int burst = 1; struct sk_buff *skb; + int clone_skb = 0; int ret; + /* If 'skb_shared' is false, the read of possible + * new values (if any) for 'burst' and 'clone_skb' will be skipped to + * prevent some concurrent changes from slipping in. And the stabilized + * config will be read in during the next run of pktgen_xmit. 
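The comment this hunk adds describes an ownership split, which a toy userspace model may make concrete (the skb is reduced to a refcounted struct, transmission always succeeds, all names are illustrative): when the skb is shared, references are taken up front for the whole burst and the same skb is re-sent; when it is not, the stack takes ownership on the first accepted transmit and the generator must forget its pointer.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_skb {
	int users;	/* refcount_t stand-in */
};

/* The "stack" consumes one reference per accepted packet. */
static void fake_xmit(struct fake_skb *skb)
{
	if (--skb->users == 0)
		free(skb);
}

static void xmit_burst(struct fake_skb **slot, bool shared, int burst)
{
	struct fake_skb *skb = *slot;

	if (shared) {
		skb->users += burst;	/* refcount_add(burst, &skb->users) */
		while (burst--)
			fake_xmit(skb);	/* same skb re-sent, still owned */
	} else {
		fake_xmit(skb);		/* ownership passes to the stack */
		*slot = NULL;		/* pkt_dev->skb = NULL: rebuild next time */
	}
}

int main(void)
{
	struct fake_skb *skb = calloc(1, sizeof(*skb));

	skb->users = 1;
	xmit_burst(&skb, true, 4);
	printf("shared skb survives the burst: users=%d\n", skb->users);
	free(skb);
	return 0;
}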
+ */ + if (skb_shared) { + burst = READ_ONCE(pkt_dev->burst); + clone_skb = READ_ONCE(pkt_dev->clone_skb); + } + /* If device is offline, then don't send */ if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) { pktgen_stop_device(pkt_dev); @@ -3459,7 +3543,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* If no skb or clone count exhausted then get new one */ if (!pkt_dev->skb || (pkt_dev->last_ok && - ++pkt_dev->clone_count >= pkt_dev->clone_skb)) { + ++pkt_dev->clone_count >= clone_skb)) { /* build a new pkt */ kfree_skb(pkt_dev->skb); @@ -3480,7 +3564,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) { skb = pkt_dev->skb; skb->protocol = eth_type_trans(skb, skb->dev); - refcount_add(burst, &skb->users); + if (skb_shared) + refcount_add(burst, &skb->users); local_bh_disable(); do { ret = netif_receive_skb(skb); @@ -3488,6 +3573,10 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->errors++; pkt_dev->sofar++; pkt_dev->seq_num++; + if (unlikely(!skb_shared)) { + pkt_dev->skb = NULL; + break; + } if (refcount_read(&skb->users) != burst) { /* skb was queued by rps/rfs or taps, * so cannot reuse this skb @@ -3506,9 +3595,14 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto out; /* Skips xmit_mode M_START_XMIT */ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { local_bh_disable(); - refcount_inc(&pkt_dev->skb->users); + if (skb_shared) + refcount_inc(&pkt_dev->skb->users); ret = dev_queue_xmit(pkt_dev->skb); + + if (!skb_shared && dev_xmit_complete(ret)) + pkt_dev->skb = NULL; + switch (ret) { case NET_XMIT_SUCCESS: pkt_dev->sofar++; @@ -3546,11 +3640,15 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->last_ok = 0; goto unlock; } - refcount_add(burst, &pkt_dev->skb->users); + if (skb_shared) + refcount_add(burst, &pkt_dev->skb->users); xmit_more: ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0); + if (!skb_shared && dev_xmit_complete(ret)) + pkt_dev->skb = NULL; + switch (ret) { case NETDEV_TX_OK: pkt_dev->last_ok = 1; @@ -3572,7 +3670,8 @@ xmit_more: fallthrough; case NETDEV_TX_BUSY: /* Retry it next time */ - refcount_dec(&(pkt_dev->skb->users)); + if (skb_shared) + refcount_dec(&pkt_dev->skb->users); pkt_dev->last_ok = 0; } if (unlikely(burst)) @@ -3585,7 +3684,8 @@ out: /* If pkt_dev->count is zero, then run forever */ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { - pktgen_wait_for_skb(pkt_dev); + if (pkt_dev->skb) + pktgen_wait_for_skb(pkt_dev); /* Done with this */ pktgen_stop_device(pkt_dev); @@ -3602,7 +3702,7 @@ static int pktgen_thread_worker(void *arg) struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; - WARN_ON(smp_processor_id() != cpu); + WARN_ON_ONCE(smp_processor_id() != cpu); init_waitqueue_head(&t->queue); complete(&t->start_done); @@ -3617,10 +3717,8 @@ static int pktgen_thread_worker(void *arg) if (unlikely(!pkt_dev && t->control == 0)) { if (t->net->pktgen_exiting) break; - wait_event_interruptible_timeout(t->queue, - t->control != 0, - HZ/10); - try_to_freeze(); + wait_event_freezable_timeout(t->queue, + t->control != 0, HZ / 10); continue; } @@ -3704,7 +3802,8 @@ static int add_dev_to_thread(struct pktgen_thread *t, * userspace on another CPU than the kthread. 
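Within this transmit path, burst also drives the last argument of netdev_start_xmit(): the driver is told that more frames follow for every packet but the final one of the burst, so it can defer its doorbell write. A toy model of that hint (fake driver, illustrative names):

#include <stdbool.h>
#include <stdio.h>

/* Fake driver: queue the frame, ring the doorbell only when the
 * xmit_more hint says no further frames are pending. */
static void fake_start_xmit(int frame, bool more)
{
	printf("queued frame %d\n", frame);
	if (!more)
		printf("doorbell rung\n");
}

int main(void)
{
	unsigned int burst = 4;
	int frame = 0;

	do {
		/* mirror pktgen_xmit(): --burst > 0 means "more coming" */
		fake_start_xmit(frame++, --burst > 0);
	} while (burst > 0);
	return 0;
}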
The if_lock() * is used here to sync with concurrent instances of * _rem_dev_from_if_list() invoked via kthread, which is also - * updating the if_list */ + * updating the if_list + */ if_lock(t); if (pkt_dev->pg_thread) { @@ -3742,7 +3841,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) if (!pkt_dev) return -ENOMEM; - strcpy(pkt_dev->odevname, ifname); + strscpy(pkt_dev->odevname, ifname); pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS, sizeof(struct flow_state)), node); @@ -3768,6 +3867,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_id = 0xffff; pkt_dev->burst = 1; pkt_dev->node = NUMA_NO_NODE; + pkt_dev->flags = F_SHARED; /* SKB shared by default */ err = pktgen_setup_dev(t->net, pkt_dev, ifname); if (err) @@ -3787,8 +3887,8 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->ipsmode = XFRM_MODE_TRANSPORT; pkt_dev->ipsproto = IPPROTO_ESP; - /* xfrm tunnel mode needs additional dst to extract outter - * ip header protocol/ttl/id field, here creat a phony one. + /* xfrm tunnel mode needs additional dst to extract outer + * ip header protocol/ttl/id field, here create a phony one. * instead of looking for a valid rt, which definitely hurting * performance under such circumstance. */ @@ -3832,17 +3932,14 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) list_add_tail(&t->th_list, &pn->pktgen_threads); init_completion(&t->start_done); - p = kthread_create_on_node(pktgen_thread_worker, - t, - cpu_to_node(cpu), - "kpktgend_%d", cpu); + p = kthread_create_on_cpu(pktgen_thread_worker, t, cpu, "kpktgend_%d"); if (IS_ERR(p)) { pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu); list_del(&t->th_list); kfree(t); return PTR_ERR(p); } - kthread_bind(p, cpu); + t->tsk = p; pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir, @@ -3901,7 +3998,8 @@ static int pktgen_remove_device(struct pktgen_thread *t, /* Remove proc before if_list entry, because add_device uses * list to determine if interface already exist, avoid race - * with proc_create_data() */ + * with proc_create_data() + */ proc_remove(pkt_dev->entry); /* And update the thread if_list */ @@ -3938,6 +4036,7 @@ static int __net_init pg_net_init(struct net *net) goto remove; } + cpus_read_lock(); for_each_online_cpu(cpu) { int err; @@ -3946,6 +4045,7 @@ static int __net_init pg_net_init(struct net *net) pr_warn("Cannot create thread for cpu %d (%d)\n", cpu, err); } + cpus_read_unlock(); if (list_empty(&pn->pktgen_threads)) { pr_err("Initialization failed for all threads\n"); @@ -3979,8 +4079,7 @@ static void __net_exit pg_net_exit(struct net *net) list_for_each_safe(q, n, &list) { t = list_entry(q, struct pktgen_thread, th_list); list_del(&t->th_list); - kthread_stop(t->tsk); - put_task_struct(t->tsk); + kthread_stop_put(t->tsk); kfree(t); } diff --git a/net/core/request_sock.c b/net/core/request_sock.c index f35c2e998406..897a8f01a67b 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -33,9 +33,6 @@ void reqsk_queue_alloc(struct request_sock_queue *queue) { - spin_lock_init(&queue->rskq_lock); - - spin_lock_init(&queue->fastopenq.lock); queue->fastopenq.rskq_rst_head = NULL; queue->fastopenq.rskq_rst_tail = NULL; queue->fastopenq.qlen = 0; @@ -80,9 +77,7 @@ void reqsk_queue_alloc(struct request_sock_queue *queue) * a simple spin lock - one must consider sock_owned_by_user() and arrange * to use sk_add_backlog() stuff. 
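In the pg_net_exit() hunk above, the kthread_stop() + put_task_struct() pair collapses into kthread_stop_put(). The shape of that consolidation, modeled with pthreads in userspace (join stands in for the stop handshake, free for the final reference drop; thread_stop_put and the worker are illustrative):

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct worker {
	pthread_t tid;
	volatile bool should_stop;
};

static void *worker_fn(void *arg)
{
	struct worker *w = arg;

	while (!w->should_stop)		/* stand-in for the packet loop */
		sched_yield();
	return NULL;
}

/* One call stops the thread and drops the last reference to it. */
static void thread_stop_put(struct worker *w)
{
	w->should_stop = true;		/* kthread_stop() analogue */
	pthread_join(w->tid, NULL);
	free(w);			/* put_task_struct() analogue */
}

int main(void)
{
	struct worker *w = calloc(1, sizeof(*w));

	pthread_create(&w->tid, NULL, worker_fn, w);
	thread_stop_put(w);
	puts("worker stopped and released");
	return 0;
}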
But what really makes it infeasible is the * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to - * acquire a child's lock while holding listener's socket lock. A corner - * case might also exist in tcp_v4_hnd_req() that will trigger this locking - * order. + * acquire a child's lock while holding listener's socket lock. * * This function also sets "treq->tfo_listener" to false. * treq->tfo_listener is used by the listener so it is protected by the diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 64289bc98887..b1ed55141d8a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -53,12 +53,17 @@ #include <net/fib_rules.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> +#include <net/netdev_lock.h> #include <net/devlink.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/addrconf.h> +#endif +#include <linux/dpll.h> #include "dev.h" #define RTNL_MAX_TYPE 50 -#define RTNL_SLAVE_MAX_TYPE 40 +#define RTNL_SLAVE_MAX_TYPE 44 struct rtnl_link { rtnl_doit_func doit; @@ -76,11 +81,15 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +int rtnl_lock_interruptible(void) +{ + return mutex_lock_interruptible(&rtnl_mutex); +} + int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); } -EXPORT_SYMBOL(rtnl_lock_killable); static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) @@ -175,6 +184,176 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL +void __rtnl_net_lock(struct net *net) +{ + ASSERT_RTNL(); + + mutex_lock(&net->rtnl_mutex); +} +EXPORT_SYMBOL(__rtnl_net_lock); + +void __rtnl_net_unlock(struct net *net) +{ + ASSERT_RTNL(); + + mutex_unlock(&net->rtnl_mutex); +} +EXPORT_SYMBOL(__rtnl_net_unlock); + +void rtnl_net_lock(struct net *net) +{ + rtnl_lock(); + __rtnl_net_lock(net); +} +EXPORT_SYMBOL(rtnl_net_lock); + +void rtnl_net_unlock(struct net *net) +{ + __rtnl_net_unlock(net); + rtnl_unlock(); +} +EXPORT_SYMBOL(rtnl_net_unlock); + +int rtnl_net_trylock(struct net *net) +{ + int ret = rtnl_trylock(); + + if (ret) + __rtnl_net_lock(net); + + return ret; +} +EXPORT_SYMBOL(rtnl_net_trylock); + +int rtnl_net_lock_killable(struct net *net) +{ + int ret = rtnl_lock_killable(); + + if (!ret) + __rtnl_net_lock(net); + + return ret; +} + +static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) +{ + if (net_eq(net_a, net_b)) + return 0; + + /* always init_net first */ + if (net_eq(net_a, &init_net)) + return -1; + + if (net_eq(net_b, &init_net)) + return 1; + + /* otherwise lock in ascending order */ + return net_a < net_b ? 
-1 : 1; +} + +int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) +{ + const struct net *net_a, *net_b; + + net_a = container_of(a, struct net, rtnl_mutex.dep_map); + net_b = container_of(b, struct net, rtnl_mutex.dep_map); + + return rtnl_net_cmp_locks(net_a, net_b); +} + +bool rtnl_net_is_locked(struct net *net) +{ + return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_net_is_locked); + +bool lockdep_rtnl_net_is_held(struct net *net) +{ + return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex); +} +EXPORT_SYMBOL(lockdep_rtnl_net_is_held); +#else +static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) +{ + /* No need to swap */ + return -1; +} +#endif + +struct rtnl_nets { + /* ->newlink() needs to freeze 3 netns at most; + * 2 for the new device, 1 for its peer. + */ + struct net *net[3]; + unsigned char len; +}; + +static void rtnl_nets_init(struct rtnl_nets *rtnl_nets) +{ + memset(rtnl_nets, 0, sizeof(*rtnl_nets)); +} + +static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets) +{ + int i; + + for (i = 0; i < rtnl_nets->len; i++) { + put_net(rtnl_nets->net[i]); + rtnl_nets->net[i] = NULL; + } + + rtnl_nets->len = 0; +} + +/** + * rtnl_nets_add - Add netns to be locked before ->newlink(). + * + * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net(). + * @net: netns pointer with an extra refcnt held. + * + * The extra refcnt is released in rtnl_nets_destroy(). + */ +static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net) +{ + int i; + + DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net)); + + for (i = 0; i < rtnl_nets->len; i++) { + switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) { + case 0: + put_net(net); + return; + case 1: + swap(rtnl_nets->net[i], net); + } + } + + rtnl_nets->net[i] = net; + rtnl_nets->len++; +} + +static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets) +{ + int i; + + rtnl_lock(); + + for (i = 0; i < rtnl_nets->len; i++) + __rtnl_net_lock(rtnl_nets->net[i]); +} + +static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets) +{ + int i; + + for (i = 0; i < rtnl_nets->len; i++) + __rtnl_net_unlock(rtnl_nets->net[i]); + + rtnl_unlock(); +} + static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) @@ -265,64 +444,13 @@ unlock: } /** - * rtnl_register_module - Register a rtnetlink message type - * - * @owner: module registering the hook (THIS_MODULE) - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions - * - * Like rtnl_register, but for use by removable modules. 
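The ordering rule in rtnl_net_cmp_locks() is what keeps rtnl_nets_lock() deadlock-free: init_net always sorts first, every other netns sorts by ascending address, so any path that takes several per-netns mutexes takes them in one global order. A userspace sketch of the comparator and of the swap-based insertion that keeps rtnl_nets sorted (struct net reduced to a tag; among non-init entries the order is by address, arbitrary but consistent):

#include <stdio.h>

struct net { const char *name; };

static struct net init_net = { "init_net" };

static int net_cmp_locks(const struct net *a, const struct net *b)
{
	if (a == b)
		return 0;
	if (a == &init_net)		/* always init_net first */
		return -1;
	if (b == &init_net)
		return 1;
	return a < b ? -1 : 1;		/* otherwise ascending address */
}

struct nets { struct net *net[3]; int len; };

/* Keep the small array sorted, drop duplicates (mirrors the
 * swap loop in rtnl_nets_add()). */
static void nets_add(struct nets *s, struct net *net)
{
	for (int i = 0; i < s->len; i++) {
		int c = net_cmp_locks(s->net[i], net);

		if (c == 0)
			return;			/* already tracked */
		if (c > 0) {			/* existing entry sorts later */
			struct net *tmp = s->net[i];

			s->net[i] = net;
			net = tmp;
		}
	}
	s->net[s->len++] = net;
}

int main(void)
{
	struct net a = { "ns_a" }, b = { "ns_b" };
	struct nets s = { .len = 0 };

	nets_add(&s, &b);
	nets_add(&s, &init_net);
	nets_add(&s, &a);
	for (int i = 0; i < s.len; i++)
		printf("lock %d: %s\n", i, s.net[i]->name);
	return 0;
}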
- */ -int rtnl_register_module(struct module *owner, - int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) -{ - return rtnl_register_internal(owner, protocol, msgtype, - doit, dumpit, flags); -} -EXPORT_SYMBOL_GPL(rtnl_register_module); - -/** - * rtnl_register - Register a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions - * - * Registers the specified function pointers (at least one of them has - * to be non-NULL) to be called whenever a request message for the - * specified protocol family and message type is received. - * - * The special protocol family PF_UNSPEC may be used to define fallback - * function pointers for the case when no entry for the specific protocol - * family exists. - */ -void rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) -{ - int err; - - err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, - flags); - if (err) - pr_err("Unable to register rtnetlink message handler, " - "protocol = %d, message type = %d\n", protocol, msgtype); -} - -/** * rtnl_unregister - Unregister a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * * Returns 0 on success or a negative error code. */ -int rtnl_unregister(int protocol, int msgtype) +static int rtnl_unregister(int protocol, int msgtype) { struct rtnl_link __rcu **tab; struct rtnl_link *link; @@ -338,21 +466,19 @@ int rtnl_unregister(int protocol, int msgtype) return -ENOENT; } - link = rtnl_dereference(tab[msgindex]); - RCU_INIT_POINTER(tab[msgindex], NULL); + link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); rtnl_unlock(); kfree_rcu(link, rcu); return 0; } -EXPORT_SYMBOL_GPL(rtnl_unregister); /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol * @protocol : Protocol family or PF_UNSPEC * - * Identical to calling rtnl_unregster() for all registered message types + * Identical to calling rtnl_unregister() for all registered message types * of a certain protocol family. */ void rtnl_unregister_all(int protocol) @@ -364,18 +490,13 @@ void rtnl_unregister_all(int protocol) BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); - tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL); if (!tab) { rtnl_unlock(); return; } - RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { - link = rtnl_dereference(tab[msgindex]); - if (!link) - continue; - - RCU_INIT_POINTER(tab[msgindex], NULL); + link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); kfree_rcu(link, rcu); } rtnl_unlock(); @@ -386,46 +507,86 @@ void rtnl_unregister_all(int protocol) } EXPORT_SYMBOL_GPL(rtnl_unregister_all); -static LIST_HEAD(link_ops); - -static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +/** + * __rtnl_register_many - Register rtnetlink message types + * @handlers: Array of struct rtnl_msg_handlers + * @n: The length of @handlers + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. 
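The contract documented for __rtnl_register_many() below is the classic all-or-nothing registration: try each handler in order and, on the first failure, unwind the ones that succeeded, in reverse. A self-contained model with stubbed register/unregister calls (the failing slot is arbitrary):

#include <stdio.h>

#define N 4
#define FAIL_AT 2	/* pretend the third registration fails */

static int register_one(int i)
{
	if (i == FAIL_AT)
		return -1;
	printf("registered %d\n", i);
	return 0;
}

static void unregister_many(int n)
{
	for (int i = n - 1; i >= 0; i--)	/* reverse order */
		printf("unregistered %d\n", i);
}

static int register_many(int n)
{
	int i, err = 0;

	for (i = 0; i < n; i++) {
		err = register_one(i);
		if (err) {
			unregister_many(i);	/* unwind the successes */
			break;
		}
	}
	return err;
}

int main(void)
{
	return register_many(N) ? 1 : 0;
}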
+ * + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. + * + * When one element of @handlers fails to register, + * 1) built-in: panics. + * 2) modules : the previous successful registrations are unwinded + * and an error is returned. + * + * Use rtnl_register_many(). + */ +int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) { - const struct rtnl_link_ops *ops; + const struct rtnl_msg_handler *handler; + int i, err; - list_for_each_entry(ops, &link_ops, list) { - if (!strcmp(ops->kind, kind)) - return ops; + for (i = 0, handler = handlers; i < n; i++, handler++) { + err = rtnl_register_internal(handler->owner, handler->protocol, + handler->msgtype, handler->doit, + handler->dumpit, handler->flags); + if (err) { + if (!handler->owner) + panic("Unable to register rtnetlink message " + "handlers, %pS\n", handlers); + + __rtnl_unregister_many(handlers, i); + break; + } } - return NULL; + + return err; } +EXPORT_SYMBOL_GPL(__rtnl_register_many); -/** - * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. - * @ops: struct rtnl_link_ops * to register - * - * The caller must hold the rtnl_mutex. This function should be used - * by drivers that create devices during module initialization. It - * must be called before registering the devices. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_link_register(struct rtnl_link_ops *ops) +void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) { - if (rtnl_link_ops_get(ops->kind)) - return -EEXIST; + const struct rtnl_msg_handler *handler; + int i; - /* The check for alloc/setup is here because if ops - * does not have that filled up, it is not possible - * to use the ops for creating device. So do not - * fill up dellink as well. That disables rtnl_dellink. - */ - if ((ops->alloc || ops->setup) && !ops->dellink) - ops->dellink = unregister_netdevice_queue; + for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--) + rtnl_unregister(handler->protocol, handler->msgtype); +} +EXPORT_SYMBOL_GPL(__rtnl_unregister_many); - list_add_tail(&ops->list, &link_ops); - return 0; +static DEFINE_MUTEX(link_ops_mutex); +static LIST_HEAD(link_ops); + +static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) +{ + struct rtnl_link_ops *ops; + + rcu_read_lock(); + + list_for_each_entry_rcu(ops, &link_ops, list) { + if (!strcmp(ops->kind, kind)) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } + } + + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; +} + +static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } -EXPORT_SYMBOL_GPL(__rtnl_link_register); /** * rtnl_link_register - Register rtnl_link_ops with rtnetlink. @@ -435,6 +596,7 @@ EXPORT_SYMBOL_GPL(__rtnl_link_register); */ int rtnl_link_register(struct rtnl_link_ops *ops) { + struct rtnl_link_ops *tmp; int err; /* Sanity-check max sizes to avoid stack buffer overflow. */ @@ -442,9 +604,31 @@ int rtnl_link_register(struct rtnl_link_ops *ops) ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) return -EINVAL; - rtnl_lock(); - err = __rtnl_link_register(ops); - rtnl_unlock(); + /* The check for alloc/setup is here because if ops + * does not have that filled up, it is not possible + * to use the ops for creating device. So do not + * fill up dellink as well. That disables rtnl_dellink. 
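rtnl_link_ops_get() now hands back the ops together with a held SRCU read-side token, and unregister waits for readers to drain before freeing. A rough pthread analogue of that get/put pairing (a per-ops rwlock stands in for SRCU and a global mutex for the list lock; this sketches the idea, not the kernel primitive):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ops {
	const char *kind;
	pthread_rwlock_t srcu;	/* read-held while a lookup result is in use */
	struct ops *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ops *link_ops;

static struct ops *ops_get(const char *kind)
{
	struct ops *o;

	pthread_mutex_lock(&list_lock);
	for (o = link_ops; o; o = o->next)
		if (!strcmp(o->kind, kind)) {
			pthread_rwlock_rdlock(&o->srcu);  /* srcu_read_lock() */
			break;
		}
	pthread_mutex_unlock(&list_lock);
	return o;
}

static void ops_put(struct ops *o)
{
	pthread_rwlock_unlock(&o->srcu);	/* srcu_read_unlock() */
}

static void ops_unregister(struct ops *o)
{
	struct ops **p;

	pthread_mutex_lock(&list_lock);
	for (p = &link_ops; *p; p = &(*p)->next)
		if (*p == o) {
			*p = o->next;		/* unpublish first */
			break;
		}
	pthread_mutex_unlock(&list_lock);
	pthread_rwlock_wrlock(&o->srcu);	/* drain readers: ~synchronize_srcu() */
	pthread_rwlock_unlock(&o->srcu);
	pthread_rwlock_destroy(&o->srcu);
	free(o);
}

int main(void)
{
	struct ops *o = calloc(1, sizeof(*o));

	o->kind = "dummy";
	pthread_rwlock_init(&o->srcu, NULL);
	link_ops = o;

	struct ops *got = ops_get("dummy");

	printf("found: %s\n", got->kind);
	ops_put(got);
	ops_unregister(o);
	return 0;
}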
+ */ + if ((ops->alloc || ops->setup) && !ops->dellink) + ops->dellink = unregister_netdevice_queue; + + err = init_srcu_struct(&ops->srcu); + if (err) + return err; + + mutex_lock(&link_ops_mutex); + + list_for_each_entry(tmp, &link_ops, list) { + if (!strcmp(ops->kind, tmp->kind)) { + err = -EEXIST; + goto unlock; + } + } + + list_add_tail_rcu(&ops->list, &link_ops); +unlock: + mutex_unlock(&link_ops_mutex); + return err; } EXPORT_SYMBOL_GPL(rtnl_link_register); @@ -461,48 +645,20 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) unregister_netdevice_many(&list_kill); } -/** - * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. - * @ops: struct rtnl_link_ops * to unregister - * - * The caller must hold the rtnl_mutex and guarantee net_namespace_list - * integrity (hold pernet_ops_rwsem for writing to close the race - * with setup_net() and cleanup_net()). - */ -void __rtnl_link_unregister(struct rtnl_link_ops *ops) -{ - struct net *net; - - for_each_net(net) { - __rtnl_kill_links(net, ops); - } - list_del(&ops->list); -} -EXPORT_SYMBOL_GPL(__rtnl_link_unregister); - /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ static void rtnl_lock_unregistering_all(void) { - struct net *net; - bool unregistering; DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { - unregistering = false; rtnl_lock(); /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ - for_each_net(net) { - if (atomic_read(&net->dev_unreg_count) > 0) { - unregistering = true; - break; - } - } - if (!unregistering) + if (!atomic_read(&dev_unreg_count)) break; __rtnl_unlock(); @@ -517,10 +673,22 @@ static void rtnl_lock_unregistering_all(void) */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { + struct net *net; + + mutex_lock(&link_ops_mutex); + list_del_rcu(&ops->list); + mutex_unlock(&link_ops_mutex); + + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); + /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); - __rtnl_link_unregister(ops); + + for_each_net(net) + __rtnl_kill_links(net, ops); + rtnl_unlock(); up_write(&pernet_ops_rwsem); } @@ -577,31 +745,51 @@ static size_t rtnl_link_get_size(const struct net_device *dev) static LIST_HEAD(rtnl_af_ops); -static const struct rtnl_af_ops *rtnl_af_lookup(const int family) +static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index) { - const struct rtnl_af_ops *ops; + struct rtnl_af_ops *ops; ASSERT_RTNL(); - list_for_each_entry(ops, &rtnl_af_ops, list) { - if (ops->family == family) - return ops; + rcu_read_lock(); + + list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { + if (ops->family == family) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } } - return NULL; + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; +} + +static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } /** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * - * Returns 0 on success or a negative error code. + * Return: 0 on success or a negative error code. 
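The same pattern explains why rtnl_af_register() below turns from void into int: per-ops SRCU state is now initialized at registration time, and that initialization can fail, so the ops must not be published until it has succeeded. The general shape in a userspace sketch (a rwlock plays the per-ops state; names are illustrative):

#include <pthread.h>
#include <stdio.h>

struct af_ops {
	int family;
	pthread_rwlock_t srcu;	/* per-ops state whose init can fail */
	struct af_ops *next;
};

static struct af_ops *af_ops_list;

static int af_register(struct af_ops *ops)
{
	int err = pthread_rwlock_init(&ops->srcu, NULL);

	if (err)
		return -err;		/* fail before publishing anything */

	ops->next = af_ops_list;	/* publish only on success */
	af_ops_list = ops;
	return 0;
}

int main(void)
{
	struct af_ops ops = { .family = 2 /* AF_INET */ };

	if (af_register(&ops))
		return 1;
	printf("registered family %d\n", af_ops_list->family);
	return 0;
}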
*/ -void rtnl_af_register(struct rtnl_af_ops *ops) +int rtnl_af_register(struct rtnl_af_ops *ops) { + int err = init_srcu_struct(&ops->srcu); + + if (err) + return err; + rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); + + return 0; } EXPORT_SYMBOL_GPL(rtnl_af_register); @@ -616,6 +804,8 @@ void rtnl_af_unregister(struct rtnl_af_ops *ops) rtnl_unlock(); synchronize_rcu(); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); @@ -836,11 +1026,13 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, .rta_error = error, .rta_id = id, }; + unsigned long delta; if (dst) { - ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); + delta = jiffies - READ_ONCE(dst->lastuse); + ci.rta_lastuse = jiffies_delta_to_clock_t(delta); ci.rta_used = dst->__use; - ci.rta_clntref = atomic_read(&dst->__refcnt); + ci.rta_clntref = rcuref_read(&dst->__rcuref); } if (expires) { unsigned long clock; @@ -853,9 +1045,22 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); +void netif_set_operstate(struct net_device *dev, int newstate) +{ + unsigned int old = READ_ONCE(dev->operstate); + + do { + if (old == newstate) + return; + } while (!try_cmpxchg(&dev->operstate, &old, newstate)); + + netif_state_change(dev); +} +EXPORT_SYMBOL(netif_set_operstate); + static void set_operstate(struct net_device *dev, unsigned char transition) { - unsigned char operstate = dev->operstate; + unsigned char operstate = READ_ONCE(dev->operstate); switch (transition) { case IF_OPER_UP: @@ -877,12 +1082,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition) break; } - if (dev->operstate != operstate) { - write_lock(&dev_base_lock); - dev->operstate = operstate; - write_unlock(&dev_base_lock); - netdev_state_change(dev); - } + netif_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) @@ -958,24 +1158,30 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + - nla_total_size(0) + /* nest IFLA_VF_STATS */ - /* IFLA_VF_STATS_RX_PACKETS */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_TX_PACKETS */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_RX_BYTES */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_TX_BYTES */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_BROADCAST */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_MULTICAST */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_RX_DROPPED */ - nla_total_size_64bit(sizeof(__u64)) + - /* IFLA_VF_STATS_TX_DROPPED */ - nla_total_size_64bit(sizeof(__u64)) + nla_total_size(sizeof(struct ifla_vf_trust))); + if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { + size += num_vfs * + (nla_total_size(0) + /* nest IFLA_VF_STATS */ + /* IFLA_VF_STATS_RX_PACKETS */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_PACKETS */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_BYTES */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_BYTES */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_BROADCAST */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_MULTICAST */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_DROPPED */ + 
nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_DROPPED */ + nla_total_size_64bit(sizeof(__u64))); + } + if (dev->netdev_ops->ndo_get_vf_guid) + size += num_vfs * 2 * + nla_total_size(sizeof(struct ifla_vf_guid)); return size; } else return 0; @@ -1019,22 +1225,25 @@ static size_t rtnl_xdp_size(void) static size_t rtnl_prop_list_size(const struct net_device *dev) { struct netdev_name_node *name_node; - size_t size; + unsigned int cnt = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(name_node, &dev->name_node->list, list) + cnt++; + rcu_read_unlock(); - if (list_empty(&dev->name_node->list)) + if (!cnt) return 0; - size = nla_total_size(0); - list_for_each_entry(name_node, &dev->name_node->list, list) - size += nla_total_size(ALTIFNAMSIZ); - return size; + + return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ); } static size_t rtnl_proto_down_size(const struct net_device *dev) { size_t size = nla_total_size(1); - if (dev->proto_down_reason) - size += nla_total_size(0) + nla_total_size(4); + /* Assume dev->proto_down_reason is not zero. */ + size += nla_total_size(0) + nla_total_size(4); return size; } @@ -1049,16 +1258,25 @@ static size_t rtnl_devlink_port_size(const struct net_device *dev) return size; } +static size_t rtnl_dpll_pin_size(const struct net_device *dev) +{ + size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */ + + size += dpll_netdev_pin_handle_size(dev); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { - return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + size_t size; + + size = NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) - + nla_total_size(sizeof(struct rtnl_link_stats)) - + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ @@ -1074,10 +1292,13 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + + nla_total_size(1) /* IFLA_NETNS_IMMUTABLE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(4) /* IFLA_LINK_NETNSID */ + nla_total_size(4) /* IFLA_GROUP */ @@ -1103,7 +1324,17 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_prop_list_size(dev) + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + rtnl_devlink_port_size(dev) + + rtnl_dpll_pin_size(dev) + + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + + nla_total_size(2) /* IFLA_HEADROOM */ + + nla_total_size(2) /* IFLA_TAILROOM */ + 0; + + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) + size += nla_total_size(sizeof(struct rtnl_link_stats)) + + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); + + return size; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -1225,7 +1456,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) struct netdev_phys_item_id ppid = { }; int err; - err 
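/*
 * Aside on the bit test used in rtnl_vfinfo_size() and if_nlmsg_size():
 * `~ext_filter_mask & RTEXT_FILTER_SKIP_STATS` is true only when the
 * SKIP_STATS bit is *clear*, i.e. when the requester still wants the
 * stats nest sized in. A two-line demo of the idiom (the flag value
 * mirrors the uapi header but is restated here purely for illustration):
 */
#include <stdio.h>

#define RTEXT_FILTER_SKIP_STATS (1 << 3)        /* assumed uapi value */

int main(void)
{
    unsigned int mask = RTEXT_FILTER_SKIP_STATS;

    /* ~mask & FLAG == 0 when the flag is set: skip the stats nest */
    printf("want stats: %u\n", !!(~mask & RTEXT_FILTER_SKIP_STATS));
    mask = 0;
    printf("want stats: %u\n", !!(~mask & RTEXT_FILTER_SKIP_STATS));
    return 0;   /* prints 0 then 1 */
}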
= dev_get_port_parent_id(dev, &ppid, false); + err = netif_get_port_parent_id(dev, &ppid, false); if (err) { if (err == -EOPNOTSUPP) return 0; @@ -1265,7 +1496,7 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, - struct nlattr *vfinfo) + u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; struct nlattr *vf, *vfstats, *vfvlanlist; @@ -1334,7 +1565,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) - goto nla_put_vfinfo_failure; + return -EMSGSIZE; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || @@ -1371,40 +1602,40 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, goto nla_put_vf_failure; } nla_nest_end(skb, vfvlanlist); - memset(&vf_stats, 0, sizeof(vf_stats)); - if (dev->netdev_ops->ndo_get_vf_stats) - dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, - &vf_stats); - vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); - if (!vfstats) - goto nla_put_vf_failure; - if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, - vf_stats.rx_packets, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, - vf_stats.tx_packets, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, - vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, - vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, - vf_stats.broadcast, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, - vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || - nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, - vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { - nla_nest_cancel(skb, vfstats); - goto nla_put_vf_failure; + if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { + memset(&vf_stats, 0, sizeof(vf_stats)); + if (dev->netdev_ops->ndo_get_vf_stats) + dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, + &vf_stats); + vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); + if (!vfstats) + goto nla_put_vf_failure; + if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, + vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, + vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { + nla_nest_cancel(skb, vfstats); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfstats); } - nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); return 0; nla_put_vf_failure: nla_nest_cancel(skb, vf); -nla_put_vfinfo_failure: - nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } @@ -1430,25 +1661,28 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, return -EMSGSIZE; for 
(i = 0; i < num_vfs; i++) { - if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) + if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) { + nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; + } } nla_nest_end(skb, vfinfo); return 0; } -static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) +static int rtnl_fill_link_ifmap(struct sk_buff *skb, + const struct net_device *dev) { struct rtnl_link_ifmap map; memset(&map, 0, sizeof(map)); - map.mem_start = dev->mem_start; - map.mem_end = dev->mem_end; - map.base_addr = dev->base_addr; - map.irq = dev->irq; - map.dma = dev->dma; - map.port = dev->if_port; + map.mem_start = READ_ONCE(dev->mem_start); + map.mem_end = READ_ONCE(dev->mem_end); + map.base_addr = READ_ONCE(dev->base_addr); + map.irq = READ_ONCE(dev->irq); + map.dma = READ_ONCE(dev->dma); + map.port = READ_ONCE(dev->if_port); if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; @@ -1459,13 +1693,15 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) static u32 rtnl_xdp_prog_skb(struct net_device *dev) { const struct bpf_prog *generic_xdp_prog; + u32 res = 0; - ASSERT_RTNL(); + rcu_read_lock(); + generic_xdp_prog = rcu_dereference(dev->xdp_prog); + if (generic_xdp_prog) + res = generic_xdp_prog->aux->id; + rcu_read_unlock(); - generic_xdp_prog = rtnl_dereference(dev->xdp_prog); - if (!generic_xdp_prog) - return 0; - return generic_xdp_prog->aux->id; + return res; } static u32 rtnl_xdp_prog_drv(struct net_device *dev) @@ -1585,7 +1821,8 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) - ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); + ret = nla_put_u32(skb, IFLA_MASTER, + READ_ONCE(upper_dev->ifindex)); rcu_read_unlock(); return ret; @@ -1594,10 +1831,10 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, bool force) { - int ifindex = dev_get_iflink(dev); + int iflink = dev_get_iflink(dev); - if (force || dev->ifindex != ifindex) - return nla_put_u32(skb, IFLA_LINK, ifindex); + if (force || READ_ONCE(dev->ifindex) != iflink) + return nla_put_u32(skb, IFLA_LINK, iflink); return 0; } @@ -1681,7 +1918,7 @@ static int rtnl_fill_alt_ifnames(struct sk_buff *skb, struct netdev_name_node *name_node; int count = 0; - list_for_each_entry(name_node, &dev->name_node->list, list) { + list_for_each_entry_rcu(name_node, &dev->name_node->list, list) { if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) return -EMSGSIZE; count++; @@ -1689,6 +1926,7 @@ static int rtnl_fill_alt_ifnames(struct sk_buff *skb, return count; } +/* RCU protected. 
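/*
 * Hedged note: rtnl_fill_link_ifmap() and the fill helpers around it now
 * wrap every field access in READ_ONCE() because this dump path no longer
 * runs under the removed dev_base_lock write section; the annotation
 * forces a single, untorn load that the compiler may not refetch. Its
 * core definition reduces to a volatile access, roughly (simplified from
 * the kernel's rwonce.h, userspace stand-ins below):
 */
#include <stdio.h>

#define READ_ONCE(x)        (*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)  (*(volatile __typeof__(x) *)&(x) = (val))

static unsigned long mem_start;     /* stands in for dev->mem_start */

int main(void)
{
    WRITE_ONCE(mem_start, 0xd0000UL);           /* writer side */
    printf("%#lx\n", READ_ONCE(mem_start));     /* lockless dump side */
    return 0;
}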
*/ static int rtnl_fill_prop_list(struct sk_buff *skb, const struct net_device *dev) { @@ -1717,10 +1955,10 @@ static int rtnl_fill_proto_down(struct sk_buff *skb, struct nlattr *pr; u32 preason; - if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down))) goto nla_put_failure; - preason = dev->proto_down_reason; + preason = READ_ONCE(dev->proto_down_reason); if (!preason) return 0; @@ -1764,6 +2002,28 @@ nest_cancel: return ret; } +static int rtnl_fill_dpll_pin(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *dpll_pin_nest; + int ret; + + dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN); + if (!dpll_pin_nest) + return -EMSGSIZE; + + ret = dpll_netdev_add_pin_handle(skb, dev); + if (ret < 0) + goto nest_cancel; + + nla_nest_end(skb, dpll_pin_nest); + return 0; + +nest_cancel: + nla_nest_cancel(skb, dpll_pin_nest); + return ret; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1771,6 +2031,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, u32 event, int *new_nsid, int new_ifindex, int tgt_netnsid, gfp_t gfp) { + char devname[IFNAMSIZ]; struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct Qdisc *qdisc; @@ -1783,39 +2044,54 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; - ifm->ifi_type = dev->type; - ifm->ifi_index = dev->ifindex; - ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_type = READ_ONCE(dev->type); + ifm->ifi_index = READ_ONCE(dev->ifindex); + ifm->ifi_flags = netif_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; - qdisc = rtnl_dereference(dev->qdisc); - if (nla_put_string(skb, IFLA_IFNAME, dev->name) || - nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || + netdev_copy_name(dev, devname); + if (nla_put_string(skb, IFLA_IFNAME, devname)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) || nla_put_u8(skb, IFLA_OPERSTATE, - netif_running(dev) ? dev->operstate : IF_OPER_DOWN) || - nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || - nla_put_u32(skb, IFLA_MTU, dev->mtu) || - nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) || - nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || - nla_put_u32(skb, IFLA_GROUP, dev->group) || - nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || - nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || - nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || - nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || - nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || - nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || - nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || - nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || + netif_running(dev) ? 
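/*
 * Editor's sketch of the nest-cancel pattern rtnl_fill_dpll_pin() follows:
 * open a nested attribute, and on any mid-fill error roll the message back
 * to its state before the nest so the caller can fail cleanly with
 * -EMSGSIZE. A toy buffer with the same start/cancel semantics (names and
 * layout are illustrative only; real nla_nest_end() also patches the nest
 * header length, omitted here):
 */
#include <stdio.h>
#include <string.h>

struct msg { unsigned char buf[64]; size_t len; };

static size_t nest_start(struct msg *m) { return m->len; }  /* save mark */
static void nest_cancel(struct msg *m, size_t mark) { m->len = mark; }

static int put_u32(struct msg *m, unsigned int v)
{
    if (m->len + sizeof(v) > sizeof(m->buf))
        return -1;                  /* would overflow: ~-EMSGSIZE */
    memcpy(m->buf + m->len, &v, sizeof(v));
    m->len += sizeof(v);
    return 0;
}

int main(void)
{
    struct msg m = { .len = 0 };
    size_t mark = nest_start(&m);

    if (put_u32(&m, 1) || put_u32(&m, 2))
        nest_cancel(&m, mark);      /* roll back the partial nest */
    printf("len=%zu\n", m.len);
    return 0;
}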
READ_ONCE(dev->operstate) : + IF_OPER_DOWN) || + nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || + nla_put_u8(skb, IFLA_NETNS_IMMUTABLE, dev->netns_immutable) || + nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || + nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || + nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || + nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) || + nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) || + nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) || + nla_put_u32(skb, IFLA_NUM_TX_QUEUES, + READ_ONCE(dev->num_tx_queues)) || + nla_put_u32(skb, IFLA_GSO_MAX_SEGS, + READ_ONCE(dev->gso_max_segs)) || + nla_put_u32(skb, IFLA_GSO_MAX_SIZE, + READ_ONCE(dev->gso_max_size)) || + nla_put_u32(skb, IFLA_GRO_MAX_SIZE, + READ_ONCE(dev->gro_max_size)) || + nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, + READ_ONCE(dev->gso_ipv4_max_size)) || + nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, + READ_ONCE(dev->gro_ipv4_max_size)) || + nla_put_u32(skb, IFLA_TSO_MAX_SIZE, + READ_ONCE(dev->tso_max_size)) || + nla_put_u32(skb, IFLA_TSO_MAX_SEGS, + READ_ONCE(dev->tso_max_segs)) || + nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON, + READ_ONCE(dev->max_pacing_offload_horizon)) || #ifdef CONFIG_RPS - nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || + nla_put_u32(skb, IFLA_NUM_RX_QUEUES, + READ_ONCE(dev->num_rx_queues)) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || - (qdisc && - nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + @@ -1823,7 +2099,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, - atomic_read(&dev->carrier_down_count))) + atomic_read(&dev->carrier_down_count)) || + nla_put_u16(skb, IFLA_HEADROOM, + READ_ONCE(dev->needed_headroom)) || + nla_put_u16(skb, IFLA_TAILROOM, + READ_ONCE(dev->needed_tailroom))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) @@ -1834,9 +2114,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; } - if (rtnl_fill_link_ifmap(skb, dev)) - goto nla_put_failure; - if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast)) @@ -1852,7 +2129,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; - if (rtnl_fill_stats(skb, dev)) + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS) && + rtnl_fill_stats(skb, dev)) goto nla_put_failure; if (rtnl_fill_vf(skb, dev, ext_filter_mask)) @@ -1869,9 +2147,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; } - if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) - goto nla_put_failure; - if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; @@ -1884,12 +2159,18 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; rcu_read_lock(); + if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC)) + goto nla_put_failure_rcu; + qdisc = rcu_dereference(dev->qdisc); + if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) + goto nla_put_failure_rcu; if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; - rcu_read_unlock(); - + if (rtnl_fill_link_ifmap(skb, dev)) + goto nla_put_failure_rcu; if 
(rtnl_fill_prop_list(skb, dev)) - goto nla_put_failure; + goto nla_put_failure_rcu; + rcu_read_unlock(); if (dev->dev.parent && nla_put_string(skb, IFLA_PARENT_DEV_NAME, @@ -1904,6 +2185,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (rtnl_fill_devlink_port(skb, dev)) goto nla_put_failure; + if (rtnl_fill_dpll_pin(skb, dev)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -1915,6 +2199,7 @@ nla_put_failure: } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN }, [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, @@ -1943,7 +2228,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, - [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, @@ -1968,6 +2253,11 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, [IFLA_ALLMULTI] = { .type = NLA_REJECT }, + [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), + [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, + [IFLA_HEADROOM] = { .type = NLA_REJECT }, + [IFLA_TAILROOM] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2021,10 +2311,11 @@ static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; -static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) +static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla, + int *ops_srcu_index) { - const struct rtnl_link_ops *ops = NULL; struct nlattr *linfo[IFLA_INFO_MAX + 1]; + struct rtnl_link_ops *ops = NULL; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; @@ -2033,7 +2324,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); + ops = rtnl_link_ops_get(kind, ops_srcu_index); } return ops; @@ -2116,12 +2407,12 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, if (strict_check) { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); @@ -2154,24 +2445,22 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; + struct rtnl_link_ops *kind_ops = NULL; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct net *tgt_net = net; - int h, s_h; - int idx = 0, s_idx; - struct 
net_device *dev; - struct hlist_head *head; + unsigned int flags = NLM_F_MULTI; struct nlattr *tb[IFLA_MAX+1]; + struct { + unsigned long ifindex; + } *ctx = (void *)cb->ctx; + struct net *tgt_net = net; u32 ext_filter_mask = 0; - const struct rtnl_link_ops *kind_ops = NULL; - unsigned int flags = NLM_F_MULTI; + struct net_device *dev; + int ops_srcu_index; int master_idx = 0; int netnsid = -1; int err, i; - s_h = cb->args[0]; - s_idx = cb->args[1]; - err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); if (err < 0) { if (cb->strict_check) @@ -2191,7 +2480,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); if (IS_ERR(tgt_net)) { NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); - return PTR_ERR(tgt_net); + err = PTR_ERR(tgt_net); + netnsid = -1; + goto out; } break; case IFLA_EXT_MASK: @@ -2201,12 +2492,13 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: - kind_ops = linkinfo_to_kind_ops(tb[i]); + kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index); break; default: if (cb->strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); - return -EINVAL; + err = -EINVAL; + goto out; } } } @@ -2215,55 +2507,59 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) flags |= NLM_F_DUMP_FILTERED; walk_entries: - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &tgt_net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - if (link_dump_filtered(dev, master_idx, kind_ops)) - goto cont; - if (idx < s_idx) - goto cont; - err = rtnl_fill_ifinfo(skb, dev, net, - RTM_NEWLINK, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0, NULL, 0, - netnsid, GFP_KERNEL); - - if (err < 0) { - if (likely(skb->len)) - goto out; - - goto out_err; - } -cont: - idx++; - } + err = 0; + for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { + if (link_dump_filtered(dev, master_idx, kind_ops)) + continue; + err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, 0, flags, + ext_filter_mask, 0, NULL, 0, + netnsid, GFP_KERNEL); + if (err < 0) + break; } -out: - err = skb->len; -out_err: - cb->args[1] = idx; - cb->args[0] = h; + + cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + +out: + + if (kind_ops) + rtnl_link_ops_put(kind_ops, ops_srcu_index); if (netnsid >= 0) put_net(tgt_net); return err; } -int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, - struct netlink_ext_ack *exterr) +int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, + struct netlink_ext_ack *exterr) { - return nla_parse_deprecated(tb, IFLA_MAX, head, len, ifla_policy, + const struct ifinfomsg *ifmp; + const struct nlattr *attrs; + size_t len; + + ifmp = nla_data(nla_peer); + attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); + len = nla_len(nla_peer) - sizeof(struct ifinfomsg); + + if (ifmp->ifi_index < 0) { + NL_SET_ERR_MSG_ATTR(exterr, nla_peer, + "ifindex can't be negative"); + return -EINVAL; + } + + return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, exterr); } -EXPORT_SYMBOL(rtnl_nla_parse_ifla); +EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); -struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[]) { - struct 
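/*
 * Hedged sketch: the rewritten rtnl_dump_ifinfo() drops the hash-bucket
 * and per-bucket index bookkeeping in cb->args[] and keeps one resume
 * cursor (ctx->ifindex) that for_each_netdev_dump() uses to continue
 * where the previous, partially filled skb stopped. The shape of such a
 * resumable iterator over an ID-ordered table (illustrative code, not the
 * kernel helpers):
 */
#include <stdio.h>

static const int ifindexes[] = { 1, 2, 5, 9 };  /* pretend netdev table */

/* Emit entries until the "skb" is full; return how many were emitted. */
static int dump(unsigned long *cursor, int room)
{
    int emitted = 0;

    for (unsigned i = 0; i < sizeof(ifindexes) / sizeof(*ifindexes); i++) {
        if ((unsigned long)ifindexes[i] <= *cursor)
            continue;               /* already dumped in a prior round */
        if (emitted == room)
            break;                  /* skb full: resume next call */
        printf("dev %d\n", ifindexes[i]);
        *cursor = ifindexes[i];     /* advance the resume point */
        emitted++;
    }
    return emitted;
}

int main(void)
{
    unsigned long cursor = 0;

    while (dump(&cursor, 2))        /* two entries fit per "skb" */
        ;
    return 0;
}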
net *net; + struct net *net = NULL; + /* Examine the link attributes and figure out which * network namespace we are talking about. */ @@ -2271,8 +2567,17 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); - else + + return net; +} + +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net = rtnl_link_get_net_ifla(tb); + + if (!net) net = get_net(src_net); + return net; } EXPORT_SYMBOL(rtnl_link_get_net); @@ -2368,14 +2673,43 @@ static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { - if (dev) { - if (tb[IFLA_ADDRESS] && - nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) - return -EINVAL; + if (tb[IFLA_ADDRESS] && + nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) + return -EINVAL; - if (tb[IFLA_BROADCAST] && - nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) - return -EINVAL; + if (tb[IFLA_BROADCAST] && + nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) + return -EINVAL; + + if (tb[IFLA_GSO_MAX_SIZE] && + nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { + NL_SET_ERR_MSG(extack, "too big gso_max_size"); + return -EINVAL; + } + + if (tb[IFLA_GSO_MAX_SEGS] && + (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || + nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { + NL_SET_ERR_MSG(extack, "too big gso_max_segs"); + return -EINVAL; + } + + if (tb[IFLA_GRO_MAX_SIZE] && + nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "too big gro_max_size"); + return -EINVAL; + } + + if (tb[IFLA_GSO_IPV4_MAX_SIZE] && + nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { + NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); + return -EINVAL; + } + + if (tb[IFLA_GRO_IPV4_MAX_SIZE] && + nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); + return -EINVAL; } if (tb[IFLA_AF_SPEC]) { @@ -2383,20 +2717,24 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - af_ops = rtnl_af_lookup(nla_type(af)); + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) - return -EOPNOTSUPP; - - if (af_ops->validate_link_af) { + err = -EOPNOTSUPP; + else if (af_ops->validate_link_af) err = af_ops->validate_link_af(dev, af, extack); - if (err < 0) - return err; - } + else + err = 0; + + rtnl_af_put(af_ops, af_ops_srcu_index); + + if (err < 0) + return err; } } @@ -2462,7 +2800,7 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { if (nla_type(attr) != IFLA_VF_VLAN_INFO || - nla_len(attr) < NLA_HDRLEN) { + nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) { return -EINVAL; } if (len >= MAX_VLAN_LIST_LEN) @@ -2595,12 +2933,19 @@ static int do_set_master(struct net_device *dev, int ifindex, const struct net_device_ops *ops; int err; + /* Release the lower lock, the upper is responsible for locking + * the lower if needed. None of the existing upper devices + * use netdev instance lock, so don't grab it. 
+ */ + if (upper_dev) { if (upper_dev->ifindex == ifindex) return 0; ops = upper_dev->netdev_ops; if (ops->ndo_del_slave) { + netdev_unlock_ops(dev); err = ops->ndo_del_slave(upper_dev, dev); + netdev_lock_ops(dev); if (err) return err; } else { @@ -2614,7 +2959,9 @@ static int do_set_master(struct net_device *dev, int ifindex, return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { + netdev_unlock_ops(dev); err = ops->ndo_add_slave(upper_dev, dev, extack); + netdev_lock_ops(dev); if (err) return err; } else { @@ -2640,7 +2987,7 @@ static int do_set_proto_down(struct net_device *dev, bool proto_down; int err; - if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) { + if (!dev->change_proto_down) { NL_SET_ERR_MSG(extack, "Protodown not supported by device"); return -EOPNOTSUPP; } @@ -2664,7 +3011,7 @@ static int do_set_proto_down(struct net_device *dev, if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); - dev_change_proto_down_reason(dev, mask, value); + netdev_change_proto_down_reason_locked(dev, mask, value); } if (nl_proto_down) { @@ -2675,8 +3022,7 @@ static int do_set_proto_down(struct net_device *dev, NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); return -EBUSY; } - err = dev_change_proto_down(dev, - proto_down); + err = netif_change_proto_down(dev, proto_down); if (err) return err; } @@ -2687,8 +3033,8 @@ static int do_set_proto_down(struct net_device *dev, #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 -static int do_setlink(const struct sk_buff *skb, - struct net_device *dev, struct ifinfomsg *ifm, +static int do_setlink(const struct sk_buff *skb, struct net_device *dev, + struct net *tgt_net, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb, int status) { @@ -2705,30 +3051,22 @@ static int do_setlink(const struct sk_buff *skb, else ifname[0] = '\0'; - if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { + if (!net_eq(tgt_net, dev_net(dev))) { const char *pat = ifname[0] ? 
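/*
 * Hedged sketch: do_set_master() and the IFLA_ADDRESS branch below follow
 * the same discipline: drop the netdev instance lock before taking a lock
 * that ranks above it (the master's ndo hooks, or dev_addr_sem), then
 * re-take the instance lock, instead of nesting the locks in the wrong
 * order. The "unlock inner, take outer, relock inner" shape in userspace
 * terms (illustrative only):
 */
#include <pthread.h>

static pthread_mutex_t dev_addr_sem = PTHREAD_MUTEX_INITIALIZER;  /* outer */
static pthread_mutex_t instance_lock = PTHREAD_MUTEX_INITIALIZER; /* inner */

static void set_mac_locked(void)
{
    /* entered with instance_lock held, like do_setlink() */
    pthread_mutex_unlock(&instance_lock);   /* ~netdev_unlock_ops(dev) */
    pthread_mutex_lock(&dev_addr_sem);      /* outer lock first... */
    pthread_mutex_lock(&instance_lock);     /* ...then the inner again */

    /* netif_set_mac_address() would run here, both locks held */

    pthread_mutex_unlock(&dev_addr_sem);
}

int main(void)
{
    pthread_mutex_lock(&instance_lock);
    set_mac_locked();
    pthread_mutex_unlock(&instance_lock);
    return 0;
}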
ifname : NULL; - struct net *net; int new_ifindex; - net = rtnl_link_get_net_capable(skb, dev_net(dev), - tb, CAP_NET_ADMIN); - if (IS_ERR(net)) { - err = PTR_ERR(net); - goto errout; - } - - if (tb[IFLA_NEW_IFINDEX]) - new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); - else - new_ifindex = 0; + new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0); - err = __dev_change_net_namespace(dev, net, pat, new_ifindex); - put_net(net); + err = __dev_change_net_namespace(dev, tgt_net, pat, + new_ifindex, extack); if (err) - goto errout; + return err; + status |= DO_SETLINK_MODIFIED; } + netdev_lock_ops(dev); + if (tb[IFLA_MAP]) { struct rtnl_link_ifmap *u_map; struct ifmap k_map; @@ -2759,35 +3097,35 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_ADDRESS]) { - struct sockaddr *sa; - int len; - - len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, - sizeof(*sa)); - sa = kmalloc(len, GFP_KERNEL); - if (!sa) { - err = -ENOMEM; + struct sockaddr_storage ss = { }; + + netdev_unlock_ops(dev); + + /* dev_addr_sem is an outer lock, enforce proper ordering */ + down_write(&dev_addr_sem); + netdev_lock_ops(dev); + + ss.ss_family = dev->type; + memcpy(ss.__data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); + err = netif_set_mac_address(dev, &ss, extack); + if (err) { + up_write(&dev_addr_sem); goto errout; } - sa->sa_family = dev->type; - memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), - dev->addr_len); - err = dev_set_mac_address_user(dev, sa, extack); - kfree(sa); - if (err) - goto errout; status |= DO_SETLINK_MODIFIED; + + up_write(&dev_addr_sem); } if (tb[IFLA_MTU]) { - err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); + err = netif_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { - dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); status |= DO_SETLINK_NOTIFY; } @@ -2797,15 +3135,15 @@ static int do_setlink(const struct sk_buff *skb, * requested. 
*/ if (ifm->ifi_index > 0 && ifname[0]) { - err = dev_change_name(dev, ifname); + err = netif_change_name(dev, ifname); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { - err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), - nla_len(tb[IFLA_IFALIAS])); + err = netif_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), + nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; @@ -2816,6 +3154,13 @@ static int do_setlink(const struct sk_buff *skb, call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } + if (ifm->ifi_flags || ifm->ifi_change) { + err = netif_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); + if (err < 0) + goto errout; + } + if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) @@ -2823,15 +3168,8 @@ static int do_setlink(const struct sk_buff *skb, status |= DO_SETLINK_MODIFIED; } - if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), - extack); - if (err < 0) - goto errout; - } - if (tb[IFLA_CARRIER]) { - err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); + err = netif_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; status |= DO_SETLINK_MODIFIED; @@ -2840,7 +3178,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_TXQLEN]) { unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); - err = dev_change_tx_queue_len(dev, value); + err = netif_change_tx_queue_len(dev, value); if (err) goto errout; status |= DO_SETLINK_MODIFIED; @@ -2849,11 +3187,6 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); - if (max_size > dev->tso_max_size) { - err = -EINVAL; - goto errout; - } - if (dev->gso_max_size ^ max_size) { netif_set_gso_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; @@ -2863,11 +3196,6 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); - if (max_segs > GSO_MAX_SEGS || max_segs > dev->tso_max_segs) { - err = -EINVAL; - goto errout; - } - if (dev->gso_max_segs ^ max_segs) { netif_set_gso_max_segs(dev, max_segs); status |= DO_SETLINK_MODIFIED; @@ -2883,17 +3211,33 @@ static int do_setlink(const struct sk_buff *skb, } } + if (tb[IFLA_GSO_IPV4_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]); + + if (dev->gso_ipv4_max_size ^ max_size) { + netif_set_gso_ipv4_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GRO_IPV4_MAX_SIZE]) { + u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]); + + if (dev->gro_ipv4_max_size ^ gro_max_size) { + netif_set_gro_ipv4_max_size(dev, gro_max_size); + status |= DO_SETLINK_MODIFIED; + } + } + if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); - write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; - dev->link_mode = value; - write_unlock(&dev_base_lock); + WRITE_ONCE(dev->link_mode, value); } if (tb[IFLA_VFINFO_LIST]) { @@ -2978,11 +3322,18 @@ static int do_setlink(const struct sk_buff *skb, int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); + if (!af_ops) { + err = -EAFNOSUPPORT; + goto 
errout; + } err = af_ops->set_link_af(dev, af, extack); + rtnl_af_put(af_ops, af_ops_srcu_index); + if (err < 0) goto errout; @@ -3051,13 +3402,15 @@ static int do_setlink(const struct sk_buff *skb, errout: if (status & DO_SETLINK_MODIFIED) { if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY) - netdev_state_change(dev); + netif_state_change(dev); if (err < 0) net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", dev->name); } + netdev_unlock_ops(dev); + return err; } @@ -3079,11 +3432,13 @@ static struct net_device *rtnl_dev_get(struct net *net, static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); - struct ifinfomsg *ifm; - struct net_device *dev; - int err; struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev = NULL; + struct rtnl_nets rtnl_nets; + struct net *tgt_net; + int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); @@ -3094,21 +3449,32 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; - err = -EINVAL; - ifm = nlmsg_data(nlh); + tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(tgt_net)) { + err = PTR_ERR(tgt_net); + goto errout; + } + + rtnl_nets_init(&rtnl_nets); + rtnl_nets_add(&rtnl_nets, get_net(net)); + rtnl_nets_add(&rtnl_nets, tgt_net); + + rtnl_nets_lock(&rtnl_nets); + if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else - goto errout; + err = -EINVAL; - if (dev == NULL) { + if (dev) + err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); + else if (!err) err = -ENODEV; - goto errout; - } - err = do_setlink(skb, dev, ifm, extack, tb, 0); + rtnl_nets_unlock(&rtnl_nets); + rtnl_nets_destroy(&rtnl_nets); errout: return err; } @@ -3168,14 +3534,14 @@ EXPORT_SYMBOL_GPL(rtnl_delete_link); static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; - struct net *tgt_net = net; - struct net_device *dev = NULL; - struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; - int err; + struct net_device *dev = NULL; + struct net *tgt_net = net; int netnsid = -1; + int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); @@ -3193,27 +3559,24 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } - err = -EINVAL; - ifm = nlmsg_data(nlh); + rtnl_net_lock(tgt_net); + if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) - dev = rtnl_dev_get(net, tb); + dev = rtnl_dev_get(tgt_net, tb); + + if (dev) + err = rtnl_delete_link(dev, portid, nlh); + else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + err = -ENODEV; else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else - goto out; - - if (!dev) { - if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME] || ifm->ifi_index > 0) - err = -ENODEV; - - goto out; - } + err = -EINVAL; - err = rtnl_delete_link(dev, portid, nlh); + rtnl_net_unlock(tgt_net); -out: if (netnsid >= 0) put_net(tgt_net); @@ -3223,7 +3586,7 @@ out: int 
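/*
 * Aside on the status bits used in the errout path above: per the
 * definitions earlier in this file, DO_SETLINK_MODIFIED is 0x01 and
 * DO_SETLINK_NOTIFY is 0x03, so "notify" implies "modified" by
 * construction and each can be tested with plain masking:
 */
#include <stdio.h>

#define DO_SETLINK_MODIFIED 0x01
#define DO_SETLINK_NOTIFY   0x03    /* notify flag means notify + modified */

int main(void)
{
    int status = 0;

    status |= DO_SETLINK_NOTIFY;    /* e.g. IFLA_IFALIAS was changed */

    if (status & DO_SETLINK_MODIFIED)
        printf("warn if a later attribute fails\n");
    if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY)
        printf("netif_state_change()\n");
    return 0;
}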
rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh) { - unsigned int old_flags; + unsigned int old_flags, changed; int err; old_flags = dev->flags; @@ -3234,12 +3597,13 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, return err; } - if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh); - } else { - dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - __dev_notify_flags(dev, old_flags, ~0U, portid, nlh); + changed = old_flags ^ dev->flags; + if (dev->rtnl_link_initializing) { + dev->rtnl_link_initializing = false; + changed = ~0U; } + + __dev_notify_flags(dev, old_flags, changed, portid, nlh); return 0; } EXPORT_SYMBOL(rtnl_configure_link); @@ -3253,6 +3617,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, struct net_device *dev; unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; + int err; if (tb[IFLA_NUM_TX_QUEUES]) num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); @@ -3288,13 +3653,18 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, if (!dev) return ERR_PTR(-ENOMEM); + err = validate_linkmsg(dev, tb, extack); + if (err < 0) { + free_netdev(dev); + return ERR_PTR(err); + } + dev_net_set(dev, net); dev->rtnl_link_ops = ops; - dev->rtnl_link_state = RTNL_LINK_INITIALIZING; + dev->rtnl_link_initializing = true; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); - int err; err = dev_validate_mtu(dev, mtu, extack); if (err) { @@ -3318,30 +3688,106 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) - dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); if (tb[IFLA_GRO_MAX_SIZE]) netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); + if (tb[IFLA_GSO_IPV4_MAX_SIZE]) + netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE])); + if (tb[IFLA_GRO_IPV4_MAX_SIZE]) + netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE])); return dev; } EXPORT_SYMBOL(rtnl_create_link); +struct rtnl_newlink_tbs { + struct nlattr *tb[IFLA_MAX + 1]; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + struct nlattr *attr[RTNL_MAX_TYPE + 1]; + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; +}; + +static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, + const struct rtnl_link_ops *ops, + struct net_device *dev, struct net *tgt_net, + struct rtnl_newlink_tbs *tbs, + struct nlattr **data, + struct netlink_ext_ack *extack) +{ + struct nlattr ** const linkinfo = tbs->linkinfo; + struct nlattr ** const tb = tbs->tb; + int status = 0; + int err; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) + return -EOPNOTSUPP; + + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + + status |= DO_SETLINK_NOTIFY; + } + + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + const struct rtnl_link_ops *m_ops = NULL; + struct nlattr **slave_data = NULL; + struct net_device *master_dev; + + master_dev = 
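/*
 * Hedged aside: rtnl_configure_link() above now computes the notification
 * mask once. XOR of the old and new flag words yields exactly the bits
 * that changed, and a link still initializing forces ~0U so its first
 * netlink event reports every flag. Two-line illustration (the IFF_UP
 * value is assumed to match linux/if.h):
 */
#include <stdio.h>

#define IFF_UP 0x1      /* assumed uapi value */

int main(void)
{
    unsigned int old_flags = 0, flags = IFF_UP;
    unsigned int changed = old_flags ^ flags;
    int initializing = 1;

    if (initializing)
        changed = ~0U;  /* first event: report all bits */
    printf("%#x\n", changed);
    return 0;
}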
netdev_master_upper_dev_get(dev); + if (master_dev) + m_ops = master_dev->rtnl_link_ops; + + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; + + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; + + if (m_ops->slave_maxtype) { + err = nla_parse_nested_deprecated(tbs->slave_attr, + m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, extack); + if (err < 0) + return err; + + slave_data = tbs->slave_attr; + } + + err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack); + if (err < 0) + return err; + + status |= DO_SETLINK_NOTIFY; + } + + return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status); +} + static int rtnl_group_changelink(const struct sk_buff *skb, - struct net *net, int group, - struct ifinfomsg *ifm, - struct netlink_ext_ack *extack, - struct nlattr **tb) + struct net *net, struct net *tgt_net, + int group, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, + struct nlattr **tb) { struct net_device *dev, *aux; int err; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { - err = do_setlink(skb, dev, ifm, extack, tb, 0); + err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); if (err < 0) return err; } @@ -3352,14 +3798,21 @@ static int rtnl_group_changelink(const struct sk_buff *skb, static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, + struct net *tgt_net, struct net *link_net, + struct net *peer_net, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) { unsigned char name_assign_type = NET_NAME_USER; - struct net *net = sock_net(skb->sk); + struct rtnl_newlink_params params = { + .src_net = sock_net(skb->sk), + .link_net = link_net, + .peer_net = peer_net, + .tb = tb, + .data = data, + }; u32 portid = NETLINK_CB(skb).portid; - struct net *dest_net, *link_net; struct net_device *dev; char ifname[IFNAMSIZ]; int err; @@ -3374,28 +3827,8 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, name_assign_type = NET_NAME_ENUM; } - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); - - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - - link_net = get_net_ns_by_id(dest_net, id); - if (!link_net) { - NL_SET_ERR_MSG(extack, "Unknown network namespace id"); - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } else { - link_net = NULL; - } - - dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb, extack); + dev = rtnl_create_link(tgt_net, ifname, name_assign_type, ops, tb, + extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; @@ -3404,7 +3837,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, dev->ifindex = ifm->ifi_index; if (ops->newlink) - err = ops->newlink(link_net ? 
: net, dev, tb, data, extack); + err = ops->newlink(dev, &params, extack); else err = register_netdevice(dev); if (err < 0) { @@ -3412,25 +3845,22 @@ goto out; } + netdev_lock_ops(dev); + err = rtnl_configure_link(dev, ifm, portid, nlh); if (err < 0) goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); - if (err < 0) - goto out_unregister; - } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto out_unregister; } + + netdev_unlock_ops(dev); out: - if (link_net) - put_net(link_net); - put_net(dest_net); return err; out_unregister: + netdev_unlock_ops(dev); if (ops->newlink) { LIST_HEAD(list_kill); @@ -3442,199 +3872,212 @@ out_unregister: goto out; } -struct rtnl_newlink_tbs { +static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, + struct nlattr *tbp[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ struct nlattr *tb[IFLA_MAX + 1]; - struct nlattr *attr[RTNL_MAX_TYPE + 1]; - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; -}; + int err; + + if (!data || !data[ops->peer_type]) + return rtnl_link_get_net_ifla(tbp); + + err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); + if (err < 0) + return ERR_PTR(err); + + if (ops->validate) { + err = ops->validate(tb, NULL, extack); + if (err < 0) + return ERR_PTR(err); + } + + return rtnl_link_get_net_ifla(tb); +} static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + const struct rtnl_link_ops *ops, + struct net *tgt_net, struct net *link_net, + struct net *peer_net, struct rtnl_newlink_tbs *tbs, + struct nlattr **data, struct netlink_ext_ack *extack) { - struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; struct nlattr ** const tb = tbs->tb; - const struct rtnl_link_ops *m_ops; - struct net_device *master_dev; struct net *net = sock_net(skb->sk); - const struct rtnl_link_ops *ops; - struct nlattr **slave_data; - char kind[MODULE_NAME_LEN]; + struct net *device_net; struct net_device *dev; struct ifinfomsg *ifm; - struct nlattr **data; bool link_specified; - int err; -#ifdef CONFIG_MODULES -replay: -#endif - err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, - ifla_policy, extack); - if (err < 0) - return err; - - err = rtnl_ensure_unique_netns(tb, extack, false); - if (err < 0) - return err; + /* When creating, lookup for existing device in target net namespace */ + device_net = (nlh->nlmsg_flags & NLM_F_CREATE) && + (nlh->nlmsg_flags & NLM_F_EXCL) ? + tgt_net : net; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { link_specified = true; - dev = __dev_get_by_index(net, ifm->ifi_index); + dev = __dev_get_by_index(device_net, ifm->ifi_index); + } else if (ifm->ifi_index < 0) { + NL_SET_ERR_MSG(extack, "ifindex can't be negative"); + return -EINVAL; } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { link_specified = true; - dev = rtnl_dev_get(net, tb); + dev = rtnl_dev_get(device_net, tb); } else { link_specified = false; dev = NULL; } - master_dev = NULL; - m_ops = NULL; - if (dev) { - master_dev = netdev_master_upper_dev_get(dev); - if (master_dev) - m_ops = master_dev->rtnl_link_ops; + if (dev) + return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack); + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + /* No dev found and NLM_F_CREATE not set. 
Requested dev does not exist, + * or it's for a group + */ + if (link_specified || !tb[IFLA_GROUP]) + return -ENODEV; + + return rtnl_group_changelink(skb, net, tgt_net, + nla_get_u32(tb[IFLA_GROUP]), + ifm, extack, tb); } - err = validate_linkmsg(dev, tb, extack); - if (err < 0) - return err; + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; + + if (!ops) { + NL_SET_ERR_MSG(extack, "Unknown device type"); + return -EOPNOTSUPP; + } + + return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh, + tb, data, extack); +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *tgt_net, *link_net = NULL, *peer_net = NULL; + struct nlattr **tb, **linkinfo, **data = NULL; + struct rtnl_link_ops *ops = NULL; + struct rtnl_newlink_tbs *tbs; + struct rtnl_nets rtnl_nets; + int ops_srcu_index; + int ret; + + tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); + if (!tbs) + return -ENOMEM; + + tb = tbs->tb; + ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, extack); + if (ret < 0) + goto free; + ret = rtnl_ensure_unique_netns(tb, extack, false); + if (ret < 0) + goto free; + + linkinfo = tbs->linkinfo; if (tb[IFLA_LINKINFO]) { - err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, + ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], ifla_info_policy, NULL); - if (err < 0) - return err; - } else - memset(linkinfo, 0, sizeof(linkinfo)); + if (ret < 0) + goto free; + } else { + memset(linkinfo, 0, sizeof(tbs->linkinfo)); + } if (linkinfo[IFLA_INFO_KIND]) { + char kind[MODULE_NAME_LEN]; + nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); - } else { - kind[0] = '\0'; - ops = NULL; + ops = rtnl_link_ops_get(kind, &ops_srcu_index); +#ifdef CONFIG_MODULES + if (!ops) { + request_module("rtnl-link-%s", kind); + ops = rtnl_link_ops_get(kind, &ops_srcu_index); + } +#endif } - data = NULL; + rtnl_nets_init(&rtnl_nets); + if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; + if (ops->maxtype > RTNL_MAX_TYPE) { + ret = -EINVAL; + goto put_ops; + } if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, + ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], ops->policy, extack); - if (err < 0) - return err; + if (ret < 0) + goto put_ops; + data = tbs->attr; } + if (ops->validate) { - err = ops->validate(tb, data, extack); - if (err < 0) - return err; + ret = ops->validate(tb, data, extack); + if (ret < 0) + goto put_ops; } - } - - slave_data = NULL; - if (m_ops) { - if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) - return -EINVAL; - if (m_ops->slave_maxtype && - linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested_deprecated(tbs->slave_attr, - m_ops->slave_maxtype, - linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy, - extack); - if (err < 0) - return err; - slave_data = tbs->slave_attr; + if (ops->peer_type) { + peer_net = rtnl_get_peer_net(ops, tb, data, extack); + if (IS_ERR(peer_net)) { + ret = PTR_ERR(peer_net); + goto put_ops; + } + if (peer_net) + rtnl_nets_add(&rtnl_nets, peer_net); } } - if (dev) { - int status = 0; - - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; - - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; + tgt_net = rtnl_link_get_net_capable(skb, 
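/*
 * Editor's note (assumption flagged): rtnl_nets_add()/rtnl_nets_lock()
 * used in rtnl_newlink() appear to collect every namespace a request
 * touches (source, target, link, peer) and then take their per-netns rtnl
 * locks together; taking several locks safely requires one global order,
 * commonly by address. Minimal sketch of address-ordered acquisition
 * (illustrative only, not the kernel helpers):
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if (a == b) {                   /* deduplicate: lock once */
        pthread_mutex_lock(a);
        return;
    }
    if ((uintptr_t)a > (uintptr_t)b) {  /* impose one global order */
        pthread_mutex_t *t = a; a = b; b = t;
    }
    pthread_mutex_lock(a);
    pthread_mutex_lock(b);
}

int main(void)
{
    pthread_mutex_t net1 = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t net2 = PTHREAD_MUTEX_INITIALIZER;

    lock_pair(&net2, &net1);        /* same order however it's called */
    puts("both netns locked");
    pthread_mutex_unlock(&net1);
    pthread_mutex_unlock(&net2);
    return 0;
}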
sock_net(skb->sk), tb, CAP_NET_ADMIN); + if (IS_ERR(tgt_net)) { + ret = PTR_ERR(tgt_net); + goto put_net; + } - err = ops->changelink(dev, tb, data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + rtnl_nets_add(&rtnl_nets, tgt_net); - if (linkinfo[IFLA_INFO_SLAVE_DATA]) { - if (!m_ops || !m_ops->slave_changelink) - return -EOPNOTSUPP; + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - err = m_ops->slave_changelink(master_dev, dev, tb, - slave_data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; + link_net = get_net_ns_by_id(tgt_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + ret = -EINVAL; + goto put_net; } - return do_setlink(skb, dev, ifm, extack, tb, status); - } - - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, - * or it's for a group - */ - if (link_specified) - return -ENODEV; - if (tb[IFLA_GROUP]) - return rtnl_group_changelink(skb, net, - nla_get_u32(tb[IFLA_GROUP]), - ifm, extack, tb); - return -ENODEV; - } - - if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) - return -EOPNOTSUPP; + rtnl_nets_add(&rtnl_nets, link_net); - if (!ops) { -#ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + goto put_net; } -#endif - NL_SET_ERR_MSG(extack, "Unknown device type"); - return -EOPNOTSUPP; } - return rtnl_newlink_create(skb, ifm, ops, nlh, tb, data, extack); -} - -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct rtnl_newlink_tbs *tbs; - int ret; - - tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); - if (!tbs) - return -ENOMEM; + rtnl_nets_lock(&rtnl_nets); + ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack); + rtnl_nets_unlock(&rtnl_nets); - ret = __rtnl_newlink(skb, nlh, tbs, extack); +put_net: + rtnl_nets_destroy(&rtnl_nets); +put_ops: + if (ops) + rtnl_link_ops_put(ops, ops_srcu_index); +free: kfree(tbs); return ret; } @@ -3647,7 +4090,8 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, struct ifinfomsg *ifm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for get link"); return -EINVAL; } @@ -3656,7 +4100,6 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); @@ -3732,10 +4175,18 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; err = -ENOBUFS; - nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); + nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask)); if (nskb == NULL) goto out; + /* Synchronize the carrier state so we don't report a state + * that we're not actually going to honour immediately; if + * the driver just did a carrier off->on transition, we can + * only TX if link watch work has run, but without this we'd + * already report carrier on, even if it doesn't work yet. 
+ */ + linkwatch_sync_dev(dev); + err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, @@ -3855,22 +4306,28 @@ static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); } -static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) +static noinline_for_stack u32 rtnl_calcit(struct sk_buff *skb, + struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); size_t min_ifinfo_dump_size = 0; - struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; struct net_device *dev; - int hdrlen; + struct nlattr *nla; + int hdrlen, rem; /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); - if (nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { - if (tb[IFLA_EXT_MASK]) - ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return NLMSG_GOODSIZE; + + nla_for_each_attr_type(nla, IFLA_EXT_MASK, + nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), rem) { + if (nla_len(nla) == sizeof(u32)) + ext_filter_mask = nla_get_u32(nla); } if (!ext_filter_mask) @@ -3939,16 +4396,23 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex, u32 portid, u32 seq) + int new_ifindex, u32 portid, + const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; + u32 seq = 0; skb = nlmsg_new(if_nlmsg_size(dev, 0), flags); if (skb == NULL) goto errout; + if (nlmsg_report(nlh)) + seq = nlmsg_seq(nlh); + else + portid = 0; + err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, portid, seq, change, 0, 0, event, new_nsid, new_ifindex, -1, flags); @@ -3960,8 +4424,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, } return skb; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_LINK, err); + rtnl_set_sk_err(net, RTNLGRP_LINK, err); return NULL; } @@ -3984,7 +4447,7 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, return; skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, - new_ifindex, portid, nlmsg_seq(nlh)); + new_ifindex, portid, nlh); if (skb) rtmsg_ifinfo_send(skb, dev, flags, portid, nlh); } @@ -4025,7 +4488,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb, ndm->ndm_ifindex = dev->ifindex; ndm->ndm_state = ndm_state; - if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) + if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr)) goto nla_put_failure; if (vid) if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) @@ -4039,10 +4502,10 @@ nla_put_failure: return -EMSGSIZE; } -static inline size_t rtnl_fdb_nlmsg_size(void) +static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + - nla_total_size(ETH_ALEN) + /* NDA_LLADDR */ + nla_total_size(dev->addr_len) + /* NDA_LLADDR */ nla_total_size(sizeof(u16)) + /* NDA_VLAN */ 0; } @@ -4054,7 +4517,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC); + skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC); if (!skb) goto errout; @@ -4186,9 +4649,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct 
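/*
 * Hedged sketch: rtnl_fdb_nlmsg_size() below sizes NDA_LLADDR from
 * dev->addr_len instead of hardcoding ETH_ALEN, so longer hardware
 * addresses (e.g. 20-byte InfiniBand) fit. The arithmetic is just header
 * plus payload rounded to 4-byte attribute alignment; restated in
 * userspace terms (NLA_ALIGN/NLA_HDRLEN come from the uapi header on a
 * Linux system, nla_total_size() is reimplemented here for the demo):
 */
#include <linux/netlink.h>
#include <stdio.h>

/* kernel's nla_total_size(): attr header + payload, NLA_ALIGNTO-rounded */
static int nla_total_size(int payload)
{
    return NLA_ALIGN(NLA_HDRLEN + payload);
}

int main(void)
{
    /* ETH_ALEN is 6; 20 bytes shown for an InfiniBand-sized address */
    printf("eth lladdr attr: %d bytes\n", nla_total_size(6));
    printf("ib  lladdr attr: %d bytes\n", nla_total_size(20));
    return 0;
}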
nlmsghdr *nlh, netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; + bool notified = false; err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, - nlh->nlmsg_flags, extack); + nlh->nlmsg_flags, ¬ified, extack); if (err) goto out; else @@ -4197,16 +4661,18 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF)) { + bool notified = false; + if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, - extack); + ¬ified, extack); else err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); - if (!err) { + if (!err && !notified) { rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; @@ -4243,13 +4709,6 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, } EXPORT_SYMBOL(ndo_dflt_fdb_del); -static const struct nla_policy fdb_del_bulk_policy[NDA_MAX + 1] = { - [NDA_VLAN] = { .type = NLA_U16 }, - [NDA_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), - [NDA_NDM_STATE_MASK] = { .type = NLA_U16 }, - [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 }, -}; - static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -4263,15 +4722,14 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, int err; u16 vid; - if (!netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - if (!del_bulk) { err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); } else { - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, - fdb_del_bulk_policy, extack); + /* For bulk delete, the drivers will parse the message with + * policy. + */ + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); } if (err < 0) return err; @@ -4294,6 +4752,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); + + err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); + if (err) + return err; } if (dev->type != ARPHRD_ETHER) { @@ -4301,25 +4763,22 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } - err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); - if (err) - return err; - err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); + bool notified = false; ops = br_dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, + ¬ified, extack); } else { if (ops->ndo_fdb_del_bulk) - err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid, - extack); + err = ops->ndo_fdb_del_bulk(nlh, dev, extack); } if (err) @@ -4330,22 +4789,24 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { + bool notified = false; + ops = dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, + ¬ified, extack); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); } else { /* in case err was cleared by NTF_MASTER call */ err = -EOPNOTSUPP; if (ops->ndo_fdb_del_bulk) - err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid, - extack); + err = 
ops->ndo_fdb_del_bulk(nlh, dev, extack); } if (!err) { - if (!del_bulk) + if (!del_bulk && !notified) rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; @@ -4361,15 +4822,16 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, int *idx, struct netdev_hw_addr_list *list) { + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct netdev_hw_addr *ha; - int err; u32 portid, seq; + int err; portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { - if (*idx < cb->args[2]) + if (*idx < ctx->fdb_idx) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, @@ -4425,12 +4887,12 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); @@ -4508,18 +4970,16 @@ static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { - struct net_device *dev; - struct net_device *br_dev = NULL; - const struct net_device_ops *ops = NULL; - const struct net_device_ops *cops = NULL; + const struct net_device_ops *ops = NULL, *cops = NULL; + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; + struct net_device *dev, *br_dev = NULL; struct net *net = sock_net(skb->sk); - struct hlist_head *head; int brport_idx = 0; int br_idx = 0; - int h, s_h; - int idx = 0, s_idx; - int err = 0; int fidx = 0; + int err; + + NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context); if (cb->strict_check) err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, @@ -4538,70 +4998,51 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } - s_h = cb->args[0]; - s_idx = cb->args[1]; - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - - if (brport_idx && (dev->ifindex != brport_idx)) - continue; - - if (!br_idx) { /* user did not specify a specific bridge */ - if (netif_is_bridge_port(dev)) { - br_dev = netdev_master_upper_dev_get(dev); - cops = br_dev->netdev_ops; - } - } else { - if (dev != br_dev && - !netif_is_bridge_port(dev)) - continue; + for_each_netdev_dump(net, dev, ctx->ifindex) { + if (brport_idx && (dev->ifindex != brport_idx)) + continue; - if (br_dev != netdev_master_upper_dev_get(dev) && - !netif_is_bridge_master(dev)) - continue; - cops = ops; + if (!br_idx) { /* user did not specify a specific bridge */ + if (netif_is_bridge_port(dev)) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; } + } else { + if (dev != br_dev && + !netif_is_bridge_port(dev)) + continue; - if (idx < s_idx) - goto cont; + if (br_dev != netdev_master_upper_dev_get(dev) && + !netif_is_bridge_master(dev)) + continue; + cops = ops; + } - if (netif_is_bridge_port(dev)) { - if (cops && cops->ndo_fdb_dump) { - err = cops->ndo_fdb_dump(skb, cb, - br_dev, dev, - &fidx); - if (err == -EMSGSIZE) - goto out; - } + if (netif_is_bridge_port(dev)) { + if (cops && cops->ndo_fdb_dump) { + err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, + &fidx); + if (err == -EMSGSIZE) + break; } + } - if 
(dev->netdev_ops->ndo_fdb_dump) - err = dev->netdev_ops->ndo_fdb_dump(skb, cb, - dev, NULL, - &fidx); - else - err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, - &fidx); - if (err == -EMSGSIZE) - goto out; + if (dev->netdev_ops->ndo_fdb_dump) + err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, + &fidx); + else + err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx); + if (err == -EMSGSIZE) + break; - cops = NULL; + cops = NULL; - /* reset fdb offset to 0 for rest of the interfaces */ - cb->args[2] = 0; - fidx = 0; -cont: - idx++; - } + /* reset fdb offset to 0 for rest of the interfaces */ + ctx->fdb_idx = 0; + fidx = 0; } -out: - cb->args[0] = h; - cb->args[1] = idx; - cb->args[2] = fidx; + ctx->fdb_idx = fidx; return skb->len; } @@ -4614,12 +5055,12 @@ static int valid_fdb_get_strict(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); @@ -4798,7 +5239,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; - ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_flags = netif_get_flags(dev); ifm->ifi_change = 0; @@ -4886,12 +5327,12 @@ static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, if (strict_check) { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); @@ -5047,10 +5488,9 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; - struct nlattr *br_spec, *attr = NULL; + struct nlattr *br_spec, *attr, *br_flags_attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; - bool have_flags = false; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; @@ -5068,13 +5508,17 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { + if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; - have_flags = true; + br_flags_attr = attr; flags = nla_get_u16(attr); - break; + } + + if (nla_type(attr) == IFLA_BRIDGE_MODE) { + if (nla_len(attr) < sizeof(u16)) + return -EINVAL; } } } @@ -5112,8 +5556,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, } } - if (have_flags) - memcpy(nla_data(attr), &flags, sizeof(flags)); + if (br_flags_attr) + memcpy(nla_data(br_flags_attr), &flags, sizeof(flags)); out: return err; } @@ -5144,15 +5588,14 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { - nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { - if (nla_len(attr) < 
sizeof(flags)) - return -EINVAL; + nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec, + rem) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; - have_flags = true; - flags = nla_get_u16(attr); - break; - } + have_flags = true; + flags = nla_get_u16(attr); + break; } } @@ -5411,13 +5854,11 @@ static unsigned int rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev, enum netdev_offload_xstats_type type) { - bool enabled = netdev_offload_xstats_enabled(dev, type); - return nla_total_size(0) + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */ nla_total_size(sizeof(u8)) + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */ - (enabled ? nla_total_size(sizeof(u8)) : 0) + + nla_total_size(sizeof(u8)) + 0; } @@ -5783,7 +6224,8 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, { struct if_stats_msg *ifsm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { + ifsm = nlmsg_payload(nlh, sizeof(*ifsm)); + if (!ifsm) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } @@ -5791,8 +6233,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, if (!strict_check) return 0; - ifsm = nlmsg_data(nlh); - /* only requests using strict checks can pass data to influence * the dump. The legacy exception is filter_mask. */ @@ -5863,19 +6303,17 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; - int h, s_h, err, s_idx, s_idxattr, s_prividx; struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; - struct hlist_head *head; + struct { + unsigned long ifindex; + int idxattr; + int prividx; + } *ctx = (void *)cb->ctx; struct net_device *dev; - int idx = 0; - - s_h = cb->args[0]; - s_idx = cb->args[1]; - s_idxattr = cb->args[2]; - s_prividx = cb->args[3]; + int err; cb->seq = net->dev_base_seq; @@ -5894,39 +6332,26 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) if (err) return err; - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - flags, &filters, - &s_idxattr, &s_prividx, - extack); - /* If we ran out of room on the first message, - * we're in trouble - */ - WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + for_each_netdev_dump(net, dev, ctx->ifindex) { + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + flags, &filters, + &ctx->idxattr, &ctx->prividx, + extack); + /* If we ran out of room on the first message, + * we're in trouble. 
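
All the request validators touched in this patch (rtnl_valid_getlink_req, valid_fdb_dump_strict, valid_bridge_getlink_req, rtnl_valid_stats_req above) converge on the same nlmsg_payload() idiom: the helper verifies that nlmsg_len covers the fixed-size header and returns the payload pointer, or NULL when the message is too short, folding the old nlmsg_msg_size() check plus nlmsg_data() call into one step. A minimal sketch of the idiom; struct foo_req and foo_valid_req() are made-up names, not part of the patch:

static int foo_valid_req(const struct nlmsghdr *nlh,
                         struct netlink_ext_ack *extack)
{
        struct foo_req *req;

        /* NULL means the message cannot hold a struct foo_req */
        req = nlmsg_payload(nlh, sizeof(*req));
        if (!req) {
                NL_SET_ERR_MSG(extack, "Invalid header");
                return -EINVAL;
        }

        /* strict checking: reserved fields must be zero */
        if (req->pad)
                return -EINVAL;

        return 0;
}
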
+ */ + WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); - if (err < 0) - goto out; - s_prividx = 0; - s_idxattr = 0; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } + if (err < 0) + break; + ctx->prividx = 0; + ctx->idxattr = 0; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } -out: - cb->args[3] = s_prividx; - cb->args[2] = s_idxattr; - cb->args[1] = idx; - cb->args[0] = h; - return skb->len; + return err; } void rtnl_offload_xstats_notify(struct net_device *dev) @@ -6030,8 +6455,407 @@ static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } +static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct br_port_msg *bpm; + + bpm = nlmsg_payload(nlh, sizeof(*bpm)); + if (!bpm) { + NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request"); + return -EINVAL; + } + + if (bpm->ifindex) { + NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request"); + return -EINVAL; + } + if (nlmsg_attrlen(nlh, sizeof(*bpm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request"); + return -EINVAL; + } + + return 0; +} + +struct rtnl_mdb_dump_ctx { + long idx; +}; + +static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rtnl_mdb_dump_ctx *ctx = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int idx, s_idx; + int err; + + NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx); + + if (cb->strict_check) { + err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack); + if (err) + return err; + } + + s_idx = ctx->idx; + idx = 0; + + for_each_netdev(net, dev) { + if (idx < s_idx) + goto skip; + if (!dev->netdev_ops->ndo_mdb_dump) + goto skip; + + err = dev->netdev_ops->ndo_mdb_dump(dev, skb, cb); + if (err == -EMSGSIZE) + goto out; + /* Moving on to next device, reset markers and sequence + * counters since they are all maintained per-device. 
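
The dump handlers converted in this series (rtnl_stats_dump above, rtnl_fdb_dump and rtnl_mdb_dump as well) share one idiom: rather than scattering resume state across the opaque cb->args[] slots, each handler overlays a named context struct on cb->ctx and lets NL_ASSERT_CTX_FITS() prove at build time that the struct fits. A hedged sketch of the idiom; struct foo_dump_ctx and foo_fill_one() are illustrative names only:

struct foo_dump_ctx {
        unsigned long ifindex;  /* resume point across recvmsg() calls */
};

static int foo_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct foo_dump_ctx *ctx = (void *)cb->ctx;
        struct net *net = sock_net(skb->sk);
        struct net_device *dev;

        NL_ASSERT_CTX_FITS(struct foo_dump_ctx);

        /* for_each_netdev_dump() restarts at ctx->ifindex and keeps it
         * updated, replacing the manual hash-bucket walk and counters.
         */
        for_each_netdev_dump(net, dev, ctx->ifindex) {
                if (foo_fill_one(skb, dev) == -EMSGSIZE)
                        break;  /* resume from ctx->ifindex next time */
        }

        return skb->len;
}
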
+ */ + memset(cb->ctx, 0, sizeof(cb->ctx)); + cb->prev_seq = 0; + cb->seq = 0; +skip: + idx++; + } + +out: + ctx->idx = idx; + return skb->len; +} + +static int rtnl_validate_mdb_entry_get(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct br_mdb_entry *entry = nla_data(attr); + + if (nla_len(attr) != sizeof(struct br_mdb_entry)) { + NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); + return -EINVAL; + } + + if (entry->ifindex) { + NL_SET_ERR_MSG(extack, "Entry ifindex cannot be specified"); + return -EINVAL; + } + + if (entry->state) { + NL_SET_ERR_MSG(extack, "Entry state cannot be specified"); + return -EINVAL; + } + + if (entry->flags) { + NL_SET_ERR_MSG(extack, "Entry flags cannot be specified"); + return -EINVAL; + } + + if (entry->vid >= VLAN_VID_MASK) { + NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); + return -EINVAL; + } + + if (entry->addr.proto != htons(ETH_P_IP) && + entry->addr.proto != htons(ETH_P_IPV6) && + entry->addr.proto != 0) { + NL_SET_ERR_MSG(extack, "Unknown entry protocol"); + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy mdba_get_policy[MDBA_GET_ENTRY_MAX + 1] = { + [MDBA_GET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + rtnl_validate_mdb_entry_get, + sizeof(struct br_mdb_entry)), + [MDBA_GET_ENTRY_ATTRS] = { .type = NLA_NESTED }, +}; + +static int rtnl_mdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[MDBA_GET_ENTRY_MAX + 1]; + struct net *net = sock_net(in_skb->sk); + struct br_port_msg *bpm; + struct net_device *dev; + int err; + + err = nlmsg_parse(nlh, sizeof(struct br_port_msg), tb, + MDBA_GET_ENTRY_MAX, mdba_get_policy, extack); + if (err) + return err; + + bpm = nlmsg_data(nlh); + if (!bpm->ifindex) { + NL_SET_ERR_MSG(extack, "Invalid ifindex"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, bpm->ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "Device doesn't exist"); + return -ENODEV; + } + + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_GET_ENTRY)) { + NL_SET_ERR_MSG(extack, "Missing MDBA_GET_ENTRY attribute"); + return -EINVAL; + } + + if (!dev->netdev_ops->ndo_mdb_get) { + NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); + return -EOPNOTSUPP; + } + + return dev->netdev_ops->ndo_mdb_get(dev, tb, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, extack); +} + +static int rtnl_validate_mdb_entry(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct br_mdb_entry *entry = nla_data(attr); + + if (nla_len(attr) != sizeof(struct br_mdb_entry)) { + NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); + return -EINVAL; + } + + if (entry->ifindex == 0) { + NL_SET_ERR_MSG(extack, "Zero entry ifindex is not allowed"); + return -EINVAL; + } + + if (entry->addr.proto == htons(ETH_P_IP)) { + if (!ipv4_is_multicast(entry->addr.u.ip4) && + !ipv4_is_zeronet(entry->addr.u.ip4)) { + NL_SET_ERR_MSG(extack, "IPv4 entry group address is not multicast or 0.0.0.0"); + return -EINVAL; + } + if (ipv4_is_local_multicast(entry->addr.u.ip4)) { + NL_SET_ERR_MSG(extack, "IPv4 entry group address is local multicast"); + return -EINVAL; + } +#if IS_ENABLED(CONFIG_IPV6) + } else if (entry->addr.proto == htons(ETH_P_IPV6)) { + if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) { + NL_SET_ERR_MSG(extack, "IPv6 entry group address is link-local all nodes"); + return -EINVAL; + } +#endif + } else if (entry->addr.proto == 0) { + /* L2 mdb */ + if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) { + NL_SET_ERR_MSG(extack, 
"L2 entry group is not multicast"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(extack, "Unknown entry protocol"); + return -EINVAL; + } + + if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { + NL_SET_ERR_MSG(extack, "Unknown entry state"); + return -EINVAL; + } + if (entry->vid >= VLAN_VID_MASK) { + NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy mdba_policy[MDBA_SET_ENTRY_MAX + 1] = { + [MDBA_SET_ENTRY_UNSPEC] = { .strict_start_type = MDBA_SET_ENTRY_ATTRS + 1 }, + [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + rtnl_validate_mdb_entry, + sizeof(struct br_mdb_entry)), + [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, +}; + +static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct br_port_msg *bpm; + struct net_device *dev; + int err; + + err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, + MDBA_SET_ENTRY_MAX, mdba_policy, extack); + if (err) + return err; + + bpm = nlmsg_data(nlh); + if (!bpm->ifindex) { + NL_SET_ERR_MSG(extack, "Invalid ifindex"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, bpm->ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "Device doesn't exist"); + return -ENODEV; + } + + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) { + NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute"); + return -EINVAL; + } + + if (!dev->netdev_ops->ndo_mdb_add) { + NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); + return -EOPNOTSUPP; + } + + return dev->netdev_ops->ndo_mdb_add(dev, tb, nlh->nlmsg_flags, extack); +} + +static int rtnl_validate_mdb_entry_del_bulk(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct br_mdb_entry *entry = nla_data(attr); + struct br_mdb_entry zero_entry = {}; + + if (nla_len(attr) != sizeof(struct br_mdb_entry)) { + NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); + return -EINVAL; + } + + if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { + NL_SET_ERR_MSG(extack, "Unknown entry state"); + return -EINVAL; + } + + if (entry->flags) { + NL_SET_ERR_MSG(extack, "Entry flags cannot be set"); + return -EINVAL; + } + + if (entry->vid >= VLAN_N_VID - 1) { + NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); + return -EINVAL; + } + + if (memcmp(&entry->addr, &zero_entry.addr, sizeof(entry->addr))) { + NL_SET_ERR_MSG(extack, "Entry address cannot be set"); + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy mdba_del_bulk_policy[MDBA_SET_ENTRY_MAX + 1] = { + [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + rtnl_validate_mdb_entry_del_bulk, + sizeof(struct br_mdb_entry)), + [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, +}; + +static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); + struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct br_port_msg *bpm; + struct net_device *dev; + int err; + + if (!del_bulk) + err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, + MDBA_SET_ENTRY_MAX, mdba_policy, + extack); + else + err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, + mdba_del_bulk_policy, extack); + if (err) + return err; + + bpm = nlmsg_data(nlh); + if (!bpm->ifindex) { + NL_SET_ERR_MSG(extack, "Invalid ifindex"); + return -EINVAL; + } + + dev = 
__dev_get_by_index(net, bpm->ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "Device doesn't exist"); + return -ENODEV; + } + + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) { + NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute"); + return -EINVAL; + } + + if (del_bulk) { + if (!dev->netdev_ops->ndo_mdb_del_bulk) { + NL_SET_ERR_MSG(extack, "Device does not support MDB bulk deletion"); + return -EOPNOTSUPP; + } + return dev->netdev_ops->ndo_mdb_del_bulk(dev, tb, extack); + } + + if (!dev->netdev_ops->ndo_mdb_del) { + NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); + return -EOPNOTSUPP; + } + + return dev->netdev_ops->ndo_mdb_del(dev, tb, extack); +} + /* Process one rtnetlink message. */ +static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED); + rtnl_dumpit_func dumpit = cb->data; + int err; + + /* The previous iteration has already finished; avoid calling ->dumpit() + * again, it may not expect to be called after it reached the end. + */ + if (!dumpit) + return 0; + + if (needs_lock) + rtnl_lock(); + err = dumpit(skb, cb); + if (needs_lock) + rtnl_unlock(); + + /* Old dump handlers used to send NLM_DONE in a separate recvmsg(). + * Some applications which parse netlink manually depend on this. + */ + if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) { + if (err < 0 && err != -EMSGSIZE) + return err; + if (!err) + cb->data = NULL; + + return skb->len; + } + return err; +} + +static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *control) +{ + if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE || + !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) { + WARN_ON(control->data); + control->data = control->dump; + control->dump = rtnl_dumpit; + } + + return netlink_dump_start(ssk, skb, nlh, control); +} + static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -6076,6 +6900,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, } owner = link->owner; dumpit = link->dumpit; + flags = link->flags; if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); @@ -6093,8 +6918,9 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, .dump = dumpit, .min_dump_alloc = min_dump_alloc, .module = owner, + .flags = flags, }; - err = netlink_dump_start(rtnl, skb, nlh, &c); + err = rtnetlink_dump_start(rtnl, skb, nlh, &c); /* netlink_dump_start() will keep a reference on * module if dump is still in progress.
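
The RTNL_FLAG_DUMP_SPLIT_NLM_DONE path above exists because, as the comment notes, some userspace parsers assume the NLMSG_DONE record arrives in a recvmsg() of its own rather than packed behind the last data message. Roughly the loop such parsers implement, as a userspace sketch (handle_msg() is an assumed application callback):

#include <linux/netlink.h>
#include <sys/socket.h>

static void drain_dump(int fd)
{
        char buf[32768];

        for (;;) {
                int len = recv(fd, buf, sizeof(buf), 0);
                struct nlmsghdr *nlh;

                if (len <= 0)
                        return;

                for (nlh = (struct nlmsghdr *)buf; NLMSG_OK(nlh, len);
                     nlh = NLMSG_NEXT(nlh, len)) {
                        /* a parser like this only terminates correctly if
                         * DONE is visible, whether in-band or standalone
                         */
                        if (nlh->nlmsg_type == NLMSG_DONE ||
                            nlh->nlmsg_type == NLMSG_ERROR)
                                return;
                        handle_msg(nlh);
                }
        }
}
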
*/ @@ -6209,7 +7035,6 @@ static int __net_init rtnetlink_net_init(struct net *net) struct netlink_kernel_cfg cfg = { .groups = RTNLGRP_MAX, .input = rtnetlink_rcv, - .cb_mutex = &rtnl_mutex, .flags = NL_CFG_F_NONROOT_RECV, .bind = rtnetlink_bind, }; @@ -6232,6 +7057,41 @@ static struct pernet_operations rtnetlink_net_ops = { .exit = rtnetlink_net_exit, }; +static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_DELLINK, .doit = rtnl_dellink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, + {.msgtype = RTM_GETLINK, .doit = rtnl_getlink, + .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, + {.msgtype = RTM_SETLINK, .doit = rtnl_setlink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, + {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETSTATS, .doit = rtnl_stats_get, + .dumpit = rtnl_stats_dump}, + {.msgtype = RTM_SETSTATS, .doit = rtnl_stats_set}, + {.msgtype = RTM_NEWLINKPROP, .doit = rtnl_newlinkprop}, + {.msgtype = RTM_DELLINKPROP, .doit = rtnl_dellinkprop}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETLINK, + .dumpit = rtnl_bridge_getlink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELLINK, + .doit = rtnl_bridge_dellink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_SETLINK, + .doit = rtnl_bridge_setlink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_NEWNEIGH, .doit = rtnl_fdb_add}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELNEIGH, .doit = rtnl_fdb_del, + .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETNEIGH, .doit = rtnl_fdb_get, + .dumpit = rtnl_fdb_dump}, + {.protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB, .doit = rtnl_mdb_add}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELMDB, .doit = rtnl_mdb_del, + .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETMDB, .doit = rtnl_mdb_get, + .dumpit = rtnl_mdb_dump}, +}; + void __init rtnetlink_init(void) { if (register_pernet_subsys(&rtnetlink_net_ops)) @@ -6239,29 +7099,5 @@ void __init rtnetlink_init(void) register_netdevice_notifier(&rtnetlink_dev_notifier); - rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, - rtnl_dump_ifinfo, 0); - rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); - - rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0); - rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); - rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); - - rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); - - rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, - RTNL_FLAG_BULK_DEL_SUPPORTED); - rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); - - rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); - rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0); - - rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, - 0); - rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0); + rtnl_register_many(rtnetlink_rtnl_msg_handlers); } diff --git a/net/core/scm.c 
b/net/core/scm.c index 5c356f0dee30..cd87f66671aa 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -23,9 +23,12 @@ #include <linux/security.h> #include <linux/pid_namespace.h> #include <linux/pid.h> +#include <uapi/linux/pidfd.h> +#include <linux/pidfs.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/errqueue.h> +#include <linux/io_uring.h> #include <linux/uaccess.h> @@ -35,6 +38,7 @@ #include <net/compat.h> #include <net/scm.h> #include <net/cls_cgroup.h> +#include <net/af_unix.h> /* @@ -84,8 +88,15 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -ENOMEM; *fplp = fpl; fpl->count = 0; + fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; +#if IS_ENABLED(CONFIG_UNIX) + fpl->inflight = false; + fpl->dead = false; + fpl->edges = NULL; + INIT_LIST_HEAD(&fpl->vertices); +#endif } fpp = &fpl->fp[fpl->count]; @@ -103,6 +114,14 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (fd < 0 || !(file = fget_raw(fd))) return -EBADF; + /* don't allow io_uring files */ + if (io_is_uring_fops(file)) { + fput(file); + return -EINVAL; + } + if (unix_get_socket(file)) + fpl->count_unix++; + *fpp++ = file; fpl->count++; } @@ -128,8 +147,25 @@ void __scm_destroy(struct scm_cookie *scm) } EXPORT_SYMBOL(__scm_destroy); +static inline int scm_replace_pid(struct scm_cookie *scm, struct pid *pid) +{ + int err; + + /* drop all previous references */ + scm_destroy_cred(scm); + + err = pidfs_register_pid(pid); + if (unlikely(err)) + return err; + + scm->pid = pid; + scm->creds.pid = pid_vnr(pid); + return 0; +} + int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { + const struct proto_ops *ops = READ_ONCE(sock->ops); struct cmsghdr *cmsg; int err; @@ -153,7 +189,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) switch (cmsg->cmsg_type) { case SCM_RIGHTS: - if (!sock->ops || sock->ops->family != PF_UNIX) + if (!ops || ops->family != PF_UNIX) goto error; err=scm_fp_copy(cmsg, &p->fp); if (err<0) @@ -171,15 +207,21 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) if (err) goto error; - p->creds.pid = creds.pid; if (!p->pid || pid_vnr(p->pid) != creds.pid) { struct pid *pid; err = -ESRCH; pid = find_get_pid(creds.pid); if (!pid) goto error; - put_pid(p->pid); - p->pid = pid; + + /* pass a struct pid reference from + * find_get_pid() to scm_replace_pid(). 
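
scm_fp_copy() above is the receiving half of the standard SCM_RIGHTS control message, and the new io_uring check closes the reference cycle that registered io_uring files could otherwise form with in-flight unix sockets. For reference, the sender-side layout that __scm_send() ends up parsing looks roughly like this (plain sendmsg() API usage, not code from this patch):

#include <string.h>
#include <sys/socket.h>

/* Pass one file descriptor across an AF_UNIX socket. */
static int send_fd(int sock, int fd)
{
        char dummy = 'x';
        struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
        union {
                char buf[CMSG_SPACE(sizeof(int))];
                struct cmsghdr align;
        } u;
        struct msghdr msg = {
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = u.buf,
                .msg_controllen = sizeof(u.buf),
        };
        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;   /* parsed by scm_fp_copy() */
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

        return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
}
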
+ */ + err = scm_replace_pid(p, pid); + if (err) { + put_pid(pid); + goto error; + } } err = -EINVAL; @@ -229,15 +271,15 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) if (msg->msg_control_is_user) { struct cmsghdr __user *cm = msg->msg_control_user; - if (!user_write_access_begin(cm, cmlen)) - goto efault; + check_object_size(data, cmlen - sizeof(*cm), true); - unsafe_put_user(cmlen, &cm->cmsg_len, efault_end); - unsafe_put_user(level, &cm->cmsg_level, efault_end); - unsafe_put_user(type, &cm->cmsg_type, efault_end); - unsafe_copy_to_user(CMSG_USER_DATA(cm), data, - cmlen - sizeof(*cm), efault_end); - user_write_access_end(); + scoped_user_write_access_size(cm, cmlen, efault) { + unsafe_put_user(cmlen, &cm->cmsg_len, efault); + unsafe_put_user(level, &cm->cmsg_level, efault); + unsafe_put_user(type, &cm->cmsg_type, efault); + unsafe_copy_to_user(CMSG_USER_DATA(cm), data, + cmlen - sizeof(*cm), efault); + } } else { struct cmsghdr *cm = msg->msg_control; @@ -248,17 +290,28 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) } cmlen = min(CMSG_SPACE(len), msg->msg_controllen); - msg->msg_control += cmlen; + if (msg->msg_control_is_user) + msg->msg_control_user += cmlen; + else + msg->msg_control += cmlen; msg->msg_controllen -= cmlen; return 0; -efault_end: - user_write_access_end(); efault: return -EFAULT; } EXPORT_SYMBOL(put_cmsg); +int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data) +{ + /* Don't produce truncated CMSGs */ + if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len)) + return -ETOOSMALL; + + return put_cmsg(msg, level, type, len, data); +} + void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) { struct scm_timestamping64 tss; @@ -297,7 +350,7 @@ static int scm_max_fds(struct msghdr *msg) void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) { struct cmsghdr __user *cm = - (__force struct cmsghdr __user *)msg->msg_control; + (__force struct cmsghdr __user *)msg->msg_control_user; unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? 
O_CLOEXEC : 0; int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count); int __user *cmsg_data = CMSG_USER_DATA(cm); @@ -313,7 +366,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) } for (i = 0; i < fdmax; i++) { - err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); + err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err < 0) break; } @@ -330,7 +383,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) cmlen = CMSG_SPACE(i * sizeof(int)); if (msg->msg_controllen < cmlen) cmlen = msg->msg_controllen; - msg->msg_control += cmlen; + msg->msg_control_user += cmlen; msg->msg_controllen -= cmlen; } } @@ -359,9 +412,137 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); + new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); +#if IS_ENABLED(CONFIG_UNIX) + new_fpl->inflight = false; + new_fpl->edges = NULL; + INIT_LIST_HEAD(&new_fpl->vertices); +#endif } return new_fpl; } EXPORT_SYMBOL(scm_fp_dup); + +#ifdef CONFIG_SECURITY_NETWORK +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) +{ + struct lsm_context ctx; + int err; + + if (sk->sk_scm_security) { + err = security_secid_to_secctx(scm->secid, &ctx); + + if (err >= 0) { + put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, + ctx.context); + + security_release_secctx(&ctx); + } + } +} + +static bool scm_has_secdata(struct sock *sk) +{ + return sk->sk_scm_security; +} +#else +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) +{ +} + +static bool scm_has_secdata(struct sock *sk) +{ + return false; +} +#endif + +static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) +{ + struct file *pidfd_file = NULL; + int len, pidfd; + + /* put_cmsg() doesn't return an error if CMSG is truncated, + * that's why we need to opencode these checks here. 
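
put_cmsg_notrunc() and the open-coded length check in scm_pidfd_recv() enforce the same contract from two directions: a control message that cannot be delivered whole is not delivered at all, and the receiver learns about it only through MSG_CTRUNC. The practical consequence for userspace is to reserve cmsg room up front; a hedged receiver sketch:

#include <sys/socket.h>

/* Receive one byte plus an int-sized cmsg (e.g. SCM_PIDFD).
 * Returns -1 if the kernel had to drop the control data.
 */
static int recv_checked(int sock)
{
        char data;
        struct iovec iov = { .iov_base = &data, .iov_len = 1 };
        union {
                char buf[CMSG_SPACE(sizeof(int))];
                struct cmsghdr align;
        } ctrl;
        struct msghdr msg = {
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = ctrl.buf,
                .msg_controllen = sizeof(ctrl.buf),
        };

        if (recvmsg(sock, &msg, 0) < 0)
                return -1;

        /* too little cmsg space: the pidfd was withheld, not truncated */
        if (msg.msg_flags & MSG_CTRUNC)
                return -1;

        return 0;
}
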
+ */ + if (msg->msg_flags & MSG_CMSG_COMPAT) + len = sizeof(struct compat_cmsghdr) + sizeof(int); + else + len = sizeof(struct cmsghdr) + sizeof(int); + + if (msg->msg_controllen < len) { + msg->msg_flags |= MSG_CTRUNC; + return; + } + + if (!scm->pid) + return; + + pidfd = pidfd_prepare(scm->pid, PIDFD_STALE, &pidfd_file); + + if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { + if (pidfd_file) { + put_unused_fd(pidfd); + fput(pidfd_file); + } + + return; + } + + if (pidfd_file) + fd_install(pidfd, pidfd_file); +} + +static bool __scm_recv_common(struct sock *sk, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!msg->msg_control) { + if (sk->sk_scm_credentials || sk->sk_scm_pidfd || + scm->fp || scm_has_secdata(sk)) + msg->msg_flags |= MSG_CTRUNC; + + scm_destroy(scm); + return false; + } + + if (sk->sk_scm_credentials) { + struct user_namespace *current_ns = current_user_ns(); + struct ucred ucreds = { + .pid = scm->creds.pid, + .uid = from_kuid_munged(current_ns, scm->creds.uid), + .gid = from_kgid_munged(current_ns, scm->creds.gid), + }; + + put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); + } + + scm_passec(sk, msg, scm); + + if (scm->fp) + scm_detach_fds(msg, scm); + + return true; +} + +void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock->sk, msg, scm, flags)) + return; + + scm_destroy_cred(scm); +} +EXPORT_SYMBOL(scm_recv); + +void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock->sk, msg, scm, flags)) + return; + + if (sock->sk->sk_scm_pidfd) + scm_pidfd_recv(msg, scm); + + scm_destroy_cred(scm); +} diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index b0ff6153be62..9a3965680451 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -71,7 +71,7 @@ u32 secure_tcpv6_ts_off(const struct net *net, return siphash(&combined, offsetofend(typeof(combined), daddr), &ts_secret); } -EXPORT_SYMBOL(secure_tcpv6_ts_off); +EXPORT_IPV6_MOD(secure_tcpv6_ts_off); u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport) @@ -156,45 +156,3 @@ u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif - -#if IS_ENABLED(CONFIG_IP_DCCP) -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) -{ - u64 seq; - net_secret_init(); - seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u32)sport << 16 | (__force u32)dport, - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccp_sequence_number); - -#if IS_ENABLED(CONFIG_IPV6) -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport) -{ - const struct { - struct in6_addr saddr; - struct in6_addr daddr; - __be16 sport; - __be16 dport; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .saddr = *(struct in6_addr *)saddr, - .daddr = *(struct in6_addr *)daddr, - .sport = sport, - .dport = dport - }; - u64 seq; - net_secret_init(); - seq = siphash(&combined, offsetofend(typeof(combined), dport), - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccpv6_sequence_number); -#endif -#endif diff --git a/net/core/selftests.c b/net/core/selftests.c index acb1ee97bbd3..8b81feb82c4a 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ 
-14,45 +14,10 @@ #include <net/tcp.h> #include <net/udp.h> -struct net_packet_attrs { - const unsigned char *src; - const unsigned char *dst; - u32 ip_src; - u32 ip_dst; - bool tcp; - u16 sport; - u16 dport; - int timeout; - int size; - int max_size; - u8 id; - u16 queue_mapping; -}; - -struct net_test_priv { - struct net_packet_attrs *packet; - struct packet_type pt; - struct completion comp; - int double_vlan; - int vlan_id; - int ok; -}; - -struct netsfhdr { - __be32 version; - __be64 magic; - u8 id; -} __packed; - static u8 net_test_next_id; -#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ - sizeof(struct netsfhdr)) -#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL -#define NET_LB_TIMEOUT msecs_to_jiffies(200) - -static struct sk_buff *net_test_get_skb(struct net_device *ndev, - struct net_packet_attrs *attr) +struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id, + struct net_packet_attrs *attr) { struct sk_buff *skb = NULL; struct udphdr *uhdr = NULL; @@ -100,10 +65,10 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, ehdr->h_proto = htons(ETH_P_IP); if (attr->tcp) { + memset(thdr, 0, sizeof(*thdr)); thdr->source = htons(attr->sport); thdr->dest = htons(attr->dport); thdr->doff = sizeof(struct tcphdr) / 4; - thdr->check = 0; } else { uhdr->source = htons(attr->sport); uhdr->dest = htons(attr->dport); @@ -141,21 +106,44 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, shdr = skb_put(skb, sizeof(*shdr)); shdr->version = 0; shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC); - attr->id = net_test_next_id; - shdr->id = net_test_next_id++; + attr->id = id; + shdr->id = id; + + if (attr->size) { + void *payload = skb_put(skb, attr->size); + + memset(payload, 0, attr->size); + } + + if (attr->max_size && attr->max_size > skb->len) { + size_t pad_len = attr->max_size - skb->len; + void *pad = skb_put(skb, pad_len); - if (attr->size) - skb_put(skb, attr->size); - if (attr->max_size && attr->max_size > skb->len) - skb_put(skb, attr->max_size - skb->len); + memset(pad, 0, pad_len); + } skb->csum = 0; skb->ip_summed = CHECKSUM_PARTIAL; if (attr->tcp) { - thdr->check = ~tcp_v4_check(skb->len, ihdr->saddr, - ihdr->daddr, 0); + int l4len = skb->len - skb_transport_offset(skb); + + thdr->check = ~tcp_v4_check(l4len, ihdr->saddr, ihdr->daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); + + if (attr->bad_csum) { + /* Force mangled checksum */ + if (skb_checksum_help(skb)) { + kfree_skb(skb); + return NULL; + } + + if (thdr->check != CSUM_MANGLED_0) + thdr->check = CSUM_MANGLED_0; + else + thdr->check = csum16_sub(thdr->check, + cpu_to_be16(1)); + } } else { udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr); } @@ -166,6 +154,7 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, return skb; } +EXPORT_SYMBOL_GPL(net_test_get_skb); static int net_test_loopback_validate(struct sk_buff *skb, struct net_device *ndev, @@ -230,7 +219,11 @@ static int net_test_loopback_validate(struct sk_buff *skb, if (tpriv->packet->id != shdr->id) goto out; - tpriv->ok = true; + if (tpriv->packet->bad_csum && skb->ip_summed == CHECKSUM_UNNECESSARY) + tpriv->ok = -EIO; + else + tpriv->ok = true; + complete(&tpriv->comp); out: kfree_skb(skb); @@ -258,12 +251,13 @@ static int __net_test_loopback(struct net_device *ndev, tpriv->packet = attr; dev_add_pack(&tpriv->pt); - skb = net_test_get_skb(ndev, attr); + skb = net_test_get_skb(ndev, net_test_next_id, attr); if (!skb) { ret = 
-ENOMEM; goto cleanup; } + net_test_next_id++; ret = dev_direct_xmit(skb, attr->queue_mapping); if (ret < 0) { goto cleanup; @@ -276,7 +270,12 @@ static int __net_test_loopback(struct net_device *ndev, attr->timeout = NET_LB_TIMEOUT; wait_for_completion_timeout(&tpriv->comp, attr->timeout); - ret = tpriv->ok ? 0 : -ETIMEDOUT; + if (tpriv->ok < 0) + ret = tpriv->ok; + else if (!tpriv->ok) + ret = -ETIMEDOUT; + else + ret = 0; cleanup: dev_remove_pack(&tpriv->pt); @@ -299,7 +298,7 @@ static int net_test_phy_loopback_enable(struct net_device *ndev) if (!ndev->phydev) return -EOPNOTSUPP; - return phy_loopback(ndev->phydev, true); + return phy_loopback(ndev->phydev, true, 0); } static int net_test_phy_loopback_disable(struct net_device *ndev) @@ -307,7 +306,7 @@ static int net_test_phy_loopback_disable(struct net_device *ndev) if (!ndev->phydev) return -EOPNOTSUPP; - return phy_loopback(ndev->phydev, false); + return phy_loopback(ndev->phydev, false, 0); } static int net_test_phy_loopback_udp(struct net_device *ndev) @@ -336,6 +335,42 @@ static int net_test_phy_loopback_tcp(struct net_device *ndev) return __net_test_loopback(ndev, &attr); } +/** + * net_test_phy_loopback_tcp_bad_csum - PHY loopback test with a deliberately + * corrupted TCP checksum + * @ndev: the network device to test + * + * Builds the same minimal Ethernet/IPv4/TCP frame as + * net_test_phy_loopback_tcp(), then flips the least-significant bit of the TCP + * checksum so the resulting value is provably invalid (neither 0 nor 0xFFFF). + * The frame is transmitted through the device’s internal PHY loopback path: + * + * test code -> MAC driver -> MAC HW -> xMII -> PHY -> + * internal PHY loopback -> xMII -> MAC HW -> MAC driver -> test code + * + * Result interpretation + * --------------------- + * 0 The frame is delivered to the stack and the driver reports + * ip_summed as CHECKSUM_NONE or CHECKSUM_COMPLETE - both are + * valid ways to indicate “bad checksum, let the stack verify.” + * -ETIMEDOUT The MAC/PHY silently dropped the frame; hardware checksum + * verification filtered it out before the driver saw it. + * -EIO The driver returned the frame with ip_summed == + * CHECKSUM_UNNECESSARY, falsely claiming a valid checksum and + * indicating a serious RX-path defect. + * + * Return: 0 on success or a negative error code on failure. + */ +static int net_test_phy_loopback_tcp_bad_csum(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.tcp = true; + attr.bad_csum = true; + return __net_test_loopback(ndev, &attr); +} + static const struct net_test { char name[ETH_GSTRING_LEN]; int (*fn)(struct net_device *ndev); @@ -360,6 +395,9 @@ static const struct net_test { .name = "PHY internal loopback, TCP ", .fn = net_test_phy_loopback_tcp, }, { + .name = "PHY loopback, bad TCP csum ", + .fn = net_test_phy_loopback_tcp_bad_csum, + }, { /* This test should be done after all PHY loopback test */ .name = "PHY internal loopback, disable", .fn = net_test_phy_loopback_disable, @@ -397,16 +435,14 @@ EXPORT_SYMBOL_GPL(net_selftest_get_count); void net_selftest_get_strings(u8 *data) { - u8 *p = data; int i; - for (i = 0; i < net_selftest_get_count(); i++) { - snprintf(p, ETH_GSTRING_LEN, "%2d. %s", i + 1, - net_selftests[i].name); - p += ETH_GSTRING_LEN; - } + for (i = 0; i < net_selftest_get_count(); i++) + ethtool_sprintf(&data, "%2d. 
%s", i + 1, + net_selftests[i].name); } EXPORT_SYMBOL_GPL(net_selftest_get_strings); +MODULE_DESCRIPTION("Common library for generic PHY ethtool selftests"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Oleksij Rempel <o.rempel@pengutronix.de>"); diff --git a/net/core/skb_fault_injection.c b/net/core/skb_fault_injection.c new file mode 100644 index 000000000000..4235db6bdfad --- /dev/null +++ b/net/core/skb_fault_injection.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/debugfs.h> +#include <linux/fault-inject.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> + +static struct { + struct fault_attr attr; + char devname[IFNAMSIZ]; + bool filtered; +} skb_realloc = { + .attr = FAULT_ATTR_INITIALIZER, + .filtered = false, +}; + +static bool should_fail_net_realloc_skb(struct sk_buff *skb) +{ + struct net_device *net = skb->dev; + + if (skb_realloc.filtered && + strncmp(net->name, skb_realloc.devname, IFNAMSIZ)) + /* device name filter set, but names do not match */ + return false; + + if (!should_fail(&skb_realloc.attr, 1)) + return false; + + return true; +} +ALLOW_ERROR_INJECTION(should_fail_net_realloc_skb, TRUE); + +void skb_might_realloc(struct sk_buff *skb) +{ + if (!should_fail_net_realloc_skb(skb)) + return; + + pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_might_realloc); + +static int __init fail_skb_realloc_setup(char *str) +{ + return setup_fault_attr(&skb_realloc.attr, str); +} +__setup("fail_skb_realloc=", fail_skb_realloc_setup); + +static void reset_settings(void) +{ + skb_realloc.filtered = false; + memset(&skb_realloc.devname, 0, IFNAMSIZ); +} + +static ssize_t devname_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + ssize_t ret; + + reset_settings(); + ret = simple_write_to_buffer(&skb_realloc.devname, IFNAMSIZ, + ppos, buffer, count); + if (ret < 0) + return ret; + + skb_realloc.devname[IFNAMSIZ - 1] = '\0'; + /* Remove a possible \n at the end of devname */ + strim(skb_realloc.devname); + + if (strnlen(skb_realloc.devname, IFNAMSIZ)) + skb_realloc.filtered = true; + + return count; +} + +static ssize_t devname_read(struct file *file, + char __user *buffer, + size_t size, loff_t *ppos) +{ + if (!skb_realloc.filtered) + return 0; + + return simple_read_from_buffer(buffer, size, ppos, &skb_realloc.devname, + strlen(skb_realloc.devname)); +} + +static const struct file_operations devname_ops = { + .write = devname_write, + .read = devname_read, +}; + +static int __init fail_skb_realloc_debugfs(void) +{ + umode_t mode = S_IFREG | 0600; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_skb_realloc", NULL, + &skb_realloc.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + debugfs_create_file("devname", mode, dir, NULL, &devname_ops); + + return 0; +} + +late_initcall(fail_skb_realloc_debugfs); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4a0eb5593275..a00808f7be6a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -51,6 +51,7 @@ #endif #include <linux/string.h> #include <linux/skbuff.h> +#include <linux/skbuff_ref.h> #include <linux/splice.h> #include <linux/cache.h> #include <linux/rtnetlink.h> @@ -58,20 +59,29 @@ #include <linux/scatterlist.h> #include <linux/errqueue.h> #include <linux/prefetch.h> +#include <linux/bitfield.h> #include <linux/if_vlan.h> #include <linux/mpls.h> #include <linux/kcov.h> +#include <linux/iov_iter.h> +#include <linux/crc32.h> #include <net/protocol.h> #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> 
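
The fail_skb_realloc injector introduced above plugs into the generic fault-injection framework, so beyond the new devname filter it is configured through the standard fault_attr debugfs files (probability, times, interval, and so on). A hedged test-harness sketch, assuming debugfs is mounted at /sys/kernel/debug:

#include <stdio.h>

/* Write one value into a fail_skb_realloc debugfs attribute. */
static int fault_set(const char *attr, const char *val)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/kernel/debug/fail_skb_realloc/%s", attr);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%s\n", val);
        return fclose(f);
}

int main(void)
{
        fault_set("devname", "eth0");     /* only inject on eth0 */
        fault_set("probability", "100");  /* fail every eligible skb */
        fault_set("times", "-1");         /* never stop injecting */
        return 0;
}
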
+#include <net/gro.h> +#include <net/gso.h> +#include <net/hotdata.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/mpls.h> #include <net/mptcp.h> #include <net/mctp.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> +#include <net/psp/types.h> +#include <net/dropreason.h> +#include <net/xdp_sock.h> #include <linux/uaccess.h> #include <trace/events/skb.h> @@ -79,25 +89,107 @@ #include <linux/capability.h> #include <linux/user_namespace.h> #include <linux/indirect_call_wrapper.h> +#include <linux/textsearch.h> #include "dev.h" +#include "devmem.h" +#include "netmem_priv.h" #include "sock_destructor.h" -struct kmem_cache *skbuff_head_cache __ro_after_init; -static struct kmem_cache *skbuff_fclone_cache __ro_after_init; #ifdef CONFIG_SKB_EXTENSIONS static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif -int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; -EXPORT_SYMBOL(sysctl_max_skb_frags); + +#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN) +#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \ + GRO_MAX_HEAD_PAD)) + +/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. + * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique + * size, and we can differentiate heads from skb_small_head_cache + * vs system slabs by looking at their size (skb_end_offset()). + */ +#define SKB_SMALL_HEAD_CACHE_SIZE \ + (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \ + (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \ + SKB_SMALL_HEAD_SIZE) + +#define SKB_SMALL_HEAD_HEADROOM \ + SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) + +/* kcm_write_msgs() relies on casting paged frags to bio_vec to use + * iov_iter_bvec(). These static asserts ensure the cast is valid as long as the + * netmem is a page.
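
The is_power_of_2() bump above can be sanity-checked with a tiny worked example: if SKB_HEAD_ALIGN() happened to round the small head size to exactly 1024, adding L1_CACHE_BYTES (64 on most x86) yields 1088, which no power-of-two slab size can match, so skb_end_offset() identifies small-head skbs unambiguously. An illustration only, with both sizes assumed rather than taken from a real config:

#include <assert.h>
#include <stdbool.h>

static bool is_pow2(unsigned int x)
{
        return x && !(x & (x - 1));
}

int main(void)
{
        unsigned int small_head = 1024; /* assumed SKB_HEAD_ALIGN() result */
        unsigned int cache_line = 64;   /* assumed L1_CACHE_BYTES */
        unsigned int cache_size;

        cache_size = is_pow2(small_head) ? small_head + cache_line
                                         : small_head;

        /* 1088 is not a power of two, hence a unique, recognizable size */
        assert(!is_pow2(cache_size));
        return 0;
}
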
+ */ +static_assert(offsetof(struct bio_vec, bv_page) == + offsetof(skb_frag_t, netmem)); +static_assert(sizeof_field(struct bio_vec, bv_page) == + sizeof_field(skb_frag_t, netmem)); + +static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len)); +static_assert(sizeof_field(struct bio_vec, bv_len) == + sizeof_field(skb_frag_t, len)); + +static_assert(offsetof(struct bio_vec, bv_offset) == + offsetof(skb_frag_t, offset)); +static_assert(sizeof_field(struct bio_vec, bv_offset) == + sizeof_field(skb_frag_t, offset)); #undef FN #define FN(reason) [SKB_DROP_REASON_##reason] = #reason, -const char * const drop_reasons[] = { +static const char * const drop_reasons[] = { [SKB_CONSUMED] = "CONSUMED", DEFINE_DROP_REASON(FN, FN) }; -EXPORT_SYMBOL(drop_reasons); + +static const struct drop_reason_list drop_reasons_core = { + .reasons = drop_reasons, + .n_reasons = ARRAY_SIZE(drop_reasons), +}; + +const struct drop_reason_list __rcu * +drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = { + [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core), +}; +EXPORT_SYMBOL(drop_reasons_by_subsys); + +/** + * drop_reasons_register_subsys - register another drop reason subsystem + * @subsys: the subsystem to register, must not be the core + * @list: the list of drop reasons within the subsystem, must point to + * a statically initialized list + */ +void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys, + const struct drop_reason_list *list) +{ + if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || + subsys >= ARRAY_SIZE(drop_reasons_by_subsys), + "invalid subsystem %d\n", subsys)) + return; + + /* must point to statically allocated memory, so INIT is OK */ + RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list); +} +EXPORT_SYMBOL_GPL(drop_reasons_register_subsys); + +/** + * drop_reasons_unregister_subsys - unregister a drop reason subsystem + * @subsys: the subsystem to remove, must not be the core + * + * Note: This will synchronize_rcu() to ensure no users when it returns. + */ +void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys) +{ + if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || + subsys >= ARRAY_SIZE(drop_reasons_by_subsys), + "invalid subsystem %d\n", subsys)) + return; + + RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys); /** * skb_panic - private function for out-of-line support @@ -131,101 +223,35 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) skb_panic(skb, sz, addr, __func__); } -#define NAPI_SKB_CACHE_SIZE 64 -#define NAPI_SKB_CACHE_BULK 16 -#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) - -#if PAGE_SIZE == SZ_4K - -#define NAPI_HAS_SMALL_PAGE_FRAG 1 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) - -/* specialized page frag allocator using a single order 0 page - * and slicing it into 1K sized fragment. 
Constrained to systems - * with a very limited amount of 1K fragments fitting a single - * page - to avoid excessive truesize underestimation - */ - -struct page_frag_1k { - void *va; - u16 offset; - bool pfmemalloc; -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) -{ - struct page *page; - int offset; - - offset = nc->offset - SZ_1K; - if (likely(offset >= 0)) - goto use_frag; - - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - if (!page) - return NULL; - - nc->va = page_address(page); - nc->pfmemalloc = page_is_pfmemalloc(page); - offset = PAGE_SIZE - SZ_1K; - page_ref_add(page, offset / SZ_1K); - -use_frag: - nc->offset = offset; - return nc->va + offset; -} -#else - -/* the small page is actually unused in this build; add dummy helpers - * to please the compiler and avoid later preprocessor's conditionals - */ -#define NAPI_HAS_SMALL_PAGE_FRAG 0 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false - -struct page_frag_1k { -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) -{ - return NULL; -} - -#endif +#define NAPI_SKB_CACHE_SIZE 128 +#define NAPI_SKB_CACHE_BULK 32 +#define NAPI_SKB_CACHE_FREE 32 struct napi_alloc_cache { + local_lock_t bh_lock; struct page_frag_cache page; - struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); - -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + void *data; fragsz = SKB_DATA_ALIGN(fragsz); - return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + data = __page_frag_alloc_align(&nc->page, fragsz, + GFP_ATOMIC | __GFP_NOWARN, align_mask); + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + return data; + } EXPORT_SYMBOL(__napi_alloc_frag_align); @@ -233,43 +259,114 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { void *data; - fragsz = SKB_DATA_ALIGN(fragsz); if (in_hardirq() || irqs_disabled()) { struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); + fragsz = SKB_DATA_ALIGN(fragsz); + data = __page_frag_alloc_align(nc, fragsz, + GFP_ATOMIC | __GFP_NOWARN, + align_mask); } else { - struct napi_alloc_cache *nc; - local_bh_disable(); - nc = this_cpu_ptr(&napi_alloc_cache); - data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + data = __napi_alloc_frag_align(fragsz, align_mask); local_bh_enable(); } return data; } EXPORT_SYMBOL(__netdev_alloc_frag_align); -static struct sk_buff *napi_skb_cache_get(void) +/* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler + * remove dead code (and 
skbuff_cache_size) when CONFIG_KASAN is unset.
+ */
+static u32 skbuff_cache_size __read_mostly;
+
+static struct sk_buff *napi_skb_cache_get(bool alloc)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 	struct sk_buff *skb;
 
+	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 	if (unlikely(!nc->skb_count)) {
-		nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
-						      GFP_ATOMIC,
-						      NAPI_SKB_CACHE_BULK,
-						      nc->skb_cache);
-		if (unlikely(!nc->skb_count))
+		if (alloc)
+			nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+							      GFP_ATOMIC | __GFP_NOWARN,
+							      NAPI_SKB_CACHE_BULK,
+							      nc->skb_cache);
+		if (unlikely(!nc->skb_count)) {
+			local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 			return NULL;
+		}
 	}
 
 	skb = nc->skb_cache[--nc->skb_count];
-	kasan_unpoison_object_data(skbuff_head_cache, skb);
+	if (nc->skb_count)
+		prefetch(nc->skb_cache[nc->skb_count - 1]);
+	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+	kasan_mempool_unpoison_object(skb, skbuff_cache_size);
 
 	return skb;
 }
 
+/**
+ * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
+ * @skbs: pointer to an at least @n-sized array to fill with skb pointers
+ * @n: number of entries to provide
+ *
+ * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
+ * the pointers into the provided array @skbs. If there are fewer entries
+ * available, tries to replenish the cache and bulk-allocates the remainder
+ * from the MM layer if needed.
+ * The heads are zeroed with either memset() or %__GFP_ZERO, so they are
+ * ready for {,__}build_skb_around() and don't have any data buffers attached.
+ * Must be called *only* from the BH context.
+ *
+ * Return: number of successfully allocated skbs (@n if no actual allocation
+ * needed or kmem_cache_alloc_bulk() didn't fail).
+ */
+u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
+{
+	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	u32 bulk, total = n;
+
+	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
+	if (nc->skb_count >= n)
+		goto get;
+
+	/* Not enough cached skbs. Try refilling the cache first */
+	bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
+	nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+					       GFP_ATOMIC | __GFP_NOWARN, bulk,
+					       &nc->skb_cache[nc->skb_count]);
+	if (likely(nc->skb_count >= n))
+		goto get;
+
+	/* Still not enough. 
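 *
 * (Caller sketch for napi_skb_cache_get_bulk() above, from BH context.
 *  data[], frag_size and receive_one() are placeholders for the buffers
 *  and hand-off of a real consumer:)
 *
 *	void *skbs[8];
 *	u32 i, got;
 *
 *	got = napi_skb_cache_get_bulk(skbs, 8);
 *	for (i = 0; i < got; i++) {
 *		struct sk_buff *skb;
 *
 *		skb = build_skb_around(skbs[i], data[i], frag_size);
 *		receive_one(skb);
 *	}
 *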
Bulk-allocate the missing part directly, zeroed */ + n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN, + n - nc->skb_count, &skbs[nc->skb_count]); + if (likely(nc->skb_count >= n)) + goto get; + + /* kmem_cache didn't allocate the number we need, limit the output */ + total -= n - nc->skb_count; + n = nc->skb_count; + +get: + for (u32 base = nc->skb_count - n, i = 0; i < n; i++) { + skbs[i] = nc->skb_cache[base + i]; + + kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size); + memset(skbs[i], 0, offsetof(struct sk_buff, tail)); + } + + nc->skb_count -= n; + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + + return total; +} +EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk); + static inline void __finalize_skb_around(struct sk_buff *skb, void *data, unsigned int size) { @@ -295,8 +392,7 @@ static inline void __finalize_skb_around(struct sk_buff *skb, void *data, skb_set_kcov_handle(skb, kcov_common_handle()); } -static inline void *__slab_build_skb(struct sk_buff *skb, void *data, - unsigned int *size) +static inline void *__slab_build_skb(void *data, unsigned int *size) { void *resized; @@ -323,12 +419,13 @@ struct sk_buff *slab_build_skb(void *data) struct sk_buff *skb; unsigned int size; - skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); - data = __slab_build_skb(skb, data, &size); + data = __slab_build_skb(data, &size); __finalize_skb_around(skb, data, size); return skb; @@ -345,7 +442,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data, * using slab buffer should use slab_build_skb() instead. */ if (WARN_ONCE(size == 0, "Use slab_build_skb() instead")) - data = __slab_build_skb(skb, data, &size); + data = __slab_build_skb(data, &size); __finalize_skb_around(skb, data, size); } @@ -374,7 +471,8 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; - skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; @@ -386,17 +484,14 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) /* build_skb() is wrapper over __build_skb(), that specifically * takes care of skb->head and skb->pfmemalloc - * This means that if @frag_size is not zero, then @data must be backed - * by a page fragment, not kmalloc() or vmalloc() */ struct sk_buff *build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb = __build_skb(data, frag_size); - if (skb && frag_size) { + if (likely(skb && frag_size)) { skb->head_frag = 1; - if (page_is_pfmemalloc(virt_to_head_page(data))) - skb->pfmemalloc = 1; + skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } @@ -406,7 +501,7 @@ EXPORT_SYMBOL(build_skb); * build_skb_around - build a network buffer around provided skb * @skb: sk_buff provide by caller, must be memset cleared * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data */ struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size) @@ -418,8 +513,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, if (frag_size) { skb->head_frag = 1; - if (page_is_pfmemalloc(virt_to_head_page(data))) - skb->pfmemalloc = 1; + skb_propagate_pfmemalloc(virt_to_head_page(data), skb); } return skb; } @@ -428,7 +522,7 @@ 
EXPORT_SYMBOL(build_skb_around); /** * __napi_build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data * * Version of __build_skb() that uses NAPI percpu caches to obtain * skbuff_head instead of inplace allocation. @@ -439,7 +533,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; - skb = napi_skb_cache_get(); + skb = napi_skb_cache_get(true); if (unlikely(!skb)) return NULL; @@ -452,7 +546,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) /** * napi_build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data * * Version of __napi_build_skb() that takes care of skb->head_frag * and skb->pfmemalloc when the data is a page or page fragment. @@ -479,17 +573,39 @@ EXPORT_SYMBOL(napi_build_skb); * may be used. Otherwise, the packet data may be discarded until enough * memory is free */ -static void *kmalloc_reserve(size_t size, gfp_t flags, int node, +static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, bool *pfmemalloc) { - void *obj; bool ret_pfmemalloc = false; + size_t obj_size; + void *obj; + + obj_size = SKB_HEAD_ALIGN(*size); + if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && + !(flags & KMALLOC_NOT_NORMAL_BITS)) { + obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, + node); + *size = SKB_SMALL_HEAD_CACHE_SIZE; + if (obj || !(gfp_pfmemalloc_allowed(flags))) + goto out; + /* Try again but now we are using pfmemalloc reserves */ + ret_pfmemalloc = true; + obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node); + goto out; + } + + obj_size = kmalloc_size_roundup(obj_size); + /* The following cast might truncate high-order bits of obj_size, this + * is harmless because kmalloc(obj_size >= 2^32) will fail anyway. + */ + *size = (unsigned int)obj_size; /* * Try a regular allocation, when that fails and we're not entitled * to the reserves, fail. */ - obj = kmalloc_node_track_caller(size, + obj = kmalloc_node_track_caller(obj_size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); if (obj || !(gfp_pfmemalloc_allowed(flags))) @@ -497,7 +613,7 @@ static void *kmalloc_reserve(size_t size, gfp_t flags, int node, /* Try again but now we are using pfmemalloc reserves */ ret_pfmemalloc = true; - obj = kmalloc_node_track_caller(size, flags, node); + obj = kmalloc_node_track_caller(obj_size, flags, node); out: if (pfmemalloc) @@ -532,26 +648,38 @@ out: struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int flags, int node) { + struct sk_buff *skb = NULL; struct kmem_cache *cache; - struct sk_buff *skb; - unsigned int osize; bool pfmemalloc; u8 *data; - cache = (flags & SKB_ALLOC_FCLONE) - ? 
skbuff_fclone_cache : skbuff_head_cache; - if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) gfp_mask |= __GFP_MEMALLOC; - /* Get the HEAD */ - if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && - likely(node == NUMA_NO_NODE || node == numa_mem_id())) - skb = napi_skb_cache_get(); - else + if (flags & SKB_ALLOC_FCLONE) { + cache = net_hotdata.skbuff_fclone_cache; + goto fallback; + } + cache = net_hotdata.skbuff_cache; + if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id())) + goto fallback; + + if (flags & SKB_ALLOC_NAPI) { + skb = napi_skb_cache_get(true); + if (unlikely(!skb)) + return NULL; + } else if (!in_hardirq() && !irqs_disabled()) { + local_bh_disable(); + skb = napi_skb_cache_get(false); + local_bh_enable(); + } + + if (!skb) { +fallback: skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); - if (unlikely(!skb)) - return NULL; + if (unlikely(!skb)) + return NULL; + } prefetchw(skb); /* We do our best to align skb_shared_info on a separate cache @@ -559,18 +687,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, * aligned memory blocks, unless SLUB/SLAB debug is enabled. * Both skb->head and skb_shared_info are cache line aligned. */ - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - osize = kmalloc_size_roundup(size); - data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc); + data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); if (unlikely(!data)) goto nodata; /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ - size = SKB_WITH_OVERHEAD(osize); - prefetchw(data + size); + prefetchw(data + SKB_WITH_OVERHEAD(size)); /* * Only clear those fields we need to clear, not those that we will @@ -578,7 +702,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, * the tail pointer in struct sk_buff! */ memset(skb, 0, offsetof(struct sk_buff, tail)); - __build_skb_around(skb, data, osize); + __build_skb_around(skb, data, size); skb->pfmemalloc = pfmemalloc; if (flags & SKB_ALLOC_FCLONE) { @@ -624,7 +748,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
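 *
 * (Worked example of the kmalloc_reserve() sizing above, assuming
 *  64-byte SMP_CACHE_BYTES and a 320-byte struct skb_shared_info as on
 *  a typical x86-64 build:
 *
 *	*size = 500
 *	SKB_HEAD_ALIGN(500)       = SKB_DATA_ALIGN(500) + SKB_DATA_ALIGN(320)
 *	                          = 512 + 320 = 832
 *	kmalloc_size_roundup(832) = 1024, returned via *size
 *	SKB_WITH_OVERHEAD(1024)   = 1024 - 320 = 704
 *
 *  so the head comes from the kmalloc-1k slab and up to 704 bytes can
 *  be filled before skb_shared_info begins.)
 *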
*/ - if (len <= SKB_WITH_OVERHEAD(1024) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); @@ -633,8 +757,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, goto skb_success; } - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - len = SKB_DATA_ALIGN(len); + len = SKB_HEAD_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; @@ -642,12 +765,16 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, if (in_hardirq() || irqs_disabled()) { nc = this_cpu_ptr(&netdev_alloc_cache); data = page_frag_alloc(nc, len, gfp_mask); - pfmemalloc = nc->pfmemalloc; + pfmemalloc = page_frag_cache_is_pfmemalloc(nc); } else { local_bh_disable(); + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + nc = this_cpu_ptr(&napi_alloc_cache.page); data = page_frag_alloc(nc, len, gfp_mask); - pfmemalloc = nc->pfmemalloc; + pfmemalloc = page_frag_cache_is_pfmemalloc(nc); + + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); local_bh_enable(); } @@ -674,10 +801,9 @@ skb_fail: EXPORT_SYMBOL(__netdev_alloc_skb); /** - * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for * @len: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages * * Allocate a new sk_buff for use in NAPI receive. This buffer will * attempt to allocate the head from a special reserved region used @@ -686,9 +812,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, - gfp_t gfp_mask) +struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) { + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; struct napi_alloc_cache *nc; struct sk_buff *skb; bool pfmemalloc; @@ -699,10 +825,8 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. - * When the small frag allocator is available, prefer it over kmalloc - * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -712,33 +836,17 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, goto skb_success; } - nc = this_cpu_ptr(&napi_alloc_cache); + len = SKB_HEAD_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { - /* we are artificially inflating the allocation size, but - * that is not as bad as it may look like, as: - * - 'len' less than GRO_MAX_HEAD makes little sense - * - On most systems, larger 'len' values lead to fragment - * size above 512 bytes - * - kmalloc would use the kmalloc-1k slab for such values - * - Builds with smaller GRO_MAX_HEAD will very likely do - * little networking, as that implies no WiFi and no - * tunnels support, and 32 bits arches. 
- */ - len = SZ_1K; - - data = page_frag_alloc_1k(&nc->page_small, gfp_mask); - pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); - } else { - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - len = SKB_DATA_ALIGN(len); + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + nc = this_cpu_ptr(&napi_alloc_cache); - data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = nc->page.pfmemalloc; - } + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) return NULL; @@ -760,23 +868,27 @@ skb_success: skb_fail: return skb; } -EXPORT_SYMBOL(__napi_alloc_skb); +EXPORT_SYMBOL(napi_alloc_skb); -void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, - int size, unsigned int truesize) +void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, + int off, int size, unsigned int truesize) { - skb_fill_page_desc(skb, i, page, off, size); + DEBUG_NET_WARN_ON_ONCE(size > truesize); + + skb_fill_netmem_desc(skb, i, netmem, off, size); skb->len += size; skb->data_len += size; skb->truesize += truesize; } -EXPORT_SYMBOL(skb_add_rx_frag); +EXPORT_SYMBOL(skb_add_rx_frag_netmem); void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + DEBUG_NET_WARN_ON_ONCE(size > truesize); + skb_frag_size_add(frag, size); skb->len += size; skb->data_len += size; @@ -803,11 +915,157 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, + unsigned int headroom) +{ +#if IS_ENABLED(CONFIG_PAGE_POOL) + u32 size, truesize, len, max_head_size, off; + struct sk_buff *skb = *pskb, *nskb; + int err, i, head_off; + void *data; + + /* XDP does not support fraglist so we need to linearize + * the skb. 
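 *
 * (Stepping back to napi_alloc_skb() above, the canonical driver RX
 *  pattern; the ring/netdev names are hypothetical:)
 *
 *	skb = napi_alloc_skb(&ring->napi, pkt_len);
 *	if (unlikely(!skb))
 *		goto drop;
 *	skb_put_data(skb, ring->buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, ring->netdev);
 *	napi_gro_receive(&ring->napi, skb);
 *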
+ */ + if (skb_has_frag_list(skb)) + return -EOPNOTSUPP; + + max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom); + if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE) + return -ENOMEM; + + size = min_t(u32, skb->len, max_head_size); + truesize = SKB_HEAD_ALIGN(size) + headroom; + data = page_pool_dev_alloc_va(pool, &truesize); + if (!data) + return -ENOMEM; + + nskb = napi_build_skb(data, truesize); + if (!nskb) { + page_pool_free_va(pool, data, true); + return -ENOMEM; + } + + skb_reserve(nskb, headroom); + skb_copy_header(nskb, skb); + skb_mark_for_recycle(nskb); + + err = skb_copy_bits(skb, 0, nskb->data, size); + if (err) { + consume_skb(nskb); + return err; + } + skb_put(nskb, size); + + head_off = skb_headroom(nskb) - skb_headroom(skb); + skb_headers_offset_update(nskb, head_off); + + off = size; + len = skb->len - off; + for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { + struct page *page; + u32 page_off; + + size = min_t(u32, len, PAGE_SIZE); + truesize = size; + + page = page_pool_dev_alloc(pool, &page_off, &truesize); + if (!page) { + consume_skb(nskb); + return -ENOMEM; + } + + skb_add_rx_frag(nskb, i, page, page_off, size, truesize); + err = skb_copy_bits(skb, off, page_address(page) + page_off, + size); + if (err) { + consume_skb(nskb); + return err; + } + + len -= size; + off += size; + } + + consume_skb(skb); + *pskb = nskb; + + return 0; +#else + return -EOPNOTSUPP; +#endif +} +EXPORT_SYMBOL(skb_pp_cow_data); + +int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, + const struct bpf_prog *prog) +{ + if (!prog->aux->xdp_has_frags) + return -EINVAL; + + return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM); +} +EXPORT_SYMBOL(skb_cow_data_for_xdp); + +#if IS_ENABLED(CONFIG_PAGE_POOL) +bool napi_pp_put_page(netmem_ref netmem) +{ + netmem = netmem_compound_head(netmem); + + if (unlikely(!netmem_is_pp(netmem))) + return false; + + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); + + return true; +} +EXPORT_SYMBOL(napi_pp_put_page); +#endif + static bool skb_pp_recycle(struct sk_buff *skb, void *data) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return page_pool_return_skb_page(virt_to_page(data)); + return napi_pp_put_page(page_to_netmem(virt_to_page(data))); +} + +/** + * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb + * @skb: page pool aware skb + * + * Increase the fragment reference count (pp_ref_count) of a skb. This is + * intended to gain fragment references only for page pool aware skbs, + * i.e. when skb->pp_recycle is true, and not for fragments in a + * non-pp-recycling skb. It has a fallback to increase references on normal + * pages, as page pool aware skbs may also have normal page fragments. 
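 *
 * (Caller sketch for skb_cow_data_for_xdp() above, in the style of
 *  drivers such as veth that run multi-buffer XDP programs on skbs;
 *  rq->page_pool and xdp_prog are assumed to exist in the caller:)
 *
 *	if (skb_cow_data_for_xdp(rq->page_pool, &skb, xdp_prog))
 *		goto drop;
 *	on success @skb has been replaced by a private, page_pool-backed
 *	copy that XDP may freely adjust and that recycles on free
 *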
+ */ +static int skb_pp_frag_ref(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo; + netmem_ref head_netmem; + int i; + + if (!skb->pp_recycle) + return -EINVAL; + + shinfo = skb_shinfo(skb); + + for (i = 0; i < shinfo->nr_frags; i++) { + head_netmem = netmem_compound_head(shinfo->frags[i].netmem); + if (likely(netmem_is_pp(head_netmem))) + page_pool_ref_netmem(head_netmem); + else + page_ref_inc(netmem_to_page(head_netmem)); + } + return 0; +} + +static void skb_kfree_head(void *head, unsigned int end_offset) +{ + if (end_offset == SKB_SMALL_HEAD_HEADROOM) + kmem_cache_free(net_hotdata.skb_small_head_cache, head); + else + kfree(head); } static void skb_free_head(struct sk_buff *skb) @@ -819,7 +1077,7 @@ static void skb_free_head(struct sk_buff *skb) return; skb_free_frag(head); } else { - kfree(head); + skb_kfree_head(head, skb_end_offset(skb)); } } @@ -828,9 +1086,7 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) struct skb_shared_info *shinfo = skb_shinfo(skb); int i; - if (skb->cloned && - atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, - &shinfo->dataref)) + if (!skb_data_unref(skb, shinfo)) goto exit; if (skb_zcopy(skb)) { @@ -871,7 +1127,7 @@ static void kfree_skbmem(struct sk_buff *skb) switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: - kmem_cache_free(skbuff_head_cache, skb); + kmem_cache_free(net_hotdata.skbuff_cache, skb); return; case SKB_FCLONE_ORIG: @@ -892,7 +1148,7 @@ static void kfree_skbmem(struct sk_buff *skb) if (!refcount_dec_and_test(&fclones->fclone_ref)) return; fastpath: - kmem_cache_free(skbuff_fclone_cache, fclones); + kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones); } void skb_release_head_state(struct sk_buff *skb) @@ -900,12 +1156,22 @@ void skb_release_head_state(struct sk_buff *skb) skb_dst_drop(skb); if (skb->destructor) { DEBUG_NET_WARN_ON_ONCE(in_hardirq()); - skb->destructor(skb); - } -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_conntrack_put(skb_nfct(skb)); +#ifdef CONFIG_INET + INDIRECT_CALL_4(skb->destructor, + tcp_wfree, __sock_wfree, sock_wfree, + xsk_destruct_skb, + skb); +#else + INDIRECT_CALL_2(skb->destructor, + sock_wfree, xsk_destruct_skb, + skb); + #endif - skb_ext_put(skb); + skb->destructor = NULL; + skb->sk = NULL; + } + nf_reset_ct(skb); + skb_ext_reset(skb); } /* Free everything but the sk_buff shell. */ @@ -932,40 +1198,90 @@ void __kfree_skb(struct sk_buff *skb) } EXPORT_SYMBOL(__kfree_skb); +static __always_inline +bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason reason) +{ + if (unlikely(!skb_unref(skb))) + return false; + + DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET || + u32_get_bits(reason, + SKB_DROP_REASON_SUBSYS_MASK) >= + SKB_DROP_REASON_SUBSYS_NUM); + + if (reason == SKB_CONSUMED) + trace_consume_skb(skb, __builtin_return_address(0)); + else + trace_kfree_skb(skb, __builtin_return_address(0), reason, sk); + return true; +} + /** - * kfree_skb_reason - free an sk_buff with special reason + * sk_skb_reason_drop - free an sk_buff with special reason + * @sk: the socket to receive @skb, or NULL if not applicable * @skb: buffer to free * @reason: reason why this skb is dropped * - * Drop a reference to the buffer and free it if the usage count has - * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' - * tracepoint. + * Drop a reference to the buffer and free it if the usage count has hit + * zero. Meanwhile, pass the receiving socket and drop reason to + * 'kfree_skb' tracepoint. 
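 *
 * Example (receive path; @sk may be NULL when no owning socket was found,
 * in which case only the drop reason reaches the tracepoint):
 *
 *	if (unlikely(!pskb_may_pull(skb, sizeof(struct udphdr)))) {
 *		sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PKT_TOO_SMALL);
 *		return 0;
 *	}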
*/ void __fix_address -kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - if (unlikely(!skb_unref(skb))) + if (__sk_skb_reason_drop(sk, skb, reason)) + __kfree_skb(skb); +} +EXPORT_SYMBOL(sk_skb_reason_drop); + +#define KFREE_SKB_BULK_SIZE 16 + +struct skb_free_array { + unsigned int skb_count; + void *skb_array[KFREE_SKB_BULK_SIZE]; +}; + +static void kfree_skb_add_bulk(struct sk_buff *skb, + struct skb_free_array *sa, + enum skb_drop_reason reason) +{ + /* if SKB is a clone, don't handle this case */ + if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { + __kfree_skb(skb); return; + } - DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); + skb_release_all(skb, reason); + sa->skb_array[sa->skb_count++] = skb; - if (reason == SKB_CONSUMED) - trace_consume_skb(skb); - else - trace_kfree_skb(skb, __builtin_return_address(0), reason); - __kfree_skb(skb); + if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { + kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE, + sa->skb_array); + sa->skb_count = 0; + } } -EXPORT_SYMBOL(kfree_skb_reason); -void kfree_skb_list_reason(struct sk_buff *segs, - enum skb_drop_reason reason) +void __fix_address +kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) { + struct skb_free_array sa; + + sa.skb_count = 0; + while (segs) { struct sk_buff *next = segs->next; - kfree_skb_reason(segs, reason); + if (__sk_skb_reason_drop(NULL, segs, reason)) { + skb_poison_list(segs); + kfree_skb_add_bulk(segs, &sa, reason); + } + segs = next; } + + if (sa.skb_count) + kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array); } EXPORT_SYMBOL(kfree_skb_list_reason); @@ -997,22 +1313,28 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) has_trans = skb_transport_header_was_set(skb); printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" - "mac=(%d,%d) net=(%d,%d) trans=%d\n" + "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n" "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" - "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" - "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", + "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n" + "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" + "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n" + "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n", level, skb->len, headroom, skb_headlen(skb), tailroom, has_mac ? skb->mac_header : -1, has_mac ? skb_mac_header_len(skb) : -1, + skb->mac_len, skb->network_header, has_trans ? skb_network_header_len(skb) : -1, has_trans ? 
skb->transport_header : -1, sh->tx_flags, sh->nr_frags, sh->gso_size, sh->gso_type, sh->gso_segs, - skb->csum, skb->ip_summed, skb->csum_complete_sw, - skb->csum_valid, skb->csum_level, + skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level, skb->hash, skb->sw_hash, skb->l4_hash, - ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); + ntohs(skb->protocol), skb->pkt_type, skb->skb_iif, + skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all, + skb->encapsulation, skb->inner_protocol, skb->inner_mac_header, + skb->inner_network_header, skb->inner_transport_header); if (dev) printk("%sdev name=%s feat=%pNF\n", @@ -1041,6 +1363,14 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) struct page *p; u8 *vaddr; + if (skb_frag_is_net_iov(frag)) { + printk("%sskb frag %d: not readable\n", level, i); + len -= skb_frag_size(frag); + if (!len) + break; + continue; + } + skb_frag_foreach_page(frag, skb_frag_off(frag), skb_frag_size(frag), p, p_off, p_len, copied) { @@ -1094,7 +1424,7 @@ void consume_skb(struct sk_buff *skb) if (!skb_unref(skb)) return; - trace_consume_skb(skb); + trace_consume_skb(skb, __builtin_return_address(0)); __kfree_skb(skb); } EXPORT_SYMBOL(consume_skb); @@ -1109,7 +1439,7 @@ EXPORT_SYMBOL(consume_skb); */ void __consume_stateless_skb(struct sk_buff *skb) { - trace_consume_skb(skb); + trace_consume_skb(skb, __builtin_return_address(0)); skb_release_data(skb, SKB_CONSUMED); kfree_skbmem(skb); } @@ -1117,25 +1447,31 @@ void __consume_stateless_skb(struct sk_buff *skb) static void napi_skb_cache_put(struct sk_buff *skb) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - u32 i; - kasan_poison_object_data(skbuff_head_cache, skb); + if (!kasan_mempool_poison_object(skb)) + return; + + local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { - for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) - kasan_unpoison_object_data(skbuff_head_cache, - nc->skb_cache[i]); + u32 i, remaining = NAPI_SKB_CACHE_SIZE - NAPI_SKB_CACHE_FREE; + + for (i = remaining; i < NAPI_SKB_CACHE_SIZE; i++) + kasan_mempool_unpoison_object(nc->skb_cache[i], + skbuff_cache_size); - kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF, - nc->skb_cache + NAPI_SKB_CACHE_HALF); - nc->skb_count = NAPI_SKB_CACHE_HALF; + kmem_cache_free_bulk(net_hotdata.skbuff_cache, + NAPI_SKB_CACHE_FREE, + nc->skb_cache + remaining); + nc->skb_count = remaining; } + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); } -void __kfree_skb_defer(struct sk_buff *skb) +void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) { - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); + skb_release_all(skb, reason); napi_skb_cache_put(skb); } @@ -1154,18 +1490,23 @@ void napi_skb_free_stolen_head(struct sk_buff *skb) void napi_consume_skb(struct sk_buff *skb, int budget) { /* Zero budget indicate non-NAPI context called us, like netpoll */ - if (unlikely(!budget)) { + if (unlikely(!budget || !skb)) { dev_consume_skb_any(skb); return; } DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + if (skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) { + skb_release_head_state(skb); + return skb_attempt_defer_free(skb); + } + if (!skb_unref(skb)) return; /* if reaching here SKB is ready to free */ - trace_consume_skb(skb); + trace_consume_skb(skb, __builtin_return_address(0)); /* if SKB is a clone, don't handle this case */ if (skb->fclone 
!= SKB_FCLONE_UNAVAILABLE) { @@ -1311,14 +1652,18 @@ EXPORT_SYMBOL_GPL(skb_morph); int mm_account_pinned_pages(struct mmpin *mmp, size_t size) { - unsigned long max_pg, num_pg, new_pg, old_pg; + unsigned long max_pg, num_pg, new_pg, old_pg, rlim; struct user_struct *user; if (capable(CAP_IPC_LOCK) || !size) return 0; + rlim = rlimit(RLIMIT_MEMLOCK); + if (rlim == RLIM_INFINITY) + return 0; + num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ - max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + max_pg = rlim >> PAGE_SHIFT; user = mmp->user ? : current_user(); old_pg = atomic_long_read(&user->locked_vm); @@ -1348,7 +1693,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size, + bool devmem) { struct ubuf_info_msgzc *uarg; struct sk_buff *skb; @@ -1363,12 +1709,12 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) uarg = (void *)skb->cb; uarg->mmp.user = NULL; - if (mm_account_pinned_pages(&uarg->mmp, size)) { + if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) { kfree_skb(skb); return NULL; } - uarg->ubuf.callback = msg_zerocopy_callback; + uarg->ubuf.ops = &msg_zerocopy_ubuf_ops; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; @@ -1386,7 +1732,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) } struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg) + struct ubuf_info *uarg, bool devmem) { if (uarg) { struct ubuf_info_msgzc *uarg_zc; @@ -1394,7 +1740,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, u32 bytelen, next; /* there might be non MSG_ZEROCOPY users */ - if (uarg->callback != msg_zerocopy_callback) + if (uarg->ops != &msg_zerocopy_ubuf_ops) return NULL; /* realloc only when socket is locked (TCP, UDP cork), @@ -1416,7 +1762,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { - if (mm_account_pinned_pages(&uarg_zc->mmp, size)) + if (likely(!devmem) && + mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; uarg_zc->len++; uarg_zc->bytelen = bytelen; @@ -1431,7 +1778,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, } new_alloc: - return msg_zerocopy_alloc(sk, size); + return msg_zerocopy_alloc(sk, size, devmem); } EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); @@ -1505,8 +1852,8 @@ release: sock_put(sk); } -void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) +static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); @@ -1515,7 +1862,6 @@ void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, if (refcount_dec_and_test(&uarg->refcnt)) __msg_zerocopy_callback(uarg_zc); } -EXPORT_SYMBOL_GPL(msg_zerocopy_callback); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { @@ -1525,24 +1871,39 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) uarg_to_msgzc(uarg)->len--; if (have_uref) - msg_zerocopy_callback(NULL, uarg, true); + msg_zerocopy_complete(NULL, uarg, true); } EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); +const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { + .complete = msg_zerocopy_complete, +}; 
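 
 /*
  * Other zerocopy providers supply their own struct ubuf_info_ops; the
  * optional ->link_skb() hook (consumed by skb_zerocopy_iter_stream()
  * below) lets such a provider take over uarg-to-skb linking entirely.
  * Hypothetical sketch, modelled on the io_uring notification backend:
  *
  *	static const struct ubuf_info_ops foo_ubuf_ops = {
  *		.complete = foo_tx_complete,
  *		.link_skb = foo_link_skb,
  *	};
  */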
+EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg) + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding) { - struct ubuf_info *orig_uarg = skb_zcopy(skb); int err, orig_len = skb->len; - /* An skb can only point to one uarg. This edge case happens when - * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. - */ - if (orig_uarg && uarg != orig_uarg) - return -EEXIST; + if (uarg->ops->link_skb) { + err = uarg->ops->link_skb(skb, uarg); + if (err) + return err; + } else { + struct ubuf_info *orig_uarg = skb_zcopy(skb); + + /* An skb can only point to one uarg. This edge case happens + * when TCP appends to an skb, but zerocopy_realloc triggered + * a new alloc. + */ + if (orig_uarg && uarg != orig_uarg) + return -EEXIST; + } - err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len, + binding); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; @@ -1608,18 +1969,29 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; - int i, new_frags; + int i, order, psize, new_frags; u32 d_off; if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL; + if (!skb_frags_readable(skb)) + return -EFAULT; + if (!num_frags) goto release; - new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; + /* We might have to allocate high order pages, so compute what minimum + * page order is needed. + */ + order = 0; + while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb)) + order++; + psize = (PAGE_SIZE << order); + + new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order); for (i = 0; i < new_frags; i++) { - page = alloc_page(gfp_mask); + page = alloc_pages(gfp_mask | __GFP_COMP, order); if (!page) { while (head) { struct page *next = (struct page *)page_private(head); @@ -1646,11 +2018,11 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) vaddr = kmap_atomic(p); while (done < p_len) { - if (d_off == PAGE_SIZE) { + if (d_off == psize) { d_off = 0; page = (struct page *)page_private(page); } - copy = min_t(u32, PAGE_SIZE - d_off, p_len - done); + copy = min_t(u32, psize - d_off, p_len - done); memcpy(page_address(page) + d_off, vaddr + p_off + done, copy); done += copy; @@ -1666,10 +2038,11 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) /* skb frags point to kernel buffers */ for (i = 0; i < new_frags - 1; i++) { - __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); + __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize); head = (struct page *)page_private(head); } - __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); + __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0, + d_off); skb_shinfo(skb)->nr_frags = new_frags; release: @@ -1711,7 +2084,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask); if (!n) return NULL; @@ -1774,11 +2147,20 @@ static inline int skb_alloc_rx_flag(const struct sk_buff *skb) struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { - int headerlen = skb_headroom(skb); - unsigned int size = skb_end_offset(skb) + skb->data_len; - struct sk_buff *n = 
__alloc_skb(size, gfp_mask, - skb_alloc_rx_flag(skb), NUMA_NO_NODE); + struct sk_buff *n; + unsigned int size; + int headerlen; + if (!skb_frags_readable(skb)) + return NULL; + + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) + return NULL; + + headerlen = skb_headroom(skb); + size = skb_end_offset(skb) + skb->data_len; + n = __alloc_skb(size, gfp_mask, + skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; @@ -1873,6 +2255,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone); * * All the pointers pointing into skb header may change and must be * reloaded after call to this function. + * + * Note: If you skb_push() the start of the buffer after reallocating the + * header, call skb_postpush_data_move() first to move the metadata out of + * the way before writing to &sk_buff->data. */ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, @@ -1893,10 +2279,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - size = kmalloc_size_roundup(size); - data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) goto nodata; size = SKB_WITH_OVERHEAD(size); @@ -1947,8 +2330,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); - skb_metadata_clear(skb); - /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). @@ -1959,7 +2340,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, return 0; nofrags: - kfree(data); + skb_kfree_head(data, size); nodata: return -ENOMEM; } @@ -1986,6 +2367,7 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) } EXPORT_SYMBOL(skb_realloc_headroom); +/* Note: We plan to rework this in linux-6.4 */ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { unsigned int saved_end_offset, saved_truesize; @@ -2004,6 +2386,20 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) if (likely(skb_end_offset(skb) == saved_end_offset)) return 0; + /* We can not change skb->end if the original or new value + * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head(). + */ + if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM || + skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) { + /* We think this path should not be taken. + * Add a temporary trace to warn us just in case. 
+ */ + pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n", + saved_end_offset, skb_end_offset(skb)); + WARN_ON_ONCE(1); + return 0; + } + shinfo = skb_shinfo(skb); /* We are about to change back skb->end, @@ -2094,12 +2490,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, /* * Allocate the copy buffer */ - struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, - gfp_mask, skb_alloc_rx_flag(skb), - NUMA_NO_NODE); - int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; + struct sk_buff *n; + int oldheadroom; + if (!skb_frags_readable(skb)) + return NULL; + + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) + return NULL; + + oldheadroom = skb_headroom(skb); + n = __alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); if (!n) return NULL; @@ -2437,6 +2841,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) */ int i, k, eat = (skb->tail + delta) - skb->end; + if (!skb_frags_readable(skb)) + return NULL; + if (eat > 0 || skb_cloned(skb)) { if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, GFP_ATOMIC)) @@ -2590,6 +2997,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) to += copy; } + if (!skb_frags_readable(skb)) + goto fault; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *f = &skb_shinfo(skb)->frags[i]; @@ -2689,10 +3099,8 @@ static bool spd_can_coalesce(const struct splice_pipe_desc *spd, /* * Fill page/offset/length into spd, if it can hold more pages. */ -static bool spd_fill_page(struct splice_pipe_desc *spd, - struct pipe_inode_info *pipe, struct page *page, - unsigned int *len, unsigned int offset, - bool linear, +static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page, + unsigned int *len, unsigned int offset, bool linear, struct sock *sk) { if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) @@ -2720,8 +3128,7 @@ static bool __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct splice_pipe_desc *spd, bool linear, - struct sock *sk, - struct pipe_inode_info *pipe) + struct sock *sk) { if (!*len) return true; @@ -2740,13 +3147,14 @@ static bool __splice_segment(struct page *page, unsigned int poff, do { unsigned int flen = min(*len, plen); - if (spd_fill_page(spd, pipe, page, &flen, poff, - linear, sk)) + if (spd_fill_page(spd, page, &flen, poff, linear, sk)) return true; poff += flen; plen -= flen; *len -= flen; - } while (*len && plen); + if (!*len) + return true; + } while (plen); return false; } @@ -2759,8 +3167,8 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, unsigned int *offset, unsigned int *len, struct splice_pipe_desc *spd, struct sock *sk) { - int seg; struct sk_buff *iter; + int seg; /* map the linear part : * If skb->head_frag is set, this 'linear' part is backed by a @@ -2772,18 +3180,24 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, skb_headlen(skb), offset, len, spd, skb_head_is_locked(skb), - sk, pipe)) + sk)) return true; /* * then map the fragments */ + if (!skb_frags_readable(skb)) + return false; + for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; + if (WARN_ON_ONCE(!skb_frag_page(f))) + return false; + if (__splice_segment(skb_frag_page(f), skb_frag_off(f), skb_frag_size(f), - offset, len, spd, false, sk, pipe)) + offset, len, spd, false, sk)) return true; } @@ 
-2831,33 +3245,34 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, } EXPORT_SYMBOL_GPL(skb_splice_bits); -static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, - struct kvec *vec, size_t num, size_t size) +static int sendmsg_locked(struct sock *sk, struct msghdr *msg) { struct socket *sock = sk->sk_socket; + size_t size = msg_data_left(msg); if (!sock) return -EINVAL; - return kernel_sendmsg(sock, msg, vec, num, size); + + if (!sock->ops->sendmsg_locked) + return sock_no_sendmsg_locked(sk, msg, size); + + return sock->ops->sendmsg_locked(sk, msg, size); } -static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, - size_t size, int flags) +static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) { struct socket *sock = sk->sk_socket; if (!sock) return -EINVAL; - return kernel_sendpage(sock, page, offset, size, flags); + return sock_sendmsg(sock, msg); } -typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, - struct kvec *vec, size_t num, size_t size); -typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, - size_t size, int flags); +typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, - int len, sendmsg_func sendmsg, sendpage_func sendpage) + int len, sendmsg_func sendmsg, int flags) { + int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0; unsigned int orig_len = len; struct sk_buff *head = skb; unsigned short fragidx; @@ -2874,10 +3289,13 @@ do_frag_list: kv.iov_base = skb->data + offset; kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; + msg.msg_flags = MSG_DONTWAIT | flags; + if (slen < len) + msg.msg_flags |= more_hint; - ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, - sendmsg_unlocked, sk, &msg, &kv, 1, slen); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); + ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, + sendmsg_unlocked, sk, &msg); if (ret <= 0) goto error; @@ -2908,11 +3326,21 @@ do_frag_list: slen = min_t(size_t, len, skb_frag_size(frag) - offset); while (slen) { - ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, - sendpage_unlocked, sk, - skb_frag_page(frag), - skb_frag_off(frag) + offset, - slen, MSG_DONTWAIT); + struct bio_vec bvec; + struct msghdr msg = { + .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | + flags, + }; + + if (slen < len) + msg.msg_flags |= more_hint; + bvec_set_page(&bvec, skb_frag_page(frag), slen, + skb_frag_off(frag) + offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, + slen); + + ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, + sendmsg_unlocked, sk, &msg); if (ret <= 0) goto error; @@ -2949,16 +3377,21 @@ error: int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, - kernel_sendpage_locked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0); } EXPORT_SYMBOL_GPL(skb_send_sock_locked); +int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, + int offset, int len, int flags) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags); +} +EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags); + /* Send skb data on a socket. Socket must be unlocked. 
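 *
 * Example (hypothetical caller; @sk is a connected kernel socket whose
 * lock the caller does not hold; under the lock, use
 * skb_send_sock_locked() instead):
 *
 *	ret = skb_send_sock(sk, skb, 0, skb->len);
 *	ret is the number of bytes queued, or a negative errno if
 *	nothing could be sent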
*/ int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, - sendpage_unlocked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0); } /** @@ -2992,6 +3425,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) from += copy; } + if (!skb_frags_readable(skb)) + goto fault; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; int end; @@ -3051,8 +3487,7 @@ fault: EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. */ -__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, - __wsum csum, const struct skb_checksum_ops *ops) +__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -3063,14 +3498,16 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, if (copy > 0) { if (copy > len) copy = len; - csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, - skb->data + offset, copy, csum); + csum = csum_partial(skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; pos = copy; } + if (WARN_ON_ONCE(!skb_frags_readable(skb))) + return 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; @@ -3091,13 +3528,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); - csum2 = INDIRECT_CALL_1(ops->update, - csum_partial_ext, - vaddr + p_off, p_len, 0); + csum2 = csum_partial(vaddr + p_off, p_len, 0); kunmap_atomic(vaddr); - csum = INDIRECT_CALL_1(ops->combine, - csum_block_add_ext, csum, - csum2, pos, p_len); + csum = csum_block_add(csum, csum2, pos); pos += p_len; } @@ -3118,10 +3551,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum2; if (copy > len) copy = len; - csum2 = __skb_checksum(frag_iter, offset - start, - copy, 0, ops); - csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, - csum, csum2, pos, copy); + csum2 = skb_checksum(frag_iter, offset - start, copy, + 0); + csum = csum_block_add(csum, csum2, pos); if ((len -= copy) == 0) return csum; offset += copy; @@ -3133,18 +3565,6 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, return csum; } -EXPORT_SYMBOL(__skb_checksum); - -__wsum skb_checksum(const struct sk_buff *skb, int offset, - int len, __wsum csum) -{ - const struct skb_checksum_ops ops = { - .update = csum_partial_ext, - .combine = csum_block_add_ext, - }; - - return __skb_checksum(skb, offset, len, csum, &ops); -} EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. 
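 *
 * (A typical use of the plain skb_checksum() above is folding a whole
 *  packet sum, e.g. for ICMP:
 *
 *	__wsum csum = skb_checksum(skb, 0, skb->len, 0);
 *	icmph->checksum = csum_fold(csum);
 *
 *  skb_copy_and_csum_bits() below performs the copy and the sum in a
 *  single pass instead.)
 *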
*/ @@ -3171,6 +3591,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, pos = copy; } + if (!skb_frags_readable(skb)) + return 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; @@ -3234,6 +3657,78 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +#ifdef CONFIG_NET_CRC32C +u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + if (copy > 0) { + copy = min(copy, len); + crc = crc32c(crc, skb->data + offset, copy); + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + + if (WARN_ON_ONCE(!skb_frags_readable(skb))) + return 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + copy = end - offset; + if (copy > 0) { + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + copy = min(copy, len); + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + crc = crc32c(crc, vaddr + p_off, p_len); + kunmap_atomic(vaddr); + } + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + copy = end - offset; + if (copy > 0) { + copy = min(copy, len); + crc = skb_crc32c(frag_iter, offset - start, copy, crc); + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + start = end; + } + BUG_ON(len); + + return crc; +} +EXPORT_SYMBOL(skb_crc32c); +#endif /* CONFIG_NET_CRC32C */ + __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) { __sum16 sum; @@ -3293,32 +3788,6 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) } EXPORT_SYMBOL(__skb_checksum_complete); -static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) -{ - net_warn_ratelimited( - "%s: attempt to compute crc32c without libcrc32c.ko\n", - __func__); - return 0; -} - -static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, - int offset, int len) -{ - net_warn_ratelimited( - "%s: attempt to compute crc32c without libcrc32c.ko\n", - __func__); - return 0; -} - -static const struct skb_checksum_ops default_crc32c_ops = { - .update = warn_crc32c_csum_update, - .combine = warn_crc32c_csum_combine, -}; - -const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = - &default_crc32c_ops; -EXPORT_SYMBOL(crc32c_csum_stub); - /** * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() * @from: source buffer @@ -3389,7 +3858,8 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) if (plen) { page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); - __skb_fill_page_desc(to, 0, page, offset, plen); + __skb_fill_netmem_desc(to, 0, page_to_netmem(page), + offset, plen); get_page(page); j = 1; len -= plen; @@ -3492,20 +3962,32 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) EXPORT_SYMBOL(skb_dequeue_tail); /** - * skb_queue_purge - empty a list + * skb_queue_purge_reason - empty a list * @list: list to empty + * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. 
This function takes the list * lock and is atomic with respect to other list locking functions. */ -void skb_queue_purge(struct sk_buff_head *list) +void skb_queue_purge_reason(struct sk_buff_head *list, + enum skb_drop_reason reason) { - struct sk_buff *skb; - while ((skb = skb_dequeue(list)) != NULL) - kfree_skb(skb); + struct sk_buff_head tmp; + unsigned long flags; + + if (skb_queue_empty_lockless(list)) + return; + + __skb_queue_head_init(&tmp); + + spin_lock_irqsave(&list->lock, flags); + skb_queue_splice_init(list, &tmp); + spin_unlock_irqrestore(&list->lock, flags); + + __skb_queue_purge_reason(&tmp, reason); } -EXPORT_SYMBOL(skb_queue_purge); +EXPORT_SYMBOL(skb_queue_purge_reason); /** * skb_rbtree_purge - empty a skb rbtree @@ -3533,6 +4015,27 @@ unsigned int skb_rbtree_purge(struct rb_root *root) return sum; } +void skb_errqueue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb, *next; + struct sk_buff_head kill; + unsigned long flags; + + __skb_queue_head_init(&kill); + + spin_lock_irqsave(&list->lock, flags); + skb_queue_walk_safe(list, skb, next) { + if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || + SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) + continue; + __skb_unlink(skb, list); + __skb_queue_tail(&kill, skb); + } + spin_unlock_irqrestore(&list->lock, flags); + __skb_queue_purge(&kill); +} +EXPORT_SYMBOL(skb_errqueue_purge); + /** * skb_queue_head - queue a buffer at the list head * @list: list to use @@ -3628,6 +4131,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb, skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb1->unreadable = skb->unreadable; skb_shinfo(skb)->nr_frags = 0; skb1->data_len = skb->data_len; skb1->len += skb1->data_len; @@ -3675,6 +4179,8 @@ static inline void skb_split_no_header(struct sk_buff *skb, pos += size; } skb_shinfo(skb1)->nr_frags = k; + + skb1->unreadable = skb->unreadable; } /** @@ -3738,6 +4244,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) if (skb_zcopy(tgt) || skb_zcopy(skb)) return 0; + DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle); + DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb)); + todo = shiftlen; from = 0; to = skb_shinfo(tgt)->nr_frags; @@ -3746,8 +4255,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) /* Actual merge is delayed until the point when we know we can * commit all, so that we don't have to undo partial changes */ - if (!to || - !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), + if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), skb_frag_off(fragfrom))) { merge = -1; } else { @@ -3910,6 +4418,9 @@ next_skb: return block_limit - abs_offset; } + if (!skb_frags_readable(st->cur_skb)) + return 0; + if (st->frag_idx == 0 && !st->frag_data) st->stepped_offset += skb_headlen(st->cur_skb); @@ -3986,6 +4497,41 @@ void skb_abort_seq_read(struct skb_seq_state *st) } EXPORT_SYMBOL(skb_abort_seq_read); +/** + * skb_copy_seq_read() - copy from a skb_seq_state to a buffer + * @st: source skb_seq_state + * @offset: offset in source + * @to: destination buffer + * @len: number of bytes to copy + * + * Copy @len bytes from @offset bytes into the source @st to the destination + * buffer @to. `offset` should increase (or be unchanged) with each subsequent + * call to this function. If offset needs to decrease from the previous use `st` + * should be reset first. 
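 *
 * Example (sequential parser; @st was primed with skb_prepare_seq_read()
 * and hdr_off only ever moves forward):
 *
 *	__be32 field;
 *
 *	if (skb_copy_seq_read(st, hdr_off, &field, sizeof(field)))
 *		goto bad_packet;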
+ * + * Return: 0 on success or -EINVAL if the copy ended early + */ +int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len) +{ + const u8 *data; + u32 sqlen; + + for (;;) { + sqlen = skb_seq_read(offset, &data, st); + if (sqlen == 0) + return -EINVAL; + if (sqlen >= len) { + memcpy(to, data, len); + return 0; + } + memcpy(to, data, sqlen); + to += sqlen; + offset += sqlen; + len -= sqlen; + } +} +EXPORT_SYMBOL(skb_copy_seq_read); + #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, @@ -4015,6 +4561,7 @@ static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config) { + unsigned int patlen = config->ops->get_pattern_len(config); struct ts_state state; unsigned int ret; @@ -4026,18 +4573,18 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); ret = textsearch_find(config, &state); - return (ret <= to - from ? ret : UINT_MAX); + return (ret + patlen <= to - from ? ret : UINT_MAX); } EXPORT_SYMBOL(skb_find_text); int skb_append_pagefrags(struct sk_buff *skb, struct page *page, - int offset, size_t size) + int offset, size_t size, size_t max_frags) { int i = skb_shinfo(skb)->nr_frags; if (skb_can_coalesce(skb, i, page, offset)) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); - } else if (i < MAX_SKB_FRAGS) { + } else if (i < max_frags) { skb_zcopy_downgrade_managed(skb); get_page(page); skb_fill_page_desc_noacc(skb, i, page, offset, size); @@ -4077,10 +4624,9 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) struct page *page; page = virt_to_head_page(frag_skb->head); - __skb_frag_set_page(&head_frag, page); - skb_frag_off_set(&head_frag, frag_skb->data - - (unsigned char *)page_address(page)); - skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); + skb_frag_fill_page_desc(&head_frag, page, frag_skb->data - + (unsigned char *)page_address(page), + skb_headlen(frag_skb)); return head_frag; } @@ -4098,9 +4644,14 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_push(skb, -skb_network_offset(skb) + offset); + /* Ensure the head is writeable before touching the shared info */ + err = skb_unclone(skb, GFP_ATOMIC); + if (err) + goto err_linearize; + skb_shinfo(skb)->frag_list = NULL; - do { + while (list_skb) { nskb = list_skb; list_skb = list_skb->next; @@ -4146,8 +4697,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, if (skb_needs_linearize(nskb, features) && __skb_linearize(nskb)) goto err_linearize; - - } while (list_skb); + } skb->truesize = skb->truesize - delta_truesize; skb->data_len = skb->data_len - delta_len; @@ -4187,21 +4737,20 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; - skb_frag_t *frag = skb_shinfo(head_skb)->frags; unsigned int mss = skb_shinfo(head_skb)->gso_size; unsigned int doffset = head_skb->data - skb_mac_header(head_skb); - struct sk_buff *frag_skb = head_skb; unsigned int offset = doffset; unsigned int tnl_hlen = skb_tnl_header_len(head_skb); unsigned int partial_segs = 0; unsigned int headroom; unsigned int len = head_skb->len; + struct sk_buff *frag_skb; + skb_frag_t *frag; __be16 proto; bool csum, sg; - int nfrags = skb_shinfo(head_skb)->nr_frags; int err = -ENOMEM; int 
i = 0; - int pos; + int nfrags, pos; if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { @@ -4266,8 +4815,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, /* GSO partial only requires that we trim off any excess that * doesn't fit into an MSS sized block, so take care of that * now. + * Cap len to not accidentally hit GSO_BY_FRAGS. */ - partial_segs = len / mss; + partial_segs = min(len, GSO_BY_FRAGS - 1) / mss; if (partial_segs > 1) mss *= partial_segs; else @@ -4278,6 +4828,13 @@ normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); + if (skb_orphan_frags(head_skb, GFP_ATOMIC)) + return ERR_PTR(-ENOMEM); + + nfrags = skb_shinfo(head_skb)->nr_frags; + frag = skb_shinfo(head_skb)->frags; + frag_skb = head_skb; + do { struct sk_buff *nskb; skb_frag_t *nskb_frag; @@ -4298,6 +4855,10 @@ normal: (skb_headlen(list_skb) == len || sg)) { BUG_ON(skb_headlen(list_skb) > len); + nskb = skb_clone(list_skb, GFP_ATOMIC); + if (unlikely(!nskb)) + goto err; + i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; @@ -4316,12 +4877,8 @@ normal: frag++; } - nskb = skb_clone(list_skb, GFP_ATOMIC); list_skb = list_skb->next; - if (unlikely(!nskb)) - goto err; - if (unlikely(pskb_trim(nskb, len))) { kfree_skb(nskb); goto err; @@ -4397,12 +4954,16 @@ normal: skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & SKBFL_SHARED_FRAG; - if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || - skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) + if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) goto err; while (pos < offset + len) { if (i >= nfrags) { + if (skb_orphan_frags(list_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, list_skb, + GFP_ATOMIC)) + goto err; + i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; @@ -4416,10 +4977,6 @@ normal: i--; frag--; } - if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || - skb_zerocopy_clone(nskb, frag_skb, - GFP_ATOMIC)) - goto err; list_skb = list_skb->next; } @@ -4545,33 +5102,28 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MCTP_FLOWS) [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), #endif +#if IS_ENABLED(CONFIG_INET_PSP) + [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) { - return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - skb_ext_type_len[SKB_EXT_BRIDGE_NF] + -#endif -#ifdef CONFIG_XFRM - skb_ext_type_len[SKB_EXT_SEC_PATH] + -#endif -#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - skb_ext_type_len[TC_SKB_EXT] + -#endif -#if IS_ENABLED(CONFIG_MPTCP) - skb_ext_type_len[SKB_EXT_MPTCP] + -#endif -#if IS_ENABLED(CONFIG_MCTP_FLOWS) - skb_ext_type_len[SKB_EXT_MCTP] + -#endif - 0; + unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext); + int i; + + for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++) + l += skb_ext_type_len[i]; + + return l; } static void skb_extensions_init(void) { BUILD_BUG_ON(SKB_EXT_NUM >= 8); +#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL) BUILD_BUG_ON(skb_ext_total_length() > 255); +#endif skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), @@ -4583,20 +5135,44 @@ static void skb_extensions_init(void) static void skb_extensions_init(void) {} #endif +/* The SKB kmem_cache slab is critical for network performance. Never + * merge/alias the slab with similar sized objects. 
This avoids fragmentation + * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. + */ +#ifndef CONFIG_SLUB_TINY +#define FLAG_SKB_NO_MERGE SLAB_NO_MERGE +#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ +#define FLAG_SKB_NO_MERGE 0 +#endif + void __init skb_init(void) { - skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", + net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, + SLAB_HWCACHE_ALIGN|SLAB_PANIC| + FLAG_SKB_NO_MERGE, offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); - skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", + skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache); + + net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. + * struct skb_shared_info is located at the end of skb->head, + * and should not be copied to/from user. + */ + net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", + SKB_SMALL_HEAD_CACHE_SIZE, + 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, + 0, + SKB_SMALL_HEAD_HEADROOM, + NULL); skb_extensions_init(); } @@ -4713,7 +5289,7 @@ EXPORT_SYMBOL_GPL(skb_to_sgvec); * 3. sg_unmark_end * 4. skb_to_sgvec(payload2) * - * When mapping mutilple payload conditionally, skb_to_sgvec_nomark + * When mapping multiple payload conditionally, skb_to_sgvec_nomark * is more preferable. */ int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, @@ -4889,6 +5465,9 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) bool icmp_next = false; unsigned long flags; + if (skb_queue_empty_lockless(q)) + return NULL; + spin_lock_irqsave(&q->lock, flags); skb = __skb_dequeue(q); if (skb && (skb_next = skb_peek(q))) { @@ -4959,7 +5538,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_info = tstype; serr->opt_stats = opt_stats; serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; - if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) serr->ee.ee_data -= atomic_read(&sk->sk_tskey); @@ -4975,7 +5554,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { bool ret; - if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) + if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data))) return true; read_lock_bh(&sk->sk_callback_lock); @@ -5008,6 +5587,54 @@ err: } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); +static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps, + int tstype) +{ + switch (tstype) { + case SCM_TSTAMP_SCHED: + return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP; + case SCM_TSTAMP_SND: + return skb_shinfo(skb)->tx_flags & (hwtstamps ? 
SKBTX_HW_TSTAMP_NOBPF : + SKBTX_SW_TSTAMP); + case SCM_TSTAMP_ACK: + return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK; + case SCM_TSTAMP_COMPLETION: + return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP; + } + + return false; +} + +static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, + int tstype) +{ + int op; + + switch (tstype) { + case SCM_TSTAMP_SCHED: + op = BPF_SOCK_OPS_TSTAMP_SCHED_CB; + break; + case SCM_TSTAMP_SND: + if (hwtstamps) { + op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB; + *skb_hwtstamps(skb) = *hwtstamps; + } else { + op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB; + } + break; + case SCM_TSTAMP_ACK: + op = BPF_SOCK_OPS_TSTAMP_ACK_CB; + break; + default: + return; + } + + bpf_skops_tx_timestamping(sk, skb, op); +} + void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, @@ -5015,21 +5642,30 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, { struct sk_buff *skb; bool tsonly, opt_stats = false; + u32 tsflags; if (!sk) return; - if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF) + skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps, + sk, tstype); + + if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype)) + return; + + tsflags = READ_ONCE(sk->sk_tsflags); + if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) return; - tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) { #ifdef CONFIG_INET - if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && + if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk_is_tcp(sk)) { skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ack_skb); @@ -5039,6 +5675,11 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, skb = alloc_skb(0, GFP_ATOMIC); } else { skb = skb_clone(orig_skb, GFP_ATOMIC); + + if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { + kfree_skb(skb); + return; + } } if (!skb) return; @@ -5066,6 +5707,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, } EXPORT_SYMBOL_GPL(skb_tstamp_tx); +#ifdef CONFIG_WIRELESS void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) { struct sock *sk = skb->sk; @@ -5091,6 +5733,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); +#endif /* CONFIG_WIRELESS */ /** * skb_partial_csum_set - set up and verify partial csum values for packet @@ -5109,7 +5752,7 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); u32 csum_start = skb_headroom(skb) + (u32)start; - if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { + if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) { net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", start, off, skb_headroom(skb), skb_headlen(skb)); return false; @@ -5117,7 +5760,7 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = csum_start; skb->csum_offset = off; - skb_set_transport_header(skb, start); + skb->transport_header = csum_start; return true; } EXPORT_SYMBOL_GPL(skb_partial_csum_set); @@ -5451,7 +6094,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) { if (head_stolen) { skb_release_head_state(skb); - 
kmem_cache_free(skbuff_head_cache, skb); + kmem_cache_free(net_hotdata.skbuff_cache, skb); } else { __kfree_skb(skb); } @@ -5476,21 +6119,19 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false; - /* In general, avoid mixing slab allocated and page_pool allocated - * pages within the same SKB. However when @to is not pp_recycle and - * @from is cloned, we can transition frag pages from page_pool to - * reference counted. - * - * On the other hand, don't allow coalescing two pp_recycle SKBs if - * @from is cloned, in case the SKB is using page_pool fragment - * references (PP_FLAG_PAGE_FRAG). Since we only take full page - * references for cloned SKBs at the moment that would result in - * inconsistent reference counts. + /* In general, avoid mixing page_pool and non-page_pool allocated + * pages within the same SKB. In theory we could take full + * references if @from is cloned and !@to->pp_recycle but its + * tricky (due to potential race with the clone disappearing) and + * rare, so not worth dealing with. */ - if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from))) + if (to->pp_recycle != from->pp_recycle) return false; - if (len <= skb_tailroom(to)) { + if (skb_frags_readable(from) != skb_frags_readable(to)) + return false; + + if (len <= skb_tailroom(to) && skb_frags_readable(from)) { if (len) BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); *delta_truesize = 0; @@ -5544,8 +6185,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, /* if the skb is not cloned this does nothing * since we set nr_frags to 0. */ - for (i = 0; i < from_shinfo->nr_frags; i++) - __skb_frag_ref(&from_shinfo->frags[i]); + if (skb_pp_frag_ref(from)) { + for (i = 0; i < from_shinfo->nr_frags; i++) + __skb_frag_ref(&from_shinfo->frags[i]); + } to->truesize += delta; to->len += len; @@ -5562,7 +6205,7 @@ EXPORT_SYMBOL(skb_try_coalesce); * @skb: buffer to clean * @xnet: packet is crossing netns * - * skb_scrub_packet can be used after encapsulating or decapsulting a packet + * skb_scrub_packet can be used after encapsulating or decapsulating a packet * into/from a tunnel. Some information have to be cleared during these * operations. * skb_scrub_packet can also be used to clean a skb before injecting it in @@ -5583,157 +6226,16 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->offload_fwd_mark = 0; skb->offload_l3_fwd_mark = 0; #endif + ipvs_reset(skb); if (!xnet) return; - ipvs_reset(skb); skb->mark = 0; skb_clear_tstamp(skb); } EXPORT_SYMBOL_GPL(skb_scrub_packet); -/** - * skb_gso_transport_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_transport_seglen is used to determine the real size of the - * individual segments, including Layer4 headers (TCP/UDP). - * - * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 
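The kernel-doc deleted above spells out the layering: each seglen helper is gso_size plus a different slice of headers. A worked example with hypothetical sizes (plain IPv4 TCP, no encapsulation), mirroring the arithmetic of the helper bodies deleted just below:

        /* Hypothetical GSO skb: 14-byte Ethernet header, 20-byte IPv4
         * header, 32-byte TCP header (timestamps), gso_size = 1448. */
        unsigned int gso_size         = 1448;                   /* payload per segment */
        unsigned int transport_seglen = 32 + gso_size;          /* 1480 = L4 + payload */
        unsigned int network_seglen   = 20 + transport_seglen;  /* 1500, fits a 1500 MTU */
        unsigned int mac_seglen       = 14 + network_seglen;    /* 1514, the on-wire frame */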
- */ -static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) -{ - const struct skb_shared_info *shinfo = skb_shinfo(skb); - unsigned int thlen = 0; - - if (skb->encapsulation) { - thlen = skb_inner_transport_header(skb) - - skb_transport_header(skb); - - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - thlen += inner_tcp_hdrlen(skb); - } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { - thlen = tcp_hdrlen(skb); - } else if (unlikely(skb_is_gso_sctp(skb))) { - thlen = sizeof(struct sctphdr); - } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { - thlen = sizeof(struct udphdr); - } - /* UFO sets gso_size to the size of the fragmentation - * payload, i.e. the size of the L4 (UDP) header is already - * accounted for. - */ - return thlen + shinfo->gso_size; -} - -/** - * skb_gso_network_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_network_seglen is used to determine the real size of the - * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). - * - * The MAC/L2 header is not accounted for. - */ -static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) -{ - unsigned int hdr_len = skb_transport_header(skb) - - skb_network_header(skb); - - return hdr_len + skb_gso_transport_seglen(skb); -} - -/** - * skb_gso_mac_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_mac_seglen is used to determine the real size of the - * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 - * headers (TCP/UDP). - */ -static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) -{ - unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); - - return hdr_len + skb_gso_transport_seglen(skb); -} - -/** - * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS - * - * There are a couple of instances where we have a GSO skb, and we - * want to determine what size it would be after it is segmented. - * - * We might want to check: - * - L3+L4+payload size (e.g. IP forwarding) - * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) - * - * This is a helper to do that correctly considering GSO_BY_FRAGS. - * - * @skb: GSO skb - * - * @seg_len: The segmented length (from skb_gso_*_seglen). In the - * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. - * - * @max_len: The maximum permissible length. - * - * Returns true if the segmented length <= max length. - */ -static inline bool skb_gso_size_check(const struct sk_buff *skb, - unsigned int seg_len, - unsigned int max_len) { - const struct skb_shared_info *shinfo = skb_shinfo(skb); - const struct sk_buff *iter; - - if (shinfo->gso_size != GSO_BY_FRAGS) - return seg_len <= max_len; - - /* Undo this so we can re-use header sizes */ - seg_len -= GSO_BY_FRAGS; - - skb_walk_frags(skb, iter) { - if (seg_len + skb_headlen(iter) > max_len) - return false; - } - - return true; -} - -/** - * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? - * - * @skb: GSO skb - * @mtu: MTU to validate against - * - * skb_gso_validate_network_len validates if a given skb will fit a - * wanted MTU once split. It considers L3 headers, L4 headers, and the - * payload. 
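skb_gso_validate_network_len() and skb_gso_validate_mac_len(), deleted below, survive elsewhere in the tree; their classic consumer is an MTU check on a forwarding path. A hedged sketch of that caller shape, loosely modeled on ip_forward.c's ip_exceeds_mtu() rather than quoted from it:

        static bool exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
        {
                if (skb->len <= mtu)
                        return false;

                /* An oversized GSO skb is still deliverable if every
                 * segment, with L3/L4 headers re-added, fits the MTU;
                 * this is where the GSO_BY_FRAGS handling pays off. */
                if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                        return false;

                return true;    /* too big: fragment or signal ICMP */
        }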
- */ -bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) -{ - return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); -} -EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); - -/** - * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? - * - * @skb: GSO skb - * @len: length to validate against - * - * skb_gso_validate_mac_len validates if a given skb will fit a wanted - * length once split, including L2, L3 and L4 headers and the payload. - */ -bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) -{ - return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); -} -EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); - static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { int mac_len, meta_len; @@ -5813,12 +6315,36 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) } EXPORT_SYMBOL(skb_ensure_writable); +int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev) +{ + int needed_headroom = dev->needed_headroom; + int needed_tailroom = dev->needed_tailroom; + + /* For tail taggers, we need to pad short frames ourselves, to ensure + * that the tail tag does not fail at its role of being at the end of + * the packet, once the conduit interface pads the frame. Account for + * that pad length here, and pad later. + */ + if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) + needed_tailroom += ETH_ZLEN - skb->len; + /* skb_headroom() returns unsigned int... */ + needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); + needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); + + if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) + /* No reallocation needed, yay! */ + return 0; + + return pskb_expand_head(skb, needed_headroom, needed_tailroom, + GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_ensure_writable_head_tail); + /* remove VLAN header from packet and update csum accordingly. * expects a non skb_vlan_tag_present skb with a vlan tag payload */ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { - struct vlan_hdr *vhdr; int offset = skb->data - skb_mac_header(skb); int err; @@ -5834,13 +6360,8 @@ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); - vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); - *vlan_tci = ntohs(vhdr->h_vlan_TCI); - - memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); - __skb_pull(skb, VLAN_HLEN); + vlan_remove_tag(skb, vlan_tci); - vlan_set_encap_proto(skb, vhdr); skb->mac_header += VLAN_HLEN; if (skb_network_offset(skb) < ETH_HLEN) @@ -5906,7 +6427,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) return err; skb->protocol = skb->vlan_proto; - skb->mac_len += VLAN_HLEN; + skb->network_header -= VLAN_HLEN; skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); } @@ -6166,7 +6687,7 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); * * @header_len: size of linear part * @data_len: needed length in frags - * @max_page_order: max page order desired. + * @order: max page order desired. 
* @errcode: pointer to error code if any * @gfp_mask: allocation mask * @@ -6174,21 +6695,17 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); */ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, - int max_page_order, + int order, int *errcode, gfp_t gfp_mask) { - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; unsigned long chunk; struct sk_buff *skb; struct page *page; - int i; + int nr_frags = 0; *errcode = -EMSGSIZE; - /* Note this test could be relaxed, if we succeed to allocate - * high order pages... - */ - if (npages > MAX_SKB_FRAGS) + if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) return NULL; *errcode = -ENOBUFS; @@ -6196,34 +6713,32 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, if (!skb) return NULL; - skb->truesize += npages << PAGE_SHIFT; - - for (i = 0; npages > 0; i++) { - int order = max_page_order; - - while (order) { - if (npages >= 1 << order) { - page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | - __GFP_COMP | - __GFP_NOWARN, - order); - if (page) - goto fill_page; - /* Do not retry other high order allocations */ - order = 1; - max_page_order = 0; - } + while (data_len) { + if (nr_frags == MAX_SKB_FRAGS) + goto failure; + while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; + + if (order) { + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | + __GFP_NOWARN, + order); + if (!page) { + order--; + continue; + } + } else { + page = alloc_page(gfp_mask); + if (!page) + goto failure; } - page = alloc_page(gfp_mask); - if (!page) - goto failure; -fill_page: chunk = min_t(unsigned long, data_len, PAGE_SIZE << order); - skb_fill_page_desc(skb, i, page, 0, chunk); + skb_fill_page_desc(skb, nr_frags, page, 0, chunk); + nr_frags++; + skb->truesize += (PAGE_SIZE << order); data_len -= chunk; - npages -= 1 << order; } return skb; @@ -6245,10 +6760,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - size = kmalloc_size_roundup(size); - data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; size = SKB_WITH_OVERHEAD(size); @@ -6264,7 +6776,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, if (skb_cloned(skb)) { /* drop the old head gracefully */ if (skb_orphan_frags(skb, gfp_mask)) { - kfree(data); + skb_kfree_head(data, size); return -ENOMEM; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) @@ -6298,8 +6810,7 @@ static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); /* carve out the first eat bytes from skb's frag_list. 
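The alloc_skb_with_frags() rework above drops the npages bookkeeping for a loop that steps the order down whenever a high-order allocation fails or the remaining data no longer needs one. A hedged usage sketch; the wrapper name is invented, and SKB_FRAG_PAGE_ORDER is the starting order socket senders conventionally use:

        static struct sk_buff *alloc_bulk_send_skb(int *errcode)
        {
                /* 256 bytes of linear room for protocol headers, 64 KiB of
                 * paged data; prefer high-order pages, tolerate order-0
                 * fallback. Fails with -EMSGSIZE if the total cannot fit
                 * in MAX_SKB_FRAGS fragments. */
                return alloc_skb_with_frags(256, SZ_64K, SKB_FRAG_PAGE_ORDER,
                                            errcode, GFP_KERNEL);
        }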
May recurse into * pskb_carve() */ -static int pskb_carve_frag_list(struct sk_buff *skb, - struct skb_shared_info *shinfo, int eat, +static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat, gfp_t gfp_mask) { struct sk_buff *list = shinfo->frag_list; @@ -6364,10 +6875,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - size = kmalloc_size_roundup(size); - data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; size = SKB_WITH_OVERHEAD(size); @@ -6375,7 +6883,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); if (skb_orphan_frags(skb, gfp_mask)) { - kfree(data); + skb_kfree_head(data, size); return -ENOMEM; } shinfo = (struct skb_shared_info *)(data + size); @@ -6407,11 +6915,11 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_clone_fraglist(skb); /* split line is in frag list */ - if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { + if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) { /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ if (skb_has_frag_list(skb)) kfree_skb_list(skb_shinfo(skb)->frag_list); - kfree(data); + skb_kfree_head(data, size); return -ENOMEM; } skb_release_data(skb, SKB_CONSUMED); @@ -6478,7 +6986,7 @@ void skb_condense(struct sk_buff *skb) { if (skb->data_len) { if (skb->data_len > skb->end - skb->tail || - skb_cloned(skb)) + skb_cloned(skb) || !skb_frags_readable(skb)) return; /* Nice, we can free page frag(s) right now */ @@ -6546,6 +7054,14 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, xfrm_state_hold(sp->xvec[i]); } #endif +#ifdef CONFIG_MCTP_FLOWS + if (old_active & (1 << SKB_EXT_MCTP)) { + struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP); + + if (flow->key) + refcount_inc(&flow->key->refs); + } +#endif __skb_ext_put(old); return new; } @@ -6573,6 +7089,7 @@ void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, skb->active_extensions = 1 << id; return skb_ext_get_ptr(ext, id); } +EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL"); /** * skb_ext_add - allocate space for given extension, COW if needed @@ -6686,6 +7203,19 @@ free_now: EXPORT_SYMBOL(__skb_ext_put); #endif /* CONFIG_SKB_EXTENSIONS */ +static void kfree_skb_napi_cache(struct sk_buff *skb) +{ + /* if SKB is a clone, don't handle this case */ + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { + __kfree_skb(skb); + return; + } + + local_bh_disable(); + __napi_kfree_skb(skb, SKB_CONSUMED); + local_bh_enable(); +} + /** * skb_attempt_defer_free - queue skb for remote freeing * @skb: buffer @@ -6696,38 +7226,194 @@ EXPORT_SYMBOL(__skb_ext_put); */ void skb_attempt_defer_free(struct sk_buff *skb) { + struct skb_defer_node *sdn; + unsigned long defer_count; int cpu = skb->alloc_cpu; - struct softnet_data *sd; - unsigned long flags; unsigned int defer_max; bool kick; - if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || - !cpu_online(cpu) || - cpu == raw_smp_processor_id()) { -nodefer: __kfree_skb(skb); + if (cpu == raw_smp_processor_id() || + WARN_ON_ONCE(cpu >= nr_cpu_ids) || + !cpu_online(cpu)) { +nodefer: kfree_skb_napi_cache(skb); return; } - sd = &per_cpu(softnet_data, cpu); - 
defer_max = READ_ONCE(sysctl_skb_defer_max); - if (READ_ONCE(sd->defer_count) >= defer_max) + DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); + DEBUG_NET_WARN_ON_ONCE(skb->destructor); + DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb)); + + sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); + + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); + defer_count = atomic_long_inc_return(&sdn->defer_count); + + if (defer_count >= defer_max) goto nodefer; - spin_lock_irqsave(&sd->defer_lock, flags); - /* Send an IPI every time queue reaches half capacity. */ - kick = sd->defer_count == (defer_max >> 1); - /* Paired with the READ_ONCE() few lines above */ - WRITE_ONCE(sd->defer_count, sd->defer_count + 1); + llist_add(&skb->ll_node, &sdn->defer_list); - skb->next = sd->defer_list; - /* Paired with READ_ONCE() in skb_defer_free_flush() */ - WRITE_ONCE(sd->defer_list, skb); - spin_unlock_irqrestore(&sd->defer_lock, flags); + /* Send an IPI every time queue reaches half capacity. */ + kick = (defer_count - 1) == (defer_max >> 1); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ - if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) - smp_call_function_single_async(cpu, &sd->defer_csd); + if (unlikely(kick)) + kick_defer_list_purge(cpu); +} + +static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, + size_t offset, size_t len) +{ + const char *kaddr; + __wsum csum; + + kaddr = kmap_local_page(page); + csum = csum_partial(kaddr + offset, len, 0); + kunmap_local(kaddr); + skb->csum = csum_block_add(skb->csum, csum, skb->len); +} + +/** + * skb_splice_from_iter - Splice (or copy) pages to skbuff + * @skb: The buffer to add pages to + * @iter: Iterator representing the pages to be added + * @maxsize: Maximum amount of pages to be added + * + * This is a common helper function for supporting MSG_SPLICE_PAGES. It + * extracts pages from an iterator and adds them to the socket buffer if + * possible, copying them to fragments if not possible (such as if they're slab + * pages). + * + * Returns the amount of data spliced/copied or -EMSGSIZE if there's + * insufficient space in the buffer to transfer anything. 
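The implementation follows below; for orientation, a hedged sketch of the caller side, i.e. how a protocol sendmsg() might hand MSG_SPLICE_PAGES data to this helper (a simplified shape with assumed locals, not the actual TCP or UDP conversion):

        /* Inside a sendmsg() handler, with a tail skb in hand and
         * `remaining` bytes left in the request: */
        if (msg->msg_flags & MSG_SPLICE_PAGES) {
                ssize_t n = skb_splice_from_iter(skb, &msg->msg_iter,
                                                 remaining);

                if (n < 0)
                        return n;       /* -EMSGSIZE: no frag slots left */
                /* skb->len, data_len and truesize are already updated;
                 * for CHECKSUM_NONE skbs the spliced bytes were folded
                 * into skb->csum as well. */
                remaining -= n;
        }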
+ */ +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, + ssize_t maxsize) +{ + size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); + struct page *pages[8], **ppages = pages; + ssize_t spliced = 0, ret = 0; + unsigned int i; + + while (iter->count > 0) { + ssize_t space, nr, len; + size_t off; + + ret = -EMSGSIZE; + space = frag_limit - skb_shinfo(skb)->nr_frags; + if (space < 0) + break; + + /* We might be able to coalesce without increasing nr_frags */ + nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); + + len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); + if (len <= 0) { + ret = len ?: -EIO; + break; + } + + i = 0; + do { + struct page *page = pages[i++]; + size_t part = min_t(size_t, PAGE_SIZE - off, len); + + ret = -EIO; + if (WARN_ON_ONCE(!sendpage_ok(page))) + goto out; + + ret = skb_append_pagefrags(skb, page, off, part, + frag_limit); + if (ret < 0) { + iov_iter_revert(iter, len); + goto out; + } + + if (skb->ip_summed == CHECKSUM_NONE) + skb_splice_csum_page(skb, page, off, part); + + off = 0; + spliced += part; + maxsize -= part; + len -= part; + } while (len > 0); + + if (maxsize <= 0) + break; + } + +out: + skb_len_add(skb, spliced); + return spliced ?: ret; +} +EXPORT_SYMBOL(skb_splice_from_iter); + +static __always_inline +size_t memcpy_from_iter_csum(void *iter_from, size_t progress, + size_t len, void *to, void *priv2) +{ + __wsum *csum = priv2; + __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len); + + *csum = csum_block_add(*csum, next, progress); + return 0; +} + +static __always_inline +size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress, + size_t len, void *to, void *priv2) +{ + __wsum next, *csum = priv2; + + next = csum_and_copy_from_user(iter_from, to + progress, len); + *csum = csum_block_add(*csum, next, progress); + return next ? 0 : len; +} + +bool csum_and_copy_from_iter_full(void *addr, size_t bytes, + __wsum *csum, struct iov_iter *i) +{ + size_t copied; + + if (WARN_ON_ONCE(!i->data_source)) + return false; + copied = iterate_and_advance2(i, bytes, addr, csum, + copy_from_user_iter_csum, + memcpy_from_iter_csum); + if (likely(copied == bytes)) + return true; + iov_iter_revert(i, copied); + return false; +} +EXPORT_SYMBOL(csum_and_copy_from_iter_full); + +void get_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_get_net_iov(netmem_to_net_iov(netmem)); + return; + } + get_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(get_netmem); + +void put_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_put_net_iov(netmem_to_net_iov(netmem)); + return; + } + + put_page(netmem_to_page(netmem)); } +EXPORT_SYMBOL(put_netmem); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 53d0251788aa..2ac7731e1e0a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -8,6 +8,7 @@ #include <net/sock.h> #include <net/tcp.h> #include <net/tls.h> +#include <trace/events/sock.h> static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { @@ -292,7 +293,7 @@ out: /* If we trim data a full sg elem before curr pointer update * copybreak and current so that any future copy operations * start at new copy location. 
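Returning to csum_and_copy_from_iter_full() above: both step functions feed each chunk's checksum into csum_block_add() with the running `progress` offset, which byte-rotates the chunk sum when it lands on an odd boundary, so partial checksums compose no matter how the iterator splits the copy. A userspace model of that algebra (a demonstration under simplified assumptions, not kernel code):

        #include <stdint.h>
        #include <stdio.h>

        /* One's-complement helpers modeling csum_add()/csum_block_add(). */
        static uint32_t csum_add(uint32_t a, uint32_t b)
        {
                uint32_t r = a + b;

                return r + (r < b);             /* end-around carry */
        }

        static uint32_t csum_block_add(uint32_t csum, uint32_t csum2, size_t off)
        {
                if (off & 1)                    /* chunk began mid-word: */
                        csum2 = (csum2 >> 8) | (csum2 << 24);   /* swap lanes */
                return csum_add(csum, csum2);
        }

        static uint32_t csum_partial(const uint8_t *p, size_t len, uint32_t sum)
        {
                size_t i;

                for (i = 0; i + 1 < len; i += 2)        /* LE 16-bit words */
                        sum = csum_add(sum, (uint32_t)p[i] |
                                            (uint32_t)p[i + 1] << 8);
                if (i < len)
                        sum = csum_add(sum, p[i]);      /* trailing odd byte */
                return sum;
        }

        static uint16_t csum_fold(uint32_t sum)
        {
                while (sum >> 16)
                        sum = (sum & 0xffff) + (sum >> 16);
                return (uint16_t)sum;
        }

        int main(void)
        {
                const uint8_t data[] = "checksums compose blockwise";
                size_t len = sizeof(data) - 1, split = 7;       /* odd on purpose */
                uint32_t whole = csum_partial(data, len, 0);
                uint32_t joined = csum_block_add(csum_partial(data, split, 0),
                                                 csum_partial(data + split,
                                                              len - split, 0),
                                                 split);

                printf("%#x == %#x\n", csum_fold(whole), csum_fold(joined));
                return csum_fold(whole) != csum_fold(joined);   /* 0: they agree */
        }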
- * However trimed data that has not yet been used in a copy op + * However trimmed data that has not yet been used in a copy op * does not require an update. */ if (!msg->sg.size) { @@ -368,8 +369,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int ret = -ENOSPC, i = msg->sg.curr; + u32 copy, buf_size, copied = 0; struct scatterlist *sge; - u32 copy, buf_size; void *to; do { @@ -396,6 +397,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, goto out; } bytes -= copy; + copied += copy; if (!bytes) break; msg->sg.copybreak = 0; @@ -403,7 +405,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, } while (i != msg->sg.end); out: msg->sg.curr = i; - return ret; + return (ret < 0) ? ret : copied; } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); @@ -433,7 +435,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, page = sg_page(sge); if (copied + copy > len) copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (copy) + copy = copy_page_to_iter(page, sge->offset, copy, iter); if (!copy) { copied = copied ? copied : -EFAULT; goto out; @@ -443,8 +446,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - if (!msg_rx->skb) + if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); + atomic_sub(copy, &sk->sk_rmem_alloc); + } msg_rx->sg.size -= copy; if (!sge->length) { @@ -480,8 +485,6 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, msg_rx = sk_psock_peek_msg(psock); } out: - if (psock->work_state.skb && copied > 0) - schedule_work(&psock->work); return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); @@ -527,16 +530,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, u32 off, u32 len, struct sk_psock *psock, struct sock *sk, - struct sk_msg *msg) + struct sk_msg *msg, + bool take_ref) { int num_sge, copied; + /* skb_to_sgvec will fail when the total number of fragments in + * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the + * caller may aggregate multiple skbs. + */ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (num_sge < 0) { /* skb linearize may fail with ENOMEM, but lets simply try again * later if this happens. Under memory pressure we don't want to * drop the skb. We need to linearize the skb so that the mapping * in skb_to_sgvec can not error. + * Note that skb_linearize requires the skb not to be shared. */ if (skb_linearize(skb)) return -EAGAIN; @@ -546,11 +555,14 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, return num_sge; } +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + psock->ingress_bytes += len; +#endif copied = len; msg->sg.start = 0; msg->sg.size = copied; msg->sg.end = num_sge; - msg->skb = skb; + msg->skb = take_ref ? skb_get(skb) : skb; sk_psock_queue_msg(psock, msg); sk_psock_data_ready(sk, psock); @@ -558,7 +570,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, } static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len); + u32 off, u32 len, bool take_ref); static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) @@ -572,7 +584,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * correctly. 
*/ if (unlikely(skb->sk == sk)) - return sk_psock_skb_ingress_self(psock, skb, off, len); + return sk_psock_skb_ingress_self(psock, skb, off, len, true); msg = sk_psock_create_ingress_msg(sk, skb); if (!msg) return -EAGAIN; @@ -584,7 +596,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * into user buffers. */ skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true); if (err < 0) kfree(msg); return err; @@ -595,7 +607,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * because the skb is already accounted for here. */ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len) + u32 off, u32 len, bool take_ref) { struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; @@ -604,7 +616,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb if (unlikely(!msg)) return -EAGAIN; skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); if (err < 0) kfree(msg); return err; @@ -618,47 +630,48 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } + return sk_psock_skb_ingress(psock, skb, off, len); } static void sk_psock_skb_state(struct sk_psock *psock, struct sk_psock_work_state *state, - struct sk_buff *skb, int len, int off) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { - state->skb = skb; state->len = len; state->off = off; - } else { - sock_drop(psock->sk, skb); } spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_backlog(struct work_struct *work) { - struct sk_psock *psock = container_of(work, struct sk_psock, work); + struct delayed_work *dwork = to_delayed_work(work); + struct sk_psock *psock = container_of(dwork, struct sk_psock, work); struct sk_psock_work_state *state = &psock->work_state; struct sk_buff *skb = NULL; + u32 len = 0, off = 0; bool ingress; - u32 len, off; int ret; - mutex_lock(&psock->work_mutex); - if (unlikely(state->skb)) { - spin_lock_bh(&psock->ingress_lock); - skb = state->skb; - len = state->len; - off = state->off; - state->skb = NULL; - spin_unlock_bh(&psock->ingress_lock); - } - if (skb) - goto start; + /* If sk is quickly removed from the map and then added back, the old + * psock should not be scheduled, because there are now two psocks + * pointing to the same sk. + */ + if (!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + return; - while ((skb = skb_dequeue(&psock->ingress_skb))) { + /* Increment the psock refcnt to synchronize with close(fd) path in + * sock_map_close(), ensuring we wait for backlog thread completion + * before sk_socket freed. If refcnt increment fails, it indicates + * sock_map_close() completed with sk_socket potentially already freed. 
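The comment above describes a conditional-refcount liveness handshake: the worker may run after the last external reference is gone, so it must take a reference it can only get while the object is still live. The generic shape of the pattern, as a hedged sketch with hypothetical names:

        struct obj {
                refcount_t refcnt;
                struct work_struct work;
        };

        static void obj_put(struct obj *o)
        {
                if (refcount_dec_and_test(&o->refcnt))
                        kfree(o);               /* teardown's counterpart */
        }

        static void obj_worker(struct work_struct *work)
        {
                struct obj *o = container_of(work, struct obj, work);

                /* Proceed only if teardown has not already dropped the
                 * last reference; holding ours delays the final free
                 * until the backlogged work is done. */
                if (!refcount_inc_not_zero(&o->refcnt))
                        return;

                process_backlog(o);             /* hypothetical payload */
                obj_put(o);                     /* may be the final put */
        }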
+ */ + if (!sk_psock_get(psock->sk)) + return; + mutex_lock(&psock->work_mutex); + while ((skb = skb_peek(&psock->ingress_skb))) { len = skb->len; off = 0; if (skb_bpf_strparser(skb)) { @@ -667,7 +680,13 @@ static void sk_psock_backlog(struct work_struct *work) off = stm->offset; len = stm->full_len; } -start: + + /* Resume processing from previous partial state */ + if (unlikely(state->len)) { + len = state->len; + off = state->off; + } + ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); do { @@ -677,25 +696,33 @@ start: len, ingress); if (ret <= 0) { if (ret == -EAGAIN) { - sk_psock_skb_state(psock, state, skb, - len, off); + sk_psock_skb_state(psock, state, len, off); + /* Restore redir info we cleared before */ + skb_bpf_set_redir(skb, psock->sk, ingress); + /* Delay slightly to prioritize any + * other work that might be here. + */ + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_delayed_work(&psock->work, 1); goto end; } /* Hard errors break pipe and stop xmit. */ sk_psock_report_error(psock, ret ? -ret : EPIPE); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - sock_drop(psock->sk, skb); goto end; } off += ret; len -= ret; } while (len); - if (!ingress) - kfree_skb(skb); + /* The entire skb sent, clear state */ + sk_psock_skb_state(psock, state, 0, 0); + skb = skb_dequeue(&psock->ingress_skb); + kfree_skb(skb); } end: mutex_unlock(&psock->work_mutex); + sk_psock_put(psock->sk, psock); } struct sk_psock *sk_psock_init(struct sock *sk, int node) @@ -733,7 +760,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) INIT_LIST_HEAD(&psock->link); spin_lock_init(&psock->link_lock); - INIT_WORK(&psock->work, sk_psock_backlog); + INIT_DELAYED_WORK(&psock->work, sk_psock_backlog); mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); @@ -772,6 +799,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } @@ -785,11 +814,6 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); sock_drop(psock->sk, skb); } - kfree_skb(psock->work_state.skb); - /* We null the skb here to ensure that calls to sk_psock_backlog - * do not pick up the free'd skb. 
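Worth noting for the hunks above and below: psock->work becomes a delayed_work and every schedule_work() becomes schedule_delayed_work(..., 0), which queues immediately and so preserves the old semantics; only the -EAGAIN retry above uses a real one-jiffy delay. In miniature:

        INIT_DELAYED_WORK(&psock->work, sk_psock_backlog);

        schedule_delayed_work(&psock->work, 0); /* behaves like schedule_work() */
        schedule_delayed_work(&psock->work, 1); /* back off for one jiffy */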
- */ - psock->work_state.skb = NULL; __sk_psock_purge_ingress_msg(psock); } @@ -808,7 +832,6 @@ void sk_psock_stop(struct sk_psock *psock) spin_lock_bh(&psock->ingress_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); sk_psock_cork_free(psock); - __sk_psock_zap_ingress(psock); spin_unlock_bh(&psock->ingress_lock); } @@ -822,7 +845,8 @@ static void sk_psock_destroy(struct work_struct *work) sk_psock_done_strp(psock); - cancel_work_sync(&psock->work); + cancel_delayed_work_sync(&psock->work); + __sk_psock_zap_ingress(psock); mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); @@ -832,6 +856,8 @@ static void sk_psock_destroy(struct work_struct *work) if (psock->sk_redir) sock_put(psock->sk_redir); + if (psock->sk_pair) + sock_put(psock->sk_pair); sock_put(psock->sk); kfree(psock); } @@ -850,7 +876,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop(psock); INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); - queue_rcu_work(system_wq, &psock->rwork); + queue_rcu_work(system_percpu_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); @@ -937,7 +963,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) } skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); + schedule_delayed_work(&psock_other->work, 0); spin_unlock_bh(&psock_other->ingress_lock); return 0; } @@ -989,10 +1015,8 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, err = -EIO; sk_other = psock->sk; if (sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { - skb_bpf_redirect_clear(skb); + !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) goto out_free; - } skb_bpf_set_ingress(skb); @@ -1011,28 +1035,29 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, off = stm->offset; len = stm->full_len; } - err = sk_psock_skb_ingress_self(psock, skb, off, len); + err = sk_psock_skb_ingress_self(psock, skb, off, len, false); } if (err < 0) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); err = 0; } spin_unlock_bh(&psock->ingress_lock); - if (err < 0) { - skb_bpf_redirect_clear(skb); + if (err < 0) goto out_free; - } } break; case __SK_REDIRECT: + tcp_eat_skb(psock->sk, skb); err = sk_psock_skb_redirect(psock, skb); break; case __SK_DROP: default: out_free: + skb_bpf_redirect_clear(skb); + tcp_eat_skb(psock->sk, skb); sock_drop(psock->sk, skb); } @@ -1048,7 +1073,7 @@ static void sk_psock_write_space(struct sock *sk) psock = sk_psock(sk); if (likely(psock)) { if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); write_space = psock->saved_write_space; } rcu_read_unlock(); @@ -1077,8 +1102,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); - if (ret == SK_PASS) - skb_bpf_set_strparser(skb); + skb_bpf_set_strparser(skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } @@ -1114,15 +1138,17 @@ static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; + trace_sk_data_ready(sk); + rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { psock->saved_data_ready(sk); } else { - write_lock_bh(&sk->sk_callback_lock); + 
read_lock_bh(&sk->sk_callback_lock); strp_data_ready(&psock->strp); - write_unlock_bh(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); @@ -1130,13 +1156,23 @@ static void sk_psock_strp_data_ready(struct sock *sk) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { + int ret; + static const struct strp_callbacks cb = { .rcv_msg = sk_psock_strp_read, .read_sock_done = sk_psock_strp_read_done, .parse_msg = sk_psock_strp_parse, }; - return strp_init(&psock->strp, sk, &cb); + ret = strp_init(&psock->strp, sk, &cb); + if (!ret) + sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + + if (sk_is_tcp(sk)) { + psock->strp.cb.read_sock = tcp_bpf_strp_read_sock; + psock->copied_seq = tcp_sk(sk)->copied_seq; + } + return ret; } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) @@ -1164,7 +1200,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ - if (psock->progs.stream_parser) + if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED)) strp_done(&psock->strp); } #else @@ -1180,12 +1216,11 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) int ret = __SK_DROP; int len = skb->len; - skb_get(skb); - rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { len = 0; + tcp_eat_skb(sk, skb); sock_drop(sk, skb); goto out; } @@ -1209,10 +1244,26 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; + const struct proto_ops *ops; + int copied; + + trace_sk_data_ready(sk); - if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) + if (unlikely(!sock)) return; - sock->ops->read_skb(sk, sk_psock_verdict_recv); + ops = READ_ONCE(sock->ops); + if (!ops || !ops->read_skb) + return; + copied = ops->read_skb(sk, sk_psock_verdict_recv); + if (copied >= 0) { + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) + sk_psock_data_ready(sk, psock); + rcu_read_unlock(); + } } void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) diff --git a/net/core/sock.c b/net/core/sock.c index f954d5893e79..45c98bf524b2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -85,7 +85,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> @@ -107,6 +107,7 @@ #include <linux/interrupt.h> #include <linux/poll.h> #include <linux/tcp.h> +#include <linux/udp.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/user_namespace.h> @@ -114,15 +115,20 @@ #include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/compat.h> +#include <linux/mroute.h> +#include <linux/mroute6.h> +#include <linux/icmpv6.h> #include <linux/uaccess.h> #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> +#include <linux/skbuff_ref.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> +#include <net/proto_memory.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> @@ -138,15 +144,18 @@ #include <net/tcp.h> #include <net/busy_poll.h> +#include <net/phonet/phonet.h> #include <linux/ethtool.h> +#include <uapi/linux/pidfd.h> + #include "dev.h" static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); -static void sock_def_write_space_wfree(struct sock *sk); +static void sock_def_write_space_wfree(struct sock *sk, int 
wmem_alloc); static void sock_def_write_space(struct sock *sk); /** @@ -272,18 +281,12 @@ static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. */ -__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; +__u32 sysctl_wmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_wmem_max); -__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; +__u32 sysctl_rmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_rmem_max); -__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; -__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; - -/* Maximal space eaten by iovec or ancillary data plus some space */ -int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); -EXPORT_SYMBOL(sysctl_optmem_max); - -int sysctl_tstamp_allow_data __read_mostly = 1; +__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT; +__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); @@ -425,6 +428,7 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, { struct __kernel_sock_timeval tv; int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); + long val; if (err) return err; @@ -435,7 +439,7 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, if (tv.tv_sec < 0) { static int warned __read_mostly; - *timeo_p = 0; + WRITE_ONCE(*timeo_p, 0); if (warned < 10 && net_ratelimit()) { warned++; pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", @@ -443,14 +447,22 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, } return 0; } - *timeo_p = MAX_SCHEDULE_TIMEOUT; - if (tv.tv_sec == 0 && tv.tv_usec == 0) - return 0; - if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) - *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); + val = MAX_SCHEDULE_TIMEOUT; + if ((tv.tv_sec || tv.tv_usec) && + (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) + val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, + USEC_PER_SEC / HZ); + WRITE_ONCE(*timeo_p, val); return 0; } +static bool sk_set_prio_allowed(const struct sock *sk, int val) +{ + return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); +} + static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { @@ -478,14 +490,14 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { - atomic_inc(&sk->sk_drops); + if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { + sk_drops_inc(sk); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return -ENOBUFS; } @@ -514,11 +526,10 @@ int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason drop_reason; int err; - err = sk_filter(sk, skb); - if (err) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + err = sk_filter_reason(sk, skb, &drop_reason); + if (err) goto out; - } + err = __sock_queue_rcv_skb(sk, skb); switch (err) { case -ENOMEM: @@ -541,15 +552,18 @@ EXPORT_SYMBOL(sock_queue_rcv_skb_reason); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool 
refcounted) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; int rc = NET_RX_SUCCESS; + int err; - if (sk_filter_trim_cap(sk, skb, trim_cap)) + if (sk_filter_trim_cap(sk, skb, trim_cap, &reason)) goto discard_and_relse; skb->dev = NULL; - if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { + sk_drops_inc(sk); + reason = SKB_DROP_REASON_SOCKET_RCVBUFF; goto discard_and_relse; } if (nested) @@ -565,9 +579,13 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, _RET_IP_); - } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { + } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) { bh_unlock_sock(sk); - atomic_inc(&sk->sk_drops); + if (err == -ENOMEM) + reason = SKB_DROP_REASON_PFMEMALLOC; + if (err == -ENOBUFS) + reason = SKB_DROP_REASON_SOCKET_BACKLOG; + sk_drops_inc(sk); goto discard_and_relse; } @@ -577,7 +595,7 @@ out: sock_put(sk); return rc; discard_and_relse: - kfree_skb(skb); + sk_skb_reason_drop(sk, skb, reason); goto out; } EXPORT_SYMBOL(__sk_receive_skb); @@ -590,11 +608,11 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->obsolete && + if (dst && READ_ONCE(dst->obsolete) && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_tx_queue_clear(sk); - sk->sk_dst_pending_confirm = 0; + WRITE_ONCE(sk->sk_dst_pending_confirm, 0); RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; @@ -608,7 +626,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); - if (dst && dst->obsolete && + if (dst && READ_ONCE(dst->obsolete) && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_dst_reset(sk); @@ -753,18 +771,19 @@ out: return ret; } -bool sk_mc_loop(struct sock *sk) +bool sk_mc_loop(const struct sock *sk) { if (dev_recursion_level()) return false; if (!sk) return true; - switch (sk->sk_family) { + /* IPV6_ADDRFORM can change sk->sk_family under us. 
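The comment above carries the whole fix: IPV6_ADDRFORM can rewrite sk->sk_family on a live socket, so a lockless reader must branch on a single annotated snapshot. A hedged sketch of the pairing (the handler names are invented):

        /* Writer, in the IPv6 setsockopt path under lock_sock(): */
        WRITE_ONCE(sk->sk_family, AF_INET);

        /* Lockless reader: snapshot once and branch on the snapshot,
         * so both arms agree on which family was observed. */
        sa_family_t family = READ_ONCE(sk->sk_family);

        if (family == AF_INET)
                handle_v4(sk);
        else if (family == AF_INET6)
                handle_v6(sk);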
*/ + switch (READ_ONCE(sk->sk_family)) { case AF_INET: - return inet_sk(sk)->mc_loop; + return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - return inet6_sk(sk)->mc_loop; + return inet6_test_bit(MC6_LOOP, sk); #endif } WARN_ON_ONCE(1); @@ -791,7 +810,7 @@ EXPORT_SYMBOL(sock_set_reuseport); void sock_no_linger(struct sock *sk) { lock_sock(sk); - sk->sk_lingertime = 0; + WRITE_ONCE(sk->sk_lingertime, 0); sock_set_flag(sk, SOCK_LINGER); release_sock(sk); } @@ -799,44 +818,29 @@ EXPORT_SYMBOL(sock_no_linger); void sock_set_priority(struct sock *sk, u32 priority) { - lock_sock(sk); - sk->sk_priority = priority; - release_sock(sk); + WRITE_ONCE(sk->sk_priority, priority); } EXPORT_SYMBOL(sock_set_priority); void sock_set_sndtimeo(struct sock *sk, s64 secs) { - lock_sock(sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) - sk->sk_sndtimeo = secs * HZ; + WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); else - sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; - release_sock(sk); + WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); } EXPORT_SYMBOL(sock_set_sndtimeo); static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) { + sock_valbool_flag(sk, SOCK_RCVTSTAMP, val); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns); if (val) { sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); - sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); - sock_set_flag(sk, SOCK_RCVTSTAMP); sock_enable_timestamp(sk, SOCK_TIMESTAMP); - } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); } } -void sock_enable_timestamps(struct sock *sk) -{ - lock_sock(sk); - __sock_set_timestamps(sk, true, false, true); - release_sock(sk); -} -EXPORT_SYMBOL(sock_enable_timestamps); - void sock_set_timestamp(struct sock *sk, int optname, bool valbool) { switch (optname) { @@ -887,7 +891,7 @@ static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) if (!match) return -EINVAL; - sk->sk_bind_phc = phc_index; + WRITE_ONCE(sk->sk_bind_phc, phc_index); return 0; } @@ -930,8 +934,9 @@ int sock_set_timestamping(struct sock *sk, int optname, return ret; } - sk->sk_tsflags = val; + WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY)); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, @@ -942,6 +947,20 @@ int sock_set_timestamping(struct sock *sk, int optname, return 0; } +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ + struct bpf_sock_ops_kern sock_ops; + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = op; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, 0); + __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); +} +#endif + void sock_set_keepalive(struct sock *sk) { lock_sock(sk); @@ -984,7 +1003,7 @@ EXPORT_SYMBOL(sock_set_rcvbuf); static void __sock_set_mark(struct sock *sk, u32 val) { if (val != sk->sk_mark) { - sk->sk_mark = val; + WRITE_ONCE(sk->sk_mark, val); sk_dst_reset(sk); } } @@ -1003,7 +1022,7 @@ static void sock_release_reserved_memory(struct sock *sk, int bytes) bytes = round_down(bytes, PAGE_SIZE); WARN_ON(bytes > sk->sk_reserved_mem); - sk->sk_reserved_mem -= bytes; + WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); sk_mem_reclaim(sk); } @@ -1013,7 +1032,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) bool charged; int pages; - if 
(!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) + if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) @@ -1022,29 +1041,105 @@ static int sock_reserve_memory(struct sock *sk, int bytes) pages = sk_mem_pages(bytes); /* pre-charge to memcg */ - charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + charged = mem_cgroup_sk_charge(sk, pages, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM; + if (sk->sk_bypass_prot_mem) + goto success; + /* pre-charge to forward_alloc */ sk_memory_allocated_add(sk, pages); allocated = sk_memory_allocated(sk); + /* If the system goes into memory pressure with this * precharge, give up and return error. */ if (allocated > sk_prot_mem_limits(sk, 1)) { sk_memory_allocated_sub(sk, pages); - mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); + mem_cgroup_sk_uncharge(sk, pages); return -ENOMEM; } - sk->sk_forward_alloc += pages << PAGE_SHIFT; - sk->sk_reserved_mem += pages << PAGE_SHIFT; +success: + sk_forward_alloc_add(sk, pages << PAGE_SHIFT); + + WRITE_ONCE(sk->sk_reserved_mem, + sk->sk_reserved_mem + (pages << PAGE_SHIFT)); return 0; } +#ifdef CONFIG_PAGE_POOL + +/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED + * in 1 syscall. The limit exists to limit the amount of memory the kernel + * allocates to copy these tokens, and to prevent looping over the frags for + * too long. + */ +#define MAX_DONTNEED_TOKENS 128 +#define MAX_DONTNEED_FRAGS 1024 + +static noinline_for_stack int +sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) +{ + unsigned int num_tokens, i, j, k, netmem_num = 0; + struct dmabuf_token *tokens; + int ret = 0, num_frags = 0; + netmem_ref netmems[16]; + + if (!sk_is_tcp(sk)) + return -EBADF; + + if (optlen % sizeof(*tokens) || + optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) + return -EINVAL; + + num_tokens = optlen / sizeof(*tokens); + tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); + if (!tokens) + return -ENOMEM; + + if (copy_from_sockptr(tokens, optval, optlen)) { + kvfree(tokens); + return -EFAULT; + } + + xa_lock_bh(&sk->sk_user_frags); + for (i = 0; i < num_tokens; i++) { + for (j = 0; j < tokens[i].token_count; j++) { + if (++num_frags > MAX_DONTNEED_FRAGS) + goto frag_limit_reached; + + netmem_ref netmem = (__force netmem_ref)__xa_erase( + &sk->sk_user_frags, tokens[i].token_start + j); + + if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + continue; + + netmems[netmem_num++] = netmem; + if (netmem_num == ARRAY_SIZE(netmems)) { + xa_unlock_bh(&sk->sk_user_frags); + for (k = 0; k < netmem_num; k++) + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); + netmem_num = 0; + xa_lock_bh(&sk->sk_user_frags); + } + ret++; + } + } + +frag_limit_reached: + xa_unlock_bh(&sk->sk_user_frags); + for (k = 0; k < netmem_num; k++) + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); + + kvfree(tokens); + return ret; +} +#endif + void sockopt_lock_sock(struct sock *sk) { /* When current->bpf_ctx is set, the setsockopt is called from @@ -1079,6 +1174,17 @@ bool sockopt_capable(int cap) } EXPORT_SYMBOL(sockopt_capable); +static int sockopt_validate_clockid(__kernel_clockid_t value) +{ + switch (value) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_TAI: + return 0; + } + return -EINVAL; +} + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. 
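For the sock_devmem_dontneed() handler above, the userspace side is a plain setsockopt() on a TCP socket carrying an array of struct dmabuf_token; per the implementation, a call is capped at 128 tokens / 1024 frags and returns the number of frags actually released. A hedged usage sketch (token values are illustrative stand-ins for ones received via earlier devmem cmsgs; assumes uapi headers recent enough to define SO_DEVMEM_DONTNEED):

        #include <stdio.h>
        #include <sys/socket.h>
        #include <linux/uio.h>          /* struct dmabuf_token */

        struct dmabuf_token tok = {
                .token_start = 1234,    /* first frag token to return */
                .token_count = 16,      /* and the 15 that follow it */
        };
        int freed = setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED,
                               &tok, sizeof(tok));
        if (freed < 0)
                perror("SO_DEVMEM_DONTNEED");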
@@ -1110,6 +1216,97 @@ int sk_setsockopt(struct sock *sk, int level, int optname, valbool = val ? 1 : 0; + /* handle options which do not require locking the socket. */ + switch (optname) { + case SO_PRIORITY: + if (sk_set_prio_allowed(sk, val)) { + sock_set_priority(sk, val); + return 0; + } + return -EPERM; + case SO_TYPE: + case SO_PROTOCOL: + case SO_DOMAIN: + case SO_ERROR: + return -ENOPROTOOPT; +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: + if (val < 0) + return -EINVAL; + WRITE_ONCE(sk->sk_ll_usec, val); + return 0; + case SO_PREFER_BUSY_POLL: + if (valbool && !sockopt_capable(CAP_NET_ADMIN)) + return -EPERM; + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + return 0; + case SO_BUSY_POLL_BUDGET: + if (val > READ_ONCE(sk->sk_busy_poll_budget) && + !sockopt_capable(CAP_NET_ADMIN)) + return -EPERM; + if (val < 0 || val > U16_MAX) + return -EINVAL; + WRITE_ONCE(sk->sk_busy_poll_budget, val); + return 0; +#endif + case SO_MAX_PACING_RATE: + { + unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; + unsigned long pacing_rate; + + if (sizeof(ulval) != sizeof(val) && + optlen >= sizeof(ulval) && + copy_from_sockptr(&ulval, optval, sizeof(ulval))) { + return -EFAULT; + } + if (ulval != ~0UL) + cmpxchg(&sk->sk_pacing_status, + SK_PACING_NONE, + SK_PACING_NEEDED); + /* Pairs with READ_ONCE() from sk_getsockopt() */ + WRITE_ONCE(sk->sk_max_pacing_rate, ulval); + pacing_rate = READ_ONCE(sk->sk_pacing_rate); + if (ulval < pacing_rate) + WRITE_ONCE(sk->sk_pacing_rate, ulval); + return 0; + } + case SO_TXREHASH: + if (!sk_is_tcp(sk)) + return -EOPNOTSUPP; + if (val < -1 || val > 1) + return -EINVAL; + if ((u8)val == SOCK_TXREHASH_DEFAULT) + val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); + /* Paired with READ_ONCE() in tcp_rtx_synack() + * and sk_getsockopt(). + */ + WRITE_ONCE(sk->sk_txrehash, (u8)val); + return 0; + case SO_PEEK_OFF: + { + int (*set_peek_off)(struct sock *sk, int val); + + set_peek_off = READ_ONCE(sock->ops)->set_peek_off; + if (set_peek_off) + ret = set_peek_off(sk, val); + else + ret = -EOPNOTSUPP; + return ret; + } +#ifdef CONFIG_PAGE_POOL + case SO_DEVMEM_DONTNEED: + return sock_devmem_dontneed(sk, optval, optlen); +#endif + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + return sock_set_timeout(&sk->sk_sndtimeo, optval, + optlen, optname == SO_SNDTIMEO_OLD); + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + return sock_set_timeout(&sk->sk_rcvtimeo, optval, + optlen, optname == SO_RCVTIMEO_OLD); + } + sockopt_lock_sock(sk); switch (optname) { @@ -1123,13 +1320,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, sk->sk_reuse = (valbool ? 
SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: - sk->sk_reuseport = valbool; - break; - case SO_TYPE: - case SO_PROTOCOL: - case SO_DOMAIN: - case SO_ERROR: - ret = -ENOPROTOOPT; + if (valbool && !sk_is_inet(sk)) + ret = -EOPNOTSUPP; + else + sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); @@ -1205,15 +1399,6 @@ set_sndbuf: sk->sk_no_check_tx = valbool; break; - case SO_PRIORITY: - if ((val >= 0 && val <= 6) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - sk->sk_priority = val; - else - ret = -EPERM; - break; - case SO_LINGER: if (optlen < sizeof(ling)) { ret = -EINVAL; /* 1003.1g */ @@ -1223,15 +1408,15 @@ set_sndbuf: ret = -EFAULT; break; } - if (!ling.l_onoff) + if (!ling.l_onoff) { sock_reset_flag(sk, SOCK_LINGER); - else { -#if (BITS_PER_LONG == 32) - if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) - sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + } else { + unsigned long t_sec = ling.l_linger; + + if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) + WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); else -#endif - sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; + WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); sock_set_flag(sk, SOCK_LINGER); } break; @@ -1239,13 +1424,6 @@ set_sndbuf: case SO_BSDCOMPAT: break; - case SO_PASSCRED: - if (valbool) - set_bit(SOCK_PASSCRED, &sock->flags); - else - clear_bit(SOCK_PASSCRED, &sock->flags); - break; - case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: @@ -1269,26 +1447,19 @@ set_sndbuf: break; case SO_RCVLOWAT: + { + int (*set_rcvlowat)(struct sock *sk, int val) = NULL; + if (val < 0) val = INT_MAX; - if (sock && sock->ops->set_rcvlowat) - ret = sock->ops->set_rcvlowat(sk, val); + if (sock) + set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; + if (set_rcvlowat) + ret = set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); break; - - case SO_RCVTIMEO_OLD: - case SO_RCVTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, - optlen, optname == SO_RCVTIMEO_OLD); - break; - - case SO_SNDTIMEO_OLD: - case SO_SNDTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_sndtimeo, optval, - optlen, optname == SO_SNDTIMEO_OLD); - break; - + } case SO_ATTACH_FILTER: { struct sock_fprog fprog; @@ -1346,12 +1517,6 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); break; - case SO_PASSSEC: - if (valbool) - set_bit(SOCK_PASSSEC, &sock->flags); - else - clear_bit(SOCK_PASSSEC, &sock->flags); - break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { @@ -1362,15 +1527,13 @@ set_sndbuf: __sock_set_mark(sk, val); break; case SO_RCVMARK: - if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - ret = -EPERM; - break; - } - sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; + case SO_RCVPRIORITY: + sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); + break; + case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; @@ -1379,13 +1542,6 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break; - case SO_PEEK_OFF: - if (sock->ops->set_peek_off) - ret = sock->ops->set_peek_off(sk, val); - else - ret = -EOPNOTSUPP; - break; - case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; @@ -1394,54 +1550,34 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; -#ifdef CONFIG_NET_RX_BUSY_POLL - case SO_BUSY_POLL: - /* allow unprivileged users to decrease the value */ - if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN)) - ret = -EPERM; - else { - if (val < 0) - ret = -EINVAL; - else - WRITE_ONCE(sk->sk_ll_usec, val); - } - break; - case SO_PREFER_BUSY_POLL: - if (valbool && !sockopt_capable(CAP_NET_ADMIN)) - ret = -EPERM; + case SO_PASSCRED: + if (sk_may_scm_recv(sk)) + sk->sk_scm_credentials = valbool; else - WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + ret = -EOPNOTSUPP; break; - case SO_BUSY_POLL_BUDGET: - if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) { - ret = -EPERM; - } else { - if (val < 0 || val > U16_MAX) - ret = -EINVAL; - else - WRITE_ONCE(sk->sk_busy_poll_budget, val); - } + + case SO_PASSSEC: + if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) + sk->sk_scm_security = valbool; + else + ret = -EOPNOTSUPP; break; -#endif - case SO_MAX_PACING_RATE: - { - unsigned long ulval = (val == ~0U) ? 
~0UL : (unsigned int)val; + case SO_PASSPIDFD: + if (sk_is_unix(sk)) + sk->sk_scm_pidfd = valbool; + else + ret = -EOPNOTSUPP; + break; - if (sizeof(ulval) != sizeof(val) && - optlen >= sizeof(ulval) && - copy_from_sockptr(&ulval, optval, sizeof(ulval))) { - ret = -EFAULT; - break; - } - if (ulval != ~0UL) - cmpxchg(&sk->sk_pacing_status, - SK_PACING_NONE, - SK_PACING_NEEDED); - sk->sk_max_pacing_rate = ulval; - sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); + case SO_PASSRIGHTS: + if (sk_is_unix(sk)) + sk->sk_scm_rights = valbool; + else + ret = -EOPNOTSUPP; break; - } + case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); break; @@ -1488,6 +1624,11 @@ set_sndbuf: ret = -EPERM; break; } + + ret = sockopt_validate_clockid(sk_txtime.clockid); + if (ret) + break; + sock_valbool_flag(sk, SOCK_TXTIME, true); sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = @@ -1526,15 +1667,6 @@ set_sndbuf: break; } - case SO_TXREHASH: - if (val < -1 || val > 1) { - ret = -EINVAL; - break; - } - /* Paired with READ_ONCE() in tcp_rtx_synack() */ - WRITE_ONCE(sk->sk_txrehash, (u8)val); - break; - default: ret = -ENOPROTOOPT; break; @@ -1631,11 +1763,11 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_SNDBUF: - v.val = sk->sk_sndbuf; + v.val = READ_ONCE(sk->sk_sndbuf); break; case SO_RCVBUF: - v.val = sk->sk_rcvbuf; + v.val = READ_ONCE(sk->sk_rcvbuf); break; case SO_REUSEADDR: @@ -1677,13 +1809,13 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PRIORITY: - v.val = sk->sk_priority; + v.val = READ_ONCE(sk->sk_priority); break; case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); - v.ling.l_linger = sk->sk_lingertime / HZ; + v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; break; case SO_BSDCOMPAT: @@ -1708,23 +1840,32 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: lv = sizeof(v.timestamping); - v.timestamping.flags = sk->sk_tsflags; - v.timestamping.bind_phc = sk->sk_bind_phc; + /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only + * returning the flags when they were set through the same option. + * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
+ */ + if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { + v.timestamping.flags = READ_ONCE(sk->sk_tsflags); + v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); + } break; case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: - lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); + lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, + SO_RCVTIMEO_OLD == optname); break; case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: - lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); + lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, + SO_SNDTIMEO_OLD == optname); break; case SO_RCVLOWAT: - v.val = sk->sk_rcvlowat; + v.val = READ_ONCE(sk->sk_rcvlowat); break; case SO_SNDLOWAT: @@ -1732,7 +1873,24 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PASSCRED: - v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); + if (!sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_credentials; + break; + + case SO_PASSPIDFD: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_pidfd; + break; + + case SO_PASSRIGHTS: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_rights; break; case SO_PEERCRED: @@ -1750,6 +1908,47 @@ int sk_getsockopt(struct sock *sk, int level, int optname, goto lenout; } + case SO_PEERPIDFD: + { + struct pid *peer_pid; + struct file *pidfd_file = NULL; + unsigned int flags = 0; + int pidfd; + + if (len > sizeof(pidfd)) + len = sizeof(pidfd); + + spin_lock(&sk->sk_peer_lock); + peer_pid = get_pid(sk->sk_peer_pid); + spin_unlock(&sk->sk_peer_lock); + + if (!peer_pid) + return -ENODATA; + + /* The use of PIDFD_STALE requires stashing of struct pid + * on pidfs with pidfs_register_pid() and only AF_UNIX + * were prepared for this. + */ + if (sk->sk_family == AF_UNIX) + flags = PIDFD_STALE; + + pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); + put_pid(peer_pid); + if (pidfd < 0) + return pidfd; + + if (copy_to_sockptr(optval, &pidfd, len) || + copy_to_sockptr(optlen, &len, sizeof(int))) { + put_unused_fd(pidfd); + fput(pidfd_file); + + return -EFAULT; + } + + fd_install(pidfd, pidfd_file); + return 0; + } + case SO_PEERGROUPS: { const struct cred *cred; @@ -1776,14 +1975,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_PEERNAME: { - char address[128]; + struct sockaddr_storage address; - lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); + lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; - if (copy_to_sockptr(optval, address, len)) + if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } @@ -1796,7 +1995,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PASSSEC: - v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); + if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_security; break; case SO_PEERSEC: @@ -1804,13 +2006,17 @@ int sk_getsockopt(struct sock *sk, int level, int optname, optval, optlen, len); case SO_MARK: - v.val = sk->sk_mark; + v.val = READ_ONCE(sk->sk_mark); break; case SO_RCVMARK: v.val = sock_flag(sk, SOCK_RCVMARK); break; + case SO_RCVPRIORITY: + v.val = sock_flag(sk, SOCK_RCVPRIORITY); + break; + case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; @@ -1820,10 +2026,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PEEK_OFF: - if (!sock->ops->set_peek_off) + if 
(!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP; - v.val = sk->sk_peek_off; + v.val = READ_ONCE(sk->sk_peek_off); break; case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); @@ -1853,7 +2059,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: - v.val = sk->sk_ll_usec; + v.val = READ_ONCE(sk->sk_ll_usec); break; case SO_PREFER_BUSY_POLL: v.val = READ_ONCE(sk->sk_prefer_busy_poll); @@ -1861,12 +2067,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, #endif case SO_MAX_PACING_RATE: + /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { lv = sizeof(v.ulval); - v.ulval = sk->sk_max_pacing_rate; + v.ulval = READ_ONCE(sk->sk_max_pacing_rate); } else { /* 32bit version */ - v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); + v.val = min_t(unsigned long, ~0U, + READ_ONCE(sk->sk_max_pacing_rate)); } break; @@ -1892,7 +2100,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = READ_ONCE(sk->sk_napi_id); /* aggregate non-NAPI IDs down to 0 */ - if (v.val < MIN_NAPI_ID) + if (!napi_id_valid(v.val)) v.val = 0; break; @@ -1934,11 +2142,15 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_RESERVE_MEM: - v.val = sk->sk_reserved_mem; + v.val = READ_ONCE(sk->sk_reserved_mem); break; case SO_TXREHASH: - v.val = sk->sk_txrehash; + if (!sk_is_tcp(sk)) + return -EOPNOTSUPP; + + /* Paired with WRITE_ONCE() in sk_setsockopt() */ + v.val = READ_ONCE(sk->sk_txrehash); break; default: @@ -1958,14 +2170,6 @@ lenout: return 0; } -int sock_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - return sk_getsockopt(sock->sk, level, optname, - USER_SOCKPTR(optval), - USER_SOCKPTR(optlen)); -} - /* * Initialize an sk_lock. * @@ -1973,6 +2177,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname, */ static inline void sock_lock_init(struct sock *sk) { + sk_owner_clear(sk); + if (sk->sk_kern_sock) sock_lock_init_class_and_name( sk, @@ -1991,7 +2197,7 @@ static inline void sock_lock_init(struct sock *sk) /* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, - * even temporarly, because of RCU lookups. sk_node should also be left as is. + * even temporarily, because of RCU lookups. sk_node should also be left as is. 
* We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end */ static void sock_copy(struct sock *nsk, const struct sock *osk) @@ -2012,8 +2218,9 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); - memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, - prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); + unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, + prot->obj_size - offsetof(struct sock, sk_dontcopy_end), + /* alloc is larger than struct, see sk_prot_alloc() */); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; @@ -2068,6 +2275,9 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) cgroup_sk_free(&sk->sk_cgrp_data); mem_cgroup_sk_free(sk); security_sk_free(sk); + + sk_owner_put(sk); + if (slab != NULL) kmem_cache_free(slab, sk); else @@ -2096,19 +2306,25 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; + + if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) + sk->sk_bypass_prot_mem = 1; + sk->sk_kern_sock = kern; sock_lock_init(sk); + sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } else { + net_passive_inc(net); __netns_tracker_alloc(net, &sk->ns_tracker, false, priority); } sock_net_set(sk, net); - refcount_set(&sk->sk_wmem_alloc, 1); + refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); @@ -2127,6 +2343,7 @@ EXPORT_SYMBOL(sk_alloc); static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); + struct net *net = sock_net(sk); struct sk_filter *filter; if (sk->sk_destruct) @@ -2158,14 +2375,28 @@ static void __sk_destruct(struct rcu_head *head) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - if (likely(sk->sk_net_refcnt)) - put_net_track(sock_net(sk), &sk->ns_tracker); - else - __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); - + if (likely(sk->sk_net_refcnt)) { + put_net_track(net, &sk->ns_tracker); + } else { + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + } sk_prot_free(sk->sk_prot_creator, sk); } +void sk_net_refcnt_upgrade(struct sock *sk) +{ + struct net *net = sock_net(sk); + + WARN_ON_ONCE(sk->sk_net_refcnt); + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); +} +EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); + void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); @@ -2220,19 +2451,27 @@ static void sk_init_common(struct sock *sk) lockdep_set_class_and_name(&sk->sk_error_queue.lock, af_elock_keys + sk->sk_family, af_family_elock_key_strings[sk->sk_family]); - lockdep_set_class_and_name(&sk->sk_callback_lock, + if (sk->sk_kern_sock) + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_kern_callback_keys + sk->sk_family, + af_family_kern_clock_key_strings[sk->sk_family]); + else + lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); } /** - * sk_clone_lock - clone a socket, and lock its clone - * @sk: the socket to clone - * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * sk_clone - clone a socket + * @sk: the socket to clone + * @priority: for allocation 
(%GFP_KERNEL, %GFP_ATOMIC, etc) + * @lock: if true, lock the cloned sk * - * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + * If @lock is true, the clone is locked by bh_lock_sock(), and + * caller must unlock socket even in error path by bh_unlock_sock(). */ -struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) +struct sock *sk_clone(const struct sock *sk, const gfp_t priority, + bool lock) { struct proto *prot = READ_ONCE(sk->sk_prot); struct sk_filter *filter; @@ -2257,19 +2496,23 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) * is not properly dismantling its kernel sockets at netns * destroy time. */ + net_passive_inc(sock_net(newsk)); __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } + sk_node_init(&newsk->sk_node); sock_lock_init(newsk); - bh_lock_sock(newsk); + + if (lock) + bh_lock_sock(newsk); + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); - /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ - refcount_set(&newsk->sk_wmem_alloc, 1); + refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); atomic_set(&newsk->sk_omem_alloc, 0); sk_init_common(newsk); @@ -2279,15 +2522,18 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; - atomic_set(&newsk->sk_drops, 0); + DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); + sk_drops_reset(newsk); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); +#ifdef CONFIG_MEMCG /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; +#endif cgroup_sk_clone(&newsk->sk_cgrp_data); @@ -2309,17 +2555,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; + + goto free; } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); - if (bpf_sk_storage_clone(sk, newsk)) { - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } + if (bpf_sk_storage_clone(sk, newsk)) + goto free; /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. @@ -2338,17 +2581,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) smp_wmb(); refcount_set(&newsk->sk_refcnt, 2); - /* Increment the counter in the same struct proto as the master - * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that - * is the same as sk->sk_prot->socks, as this field was copied - * with memcpy). - * - * This _changes_ the previous behaviour, where - * tcp_create_openreq_child always was incrementing the - * equivalent to tcp_prot->socks (inet_sock_nr), so this have - * to be taken into account in all callers. 
-acme - */ - sk_refcnt_debug_inc(newsk); sk_set_socket(newsk, NULL); sk_tx_queue_clear(newsk); RCU_INIT_POINTER(newsk->sk_wq, NULL); @@ -2360,40 +2592,51 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) net_enable_timestamp(); out: return newsk; -} -EXPORT_SYMBOL_GPL(sk_clone_lock); - -void sk_free_unlock_clone(struct sock *sk) -{ +free: /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - sk->sk_destruct = NULL; - bh_unlock_sock(sk); - sk_free(sk); + * destructor and make plain sk_free() + */ + newsk->sk_destruct = NULL; + if (lock) + bh_unlock_sock(newsk); + sk_free(newsk); + newsk = NULL; + goto out; } -EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +EXPORT_SYMBOL_GPL(sk_clone); -static void sk_trim_gso_size(struct sock *sk) +static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) { - if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) - return; + bool is_ipv6 = false; + u32 max_size; + #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6 && - sk_is_tcp(sk) && - !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) - return; + is_ipv6 = (sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif - sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; + /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ + max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) : + READ_ONCE(dev->gso_ipv4_max_size); + if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) + max_size = GSO_LEGACY_MAX_SIZE; + + return max_size - (MAX_TCP_HEADER + 1); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + const struct net_device *dev; u32 max_segs = 1; - sk_dst_set(sk, dst); - sk->sk_route_caps = dst->dev->features; - if (sk_is_tcp(sk)) + rcu_read_lock(); + dev = dst_dev_rcu(dst); + sk->sk_route_caps = dev->features; + if (sk_is_tcp(sk)) { + struct inet_connection_sock *icsk = inet_csk(sk); + sk->sk_route_caps |= NETIF_F_GSO; + icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); + } if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; if (unlikely(sk->sk_gso_disabled)) @@ -2403,15 +2646,14 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ - sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); - sk_trim_gso_size(sk); - sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); + sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ - max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); + max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; + sk_dst_set(sk, dst); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(sk_setup_caps); @@ -2425,16 +2667,18 @@ EXPORT_SYMBOL_GPL(sk_setup_caps); */ void sock_wfree(struct sk_buff *skb) { - struct sock *sk = skb->sk; unsigned int len = skb->truesize; + struct sock *sk = skb->sk; bool free; + int old; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { if (sock_flag(sk, SOCK_RCU_FREE) && sk->sk_write_space == sock_def_write_space) { rcu_read_lock(); - free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); - sock_def_write_space_wfree(sk); + free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc, + &old); + sock_def_write_space_wfree(sk, old - len); rcu_read_unlock(); if (unlikely(free)) __sk_free(sk); @@ -2471,35 +2715,41 @@ void __sock_wfree(struct sk_buff *skb) void 
skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { + int old_wmem; + skb_orphan(skb); - skb->sk = sk; #ifdef CONFIG_INET - if (unlikely(!sk_fullsock(sk))) { - skb->destructor = sock_edemux; - sock_hold(sk); - return; - } + if (unlikely(!sk_fullsock(sk))) + return skb_set_owner_edemux(skb, sk); #endif + skb->sk = sk; skb->destructor = sock_wfree; skb_set_hash_from_sk(skb, sk); /* * We used to take a refcount on sk, but following operation - * is enough to guarantee sk_free() wont free this sock until + * is enough to guarantee sk_free() won't free this sock until * all in-flight packets are completed */ - refcount_add(skb->truesize, &sk->sk_wmem_alloc); + __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem); + + /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket + * is in a host queue (qdisc, NIC queue). + * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue + * based on XPS for better performance. + * Otherwise clear ooo_okay to not risk Out Of Order delivery. + */ + skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS); } EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { -#ifdef CONFIG_TLS_DEVICE /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. */ - if (skb->decrypted) + if (skb_is_decrypted(skb)) return false; -#endif + return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } @@ -2551,33 +2801,21 @@ EXPORT_SYMBOL(sock_efree); #ifdef CONFIG_INET void sock_pfree(struct sk_buff *skb) { - if (sk_is_refcounted(skb->sk)) - sock_gen_put(skb->sk); -} -EXPORT_SYMBOL(sock_pfree); -#endif /* CONFIG_INET */ - -kuid_t sock_i_uid(struct sock *sk) -{ - kuid_t uid; + struct sock *sk = skb->sk; - read_lock_bh(&sk->sk_callback_lock); - uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; - read_unlock_bh(&sk->sk_callback_lock); - return uid; -} -EXPORT_SYMBOL(sock_i_uid); + if (!sk_is_refcounted(sk)) + return; -unsigned long sock_i_ino(struct sock *sk) -{ - unsigned long ino; + if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { + inet_reqsk(sk)->rsk_listener = NULL; + reqsk_free(inet_reqsk(sk)); + return; + } - read_lock_bh(&sk->sk_callback_lock); - ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; - read_unlock_bh(&sk->sk_callback_lock); - return ino; + sock_gen_put(sk); } -EXPORT_SYMBOL(sock_i_ino); +EXPORT_SYMBOL(sock_pfree); +#endif /* CONFIG_INET */ /* * Allocate a skb from the socket's send buffer. @@ -2612,7 +2850,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > - READ_ONCE(sysctl_optmem_max)) + READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); @@ -2630,7 +2868,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { - int optmem_max = READ_ONCE(sysctl_optmem_max); + int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if ((unsigned int)size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { @@ -2648,6 +2886,22 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) } EXPORT_SYMBOL(sock_kmalloc); +/* + * Duplicate the input "src" memory block using the socket's + * option memory buffer. 
+ */ +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority) +{ + void *mem; + + mem = sock_kmalloc(sk, size, priority); + if (mem) + memcpy(mem, src, size); + return mem; +} +EXPORT_SYMBOL(sock_kmemdup); + /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. @@ -2693,9 +2947,9 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; - if (sk->sk_shutdown & SEND_SHUTDOWN) + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) break; - if (sk->sk_err) + if (READ_ONCE(sk->sk_err)) break; timeo = schedule_timeout(timeo); } @@ -2723,7 +2977,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, goto failure; err = -EPIPE; - if (sk->sk_shutdown & SEND_SHUTDOWN) + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto failure; if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) @@ -2757,6 +3011,8 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, { u32 tsflags; + BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); + switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && @@ -2767,6 +3023,7 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; @@ -2784,10 +3041,33 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, return -EINVAL; sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); break; + case SCM_TS_OPT_ID: + if (sk_is_tcp(sk)) + return -EINVAL; + tsflags = READ_ONCE(sk->sk_tsflags); + if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); + sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; + break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
*/ case SCM_RIGHTS: case SCM_CREDENTIALS: break; + case SO_PRIORITY: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) + return -EPERM; + sockc->priority = *(u32 *)CMSG_DATA(cmsg); + break; + case SCM_DEVMEM_DMABUF: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); + break; default: return -EINVAL; } @@ -2825,7 +3105,8 @@ static void sk_enter_memory_pressure(struct sock *sk) static void sk_leave_memory_pressure(struct sock *sk) { if (sk->sk_prot->leave_memory_pressure) { - sk->sk_prot->leave_memory_pressure(sk); + INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, + tcp_leave_memory_pressure, sk); } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure; @@ -2885,8 +3166,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) return true; - sk_enter_memory_pressure(sk); + if (!sk->sk_bypass_prot_mem) + sk_enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return false; } EXPORT_SYMBOL(sk_page_frag_refill); @@ -2914,23 +3198,27 @@ void __release_sock(struct sock *sk) __acquires(&sk->sk_lock.slock) { struct sk_buff *skb, *next; + int nb = 0; while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; spin_unlock_bh(&sk->sk_lock.slock); - do { + while (1) { next = skb->next; prefetch(next); DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); - cond_resched(); - skb = next; - } while (skb != NULL); + if (!skb) + break; + + if (!(++nb & 15)) + cond_resched(); + } spin_lock_bh(&sk->sk_lock.slock); } @@ -2946,6 +3234,11 @@ void __sk_flush_backlog(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); __release_sock(sk); + + if (sk->sk_prot->release_cb) + INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, + tcp_release_cb, sk); + spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(__sk_flush_backlog); @@ -2982,21 +3275,34 @@ EXPORT_SYMBOL(sk_wait_data); * @amt: pages to allocate * @kind: allocation type * - * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc + * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. + * + * Unlike the globally shared limits among the sockets under same protocol, + * consuming the budget of a memcg won't have direct effect on other ones. + * So be optimistic about memcg's tolerance, and leave the callers to decide + * whether or not to raise allocated through sk_under_memory_pressure() or + * its variants. */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { - bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg; + bool memcg_enabled = false, charged = false; struct proto *prot = sk->sk_prot; - bool charged = true; - long allocated; + long allocated = 0; - sk_memory_allocated_add(sk, amt); - allocated = sk_memory_allocated(sk); - if (memcg_charge && - !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt, - gfp_memcg_charge()))) - goto suppress_allocation; + if (!sk->sk_bypass_prot_mem) { + sk_memory_allocated_add(sk, amt); + allocated = sk_memory_allocated(sk); + } + + if (mem_cgroup_sk_enabled(sk)) { + memcg_enabled = true; + charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge()); + if (!charged) + goto suppress_allocation; + } + + if (!allocated) + return 1; /* Under limit. 
*/ if (allocated <= sk_prot_mem_limits(sk, 0)) { @@ -3012,7 +3318,14 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; - /* guarantee minimum buffer size under pressure */ + /* Guarantee minimum buffer size under pressure (either global + * or memcg) to make sure features described in RFC 7323 (TCP + * Extensions for High Performance) work properly. + * + * This rule does NOT stand when usage exceeds the global or + * memcg hard limit, or else a DoS attack could take place by + * spawning lots of sockets whose usage stays under the minimum buffer size. + */ if (kind == SK_MEM_RECV) { if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; @@ -3031,8 +3344,17 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) if (sk_has_memory_pressure(sk)) { u64 alloc; - if (!sk_under_memory_pressure(sk)) + /* The following 'average' heuristic is within the + * scope of global accounting, so it only makes + * sense for global memory pressure. + */ + if (!sk_under_global_memory_pressure(sk)) return 1; + + /* Try to be fair among all the sockets under global + * pressure by allowing the ones whose usage is below + * average to raise. + */ alloc = sk_sockets_allocated_read_positive(sk); if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + @@ -3051,21 +3373,20 @@ suppress_allocation: */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ - if (memcg_charge && !charged) { - mem_cgroup_charge_skmem(sk->sk_memcg, amt, - gfp_memcg_charge() | __GFP_NOFAIL); - } + if (memcg_enabled && !charged) + mem_cgroup_sk_charge(sk, amt, + gfp_memcg_charge() | __GFP_NOFAIL); return 1; } } - if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) - trace_sock_exceed_buf_limit(sk, prot, allocated, kind); + trace_sock_exceed_buf_limit(sk, prot, allocated, kind); - sk_memory_allocated_sub(sk, amt); + if (allocated) + sk_memory_allocated_sub(sk, amt); - if (memcg_charge && charged) - mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); + if (charged) + mem_cgroup_sk_uncharge(sk, amt); return 0; } @@ -3084,10 +3405,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) { int ret, amt = sk_mem_pages(size); - sk->sk_forward_alloc += amt << PAGE_SHIFT; + sk_forward_alloc_add(sk, amt << PAGE_SHIFT); ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) - sk->sk_forward_alloc -= amt << PAGE_SHIFT; + sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); return ret; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -3101,12 +3422,15 @@ EXPORT_SYMBOL(__sk_mem_schedule); */ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { - sk_memory_allocated_sub(sk, amount); + if (mem_cgroup_sk_enabled(sk)) + mem_cgroup_sk_uncharge(sk, amount); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) - mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); + if (sk->sk_bypass_prot_mem) + return; - if (sk_under_memory_pressure(sk) && + sk_memory_allocated_sub(sk, amount); + + if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } @@ -3119,14 +3443,32 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) void __sk_mem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; - sk->sk_forward_alloc -= amount << PAGE_SHIFT; + sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); +void __sk_charge(struct sock *sk, gfp_t gfp) +{ +
int amt; + + gfp |= __GFP_NOFAIL; + if (mem_cgroup_from_sk(sk)) { + /* The socket has not been accepted yet, no need + * to look at newsk->sk_wmem_queued. + */ + amt = sk_mem_pages(sk->sk_forward_alloc + + atomic_read(&sk->sk_rmem_alloc)); + if (amt) + mem_cgroup_sk_charge(sk, amt, gfp); + } + + kmem_cache_charge(sk, gfp); +} + int sk_set_peek_off(struct sock *sk, int val) { - sk->sk_peek_off = val; + WRITE_ONCE(sk->sk_peek_off, val); return 0; } EXPORT_SYMBOL_GPL(sk_set_peek_off); @@ -3138,13 +3480,13 @@ EXPORT_SYMBOL_GPL(sk_set_peek_off); * function, some default processing is provided. */ -int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_bind); -int sock_no_connect(struct socket *sock, struct sockaddr *saddr, +int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags) { return -EOPNOTSUPP; @@ -3157,8 +3499,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2) } EXPORT_SYMBOL(sock_no_socketpair); -int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +int sock_no_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { return -EOPNOTSUPP; } @@ -3230,36 +3572,6 @@ void __receive_sock(struct file *file) } } -ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) -{ - ssize_t res; - struct msghdr msg = {.msg_flags = flags}; - struct kvec iov; - char *kaddr = kmap(page); - iov.iov_base = kaddr + offset; - iov.iov_len = size; - res = kernel_sendmsg(sock, &msg, &iov, 1, size); - kunmap(page); - return res; -} -EXPORT_SYMBOL(sock_no_sendpage); - -ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - ssize_t res; - struct msghdr msg = {.msg_flags = flags}; - struct kvec iov; - char *kaddr = kmap(page); - - iov.iov_base = kaddr + offset; - iov.iov_len = size; - res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); - kunmap(page); - return res; -} -EXPORT_SYMBOL(sock_no_sendpage_locked); - /* * Default Socket Callbacks */ @@ -3283,7 +3595,7 @@ static void sock_def_error_report(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } @@ -3291,12 +3603,14 @@ void sock_def_readable(struct sock *sk) { struct socket_wq *wq; + trace_sk_data_ready(sk); + rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } @@ -3316,7 +3630,7 @@ static void sock_def_write_space(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); @@ -3326,12 +3640,12 @@ static void sock_def_write_space(struct sock *sk) * for SOCK_RCU_FREE sockets under RCU read section and after putting * ->sk_wmem_alloc. 
*/ -static void sock_def_write_space_wfree(struct sock *sk) +static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc) { /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ - if (sock_writeable(sk)) { + if (__sock_writeable(sk, wmem_alloc)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); /* rely on refcount_sub from sock_wfree() */ @@ -3341,7 +3655,7 @@ static void sock_def_write_space_wfree(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } } @@ -3352,7 +3666,7 @@ static void sock_def_destruct(struct sock *sk) void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) - if (send_sigurg(&sk->sk_socket->file->f_owner)) + if (send_sigurg(sk->sk_socket->file)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); @@ -3367,19 +3681,19 @@ EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { - if (del_timer(timer)) + if (timer_delete(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) { - if (del_timer_sync(timer)) + if (timer_delete_sync(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer_sync); -void sock_init_data(struct socket *sock, struct sock *sk) +void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) { sk_init_common(sk); sk->sk_send_head = NULL; @@ -3399,23 +3713,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; - sk->sk_uid = SOCK_INODE(sock)->i_uid; } else { RCU_INIT_POINTER(sk->sk_wq, NULL); - sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); } - - rwlock_init(&sk->sk_callback_lock); - if (sk->sk_kern_sock) - lockdep_set_class_and_name( - &sk->sk_callback_lock, - af_kern_callback_keys + sk->sk_family, - af_family_kern_clock_key_strings[sk->sk_family]); - else - lockdep_set_class_and_name( - &sk->sk_callback_lock, - af_callback_keys + sk->sk_family, - af_family_clock_key_strings[sk->sk_family]); + sk->sk_uid = uid; sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; @@ -3451,7 +3752,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; - sk->sk_txrehash = SOCK_TXREHASH_DEFAULT; sk_rx_queue_clear(sk); /* @@ -3460,7 +3760,17 @@ void sock_init_data(struct socket *sock, struct sock *sk) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); - atomic_set(&sk->sk_drops, 0); + sk_drops_reset(sk); +} +EXPORT_SYMBOL(sock_init_data_uid); + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + kuid_t uid = sock ? + SOCK_INODE(sock)->i_uid : + make_kuid(sock_net(sk)->user_ns, 0); + + sock_init_data_uid(sock, sk, uid); } EXPORT_SYMBOL(sock_init_data); @@ -3484,11 +3794,9 @@ void release_sock(struct sock *sk) if (sk->sk_backlog.tail) __release_sock(sk); - /* Warning : release_cb() might need to release sk ownership, - * ie call sock_release_ownership(sk) before us. - */ if (sk->sk_prot->release_cb) - sk->sk_prot->release_cb(sk); + INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, + tcp_release_cb, sk); sock_release_ownership(sk); if (waitqueue_active(&sk->sk_lock.wq)) @@ -3626,7 +3934,7 @@ EXPORT_SYMBOL(sock_recv_errqueue); * * FIX: POSIX 1003.1g is very ambiguous here. 
It states that * asynchronous errors should be reported by getsockopt. We assume - * this means if you specify SO_ERROR (otherwise whats the point of it). + * this means if you specify SO_ERROR (otherwise what is the point of it). */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) @@ -3696,8 +4004,6 @@ void sk_common_release(struct sock *sk) xfrm_sk_free_policy(sk); - sk_refcnt_debug_release(sk); - sock_put(sk); } EXPORT_SYMBOL(sk_common_release); @@ -3710,11 +4016,11 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); - mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; + mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); } #ifdef CONFIG_PROC_FS @@ -3776,7 +4082,7 @@ static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); - if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { + if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } @@ -3787,7 +4093,7 @@ static int assign_proto_idx(struct proto *prot) static void release_proto_idx(struct proto *prot) { - if (prot->inuse_idx != PROTO_INUSE_NR - 1) + if (prot->inuse_idx != PROTO_INUSE_NR) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else @@ -4007,7 +4313,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " - "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", + "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), @@ -4028,7 +4334,6 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto) proto_method_implemented(proto->getsockopt), proto_method_implemented(proto->sendmsg), proto_method_implemented(proto->recvmsg), - proto_method_implemented(proto->sendpage), proto_method_implemented(proto->bind), proto_method_implemented(proto->backlog_rcv), proto_method_implemented(proto->hash), @@ -4049,7 +4354,7 @@ static int proto_seq_show(struct seq_file *seq, void *v) "maxhdr", "slab", "module", - "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); + "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); else proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; @@ -4096,16 +4401,149 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) { struct sock *sk = p; - return !skb_queue_empty_lockless(&sk->sk_receive_queue) || - sk_busy_loop_timeout(sk, start_time); + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) + return true; + + if (sk_is_udp(sk) && + !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) + return true; + + return sk_busy_loop_timeout(sk, start_time); } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ -int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) +int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len) { if (!sk->sk_prot->bind_add) return -EOPNOTSUPP; return sk->sk_prot->bind_add(sk, 
addr, addr_len); } EXPORT_SYMBOL(sock_bind_add); + +/* Copy 'size' bytes from userspace and return `size` back to userspace */ +int sock_ioctl_inout(struct sock *sk, unsigned int cmd, + void __user *arg, void *karg, size_t size) +{ + int ret; + + if (copy_from_user(karg, arg, size)) + return -EFAULT; + + ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); + if (ret) + return ret; + + if (copy_to_user(arg, karg, size)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(sock_ioctl_inout); + +/* This is the most common ioctl prep function, where the result (4 bytes) is + * copied back to userspace if the ioctl() returns successfully. No input is + * copied from userspace as input argument. + */ +static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) +{ + int ret, karg = 0; + + ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); + if (ret) + return ret; + + return put_user(karg, (int __user *)arg); +} + +/* A wrapper around sock ioctls, which copies the data from userspace + * (depending on the protocol/ioctl), and copies back the result to userspace. + * The main motivation for this function is to pass kernel memory to the + * protocol ioctl callbacks, instead of userspace memory. + */ +int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + int rc = 1; + + if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) + rc = ipmr_sk_ioctl(sk, cmd, arg); + else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) + rc = ip6mr_sk_ioctl(sk, cmd, arg); + else if (sk_is_phonet(sk)) + rc = phonet_sk_ioctl(sk, cmd, arg); + + /* If ioctl was processed, returns its value */ + if (rc <= 0) + return rc; + + /* Otherwise call the default handler */ + return sock_ioctl_out(sk, cmd, arg); +} +EXPORT_SYMBOL(sk_ioctl); + +static int __init sock_struct_check(void) +{ + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); +#ifdef CONFIG_MEMCG + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); +#endif + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, 
sk_omem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); + return 0; +} + +core_initcall(sock_struct_check); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index b1e29e18d1d6..026ce9bd9e5e 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -16,9 +16,10 @@ #include <linux/inet_diag.h> #include <linux/sock_diag.h> -static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; -static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); -static DEFINE_MUTEX(sock_diag_table_mutex); +static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX]; + +static const struct sock_diag_inet_compat __rcu *inet_rcv_compat; + static struct workqueue_struct *broadcast_wq; DEFINE_COOKIE(sock_cookie); @@ -122,6 +123,24 @@ static size_t sock_diag_nlmsg_size(void) + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ } +static const struct sock_diag_handler *sock_diag_lock_handler(int family) +{ + const struct sock_diag_handler *handler; + + rcu_read_lock(); + handler = rcu_dereference(sock_diag_handlers[family]); + if (handler && !try_module_get(handler->owner)) + handler = NULL; + rcu_read_unlock(); + + return handler; +} + +static void sock_diag_unlock_handler(const struct sock_diag_handler *handler) +{ + module_put(handler->owner); +} + static void 
sock_diag_broadcast_destroy_work(struct work_struct *work) { struct broadcast_sk *bsk = @@ -138,12 +157,12 @@ static void sock_diag_broadcast_destroy_work(struct work_struct *work) if (!skb) goto out; - mutex_lock(&sock_diag_table_mutex); - hndl = sock_diag_handlers[sk->sk_family]; - if (hndl && hndl->get_info) - err = hndl->get_info(skb, sk); - mutex_unlock(&sock_diag_table_mutex); - + hndl = sock_diag_lock_handler(sk->sk_family); + if (hndl) { + if (hndl->get_info) + err = hndl->get_info(skb, sk); + sock_diag_unlock_handler(hndl); + } if (!err) nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group, GFP_KERNEL); @@ -166,51 +185,43 @@ void sock_diag_broadcast_destroy(struct sock *sk) queue_work(broadcast_wq, &bsk->work); } -void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr) { - mutex_lock(&sock_diag_table_mutex); - inet_rcv_compat = fn; - mutex_unlock(&sock_diag_table_mutex); + xchg(&inet_rcv_compat, RCU_INITIALIZER(ptr)); } EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); -void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr) { - mutex_lock(&sock_diag_table_mutex); - inet_rcv_compat = NULL; - mutex_unlock(&sock_diag_table_mutex); + const struct sock_diag_inet_compat *old; + + old = unrcu_pointer(xchg(&inet_rcv_compat, NULL)); + WARN_ON_ONCE(old != ptr); } EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); int sock_diag_register(const struct sock_diag_handler *hndl) { - int err = 0; + int family = hndl->family; - if (hndl->family >= AF_MAX) + if (family >= AF_MAX) return -EINVAL; - mutex_lock(&sock_diag_table_mutex); - if (sock_diag_handlers[hndl->family]) - err = -EBUSY; - else - sock_diag_handlers[hndl->family] = hndl; - mutex_unlock(&sock_diag_table_mutex); - - return err; + return !cmpxchg((const struct sock_diag_handler **) + &sock_diag_handlers[family], + NULL, hndl) ? 
0 : -EBUSY; } EXPORT_SYMBOL_GPL(sock_diag_register); -void sock_diag_unregister(const struct sock_diag_handler *hnld) +void sock_diag_unregister(const struct sock_diag_handler *hndl) { - int family = hnld->family; + int family = hndl->family; if (family >= AF_MAX) return; - mutex_lock(&sock_diag_table_mutex); - BUG_ON(sock_diag_handlers[family] != hnld); - sock_diag_handlers[family] = NULL; - mutex_unlock(&sock_diag_table_mutex); + xchg((const struct sock_diag_handler **)&sock_diag_handlers[family], + NULL); } EXPORT_SYMBOL_GPL(sock_diag_unregister); @@ -227,20 +238,20 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX); - if (sock_diag_handlers[req->sdiag_family] == NULL) + if (!rcu_access_pointer(sock_diag_handlers[req->sdiag_family])) sock_load_diag_module(req->sdiag_family, 0); - mutex_lock(&sock_diag_table_mutex); - hndl = sock_diag_handlers[req->sdiag_family]; + hndl = sock_diag_lock_handler(req->sdiag_family); if (hndl == NULL) - err = -ENOENT; - else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) + return -ENOENT; + + if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) err = hndl->dump(skb, nlh); else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy) err = hndl->destroy(skb, nlh); else err = -EOPNOTSUPP; - mutex_unlock(&sock_diag_table_mutex); + sock_diag_unlock_handler(hndl); return err; } @@ -248,20 +259,25 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + const struct sock_diag_inet_compat *ptr; int ret; switch (nlh->nlmsg_type) { case TCPDIAG_GETSOCK: - case DCCPDIAG_GETSOCK: - if (inet_rcv_compat == NULL) + if (!rcu_access_pointer(inet_rcv_compat)) sock_load_diag_module(AF_INET, 0); - mutex_lock(&sock_diag_table_mutex); - if (inet_rcv_compat != NULL) - ret = inet_rcv_compat(skb, nlh); - else - ret = -EOPNOTSUPP; - mutex_unlock(&sock_diag_table_mutex); + rcu_read_lock(); + ptr = rcu_dereference(inet_rcv_compat); + if (ptr && !try_module_get(ptr->owner)) + ptr = NULL; + rcu_read_unlock(); + + ret = -EOPNOTSUPP; + if (ptr) { + ret = ptr->fn(skb, nlh); + module_put(ptr->owner); + } return ret; case SOCK_DIAG_BY_FAMILY: @@ -272,13 +288,9 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, } } -static DEFINE_MUTEX(sock_diag_mutex); - static void sock_diag_rcv(struct sk_buff *skb) { - mutex_lock(&sock_diag_mutex); netlink_rcv_skb(skb, &sock_diag_rcv_msg); - mutex_unlock(&sock_diag_mutex); } static int sock_diag_bind(struct net *net, int group) @@ -286,12 +298,12 @@ static int sock_diag_bind(struct net *net, int group) switch (group) { case SKNLGRP_INET_TCP_DESTROY: case SKNLGRP_INET_UDP_DESTROY: - if (!sock_diag_handlers[AF_INET]) + if (!rcu_access_pointer(sock_diag_handlers[AF_INET])) sock_load_diag_module(AF_INET, 0); break; case SKNLGRP_INET6_TCP_DESTROY: case SKNLGRP_INET6_UDP_DESTROY: - if (!sock_diag_handlers[AF_INET6]) + if (!rcu_access_pointer(sock_diag_handlers[AF_INET6])) sock_load_diag_module(AF_INET6, 0); break; } @@ -336,7 +348,7 @@ static struct pernet_operations diag_net_ops = { static int __init sock_diag_init(void) { - broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0); + broadcast_wq = alloc_workqueue("sock_diag_events", WQ_PERCPU, 0); BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 22fa2c5bc6ec..5947b38e4f8b 100644 --- 
a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -18,22 +18,28 @@ struct bpf_stab { struct bpf_map map; struct sock **sks; struct sk_psock_progs progs; - raw_spinlock_t lock; + spinlock_t lock; }; #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +/* This mutex is used to + * - protect race between prog/link attach/detach and link prog update, and + * - protect race between releasing and accessing map in bpf_link. + * A single global mutex lock is used since it is expected contention is low. + */ +static DEFINE_MUTEX(sockmap_mutex); + static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); + struct bpf_prog *old, struct bpf_link *link, + u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); if (attr->max_entries == 0 || attr->key_size != 4 || (attr->value_size != sizeof(u32) && @@ -46,7 +52,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); - raw_spin_lock_init(&stab->lock); + spin_lock_init(&stab->lock); stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), @@ -61,55 +67,50 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { - u32 ufd = attr->target_fd; struct bpf_map *map; - struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); - fdput(f); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); return ret; } int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { - u32 ufd = attr->target_fd; struct bpf_prog *prog; struct bpf_map *map; - struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); prog = bpf_prog_get(attr->attach_bpf_fd); - if (IS_ERR(prog)) { - ret = PTR_ERR(prog); - goto put_map; - } + if (IS_ERR(prog)) + return PTR_ERR(prog); if (prog->type != ptype) { ret = -EINVAL; goto put_prog; } - ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); -put_map: - fdput(f); return ret; } @@ -117,7 +118,6 @@ static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); - preempt_disable(); rcu_read_lock(); } @@ -125,7 +125,6 @@ static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); - preempt_enable(); release_sock(sk); } @@ -150,16 +149,17 @@ static void sock_map_del_link(struct sock *sk, list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; - struct bpf_stab *stab = container_of(map, struct bpf_stab, - map); - if (psock->saved_data_ready && stab->progs.stream_parser) + struct sk_psock_progs *progs = sock_map_progs(map); + + if (psock->saved_data_ready && 
progs->stream_parser) strp_stop = true; - if (psock->saved_data_ready && stab->progs.stream_verdict) + if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; - if (psock->saved_data_ready && stab->progs.skb_verdict) + if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); + break; } } spin_unlock_bh(&psock->link_lock); @@ -303,7 +303,10 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { - ret = sk_psock_init_strp(sk, psock); + if (sk_is_tcp(sk)) + ret = sk_psock_init_strp(sk, psock); + else + ret = -EOPNOTSUPP; if (ret) { write_unlock_bh(&sk->sk_callback_lock); sk_psock_put(sk, psock); @@ -412,12 +415,11 @@ static void *sock_map_lookup_sys(struct bpf_map *map, void *key) static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock **psk) { - struct sock *sk; + struct sock *sk = NULL; int err = 0; - raw_spin_lock_bh(&stab->lock); - sk = *psk; - if (!sk_test || sk_test == sk) + spin_lock_bh(&stab->lock); + if (!sk_test || sk_test == *psk) sk = xchg(psk, NULL); if (likely(sk)) @@ -425,7 +427,7 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, else err = -EINVAL; - raw_spin_unlock_bh(&stab->lock); + spin_unlock_bh(&stab->lock); return err; } @@ -437,7 +439,7 @@ static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, __sock_map_delete(stab, sk, link_raw); } -static int sock_map_delete_elem(struct bpf_map *map, void *key) +static long sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = *(u32 *)key; @@ -491,7 +493,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, psock = sk_psock(sk); WARN_ON_ONCE(!psock); - raw_spin_lock_bh(&stab->lock); + spin_lock_bh(&stab->lock); osk = stab->sks[idx]; if (osk && flags == BPF_NOEXIST) { ret = -EEXIST; @@ -505,10 +507,10 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, stab->sks[idx] = sk; if (osk) sock_map_unref(osk, &stab->sks[idx]); - raw_spin_unlock_bh(&stab->lock); + spin_unlock_bh(&stab->lock); return 0; out_unlock: - raw_spin_unlock_bh(&stab->lock); + spin_unlock_bh(&stab->lock); if (psock) sk_psock_put(sk, psock); out_free: @@ -540,6 +542,11 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); + if (sk_is_stream_unix(sk)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; + if (sk_is_vsock(sk) && + (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } @@ -587,8 +594,8 @@ out: return ret; } -static int sock_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long sock_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { struct sock *sk = (struct sock *)value; int ret; @@ -646,6 +653,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) + return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; @@ -672,6 +681,10 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return 
SK_DROP; + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) + return SK_DROP; + if (sk_is_vsock(sk)) + return SK_DROP; msg->flags = flags; msg->sk_redir = sk; @@ -797,6 +810,14 @@ static void sock_map_fini_seq_private(void *priv_data) bpf_map_put_with_uref(info->map); } +static u64 sock_map_mem_usage(const struct bpf_map *map) +{ + u64 usage = sizeof(struct bpf_stab); + + usage += (u64)map->max_entries * sizeof(struct sock *); + return usage; +} + static const struct bpf_iter_seq_info sock_map_iter_seq_info = { .seq_ops = &sock_map_seq_ops, .init_seq_private = sock_map_init_seq_private, @@ -816,6 +837,7 @@ const struct bpf_map_ops sock_map_ops = { .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, + .map_mem_usage = sock_map_mem_usage, .map_btf_id = &sock_map_btf_ids[0], .iter_seq_info = &sock_map_iter_seq_info, }; @@ -830,7 +852,7 @@ struct bpf_shtab_elem { struct bpf_shtab_bucket { struct hlist_head head; - raw_spinlock_t lock; + spinlock_t lock; }; struct bpf_shtab { @@ -905,7 +927,7 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, * is okay since it's going away only after RCU grace period. * However, we need to check whether it's still present. */ - raw_spin_lock_bh(&bucket->lock); + spin_lock_bh(&bucket->lock); elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, elem->key, map->key_size); if (elem_probe && elem_probe == elem) { @@ -913,10 +935,10 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } - raw_spin_unlock_bh(&bucket->lock); + spin_unlock_bh(&bucket->lock); } -static int sock_hash_delete_elem(struct bpf_map *map, void *key) +static long sock_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 hash, key_size = map->key_size; @@ -927,7 +949,7 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); - raw_spin_lock_bh(&bucket->lock); + spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem) { hlist_del_rcu(&elem->node); @@ -935,7 +957,7 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) sock_hash_free_elem(htab, elem); ret = 0; } - raw_spin_unlock_bh(&bucket->lock); + spin_unlock_bh(&bucket->lock); return ret; } @@ -995,7 +1017,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); - raw_spin_lock_bh(&bucket->lock); + spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem && flags == BPF_NOEXIST) { ret = -EEXIST; @@ -1021,10 +1043,10 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } - raw_spin_unlock_bh(&bucket->lock); + spin_unlock_bh(&bucket->lock); return 0; out_unlock: - raw_spin_unlock_bh(&bucket->lock); + spin_unlock_bh(&bucket->lock); sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); @@ -1076,8 +1098,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) struct bpf_shtab *htab; int i, err; - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); if (attr->max_entries == 0 || attr->key_size == 0 || (attr->value_size != sizeof(u32) && @@ -1112,7 +1132,7 @@ static struct 
bpf_map *sock_hash_alloc(union bpf_attr *attr) for (i = 0; i < htab->buckets_num; i++) { INIT_HLIST_HEAD(&htab->buckets[i].head); - raw_spin_lock_init(&htab->buckets[i].lock); + spin_lock_init(&htab->buckets[i].lock); } return &htab->map; @@ -1144,11 +1164,11 @@ static void sock_hash_free(struct bpf_map *map) * exists, psock exists and holds a ref to socket. That * lets us to grab a socket ref too. */ - raw_spin_lock_bh(&bucket->lock); + spin_lock_bh(&bucket->lock); hlist_for_each_entry(elem, &bucket->head, node) sock_hold(elem->sk); hlist_move_list(&bucket->head, &unlink_list); - raw_spin_unlock_bh(&bucket->lock); + spin_unlock_bh(&bucket->lock); /* Process removed entries out of atomic context to * block for socket lock before deleting the psock's @@ -1164,6 +1184,7 @@ static void sock_hash_free(struct bpf_map *map) sock_put(elem->sk); sock_hash_free_elem(htab, elem); } + cond_resched(); } /* wait for psock readers accessing its map link */ @@ -1238,6 +1259,8 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) + return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; @@ -1264,6 +1287,10 @@ BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) + return SK_DROP; + if (sk_is_vsock(sk)) + return SK_DROP; msg->flags = flags; msg->sk_redir = sk; @@ -1397,6 +1424,16 @@ static void sock_hash_fini_seq_private(void *priv_data) bpf_map_put_with_uref(info->map); } +static u64 sock_hash_mem_usage(const struct bpf_map *map) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + u64 usage = sizeof(*htab); + + usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket); + usage += atomic_read(&htab->count) * (u64)htab->elem_size; + return usage; +} + static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_ops = &sock_hash_seq_ops, .init_seq_private = sock_hash_init_seq_private, @@ -1416,6 +1453,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, + .map_mem_usage = sock_hash_mem_usage, .map_btf_id = &sock_hash_map_btf_ids[0], .iter_seq_info = &sock_hash_iter_seq_info, }; @@ -1434,80 +1472,108 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, - u32 which) +static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + struct bpf_link ***plink, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **cur_pprog; + struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - *pprog = &progs->msg_parser; + cur_pprog = &progs->msg_parser; + cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - *pprog = &progs->stream_parser; + cur_pprog = &progs->stream_parser; + cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - *pprog = &progs->stream_verdict; + cur_pprog = &progs->stream_verdict; + cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if 
(progs->stream_verdict) return -EBUSY; - *pprog = &progs->skb_verdict; + cur_pprog = &progs->skb_verdict; + cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } + *pprog = cur_pprog; + if (plink) + *plink = cur_plink; return 0; } +/* Handle the following four cases: + * prog_attach: prog != NULL, old == NULL, link == NULL + * prog_detach: prog == NULL, old != NULL, link == NULL + * link_attach: prog != NULL, old == NULL, link != NULL + * link_detach: prog == NULL, old != NULL, link != NULL + */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) + struct bpf_prog *old, struct bpf_link *link, + u32 which) { struct bpf_prog **pprog; + struct bpf_link **plink; int ret; - ret = sock_map_prog_lookup(map, &pprog, which); + ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; - if (old) - return psock_replace_prog(pprog, prog, old); + /* for prog_attach/prog_detach/link_attach, return error if a bpf_link + * exists for that prog. + */ + if ((!link || prog) && *plink) + return -EBUSY; - psock_set_prog(pprog, prog); - return 0; + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (!ret) + *plink = NULL; + } else { + psock_set_prog(pprog, prog); + if (link) + *plink = link; + } + + return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd; + u32 prog_cnt = 0, flags = 0; struct bpf_prog **pprog; struct bpf_prog *prog; struct bpf_map *map; - struct fd f; u32 id = 0; int ret; if (attr->query.query_flags) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); rcu_read_lock(); - ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; @@ -1532,7 +1598,6 @@ end: copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) ret = -EFAULT; - fdput(f); return ret; } @@ -1569,15 +1634,16 @@ void sock_map_unhash(struct sock *sk) psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; + saved_unhash = READ_ONCE(sk->sk_prot)->unhash; + } else { + saved_unhash = psock->saved_unhash; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); } - - saved_unhash = psock->saved_unhash; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - saved_unhash(sk); + if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) + return; + if (saved_unhash) + saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); @@ -1590,17 +1656,18 @@ void sock_map_destroy(struct sock *sk) psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); - if (sk->sk_prot->destroy) - sk->sk_prot->destroy(sk); - return; + saved_destroy = READ_ONCE(sk->sk_prot)->destroy; + } else { + saved_destroy = psock->saved_destroy; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + sk_psock_put(sk, psock); } - - saved_destroy = psock->saved_destroy; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - sk_psock_stop(psock); - sk_psock_put(sk, psock); - saved_destroy(sk); + if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) + return; + if (saved_destroy) + saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); @@ -1611,24 +1678,227 @@ void sock_map_close(struct sock *sk, long timeout) 
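/* This hunk resolves the socket's original ->close() exactly once, either
 * from psock->saved_close while a psock is still attached or from
 * READ_ONCE(sk->sk_prot)->close otherwise, and invokes it only after the
 * sockmap links are gone. The control flow, reduced to a minimal sketch
 * with error paths omitted:
 *
 *	close = psock ? psock->saved_close : READ_ONCE(sk->sk_prot)->close;
 *	(detach links, stop the psock, flush its delayed work)
 *	if (WARN_ON_ONCE(close == sock_map_close))
 *		return;	(would recurse: leak the socket instead of crashing)
 *	close(sk, timeout);
 */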
lock_sock(sk); rcu_read_lock(); - psock = sk_psock_get(sk); - if (unlikely(!psock)) { + psock = sk_psock(sk); + if (likely(psock)) { + saved_close = psock->saved_close; + sock_map_remove_links(sk, psock); + psock = sk_psock_get(sk); + if (unlikely(!psock)) + goto no_psock; + rcu_read_unlock(); + sk_psock_stop(psock); + release_sock(sk); + cancel_delayed_work_sync(&psock->work); + sk_psock_put(sk, psock); + } else { + saved_close = READ_ONCE(sk->sk_prot)->close; +no_psock: rcu_read_unlock(); release_sock(sk); - return sk->sk_prot->close(sk, timeout); } - saved_close = psock->saved_close; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - sk_psock_stop(psock); - release_sock(sk); - cancel_work_sync(&psock->work); - sk_psock_put(sk, psock); + /* Make sure we do not recurse. This is a bug. + * Leak the socket instead of crashing on a stack overflow. + */ + if (WARN_ON_ONCE(saved_close == sock_map_close)) + return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); +struct sockmap_link { + struct bpf_link link; + struct bpf_map *map; +}; + +static void sock_map_link_release(struct bpf_link *link) +{ + struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + + mutex_lock(&sockmap_mutex); + if (!sockmap_link->map) + goto out; + + WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, + link->attach_type)); + + bpf_map_put_with_uref(sockmap_link->map); + sockmap_link->map = NULL; +out: + mutex_unlock(&sockmap_mutex); +} + +static int sock_map_link_detach(struct bpf_link *link) +{ + sock_map_link_release(link); + return 0; +} + +static void sock_map_link_dealloc(struct bpf_link *link) +{ + kfree(link); +} + +/* Handle the following two cases: + * case 1: link != NULL, prog != NULL, old != NULL + * case 2: link != NULL, prog != NULL, old == NULL + */ +static int sock_map_link_update_prog(struct bpf_link *link, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + struct bpf_prog **pprog, *old_link_prog; + struct bpf_link **plink; + int ret = 0; + + mutex_lock(&sockmap_mutex); + + /* If old prog is not NULL, ensure old prog is the same as link->prog. */ + if (old && link->prog != old) { + ret = -EPERM; + goto out; + } + /* Ensure link->prog has the same type/attach_type as the new prog. */ + if (link->prog->type != prog->type || + link->prog->expected_attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + if (!sockmap_link->map) { + ret = -ENOLINK; + goto out; + } + + ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, + link->attach_type); + if (ret) + goto out; + + /* return error if the stored bpf_link does not match the incoming bpf_link. 
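+	 * In that case another bpf_link has since taken over this attach
+	 * point, so the update fails with -EBUSY instead of stealing the slot.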
*/ + if (link != *plink) { + ret = -EBUSY; + goto out; + } + + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (ret) + goto out; + } else { + psock_set_prog(pprog, prog); + } + + bpf_prog_inc(prog); + old_link_prog = xchg(&link->prog, prog); + bpf_prog_put(old_link_prog); + +out: + mutex_unlock(&sockmap_mutex); + return ret; +} + +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) +{ + u32 map_id = 0; + + mutex_lock(&sockmap_mutex); + if (sockmap_link->map) + map_id = sockmap_link->map->id; + mutex_unlock(&sockmap_mutex); + return map_id; +} + +static int sock_map_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + info->sockmap.map_id = map_id; + info->sockmap.attach_type = link->attach_type; + return 0; +} + +static void sock_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + seq_printf(seq, "map_id:\t%u\n", map_id); + seq_printf(seq, "attach_type:\t%u\n", link->attach_type); +} + +static const struct bpf_link_ops sock_map_link_ops = { + .release = sock_map_link_release, + .dealloc = sock_map_link_dealloc, + .detach = sock_map_link_detach, + .update_prog = sock_map_link_update_prog, + .fill_link_info = sock_map_link_fill_info, + .show_fdinfo = sock_map_link_show_fdinfo, +}; + +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct sockmap_link *sockmap_link; + enum bpf_attach_type attach_type; + struct bpf_map *map; + int ret; + + if (attr->link_create.flags) + return -EINVAL; + + map = bpf_map_get_with_uref(attr->link_create.target_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { + ret = -EINVAL; + goto out; + } + + sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); + if (!sockmap_link) { + ret = -ENOMEM; + goto out; + } + + attach_type = attr->link_create.attach_type; + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog, + attach_type); + sockmap_link->map = map; + + ret = bpf_link_prime(&sockmap_link->link, &link_primer); + if (ret) { + kfree(sockmap_link); + goto out; + } + + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); + mutex_unlock(&sockmap_mutex); + if (ret) { + bpf_link_cleanup(&link_primer); + goto out; + } + + /* Increase refcnt for the prog since when old prog is replaced with + * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. + * + * Actually, we do not need to increase refcnt for the prog since bpf_link + * will hold a reference. But in order to have less complexity w.r.t. + * replacing/setting prog, let us increase the refcnt to make things simpler. 
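+	 * After this point the bpf_link and the map's prog slot each hold
+	 * their own reference to prog.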
+ */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out: + bpf_map_put_with_uref(map); + return ret; +} + static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 5a165286e4d8..4211710393a8 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -173,10 +173,9 @@ static bool __reuseport_detach_closed_sock(struct sock *sk, static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { - unsigned int size = sizeof(struct sock_reuseport) + - sizeof(struct sock *) * max_socks; - struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC); + struct sock_reuseport *reuse; + reuse = kzalloc(struct_size(reuse, socks, max_socks), GFP_ATOMIC); if (!reuse) return NULL; diff --git a/net/core/stream.c b/net/core/stream.c index cd06750dd329..7a37e7dd2c43 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -23,9 +23,13 @@ /** * sk_stream_write_space - stream socket write_space callback. - * @sk: socket + * @sk: pointer to the socket structure * - * FIXME: write proper description + * This function is invoked when there's space available in the socket's + * send buffer for writing. It first checks if the socket is writable, + * clears the SOCK_NOSPACE flag indicating that memory for writing + * is now available, wakes up any processes waiting for write operations + * and sends asynchronous notifications if needed. */ void sk_stream_write_space(struct sock *sk) { @@ -73,13 +77,13 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, - !sk->sk_err && - !((1 << sk->sk_state) & + !READ_ONCE(sk->sk_err) && + !((1 << READ_ONCE(sk->sk_state)) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending--; } while (!done); - return 0; + return done < 0 ? done : 0; } EXPORT_SYMBOL(sk_stream_wait_connect); @@ -87,9 +91,9 @@ EXPORT_SYMBOL(sk_stream_wait_connect); * sk_stream_closing - Return 1 if we still have things to send in our buffers. * @sk: socket to verify */ -static inline int sk_stream_closing(struct sock *sk) +static int sk_stream_closing(const struct sock *sk) { - return (1 << sk->sk_state) & + return (1 << READ_ONCE(sk->sk_state)) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); } @@ -117,7 +121,7 @@ EXPORT_SYMBOL(sk_stream_wait_close); */ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) { - int err = 0; + int ret, err = 0; long vm_wait = 0; long current_timeo = *timeo_p; DEFINE_WAIT_FUNC(wait, woken_wake_function); @@ -142,11 +146,13 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, &current_timeo, sk->sk_err || - (sk->sk_shutdown & SEND_SHUTDOWN) || - (sk_stream_memory_free(sk) && - !vm_wait), &wait); + ret = sk_wait_event(sk, &current_timeo, READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || + (sk_stream_memory_free(sk) && !vm_wait), + &wait); sk->sk_write_pending--; + if (ret < 0) + goto do_error; if (vm_wait) { vm_wait -= current_timeo; @@ -209,7 +215,6 @@ void sk_stream_kill_queues(struct sock *sk) sk_mem_reclaim_final(sk); WARN_ON_ONCE(sk->sk_wmem_queued); - WARN_ON_ONCE(sk->sk_forward_alloc); /* It is _impossible_ for the backlog to contain anything * when we get here. 
All user references to this socket diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 5b1ce656baa1..8d4decb2606f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -16,19 +16,26 @@ #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/sched/isolation.h> #include <net/ip.h> #include <net/sock.h> #include <net/net_ratelimit.h> #include <net/busy_poll.h> #include <net/pkt_sched.h> +#include <net/hotdata.h> +#include <net/proto_memory.h> +#include <net/rps.h> #include "dev.h" +#include "net-sysfs.h" static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; +static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; +static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -45,8 +52,89 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); +#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS) +static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, + struct cpumask *mask) +{ + char *kbuf; + int len; + + if (*ppos || !*lenp) { + *lenp = 0; + return 0; + } + + /* CPUs are displayed as a hex bitmap + a comma between each groups of 8 + * nibbles (except the last one which has a newline instead). + * Guesstimate the buffer size at the group granularity level. + */ + len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp); + kbuf = kmalloc(len, GFP_KERNEL); + if (!kbuf) { + *lenp = 0; + return -ENOMEM; + } + + len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); + if (!len) { + *lenp = 0; + goto free_buf; + } + + /* scnprintf writes a trailing null char not counted in the returned + * length, override it with a newline. + */ + kbuf[len++] = '\n'; + memcpy(buffer, kbuf, len); + *lenp = len; + *ppos += len; + +free_buf: + kfree(kbuf); + return 0; +} +#endif + #ifdef CONFIG_RPS -static int rps_sock_flow_sysctl(struct ctl_table *table, int write, + +DEFINE_MUTEX(rps_default_mask_mutex); + +static int rps_default_mask_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = (struct net *)table->data; + struct cpumask *mask; + int err = 0; + + mutex_lock(&rps_default_mask_mutex); + mask = net->core.rps_default_mask; + if (write) { + if (!mask) { + mask = kzalloc(cpumask_size(), GFP_KERNEL); + net->core.rps_default_mask = mask; + } + err = -ENOMEM; + if (!mask) + goto done; + + err = cpumask_parse(buffer, mask); + if (err) + goto done; + + err = rps_cpumask_housekeeping(mask); + if (err) + goto done; + } else { + err = dump_cpumask(buffer, lenp, ppos, + mask ?: cpu_none_mask); + } + +done: + mutex_unlock(&rps_default_mask_mutex); + return err; +} + +static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; @@ -61,7 +149,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_lock(&sock_flow_mutex); - orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + orig_sock_table = rcu_dereference_protected( + net_hotdata.rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? 
orig_sock_table->mask + 1 : 0; @@ -82,7 +171,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_unlock(&sock_flow_mutex); return -ENOMEM; } - rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; + net_hotdata.rps_cpu_mask = + roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; @@ -93,7 +183,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, sock_table = NULL; if (sock_table != orig_sock_table) { - rcu_assign_pointer(rps_sock_flow_table, sock_table); + rcu_assign_pointer(net_hotdata.rps_sock_flow_table, + sock_table); if (sock_table) { static_branch_inc(&rps_needed); static_branch_inc(&rfs_needed); @@ -101,7 +192,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - kvfree_rcu(orig_sock_table); + kvfree_rcu(orig_sock_table, rcu); } } } @@ -115,7 +206,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_FLOW_LIMIT static DEFINE_MUTEX(flow_limit_update_mutex); -static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, +static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; @@ -139,7 +230,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - kfree_rcu(cur); + kfree_rcu(cur, rcu); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); @@ -148,20 +239,13 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, ret = -ENOMEM; goto write_unlock; } - cur->num_buckets = netdev_flow_limit_table_len; + cur->log_buckets = ilog2(netdev_flow_limit_table_len); rcu_assign_pointer(sd->flow_limit, cur); } } write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { - char kbuf[128]; - - if (*ppos || !*lenp) { - *lenp = 0; - goto done; - } - cpumask_clear(mask); rcu_read_lock(); for_each_possible_cpu(i) { @@ -171,17 +255,7 @@ write_unlock: } rcu_read_unlock(); - len = min(sizeof(kbuf) - 1, *lenp); - len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); - if (!len) { - *lenp = 0; - goto done; - } - if (len < *lenp) - kbuf[len++] = '\n'; - memcpy(buffer, kbuf, len); - *lenp = len; - *ppos += len; + ret = dump_cpumask(buffer, lenp, ppos, mask); } done: @@ -189,7 +263,7 @@ done: return ret; } -static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, +static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; @@ -211,7 +285,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_SCHED -static int set_default_qdisc(struct ctl_table *table, int write, +static int set_default_qdisc(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; @@ -230,25 +304,25 @@ static int set_default_qdisc(struct ctl_table *table, int write, } #endif -static int proc_do_dev_weight(struct ctl_table *table, int write, +static int proc_do_dev_weight(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { static DEFINE_MUTEX(dev_weight_mutex); int ret, weight; mutex_lock(&dev_weight_mutex); - ret = proc_dointvec(table, write, 
buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); - WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); - WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); + WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); return ret; } -static int proc_do_rss_key(struct ctl_table *table, int write, +static int proc_do_rss_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; @@ -261,7 +335,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write, } #ifdef CONFIG_BPF_JIT -static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, +static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -294,7 +368,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, # ifdef CONFIG_HAVE_EBPF_JIT static int -proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, +proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) @@ -305,7 +379,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, # endif /* CONFIG_HAVE_EBPF_JIT */ static int -proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, +proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) @@ -317,36 +391,12 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, static struct ctl_table net_core_table[] = { { - .procname = "wmem_max", - .data = &sysctl_wmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_max", - .data = &sysctl_rmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, - }, - { - .procname = "wmem_default", - .data = &sysctl_wmem_default, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_default", - .data = &sysctl_rmem_default, + .procname = "mem_pcpu_rsv", + .data = &net_hotdata.sysctl_mem_pcpu_rsv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, + .extra1 = &min_mem_pcpu_rsv, }, { .procname = "dev_weight", @@ -354,6 +404,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_rx_bias", @@ -361,6 +412,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_tx_bias", @@ -368,10 +420,11 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "netdev_max_backlog", - .data = &netdev_max_backlog, + .data = &net_hotdata.max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -430,7 +483,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_tstamp_prequeue", - .data = &netdev_tstamp_prequeue, + .data = 
&net_hotdata.tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -449,22 +502,6 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "optmem_max", - .data = &sysctl_optmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tstamp_allow_data", - .data = &sysctl_tstamp_allow_data, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE - }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", @@ -515,7 +552,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_budget", - .data = &netdev_budget, + .data = &net_hotdata.netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -529,7 +566,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "max_skb_frags", - .data = &sysctl_max_skb_frags, + .data = &net_hotdata.sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -538,11 +575,11 @@ static struct ctl_table net_core_table[] = { }, { .procname = "netdev_budget_usecs", - .data = &netdev_budget_usecs, + .data = &net_hotdata.netdev_budget_usecs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = &netdev_budget_usecs_min, }, { .procname = "fb_tunnels_only_for_init_net", @@ -571,7 +608,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "gro_normal_batch", - .data = &gro_normal_batch, + .data = &net_hotdata.gro_normal_batch, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -588,16 +625,23 @@ static struct ctl_table net_core_table[] = { }, { .procname = "skb_defer_max", - .data = &sysctl_skb_defer_max, + .data = &net_hotdata.sysctl_skb_defer_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { } }; static struct ctl_table netns_core_table[] = { +#if IS_ENABLED(CONFIG_RPS) + { + .procname = "rps_default_mask", + .data = &init_net, + .mode = 0644, + .proc_handler = rps_default_mask_sysctl + }, +#endif { .procname = "somaxconn", .data = &init_net.core.sysctl_somaxconn, @@ -607,6 +651,14 @@ static struct ctl_table netns_core_table[] = { .proc_handler = proc_dointvec_minmax }, { + .procname = "optmem_max", + .data = &init_net.core.sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .proc_handler = proc_dointvec_minmax + }, + { .procname = "txrehash", .data = &init_net.core.sysctl_txrehash, .maxlen = sizeof(u8), @@ -615,7 +667,66 @@ static struct ctl_table netns_core_table[] = { .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, - { } + { + .procname = "txq_reselection_ms", + .data = &init_net.core.sysctl_txq_reselection, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "tstamp_allow_data", + .data = &init_net.core.sysctl_tstamp_allow_data, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + { + .procname = "bypass_prot_mem", + .data = &init_net.core.sysctl_bypass_prot_mem, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + /* sysctl_core_net_init() will set the values after this + * to readonly in network namespaces + */ + { + 
.procname = "wmem_max", + .data = &sysctl_wmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_max", + .data = &sysctl_rmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, + { + .procname = "wmem_default", + .data = &sysctl_wmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_default", + .data = &sysctl_rmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) @@ -633,24 +744,27 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl, *tmp; + size_t table_size = ARRAY_SIZE(netns_core_table); + struct ctl_table *tbl; tbl = netns_core_table; if (!net_eq(net, &init_net)) { + int i; tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; - for (tmp = tbl; tmp->procname; tmp++) - tmp->data += (char *)net - (char *)&init_net; + for (i = 0; i < table_size; ++i) { + if (tbl[i].data == &sysctl_wmem_max) + break; - /* Don't export any sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - tbl[0].procname = NULL; + tbl[i].data += (char *)net - (char *)&init_net; } + for (; i < table_size; ++i) + tbl[i].mode &= ~0222; } - net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); + net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size); if (net->core.sysctl_hdr == NULL) goto err_reg; @@ -665,11 +779,14 @@ err_dup: static __net_exit void sysctl_core_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); BUG_ON(tbl == netns_core_table); +#if IS_ENABLED(CONFIG_RPS) + kfree(net->core.rps_default_mask); +#endif kfree(tbl); } diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 04840697fe79..a50a7ef49ae8 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -9,6 +9,7 @@ #include <linux/ptp_classify.h> #include <linux/skbuff.h> #include <linux/export.h> +#include <linux/ptp_clock_kernel.h> static unsigned int classify(const struct sk_buff *skb) { @@ -21,18 +22,39 @@ static unsigned int classify(const struct sk_buff *skb) void skb_clone_tx_timestamp(struct sk_buff *skb) { + struct hwtstamp_provider *hwprov; struct mii_timestamper *mii_ts; + struct phy_device *phydev; struct sk_buff *clone; unsigned int type; - if (!skb->sk) + if (!skb->sk || !skb->dev) return; + rcu_read_lock(); + hwprov = rcu_dereference(skb->dev->hwprov); + if (hwprov) { + if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB || + !hwprov->phydev) { + rcu_read_unlock(); + return; + } + + phydev = hwprov->phydev; + } else { + phydev = skb->dev->phydev; + if (!phy_is_default_hwtstamp(phydev)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + type = classify(skb); if (type == PTP_CLASS_NONE) return; - mii_ts = skb->dev->phydev->mii_ts; + mii_ts = phydev->mii_ts; if (likely(mii_ts->txtstamp)) { clone = skb_clone_sk(skb); if (!clone) @@ -44,12 +66,33 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); bool skb_defer_rx_timestamp(struct sk_buff *skb) { + struct hwtstamp_provider *hwprov; struct mii_timestamper *mii_ts; + 
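+	/* Resolve which PHY provides timestamping: prefer an explicitly
+	 * configured hwtstamp provider (dev->hwprov, RCU-protected) and
+	 * fall back to the attached dev->phydev when it is the default
+	 * hardware-timestamping source. */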
struct phy_device *phydev; unsigned int type; - if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts) + if (!skb->dev) return false; + rcu_read_lock(); + hwprov = rcu_dereference(skb->dev->hwprov); + if (hwprov) { + if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB || + !hwprov->phydev) { + rcu_read_unlock(); + return false; + } + + phydev = hwprov->phydev; + } else { + phydev = skb->dev->phydev; + if (!phy_is_default_hwtstamp(phydev)) { + rcu_read_unlock(); + return false; + } + } + rcu_read_unlock(); + if (skb_headroom(skb) < ETH_HLEN) return false; @@ -62,7 +105,7 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) if (type == PTP_CLASS_NONE) return false; - mii_ts = skb->dev->phydev->mii_ts; + mii_ts = phydev->mii_ts; if (likely(mii_ts->rxtstamp)) return mii_ts->rxtstamp(mii_ts, skb, type); diff --git a/net/core/tso.c b/net/core/tso.c index e00796e3b146..6df997b9076e 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -3,7 +3,7 @@ #include <linux/if_vlan.h> #include <net/ip.h> #include <net/tso.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last) diff --git a/net/core/utils.c b/net/core/utils.c index c994e95172ac..5e63b0ea21f3 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Generic address resultion entity + * Generic address resolution entity * * Authors: * net_random Alan Cox @@ -399,9 +399,9 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, } EXPORT_SYMBOL(inet_pton_with_scope); -bool inet_addr_is_any(struct sockaddr *addr) +bool inet_addr_is_any(struct sockaddr_storage *addr) { - if (addr->sa_family == AF_INET6) { + if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr; const struct sockaddr_in6 in6_any = { .sin6_addr = IN6ADDR_ANY_INIT }; @@ -409,13 +409,13 @@ bool inet_addr_is_any(struct sockaddr *addr) if (!memcmp(in6->sin6_addr.s6_addr, in6_any.sin6_addr.s6_addr, 16)) return true; - } else if (addr->sa_family == AF_INET) { + } else if (addr->ss_family == AF_INET) { struct sockaddr_in *in = (struct sockaddr_in *)addr; if (in->sin_addr.s_addr == htonl(INADDR_ANY)) return true; } else { - pr_warn("unexpected address family %u\n", addr->sa_family); + pr_warn("unexpected address family %u\n", addr->ss_family); } return false; @@ -473,11 +473,11 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, EXPORT_SYMBOL(inet_proto_csum_replace16); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, - __wsum diff, bool pseudohdr) + __wsum diff, bool pseudohdr, bool ipv6) { if (skb->ip_summed != CHECKSUM_PARTIAL) { csum_replace_by_diff(sum, diff); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr && !ipv6) skb->csum = ~csum_sub(diff, skb->csum); } else if (pseudohdr) { *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); diff --git a/net/core/xdp.c b/net/core/xdp.c index 844c9d99dc0e..9100e160113a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -4,6 +4,8 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. 
*/ #include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> @@ -12,8 +14,10 @@ #include <linux/idr.h> #include <linux/rhashtable.h> #include <linux/bug.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> +#include <net/hotdata.h> +#include <net/netdev_lock.h> #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ #include <trace/events/xdp.h> @@ -73,7 +77,7 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) xa = container_of(rcu, struct xdp_mem_allocator, rcu); /* Allow this ID to be reused */ - ida_simple_remove(&mem_id_pool, xa->mem.id); + ida_free(&mem_id_pool, xa->mem.id); kfree(xa); } @@ -124,10 +128,8 @@ void xdp_unreg_mem_model(struct xdp_mem_info *mem) return; if (type == MEM_TYPE_PAGE_POOL) { - rcu_read_lock(); - xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); page_pool_destroy(xa->page_pool); - rcu_read_unlock(); } } EXPORT_SYMBOL_GPL(xdp_unreg_mem_model); @@ -185,7 +187,6 @@ int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; - xdp_rxq->napi_id = napi_id; xdp_rxq->frag_size = frag_size; xdp_rxq->reg_state = REG_STATE_REGISTERED; @@ -240,7 +241,7 @@ static int __mem_id_cyclic_get(gfp_t gfp) int id; again: - id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp); + id = ida_alloc_range(&mem_id_pool, mem_id_next, MEM_ID_MAX - 1, gfp); if (id < 0) { if (id == -ENOSPC) { /* Cyclic allocator, reset next id */ @@ -292,10 +293,8 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, mutex_lock(&mem_id_lock); ret = __mem_id_init_hash_table(); mutex_unlock(&mem_id_lock); - if (ret < 0) { - WARN_ON(1); + if (ret < 0) return ERR_PTR(ret); - } } xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); @@ -315,7 +314,7 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, /* Insert allocator into ID lookup table */ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); if (IS_ERR(ptr)) { - ida_simple_remove(&mem_id_pool, mem->id); + ida_free(&mem_id_pool, mem->id); mem->id = 0; errno = PTR_ERR(ptr); goto err; @@ -359,6 +358,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); + if (type == MEM_TYPE_XSK_BUFF_POOL && allocator) + xsk_pool_set_rxq_info(allocator, xdp_rxq); + if (trace_mem_connect_enabled() && xdp_alloc) trace_mem_connect(xdp_alloc, xdp_rxq); return 0; @@ -366,33 +368,87 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); +/** + * xdp_reg_page_pool - register &page_pool as a memory provider for XDP + * @pool: &page_pool to register + * + * Can be used to register pools manually without connecting to any XDP RxQ + * info, so that the XDP layer will be aware of them. Then, they can be + * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool(). + * + * Return: %0 on success, -errno on error. + */ +int xdp_reg_page_pool(struct page_pool *pool) +{ + struct xdp_mem_info mem; + + return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool); +} +EXPORT_SYMBOL_GPL(xdp_reg_page_pool); + +/** + * xdp_unreg_page_pool - unregister &page_pool from the memory providers list + * @pool: &page_pool to unregister + * + * A shorthand for manual unregistering page pools. 
If the pool was previously + * attached to an RxQ info, it must be detached first. + */ +void xdp_unreg_page_pool(const struct page_pool *pool) +{ + struct xdp_mem_info mem = { + .type = MEM_TYPE_PAGE_POOL, + .id = pool->xdp_mem_id, + }; + + xdp_unreg_mem_model(&mem); +} +EXPORT_SYMBOL_GPL(xdp_unreg_page_pool); + +/** + * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info + * @xdp_rxq: XDP RxQ info to attach the pool to + * @pool: pool to attach + * + * If the pool was registered manually, this function must be called instead + * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info. + */ +void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, + const struct page_pool *pool) +{ + struct xdp_mem_info mem = { + .type = MEM_TYPE_PAGE_POOL, + .id = pool->xdp_mem_id, + }; + + xdp_rxq_info_attach_mem_model(xdp_rxq, &mem); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool); + /* XDP RX runs under NAPI protection, and in different delivery error * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ -void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - struct xdp_buff *xdp) +void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, + bool napi_direct, struct xdp_buff *xdp) { - struct page *page; - - switch (mem->type) { + switch (mem_type) { case MEM_TYPE_PAGE_POOL: - page = virt_to_head_page(data); + netmem = netmem_compound_head(netmem); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) - * as mem->type knows this a page_pool page + /* No need to check netmem_is_pp() as mem->type knows this a + * page_pool page */ - page_pool_put_full_page(page->pp, page, napi_direct); + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, + napi_direct); break; case MEM_TYPE_PAGE_SHARED: - page_frag_free(data); + page_frag_free(__netmem_address(netmem)); break; case MEM_TYPE_PAGE_ORDER0: - page = virt_to_page(data); /* Assumes order0 page*/ - put_page(page); + put_page(__netmem_to_page(netmem)); break; case MEM_TYPE_XSK_BUFF_POOL: /* NB! Only valid from an xdp_buff! 
*/ @@ -400,7 +456,7 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ - WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); + WARN(1, "Incorrect XDP memory type (%d) usage", mem_type); break; } } @@ -408,38 +464,34 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; - int i; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, + false, NULL); - __xdp_return(page_address(page), &xdpf->mem, false, NULL); - } out: - __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; - int i; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, + true, NULL); - __xdp_return(page_address(page), &xdpf->mem, true, NULL); - } out: - __xdp_return(xdpf->data, &xdpf->mem, true, NULL); + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); @@ -453,46 +505,19 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); * xdp_frame_bulk is usually stored/allocated on the function * call-stack to avoid locking penalties. 
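 *
 * A typical Tx completion path is expected to look roughly like this
 * (sketch only; the ring-walking helper is illustrative, not a real API):
 *
 *	struct xdp_frame_bulk bq;
 *
 *	xdp_frame_bulk_init(&bq);
 *
 *	rcu_read_lock();
 *	for_each_completed_tx_frame(ring, xdpf)
 *		xdp_return_frame_bulk(xdpf, &bq);
 *	xdp_flush_frame_bulk(&bq);
 *	rcu_read_unlock();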
*/ -void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) -{ - struct xdp_mem_allocator *xa = bq->xa; - - if (unlikely(!xa || !bq->count)) - return; - - page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count); - /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ - bq->count = 0; -} -EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk); /* Must be called with rcu_read_lock held */ void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq) { - struct xdp_mem_info *mem = &xdpf->mem; - struct xdp_mem_allocator *xa; - - if (mem->type != MEM_TYPE_PAGE_POOL) { + if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) { xdp_return_frame(xdpf); return; } - xa = bq->xa; - if (unlikely(!xa)) { - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - bq->count = 0; - bq->xa = xa; - } - if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); - if (unlikely(mem->id != xa->mem.id)) { - xdp_flush_frame_bulk(bq); - bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - } - if (unlikely(xdp_frame_has_frags(xdpf))) { struct skb_shared_info *sinfo; int i; @@ -501,49 +526,43 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf, for (i = 0; i < sinfo->nr_frags; i++) { skb_frag_t *frag = &sinfo->frags[i]; - bq->q[bq->count++] = skb_frag_address(frag); + bq->q[bq->count++] = skb_frag_netmem(frag); if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); } } - bq->q[bq->count++] = xdpf->data; + bq->q[bq->count++] = virt_to_netmem(xdpf->data); } EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); +/** + * xdp_return_frag -- free one XDP frag or decrement its refcount + * @netmem: network memory reference to release + * @xdp: &xdp_buff to release the frag for + */ +void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp) +{ + __xdp_return(netmem, xdp->rxq->mem.type, true, NULL); +} +EXPORT_SYMBOL_GPL(xdp_return_frag); + void xdp_return_buff(struct xdp_buff *xdp) { struct skb_shared_info *sinfo; - int i; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), + xdp->rxq->mem.type, true, xdp); - __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp); - } out: - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); + __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); -/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ -void __xdp_release_frame(void *data, struct xdp_mem_info *mem) -{ - struct xdp_mem_allocator *xa; - struct page *page; - - rcu_read_lock(); - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - page = virt_to_head_page(data); - if (xa) - page_pool_release_page(xa->page_pool, page); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(__xdp_release_frame); - void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { @@ -586,7 +605,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->headroom = 0; xdpf->metasize = metasize; xdpf->frame_sz = PAGE_SIZE; - xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + xdpf->mem_type = MEM_TYPE_PAGE_ORDER0; xsk_buff_free(xdp); return xdpf; @@ -600,16 +619,177 @@ void xdp_warn(const char *msg, const char *func, const int line) }; EXPORT_SYMBOL_GPL(xdp_warn); -int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) +/** + * xdp_build_skb_from_buff - create an skb from &xdp_buff + * @xdp: 
&xdp_buff to convert to an skb + * + * Perform common operations to create a new skb to pass up the stack from + * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize + * skb data pointers and offsets, set the recycle bit if the buff is + * PP-backed, record the Rx queue index, set the protocol and update the + * frags info. + * + * Return: new &sk_buff on success, %NULL on error. + */ +struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) { - n_skb = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, - n_skb, skbs); - if (unlikely(!n_skb)) - return -ENOMEM; + const struct xdp_rxq_info *rxq = xdp->rxq; + const struct skb_shared_info *sinfo; + struct sk_buff *skb; + u32 nr_frags = 0; + int metalen; - return 0; + if (unlikely(xdp_buff_has_frags(xdp))) { + sinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = sinfo->nr_frags; + } + + skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, xdp->data - xdp->data_hard_start); + __skb_put(skb, xdp->data_end - xdp->data); + + metalen = xdp->data - xdp->data_meta; + if (metalen > 0) + skb_metadata_set(skb, metalen); + + if (rxq->mem.type == MEM_TYPE_PAGE_POOL) + skb_mark_for_recycle(skb); + + skb_record_rx_queue(skb, rxq->queue_index); + + if (unlikely(nr_frags)) { + u32 tsize; + + tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz; + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + tsize, xdp_buff_get_skb_flags(xdp)); + } + + skb->protocol = eth_type_trans(skb, rxq->dev); + + return skb; +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff); + +/** + * xdp_copy_frags_from_zc - copy frags from XSk buff to skb + * @skb: skb to copy frags to + * @xdp: XSk &xdp_buff from which the frags will be copied + * @pp: &page_pool backing page allocation, if available + * + * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack. + * Allocate a new buffer for each frag, copy it and attach it to the skb. + * + * Return: true on success, false on netmem allocation failure. + */ +static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, + const struct xdp_buff *xdp, + struct page_pool *pp) +{ + struct skb_shared_info *sinfo = skb_shinfo(skb); + const struct skb_shared_info *xinfo; + u32 nr_frags, tsize = 0; + u32 flags = 0; + + xinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = xinfo->nr_frags; + + for (u32 i = 0; i < nr_frags; i++) { + const skb_frag_t *frag = &xinfo->frags[i]; + u32 len = skb_frag_size(frag); + u32 offset, truesize = len; + struct page *page; + + page = page_pool_dev_alloc(pp, &offset, &truesize); + if (unlikely(!page)) { + sinfo->nr_frags = i; + return false; + } + + memcpy(page_address(page) + offset, skb_frag_address(frag), + LARGEST_ALIGN(len)); + __skb_fill_page_desc_noacc(sinfo, i, page, offset, len); + + tsize += truesize; + if (page_is_pfmemalloc(page)) + flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; + } + + xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize, + flags); + + return true; } -EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); + +/** + * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff + * @xdp: source XSk buff + * + * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb + * head and a new buffer for the head, copy the data and initialize the skb + * fields. If there are frags, allocate new buffers for them and copy. + * Buffers are allocated from the system percpu page pools, so they can be + * recycled. + * If the new skb was built successfully, @xdp is returned to the XSk pool's + * freelist.
+ * On error, it remains untouched and the caller must take care of this. + * + * Return: new &sk_buff on success, %NULL on error. + */ +struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) +{ + const struct xdp_rxq_info *rxq = xdp->rxq; + u32 len = xdp->data_end - xdp->data_meta; + u32 truesize = xdp->frame_sz; + struct sk_buff *skb = NULL; + struct page_pool *pp; + int metalen; + void *data; + + if (!IS_ENABLED(CONFIG_PAGE_POOL)) + return NULL; + + local_lock_nested_bh(&system_page_pool.bh_lock); + pp = this_cpu_read(system_page_pool.pool); + data = page_pool_dev_alloc_va(pp, &truesize); + if (unlikely(!data)) + goto out; + + skb = napi_build_skb(data, truesize); + if (unlikely(!skb)) { + page_pool_free_va(pp, data, true); + goto out; + } + + skb_mark_for_recycle(skb); + skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); + + memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len)); + + metalen = xdp->data - xdp->data_meta; + if (metalen > 0) { + skb_metadata_set(skb, metalen); + __skb_pull(skb, metalen); + } + + skb_record_rx_queue(skb, rxq->queue_index); + + if (unlikely(xdp_buff_has_frags(xdp)) && + unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) { + napi_consume_skb(skb, true); + skb = NULL; + goto out; + } + + xsk_buff_free(xdp); + + skb->protocol = eth_type_trans(skb, rxq->dev); + +out: + local_unlock_nested_bh(&system_page_pool.bh_lock); + return skb; +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, @@ -643,10 +823,9 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, skb_metadata_set(skb, xdpf->metasize); if (unlikely(xdp_frame_has_frags(xdpf))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdpf->frame_sz, - xdp_frame_is_frag_pfmemalloc(xdpf)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdpf->frame_sz, + xdp_frame_get_skb_flags(xdpf)); /* Essential SKB info: protocol and skb->dev */ skb->protocol = eth_type_trans(skb, dev); @@ -657,8 +836,8 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, * - RX ring dev queue index (skb_record_rx_queue) */ - /* Until page_pool get SKB return path, release DMA here */ - xdp_release_frame(xdpf); + if (xdpf->mem_type == MEM_TYPE_PAGE_POOL) + skb_mark_for_recycle(skb); /* Allow SKB to reuse area used by xdp_frame */ xdp_scrub_frame(xdpf); @@ -672,7 +851,7 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, { struct sk_buff *skb; - skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); if (unlikely(!skb)) return NULL; @@ -704,8 +883,173 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) nxdpf = addr; nxdpf->data = addr + headroom; nxdpf->frame_sz = PAGE_SIZE; - nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0; - nxdpf->mem.id = 0; + nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0; return nxdpf; } + +__bpf_kfunc_start_defs(); + +/** + * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp. + * @ctx: XDP context pointer. + * @timestamp: Return value pointer. + * + * Return: + * * Returns 0 on success or ``-errno`` on error. + * * ``-EOPNOTSUPP`` : means device driver does not implement kfunc + * * ``-ENODATA`` : means no RX-timestamp available for this frame + */ +__bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) +{ + return -EOPNOTSUPP; +} + +/** + * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash. + * @ctx: XDP context pointer. 
+ * @hash: Return value pointer. + * @rss_type: Return value pointer for RSS type. + * + * The RSS hash type (@rss_type) specifies what portion of packet headers NIC + * hardware used when calculating RSS hash value. The RSS type can be decoded + * via &enum xdp_rss_hash_type either matching on individual L3/L4 bits + * ``XDP_RSS_L*`` or by combined traditional *RSS Hashing Types* + * ``XDP_RSS_TYPE_L*``. + * + * Return: + * * Returns 0 on success or ``-errno`` on error. + * * ``-EOPNOTSUPP`` : means device driver doesn't implement kfunc + * * ``-ENODATA`` : means no RX-hash available for this frame + */ +__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) +{ + return -EOPNOTSUPP; +} + +/** + * bpf_xdp_metadata_rx_vlan_tag - Get XDP packet outermost VLAN tag + * @ctx: XDP context pointer. + * @vlan_proto: Destination pointer for VLAN Tag protocol identifier (TPID). + * @vlan_tci: Destination pointer for VLAN TCI (VID + DEI + PCP) + * + * In case of success, ``vlan_proto`` contains *Tag protocol identifier (TPID)*, + * usually ``ETH_P_8021Q`` or ``ETH_P_8021AD``, but some networks can use + * custom TPIDs. ``vlan_proto`` is stored in **network byte order (BE)** + * and should be used as follows: + * ``if (vlan_proto == bpf_htons(ETH_P_8021Q)) do_something();`` + * + * ``vlan_tci`` contains the remaining 16 bits of a VLAN tag. + * Driver is expected to provide those in **host byte order (usually LE)**, + * so the bpf program should not perform byte conversion. + * According to 802.1Q standard, *VLAN TCI (Tag control information)* + * is a bit field that contains: + * *VLAN identifier (VID)* that can be read with ``vlan_tci & 0xfff``, + * *Drop eligible indicator (DEI)* - 1 bit, + * *Priority code point (PCP)* - 3 bits. + * For detailed meaning of DEI and PCP, please refer to other sources. + * + * Return: + * * Returns 0 on success or ``-errno`` on error. 
+ * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc + * * ``-ENODATA`` : VLAN tag was not stripped or is not available + */ +__bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, + __be16 *vlan_proto, u16 *vlan_tci) +{ + return -EOPNOTSUPP; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(xdp_metadata_kfunc_ids) +#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC +BTF_KFUNCS_END(xdp_metadata_kfunc_ids) + +static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { + .owner = THIS_MODULE, + .set = &xdp_metadata_kfunc_ids, +}; + +BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted) +#define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str) +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + +u32 bpf_xdp_metadata_kfunc_id(int id) +{ + /* xdp_metadata_kfunc_ids is sorted and can't be used */ + return xdp_metadata_kfunc_ids_unsorted[id]; +} + +bool bpf_dev_bound_kfunc_id(u32 btf_id) +{ + return btf_id_set8_contains(&xdp_metadata_kfunc_ids, btf_id); +} + +static int __init xdp_metadata_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set); +} +late_initcall(xdp_metadata_init); + +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val) +{ + val &= NETDEV_XDP_ACT_MASK; + if (dev->xdp_features == val) + return; + + netdev_assert_locked_or_invisible(dev); + dev->xdp_features = val; + + if (dev->reg_state == NETREG_REGISTERED) + call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); +} +EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked); + +void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +{ + netdev_lock(dev); + xdp_set_features_flag_locked(dev, val); + netdev_unlock(dev); +} +EXPORT_SYMBOL_GPL(xdp_set_features_flag); + +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg) +{ + xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT); + + if (support_sg) + val |= NETDEV_XDP_ACT_NDO_XMIT_SG; + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked); + +void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +{ + netdev_lock(dev); + xdp_features_set_redirect_target_locked(dev, support_sg); + netdev_unlock(dev); +} +EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target); + +void xdp_features_clear_redirect_target_locked(struct net_device *dev) +{ + xdp_features_t val = dev->xdp_features; + + val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG); + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked); + +void xdp_features_clear_redirect_target(struct net_device *dev) +{ + netdev_lock(dev); + xdp_features_clear_redirect_target_locked(dev); + netdev_unlock(dev); +} +EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target);
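For the consumer side of the metadata kfuncs above, a sketch of a BPF program that probes them; it assumes a CO-RE build (vmlinux.h for enum xdp_rss_hash_type) and a driver that actually implements the kfuncs, and the program itself is illustrative:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx,
					 __u64 *timestamp) __ksym;
extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, __u32 *hash,
				    enum xdp_rss_hash_type *rss_type) __ksym;

SEC("xdp")
int rx_meta(struct xdp_md *ctx)
{
	enum xdp_rss_hash_type rss_type;
	__u64 ts;
	__u32 hash;

	/* Each kfunc returns 0 on success, -EOPNOTSUPP if the driver does
	 * not implement it, -ENODATA if the value is unavailable for this
	 * frame.
	 */
	if (!bpf_xdp_metadata_rx_timestamp(ctx, &ts))
		bpf_printk("hw rx ts: %llu", ts);

	if (!bpf_xdp_metadata_rx_hash(ctx, &hash, &rss_type) &&
	    (rss_type & XDP_RSS_L4))
		bpf_printk("l4 rss hash: 0x%x", hash);

	return XDP_PASS;
}

/* kfuncs require a GPL-compatible program license */
char _license[] SEC("license") = "GPL";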

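Finally, the driver-side pattern that the feature-flag helpers at the end enable, again as an illustrative sketch (the drv_* names are assumptions; the helpers themselves are the ones added by this patch):

/* Hypothetical probe-time setup: advertise base XDP features, then mark
 * the device as an XDP_REDIRECT target once Tx resources exist. The
 * unlocked helpers take the netdev lock themselves and notify via
 * NETDEV_XDP_FEAT_CHANGE when the device is already registered.
 */
static void drv_init_xdp_features(struct net_device *dev)
{
	xdp_set_features_flag(dev, NETDEV_XDP_ACT_BASIC |
				   NETDEV_XDP_ACT_REDIRECT |
				   NETDEV_XDP_ACT_RX_SG);

	/* Tx supports S/G frames, so NDO_XMIT_SG is advertised too */
	xdp_features_set_redirect_target(dev, true);
}

/* ...and on Tx teardown: */
static void drv_disable_xdp_xmit(struct net_device *dev)
{
	xdp_features_clear_redirect_target(dev);
}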