diff options
Diffstat (limited to 'net/core')
79 files changed, 66303 insertions, 14934 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index b33b996f5dd6..9ef2099c5426 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -1,24 +1,50 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the Linux networking core. # -obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ - gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o +obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ + gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \ + flow_dissector.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ +obj-y += dev.o dev_api.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - sock_diag.o dev_ioctl.o + sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ + fib_notifier.o xdp.o flow_offload.o gro.o \ + netdev-genl.o netdev-genl-gen.o gso.o + +obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o -obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o +obj-y += hotdata.o +obj-y += netdev_rx_queue.o +obj-y += netdev_queues.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o -obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o +obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o -obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o +obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o +obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o +obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o +obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o +obj-$(CONFIG_DST_CACHE) += dst_cache.o +obj-$(CONFIG_HWBM) += hwbm.o +obj-$(CONFIG_GRO_CELLS) += gro_cells.o +obj-$(CONFIG_FAILOVER) += failover.o +obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o +obj-$(CONFIG_BPF_SYSCALL) += sock_map.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o +obj-$(CONFIG_OF) += of_net.o +obj-$(CONFIG_NET_TEST) += net_test.o +obj-$(CONFIG_NET_DEVMEM) += devmem.o +obj-$(CONFIG_DEBUG_NET) += lock_debug.o +obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c new file mode 100644 index 000000000000..850dd736ccd1 --- /dev/null +++ b/net/core/bpf_sk_storage.c @@ -0,0 +1,914 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> +#include <linux/bpf_local_storage.h> +#include <net/bpf_sk_storage.h> +#include <net/sock.h> +#include <uapi/linux/sock_diag.h> +#include <uapi/linux/btf.h> +#include <linux/rcupdate_trace.h> + +DEFINE_BPF_STORAGE_CACHE(sk_cache); + +static struct bpf_local_storage_data * +bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) +{ + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_map *smap; + + sk_storage = + rcu_dereference_check(sk->sk_bpf_storage, bpf_rcu_lock_held()); + if (!sk_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit); +} + +static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = bpf_sk_storage_lookup(sk, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata), false); + + return 0; +} + +/* Called by __sk_destruct() & bpf_sk_storage_clone() */ +void bpf_sk_storage_free(struct sock *sk) +{ + struct bpf_local_storage *sk_storage; + + rcu_read_lock_dont_migrate(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage) + goto out; + + bpf_local_storage_destroy(sk_storage); +out: + rcu_read_unlock_migrate(); +} + +static void bpf_sk_storage_map_free(struct bpf_map *map) +{ + bpf_local_storage_map_free(map, &sk_cache, NULL); +} + +static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) +{ + return bpf_local_storage_map_alloc(attr, &sk_cache, false); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + sdata = bpf_sk_storage_lookup(sock->sk, map, true); + sockfd_put(sock); + return sdata ? sdata->data : NULL; + } + + return ERR_PTR(err); +} + +static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + sdata = bpf_local_storage_update( + sock->sk, (struct bpf_local_storage_map *)map, value, + map_flags, false, GFP_ATOMIC); + sockfd_put(sock); + return PTR_ERR_OR_ZERO(sdata); + } + + return err; +} + +static long bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + err = bpf_sk_storage_del(sock->sk, map); + sockfd_put(sock); + return err; + } + + return err; +} + +static struct bpf_local_storage_elem * +bpf_sk_storage_clone_elem(struct sock *newsk, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage_elem *copy_selem; + + copy_selem = bpf_selem_alloc(smap, newsk, NULL, false, GFP_ATOMIC); + if (!copy_selem) + return NULL; + + if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)) + copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, + SDATA(selem)->data, true); + else + copy_map_value(&smap->map, SDATA(copy_selem)->data, + SDATA(selem)->data); + + return copy_selem; +} + +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) +{ + struct bpf_local_storage *new_sk_storage = NULL; + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_elem *selem; + int ret = 0; + + RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); + + rcu_read_lock_dont_migrate(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + + if (!sk_storage || hlist_empty(&sk_storage->list)) + goto out; + + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + struct bpf_local_storage_elem *copy_selem; + struct bpf_local_storage_map *smap; + struct bpf_map *map; + + smap = rcu_dereference(SDATA(selem)->smap); + if (!(smap->map.map_flags & BPF_F_CLONE)) + continue; + + /* Note that for lockless listeners adding new element + * here can race with cleanup in bpf_local_storage_map_free. + * Try to grab map refcnt to make sure that it's still + * alive and prevent concurrent removal. + */ + map = bpf_map_inc_not_zero(&smap->map); + if (IS_ERR(map)) + continue; + + copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem); + if (!copy_selem) { + ret = -ENOMEM; + bpf_map_put(map); + goto out; + } + + if (new_sk_storage) { + bpf_selem_link_map(smap, copy_selem); + bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); + } else { + ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); + if (ret) { + bpf_selem_free(copy_selem, true); + atomic_sub(smap->elem_size, + &newsk->sk_omem_alloc); + bpf_map_put(map); + goto out; + } + + new_sk_storage = + rcu_dereference(copy_selem->local_storage); + } + bpf_map_put(map); + } + +out: + rcu_read_unlock_migrate(); + + /* In case of an error, don't free anything explicitly here, the + * caller is responsible to call bpf_sk_storage_free. + */ + + return ret; +} + +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags, gfp_t, gfp_flags) +{ + struct bpf_local_storage_data *sdata; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) + return (unsigned long)NULL; + + sdata = bpf_sk_storage_lookup(sk, map, true); + if (sdata) + return (unsigned long)sdata->data; + + if (flags == BPF_SK_STORAGE_GET_F_CREATE && + /* Cannot add new elem to a going away sk. + * Otherwise, the new elem may become a leak + * (and also other memory issues during map + * destruction). + */ + refcount_inc_not_zero(&sk->sk_refcnt)) { + sdata = bpf_local_storage_update( + sk, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST, false, gfp_flags); + /* sk must be a fullsock (guaranteed by verifier), + * so sock_gen_put() is unnecessary. + */ + sock_put(sk); + return IS_ERR(sdata) ? + (unsigned long)NULL : (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) +{ + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (!sk || !sk_fullsock(sk)) + return -EINVAL; + + if (refcount_inc_not_zero(&sk->sk_refcnt)) { + int err; + + err = bpf_sk_storage_del(sk, map); + sock_put(sk); + return err; + } + + return -ENOENT; +} + +static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, + void *owner, u32 size) +{ + struct sock *sk = (struct sock *)owner; + int optmem_max; + + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + /* same check as in sock_kmalloc() */ + if (size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { + atomic_add(size, &sk->sk_omem_alloc); + return 0; + } + + return -ENOMEM; +} + +static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap, + void *owner, u32 size) +{ + struct sock *sk = owner; + + atomic_sub(size, &sk->sk_omem_alloc); +} + +static struct bpf_local_storage __rcu ** +bpf_sk_storage_ptr(void *owner) +{ + struct sock *sk = owner; + + return &sk->sk_bpf_storage; +} + +const struct bpf_map_ops sk_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = bpf_sk_storage_map_alloc, + .map_free = bpf_sk_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, + .map_update_elem = bpf_fd_sk_storage_update_elem, + .map_delete_elem = bpf_fd_sk_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_id = &bpf_local_storage_map_btf_id[0], + .map_local_storage_charge = bpf_sk_storage_charge, + .map_local_storage_uncharge = bpf_sk_storage_uncharge, + .map_owner_storage_ptr = bpf_sk_storage_ptr, + .map_mem_usage = bpf_local_storage_map_mem_usage, +}; + +const struct bpf_func_proto bpf_sk_storage_get_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_CTX, /* context is 'struct sock' */ + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_proto = { + .func = bpf_sk_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + +static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) +{ + if (prog->aux->dst_prog) + return false; + + /* Ensure the tracing program is not tracing + * any bpf_sk_storage*() function and also + * use the bpf_sk_storage_(get|delete) helper. + */ + switch (prog->expected_attach_type) { + case BPF_TRACE_ITER: + case BPF_TRACE_RAW_TP: + /* bpf_sk_storage has no trace point */ + return true; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", + strlen("bpf_sk_storage")); + default: + return false; + } + + return false; +} + +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags, gfp_t, gfp_flags) +{ + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (in_hardirq() || in_nmi()) + return (unsigned long)NULL; + + return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags, + gfp_flags); +} + +BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, + struct sock *, sk) +{ + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (in_hardirq() || in_nmi()) + return -EPERM; + + return ____bpf_sk_storage_delete(map, sk); +} + +const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { + .func = bpf_sk_storage_get_tracing, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .allowed = bpf_sk_storage_tracing_allowed, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { + .func = bpf_sk_storage_delete_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .allowed = bpf_sk_storage_tracing_allowed, +}; + +struct bpf_sk_storage_diag { + u32 nr_maps; + struct bpf_map *maps[]; +}; + +/* The reply will be like: + * INET_DIAG_BPF_SK_STORAGES (nla_nest) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * .... + */ +static int nla_value_size(u32 value_size) +{ + /* SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + */ + return nla_total_size(0) + nla_total_size(sizeof(u32)) + + nla_total_size_64bit(value_size); +} + +void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) +{ + u32 i; + + if (!diag) + return; + + for (i = 0; i < diag->nr_maps; i++) + bpf_map_put(diag->maps[i]); + + kfree(diag); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free); + +static bool diag_check_dup(const struct bpf_sk_storage_diag *diag, + const struct bpf_map *map) +{ + u32 i; + + for (i = 0; i < diag->nr_maps; i++) { + if (diag->maps[i] == map) + return true; + } + + return false; +} + +struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) +{ + struct bpf_sk_storage_diag *diag; + struct nlattr *nla; + u32 nr_maps = 0; + int rem, err; + + /* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN as + * the map_alloc_check() side also does. + */ + if (!bpf_capable()) + return ERR_PTR(-EPERM); + + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + if (nla_len(nla) != sizeof(u32)) + return ERR_PTR(-EINVAL); + nr_maps++; + } + + diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); + if (!diag) + return ERR_PTR(-ENOMEM); + + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + int map_fd = nla_get_u32(nla); + struct bpf_map *map = bpf_map_get(map_fd); + + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto err_free; + } + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) { + bpf_map_put(map); + err = -EINVAL; + goto err_free; + } + if (diag_check_dup(diag, map)) { + bpf_map_put(map); + err = -EEXIST; + goto err_free; + } + diag->maps[diag->nr_maps++] = map; + } + + return diag; + +err_free: + bpf_sk_storage_diag_free(diag); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); + +static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) +{ + struct nlattr *nla_stg, *nla_value; + struct bpf_local_storage_map *smap; + + /* It cannot exceed max nlattr's payload */ + BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE); + + nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE); + if (!nla_stg) + return -EMSGSIZE; + + smap = rcu_dereference(sdata->smap); + if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) + goto errout; + + nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE, + smap->map.value_size, + SK_DIAG_BPF_STORAGE_PAD); + if (!nla_value) + goto errout; + + if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)) + copy_map_value_locked(&smap->map, nla_data(nla_value), + sdata->data, true); + else + copy_map_value(&smap->map, nla_data(nla_value), sdata->data); + + nla_nest_end(skb, nla_stg); + return 0; + +errout: + nla_nest_cancel(skb, nla_stg); + return -EMSGSIZE; +} + +static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + + rcu_read_lock(); + + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + smap = rcu_dereference(SDATA(selem)->smap); + diag_size += nla_value_size(smap->map.value_size); + + if (nla_stgs && diag_get(SDATA(selem), skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} + +int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_data *sdata; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + u32 i; + + *res_diag_size = 0; + + /* No map has been specified. Dump all. */ + if (!diag->nr_maps) + return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type, + res_diag_size); + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + for (i = 0; i < diag->nr_maps; i++) { + sdata = bpf_local_storage_lookup(sk_storage, + (struct bpf_local_storage_map *)diag->maps[i], + false); + + if (!sdata) + continue; + + diag_size += nla_value_size(diag->maps[i]->value_size); + + if (nla_stgs && diag_get(sdata, skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put); + +struct bpf_iter_seq_sk_storage_map_info { + struct bpf_map *map; + unsigned int bucket_id; + unsigned skip_elems; +}; + +static struct bpf_local_storage_elem * +bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, + struct bpf_local_storage_elem *prev_selem) + __acquires(RCU) __releases(RCU) +{ + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_elem *selem; + u32 skip_elems = info->skip_elems; + struct bpf_local_storage_map *smap; + u32 bucket_id = info->bucket_id; + u32 i, count, n_buckets; + struct bpf_local_storage_map_bucket *b; + + smap = (struct bpf_local_storage_map *)info->map; + n_buckets = 1U << smap->bucket_log; + if (bucket_id >= n_buckets) + return NULL; + + /* try to find next selem in the same bucket */ + selem = prev_selem; + count = 0; + while (selem) { + selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)), + struct bpf_local_storage_elem, map_node); + if (!selem) { + /* not found, unlock and go to the next bucket */ + b = &smap->buckets[bucket_id++]; + rcu_read_unlock(); + skip_elems = 0; + break; + } + sk_storage = rcu_dereference(selem->local_storage); + if (sk_storage) { + info->skip_elems = skip_elems + count; + return selem; + } + count++; + } + + for (i = bucket_id; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + rcu_read_lock(); + count = 0; + hlist_for_each_entry_rcu(selem, &b->list, map_node) { + sk_storage = rcu_dereference(selem->local_storage); + if (sk_storage && count >= skip_elems) { + info->bucket_id = i; + info->skip_elems = count; + return selem; + } + count++; + } + rcu_read_unlock(); + skip_elems = 0; + } + + info->bucket_id = i; + info->skip_elems = 0; + return NULL; +} + +static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_local_storage_elem *selem; + + selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL); + if (!selem) + return NULL; + + if (*pos == 0) + ++*pos; + return selem; +} + +static void *bpf_sk_storage_map_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + + ++*pos; + ++info->skip_elems; + return bpf_sk_storage_map_seq_find_next(seq->private, v); +} + +struct bpf_iter__bpf_sk_storage_map { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(struct sock *, sk); + __bpf_md_ptr(void *, value); +}; + +DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta, + struct bpf_map *map, struct sock *sk, + void *value) + +static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, + struct bpf_local_storage_elem *selem) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + struct bpf_iter__bpf_sk_storage_map ctx = {}; + struct bpf_local_storage *sk_storage; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, selem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (selem) { + sk_storage = rcu_dereference(selem->local_storage); + ctx.sk = sk_storage->owner; + ctx.value = SDATA(selem)->data; + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_sk_storage_map_seq_show(seq, v); +} + +static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + if (!v) + (void)__bpf_sk_storage_map_seq_show(seq, v); + else + rcu_read_unlock(); +} + +static int bpf_iter_init_sk_storage_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; + + bpf_map_inc_with_uref(aux->map); + seq_info->map = aux->map; + return 0; +} + +static void bpf_iter_fini_sk_storage_map(void *priv_data) +{ + struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; + + bpf_map_put_with_uref(seq_info->map); +} + +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + goto put_map; + + if (prog->aux->max_rdwr_access > map->value_size) { + err = -EACCES; + goto put_map; + } + + aux->map = map; + return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); +} + +static const struct seq_operations bpf_sk_storage_map_seq_ops = { + .start = bpf_sk_storage_map_seq_start, + .next = bpf_sk_storage_map_seq_next, + .stop = bpf_sk_storage_map_seq_stop, + .show = bpf_sk_storage_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_sk_storage_map_seq_ops, + .init_seq_private = bpf_iter_init_sk_storage_map, + .fini_seq_private = bpf_iter_fini_sk_storage_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_sk_storage_map_info), +}; + +static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { + .target = "bpf_sk_storage_map", + .attach_target = bpf_iter_attach_map, + .detach_target = bpf_iter_detach_map, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__bpf_sk_storage_map, value), + PTR_TO_BUF | PTR_MAYBE_NULL }, + }, + .seq_info = &iter_seq_info, +}; + +static int __init bpf_sk_storage_map_iter_init(void) +{ + bpf_sk_storage_map_reg_info.ctx_arg_info[0].btf_id = + btf_sock_ids[BTF_SOCK_TYPE_SOCK]; + return bpf_iter_reg_target(&bpf_sk_storage_map_reg_info); +} +late_initcall(bpf_sk_storage_map_iter_init); diff --git a/net/core/datagram.c b/net/core/datagram.c index 8ab48cd89559..c285c6465923 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * SUCS NET3: * @@ -36,7 +37,7 @@ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/errno.h> @@ -48,6 +49,10 @@ #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/iov_iter.h> +#include <linux/indirect_call_wrapper.h> +#include <linux/crc32.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -58,6 +63,8 @@ #include <trace/events/skb.h> #include <net/busy_poll.h> +#include "devmem.h" + /* * Is a socket 'connection oriented' ? */ @@ -66,23 +73,22 @@ static inline int connection_based(struct sock *sk) return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } -static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync, +static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { - unsigned long bits = (unsigned long)key; - /* * Avoid a wakeup if event not interesting for us */ - if (bits && !(bits & (POLLIN | POLLERR))) + if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR))) return 0; return autoremove_wake_function(wait, mode, sync, key); } /* * Wait for the last received packet to be different from skb */ -static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, - const struct sk_buff *skb) +int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, + int *err, long *timeo_p, + const struct sk_buff *skb) { int error; DEFINE_WAIT_FUNC(wait, receiver_wake_function); @@ -94,7 +100,7 @@ static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, if (error) goto out_err; - if (sk->sk_receive_queue.prev != skb) + if (READ_ONCE(queue->prev) != skb) goto out; /* Socket shut down? */ @@ -128,15 +134,86 @@ out_noerr: error = 1; goto out; } +EXPORT_SYMBOL(__skb_wait_for_more_packets); + +static struct sk_buff *skb_set_peeked(struct sk_buff *skb) +{ + struct sk_buff *nskb; + + if (skb->peeked) + return skb; + + /* We have to unshare an skb before modifying it. */ + if (!skb_shared(skb)) + goto done; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return ERR_PTR(-ENOMEM); + + skb->prev->next = nskb; + skb->next->prev = nskb; + nskb->prev = skb->prev; + nskb->next = skb->next; + + consume_skb(skb); + skb = nskb; + +done: + skb->peeked = 1; + + return skb; +} + +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, + unsigned int flags, + int *off, int *err, + struct sk_buff **last) +{ + bool peek_at_off = false; + struct sk_buff *skb; + int _off = 0; + + if (unlikely(flags & MSG_PEEK && *off >= 0)) { + peek_at_off = true; + _off = *off; + } + + *last = queue->prev; + skb_queue_walk(queue, skb) { + if (flags & MSG_PEEK) { + if (peek_at_off && _off >= skb->len && + (_off || skb->peeked)) { + _off -= skb->len; + continue; + } + if (!skb->len) { + skb = skb_set_peeked(skb); + if (IS_ERR(skb)) { + *err = PTR_ERR(skb); + return NULL; + } + } + refcount_inc(&skb->users); + } else { + __skb_unlink(skb, queue); + } + *off = _off; + return skb; + } + return NULL; +} /** - * __skb_recv_datagram - Receive a datagram skbuff + * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket - * @flags: MSG_ flags - * @peeked: returns non-zero if this packet has been seen before + * @queue: socket queue from which to receive + * @flags: MSG\_ flags * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts * @err: error code returned + * @last: set to last peeked message to inform the wait function + * what to look for when peeking * * Get a datagram skbuff, understands the peeking, nonblocking wakeups * and possible races. This replaces identical code in packet, raw and @@ -144,9 +221,11 @@ out_noerr: * the long standing peek and read race for datagram sockets. If you * alter this routine remember it must be re-entrant. * - * This function will lock the socket if a skb is returned, so the caller - * needs to unlock the socket in that case (usually by calling - * skb_free_datagram) + * This function will lock the socket if a skb is returned, so + * the caller needs to unlock the socket in that case (usually by + * calling skb_free_datagram). Returns NULL with @err set to + * -EAGAIN if no data was available or to some other value if an + * error was detected. * * * It does not lock socket since today. This function is * * free of race conditions. This measure should/can improve @@ -160,11 +239,13 @@ out_noerr: * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. */ -struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, - int *peeked, int *off, int *err) +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, + struct sk_buff_head *queue, + unsigned int flags, int *off, int *err, + struct sk_buff **last) { - struct sk_buff *skb, *last; - long timeo; + struct sk_buff *skb; + unsigned long cpu_flags; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() */ @@ -173,8 +254,6 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, if (error) goto no_packet; - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. @@ -182,92 +261,100 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, * Look at current nfs client by the way... * However, this function was correct in any case. 8) */ - unsigned long cpu_flags; - struct sk_buff_head *queue = &sk->sk_receive_queue; - int _off = *off; - - last = (struct sk_buff *)queue; spin_lock_irqsave(&queue->lock, cpu_flags); - skb_queue_walk(queue, skb) { - last = skb; - *peeked = skb->peeked; - if (flags & MSG_PEEK) { - if (_off >= skb->len && (skb->len || _off || - skb->peeked)) { - _off -= skb->len; - continue; - } - skb->peeked = 1; - atomic_inc(&skb->users); - } else - __skb_unlink(skb, queue); - - spin_unlock_irqrestore(&queue->lock, cpu_flags); - *off = _off; - return skb; - } + skb = __skb_try_recv_from_queue(queue, flags, off, &error, + last); spin_unlock_irqrestore(&queue->lock, cpu_flags); - - if (sk_can_busy_loop(sk) && - sk_busy_loop(sk, flags & MSG_DONTWAIT)) - continue; - - /* User doesn't want to wait */ - error = -EAGAIN; - if (!timeo) + if (error) goto no_packet; + if (skb) + return skb; - } while (!wait_for_more_packets(sk, err, &timeo, last)); + if (!sk_can_busy_loop(sk)) + break; - return NULL; + sk_busy_loop(sk, flags & MSG_DONTWAIT); + } while (READ_ONCE(queue->prev) != *last); + + error = -EAGAIN; no_packet: *err = error; return NULL; } +EXPORT_SYMBOL(__skb_try_recv_datagram); + +struct sk_buff *__skb_recv_datagram(struct sock *sk, + struct sk_buff_head *sk_queue, + unsigned int flags, int *off, int *err) +{ + struct sk_buff *skb, *last; + long timeo; + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + do { + skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err, + &last); + if (skb) + return skb; + + if (*err != -EAGAIN) + break; + } while (timeo && + !__skb_wait_for_more_packets(sk, sk_queue, err, + &timeo, last)); + + return NULL; +} EXPORT_SYMBOL(__skb_recv_datagram); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, - int noblock, int *err) + int *err) { - int peeked, off = 0; + int off = 0; - return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), - &peeked, &off, err); + return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags, + &off, err); } EXPORT_SYMBOL(skb_recv_datagram); void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { consume_skb(skb); - sk_mem_reclaim_partial(sk); } EXPORT_SYMBOL(skb_free_datagram); -void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) +int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, + struct sk_buff *skb, unsigned int flags, + void (*destructor)(struct sock *sk, + struct sk_buff *skb)) { - bool slow; - - if (likely(atomic_read(&skb->users) == 1)) - smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) - return; + int err = 0; - slow = lock_sock_fast(sk); - skb_orphan(skb); - sk_mem_reclaim_partial(sk); - unlock_sock_fast(sk, slow); + if (flags & MSG_PEEK) { + err = -ENOENT; + spin_lock_bh(&sk_queue->lock); + if (skb->next) { + __skb_unlink(skb, sk_queue); + refcount_dec(&skb->users); + if (destructor) + destructor(sk, skb); + err = 0; + } + spin_unlock_bh(&sk_queue->lock); + } - /* skb is now orphaned, can be freed outside of locked section */ - __kfree_skb(skb); + sk_drops_inc(sk); + return err; } -EXPORT_SYMBOL(skb_free_datagram_locked); +EXPORT_SYMBOL(__sk_queue_drop_skb); /** * skb_kill_datagram - Free a datagram skbuff forcibly * @sk: socket * @skb: datagram skbuff - * @flags: MSG_ flags + * @flags: MSG\_ flags * * This function frees a datagram skbuff that was received by * skb_recv_datagram. The flags argument must match the one @@ -286,56 +373,44 @@ EXPORT_SYMBOL(skb_free_datagram_locked); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) { - int err = 0; - - if (flags & MSG_PEEK) { - err = -ENOENT; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - atomic_dec(&skb->users); - err = 0; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - } + int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags, + NULL); kfree_skb(skb); - atomic_inc(&sk->sk_drops); - sk_mem_reclaim_partial(sk); - return err; } EXPORT_SYMBOL(skb_kill_datagram); -/** - * skb_copy_datagram_iovec - Copy a datagram to an iovec. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: io vector to copy to - * @len: amount of data to copy from buffer to iovec - * - * Note: the iovec is modified during the copy. - */ -int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, - struct iovec *to, int len) +INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr, + size_t bytes, + void *data __always_unused, + struct iov_iter *i)); + +static int __skb_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, bool fault_short, + size_t (*cb)(const void *, size_t, void *, + struct iov_iter *), void *data) { int start = skb_headlen(skb); - int i, copy = start - offset; + int i, copy = start - offset, start_off = offset, n; struct sk_buff *frag_iter; - trace_skb_copy_datagram_iovec(skb, len); - /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; - if (memcpy_toiovec(to, skb->data + offset, copy)) - goto fault; + n = INDIRECT_CALL_1(cb, simple_copy_to_iter, + skb->data + offset, copy, data, to); + offset += n; + if (n != copy) + goto short_copy; if ((len -= copy) == 0) return 0; - offset += copy; } + if (!skb_frags_readable(skb)) + goto short_copy; + /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; @@ -345,21 +420,28 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; if (copy > len) copy = len; - vaddr = kmap(page); - err = memcpy_toiovec(to, vaddr + frag->page_offset + - offset - start, copy); - kunmap(page); - if (err) - goto fault; + + n = 0; + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_local_page(p); + n += INDIRECT_CALL_1(cb, simple_copy_to_iter, + vaddr + p_off, p_len, data, to); + kunmap_local(vaddr); + } + + offset += n; + if (n != copy) + goto short_copy; if (!(len -= copy)) return 0; - offset += copy; } start = end; } @@ -373,9 +455,8 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, if ((copy = end - offset) > 0) { if (copy > len) copy = len; - if (skb_copy_datagram_iovec(frag_iter, - offset - start, - to, copy)) + if (__skb_datagram_iter(frag_iter, offset - start, + to, copy, fault_short, cb, data)) goto fault; if ((len -= copy) == 0) return 0; @@ -386,113 +467,87 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, if (!len) return 0; + /* This is not really a user copy fault, but rather someone + * gave us a bogus length on the skb. We should probably + * print a warning here as it may indicate a kernel bug. + */ + fault: + iov_iter_revert(to, offset - start_off); return -EFAULT; + +short_copy: + if (fault_short || iov_iter_count(to)) + goto fault; + + return 0; +} + +#ifdef CONFIG_NET_CRC32C +static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes, + void *_crcp, struct iov_iter *i) +{ + u32 *crcp = _crcp; + size_t copied; + + copied = copy_to_iter(addr, bytes, i); + *crcp = crc32c(*crcp, addr, copied); + return copied; } -EXPORT_SYMBOL(skb_copy_datagram_iovec); /** - * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. + * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator + * and update a CRC32C value. * @skb: buffer to copy * @offset: offset in the buffer to start copying from - * @to: io vector to copy to - * @to_offset: offset in the io vector to start copying to + * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec + * @crcp: pointer to CRC32C value to update * - * Returns 0 or -EFAULT. - * Note: the iovec is not modified during the copy. + * Return: 0 on success, -EFAULT if there was a fault during copy. */ -int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, - const struct iovec *to, int to_offset, - int len) +int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, u32 *crcp) { - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - vaddr = kmap(page); - err = memcpy_toiovecend(to, vaddr + frag->page_offset + - offset - start, to_offset, copy); - kunmap(page); - if (err) - goto fault; - if (!(len -= copy)) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); + return __skb_datagram_iter(skb, offset, to, len, true, + crc32c_and_copy_to_iter, crcp); +} +EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter); +#endif /* CONFIG_NET_CRC32C */ - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_const_iovec(frag_iter, - offset - start, - to, to_offset, - copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - if (!len) - return 0; +static size_t simple_copy_to_iter(const void *addr, size_t bytes, + void *data __always_unused, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} -fault: - return -EFAULT; +/** + * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + */ +int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len) +{ + trace_skb_copy_datagram_iovec(skb, len); + return __skb_datagram_iter(skb, offset, to, len, false, + simple_copy_to_iter, NULL); } -EXPORT_SYMBOL(skb_copy_datagram_const_iovec); +EXPORT_SYMBOL(skb_copy_datagram_iter); /** - * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. + * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter. * @skb: buffer to copy * @offset: offset in the buffer to start copying to - * @from: io vector to copy to - * @from_offset: offset in the io vector to start copying from + * @from: the copy source * @len: amount of data to copy to buffer from iovec * * Returns 0 or -EFAULT. - * Note: the iovec is not modified during the copy. */ -int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, - const struct iovec *from, int from_offset, +int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, + struct iov_iter *from, int len) { int start = skb_headlen(skb); @@ -503,13 +558,11 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - if (memcpy_fromiovecend(skb->data + offset, from, from_offset, - copy)) + if (copy_from_iter(skb->data + offset, copy, from) != copy) goto fault; if ((len -= copy) == 0) return 0; offset += copy; - from_offset += copy; } /* Copy paged appendix. Hmm... why does this look so complicated? */ @@ -521,24 +574,19 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); + size_t copied; if (copy > len) copy = len; - vaddr = kmap(page); - err = memcpy_fromiovecend(vaddr + frag->page_offset + - offset - start, - from, from_offset, copy); - kunmap(page); - if (err) + copied = copy_page_from_iter(skb_frag_page(frag), + skb_frag_off(frag) + offset - start, + copy, from); + if (copied != copy) goto fault; if (!(len -= copy)) return 0; offset += copy; - from_offset += copy; } start = end; } @@ -552,16 +600,13 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if ((copy = end - offset) > 0) { if (copy > len) copy = len; - if (skb_copy_datagram_from_iovec(frag_iter, - offset - start, - from, - from_offset, - copy)) + if (skb_copy_datagram_from_iter(frag_iter, + offset - start, + from, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; - from_offset += copy; } start = end; } @@ -571,132 +616,276 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, fault: return -EFAULT; } -EXPORT_SYMBOL(skb_copy_datagram_from_iovec); +EXPORT_SYMBOL(skb_copy_datagram_from_iter); -static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, - u8 __user *to, int len, - __wsum *csump) +int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset, + struct iov_iter *from, int len) { - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - int pos = 0; + struct iov_iter_state state; + int ret; + + iov_iter_save_state(from, &state); + ret = skb_copy_datagram_from_iter(skb, offset, from, len); + if (ret) + iov_iter_restore(from, &state); + return ret; +} +EXPORT_SYMBOL(skb_copy_datagram_from_iter_full); - /* Copy header. */ - if (copy > 0) { - int err = 0; - if (copy > len) - copy = len; - *csump = csum_and_copy_to_user(skb->data + offset, to, copy, - *csump, &err); - if (err) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - pos = copy; +int zerocopy_fill_skb_from_iter(struct sk_buff *skb, + struct iov_iter *from, size_t length) +{ + int frag = skb_shinfo(skb)->nr_frags; + + if (!skb_frags_readable(skb)) + return -EFAULT; + + while (length && iov_iter_count(from)) { + struct page *head, *last_head = NULL; + struct page *pages[MAX_SKB_FRAGS]; + int refs, order, n = 0; + size_t start; + ssize_t copied; + + if (frag == MAX_SKB_FRAGS) + return -EMSGSIZE; + + copied = iov_iter_get_pages2(from, pages, length, + MAX_SKB_FRAGS - frag, &start); + if (copied < 0) + return -EFAULT; + + length -= copied; + + skb->data_len += copied; + skb->len += copied; + skb->truesize += PAGE_ALIGN(copied + start); + + head = compound_head(pages[n]); + order = compound_order(head); + + for (refs = 0; copied != 0; start = 0) { + int size = min_t(int, copied, PAGE_SIZE - start); + + if (pages[n] - head > (1UL << order) - 1) { + head = compound_head(pages[n]); + order = compound_order(head); + } + + start += (pages[n] - head) << PAGE_SHIFT; + copied -= size; + n++; + if (frag) { + skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1]; + + if (head == skb_frag_page(last) && + start == skb_frag_off(last) + skb_frag_size(last)) { + skb_frag_size_add(last, size); + /* We combined this page, we need to release + * a reference. Since compound pages refcount + * is shared among many pages, batch the refcount + * adjustments to limit false sharing. + */ + last_head = head; + refs++; + continue; + } + } + if (refs) { + page_ref_sub(last_head, refs); + refs = 0; + } + skb_fill_page_desc_noacc(skb, frag++, head, start, size); + } + if (refs) + page_ref_sub(last_head, refs); } + return 0; +} - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; +static int +zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, + int length, + struct net_devmem_dmabuf_binding *binding) +{ + int i = skb_shinfo(skb)->nr_frags; + size_t virt_addr, size, off; + struct net_iov *niov; - WARN_ON(start > offset + len); + /* Devmem filling works by taking an IOVEC from the user where the + * iov_addrs are interpreted as an offset in bytes into the dma-buf to + * send from. We do not support other iter types. + */ + if (iov_iter_type(from) != ITER_IOVEC && + iov_iter_type(from) != ITER_UBUF) + return -EFAULT; + + while (length && iov_iter_count(from)) { + if (i == MAX_SKB_FRAGS) + return -EMSGSIZE; + + virt_addr = (size_t)iter_iov_addr(from); + niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size); + if (!niov) + return -EFAULT; + + size = min_t(size_t, size, length); + size = min_t(size_t, size, iter_iov_len(from)); + + get_netmem(net_iov_to_netmem(niov)); + skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off, + size, PAGE_SIZE); + iov_iter_advance(from, size); + length -= size; + i++; + } - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - __wsum csum2; - int err = 0; - u8 *vaddr; - struct page *page = skb_frag_page(frag); + return 0; +} - if (copy > len) - copy = len; - vaddr = kmap(page); - csum2 = csum_and_copy_to_user(vaddr + - frag->page_offset + - offset - start, - to, copy, 0, &err); - kunmap(page); - if (err) - goto fault; - *csump = csum_block_add(*csump, csum2, pos); - if (!(len -= copy)) - return 0; - offset += copy; - to += copy; - pos += copy; - } - start = end; +int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb, struct iov_iter *from, + size_t length, + struct net_devmem_dmabuf_binding *binding) +{ + unsigned long orig_size = skb->truesize; + unsigned long truesize; + int ret; + + if (msg && msg->msg_ubuf && msg->sg_from_iter) + ret = msg->sg_from_iter(skb, from, length); + else if (binding) + ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding); + else + ret = zerocopy_fill_skb_from_iter(skb, from, length); + + truesize = skb->truesize - orig_size; + if (sk && sk->sk_type == SOCK_STREAM) { + sk_wmem_queued_add(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); + } else { + refcount_add(truesize, &skb->sk->sk_wmem_alloc); } + return ret; +} +EXPORT_SYMBOL(__zerocopy_sg_from_iter); - skb_walk_frags(skb, frag_iter) { - int end; +/** + * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter + * @skb: buffer to copy + * @from: the source to copy from + * + * The function will first copy up to headlen, and then pin the userspace + * pages and build frags through them. + * + * Returns 0, -EFAULT or -EMSGSIZE. + */ +int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) +{ + int copy = min_t(int, skb_headlen(skb), iov_iter_count(from)); - WARN_ON(start > offset + len); + /* copy up to skb headlen */ + if (skb_copy_datagram_from_iter(skb, 0, from, copy)) + return -EFAULT; - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - __wsum csum2 = 0; - if (copy > len) - copy = len; - if (skb_copy_and_csum_datagram(frag_iter, - offset - start, - to, copy, - &csum2)) - goto fault; - *csump = csum_block_add(*csump, csum2, pos); - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - pos += copy; - } - start = end; - } - if (!len) - return 0; + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL); +} +EXPORT_SYMBOL(zerocopy_sg_from_iter); -fault: - return -EFAULT; +static __always_inline +size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress, + size_t len, void *from, void *priv2) +{ + __wsum next, *csum = priv2; + + next = csum_and_copy_to_user(from + progress, iter_to, len); + *csum = csum_block_add(*csum, next, progress); + return next ? 0 : len; } -__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) +static __always_inline +size_t memcpy_to_iter_csum(void *iter_to, size_t progress, + size_t len, void *from, void *priv2) { - __sum16 sum; + __wsum *csum = priv2; + __wsum next = csum_partial_copy_nocheck(from + progress, iter_to, len); - sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) - netdev_rx_csum_fault(skb->dev); - skb->ip_summed = CHECKSUM_UNNECESSARY; + *csum = csum_block_add(*csum, next, progress); + return 0; +} + +struct csum_state { + __wsum csum; + size_t off; +}; + +static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, + struct iov_iter *i) +{ + struct csum_state *csstate = _csstate; + __wsum sum; + + if (WARN_ON_ONCE(i->data_source)) + return 0; + if (unlikely(iov_iter_is_discard(i))) { + // can't use csum_memcpy() for that one - data is not copied + csstate->csum = csum_block_add(csstate->csum, + csum_partial(addr, bytes, 0), + csstate->off); + csstate->off += bytes; + return bytes; } - return sum; + + sum = csum_shift(csstate->csum, csstate->off); + + bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum, + copy_to_user_iter_csum, + memcpy_to_iter_csum); + csstate->csum = csum_shift(sum, csstate->off); + csstate->off += bytes; + return bytes; } -EXPORT_SYMBOL(__skb_checksum_complete_head); -__sum16 __skb_checksum_complete(struct sk_buff *skb) +/** + * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator + * and update a checksum. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + * @csump: checksum pointer + */ +static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, + __wsum *csump) { - return __skb_checksum_complete_head(skb, skb->len); + struct csum_state csdata = { .csum = *csump }; + int ret; + + ret = __skb_datagram_iter(skb, offset, to, len, true, + csum_and_copy_to_iter, &csdata); + if (ret) + return ret; + + *csump = csdata.csum; + return 0; } -EXPORT_SYMBOL(__skb_checksum_complete); /** - * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. + * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. * @skb: skbuff * @hlen: hardware length - * @iov: io vector + * @msg: destination * * Caller _must_ check that skb will fit to this iovec. * * Returns: 0 - success. * -EINVAL - checksum failure. - * -EFAULT - fault during copy. Beware, in this case iovec - * can be modified! + * -EFAULT - fault during copy. */ -int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, - int hlen, struct iovec *iov) +int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, + int hlen, struct msghdr *msg) { __wsum csum; int chunk = skb->len - hlen; @@ -704,89 +893,113 @@ int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, if (!chunk) return 0; - /* Skip filled elements. - * Pretty silly, look at memcpy_toiovec, though 8) - */ - while (!iov->iov_len) - iov++; - - if (iov->iov_len < chunk) { + if (msg_data_left(msg) < chunk) { if (__skb_checksum_complete(skb)) - goto csum_error; - if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) + return -EINVAL; + if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) goto fault; } else { csum = csum_partial(skb->data, hlen, skb->csum); - if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, + if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, chunk, &csum)) goto fault; - if (csum_fold(csum)) - goto csum_error; - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) - netdev_rx_csum_fault(skb->dev); - iov->iov_len -= chunk; - iov->iov_base += chunk; + + if (csum_fold(csum)) { + iov_iter_revert(&msg->msg_iter, chunk); + return -EINVAL; + } + + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(NULL, skb); } return 0; -csum_error: - return -EINVAL; fault: return -EFAULT; } -EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); +EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** - * datagram_poll - generic datagram poll + * datagram_poll_queue - same as datagram_poll, but on a specific receive + * queue * @file: file struct * @sock: socket * @wait: poll table + * @rcv_queue: receive queue to poll * - * Datagram poll: Again totally generic. This also handles - * sequenced packet sockets providing the socket receive queue - * is only ever holding data ready to receive. + * Performs polling on the given receive queue, handling shutdown, error, + * and connection state. This is useful for protocols that deliver + * userspace-bound packets through a custom queue instead of + * sk->sk_receive_queue. * - * Note: when you _don't_ use this routine for this protocol, - * and you use a different write policy from sock_writeable() - * then please supply your own write_space callback. + * Return: poll bitmask indicating the socket's current state */ -unsigned int datagram_poll(struct file *file, struct socket *sock, - poll_table *wait) +__poll_t datagram_poll_queue(struct file *file, struct socket *sock, + poll_table *wait, struct sk_buff_head *rcv_queue) { struct sock *sk = sock->sk; - unsigned int mask; + __poll_t mask; + u8 shutdown; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ - if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) - mask |= POLLERR | - (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0); + if (READ_ONCE(sk->sk_err) || + !skb_queue_empty_lockless(&sk->sk_error_queue)) + mask |= EPOLLERR | + (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); - if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP | POLLIN | POLLRDNORM; - if (sk->sk_shutdown == SHUTDOWN_MASK) - mask |= POLLHUP; + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown & RCV_SHUTDOWN) + mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; + if (shutdown == SHUTDOWN_MASK) + mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue)) - mask |= POLLIN | POLLRDNORM; + if (!skb_queue_empty_lockless(rcv_queue)) + mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (connection_based(sk)) { - if (sk->sk_state == TCP_CLOSE) - mask |= POLLHUP; + int state = READ_ONCE(sk->sk_state); + + if (state == TCP_CLOSE) + mask |= EPOLLHUP; /* connection hasn't started yet? */ - if (sk->sk_state == TCP_SYN_SENT) + if (state == TCP_SYN_SENT) return mask; } /* writable? */ if (sock_writeable(sk)) - mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } +EXPORT_SYMBOL(datagram_poll_queue); + +/** + * datagram_poll - generic datagram poll + * @file: file struct + * @sock: socket + * @wait: poll table + * + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. + * + * Note: when you *don't* use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. + * + * Return: poll bitmask indicating the socket's current state + */ +__poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + return datagram_poll_queue(file, sock, wait, + &sock->sk->sk_receive_queue); +} EXPORT_SYMBOL(datagram_poll); diff --git a/net/core/dev.c b/net/core/dev.c index a3d8d44cb7f4..9094c0fb8c68 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * NET3 Protocol independent device support routines. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. + * NET3 Protocol independent device support routines. * * Derived from the non IP parts of dev.c 1.0.19 - * Authors: Ross Biro + * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * @@ -21,9 +17,9 @@ * * Changes: * D.J. Barrow : Fixed bug where dev->refcnt gets set - * to 2 if register_netdev gets called - * before net_dev_init & also removed a - * few lines of code in the process. + * to 2 if register_netdev gets called + * before net_dev_init & also removed a + * few lines of code in the process. * Alan Cox : device private ioctl copies fields back. * Alan Cox : Transmit queue code does relevant * stunts to keep the queue safe. @@ -36,7 +32,7 @@ * Alan Cox : 100 backlog just doesn't cut it when * you start doing multicast video 8) * Alan Cox : Rewrote net_bh and list manager. - * Alan Cox : Fix ETH_P_ALL echoback lengths. + * Alan Cox : Fix ETH_P_ALL echoback lengths. * Alan Cox : Took out transmit every packet pass * Saved a few bytes in the ioctl handler * Alan Cox : Network driver sets packet type before @@ -46,7 +42,7 @@ * Richard Kooijman: Timestamp fixes. * Alan Cox : Wrong field in SIOCGIFDSTADDR * Alan Cox : Device lock protection. - * Alan Cox : Fixed nasty side effect of device close + * Alan Cox : Fixed nasty side effect of device close * changes. * Rudi Cilibrasi : Pass the right thing to * set_mac_address() @@ -67,13 +63,13 @@ * Paul Rusty Russell : SIOCSIFNAME * Pekka Riikonen : Netdev boot-time settings code * Andrew Morton : Make unregister_netdevice wait - * indefinitely on dev->refcnt - * J Hadi Salim : - Backlog queue sampling + * indefinitely on dev->refcnt + * J Hadi Salim : - Backlog queue sampling * - netif_rx() feedback */ -#include <asm/uaccess.h> -#include <linux/bitops.h> +#include <linux/uaccess.h> +#include <linux/bitmap.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/types.h> @@ -81,7 +77,11 @@ #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/isolation.h> +#include <linux/sched/mm.h> +#include <linux/smpboot.h> #include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> @@ -92,16 +92,26 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> -#include <linux/notifier.h> +#include <linux/ethtool_netlink.h> #include <linux/skbuff.h> +#include <linux/kthread.h> +#include <linux/bpf.h> +#include <linux/bpf_trace.h> #include <net/net_namespace.h> #include <net/sock.h> +#include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> +#include <net/dsa.h> #include <net/dst.h> +#include <net/dst_metadata.h> +#include <net/gro.h> +#include <net/netdev_queues.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> +#include <net/tcx.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> @@ -118,6 +128,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> +#include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> @@ -125,65 +136,64 @@ #include <trace/events/napi.h> #include <trace/events/net.h> #include <trace/events/skb.h> -#include <linux/pci.h> +#include <trace/events/qdisc.h> +#include <trace/events/xdp.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> #include <linux/hashtable.h> #include <linux/vmalloc.h> - +#include <linux/if_macvlan.h> +#include <linux/errqueue.h> +#include <linux/hrtimer.h> +#include <linux/netfilter_netdev.h> +#include <linux/crash_dump.h> +#include <linux/sctp.h> +#include <net/udp_tunnel.h> +#include <linux/net_namespace.h> +#include <linux/indirect_call_wrapper.h> +#include <net/devlink.h> +#include <linux/pm_runtime.h> +#include <linux/prandom.h> +#include <linux/once_lite.h> +#include <net/netdev_lock.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/types.h> +#include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> +#include <net/rps.h> +#include <linux/phy_link_topology.h> + +#include "dev.h" +#include "devmem.h" #include "net-sysfs.h" -/* Instead of increasing this, you should create a hash table. */ -#define MAX_GRO_SKBS 8 - -/* This should be increased if a protocol with a bigger head is added. */ -#define GRO_MAX_HEAD (MAX_HEADER + 128) - static DEFINE_SPINLOCK(ptype_lock); -static DEFINE_SPINLOCK(offload_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; -struct list_head ptype_all __read_mostly; /* Taps */ -static struct list_head offload_base __read_mostly; -/* - * The @dev_base_head list is protected by @dev_base_lock and the rtnl - * semaphore. - * - * Pure readers hold dev_base_lock for reading, or rcu_read_lock() - * - * Writers must hold the rtnl semaphore while they loop through the - * dev_base_head list, and hold dev_base_lock for writing when they do the - * actual updates. This allows pure readers to access the list even - * while a writer is preparing to update it. - * - * To put it another way, dev_base_lock is held for writing only to - * protect against pure readers; the rtnl semaphore provides the - * protection against other writers. - * - * See, for example usages, register_netdevice() and - * unregister_netdevice(), which must be called with the rtnl - * semaphore held. - */ -DEFINE_RWLOCK(dev_base_lock); -EXPORT_SYMBOL(dev_base_lock); +static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack); + +static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); -static unsigned int napi_gen_id; -static DEFINE_HASHTABLE(napi_hash, 8); - -seqcount_t devnet_rename_seq; +static unsigned int napi_gen_id = NR_CPUS; +static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static inline void dev_base_seq_inc(struct net *net) { - while (++net->dev_base_seq == 0); + unsigned int val = net->dev_base_seq + 1; + + WRITE_ONCE(net->dev_base_seq, val ?: 1); } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { - unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } @@ -193,33 +203,218 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock(struct softnet_data *sd) +#ifndef CONFIG_PREEMPT_RT + +static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); + +static int __init setup_backlog_napi_threads(char *arg) { -#ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); -#endif + static_branch_enable(&use_backlog_threads_key); + return 0; } +early_param("thread_backlog_napi", setup_backlog_napi_threads); -static inline void rps_unlock(struct softnet_data *sd) +static bool use_backlog_threads(void) { -#ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); + return static_branch_unlikely(&use_backlog_threads_key); +} + +#else + +static bool use_backlog_threads(void) +{ + return true; +} + #endif + +static inline void backlog_lock_irq_save(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); + else + local_irq_save(*flags); +} + +static inline void backlog_lock_irq_disable(struct softnet_data *sd) +{ + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + spin_lock_irq(&sd->input_pkt_queue.lock); + else + local_irq_disable(); +} + +static inline void backlog_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); + else + local_irq_restore(*flags); +} + +static inline void backlog_unlock_irq_enable(struct softnet_data *sd) +{ + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + spin_unlock_irq(&sd->input_pkt_queue.lock); + else + local_irq_enable(); +} + +static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, + const char *name) +{ + struct netdev_name_node *name_node; + + name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); + if (!name_node) + return NULL; + INIT_HLIST_NODE(&name_node->hlist); + name_node->dev = dev; + name_node->name = name; + return name_node; +} + +static struct netdev_name_node * +netdev_name_node_head_alloc(struct net_device *dev) +{ + struct netdev_name_node *name_node; + + name_node = netdev_name_node_alloc(dev, dev->name); + if (!name_node) + return NULL; + INIT_LIST_HEAD(&name_node->list); + return name_node; +} + +static void netdev_name_node_free(struct netdev_name_node *name_node) +{ + kfree(name_node); +} + +static void netdev_name_node_add(struct net *net, + struct netdev_name_node *name_node) +{ + hlist_add_head_rcu(&name_node->hlist, + dev_name_hash(net, name_node->name)); +} + +static void netdev_name_node_del(struct netdev_name_node *name_node) +{ + hlist_del_rcu(&name_node->hlist); +} + +static struct netdev_name_node *netdev_name_node_lookup(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry_rcu(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +bool netdev_name_in_use(struct net *net, const char *name) +{ + return netdev_name_node_lookup(net, name); +} +EXPORT_SYMBOL(netdev_name_in_use); + +int netdev_name_node_alt_create(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (name_node) + return -EEXIST; + name_node = netdev_name_node_alloc(dev, name); + if (!name_node) + return -ENOMEM; + netdev_name_node_add(net, name_node); + /* The node that holds dev->name acts as a head of per-device list. */ + list_add_tail_rcu(&name_node->list, &dev->name_node->list); + + return 0; +} + +static void netdev_name_node_alt_free(struct rcu_head *head) +{ + struct netdev_name_node *name_node = + container_of(head, struct netdev_name_node, rcu); + + kfree(name_node->name); + netdev_name_node_free(name_node); +} + +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + netdev_name_node_del(name_node); + list_del(&name_node->list); + call_rcu(&name_node->rcu, netdev_name_node_alt_free); +} + +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (!name_node) + return -ENOENT; + /* lookup might have found our primary name or a name belonging + * to another device. + */ + if (name_node == dev->name_node || name_node->dev != dev) + return -EINVAL; + + __netdev_name_node_alt_destroy(name_node); + return 0; +} + +static void netdev_name_node_alt_flush(struct net_device *dev) +{ + struct netdev_name_node *name_node, *tmp; + + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) { + list_del(&name_node->list); + netdev_name_node_alt_free(&name_node->rcu); + } } /* Device list insertion */ static void list_netdevice(struct net_device *dev) { + struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); - write_lock_bh(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock_bh(&dev_base_lock); + + netdev_for_each_altname(dev, name_node) + netdev_name_node_add(net, name_node); + + /* We reserved the ifindex, this can't fail */ + WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } @@ -229,14 +424,20 @@ static void list_netdevice(struct net_device *dev) */ static void unlist_netdevice(struct net_device *dev) { + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + ASSERT_RTNL(); + xa_erase(&net->dev_by_index, dev->ifindex); + + netdev_for_each_altname(dev, name_node) + netdev_name_node_del(name_node); + /* Unlink dev from the device chain */ - write_lock_bh(&dev_base_lock); list_del_rcu(&dev->dev_list); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock_bh(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -252,16 +453,26 @@ static RAW_NOTIFIER_HEAD(netdev_chain); * queue in the local softnet handler. */ -DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = { + .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock), +}; EXPORT_PER_CPU_SYMBOL(softnet_data); +/* Page_pool has a lockless array/stack to alloc/recycle pages. + * PP consumers must pay attention to run APIs in the appropriate context + * (e.g. NAPI context). + */ +DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; + #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ -static const unsigned short netdev_lock_type[] = - {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, +static const unsigned short netdev_lock_type[] = { + ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, @@ -277,22 +488,22 @@ static const unsigned short netdev_lock_type[] = ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; -static const char *const netdev_lock_name[] = - {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", - "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", - "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", - "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", - "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", - "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", - "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", - "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", - "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", - "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", - "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", - "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", - "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", - "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", - "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; +static const char *const netdev_lock_name[] = { + "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; @@ -332,16 +543,18 @@ static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { } + static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { } #endif /******************************************************************************* + * + * Protocol management and registration routines + * + *******************************************************************************/ - Protocol management and registration routines - -*******************************************************************************/ /* * Add a protocol ID to the list. Now that the input handler is @@ -361,10 +574,19 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { - if (pt->type == htons(ETH_P_ALL)) - return &ptype_all; - else - return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; + if (pt->type == htons(ETH_P_ALL)) { + if (!pt->af_packet_net && !pt->dev) + return NULL; + + return pt->dev ? &pt->dev->ptype_all : + &pt->af_packet_net->ptype_all; + } + + if (pt->dev) + return &pt->dev->ptype_specific; + + return pt->af_packet_net ? &pt->af_packet_net->ptype_specific : + &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** @@ -384,6 +606,9 @@ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); + if (WARN_ON_ONCE(!head)) + return; + spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); @@ -408,6 +633,9 @@ void __dev_remove_pack(struct packet_type *pt) struct list_head *head = ptype_head(pt); struct packet_type *pt1; + if (!head) + return; + spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { @@ -444,286 +672,258 @@ void dev_remove_pack(struct packet_type *pt) EXPORT_SYMBOL(dev_remove_pack); -/** - * dev_add_offload - register offload handlers - * @po: protocol offload declaration +/******************************************************************************* * - * Add protocol offload handlers to the networking stack. The passed - * &proto_offload is linked into kernel lists and may not be freed until - * it has been removed from the kernel lists. + * Device Interface Subroutines * - * This call does not sleep therefore it can not - * guarantee all CPU's that are in middle of receiving packets - * will see the new offload handlers (until the next received packet). + *******************************************************************************/ + +/** + * dev_get_iflink - get 'iflink' value of a interface + * @dev: targeted interface + * + * Indicates the ifindex the interface is linked to. + * Physical interfaces have the same 'ifindex' and 'iflink' values. */ -void dev_add_offload(struct packet_offload *po) + +int dev_get_iflink(const struct net_device *dev) { - struct list_head *head = &offload_base; + if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) + return dev->netdev_ops->ndo_get_iflink(dev); - spin_lock(&offload_lock); - list_add_rcu(&po->list, head); - spin_unlock(&offload_lock); + return READ_ONCE(dev->ifindex); } -EXPORT_SYMBOL(dev_add_offload); +EXPORT_SYMBOL(dev_get_iflink); /** - * __dev_remove_offload - remove offload handler - * @po: packet offload declaration - * - * Remove a protocol offload handler that was previously added to the - * kernel offload handlers by dev_add_offload(). The passed &offload_type - * is removed from the kernel lists and can be freed or reused once this - * function returns. + * dev_fill_metadata_dst - Retrieve tunnel egress information. + * @dev: targeted interface + * @skb: The packet. * - * The packet type might still be in use by receivers - * and must not be freed until after all the CPU's have gone - * through a quiescent state. + * For better visibility of tunnel traffic OVS needs to retrieve + * egress tunnel information for a packet. Following API allows + * user to get this info. */ -void __dev_remove_offload(struct packet_offload *po) +int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { - struct list_head *head = &offload_base; - struct packet_offload *po1; + struct ip_tunnel_info *info; - spin_lock(&offload_lock); + if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) + return -EINVAL; - list_for_each_entry(po1, head, list) { - if (po == po1) { - list_del_rcu(&po->list); - goto out; - } - } + info = skb_tunnel_info_unclone(skb); + if (!info) + return -ENOMEM; + if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) + return -EINVAL; - pr_warn("dev_remove_offload: %p not found\n", po); -out: - spin_unlock(&offload_lock); + return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); } -EXPORT_SYMBOL(__dev_remove_offload); +EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); -/** - * dev_remove_offload - remove packet offload handler - * @po: packet offload declaration - * - * Remove a packet offload handler that was previously added to the kernel - * offload handlers by dev_add_offload(). The passed &offload_type is - * removed from the kernel lists and can be freed or reused once this - * function returns. - * - * This call sleeps to guarantee that no CPU is looking at the packet - * type after return. - */ -void dev_remove_offload(struct packet_offload *po) +static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) { - __dev_remove_offload(po); + int k = stack->num_paths++; - synchronize_net(); + if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) + return NULL; + + return &stack->path[k]; } -EXPORT_SYMBOL(dev_remove_offload); -/****************************************************************************** +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, + struct net_device_path_stack *stack) +{ + const struct net_device *last_dev; + struct net_device_path_ctx ctx = { + .dev = dev, + }; + struct net_device_path *path; + int ret = 0; - Device Boot-time Settings Routines + memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); + stack->num_paths = 0; + while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { + last_dev = ctx.dev; + path = dev_fwd_path(stack); + if (!path) + return -1; -*******************************************************************************/ + memset(path, 0, sizeof(struct net_device_path)); + ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); + if (ret < 0) + return -1; -/* Boot time configuration table */ -static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; + if (WARN_ON_ONCE(last_dev == ctx.dev)) + return -1; + } -/** - * netdev_boot_setup_add - add new setup entry - * @name: name of the device - * @map: configured settings for the device - * - * Adds new setup entry to the dev_boot_setup list. The function - * returns 0 on error and 1 on success. This is a generic routine to - * all netdevices. - */ -static int netdev_boot_setup_add(char *name, struct ifmap *map) -{ - struct netdev_boot_setup *s; - int i; + if (!ctx.dev) + return ret; - s = dev_boot_setup; - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { - memset(s[i].name, 0, sizeof(s[i].name)); - strlcpy(s[i].name, name, IFNAMSIZ); - memcpy(&s[i].map, map, sizeof(s[i].map)); - break; - } - } + path = dev_fwd_path(stack); + if (!path) + return -1; + path->type = DEV_PATH_ETHERNET; + path->dev = ctx.dev; - return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; + return ret; } +EXPORT_SYMBOL_GPL(dev_fill_forward_path); -/** - * netdev_boot_setup_check - check boot time settings - * @dev: the netdevice - * - * Check boot time settings for the device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found, 1 if they are. - */ -int netdev_boot_setup_check(struct net_device *dev) +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct *napi_by_id(unsigned int napi_id) { - struct netdev_boot_setup *s = dev_boot_setup; - int i; + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && - !strcmp(dev->name, s[i].name)) { - dev->irq = s[i].map.irq; - dev->base_addr = s[i].map.base_addr; - dev->mem_start = s[i].map.mem_start; - dev->mem_end = s[i].map.mem_end; - return 1; - } - } - return 0; -} -EXPORT_SYMBOL(netdev_boot_setup_check); + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + return NULL; +} -/** - * netdev_boot_base - get address from boot time settings - * @prefix: prefix for network device - * @unit: id for network device - * - * Check boot time settings for the base address of device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found. - */ -unsigned long netdev_boot_base(const char *prefix, int unit) +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct * +netdev_napi_by_id(struct net *net, unsigned int napi_id) { - const struct netdev_boot_setup *s = dev_boot_setup; - char name[IFNAMSIZ]; - int i; + struct napi_struct *napi; - sprintf(name, "%s%d", prefix, unit); + napi = napi_by_id(napi_id); + if (!napi) + return NULL; - /* - * If device already registered then return base of 1 - * to indicate not to probe for this interface - */ - if (__dev_get_by_name(&init_net, name)) - return 1; + if (WARN_ON_ONCE(!napi->dev)) + return NULL; + if (!net_eq(net, dev_net(napi->dev))) + return NULL; - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) - if (!strcmp(name, s[i].name)) - return s[i].map.base_addr; - return 0; + return napi; } -/* - * Saves at boot time configured settings for any netdevice. +/** + * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it + * @net: the applicable net namespace + * @napi_id: ID of a NAPI of a target device + * + * Find a NAPI instance with @napi_id. Lock its device. + * The device must be in %NETREG_REGISTERED state for lookup to succeed. + * netdev_unlock() must be called to release it. + * + * Return: pointer to NAPI, its device with lock held, NULL if not found. */ -int __init netdev_boot_setup(char *str) +struct napi_struct * +netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) { - int ints[5]; - struct ifmap map; - - str = get_options(str, ARRAY_SIZE(ints), ints); - if (!str || !*str) - return 0; - - /* Save settings */ - memset(&map, 0, sizeof(map)); - if (ints[0] > 0) - map.irq = ints[1]; - if (ints[0] > 1) - map.base_addr = ints[2]; - if (ints[0] > 2) - map.mem_start = ints[3]; - if (ints[0] > 3) - map.mem_end = ints[4]; + struct napi_struct *napi; + struct net_device *dev; - /* Add new entry to the list */ - return netdev_boot_setup_add(str, &map); -} + rcu_read_lock(); + napi = netdev_napi_by_id(net, napi_id); + if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) { + rcu_read_unlock(); + return NULL; + } -__setup("netdev=", netdev_boot_setup); + dev = napi->dev; + dev_hold(dev); + rcu_read_unlock(); -/******************************************************************************* + dev = __netdev_put_lock(dev, net); + if (!dev) + return NULL; - Device Interface Subroutines + rcu_read_lock(); + napi = netdev_napi_by_id(net, napi_id); + if (napi && napi->dev != dev) + napi = NULL; + rcu_read_unlock(); -*******************************************************************************/ + if (!napi) + netdev_unlock(dev); + return napi; +} /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find * - * Find an interface by name. Must be called under RTNL semaphore - * or @dev_base_lock. If the name is found a pointer to the device - * is returned. If the name is not found then %NULL is returned. The + * Find an interface by name. Must be called under RTNL semaphore. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ struct net_device *__dev_get_by_name(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); - - hlist_for_each_entry(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; + struct netdev_name_node *node_name; - return NULL; + node_name = netdev_name_node_lookup(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); /** - * dev_get_by_name_rcu - find a device by its name - * @net: the applicable net namespace - * @name: name to find + * dev_get_by_name_rcu - find a device by its name + * @net: the applicable net namespace + * @name: name to find * - * Find an interface by name. - * If the name is found a pointer to the device is returned. - * If the name is not found then %NULL is returned. - * The reference counters are not incremented so the caller must be - * careful with locks. The caller must hold RCU lock. + * Find an interface by name. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. */ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); - - hlist_for_each_entry_rcu(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; + struct netdev_name_node *node_name; - return NULL; + node_name = netdev_name_node_lookup_rcu(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); +/* Deprecated for new users, call netdev_get_by_name() instead */ +struct net_device *dev_get_by_name(struct net *net, const char *name) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_name); + /** - * dev_get_by_name - find a device by its name + * netdev_get_by_name() - find a device by its name * @net: the applicable net namespace * @name: name to find + * @tracker: tracking object for the acquired reference + * @gfp: allocation flags for the tracker * * Find an interface by name. This can be called from any * context and does its own locking. The returned handle has - * the usage count incremented and the caller must use dev_put() to + * the usage count incremented and the caller must use netdev_put() to * release it when it is no longer needed. %NULL is returned if no * matching device is found. */ - -struct net_device *dev_get_by_name(struct net *net, const char *name) +struct net_device *netdev_get_by_name(struct net *net, const char *name, + netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; - rcu_read_lock(); - dev = dev_get_by_name_rcu(net, name); + dev = dev_get_by_name(net, name); if (dev) - dev_hold(dev); - rcu_read_unlock(); + netdev_tracker_alloc(dev, tracker, gfp); return dev; } -EXPORT_SYMBOL(dev_get_by_name); +EXPORT_SYMBOL(netdev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex @@ -733,8 +933,7 @@ EXPORT_SYMBOL(dev_get_by_name); * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful - * about locking. The caller must hold either the RTNL semaphore - * or @dev_base_lock. + * about locking. The caller must hold the RTNL semaphore. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) @@ -774,63 +973,229 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) } EXPORT_SYMBOL(dev_get_by_index_rcu); +/* Deprecated for new users, call netdev_get_by_index() instead */ +struct net_device *dev_get_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_index); /** - * dev_get_by_index - find a device by its ifindex + * netdev_get_by_index() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device + * @tracker: tracking object for the acquired reference + * @gfp: allocation flags for the tracker * * Search for an interface by index. Returns NULL if the device * is not found or a pointer to the device. The device returned has * had a reference added and the pointer is safe until the user calls - * dev_put to indicate they have finished with it. + * netdev_put() to indicate they have finished with it. */ +struct net_device *netdev_get_by_index(struct net *net, int ifindex, + netdevice_tracker *tracker, gfp_t gfp) +{ + struct net_device *dev; -struct net_device *dev_get_by_index(struct net *net, int ifindex) + dev = dev_get_by_index(net, ifindex); + if (dev) + netdev_tracker_alloc(dev, tracker, gfp); + return dev; +} +EXPORT_SYMBOL(netdev_get_by_index); + +/** + * dev_get_by_napi_id - find a device by napi_id + * @napi_id: ID of the NAPI struct + * + * Search for an interface by NAPI ID. Returns %NULL if the device + * is not found or a pointer to the device. The device has not had + * its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ +struct net_device *dev_get_by_napi_id(unsigned int napi_id) +{ + struct napi_struct *napi; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!napi_id_valid(napi_id)) + return NULL; + + napi = napi_by_id(napi_id); + + return napi ? napi->dev : NULL; +} + +/* Release the held reference on the net_device, and if the net_device + * is still registered try to lock the instance lock. If device is being + * unregistered NULL will be returned (but the reference has been released, + * either way!) + * + * This helper is intended for locking net_device after it has been looked up + * using a lockless lookup helper. Lock prevents the instance from going away. + */ +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net) +{ + netdev_lock(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + +static struct net_device * +__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net) +{ + netdev_lock_ops_compat(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock_ops_compat(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + +/** + * netdev_get_by_index_lock() - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. If a valid device + * with @ifindex is found it will be returned with netdev->lock held. + * netdev_unlock() must be called to release it. + * + * Return: pointer to a device with lock held, NULL if not found. + */ +struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) { struct net_device *dev; - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, ifindex); + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock(dev, net); +} + +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock_ops_compat(dev, net); +} + +struct net_device * +netdev_xa_find_lock(struct net *net, struct net_device *dev, + unsigned long *index) +{ if (dev) + netdev_unlock(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } dev_hold(dev); - rcu_read_unlock(); - return dev; + rcu_read_unlock(); + + dev = __netdev_put_lock(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); } -EXPORT_SYMBOL(dev_get_by_index); + +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index) +{ + if (dev) + netdev_unlock_ops_compat(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock_ops_compat(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); +} + +static DEFINE_SEQLOCK(netdev_rename_lock); + +void netdev_copy_name(struct net_device *dev, char *name) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&netdev_rename_lock); + strscpy(name, dev->name, IFNAMSIZ); + } while (read_seqretry(&netdev_rename_lock, seq)); +} +EXPORT_IPV6_MOD_GPL(netdev_copy_name); /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @ifindex: the ifindex of the interface to get the name from. - * - * The use of raw_seqcount_begin() and cond_resched() before - * retrying is required as we want to give the writers a chance - * to complete when CONFIG_PREEMPT is not set. */ int netdev_get_name(struct net *net, char *name, int ifindex) { struct net_device *dev; - unsigned int seq; + int ret; -retry: - seq = raw_seqcount_begin(&devnet_rename_seq); rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { - rcu_read_unlock(); - return -ENODEV; + ret = -ENODEV; + goto out; } - strcpy(name, dev->name); + netdev_copy_name(dev, name); + + ret = 0; +out: rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) { - cond_resched(); - goto retry; - } + return ret; +} - return 0; +static bool dev_addr_cmp(struct net_device *dev, unsigned short type, + const char *ha) +{ + return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); } /** @@ -841,7 +1206,7 @@ retry: * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. - * The caller must hold RCU or RTNL. + * The caller must hold RCU. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * @@ -853,26 +1218,38 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, struct net_device *dev; for_each_netdev_rcu(net, dev) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); -struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) +/** + * dev_getbyhwaddr() - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold + * rtnl_lock. + * + * Context: rtnl_lock() must be held. + * Return: pointer to the net_device, or NULL if not found + */ +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *ha) { struct net_device *dev; ASSERT_RTNL(); for_each_netdev(net, dev) - if (dev->type == type) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } -EXPORT_SYMBOL(__dev_getfirstbyhwtype); +EXPORT_SYMBOL(dev_getbyhwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { @@ -891,51 +1268,52 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * dev_get_by_flags_rcu - find any device with given flags - * @net: the applicable net namespace - * @if_flags: IFF_* values - * @mask: bitmask of bits in if_flags to check + * netdev_get_by_flags_rcu - find any device with given flags + * @net: the applicable net namespace + * @tracker: tracking object for the acquired reference + * @if_flags: IFF_* values + * @mask: bitmask of bits in if_flags to check * - * Search for any interface with the given flags. Returns NULL if a device - * is not found or a pointer to the device. Must be called inside - * rcu_read_lock(), and result refcount is unchanged. + * Search for any interface with the given flags. + * + * Context: rcu_read_lock() must be held. + * Returns: NULL if a device is not found or a pointer to the device. */ - -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, - unsigned short mask) +struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, + unsigned short if_flags, unsigned short mask) { - struct net_device *dev, *ret; + struct net_device *dev; - ret = NULL; for_each_netdev_rcu(net, dev) { - if (((dev->flags ^ if_flags) & mask) == 0) { - ret = dev; - break; + if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) { + netdev_hold(dev, tracker, GFP_ATOMIC); + return dev; } } - return ret; + + return NULL; } -EXPORT_SYMBOL(dev_get_by_flags_rcu); +EXPORT_IPV6_MOD(netdev_get_by_flags_rcu); /** * dev_valid_name - check if name is okay for network device * @name: name string * * Network device names need to be valid file names to - * to allow sysfs to work. We also disallow any kind of + * allow sysfs to work. We also disallow any kind of * whitespace. */ bool dev_valid_name(const char *name) { if (*name == '\0') return false; - if (strlen(name) >= IFNAMSIZ) + if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) return false; if (!strcmp(name, ".") || !strcmp(name, "..")) return false; while (*name) { - if (*name == '/' || isspace(*name)) + if (*name == '/' || *name == ':' || isspace(*name)) return false; name++; } @@ -947,7 +1325,7 @@ EXPORT_SYMBOL(dev_valid_name); * __dev_alloc_name - allocate a name for a device * @net: network namespace to allocate the device name in * @name: name format string - * @buf: scratch buffer and result name string + * @res: result name string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses @@ -958,55 +1336,79 @@ EXPORT_SYMBOL(dev_valid_name); * Returns the number of the unit assigned or a negative errno code. */ -static int __dev_alloc_name(struct net *net, const char *name, char *buf) +static int __dev_alloc_name(struct net *net, const char *name, char *res) { int i = 0; const char *p; const int max_netdevices = 8*PAGE_SIZE; unsigned long *inuse; struct net_device *d; + char buf[IFNAMSIZ]; - p = strnchr(name, IFNAMSIZ-1, '%'); - if (p) { - /* - * Verify the string as this thing may have come from - * the user. There must be either one "%d" and no other "%" - * characters. - */ - if (p[1] != 'd' || strchr(p + 2, '%')) - return -EINVAL; + /* Verify the string as this thing may have come from the user. + * There must be one "%d" and no other "%" characters. + */ + p = strchr(name, '%'); + if (!p || p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; - /* Use one page as a bit array of possible slots */ - inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); - if (!inuse) - return -ENOMEM; + /* Use one page as a bit array of possible slots */ + inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC); + if (!inuse) + return -ENOMEM; - for_each_netdev(net, d) { - if (!sscanf(d->name, name, &i)) + for_each_netdev(net, d) { + struct netdev_name_node *name_node; + + netdev_for_each_altname(d, name_node) { + if (!sscanf(name_node->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; - /* avoid cases where sscanf is not exact inverse of printf */ + /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); - if (!strncmp(buf, d->name, IFNAMSIZ)) - set_bit(i, inuse); + if (!strncmp(buf, name_node->name, IFNAMSIZ)) + __set_bit(i, inuse); } + if (!sscanf(d->name, name, &i)) + continue; + if (i < 0 || i >= max_netdevices) + continue; - i = find_first_zero_bit(inuse, max_netdevices); - free_page((unsigned long) inuse); + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, IFNAMSIZ, name, i); + if (!strncmp(buf, d->name, IFNAMSIZ)) + __set_bit(i, inuse); } - if (buf != name) - snprintf(buf, IFNAMSIZ, name, i); - if (!__dev_get_by_name(net, buf)) - return i; + i = find_first_zero_bit(inuse, max_netdevices); + bitmap_free(inuse); + if (i == max_netdevices) + return -ENFILE; - /* It is possible to run out of possible slots - * when the name is long and there isn't enough space left - * for the digits, or if all bits are used. - */ - return -ENFILE; + /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ + strscpy(buf, name, IFNAMSIZ); + snprintf(res, IFNAMSIZ, buf, i); + return i; +} + +/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */ +static int dev_prep_valid_name(struct net *net, struct net_device *dev, + const char *want_name, char *out_name, + int dup_errno) +{ + if (!dev_valid_name(want_name)) + return -EINVAL; + + if (strchr(want_name, '%')) + return __dev_alloc_name(net, want_name, out_name); + + if (netdev_name_in_use(net, want_name)) + return -dup_errno; + if (out_name != want_name) + strscpy(out_name, want_name, IFNAMSIZ); + return 0; } /** @@ -1025,107 +1427,65 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) int dev_alloc_name(struct net_device *dev, const char *name) { - char buf[IFNAMSIZ]; - struct net *net; - int ret; - - BUG_ON(!dev_net(dev)); - net = dev_net(dev); - ret = __dev_alloc_name(net, name, buf); - if (ret >= 0) - strlcpy(dev->name, buf, IFNAMSIZ); - return ret; + return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE); } EXPORT_SYMBOL(dev_alloc_name); -static int dev_alloc_name_ns(struct net *net, - struct net_device *dev, - const char *name) -{ - char buf[IFNAMSIZ]; - int ret; - - ret = __dev_alloc_name(net, name, buf); - if (ret >= 0) - strlcpy(dev->name, buf, IFNAMSIZ); - return ret; -} - -static int dev_get_valid_name(struct net *net, - struct net_device *dev, +static int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { - BUG_ON(!net); - - if (!dev_valid_name(name)) - return -EINVAL; - - if (strchr(name, '%')) - return dev_alloc_name_ns(net, dev, name); - else if (__dev_get_by_name(net, name)) - return -EEXIST; - else if (dev->name != name) - strlcpy(dev->name, name, IFNAMSIZ); + int ret; - return 0; + ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST); + return ret < 0 ? ret : 0; } -/** - * dev_change_name - change name of a device - * @dev: device - * @newname: name (or format string) must be at least IFNAMSIZ - * - * Change name of a device, can pass format strings "eth%d". - * for wildcarding. - */ -int dev_change_name(struct net_device *dev, const char *newname) +int netif_change_name(struct net_device *dev, const char *newname) { + struct net *net = dev_net(dev); + unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; - struct net *net; - - ASSERT_RTNL(); - BUG_ON(!dev_net(dev)); - net = dev_net(dev); - if (dev->flags & IFF_UP) - return -EBUSY; + ASSERT_RTNL_NET(net); - write_seqcount_begin(&devnet_rename_seq); - - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { - write_seqcount_end(&devnet_rename_seq); + if (!strncmp(newname, dev->name, IFNAMSIZ)) return 0; - } memcpy(oldname, dev->name, IFNAMSIZ); + write_seqlock_bh(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); - if (err < 0) { - write_seqcount_end(&devnet_rename_seq); + write_sequnlock_bh(&netdev_rename_lock); + + if (err < 0) return err; - } + + if (oldname[0] && !strchr(oldname, '%')) + netdev_info(dev, "renamed from %s%s\n", oldname, + dev->flags & IFF_UP ? " (while UP)" : ""); + + old_assign_type = dev->name_assign_type; + WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { + write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); - write_seqcount_end(&devnet_rename_seq); + write_sequnlock_bh(&netdev_rename_lock); + WRITE_ONCE(dev->name_assign_type, old_assign_type); return ret; } - write_seqcount_end(&devnet_rename_seq); + netdev_adjacent_rename_links(dev, oldname); - write_lock_bh(&dev_base_lock); - hlist_del_rcu(&dev->name_hlist); - write_unlock_bh(&dev_base_lock); + netdev_name_node_del(dev->name_node); - synchronize_rcu(); + synchronize_net(); - write_lock_bh(&dev_base_lock); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); - write_unlock_bh(&dev_base_lock); + netdev_name_node_add(net, dev->name_node); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); @@ -1134,50 +1494,71 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; - write_seqcount_begin(&devnet_rename_seq); + write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); + write_sequnlock_bh(&netdev_rename_lock); + memcpy(oldname, newname, IFNAMSIZ); + WRITE_ONCE(dev->name_assign_type, old_assign_type); + old_assign_type = NET_NAME_RENAMED; goto rollback; } else { - pr_err("%s: name change rollback failed: %d\n", - dev->name, ret); + netdev_err(dev, "name change rollback failed: %d\n", + ret); } } return err; } -/** - * dev_set_alias - change ifalias of a device - * @dev: device - * @alias: name up to IFALIASZ - * @len: limit of bytes to copy from info - * - * Set ifalias for a device, - */ -int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +int netif_set_alias(struct net_device *dev, const char *alias, size_t len) { - char *new_ifalias; - - ASSERT_RTNL(); + struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; - if (!len) { - kfree(dev->ifalias); - dev->ifalias = NULL; - return 0; + if (len) { + new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); + if (!new_alias) + return -ENOMEM; + + memcpy(new_alias->ifalias, alias, len); + new_alias->ifalias[len] = 0; } - new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); - if (!new_ifalias) - return -ENOMEM; - dev->ifalias = new_ifalias; + mutex_lock(&ifalias_mutex); + new_alias = rcu_replace_pointer(dev->ifalias, new_alias, + mutex_is_locked(&ifalias_mutex)); + mutex_unlock(&ifalias_mutex); + + if (new_alias) + kfree_rcu(new_alias, rcuhead); - strlcpy(dev->ifalias, alias, len+1); return len; } +/** + * dev_get_alias - get ifalias of a device + * @dev: device + * @name: buffer to store name of ifalias + * @len: size of buffer + * + * get ifalias for a device. Caller must make sure dev cannot go + * away, e.g. rcu read lock or own a reference count to device. + */ +int dev_get_alias(const struct net_device *dev, char *name, size_t len) +{ + const struct dev_ifalias *alias; + int ret = 0; + + rcu_read_lock(); + alias = rcu_dereference(dev->ifalias); + if (alias) + ret = snprintf(name, len, "%s", alias->ifalias); + rcu_read_unlock(); + + return ret; +} /** * netdev_features_change - device changes features @@ -1191,26 +1572,43 @@ void netdev_features_change(struct net_device *dev) } EXPORT_SYMBOL(netdev_features_change); +void netif_state_change(struct net_device *dev) +{ + netdev_ops_assert_locked_or_invisible(dev); + + if (dev->flags & IFF_UP) { + struct netdev_notifier_change_info change_info = { + .info.dev = dev, + }; + + call_netdevice_notifiers_info(NETDEV_CHANGE, + &change_info.info); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); + } +} + /** - * netdev_state_change - device changes state - * @dev: device to cause notification + * __netdev_notify_peers - notify network peers about existence of @dev, + * to be called when rtnl lock is already held. + * @dev: network device * - * Called to indicate a device has changed state. This function calls - * the notifier chains for netdev_chain and sends a NEWLINK message - * to the routing socket. + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. */ -void netdev_state_change(struct net_device *dev) +void __netdev_notify_peers(struct net_device *dev) { - if (dev->flags & IFF_UP) { - call_netdevice_notifiers(NETDEV_CHANGE, dev); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); - } + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); } -EXPORT_SYMBOL(netdev_state_change); +EXPORT_SYMBOL(__netdev_notify_peers); /** - * netdev_notify_peers - notify network peers about existence of @dev - * @dev: network device + * netdev_notify_peers - notify network peers about existence of @dev + * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when @@ -1221,47 +1619,75 @@ EXPORT_SYMBOL(netdev_state_change); void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); -static int __dev_open(struct net_device *dev) +static int napi_threaded_poll(void *data); + +static int napi_kthread_create(struct napi_struct *n) +{ + int err = 0; + + /* Create and wake up the kthread once to put it in + * TASK_INTERRUPTIBLE mode to avoid the blocked task + * warning and work with loadavg. + */ + n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", + n->dev->name, n->napi_id); + if (IS_ERR(n->thread)) { + err = PTR_ERR(n->thread); + pr_err("kthread_run failed with err %d\n", err); + n->thread = NULL; + } + + return err; +} + +static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; ASSERT_RTNL(); + dev_addr_check(dev); - if (!netif_device_present(dev)) - return -ENODEV; + if (!netif_device_present(dev)) { + /* may be detached because parent is runtime-suspended */ + if (dev->dev.parent) + pm_runtime_resume(dev->dev.parent); + if (!netif_device_present(dev)) + return -ENODEV; + } /* Block netpoll from trying to do any rx path servicing. * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ - netpoll_rx_disable(dev); + netpoll_poll_disable(dev); - ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; set_bit(__LINK_STATE_START, &dev->state); + netdev_ops_assert_locked(dev); + if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); - netpoll_rx_enable(dev); + netpoll_poll_enable(dev); if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { - dev->flags |= IFF_UP; - net_dmaengine_get(); + netif_set_up(dev, true); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1270,44 +1696,34 @@ static int __dev_open(struct net_device *dev) return ret; } -/** - * dev_open - prepare an interface for use. - * @dev: device to open - * - * Takes a device from down to up state. The device's private open - * function is invoked and then the multicast lists are loaded. Finally - * the device is moved into the up state and a %NETDEV_UP message is - * sent to the netdev notifier chain. - * - * Calling this function on an active interface is a nop. On a failure - * a negative errno code is returned. - */ -int dev_open(struct net_device *dev) +int netif_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; - ret = __dev_open(dev); + ret = __dev_open(dev, extack); if (ret < 0) return ret; - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; } -EXPORT_SYMBOL(dev_open); -static int __dev_close_many(struct list_head *head) +static void __dev_close_many(struct list_head *head) { struct net_device *dev; ASSERT_RTNL(); might_sleep(); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { + /* Temporarily disable netpoll until the interface is down */ + netpoll_poll_disable(dev); + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); @@ -1318,12 +1734,12 @@ static int __dev_close_many(struct list_head *head) * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* @@ -1333,243 +1749,524 @@ static int __dev_close_many(struct list_head *head) * We allow it to be called even after a DETACH hot-plug * event. */ + + netdev_ops_assert_locked(dev); + if (ops->ndo_stop) ops->ndo_stop(dev); - dev->flags &= ~IFF_UP; - net_dmaengine_put(); + netif_set_up(dev, false); + netpoll_poll_enable(dev); } - - return 0; } -static int __dev_close(struct net_device *dev) +static void __dev_close(struct net_device *dev) { - int retval; LIST_HEAD(single); - /* Temporarily disable netpoll until the interface is down */ - netpoll_rx_disable(dev); - - list_add(&dev->unreg_list, &single); - retval = __dev_close_many(&single); + list_add(&dev->close_list, &single); + __dev_close_many(&single); list_del(&single); - - netpoll_rx_enable(dev); - return retval; } -static int dev_close_many(struct list_head *head) +void netif_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; - LIST_HEAD(tmp_list); - list_for_each_entry_safe(dev, tmp, head, unreg_list) + /* Remove the devices that don't need to be closed */ + list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) - list_move(&dev->unreg_list, &tmp_list); + list_del_init(&dev->close_list); __dev_close_many(head); - list_for_each_entry(dev, head, unreg_list) { - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + list_for_each_entry_safe(dev, tmp, head, close_list) { + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_DOWN, dev); + if (unlink) + list_del_init(&dev->close_list); } - - /* rollback_registered_many needs the complete original list */ - list_splice(&tmp_list, head); - return 0; } +EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL"); -/** - * dev_close - shutdown an interface. - * @dev: device to shutdown - * - * This function moves an active device into down state. A - * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device - * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier - * chain. - */ -int dev_close(struct net_device *dev) +void netif_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); - /* Block netpoll rx while the interface is going down */ - netpoll_rx_disable(dev); - - list_add(&dev->unreg_list, &single); - dev_close_many(&single); + list_add(&dev->close_list, &single); + netif_close_many(&single, true); list_del(&single); - - netpoll_rx_enable(dev); } - return 0; } -EXPORT_SYMBOL(dev_close); +EXPORT_SYMBOL(netif_close); + +void netif_disable_lro(struct net_device *dev) +{ + struct net_device *lower_dev; + struct list_head *iter; + + dev->wanted_features &= ~NETIF_F_LRO; + netdev_update_features(dev); + + if (unlikely(dev->features & NETIF_F_LRO)) + netdev_WARN(dev, "failed to disable LRO!\n"); + netdev_for_each_lower_dev(dev, lower_dev, iter) { + netdev_lock_ops(lower_dev); + netif_disable_lro(lower_dev); + netdev_unlock_ops(lower_dev); + } +} +EXPORT_IPV6_MOD(netif_disable_lro); /** - * dev_disable_lro - disable Large Receive Offload on a device + * dev_disable_gro_hw - disable HW Generic Receive Offload on a device * @dev: device * - * Disable Large Receive Offload (LRO) on a net device. Must be - * called under RTNL. This is needed if received packets may be - * forwarded to another interface. + * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be + * called under RTNL. This is needed if Generic XDP is installed on + * the device. */ -void dev_disable_lro(struct net_device *dev) +static void dev_disable_gro_hw(struct net_device *dev) { - /* - * If we're trying to disable lro on a vlan device - * use the underlying physical device instead - */ - if (is_vlan_dev(dev)) - dev = vlan_dev_real_dev(dev); - - dev->wanted_features &= ~NETIF_F_LRO; + dev->wanted_features &= ~NETIF_F_GRO_HW; netdev_update_features(dev); - if (unlikely(dev->features & NETIF_F_LRO)) - netdev_WARN(dev, "failed to disable LRO!\n"); + if (unlikely(dev->features & NETIF_F_GRO_HW)) + netdev_WARN(dev, "failed to disable GRO_HW!\n"); +} + +const char *netdev_cmd_to_name(enum netdev_cmd cmd) +{ +#define N(val) \ + case NETDEV_##val: \ + return "NETDEV_" __stringify(val); + switch (cmd) { + N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) + N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) + N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) + N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) + N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) + N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) + N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) + N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) + N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) + N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) + N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) + N(XDP_FEAT_CHANGE) + } +#undef N + return "UNKNOWN_NETDEV_EVENT"; } -EXPORT_SYMBOL(dev_disable_lro); +EXPORT_SYMBOL_GPL(netdev_cmd_to_name); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - netdev_notifier_info_init(&info, dev); return nb->notifier_call(nb, val, &info); } +static int call_netdevice_register_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + int err; + + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + return err; + + if (!(dev->flags & IFF_UP)) + return 0; + + call_netdevice_notifier(nb, NETDEV_UP, dev); + return 0; +} + +static void call_netdevice_unregister_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); + } + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); +} + +static int call_netdevice_register_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + netdev_lock_ops(dev); + err = call_netdevice_register_notifiers(nb, dev); + netdev_unlock_ops(dev); + if (err) + goto rollback; + } + return 0; + +rollback: + for_each_netdev_continue_reverse(net, dev) + call_netdevice_unregister_notifiers(nb, dev); + return err; +} + +static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + + for_each_netdev(net, dev) + call_netdevice_unregister_notifiers(nb, dev); +} + static int dev_boot_phase = 1; /** - * register_netdevice_notifier - register a network notifier block - * @nb: notifier + * register_netdevice_notifier - register a network notifier block + * @nb: notifier * - * Register a notifier to be called when network device events occur. - * The notifier passed is linked into the kernel structures and must - * not be reused until it has been unregistered. A negative errno code - * is returned on a failure. + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. * - * When registered all registration and up events are replayed - * to the new notifier to allow device to have a race free - * view of the network device list. + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. */ int register_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; - struct net_device *last; struct net *net; int err; + /* Close race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); + + /* When RTNL is removed, we need protection for netdev_chain. */ rtnl_lock(); + err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { - for_each_netdev(net, dev) { - err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); - err = notifier_to_errno(err); - if (err) - goto rollback; - - if (!(dev->flags & IFF_UP)) - continue; - - call_netdevice_notifier(nb, NETDEV_UP, dev); - } + __rtnl_net_lock(net); + err = call_netdevice_register_net_notifiers(nb, net); + __rtnl_net_unlock(net); + if (err) + goto rollback; } unlock: rtnl_unlock(); + up_write(&pernet_ops_rwsem); return err; rollback: - last = dev; - for_each_net(net) { - for_each_netdev(net, dev) { - if (dev == last) - goto outroll; - - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } + for_each_net_continue_reverse(net) { + __rtnl_net_lock(net); + call_netdevice_unregister_net_notifiers(nb, net); + __rtnl_net_unlock(net); } -outroll: raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } EXPORT_SYMBOL(register_netdevice_notifier); /** - * unregister_netdevice_notifier - unregister a network notifier block - * @nb: notifier + * unregister_netdevice_notifier - unregister a network notifier block + * @nb: notifier * - * Unregister a notifier previously registered by - * register_netdevice_notifier(). The notifier is unlinked into the - * kernel structures and may then be reused. A negative errno code - * is returned on a failure. + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. * - * After unregistering unregister and down device events are synthesized - * for all devices on the device list to the removed notifier to remove - * the need for special case cleanup code. + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. */ int unregister_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; struct net *net; int err; + /* Close race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) goto unlock; for_each_net(net) { - for_each_netdev(net, dev) { - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } + __rtnl_net_lock(net); + call_netdevice_unregister_net_notifiers(nb, net); + __rtnl_net_unlock(net); } + unlock: rtnl_unlock(); + up_write(&pernet_ops_rwsem); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier); +static int __register_netdevice_notifier_net(struct net *net, + struct notifier_block *nb, + bool ignore_call_fail) +{ + int err; + + err = raw_notifier_chain_register(&net->netdev_chain, nb); + if (err) + return err; + if (dev_boot_phase) + return 0; + + err = call_netdevice_register_net_notifiers(nb, net); + if (err && !ignore_call_fail) + goto chain_unregister; + + return 0; + +chain_unregister: + raw_notifier_chain_unregister(&net->netdev_chain, nb); + return err; +} + +static int __unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + err = raw_notifier_chain_unregister(&net->netdev_chain, nb); + if (err) + return err; + + call_netdevice_unregister_net_notifiers(nb, net); + return 0; +} + +/** + * register_netdevice_notifier_net - register a per-netns network notifier block + * @net: network namespace + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) +{ + int err; + + rtnl_net_lock(net); + err = __register_netdevice_notifier_net(net, nb, false); + rtnl_net_unlock(net); + + return err; +} +EXPORT_SYMBOL(register_netdevice_notifier_net); + +/** + * unregister_netdevice_notifier_net - unregister a per-netns + * network notifier block + * @net: network namespace + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier_net(). The notifier is unlinked from the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. + */ + +int unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + rtnl_net_lock(net); + err = __unregister_netdevice_notifier_net(net, nb); + rtnl_net_unlock(net); + + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_net); + +static void __move_netdevice_notifier_net(struct net *src_net, + struct net *dst_net, + struct notifier_block *nb) +{ + __unregister_netdevice_notifier_net(src_net, nb); + __register_netdevice_notifier_net(dst_net, nb, true); +} + +static void rtnl_net_dev_lock(struct net_device *dev) +{ + bool again; + + do { + struct net *net; + + again = false; + + /* netns might be being dismantled. */ + rcu_read_lock(); + net = dev_net_rcu(dev); + net_passive_inc(net); + rcu_read_unlock(); + + rtnl_net_lock(net); + +#ifdef CONFIG_NET_NS + /* dev might have been moved to another netns. */ + if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) { + rtnl_net_unlock(net); + net_passive_dec(net); + again = true; + } +#endif + } while (again); +} + +static void rtnl_net_dev_unlock(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + rtnl_net_unlock(net); + net_passive_dec(net); +} + +int register_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn) +{ + int err; + + rtnl_net_dev_lock(dev); + err = __register_netdevice_notifier_net(dev_net(dev), nb, false); + if (!err) { + nn->nb = nb; + list_add(&nn->list, &dev->net_notifier_list); + } + rtnl_net_dev_unlock(dev); + + return err; +} +EXPORT_SYMBOL(register_netdevice_notifier_dev_net); + +int unregister_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn) +{ + int err; + + rtnl_net_dev_lock(dev); + list_del(&nn->list); + err = __unregister_netdevice_notifier_net(dev_net(dev), nb); + rtnl_net_dev_unlock(dev); + + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); + +static void move_netdevice_notifiers_dev_net(struct net_device *dev, + struct net *net) +{ + struct netdev_net_notifier *nn; + + list_for_each_entry(nn, &dev->net_notifier_list, list) + __move_netdevice_notifier_net(dev_net(dev), net, nn->nb); +} + /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function - * @dev: net_device pointer passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ -int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, +int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { + struct net *net = dev_net(info->dev); + int ret; + ASSERT_RTNL(); - netdev_notifier_info_init(info, dev); + + /* Run per-netns notifier block chain first, then run the global one. + * Hopefully, one day, the global one is going to be removed after + * all notifier block registrators get converted to be per-netns. + */ + ret = raw_notifier_call_chain(&net->netdev_chain, val, info); + if (ret & NOTIFY_STOP_MASK) + return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } -EXPORT_SYMBOL(call_netdevice_notifiers_info); + +/** + * call_netdevice_notifiers_info_robust - call per-netns notifier blocks + * for and rollback on error + * @val_up: value passed unmodified to notifier function + * @val_down: value passed unmodified to the notifier function when + * recovering from an error on @val_up + * @info: notifier information data + * + * Call all per-netns network notifier blocks, but not notifier blocks on + * the global notifier chain. Parameters and return value are as for + * raw_notifier_call_chain_robust(). + */ + +static int +call_netdevice_notifiers_info_robust(unsigned long val_up, + unsigned long val_down, + struct netdev_notifier_info *info) +{ + struct net *net = dev_net(info->dev); + + ASSERT_RTNL(); + + return raw_notifier_call_chain_robust(&net->netdev_chain, + val_up, val_down, info); +} + +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_info info = { + .dev = dev, + .extack = extack, + }; + + return call_netdevice_notifiers_info(val, &info); +} /** * call_netdevice_notifiers - call all network notifier blocks @@ -1582,81 +2279,160 @@ EXPORT_SYMBOL(call_netdevice_notifiers_info); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; - - return call_netdevice_notifiers_info(val, dev, &info); + return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); -static struct static_key netstamp_needed __read_mostly; -#ifdef HAVE_JUMP_LABEL -/* We are not allowed to call static_key_slow_dec() from irq context - * If net_disable_timestamp() is called from irq context, defer the - * static_key_slow_dec() calls. +/** + * call_netdevice_notifiers_mtu - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * @arg: additional u32 argument passed to the notifier function + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). */ +static int call_netdevice_notifiers_mtu(unsigned long val, + struct net_device *dev, u32 arg) +{ + struct netdev_notifier_info_ext info = { + .info.dev = dev, + .ext.mtu = arg, + }; + + BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); + + return call_netdevice_notifiers_info(val, &info.info); +} + +#ifdef CONFIG_NET_INGRESS +static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); + +void net_inc_ingress_queue(void) +{ + static_branch_inc(&ingress_needed_key); +} +EXPORT_SYMBOL_GPL(net_inc_ingress_queue); + +void net_dec_ingress_queue(void) +{ + static_branch_dec(&ingress_needed_key); +} +EXPORT_SYMBOL_GPL(net_dec_ingress_queue); +#endif + +#ifdef CONFIG_NET_EGRESS +static DEFINE_STATIC_KEY_FALSE(egress_needed_key); + +void net_inc_egress_queue(void) +{ + static_branch_inc(&egress_needed_key); +} +EXPORT_SYMBOL_GPL(net_inc_egress_queue); + +void net_dec_egress_queue(void) +{ + static_branch_dec(&egress_needed_key); +} +EXPORT_SYMBOL_GPL(net_dec_egress_queue); +#endif + +#ifdef CONFIG_NET_CLS_ACT +DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); +EXPORT_SYMBOL(tcf_sw_enabled_key); +#endif + +DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); +EXPORT_SYMBOL(netstamp_needed_key); +#ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; +static atomic_t netstamp_wanted; +static void netstamp_clear(struct work_struct *work) +{ + int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + int wanted; + + wanted = atomic_add_return(deferred, &netstamp_wanted); + if (wanted > 0) + static_branch_enable(&netstamp_needed_key); + else + static_branch_disable(&netstamp_needed_key); +} +static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { -#ifdef HAVE_JUMP_LABEL - int deferred = atomic_xchg(&netstamp_needed_deferred, 0); +#ifdef CONFIG_JUMP_LABEL + int wanted = atomic_read(&netstamp_wanted); - if (deferred) { - while (--deferred) - static_key_slow_dec(&netstamp_needed); - return; + while (wanted > 0) { + if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1)) + return; } + atomic_inc(&netstamp_needed_deferred); + schedule_work(&netstamp_work); +#else + static_branch_inc(&netstamp_needed_key); #endif - static_key_slow_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { -#ifdef HAVE_JUMP_LABEL - if (in_interrupt()) { - atomic_inc(&netstamp_needed_deferred); - return; +#ifdef CONFIG_JUMP_LABEL + int wanted = atomic_read(&netstamp_wanted); + + while (wanted > 1) { + if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1)) + return; } + atomic_dec(&netstamp_needed_deferred); + schedule_work(&netstamp_work); +#else + static_branch_dec(&netstamp_needed_key); #endif - static_key_slow_dec(&netstamp_needed); } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { - skb->tstamp.tv64 = 0; - if (static_key_false(&netstamp_needed)) - __net_timestamp(skb); + skb->tstamp = 0; + skb->tstamp_type = SKB_CLOCK_REALTIME; + if (static_branch_unlikely(&netstamp_needed_key)) + skb->tstamp = ktime_get_real(); } -#define net_timestamp_check(COND, SKB) \ - if (static_key_false(&netstamp_needed)) { \ - if ((COND) && !(SKB)->tstamp.tv64) \ - __net_timestamp(SKB); \ - } \ +#define net_timestamp_check(COND, SKB) \ + if (static_branch_unlikely(&netstamp_needed_key)) { \ + if ((COND) && !(SKB)->tstamp) \ + (SKB)->tstamp = ktime_get_real(); \ + } \ -static inline bool is_skb_forwardable(struct net_device *dev, - struct sk_buff *skb) +bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { - unsigned int len; + return __is_skb_forwardable(dev, skb, true); +} +EXPORT_SYMBOL_GPL(is_skb_forwardable); - if (!(dev->flags & IFF_UP)) - return false; +static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, + bool check_mtu) +{ + int ret = ____dev_forward_skb(dev, skb, check_mtu); - len = dev->mtu + dev->hard_header_len + VLAN_HLEN; - if (skb->len <= len) - return true; + if (likely(!ret)) { + skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } - /* if TSO is enabled, we don't care about the length as the packet - * could be forwarded without being segmented before - */ - if (skb_is_gso(skb)) - return true; + return ret; +} - return false; +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, true); } +EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif @@ -1678,41 +2454,43 @@ static inline bool is_skb_forwardable(struct net_device *dev, */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, GFP_ATOMIC)) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - } - - if (unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - skb_scrub_packet(skb); - skb->protocol = eth_type_trans(skb, dev); - - /* eth_type_trans() can set pkt_type. - * clear pkt_type _after_ calling eth_type_trans() - */ - skb->pkt_type = PACKET_HOST; - - return netif_rx(skb); + return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); -static inline int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev, - struct net_device *orig_dev) +int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); +} + +static int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; - atomic_inc(&skb->users); + refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline void deliver_ptype_list_skb(struct sk_buff *skb, + struct packet_type **pt, + struct net_device *orig_dev, + __be16 type, + struct list_head *ptype_list) +{ + struct packet_type *ptype, *pt_prev = *pt; + + list_for_each_entry_rcu(ptype, ptype_list, list) { + if (ptype->type != type) + continue; + if (unlikely(pt_prev)) + deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + *pt = pt_prev; +} + static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) @@ -1726,59 +2504,93 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) return false; } +/** + * dev_nit_active_rcu - return true if any network interface taps are in use + * + * The caller must hold the RCU lock + * + * @dev: network device to check for the presence of taps + */ +bool dev_nit_active_rcu(const struct net_device *dev) +{ + /* Callers may hold either RCU or RCU BH lock */ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + return !list_empty(&dev_net(dev)->ptype_all) || + !list_empty(&dev->ptype_all); +} +EXPORT_SYMBOL_GPL(dev_nit_active_rcu); + /* * Support routine. Sends outgoing frames to any network * taps currently in use. */ -static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { - struct packet_type *ptype; + struct packet_type *ptype, *pt_prev = NULL; + struct list_head *ptype_list; struct sk_buff *skb2 = NULL; - struct packet_type *pt_prev = NULL; rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { + ptype_list = &dev_net_rcu(dev)->ptype_all; +again: + list_for_each_entry_rcu(ptype, ptype_list, list) { + if (READ_ONCE(ptype->ignore_outgoing)) + continue; + /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ - if ((ptype->dev == dev || !ptype->dev) && - (!skb_loop_sk(ptype, skb))) { - if (pt_prev) { - deliver_skb(skb2, pt_prev, skb->dev); - pt_prev = ptype; - continue; - } + if (skb_loop_sk(ptype, skb)) + continue; - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - break; + if (unlikely(pt_prev)) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } - net_timestamp_set(skb2); + /* need to clone skb, done only once */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + goto out_unlock; - /* skb->nh should be correctly - set by sender, so that the second statement is - just protection against buggy protocols. - */ - skb_reset_mac_header(skb2); - - if (skb_network_header(skb2) < skb2->data || - skb_network_header(skb2) > skb_tail_pointer(skb2)) { - net_crit_ratelimited("protocol %04x is buggy, dev %s\n", - ntohs(skb2->protocol), - dev->name); - skb_reset_network_header(skb2); - } + net_timestamp_set(skb2); - skb2->transport_header = skb2->network_header; - skb2->pkt_type = PACKET_OUTGOING; - pt_prev = ptype; + /* skb->nh should be correctly + * set by sender, so that the second statement is + * just protection against buggy protocols. + */ + skb_reset_mac_header(skb2); + + if (skb_network_header(skb2) < skb2->data || + skb_network_header(skb2) > skb_tail_pointer(skb2)) { + net_crit_ratelimited("protocol %04x is buggy, dev %s\n", + ntohs(skb2->protocol), + dev->name); + skb_reset_network_header(skb2); } + + skb2->transport_header = skb2->network_header; + skb2->pkt_type = PACKET_OUTGOING; + pt_prev = ptype; + } + + if (ptype_list != &dev->ptype_all) { + ptype_list = &dev->ptype_all; + goto again; + } +out_unlock: + if (pt_prev) { + if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + else + kfree_skb(skb2); } - if (pt_prev) - pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); /** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change @@ -1800,7 +2612,7 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) /* If TC0 is invalidated disable TC mapping */ if (tc->offset + tc->count > txq) { - pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); + netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); dev->num_tc = 0; return; } @@ -1811,79 +2623,152 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) tc = &dev->tc_to_txq[q]; if (tc->offset + tc->count > txq) { - pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", - i, q); + netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", + i, q); netdev_set_prio_tc_map(dev, i, 0); } } } +int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) +{ + if (dev->num_tc) { + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + int i; + + /* walk through the TCs and see if it falls into any of them */ + for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { + if ((txq - tc->offset) < tc->count) + return i; + } + + /* didn't find it, just return -1 to indicate no match */ + return -1; + } + + return 0; +} +EXPORT_SYMBOL(netdev_txq_to_tc); + #ifdef CONFIG_XPS +static struct static_key xps_needed __read_mostly; +static struct static_key xps_rxqs_needed __read_mostly; static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) -static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, - int cpu, u16 index) +static bool remove_xps_queue(struct xps_dev_maps *dev_maps, + struct xps_dev_maps *old_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; - if (dev_maps) - map = xmap_dereference(dev_maps->cpu_map[cpu]); + map = xmap_dereference(dev_maps->attr_map[tci]); + if (!map) + return false; - for (pos = 0; map && pos < map->len; pos++) { - if (map->queues[pos] == index) { - if (map->len > 1) { - map->queues[pos] = map->queues[--map->len]; - } else { - RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); - kfree_rcu(map, rcu); - map = NULL; - } + for (pos = map->len; pos--;) { + if (map->queues[pos] != index) + continue; + + if (map->len > 1) { + map->queues[pos] = map->queues[--map->len]; break; } + + if (old_maps) + RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); + kfree_rcu(map, rcu); + return false; } - return map; + return true; } -static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +static bool remove_xps_queue_cpu(struct net_device *dev, + struct xps_dev_maps *dev_maps, + int cpu, u16 offset, u16 count) { - struct xps_dev_maps *dev_maps; - int cpu, i; + int num_tc = dev_maps->num_tc; bool active = false; + int tci; - mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); - - if (!dev_maps) - goto out_no_maps; + for (tci = cpu * num_tc; num_tc--; tci++) { + int i, j; - for_each_possible_cpu(cpu) { - for (i = index; i < dev->num_tx_queues; i++) { - if (!remove_xps_queue(dev_maps, cpu, i)) + for (i = count, j = offset; i--; j++) { + if (!remove_xps_queue(dev_maps, NULL, tci, j)) break; } - if (i == dev->num_tx_queues) - active = true; + + active |= i < 0; } - if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); + return active; +} + +static void reset_xps_maps(struct net_device *dev, + struct xps_dev_maps *dev_maps, + enum xps_map_type type) +{ + static_key_slow_dec_cpuslocked(&xps_needed); + if (type == XPS_RXQS) + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + + RCU_INIT_POINTER(dev->xps_maps[type], NULL); + + kfree_rcu(dev_maps, rcu); +} + +static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, + u16 offset, u16 count) +{ + struct xps_dev_maps *dev_maps; + bool active = false; + int i, j; + + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (!dev_maps) + return; + + for (j = 0; j < dev_maps->nr_ids; j++) + active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); + if (!active) + reset_xps_maps(dev, dev_maps, type); + + if (type == XPS_CPUS) { + for (i = offset + (count - 1); count--; i--) + netdev_queue_numa_node_write( + netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } +} + +static void netif_reset_xps_queues(struct net_device *dev, u16 offset, + u16 count) +{ + if (!static_key_false(&xps_needed)) + return; + + cpus_read_lock(); + mutex_lock(&xps_map_mutex); - for (i = index; i < dev->num_tx_queues; i++) - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); + if (static_key_false(&xps_rxqs_needed)) + clean_xps_maps(dev, XPS_RXQS, offset, count); + + clean_xps_maps(dev, XPS_CPUS, offset, count); -out_no_maps: mutex_unlock(&xps_map_mutex); + cpus_read_unlock(); } -static struct xps_map *expand_xps_map(struct xps_map *map, - int cpu, u16 index) +static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +{ + netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); +} + +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, + u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; @@ -1895,7 +2780,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return map; } - /* Need to add queue to this CPU's existing map */ + /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; @@ -1903,9 +2788,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map, alloc_len = map->alloc_len * 2; } - /* Need to allocate new map to store queue on this CPU's map */ - new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, - cpu_to_node(cpu)); + /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's + * map + */ + if (is_rxqs_map) + new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); + else + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, + cpu_to_node(attr_index)); if (!new_map) return NULL; @@ -1917,108 +2807,205 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return new_map; } -int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index) +/* Copy xps maps at a given index */ +static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, + struct xps_dev_maps *new_dev_maps, int index, + int tc, bool skip_tc) { - struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + int i, tci = index * dev_maps->num_tc; + struct xps_map *map; + + /* copy maps belonging to foreign traffic classes */ + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && skip_tc) + continue; + + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); + } +} + +/* Must be called under cpus_read_lock */ +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, + u16 index, enum xps_map_type type) +{ + struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; + const unsigned long *online_mask = NULL; + bool active = false, copy = false; + int i, j, tci, numa_node_id = -2; + int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; - int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); - int cpu, numa_node_id = -2; - bool active = false; + unsigned int nr_ids; + + WARN_ON_ONCE(index >= dev->num_tx_queues); + + if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ + num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (type == XPS_RXQS) { + maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); + nr_ids = dev->num_rx_queues; + } else { + maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); + if (num_possible_cpus() > 1) + online_mask = cpumask_bits(cpu_online_mask); + nr_ids = nr_cpu_ids; + } - /* allocate memory for queue storage */ - for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, mask)) - continue; + if (maps_sz < L1_CACHE_BYTES) + maps_sz = L1_CACHE_BYTES; - if (!new_dev_maps) - new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + /* The old dev_maps could be larger or smaller than the one we're + * setting up now, as dev->num_tc or nr_ids could have been updated in + * between. We could try to be smart, but let's be safe instead and only + * copy foreign traffic classes if the two map sizes match. + */ + if (dev_maps && + dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) + copy = true; + + /* allocate memory for queue storage */ + for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), + j < nr_ids;) { if (!new_dev_maps) { - mutex_unlock(&xps_map_mutex); - return -ENOMEM; + new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); + return -ENOMEM; + } + + new_dev_maps->nr_ids = nr_ids; + new_dev_maps->num_tc = num_tc; } - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : - NULL; + tci = j * num_tc + tc; + map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; - map = expand_xps_map(map, cpu, index); + map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; - RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; - for_each_possible_cpu(cpu) { - if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { - /* add queue to CPU maps */ + if (!dev_maps) { + /* Increment static keys at most once per type */ + static_key_slow_inc_cpuslocked(&xps_needed); + if (type == XPS_RXQS) + static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + } + + for (j = 0; j < nr_ids; j++) { + bool skip_tc = false; + + tci = j * num_tc + tc; + if (netif_attr_test_mask(j, mask, nr_ids) && + netif_attr_test_online(j, online_mask, nr_ids)) { + /* add tx-queue to CPU/rx-queue maps */ int pos = 0; - map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + skip_tc = true; + + map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA - if (numa_node_id == -2) - numa_node_id = cpu_to_node(cpu); - else if (numa_node_id != cpu_to_node(cpu)) - numa_node_id = -1; + if (type == XPS_CPUS) { + if (numa_node_id == -2) + numa_node_id = cpu_to_node(j); + else if (numa_node_id != cpu_to_node(j)) + numa_node_id = -1; + } #endif - } else if (dev_maps) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[cpu]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); } + if (copy) + xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, + skip_tc); } - rcu_assign_pointer(dev->xps_maps, new_dev_maps); + rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ - if (dev_maps) { - for_each_possible_cpu(cpu) { - new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); - map = xmap_dereference(dev_maps->cpu_map[cpu]); - if (map && map != new_map) - kfree_rcu(map, rcu); - } + if (!dev_maps) + goto out_no_old_maps; + + for (j = 0; j < dev_maps->nr_ids; j++) { + for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { + map = xmap_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; - kfree_rcu(dev_maps, rcu); + if (copy) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + if (map == new_map) + continue; + } + + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); + kfree_rcu(map, rcu); + } } + old_dev_maps = dev_maps; + +out_no_old_maps: dev_maps = new_dev_maps; active = true; out_no_new_maps: - /* update Tx queue numa node */ - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), - (numa_node_id >= 0) ? numa_node_id : - NUMA_NO_NODE); + if (type == XPS_CPUS) + /* update Tx queue numa node */ + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), + (numa_node_id >= 0) ? + numa_node_id : NUMA_NO_NODE); if (!dev_maps) goto out_no_maps; - /* removes queue from unused CPUs */ - for_each_possible_cpu(cpu) { - if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) - continue; + /* removes tx-queue from unused CPUs/rx-queues */ + for (j = 0; j < dev_maps->nr_ids; j++) { + tci = j * dev_maps->num_tc; + + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && + netif_attr_test_mask(j, mask, dev_maps->nr_ids) && + netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) + continue; - if (remove_xps_queue(dev_maps, cpu, index)) - active = true; + active |= remove_xps_queue(dev_maps, + copy ? old_dev_maps : NULL, + tci, index); + } } + if (old_dev_maps) + kfree_rcu(old_dev_maps, rcu); + /* free map if not active */ - if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); - } + if (!active) + reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2026,12 +3013,15 @@ out_no_maps: return 0; error: /* remove any maps that we added */ - for_each_possible_cpu(cpu) { - new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : - NULL; - if (new_map && new_map != map) - kfree(new_map); + for (j = 0; j < nr_ids; j++) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + map = copy ? + xmap_dereference(dev_maps->attr_map[tci]) : + NULL; + if (new_map && new_map != map) + kfree(new_map); + } } mutex_unlock(&xps_map_mutex); @@ -2039,23 +3029,157 @@ error: kfree(new_dev_maps); return -ENOMEM; } +EXPORT_SYMBOL_GPL(__netif_set_xps_queue); + +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) +{ + int ret; + + cpus_read_lock(); + ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); + cpus_read_unlock(); + + return ret; +} EXPORT_SYMBOL(netif_set_xps_queue); #endif +static void netdev_unbind_all_sb_channels(struct net_device *dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + + /* Unbind any subordinate channels */ + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev) + netdev_unbind_sb_channel(dev, txq->sb_dev); + } +} + +void netdev_reset_tc(struct net_device *dev) +{ +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, 0); +#endif + netdev_unbind_all_sb_channels(dev); + + /* Reset TC configuration of device */ + dev->num_tc = 0; + memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); + memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); +} +EXPORT_SYMBOL(netdev_reset_tc); + +int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) +{ + if (tc >= dev->num_tc) + return -EINVAL; + +#ifdef CONFIG_XPS + netif_reset_xps_queues(dev, offset, count); +#endif + dev->tc_to_txq[tc].count = count; + dev->tc_to_txq[tc].offset = offset; + return 0; +} +EXPORT_SYMBOL(netdev_set_tc_queue); + +int netdev_set_num_tc(struct net_device *dev, u8 num_tc) +{ + if (num_tc > TC_MAX_QUEUE) + return -EINVAL; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, 0); +#endif + netdev_unbind_all_sb_channels(dev); + + dev->num_tc = num_tc; + return 0; +} +EXPORT_SYMBOL(netdev_set_num_tc); + +void netdev_unbind_sb_channel(struct net_device *dev, + struct net_device *sb_dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(sb_dev, 0); +#endif + memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); + memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); + + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev == sb_dev) + txq->sb_dev = NULL; + } +} +EXPORT_SYMBOL(netdev_unbind_sb_channel); + +int netdev_bind_sb_channel_queue(struct net_device *dev, + struct net_device *sb_dev, + u8 tc, u16 count, u16 offset) +{ + /* Make certain the sb_dev and dev are already configured */ + if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) + return -EINVAL; + + /* We cannot hand out queues we don't have */ + if ((offset + count) > dev->real_num_tx_queues) + return -EINVAL; + + /* Record the mapping */ + sb_dev->tc_to_txq[tc].count = count; + sb_dev->tc_to_txq[tc].offset = offset; + + /* Provide a way for Tx queue to find the tc_to_txq map or + * XPS map for itself. + */ + while (count--) + netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; + + return 0; +} +EXPORT_SYMBOL(netdev_bind_sb_channel_queue); + +int netdev_set_sb_channel(struct net_device *dev, u16 channel) +{ + /* Do not use a multiqueue device to represent a subordinate channel */ + if (netif_is_multiqueue(dev)) + return -ENODEV; + + /* We allow channels 1 - 32767 to be used for subordinate channels. + * Channel 0 is meant to be "native" mode and used only to represent + * the main root device. We allow writing 0 to reset the device back + * to normal mode after being used as a subordinate channel. + */ + if (channel > S16_MAX) + return -EINVAL; + + dev->num_tc = -channel; + + return 0; +} +EXPORT_SYMBOL(netdev_set_sb_channel); + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues - * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. + * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. */ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { + bool disabling; int rc; + disabling = txq < dev->real_num_tx_queues; + if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { - ASSERT_RTNL(); + netdev_ops_assert_locked(dev); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); @@ -2065,20 +3189,27 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->num_tc) netif_setup_tc(dev, txq); - if (txq < dev->real_num_tx_queues) { + net_shaper_set_real_num_tx_queues(dev, txq); + + dev_qdisc_change_real_num_tx(dev, txq); + + dev->real_num_tx_queues = txq; + + if (disabling) { + synchronize_net(); qdisc_reset_all_tx_gt(dev, txq); #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, txq); #endif } + } else { + dev->real_num_tx_queues = txq; } - dev->real_num_tx_queues = txq; return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_RPS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device @@ -2097,7 +3228,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { - ASSERT_RTNL(); + netdev_ops_assert_locked(dev); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); @@ -2109,27 +3240,131 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); -#endif + +/** + * netif_set_real_num_queues - set actual number of RX and TX queues used + * @dev: Network device + * @txq: Actual number of TX queues + * @rxq: Actual number of RX queues + * + * Set the real number of both TX and RX queues. + * Does nothing if the number of queues is already correct. + */ +int netif_set_real_num_queues(struct net_device *dev, + unsigned int txq, unsigned int rxq) +{ + unsigned int old_rxq = dev->real_num_rx_queues; + int err; + + if (txq < 1 || txq > dev->num_tx_queues || + rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + + /* Start from increases, so the error path only does decreases - + * decreases can't fail. + */ + if (rxq > dev->real_num_rx_queues) { + err = netif_set_real_num_rx_queues(dev, rxq); + if (err) + return err; + } + if (txq > dev->real_num_tx_queues) { + err = netif_set_real_num_tx_queues(dev, txq); + if (err) + goto undo_rx; + } + if (rxq < dev->real_num_rx_queues) + WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); + if (txq < dev->real_num_tx_queues) + WARN_ON(netif_set_real_num_tx_queues(dev, txq)); + + return 0; +undo_rx: + WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); + return err; +} +EXPORT_SYMBOL(netif_set_real_num_queues); + +/** + * netif_set_tso_max_size() - set the max size of TSO frames supported + * @dev: netdev to update + * @size: max skb->len of a TSO frame + * + * Set the limit on the size of TSO super-frames the device can handle. + * Unless explicitly set the stack will assume the value of + * %GSO_LEGACY_MAX_SIZE. + */ +void netif_set_tso_max_size(struct net_device *dev, unsigned int size) +{ + dev->tso_max_size = min(GSO_MAX_SIZE, size); + if (size < READ_ONCE(dev->gso_max_size)) + netif_set_gso_max_size(dev, size); + if (size < READ_ONCE(dev->gso_ipv4_max_size)) + netif_set_gso_ipv4_max_size(dev, size); +} +EXPORT_SYMBOL(netif_set_tso_max_size); + +/** + * netif_set_tso_max_segs() - set the max number of segs supported for TSO + * @dev: netdev to update + * @segs: max number of TCP segments + * + * Set the limit on the number of TCP segments the device can generate from + * a single TSO super-frame. + * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. + */ +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) +{ + dev->tso_max_segs = segs; + if (segs < READ_ONCE(dev->gso_max_segs)) + netif_set_gso_max_segs(dev, segs); +} +EXPORT_SYMBOL(netif_set_tso_max_segs); + +/** + * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper + * @to: netdev to update + * @from: netdev from which to copy the limits + */ +void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) +{ + netif_set_tso_max_size(to, from->tso_max_size); + netif_set_tso_max_segs(to, from->tso_max_segs); +} +EXPORT_SYMBOL(netif_inherit_tso_max); /** * netif_get_num_default_rss_queues - default number of RSS queues * - * This routine should set an upper limit on the number of RSS queues - * used by default by multiqueue devices. + * Default value is the number of physical cores if there are only 1 or 2, or + * divided by 2 if there are more. */ int netif_get_num_default_rss_queues(void) { - return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); + cpumask_var_t cpus; + int cpu, count = 0; + + if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) + return 1; + + cpumask_copy(cpus, cpu_online_mask); + for_each_cpu(cpu, cpus) { + ++count; + cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); + } + free_cpumask_var(cpus); + + return count > 2 ? DIV_ROUND_UP(count, 2) : count; } EXPORT_SYMBOL(netif_get_num_default_rss_queues); -static inline void __netif_reschedule(struct Qdisc *q) +static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); - sd = &__get_cpu_var(softnet_data); + sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; @@ -2139,35 +3374,82 @@ static inline void __netif_reschedule(struct Qdisc *q) void __netif_schedule(struct Qdisc *q) { + /* If q->defer_list is not empty, at least one thread is + * in __dev_xmit_skb() before llist_del_all(&q->defer_list). + * This thread will attempt to run the queue. + */ + if (!llist_empty(&q->defer_list)) + return; + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } EXPORT_SYMBOL(__netif_schedule); -void dev_kfree_skb_irq(struct sk_buff *skb) +struct dev_kfree_skb_cb { + enum skb_drop_reason reason; +}; + +static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) { - if (atomic_dec_and_test(&skb->users)) { - struct softnet_data *sd; - unsigned long flags; + return (struct dev_kfree_skb_cb *)skb->cb; +} - local_irq_save(flags); - sd = &__get_cpu_var(softnet_data); - skb->next = sd->completion_queue; - sd->completion_queue = skb; - raise_softirq_irqoff(NET_TX_SOFTIRQ); - local_irq_restore(flags); +void netif_schedule_queue(struct netdev_queue *txq) +{ + rcu_read_lock(); + if (!netif_xmit_stopped(txq)) { + struct Qdisc *q = rcu_dereference(txq->qdisc); + + __netif_schedule(q); } + rcu_read_unlock(); } -EXPORT_SYMBOL(dev_kfree_skb_irq); +EXPORT_SYMBOL(netif_schedule_queue); -void dev_kfree_skb_any(struct sk_buff *skb) +void netif_tx_wake_queue(struct netdev_queue *dev_queue) { - if (in_irq() || irqs_disabled()) - dev_kfree_skb_irq(skb); + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(dev_queue->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_tx_wake_queue); + +void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason) +{ + unsigned long flags; + + if (unlikely(!skb)) + return; + + if (likely(refcount_read(&skb->users) == 1)) { + smp_rmb(); + refcount_set(&skb->users, 0); + } else if (likely(!refcount_dec_and_test(&skb->users))) { + return; + } + get_kfree_skb_cb(skb)->reason = reason; + local_irq_save(flags); + skb->next = __this_cpu_read(softnet_data.completion_queue); + __this_cpu_write(softnet_data.completion_queue, skb); + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); +} +EXPORT_SYMBOL(dev_kfree_skb_irq_reason); + +void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason) +{ + if (in_hardirq() || irqs_disabled()) + dev_kfree_skb_irq_reason(skb, reason); else - dev_kfree_skb(skb); + kfree_skb_reason(skb, reason); } -EXPORT_SYMBOL(dev_kfree_skb_any); +EXPORT_SYMBOL(dev_kfree_skb_any_reason); /** @@ -2196,29 +3478,68 @@ void netif_device_attach(struct net_device *dev) if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); - __netdev_watchdog_up(dev); + netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); -static void skb_warn_bad_offload(const struct sk_buff *skb) +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +static u16 skb_tx_hash(const struct net_device *dev, + const struct net_device *sb_dev, + struct sk_buff *skb) +{ + u32 hash; + u16 qoffset = 0; + u16 qcount = dev->real_num_tx_queues; + + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + + qoffset = sb_dev->tc_to_txq[tc].offset; + qcount = sb_dev->tc_to_txq[tc].count; + if (unlikely(!qcount)) { + net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", + sb_dev->name, qoffset, tc); + qoffset = 0; + qcount = dev->real_num_tx_queues; + } + } + + if (skb_rx_queue_recorded(skb)) { + DEBUG_NET_WARN_ON_ONCE(qcount == 0); + hash = skb_get_rx_queue(skb); + if (hash >= qoffset) + hash -= qoffset; + while (unlikely(hash >= qcount)) + hash -= qcount; + return hash + qoffset; + } + + return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; +} + +void skb_warn_bad_offload(const struct sk_buff *skb) { - static const netdev_features_t null_features = 0; + static const netdev_features_t null_features; struct net_device *dev = skb->dev; - const char *driver = ""; + const char *name = ""; if (!net_ratelimit()) return; - if (dev && dev->dev.parent) - driver = dev_driver_string(dev->dev.parent); - - WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " - "gso_type=%d ip_summed=%d\n", - driver, dev ? &dev->features : &null_features, - skb->sk ? &skb->sk->sk_route_caps : &null_features, - skb->len, skb->data_len, skb_shinfo(skb)->gso_size, - skb_shinfo(skb)->gso_type, skb->ip_summed); + if (dev) { + if (dev->dev.parent) + name = dev_driver_string(dev->dev.parent); + else + name = netdev_name(dev); + } + skb_dump(KERN_WARNING, skb, false); + WARN(1, "%s: caps=(%pNF, %pNF)\n", + name, dev ? &dev->features : &null_features, + skb->sk ? &skb->sk->sk_route_caps : &null_features); } /* @@ -2233,11 +3554,15 @@ int skb_checksum_help(struct sk_buff *skb) if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; - if (unlikely(skb_shinfo(skb)->gso_size)) { + if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } + if (!skb_frags_readable(skb)) { + return -EFAULT; + } + /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. */ @@ -2248,20 +3573,27 @@ int skb_checksum_help(struct sk_buff *skb) } offset = skb_checksum_start_offset(skb); - BUG_ON(offset >= skb_headlen(skb)); + ret = -EINVAL; + if (unlikely(offset >= skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n", + offset, skb_headlen(skb)); + goto out; + } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; - BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__sum16))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; + if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n", + offset + sizeof(__sum16), skb_headlen(skb)); + goto out; } + ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); + if (ret) + goto out; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); + *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: @@ -2269,159 +3601,93 @@ out: } EXPORT_SYMBOL(skb_checksum_help); -__be16 skb_network_protocol(struct sk_buff *skb) +#ifdef CONFIG_NET_CRC32C +int skb_crc32c_csum_help(struct sk_buff *skb) { - __be16 type = skb->protocol; - int vlan_depth = ETH_HLEN; + u32 crc; + int ret = 0, offset, start; - /* Tunnel gso handlers can set protocol to ethernet. */ - if (type == htons(ETH_P_TEB)) { - struct ethhdr *eth; + if (skb->ip_summed != CHECKSUM_PARTIAL) + goto out; - if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) - return 0; + if (unlikely(skb_is_gso(skb))) + goto out; - eth = (struct ethhdr *)skb_mac_header(skb); - type = eth->h_proto; + /* Before computing a checksum, we should make sure no frag could + * be modified by an external entity : checksum could be wrong. + */ + if (unlikely(skb_has_shared_frag(skb))) { + ret = __skb_linearize(skb); + if (ret) + goto out; } - - while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { - struct vlan_hdr *vh; - - if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return 0; - - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; + start = skb_checksum_start_offset(skb); + offset = start + offsetof(struct sctphdr, checksum); + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + ret = -EINVAL; + goto out; } - return type; + ret = skb_ensure_writable(skb, offset + sizeof(__le32)); + if (ret) + goto out; + + crc = ~skb_crc32c(skb, start, skb->len - start, ~0); + *(__le32 *)(skb->data + offset) = cpu_to_le32(crc); + skb_reset_csum_not_inet(skb); +out: + return ret; } +EXPORT_SYMBOL(skb_crc32c_csum_help); +#endif /* CONFIG_NET_CRC32C */ -/** - * skb_mac_gso_segment - mac layer segmentation handler. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - */ -struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, - netdev_features_t features) +__be16 skb_network_protocol(struct sk_buff *skb, int *depth) { - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; - __be16 type = skb_network_protocol(skb); + __be16 type = skb->protocol; - if (unlikely(!type)) - return ERR_PTR(-EINVAL); + /* Tunnel gso handlers can set protocol to ethernet. */ + if (type == htons(ETH_P_TEB)) { + struct ethhdr *eth; - __skb_pull(skb, skb->mac_len); + if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) + return 0; - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { - if (ptype->type == type && ptype->callbacks.gso_segment) { - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - int err; - - err = ptype->callbacks.gso_send_check(skb); - segs = ERR_PTR(err); - if (err || skb_gso_ok(skb, features)) - break; - __skb_push(skb, (skb->data - - skb_network_header(skb))); - } - segs = ptype->callbacks.gso_segment(skb, features); - break; - } + eth = (struct ethhdr *)skb->data; + type = eth->h_proto; } - rcu_read_unlock(); - __skb_push(skb, skb->data - skb_mac_header(skb)); - - return segs; + return vlan_get_protocol_and_depth(skb, type, depth); } -EXPORT_SYMBOL(skb_mac_gso_segment); -/* openvswitch calls this on rx path, so we need a different check. - */ -static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) +/* Take action when hardware reception checksum errors are detected. */ +#ifdef CONFIG_BUG +static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { - if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL; - else - return skb->ip_summed == CHECKSUM_NONE; + netdev_err(dev, "hw csum failure\n"); + skb_dump(KERN_ERR, skb, true); + dump_stack(); } -/** - * __skb_gso_segment - Perform segmentation on skb. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * @tx_path: whether it is called in TX path - * - * This function segments the given skb and returns a list of segments. - * - * It may return NULL if the skb requires no segmentation. This is - * only possible when GSO is used for verifying header integrity. - */ -struct sk_buff *__skb_gso_segment(struct sk_buff *skb, - netdev_features_t features, bool tx_path) +void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { - if (unlikely(skb_needs_check(skb, tx_path))) { - int err; - - skb_warn_bad_offload(skb); - - if (skb_header_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) - return ERR_PTR(err); - } - - SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - - return skb_mac_gso_segment(skb, features); -} -EXPORT_SYMBOL(__skb_gso_segment); - -/* Take action when hardware reception checksum errors are detected. */ -#ifdef CONFIG_BUG -void netdev_rx_csum_fault(struct net_device *dev) -{ - if (net_ratelimit()) { - pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); - dump_stack(); - } + DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); } EXPORT_SYMBOL(netdev_rx_csum_fault); #endif -/* Actually, we should eliminate this check as soon as we know, that: - * 1. IOMMU is present and allows to map all the memory. - * 2. No high memory really exists on this machine. - */ - +/* XXX: check that highmem exists at all on the given machine. */ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; + if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - if (PageHighMem(skb_frag_page(frag))) - return 1; - } - } - - if (PCI_DMA_BUS_IS_PHYS) { - struct device *pdev = dev->dev.parent; + struct page *page = skb_frag_page(frag); - if (!pdev) - return 0; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - dma_addr_t addr = page_to_phys(skb_frag_page(frag)); - if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) + if (page && PageHighMem(page)) return 1; } } @@ -2429,385 +3695,1046 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) return 0; } -struct dev_gso_cb { - void (*destructor)(struct sk_buff *skb); -}; +/* If MPLS offload request, verify we are testing hardware MPLS features + * instead of standard features for the netdev. + */ +#if IS_ENABLED(CONFIG_NET_MPLS_GSO) +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + if (eth_p_mpls(type)) + features &= skb->dev->mpls_features; -#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) + return features; +} +#else +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + return features; +} +#endif -static void dev_gso_skb_destructor(struct sk_buff *skb) +static netdev_features_t harmonize_features(struct sk_buff *skb, + netdev_features_t features) { - struct dev_gso_cb *cb; + __be16 type; - do { - struct sk_buff *nskb = skb->next; + type = skb_network_protocol(skb, NULL); + features = net_mpls_features(skb, features, type); - skb->next = nskb->next; - nskb->next = NULL; - kfree_skb(nskb); - } while (skb->next); + if (skb->ip_summed != CHECKSUM_NONE && + !can_checksum_protocol(features, type)) { + features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); + } + if (illegal_highdma(skb->dev, skb)) + features &= ~NETIF_F_SG; - cb = DEV_GSO_CB(skb); - if (cb->destructor) - cb->destructor(skb); + return features; } -/** - * dev_gso_segment - Perform emulated hardware segmentation on skb. - * @skb: buffer to segment - * @features: device features as applicable to this skb - * - * This function segments the given skb and stores the list of segments - * in skb->next. - */ -static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) +netdev_features_t passthru_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) { - struct sk_buff *segs; + return features; +} +EXPORT_SYMBOL(passthru_features_check); - segs = skb_gso_segment(skb, features); +static netdev_features_t dflt_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + return vlan_features_check(skb, features); +} - /* Verifying header integrity only. */ - if (!segs) - return 0; +static netdev_features_t gso_features_check(const struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + u16 gso_segs = skb_shinfo(skb)->gso_segs; - if (IS_ERR(segs)) - return PTR_ERR(segs); + if (gso_segs > READ_ONCE(dev->gso_max_segs)) + return features & ~NETIF_F_GSO_MASK; - skb->next = segs; - DEV_GSO_CB(skb)->destructor = skb->destructor; - skb->destructor = dev_gso_skb_destructor; + if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb))) + return features & ~NETIF_F_GSO_MASK; - return 0; -} + if (!skb_shinfo(skb)->gso_type) { + skb_warn_bad_offload(skb); + return features & ~NETIF_F_GSO_MASK; + } -static netdev_features_t harmonize_features(struct sk_buff *skb, - netdev_features_t features) -{ - if (skb->ip_summed != CHECKSUM_NONE && - !can_checksum_protocol(features, skb_network_protocol(skb))) { - features &= ~NETIF_F_ALL_CSUM; - } else if (illegal_highdma(skb->dev, skb)) { - features &= ~NETIF_F_SG; + /* Support for GSO partial features requires software + * intervention before we can actually process the packets + * so we need to strip support for any partial features now + * and we can pull them back in after we have partially + * segmented the frame. + */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) + features &= ~dev->gso_partial_features; + + /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header + * has the potential to be fragmented so that TSO does not generate + * segments with the same ID. For encapsulated packets, the ID mangling + * feature is guaranteed not to use the same ID for the outer IPv4 + * headers of the generated segments if the headers have the potential + * to be fragmented, so there is no need to clear the IPv4 ID mangling + * feature (see the section about NETIF_F_TSO_MANGLEID in + * segmentation-offloads.rst). + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { + struct iphdr *iph = skb->encapsulation ? + inner_ip_hdr(skb) : ip_hdr(skb); + + if (!(iph->frag_off & htons(IP_DF))) + features &= ~NETIF_F_TSO_MANGLEID; } + /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, + * so neither does TSO that depends on it. + */ + if (features & NETIF_F_IPV6_CSUM && + (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 || + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && + skb_transport_header_was_set(skb) && + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) + features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4); + return features; } netdev_features_t netif_skb_features(struct sk_buff *skb) { - __be16 protocol = skb->protocol; - netdev_features_t features = skb->dev->features; - - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) - features &= ~NETIF_F_GSO_MASK; - - if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { - struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - protocol = veh->h_vlan_encapsulated_proto; - } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, features); - } + struct net_device *dev = skb->dev; + netdev_features_t features = dev->features; - features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); + if (skb_is_gso(skb)) + features = gso_features_check(skb, dev, features); - if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) - features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX; + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; + + if (skb_vlan_tagged(skb)) + features = netdev_intersect_features(features, + dev->vlan_features | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); + + if (dev->netdev_ops->ndo_features_check) + features &= dev->netdev_ops->ndo_features_check(skb, dev, + features); + else + features &= dflt_features_check(skb, dev, features); return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); -/* - * Returns true if either: - * 1. skb has frag_list and the device doesn't support FRAGLIST, or - * 2. skb is fragmented and the device does not support SG. - */ -static inline int skb_needs_linearize(struct sk_buff *skb, - netdev_features_t features) +static int xmit_one(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, bool more) { - return skb_is_nonlinear(skb) && - ((skb_has_frag_list(skb) && - !(features & NETIF_F_FRAGLIST)) || - (skb_shinfo(skb)->nr_frags && - !(features & NETIF_F_SG))); + unsigned int len; + int rc; + + if (dev_nit_active_rcu(dev)) + dev_queue_xmit_nit(skb, dev); + + len = skb->len; + trace_net_dev_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev, txq, more); + trace_net_dev_xmit(skb, rc, dev, len); + + return rc; } -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, + struct netdev_queue *txq, int *ret) { - const struct net_device_ops *ops = dev->netdev_ops; + struct sk_buff *skb = first; int rc = NETDEV_TX_OK; - unsigned int skb_len; - if (likely(!skb->next)) { - netdev_features_t features; + while (skb) { + struct sk_buff *next = skb->next; - /* - * If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); + skb_mark_not_on_list(skb); + rc = xmit_one(skb, dev, txq, next != NULL); + if (unlikely(!dev_xmit_complete(rc))) { + skb->next = next; + goto out; + } - features = netif_skb_features(skb); + skb = next; + if (netif_tx_queue_stopped(txq) && skb) { + rc = NETDEV_TX_BUSY; + break; + } + } - if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (unlikely(!skb)) - goto out; +out: + *ret = rc; + return skb; +} - skb->vlan_tci = 0; - } +static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, + netdev_features_t features) +{ + if (skb_vlan_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) + skb = __vlan_hwaccel_push_inside(skb); + return skb; +} - /* If encapsulation offload request, verify we are testing - * hardware encapsulation features instead of standard - * features for the netdev - */ - if (skb->encapsulation) - features &= dev->hw_enc_features; +int skb_csum_hwoffload_help(struct sk_buff *skb, + const netdev_features_t features) +{ + if (unlikely(skb_csum_is_sctp(skb))) + return !!(features & NETIF_F_SCTP_CRC) ? 0 : + skb_crc32c_csum_help(skb); - if (netif_needs_gso(skb, features)) { - if (unlikely(dev_gso_segment(skb, features))) - goto out_kfree_skb; - if (skb->next) - goto gso; - } else { - if (skb_needs_linearize(skb, features) && - __skb_linearize(skb)) - goto out_kfree_skb; + if (features & NETIF_F_HW_CSUM) + return 0; - /* If packet is not checksummed and device does not - * support checksumming for this protocol, complete - * checksumming here. - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb->encapsulation) - skb_set_inner_transport_header(skb, - skb_checksum_start_offset(skb)); - else - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_ALL_CSUM) && - skb_checksum_help(skb)) - goto out_kfree_skb; - } + if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { + if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) + goto sw_checksum; + + switch (skb->csum_offset) { + case offsetof(struct tcphdr, check): + case offsetof(struct udphdr, check): + return 0; } + } - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); +sw_checksum: + return skb_checksum_help(skb); +} +EXPORT_SYMBOL(skb_csum_hwoffload_help); - skb_len = skb->len; - rc = ops->ndo_start_xmit(skb, dev); - trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK) - txq_trans_update(txq); - return rc; +/* Checks if this SKB belongs to an HW offloaded socket + * and whether any SW fallbacks are required based on dev. + * Check decrypted mark in case skb_orphan() cleared socket. + */ +static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, + struct net_device *dev) +{ +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); + struct sock *sk = skb->sk; + + sk_validate = NULL; + if (sk) { + if (sk_fullsock(sk)) + sk_validate = sk->sk_validate_xmit_skb; + else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT) + sk_validate = inet_twsk(sk)->tw_validate_xmit_skb; + } + + if (sk_validate) { + skb = sk_validate(sk, dev, skb); + } else if (unlikely(skb_is_decrypted(skb))) { + pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); + kfree_skb(skb); + skb = NULL; } +#endif -gso: - do { - struct sk_buff *nskb = skb->next; - - skb->next = nskb->next; - nskb->next = NULL; - - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(nskb, dev); - - skb_len = nskb->len; - rc = ops->ndo_start_xmit(nskb, dev); - trace_net_dev_xmit(nskb, rc, dev, skb_len); - if (unlikely(rc != NETDEV_TX_OK)) { - if (rc & ~NETDEV_TX_MASK) - goto out_kfree_gso_skb; - nskb->next = skb->next; - skb->next = nskb; - return rc; + return skb; +} + +static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, + struct net_device *dev) +{ + struct skb_shared_info *shinfo; + struct net_iov *niov; + + if (likely(skb_frags_readable(skb))) + goto out; + + if (!dev->netmem_tx) + goto out_free; + + shinfo = skb_shinfo(skb); + + if (shinfo->nr_frags > 0) { + niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); + if (net_is_devmem_iov(niov) && + net_devmem_iov_binding(niov)->dev != dev) + goto out_free; + } + +out: + return skb; + +out_free: + kfree_skb(skb); + return NULL; +} + +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) +{ + netdev_features_t features; + + skb = validate_xmit_unreadable_skb(skb, dev); + if (unlikely(!skb)) + goto out_null; + + features = netif_skb_features(skb); + skb = validate_xmit_vlan(skb, features); + if (unlikely(!skb)) + goto out_null; + + skb = sk_validate_xmit_skb(skb, dev); + if (unlikely(!skb)) + goto out_null; + + if (netif_needs_gso(skb, features)) { + struct sk_buff *segs; + + segs = skb_gso_segment(skb, features); + if (IS_ERR(segs)) { + goto out_kfree_skb; + } else if (segs) { + consume_skb(skb); + skb = segs; } - txq_trans_update(txq); - if (unlikely(netif_xmit_stopped(txq) && skb->next)) - return NETDEV_TX_BUSY; - } while (skb->next); + } else { + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto out_kfree_skb; -out_kfree_gso_skb: - if (likely(skb->next == NULL)) { - skb->destructor = DEV_GSO_CB(skb)->destructor; - consume_skb(skb); - return rc; + /* If packet is not checksummed and device does not + * support checksumming for this protocol, complete + * checksumming here. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (skb_csum_hwoffload_help(skb, features)) + goto out_kfree_skb; + } } + + skb = validate_xmit_xfrm(skb, features, again); + + return skb; + out_kfree_skb: kfree_skb(skb); -out: - return rc; +out_null: + dev_core_stats_tx_dropped_inc(dev); + return NULL; +} + +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) +{ + struct sk_buff *next, *head = NULL, *tail; + + for (; skb != NULL; skb = next) { + next = skb->next; + skb_mark_not_on_list(skb); + + /* in case skb won't be segmented, point to itself */ + skb->prev = skb; + + skb = validate_xmit_skb(skb, dev, again); + if (!skb) + continue; + + if (!head) + head = skb; + else + tail->next = skb; + /* If skb was segmented, skb->prev points to + * the last segment. If not, it still contains skb. + */ + tail = skb->prev; + } + return head; } +EXPORT_SYMBOL_GPL(validate_xmit_skb_list); -static void qdisc_pkt_len_init(struct sk_buff *skb) +static void qdisc_pkt_len_segs_init(struct sk_buff *skb) { - const struct skb_shared_info *shinfo = skb_shinfo(skb); + struct skb_shared_info *shinfo = skb_shinfo(skb); + u16 gso_segs; qdisc_skb_cb(skb)->pkt_len = skb->len; + if (!shinfo->gso_size) { + qdisc_skb_cb(skb)->pkt_segs = 1; + return; + } + + qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ - if (shinfo->gso_size) { + if (skb_transport_header_was_set(skb)) { unsigned int hdr_len; - u16 gso_segs = shinfo->gso_segs; /* mac layer + network layer */ - hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + if (!skb->encapsulation) + hdr_len = skb_transport_offset(skb); + else + hdr_len = skb_inner_transport_offset(skb); /* + transport layer */ - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - hdr_len += tcp_hdrlen(skb); - else - hdr_len += sizeof(struct udphdr); + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + const struct tcphdr *th; + struct tcphdr _tcphdr; + + th = skb_header_pointer(skb, hdr_len, + sizeof(_tcphdr), &_tcphdr); + if (likely(th)) + hdr_len += __tcp_hdrlen(th); + } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { + struct udphdr _udphdr; + + if (skb_header_pointer(skb, hdr_len, + sizeof(_udphdr), &_udphdr)) + hdr_len += sizeof(struct udphdr); + } - if (shinfo->gso_type & SKB_GSO_DODGY) - gso_segs = DIV_ROUND_UP(skb->len - hdr_len, - shinfo->gso_size); + if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) { + int payload = skb->len - hdr_len; + /* Malicious packet. */ + if (payload <= 0) + return; + gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); + shinfo->gso_segs = gso_segs; + qdisc_skb_cb(skb)->pkt_segs = gso_segs; + } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } } +static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, + struct sk_buff **to_free, + struct netdev_queue *txq) +{ + int rc; + + rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; + if (rc == NET_XMIT_SUCCESS) + trace_qdisc_enqueue(q, txq, skb); + return rc; +} + static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { + struct sk_buff *next, *to_free = NULL, *to_free2 = NULL; spinlock_t *root_lock = qdisc_lock(q); - bool contended; + struct llist_node *ll_list, *first_n; + unsigned long defer_count = 0; int rc; - qdisc_pkt_len_init(skb); qdisc_calculate_pkt_len(skb, q); - /* - * Heuristic to force contended enqueues to serialize on a - * separate lock before trying to get qdisc main lock. - * This permits __QDISC_STATE_RUNNING owner to get the lock more often - * and dequeue packets faster. + + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP); + + if (q->flags & TCQ_F_NOLOCK) { + if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && + qdisc_run_begin(q)) { + /* Retest nolock_qdisc_is_empty() within the protection + * of q->seqlock to protect from racing with requeuing. + */ + if (unlikely(!nolock_qdisc_is_empty(q))) { + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + __qdisc_run(q); + to_free2 = qdisc_run_end(q); + + goto free_skbs; + } + + qdisc_bstats_cpu_update(q, skb); + if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && + !nolock_qdisc_is_empty(q)) + __qdisc_run(q); + + to_free2 = qdisc_run_end(q); + rc = NET_XMIT_SUCCESS; + goto free_skbs; + } + + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + to_free2 = qdisc_run(q); + goto free_skbs; + } + + /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit. + * In the try_cmpxchg() loop, we want to increment q->defer_count + * at most once to limit the number of skbs in defer_list. + * We perform the defer_count increment only if the list is not empty, + * because some arches have slow atomic_long_inc_return(). */ - contended = qdisc_is_running(q); - if (unlikely(contended)) - spin_lock(&q->busylock); + first_n = READ_ONCE(q->defer_list.first); + do { + if (first_n && !defer_count) { + defer_count = atomic_long_inc_return(&q->defer_count); + if (unlikely(defer_count > READ_ONCE(q->limit))) { + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + return NET_XMIT_DROP; + } + } + skb->ll_node.next = first_n; + } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node)); + + /* If defer_list was not empty, we know the cpu which queued + * the first skb will process the whole list for us. + */ + if (first_n) + return NET_XMIT_SUCCESS; spin_lock(root_lock); + + ll_list = llist_del_all(&q->defer_list); + /* There is a small race because we clear defer_count not atomically + * with the prior llist_del_all(). This means defer_list could grow + * over q->limit. + */ + atomic_long_set(&q->defer_count, 0); + + ll_list = llist_reverse_order(ll_list); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - kfree_skb(skb); + llist_for_each_entry_safe(skb, next, ll_list, ll_node) + __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; - } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && - qdisc_run_begin(q)) { + goto unlock; + } + if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && + !llist_next(ll_list) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) - skb_dst_force(skb); + DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list, + struct sk_buff, + ll_node)); qdisc_bstats_update(q, skb); - - if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) __qdisc_run(q); - } else - qdisc_run_end(q); - + to_free2 = qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { - skb_dst_force(skb); - rc = q->enqueue(skb, q) & NET_XMIT_MASK; - if (qdisc_run_begin(q)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } - __qdisc_run(q); + int count = 0; + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + prefetch(next); + prefetch(&next->priority); + skb_mark_not_on_list(skb); + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + count++; } + to_free2 = qdisc_run(q); + if (count != 1) + rc = NET_XMIT_SUCCESS; } +unlock: spin_unlock(root_lock); - if (unlikely(contended)) - spin_unlock(&q->busylock); + +free_skbs: + tcf_kfree_skb_list(to_free); + tcf_kfree_skb_list(to_free2); return rc; } -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { - struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); + const struct netprio_map *map; + const struct sock *sk; + unsigned int prioidx; - if (!skb->priority && skb->sk && map) { - unsigned int prioidx = skb->sk->sk_cgrp_prioidx; + if (skb->priority) + return; + map = rcu_dereference_bh(skb->dev->priomap); + if (!map) + return; + sk = skb_to_full_sk(skb); + if (!sk) + return; - if (prioidx < map->priomap_len) - skb->priority = map->priomap[prioidx]; - } + prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); + + if (prioidx < map->priomap_len) + skb->priority = map->priomap[prioidx]; } #else #define skb_update_prio(skb) #endif -static DEFINE_PER_CPU(int, xmit_recursion); -#define RECURSION_LIMIT 10 - /** * dev_loopback_xmit - loop back @skb + * @net: network namespace this loopback is happening in + * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ -int dev_loopback_xmit(struct sk_buff *skb) +int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; - skb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!skb_dst(skb)); + if (skb->ip_summed == CHECKSUM_NONE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); skb_dst_force(skb); - netif_rx_ni(skb); + netif_rx(skb); return 0; } EXPORT_SYMBOL(dev_loopback_xmit); +#ifdef CONFIG_NET_EGRESS +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) +{ + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); +} + +#ifndef CONFIG_PREEMPT_RT +static bool netdev_xmit_txqueue_skipped(void) +{ + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); + +#else +static bool netdev_xmit_txqueue_skipped(void) +{ + return current->net_xmit.skip_txqueue; +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + current->net_xmit.skip_txqueue = skip; +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif +#endif /* CONFIG_NET_EGRESS */ + +#ifdef CONFIG_NET_XGRESS +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, + enum skb_drop_reason *drop_reason) +{ + int ret = TC_ACT_UNSPEC; +#ifdef CONFIG_NET_CLS_ACT + struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); + struct tcf_result res; + + if (!miniq) + return ret; + + /* Global bypass */ + if (!static_branch_likely(&tcf_sw_enabled_key)) + return ret; + + /* Block-wise bypass */ + if (tcf_block_bypass_sw(miniq->block)) + return ret; + + tc_skb_cb(skb)->mru = 0; + qdisc_skb_cb(skb)->post_ct = false; + tcf_set_drop_reason(skb, *drop_reason); + + mini_qdisc_bstats_cpu_update(miniq, skb); + ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); + /* Only tcf related quirks below. */ + switch (ret) { + case TC_ACT_SHOT: + *drop_reason = tcf_get_drop_reason(skb); + mini_qdisc_qstats_cpu_drop(miniq); + break; + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(res.classid); + break; + } +#endif /* CONFIG_NET_CLS_ACT */ + return ret; +} + +static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); + +void tcx_inc(void) +{ + static_branch_inc(&tcx_needed_key); +} + +void tcx_dec(void) +{ + static_branch_dec(&tcx_needed_key); +} + +static __always_inline enum tcx_action_base +tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, + const bool needs_mac) +{ + const struct bpf_mprog_fp *fp; + const struct bpf_prog *prog; + int ret = TCX_NEXT; + + if (needs_mac) + __skb_push(skb, skb->mac_len); + bpf_mprog_foreach_prog(entry, fp, prog) { + bpf_compute_data_pointers(skb); + ret = bpf_prog_run(prog, skb); + if (ret != TCX_NEXT) + break; + } + if (needs_mac) + __skb_pull(skb, skb->mac_len); + return tcx_action_code(skb, ret); +} + +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); + enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + int sch_ret; + + if (!entry) + return skb; + + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + if (unlikely(*pt_prev)) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + qdisc_pkt_len_segs_init(skb); + tcx_set_ingress(skb, true); + + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, true); + if (sch_ret != TC_ACT_UNSPEC) + goto ingress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); +ingress_verdict: + switch (sch_ret) { + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by BPF, so we can safely + * push the L2 header back before redirecting to another + * netdev. + */ + __skb_push(skb, skb->mac_len); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } + *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + case TC_ACT_SHOT: + kfree_skb_reason(skb, drop_reason); + *ret = NET_RX_DROP; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + /* used by tc_run */ + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + } + bpf_net_ctx_clear(bpf_net_ctx); + + return skb; +} + +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); + enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + int sch_ret; + + if (!entry) + return skb; + + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + + /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was + * already set by the caller. + */ + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, false); + if (sch_ret != TC_ACT_UNSPEC) + goto egress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); +egress_verdict: + switch (sch_ret) { + case TC_ACT_REDIRECT: + /* No need to push/pop skb's mac_header here on egress! */ + skb_do_redirect(skb); + *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + case TC_ACT_SHOT: + kfree_skb_reason(skb, drop_reason); + *ret = NET_XMIT_DROP; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + /* used by tc_run */ + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: + *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); + return NULL; + } + bpf_net_ctx_clear(bpf_net_ctx); + + return skb; +} +#else +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) +{ + return skb; +} + +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + return skb; +} +#endif /* CONFIG_NET_XGRESS */ + +#ifdef CONFIG_XPS +static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, + struct xps_dev_maps *dev_maps, unsigned int tci) +{ + int tc = netdev_get_prio_tc_map(dev, skb->priority); + struct xps_map *map; + int queue_index = -1; + + if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) + return queue_index; + + tci *= dev_maps->num_tc; + tci += tc; + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale( + skb_get_hash(skb), map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + return queue_index; +} +#endif + +static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, + struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct sock *sk = skb->sk; + int queue_index = -1; + + if (!static_key_false(&xps_needed)) + return -1; + + rcu_read_lock(); + if (!static_key_false(&xps_rxqs_needed)) + goto get_cpus_map; + + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); + if (dev_maps) { + int tci = sk_rx_queue_get(sk); + + if (tci >= 0) + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); + } + +get_cpus_map: + if (queue_index < 0) { + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); + if (dev_maps) { + unsigned int tci = skb->sender_cpu - 1; + + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + +u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +{ + return 0; +} +EXPORT_SYMBOL(dev_pick_tx_zero); + +int sk_tx_queue_get(const struct sock *sk) +{ + int resel, val; + + if (!sk) + return -1; + /* Paired with WRITE_ONCE() in sk_tx_queue_clear() + * and sk_tx_queue_set(). + */ + val = READ_ONCE(sk->sk_tx_queue_mapping); + + if (val == NO_QUEUE_MAPPING) + return -1; + + if (!sk_fullsock(sk)) + return val; + + resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection); + if (resel && time_is_before_jiffies( + READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel)) + return -1; + + return val; +} +EXPORT_SYMBOL(sk_tx_queue_get); + +u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +{ + struct sock *sk = skb->sk; + int queue_index = sk_tx_queue_get(sk); + + sb_dev = sb_dev ? : dev; + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int new_index = get_xps_queue(dev, sb_dev, skb); + + if (new_index < 0) + new_index = skb_tx_hash(dev, sb_dev, skb); + + if (sk && sk_fullsock(sk) && + rcu_access_pointer(sk->sk_dst_cache)) + sk_tx_queue_set(sk, new_index); + + queue_index = new_index; + } + + return queue_index; +} +EXPORT_SYMBOL(netdev_pick_tx); + +struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, + struct sk_buff *skb, + struct net_device *sb_dev) +{ + int queue_index = 0; + +#ifdef CONFIG_XPS + u32 sender_cpu = skb->sender_cpu - 1; + + if (sender_cpu >= (u32)NR_CPUS) + skb->sender_cpu = raw_smp_processor_id() + 1; +#endif + + if (dev->real_num_tx_queues != 1) { + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_select_queue) + queue_index = ops->ndo_select_queue(dev, skb, sb_dev); + else + queue_index = netdev_pick_tx(dev, skb, sb_dev); + + queue_index = netdev_cap_txqueue(dev, queue_index); + } + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} + /** - * dev_queue_xmit - transmit a buffer - * @skb: buffer to transmit + * __dev_queue_xmit() - transmit a buffer + * @skb: buffer to transmit + * @sb_dev: suboordinate device used for L2 forwarding offload * - * Queue a buffer for transmission to a network device. The caller must - * have set the device and priority and built the buffer before calling - * this function. The function can be called from an interrupt. + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. * - * A negative errno code is returned on a failure. A success does not - * guarantee the frame will be transmitted as it may be dropped due - * to congestion or traffic shaping. + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. * - * ----------------------------------------------------------------------------------- - * I notice this method can also return errors from the queue disciplines, - * including NET_XMIT_DROP, which is a positive value. So, errors can also - * be positive. + * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. (You can bump the ref count + * before sending to hold a reference for retry if you are careful.) * - * Regardless of the return value, the skb is consumed, so it is currently - * difficult to retry a send to this method. (You can bump the ref count - * before sending to hold a reference for retry if you are careful.) - * - * When calling this method, interrupts MUST be enabled. This is because - * the BH enable code must have IRQs enabled so that it will not deadlock. - * --BLG + * Return: + * * 0 - buffer successfully transmitted + * * positive qdisc return code - NET_XMIT_DROP etc. + * * negative errno - other errors */ -int dev_queue_xmit(struct sk_buff *skb) +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; - struct netdev_queue *txq; + struct netdev_queue *txq = NULL; struct Qdisc *q; int rc = -ENOMEM; + bool again = false; skb_reset_mac_header(skb); + skb_assert_len(skb); + + if (unlikely(skb_shinfo(skb)->tx_flags & + (SKBTX_SCHED_TSTAMP | SKBTX_BPF))) + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. @@ -2816,12 +4743,41 @@ int dev_queue_xmit(struct sk_buff *skb) skb_update_prio(skb); - txq = netdev_pick_tx(dev, skb); - q = rcu_dereference_bh(txq->qdisc); + qdisc_pkt_len_segs_init(skb); + tcx_set_ingress(skb, false); +#ifdef CONFIG_NET_EGRESS + if (static_branch_unlikely(&egress_needed_key)) { + if (nf_hook_egress_active()) { + skb = nf_hook_egress(skb, &rc, dev); + if (!skb) + goto out; + } -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); + netdev_xmit_skip_txqueue(false); + + nf_skip_egress(skb, true); + skb = sch_handle_egress(skb, &rc, dev); + if (!skb) + goto out; + nf_skip_egress(skb, false); + + if (netdev_xmit_txqueue_skipped()) + txq = netdev_tx_queue_mapping(dev, skb); + } #endif + /* If device/qdisc don't need skb->dst, release it right now while + * its hot in this cpu cache. + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + else + skb_dst_force(skb); + + if (!txq) + txq = netdev_core_pick_tx(dev, skb, sb_dev); + + q = rcu_dereference_bh(txq->qdisc); + trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); @@ -2829,31 +4785,37 @@ int dev_queue_xmit(struct sk_buff *skb) } /* The device has no queue. Common case for software devices: - loopback, all the sorts of tunnels... + * loopback, all the sorts of tunnels... - Really, it is unlikely that netif_tx_lock protection is necessary - here. (f.e. loopback and IP tunnels are clean ignoring statistics - counters.) - However, it is possible, that they rely on protection - made by us here. + * Really, it is unlikely that netif_tx_lock protection is necessary + * here. (f.e. loopback and IP tunnels are clean ignoring statistics + * counters.) + * However, it is possible, that they rely on protection + * made by us here. - Check this and shot the lock. It is not prone from deadlocks. - Either shot noqueue qdisc, it is even simpler 8) + * Check this and shot the lock. It is not prone from deadlocks. + *Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ - if (txq->xmit_lock_owner != cpu) { - - if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + /* Other cpus might concurrently change txq->xmit_lock_owner + * to -1 or to their cpu id, but not to our id. + */ + if (READ_ONCE(txq->xmit_lock_owner) != cpu) { + if (dev_xmit_recursion()) goto recursion_alert; + skb = validate_xmit_skb(skb, dev, &again); + if (!skb) + goto out; + HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { - __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq); - __this_cpu_dec(xmit_recursion); + dev_xmit_recursion_inc(); + skb = dev_hard_start_xmit(skb, dev, txq, &rc); + dev_xmit_recursion_dec(); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; @@ -2875,52 +4837,153 @@ recursion_alert: rc = -ENETDOWN; rcu_read_unlock_bh(); - kfree_skb(skb); + dev_core_stats_tx_dropped_inc(dev); + kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); return rc; } -EXPORT_SYMBOL(dev_queue_xmit); +EXPORT_SYMBOL(__dev_queue_xmit); +int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +{ + struct net_device *dev = skb->dev; + struct sk_buff *orig_skb = skb; + struct netdev_queue *txq; + int ret = NETDEV_TX_BUSY; + bool again = false; + + if (unlikely(!netif_running(dev) || + !netif_carrier_ok(dev))) + goto drop; + + skb = validate_xmit_skb_list(skb, dev, &again); + if (skb != orig_skb) + goto drop; + + skb_set_queue_mapping(skb, queue_id); + txq = skb_get_tx_queue(dev, skb); -/*======================================================================= - Receiver routines - =======================================================================*/ + local_bh_disable(); -int netdev_max_backlog __read_mostly = 1000; -EXPORT_SYMBOL(netdev_max_backlog); + dev_xmit_recursion_inc(); + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_drv_stopped(txq)) + ret = netdev_start_xmit(skb, dev, txq, false); + HARD_TX_UNLOCK(dev, txq); + dev_xmit_recursion_dec(); -int netdev_tstamp_prequeue __read_mostly = 1; -int netdev_budget __read_mostly = 300; -int weight_p __read_mostly = 64; /* old backlog weight */ + local_bh_enable(); + return ret; +drop: + dev_core_stats_tx_dropped_inc(dev); + kfree_skb_list(skb); + return NET_XMIT_DROP; +} +EXPORT_SYMBOL(__dev_direct_xmit); + +/************************************************************************* + * Receiver routines + *************************************************************************/ +static DEFINE_PER_CPU(struct task_struct *, backlog_napi); + +int weight_p __read_mostly = 64; /* old backlog weight */ +int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ +int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { + struct task_struct *thread; + + lockdep_assert_irqs_disabled(); + + if (test_bit(NAPI_STATE_THREADED, &napi->state)) { + /* Paired with smp_mb__before_atomic() in + * napi_enable()/netif_set_threaded(). + * Use READ_ONCE() to guarantee a complete + * read on napi->thread. Only call + * wake_up_process() when it's not NULL. + */ + thread = READ_ONCE(napi->thread); + if (thread) { + if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) + goto use_local_napi; + + set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); + wake_up_process(thread); + return; + } + } + +use_local_napi: + DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list)); list_add_tail(&napi->poll_list, &sd->poll_list); - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + WRITE_ONCE(napi->list_owner, smp_processor_id()); + /* If not called from net_rx_action() + * we have to raise NET_RX_SOFTIRQ. + */ + if (!sd->in_net_rx_action) + raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS -/* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; -EXPORT_SYMBOL(rps_sock_flow_table); +struct static_key_false rps_needed __read_mostly; +EXPORT_SYMBOL(rps_needed); +struct static_key_false rfs_needed __read_mostly; +EXPORT_SYMBOL(rfs_needed); -struct static_key rps_needed __read_mostly; +static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) +{ + return hash_32(hash, flow_table->log); +} + +#ifdef CONFIG_RFS_ACCEL +/** + * rps_flow_is_active - check whether the flow is recently active. + * @rflow: Specific flow to check activity. + * @flow_table: per-queue flowtable that @rflow belongs to. + * @cpu: CPU saved in @rflow. + * + * If the CPU has processed many packets since the flow's last activity + * (beyond 10 times the table size), the flow is considered stale. + * + * Return: true if flow was recently active. + */ +static bool rps_flow_is_active(struct rps_dev_flow *rflow, + struct rps_dev_flow_table *flow_table, + unsigned int cpu) +{ + unsigned int flow_last_active; + unsigned int sd_input_head; + + if (cpu >= nr_cpu_ids) + return false; + + sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head); + flow_last_active = READ_ONCE(rflow->last_qtail); + + return (int)(sd_input_head - flow_last_active) < + (int)(10 << flow_table->log); +} +#endif static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, - struct rps_dev_flow *rflow, u16 next_cpu) + struct rps_dev_flow *rflow, u16 next_cpu, u32 hash, + u32 flow_id) { - if (next_cpu != RPS_NO_CPU) { + if (next_cpu < nr_cpu_ids) { + u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; - u32 flow_id; + struct rps_dev_flow *tmp_rflow; + unsigned int tmp_cpu; u16 rxq_index; int rc; @@ -2936,23 +4999,38 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = skb->rxhash & flow_table->mask; + + tmp_rflow = &flow_table->flows[flow_id]; + tmp_cpu = READ_ONCE(tmp_rflow->cpu); + + if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { + if (rps_flow_is_active(tmp_rflow, flow_table, + tmp_cpu)) { + if (hash != READ_ONCE(tmp_rflow->hash) || + next_cpu == tmp_cpu) + goto out; + } + } + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; + old_rflow = rflow; - rflow = &flow_table->flows[flow_id]; - rflow->filter = rc; - if (old_rflow->filter == rflow->filter) - old_rflow->filter = RPS_NO_FILTER; + rflow = tmp_rflow; + WRITE_ONCE(rflow->filter, rc); + WRITE_ONCE(rflow->hash, hash); + + if (old_rflow->filter == rc) + WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: #endif - rflow->last_qtail = - per_cpu(softnet_data, next_cpu).input_queue_head; + head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); + rps_input_queue_tail_save(&rflow->last_qtail, head); } - rflow->cpu = next_cpu; + WRITE_ONCE(rflow->cpu, next_cpu); return rflow; } @@ -2964,15 +5042,18 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { - struct netdev_rx_queue *rxqueue; - struct rps_map *map; + const struct rps_sock_flow_table *sock_flow_table; + struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; - struct rps_sock_flow_table *sock_flow_table; + struct rps_map *map; int cpu = -1; - u16 tcpu; + u32 flow_id; + u32 tcpu; + u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " @@ -2980,44 +5061,48 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, dev->name, index, dev->real_num_rx_queues); goto done; } - rxqueue = dev->_rx + index; - } else - rxqueue = dev->_rx; + rxqueue += index; + } + /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ + + flow_table = rcu_dereference(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); - if (map) { - if (map->len == 1 && - !rcu_access_pointer(rxqueue->rps_flow_table)) { - tcpu = map->cpus[0]; - if (cpu_online(tcpu)) - cpu = tcpu; - goto done; - } - } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { + if (!flow_table && !map) goto done; - } skb_reset_network_header(skb); - if (!skb_get_rxhash(skb)) + hash = skb_get_hash(skb); + if (!hash) goto done; - flow_table = rcu_dereference(rxqueue->rps_flow_table); - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (flow_table && sock_flow_table) { - u16 next_cpu; struct rps_dev_flow *rflow; + u32 next_cpu; + u32 ident; - rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; - tcpu = rflow->cpu; + /* First check into global flow table if there is a match. + * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). + */ + ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); + if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) + goto try_rps; + + next_cpu = ident & net_hotdata.rps_cpu_mask; - next_cpu = sock_flow_table->ents[skb->rxhash & - sock_flow_table->mask]; + /* OK, now we know there is a match, + * we can look at the local (per receive queue) flow table + */ + flow_id = rfs_slot(hash, flow_table); + rflow = &flow_table->flows[flow_id]; + tcpu = rflow->cpu; /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: - * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. @@ -3025,23 +5110,25 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && - (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || - ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || + ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; - rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash, + flow_id); } - if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; } } - if (map) { - tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; +try_rps: + if (map) { + tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; @@ -3072,17 +5159,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (flow_table && flow_id <= flow_table->mask) { + if (flow_table && flow_id < (1UL << flow_table->log)) { + unsigned int cpu; + rflow = &flow_table->flows[flow_id]; - cpu = ACCESS_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu != RPS_NO_CPU && - ((int)(per_cpu(softnet_data, cpu).input_queue_head - - rflow->last_qtail) < - (int)(10 * flow_table->mask))) + cpu = READ_ONCE(rflow->cpu); + if (READ_ONCE(rflow->filter) == filter_id && + rps_flow_is_active(rflow, flow_table, cpu)) expire = false; } rcu_read_unlock(); @@ -3098,52 +5184,95 @@ static void rps_trigger_softirq(void *data) struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); - sd->received_rps++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->received_rps, sd->received_rps + 1); } #endif /* CONFIG_RPS */ +/* Called from hardirq (IPI) context */ +static void trigger_rx_softirq(void *data) +{ + struct softnet_data *sd = data; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + smp_store_release(&sd->defer_ipi_scheduled, 0); +} + /* - * Check if this softnet_data structure is another cpu one - * If yes, queue it to our IPI list and return 1 - * If no, return 0 + * After we queued a packet into sd->input_pkt_queue, + * we need to make sure this queue is serviced soon. + * + * - If this is another cpu queue, link it to our rps_ipi_list, + * and make sure we will process rps_ipi_list from net_rx_action(). + * + * - If this is our own queue, NAPI schedule our backlog. + * Note that this also raises NET_RX_SOFTIRQ. */ -static int rps_ipi_queued(struct softnet_data *sd) +static void napi_schedule_rps(struct softnet_data *sd) { -#ifdef CONFIG_RPS - struct softnet_data *mysd = &__get_cpu_var(softnet_data); + struct softnet_data *mysd = this_cpu_ptr(&softnet_data); +#ifdef CONFIG_RPS if (sd != mysd) { + if (use_backlog_threads()) { + __napi_schedule_irqoff(&sd->backlog); + return; + } + sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - return 1; + /* If not called from net_rx_action() or napi_threaded_poll() + * we have to raise NET_RX_SOFTIRQ. + */ + if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + return; } #endif /* CONFIG_RPS */ - return 0; + __napi_schedule_irqoff(&mysd->backlog); +} + +void kick_defer_list_purge(unsigned int cpu) +{ + struct softnet_data *sd = &per_cpu(softnet_data, cpu); + unsigned long flags; + + if (use_backlog_threads()) { + backlog_lock_irq_save(sd, &flags); + + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + __napi_schedule_irqoff(&sd->backlog); + + backlog_unlock_irq_restore(sd, &flags); + + } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { + smp_call_function_single_async(cpu, &sd->defer_csd); + } } #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif -static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen, + int max_backlog) { #ifdef CONFIG_NET_FLOW_LIMIT - struct sd_flow_limit *fl; - struct softnet_data *sd; unsigned int old_flow, new_flow; + const struct softnet_data *sd; + struct sd_flow_limit *fl; - if (qlen < (netdev_max_backlog >> 1)) + if (likely(qlen < (max_backlog >> 1))) return false; - sd = &__get_cpu_var(softnet_data); + sd = this_cpu_ptr(&softnet_data); rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { - new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1); + new_flow = hash_32(skb_get_hash(skb), fl->log_buckets); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; @@ -3154,7 +5283,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { - fl->count++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(fl->count, fl->count + 1); rcu_read_unlock(); return true; } @@ -3171,78 +5301,335 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { + enum skb_drop_reason reason; struct softnet_data *sd; unsigned long flags; unsigned int qlen; + int max_backlog; + u32 tail; - sd = &per_cpu(softnet_data, cpu); + reason = SKB_DROP_REASON_DEV_READY; + if (unlikely(!netif_running(skb->dev))) + goto bad_dev; - local_irq_save(flags); + sd = &per_cpu(softnet_data, cpu); - rps_lock(sd); + qlen = skb_queue_len_lockless(&sd->input_pkt_queue); + max_backlog = READ_ONCE(net_hotdata.max_backlog); + if (unlikely(qlen > max_backlog) || + skb_flow_limit(skb, qlen, max_backlog)) + goto cpu_backlog_drop; + backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { - if (skb_queue_len(&sd->input_pkt_queue)) { -enqueue: - __skb_queue_tail(&sd->input_pkt_queue, skb); - input_queue_tail_incr_save(sd, qtail); - rps_unlock(sd); - local_irq_restore(flags); - return NET_RX_SUCCESS; - } - - /* Schedule NAPI for backlog device - * We can use non atomic operation since we own the queue lock - */ - if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { - if (!rps_ipi_queued(sd)) - ____napi_schedule(sd, &sd->backlog); + if (likely(qlen <= max_backlog)) { + if (!qlen) { + /* Schedule NAPI for backlog device. We can use + * non atomic operation as we own the queue lock. + */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, + &sd->backlog.state)) + napi_schedule_rps(sd); } - goto enqueue; - } + __skb_queue_tail(&sd->input_pkt_queue, skb); + tail = rps_input_queue_tail_incr(sd); + backlog_unlock_irq_restore(sd, &flags); - sd->dropped++; - rps_unlock(sd); + /* save the tail outside of the critical section */ + rps_input_queue_tail_save(qtail, tail); + return NET_RX_SUCCESS; + } - local_irq_restore(flags); + backlog_unlock_irq_restore(sd, &flags); - atomic_long_inc(&skb->dev->rx_dropped); - kfree_skb(skb); +cpu_backlog_drop: + reason = SKB_DROP_REASON_CPU_BACKLOG; + numa_drop_add(&sd->drop_counters, 1); +bad_dev: + dev_core_stats_rx_dropped_inc(skb->dev); + kfree_skb_reason(skb, reason); return NET_RX_DROP; } -/** - * netif_rx - post buffer to the network code - * @skb: buffer to post - * - * This function receives a packet from a device driver and queues it for - * the upper (protocol) levels to process. It always succeeds. The buffer - * may be dropped during processing for congestion control or by the - * protocol layers. - * - * return values: - * NET_RX_SUCCESS (no congestion) - * NET_RX_DROP (packet was dropped) - * +static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct netdev_rx_queue *rxqueue; + + rxqueue = dev->_rx; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + + return rxqueue; /* Return first rxqueue */ + } + rxqueue += index; + } + return rxqueue; +} + +u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) +{ + void *orig_data, *orig_data_end, *hard_start; + struct netdev_rx_queue *rxqueue; + bool orig_bcast, orig_host; + u32 mac_len, frame_sz; + __be16 orig_eth_type; + struct ethhdr *eth; + u32 metalen, act; + int off; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + frame_sz = (void *)skb_end_pointer(skb) - hard_start; + frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + rxqueue = netif_get_rxqueue(skb); + xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); + xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, + skb_headlen(skb) + mac_len, true); + if (skb_is_nonlinear(skb)) { + skb_shinfo(skb)->xdp_frags_size = skb->data_len; + xdp_buff_set_frags_flag(xdp); + } else { + xdp_buff_clear_frags_flag(xdp); + } + + orig_data_end = xdp->data_end; + orig_data = xdp->data; + eth = (struct ethhdr *)xdp->data; + orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); + orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); + orig_eth_type = eth->h_proto; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + + /* check if bpf_xdp_adjust_head was used */ + off = xdp->data - orig_data; + if (off) { + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + skb->mac_header += off; + skb_reset_network_header(skb); + } + + /* check if bpf_xdp_adjust_tail was used */ + off = xdp->data_end - orig_data_end; + if (off != 0) { + skb_set_tail_pointer(skb, xdp->data_end - xdp->data); + skb->len += off; /* positive on grow, negative on shrink */ + } + + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. + */ + if (xdp_buff_has_frags(xdp)) + skb->data_len = skb_shinfo(skb)->xdp_frags_size; + else + skb->data_len = 0; + + /* check if XDP changed eth hdr such SKB needs update */ + eth = (struct ethhdr *)xdp->data; + if ((orig_eth_type != eth->h_proto) || + (orig_host != ether_addr_equal_64bits(eth->h_dest, + skb->dev->dev_addr)) || + (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { + __skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, skb->dev); + } + + /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull + * before calling us again on redirect path. We do not call do_redirect + * as we leave that up to the caller. + * + * Caller is responsible for managing lifetime of skb (i.e. calling + * kfree_skb in response to actions it cannot handle/XDP_DROP). + */ + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + __skb_push(skb, mac_len); + break; + case XDP_PASS: + metalen = xdp->data - xdp->data_meta; + if (metalen) + skb_metadata_set(skb, metalen); + break; + } + + return act; +} + +static int +netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) +{ + struct sk_buff *skb = *pskb; + int err, hroom, troom; + + local_lock_nested_bh(&system_page_pool.bh_lock); + err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog); + local_unlock_nested_bh(&system_page_pool.bh_lock); + if (!err) + return 0; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + troom = skb->tail + skb->data_len - skb->end; + err = pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC); + if (err) + return err; + + return skb_linearize(skb); +} + +static u32 netif_receive_generic_xdp(struct sk_buff **pskb, + struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) +{ + struct sk_buff *skb = *pskb; + u32 mac_len, act = XDP_DROP; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_is_redirected(skb)) + return XDP_PASS; + + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM + * bytes. This is the guarantee that also native XDP provides, + * thus we need to do it here as well. + */ + mac_len = skb->data - skb_mac_header(skb); + __skb_push(skb, mac_len); + + if (skb_cloned(skb) || skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + if (netif_skb_check_for_xdp(pskb, xdp_prog)) + goto do_drop; + } + + __skb_pull(*pskb, mac_len); + + act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + case XDP_PASS: + break; + default: + bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception((*pskb)->dev, xdp_prog, act); + fallthrough; + case XDP_DROP: + do_drop: + kfree_skb(*pskb); + break; + } + + return act; +} + +/* When doing generic XDP we have to bypass the qdisc layer and the + * network taps in order to match in-driver-XDP behavior. This also means + * that XDP packets are able to starve other packets going through a qdisc, + * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX + * queues, so they do not have this starvation issue. */ +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + bool free_skb = true; + int cpu, rc; -int netif_rx(struct sk_buff *skb) + txq = netdev_core_pick_tx(dev, skb, NULL); + cpu = smp_processor_id(); + HARD_TX_LOCK(dev, txq, cpu); + if (!netif_xmit_frozen_or_drv_stopped(txq)) { + rc = netdev_start_xmit(skb, dev, txq, 0); + if (dev_xmit_complete(rc)) + free_skb = false; + } + HARD_TX_UNLOCK(dev, txq); + if (free_skb) { + trace_xdp_exception(dev, xdp_prog, XDP_TX); + dev_core_stats_tx_dropped_inc(dev); + kfree_skb(skb); + } +} + +static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); + +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { - int ret; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + + if (xdp_prog) { + struct xdp_buff xdp; + u32 act; + int err; - /* if netpoll wants it, pretend we never saw it */ - if (netpoll_rx(skb)) - return NET_RX_DROP; + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); + if (act != XDP_PASS) { + switch (act) { + case XDP_REDIRECT: + err = xdp_do_generic_redirect((*pskb)->dev, *pskb, + &xdp, xdp_prog); + if (err) + goto out_redir; + break; + case XDP_TX: + generic_xdp_tx(*pskb, xdp_prog); + break; + } + bpf_net_ctx_clear(bpf_net_ctx); + return XDP_DROP; + } + bpf_net_ctx_clear(bpf_net_ctx); + } + return XDP_PASS; +out_redir: + bpf_net_ctx_clear(bpf_net_ctx); + kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); + return XDP_DROP; +} +EXPORT_SYMBOL_GPL(do_xdp_generic); + +static int netif_rx_internal(struct sk_buff *skb) +{ + int ret; - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_rx(skb); + #ifdef CONFIG_RPS - if (static_key_false(&rps_needed)) { + if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -3252,35 +5639,75 @@ int netif_rx(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); } else #endif { unsigned int qtail; - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); + + ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } -EXPORT_SYMBOL(netif_rx); -int netif_rx_ni(struct sk_buff *skb) +/** + * __netif_rx - Slightly optimized version of netif_rx + * @skb: buffer to post + * + * This behaves as netif_rx except that it does not disable bottom halves. + * As a result this function may only be invoked from the interrupt context + * (either hard or soft interrupt). + */ +int __netif_rx(struct sk_buff *skb) { - int err; + int ret; - preempt_disable(); - err = netif_rx(skb); - if (local_softirq_pending()) - do_softirq(); - preempt_enable(); + lockdep_assert_once(hardirq_count() | softirq_count()); - return err; + trace_netif_rx_entry(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + return ret; } -EXPORT_SYMBOL(netif_rx_ni); +EXPORT_SYMBOL(__netif_rx); + +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process via the backlog NAPI device. It + * always succeeds. The buffer may be dropped during processing for + * congestion control or by the protocol layers. + * The network buffer is passed via the backlog NAPI device. Modern NIC + * driver should use NAPI and GRO. + * This function can used from interrupt and from process context. The + * caller from process context must not disable interrupts before invoking + * this function. + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + */ +int netif_rx(struct sk_buff *skb) +{ + bool need_bh_off = !(hardirq_count() | softirq_count()); + int ret; + + if (need_bh_off) + local_bh_disable(); + trace_netif_rx_entry(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + if (need_bh_off) + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL(netif_rx); -static void net_tx_action(struct softirq_action *h) +static __latent_entropy void net_tx_action(void) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { struct sk_buff *clist; @@ -3292,11 +5719,21 @@ static void net_tx_action(struct softirq_action *h) while (clist) { struct sk_buff *skb = clist; + clist = clist->next; - WARN_ON(atomic_read(&skb->users)); - trace_kfree_skb(skb, net_tx_action); - __kfree_skb(skb); + WARN_ON(refcount_read(&skb->users)); + if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED)) + trace_consume_skb(skb, net_tx_action); + else + trace_kfree_skb(skb, net_tx_action, + get_kfree_skb_cb(skb)->reason, NULL); + + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __napi_kfree_skb(skb, + get_kfree_skb_cb(skb)->reason); } } @@ -3309,103 +5746,72 @@ static void net_tx_action(struct softirq_action *h) sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); + rcu_read_lock(); + while (head) { + spinlock_t *root_lock = NULL; + struct sk_buff *to_free; struct Qdisc *q = head; - spinlock_t *root_lock; head = head->next_sched; - root_lock = qdisc_lock(q); - if (spin_trylock(root_lock)) { - smp_mb__before_clear_bit(); - clear_bit(__QDISC_STATE_SCHED, - &q->state); - qdisc_run(q); - spin_unlock(root_lock); - } else { - if (!test_bit(__QDISC_STATE_DEACTIVATED, - &q->state)) { - __netif_reschedule(q); - } else { - smp_mb__before_clear_bit(); - clear_bit(__QDISC_STATE_SCHED, - &q->state); - } + /* We need to make sure head->next_sched is read + * before clearing __QDISC_STATE_SCHED + */ + smp_mb__before_atomic(); + + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, + &q->state))) { + /* There is a synchronize_net() between + * STATE_DEACTIVATED flag being set and + * qdisc_reset()/some_qdisc_is_busy() in + * dev_deactivate(), so we can safely bail out + * early here to avoid data race between + * qdisc_deactivate() and some_qdisc_is_busy() + * for lockless qdisc. + */ + clear_bit(__QDISC_STATE_SCHED, &q->state); + continue; } + + clear_bit(__QDISC_STATE_SCHED, &q->state); + to_free = qdisc_run(q); + if (root_lock) + spin_unlock(root_lock); + tcf_kfree_skb_list(to_free); } + + rcu_read_unlock(); } + + xfrm_dev_backlog(sd); } -#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ - (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) +#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -#ifdef CONFIG_NET_CLS_ACT -/* TODO: Maybe we should just force sch_ingress to be compiled in - * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions - * a compare and 2 stores extra right now if we dont have it on - * but have CONFIG_NET_CLS_ACT - * NOTE: This doesn't stop any functionality; if you dont have - * the ingress scheduler, you just can't add policies on ingress. +/** + * netdev_is_rx_handler_busy - check if receive handler is registered + * @dev: device to check * + * Check if a receive handler is already registered for a given device. + * Return true if there one. + * + * The caller must hold the rtnl_mutex. */ -static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) -{ - struct net_device *dev = skb->dev; - u32 ttl = G_TC_RTTL(skb->tc_verd); - int result = TC_ACT_OK; - struct Qdisc *q; - - if (unlikely(MAX_RED_LOOP < ttl++)) { - net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n", - skb->skb_iif, dev->ifindex); - return TC_ACT_SHOT; - } - - skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - - q = rxq->qdisc; - if (q != &noop_qdisc) { - spin_lock(qdisc_lock(q)); - if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) - result = qdisc_enqueue_root(skb, q); - spin_unlock(qdisc_lock(q)); - } - - return result; -} - -static inline struct sk_buff *handle_ing(struct sk_buff *skb, - struct packet_type **pt_prev, - int *ret, struct net_device *orig_dev) +bool netdev_is_rx_handler_busy(struct net_device *dev) { - struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - - if (!rxq || rxq->qdisc == &noop_qdisc) - goto out; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - switch (ing_filter(skb, rxq)) { - case TC_ACT_SHOT: - case TC_ACT_STOLEN: - kfree_skb(skb); - return NULL; - } - -out: - skb->tc_verd = 0; - return skb; + ASSERT_RTNL(); + return dev && rtnl_dereference(dev->rx_handler); } -#endif +EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); /** * netdev_rx_handler_register - register receive handler @@ -3413,7 +5819,7 @@ out: * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * - * Register a receive hander for a device. This handler will then be + * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * @@ -3425,11 +5831,12 @@ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { - ASSERT_RTNL(); - - if (dev->rx_handler) + if (netdev_is_rx_handler_busy(dev)) return -EBUSY; + if (dev->priv_flags & IFF_NO_RX_HANDLER) + return -EINVAL; + /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); @@ -3467,112 +5874,162 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_ARP): - case __constant_htons(ETH_P_IP): - case __constant_htons(ETH_P_IPV6): - case __constant_htons(ETH_P_8021Q): - case __constant_htons(ETH_P_8021AD): + case htons(ETH_P_ARP): + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): return true; default: return false; } } -static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) +static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + if (nf_hook_ingress_active(skb)) { + int ingress_retval; + + if (unlikely(*pt_prev)) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + rcu_read_lock(); + ingress_retval = nf_hook_ingress(skb); + rcu_read_unlock(); + return ingress_retval; + } + return 0; +} + +static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, + struct packet_type **ppt_prev) { + enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO; struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; + struct sk_buff *skb = *pskb; struct net_device *orig_dev; - struct net_device *null_or_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; - net_timestamp_check(!netdev_tstamp_prequeue, skb); + net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_receive_skb(skb); - /* if we've gotten here through NAPI, check netpoll */ - if (netpoll_receive_skb(skb)) - goto out; - orig_dev = skb->dev; skb_reset_network_header(skb); +#if !defined(CONFIG_DEBUG_NET) + /* We plan to no longer reset the transport header here. + * Give some time to fuzzers and dev build to catch bugs + * in network stacks. + */ if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); +#endif skb_reset_mac_len(skb); pt_prev = NULL; - rcu_read_lock(); - another_round: skb->skb_iif = skb->dev->ifindex; __this_cpu_inc(softnet_data.processed); - if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || - skb->protocol == cpu_to_be16(ETH_P_8021AD)) { - skb = vlan_untag(skb); - if (unlikely(!skb)) - goto unlock; + if (static_branch_unlikely(&generic_xdp_needed_key)) { + int ret2; + + migrate_disable(); + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + &skb); + migrate_enable(); + + if (ret2 != XDP_PASS) { + ret = NET_RX_DROP; + goto out; + } } -#ifdef CONFIG_NET_CLS_ACT - if (skb->tc_verd & TC_NCLS) { - skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); - goto ncls; + if (eth_type_vlan(skb->protocol)) { + skb = skb_vlan_untag(skb); + if (unlikely(!skb)) + goto out; } -#endif + + if (skb_skip_tc_classify(skb)) + goto skip_classify; if (pfmemalloc) goto skip_taps; - list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (!ptype->dev || ptype->dev == skb->dev) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } + list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all, + list) { + if (unlikely(pt_prev)) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + + list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { + if (unlikely(pt_prev)) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; } skip_taps: -#ifdef CONFIG_NET_CLS_ACT - skb = handle_ing(skb, &pt_prev, &ret, orig_dev); - if (!skb) - goto unlock; -ncls: -#endif +#ifdef CONFIG_NET_INGRESS + if (static_branch_unlikely(&ingress_needed_key)) { + bool another = false; + + nf_skip_egress(skb, true); + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, + &another); + if (another) + goto another_round; + if (!skb) + goto out; - if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) + nf_skip_egress(skb, false); + if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) + goto out; + } +#endif + skb_reset_redirect(skb); +skip_classify: + if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) { + drop_reason = SKB_DROP_REASON_PFMEMALLOC; goto drop; + } - if (vlan_tx_tag_present(skb)) { - if (pt_prev) { + if (skb_vlan_tag_present(skb)) { + if (unlikely(pt_prev)) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) - goto unlock; + goto out; } rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { - if (pt_prev) { + if (unlikely(pt_prev)) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; - goto unlock; + goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; + break; case RX_HANDLER_PASS: break; default: @@ -3580,51 +6037,202 @@ ncls: } } - if (vlan_tx_nonzero_tag_present(skb)) - skb->pkt_type = PACKET_OTHERHOST; + if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { +check_vlan_id: + if (skb_vlan_tag_get_id(skb)) { + /* Vlan id is non 0 and vlan_do_receive() above couldn't + * find vlan device. + */ + skb->pkt_type = PACKET_OTHERHOST; + } else if (eth_type_vlan(skb->protocol)) { + /* Outer header is 802.1P with vlan 0, inner header is + * 802.1Q or 802.1AD and vlan_do_receive() above could + * not find vlan dev for vlan id 0. + */ + __vlan_hwaccel_clear_tag(skb); + skb = skb_vlan_untag(skb); + if (unlikely(!skb)) + goto out; + if (vlan_do_receive(&skb)) + /* After stripping off 802.1P header with vlan 0 + * vlan dev is found for inner header. + */ + goto another_round; + else if (unlikely(!skb)) + goto out; + else + /* We have stripped outer 802.1P vlan 0 header. + * But could not find vlan dev. + * check again for vlan id to set OTHERHOST. + */ + goto check_vlan_id; + } + /* Note: we might in the future use prio bits + * and set skb->priority like in vlan_do_receive() + * For the time being, just ignore Priority Code Point + */ + __vlan_hwaccel_clear_tag(skb); + } + + type = skb->protocol; /* deliver only exact match when indicated */ - null_or_dev = deliver_exact ? skb->dev : NULL; + if (likely(!deliver_exact)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &ptype_base[ntohs(type) & + PTYPE_HASH_MASK]); + + /* orig_dev and skb->dev could belong to different netns; + * Even in such case we need to traverse only the list + * coming from skb->dev, as the ptype owner (packet socket) + * will use dev_net(skb->dev) to do namespace filtering. + */ + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &dev_net_rcu(skb->dev)->ptype_specific); + } - type = skb->protocol; - list_for_each_entry_rcu(ptype, - &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && - (ptype->dev == null_or_dev || ptype->dev == skb->dev || - ptype->dev == orig_dev)) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &orig_dev->ptype_specific); + + if (unlikely(skb->dev != orig_dev)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &skb->dev->ptype_specific); } if (pt_prev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) - goto drop; - else - ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + *ppt_prev = pt_prev; } else { drop: - atomic_long_inc(&skb->dev->rx_dropped); - kfree_skb(skb); + if (!deliver_exact) + dev_core_stats_rx_dropped_inc(skb->dev); + else + dev_core_stats_rx_nohandler_inc(skb->dev); + + kfree_skb_reason(skb, drop_reason); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ ret = NET_RX_DROP; } -unlock: - rcu_read_unlock(); out: + /* The invariant here is that if *ppt_prev is not NULL + * then skb should also be non-NULL. + * + * Apparently *ppt_prev assignment above holds this invariant due to + * skb dereferencing near it. + */ + *pskb = skb; + return ret; +} + +static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) +{ + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + int ret; + + ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); + if (pt_prev) + ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, + skb->dev, pt_prev, orig_dev); return ret; } +/** + * netif_receive_skb_core - special purpose version of netif_receive_skb + * @skb: buffer to process + * + * More direct receive version of netif_receive_skb(). It should + * only be used by callers that have a need to skip RPS and Generic XDP. + * Caller must also take care of handling if ``(page_is_)pfmemalloc``. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb_core(struct sk_buff *skb) +{ + int ret; + + rcu_read_lock(); + ret = __netif_receive_skb_one_core(skb, false); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(netif_receive_skb_core); + +static inline void __netif_receive_skb_list_ptype(struct list_head *head, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + struct sk_buff *skb, *next; + + if (!pt_prev) + return; + if (list_empty(head)) + return; + if (pt_prev->list_func != NULL) + INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, + ip_list_rcv, head, pt_prev, orig_dev); + else + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); + pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + } +} + +static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) +{ + /* Fast-path assumptions: + * - There is no RX handler. + * - Only one packet_type matches. + * If either of these fails, we will end up doing some per-packet + * processing in-line, then handling the 'last ptype' for the whole + * sublist. This can't cause out-of-order delivery to any single ptype, + * because the 'last ptype' must be constant across the sublist, and all + * other ptypes are handled per-packet. + */ + /* Current (common) ptype of sublist */ + struct packet_type *pt_curr = NULL; + /* Current (common) orig_dev of sublist */ + struct net_device *od_curr = NULL; + struct sk_buff *skb, *next; + LIST_HEAD(sublist); + + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + + skb_list_del_init(skb); + __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); + if (!pt_prev) + continue; + if (pt_curr != pt_prev || od_curr != orig_dev) { + /* dispatch old sublist */ + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + pt_curr = pt_prev; + od_curr = orig_dev; + } + list_add_tail(&skb->list, &sublist); + } + + /* dispatch final sublist */ + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); +} + static int __netif_receive_skb(struct sk_buff *skb) { int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should @@ -3635,15 +6243,134 @@ static int __netif_receive_skb(struct sk_buff *skb) * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ - current->flags |= PF_MEMALLOC; - ret = __netif_receive_skb_core(skb, true); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + noreclaim_flag = memalloc_noreclaim_save(); + ret = __netif_receive_skb_one_core(skb, true); + memalloc_noreclaim_restore(noreclaim_flag); } else - ret = __netif_receive_skb_core(skb, false); + ret = __netif_receive_skb_one_core(skb, false); return ret; } +static void __netif_receive_skb_list(struct list_head *head) +{ + unsigned long noreclaim_flag = 0; + struct sk_buff *skb, *next; + bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */ + + list_for_each_entry_safe(skb, next, head, list) { + if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { + struct list_head sublist; + + /* Handle the previous sublist */ + list_cut_before(&sublist, head, &skb->list); + if (!list_empty(&sublist)) + __netif_receive_skb_list_core(&sublist, pfmemalloc); + pfmemalloc = !pfmemalloc; + /* See comments in __netif_receive_skb */ + if (pfmemalloc) + noreclaim_flag = memalloc_noreclaim_save(); + else + memalloc_noreclaim_restore(noreclaim_flag); + } + } + /* Handle the remaining sublist */ + if (!list_empty(head)) + __netif_receive_skb_list_core(head, pfmemalloc); + /* Restore pflags */ + if (pfmemalloc) + memalloc_noreclaim_restore(noreclaim_flag); +} + +static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) +{ + struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); + struct bpf_prog *new = xdp->prog; + int ret = 0; + + switch (xdp->command) { + case XDP_SETUP_PROG: + rcu_assign_pointer(dev->xdp_prog, new); + if (old) + bpf_prog_put(old); + + if (old && !new) { + static_branch_dec(&generic_xdp_needed_key); + } else if (new && !old) { + static_branch_inc(&generic_xdp_needed_key); + netif_disable_lro(dev); + dev_disable_gro_hw(dev); + } + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int netif_receive_skb_internal(struct sk_buff *skb) +{ + int ret; + + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); + + if (skb_defer_rx_timestamp(skb)) + return NET_RX_SUCCESS; + + rcu_read_lock(); +#ifdef CONFIG_RPS + if (static_branch_unlikely(&rps_needed)) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + rcu_read_unlock(); + return ret; + } + } +#endif + ret = __netif_receive_skb(skb); + rcu_read_unlock(); + return ret; +} + +void netif_receive_skb_list_internal(struct list_head *head) +{ + struct sk_buff *skb, *next; + LIST_HEAD(sublist); + + list_for_each_entry_safe(skb, next, head, list) { + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), + skb); + skb_list_del_init(skb); + if (!skb_defer_rx_timestamp(skb)) + list_add_tail(&skb->list, &sublist); + } + list_splice_init(&sublist, head); + + rcu_read_lock(); +#ifdef CONFIG_RPS + if (static_branch_unlikely(&rps_needed)) { + list_for_each_entry_safe(skb, next, head, list) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + /* Will be handled, remove from list */ + skb_list_del_init(skb); + enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + } + } + } +#endif + __netif_receive_skb_list(head); + rcu_read_unlock(); +} + /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process @@ -3661,758 +6388,1565 @@ static int __netif_receive_skb(struct sk_buff *skb) */ int netif_receive_skb(struct sk_buff *skb) { - net_timestamp_check(netdev_tstamp_prequeue, skb); + int ret; - if (skb_defer_rx_timestamp(skb)) - return NET_RX_SUCCESS; + trace_netif_receive_skb_entry(skb); -#ifdef CONFIG_RPS - if (static_key_false(&rps_needed)) { - struct rps_dev_flow voidflow, *rflow = &voidflow; - int cpu, ret; + ret = netif_receive_skb_internal(skb); + trace_netif_receive_skb_exit(ret); - rcu_read_lock(); + return ret; +} +EXPORT_SYMBOL(netif_receive_skb); - cpu = get_rps_cpu(skb->dev, skb, &rflow); +/** + * netif_receive_skb_list - process many receive buffers from network + * @head: list of skbs to process. + * + * Since return value of netif_receive_skb() is normally ignored, and + * wouldn't be meaningful for a list, this function returns void. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + */ +void netif_receive_skb_list(struct list_head *head) +{ + struct sk_buff *skb; - if (cpu >= 0) { - ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); - rcu_read_unlock(); - return ret; - } - rcu_read_unlock(); + if (list_empty(head)) + return; + if (trace_netif_receive_skb_list_entry_enabled()) { + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); } -#endif - return __netif_receive_skb(skb); + netif_receive_skb_list_internal(head); + trace_netif_receive_skb_list_exit(0); } -EXPORT_SYMBOL(netif_receive_skb); +EXPORT_SYMBOL(netif_receive_skb_list); -/* Network device is going away, flush any packets still pending - * Called with irqs disabled. - */ -static void flush_backlog(void *arg) +/* Network device is going away, flush any packets still pending */ +static void flush_backlog(struct work_struct *work) { - struct net_device *dev = arg; - struct softnet_data *sd = &__get_cpu_var(softnet_data); struct sk_buff *skb, *tmp; + struct sk_buff_head list; + struct softnet_data *sd; + + __skb_queue_head_init(&list); + local_bh_disable(); + sd = this_cpu_ptr(&softnet_data); - rps_lock(sd); + backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { - if (skb->dev == dev) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); - kfree_skb(skb); - input_queue_head_incr(sd); + __skb_queue_tail(&list, skb); + rps_input_queue_head_incr(sd); } } - rps_unlock(sd); + backlog_unlock_irq_enable(sd); + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { - if (skb->dev == dev) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); - kfree_skb(skb); - input_queue_head_incr(sd); + __skb_queue_tail(&list, skb); + rps_input_queue_head_incr(sd); } } + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); + local_bh_enable(); + + __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY); } -static int napi_gro_complete(struct sk_buff *skb) +static bool flush_required(int cpu) { - struct packet_offload *ptype; - __be16 type = skb->protocol; - struct list_head *head = &offload_base; - int err = -ENOENT; +#if IS_ENABLED(CONFIG_RPS) + struct softnet_data *sd = &per_cpu(softnet_data, cpu); + bool do_flush; - BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); + backlog_lock_irq_disable(sd); - if (NAPI_GRO_CB(skb)->count == 1) { - skb_shinfo(skb)->gso_size = 0; - goto out; - } + /* as insertion into process_queue happens with the rps lock held, + * process_queue access may race only with dequeue + */ + do_flush = !skb_queue_empty(&sd->input_pkt_queue) || + !skb_queue_empty_lockless(&sd->process_queue); + backlog_unlock_irq_enable(sd); - rcu_read_lock(); - list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || !ptype->callbacks.gro_complete) - continue; + return do_flush; +#endif + /* without RPS we can't safely check input_pkt_queue: during a + * concurrent remote skb_queue_splice() we can detect as empty both + * input_pkt_queue and process_queue even if the latter could end-up + * containing a lot of packets. + */ + return true; +} - err = ptype->callbacks.gro_complete(skb); - break; +struct flush_backlogs { + cpumask_t flush_cpus; + struct work_struct w[]; +}; + +static struct flush_backlogs *flush_backlogs_alloc(void) +{ + return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids), + GFP_KERNEL); +} + +static struct flush_backlogs *flush_backlogs_fallback; +static DEFINE_MUTEX(flush_backlogs_mutex); + +static void flush_all_backlogs(void) +{ + struct flush_backlogs *ptr = flush_backlogs_alloc(); + unsigned int cpu; + + if (!ptr) { + mutex_lock(&flush_backlogs_mutex); + ptr = flush_backlogs_fallback; } - rcu_read_unlock(); + cpumask_clear(&ptr->flush_cpus); - if (err) { - WARN_ON(&ptype->list == head); - kfree_skb(skb); - return NET_RX_SUCCESS; + cpus_read_lock(); + + for_each_online_cpu(cpu) { + if (flush_required(cpu)) { + INIT_WORK(&ptr->w[cpu], flush_backlog); + queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]); + __cpumask_set_cpu(cpu, &ptr->flush_cpus); + } } -out: - return netif_receive_skb(skb); + /* we can have in flight packet[s] on the cpus we are not flushing, + * synchronize_net() in unregister_netdevice_many() will take care of + * them. + */ + for_each_cpu(cpu, &ptr->flush_cpus) + flush_work(&ptr->w[cpu]); + + cpus_read_unlock(); + + if (ptr != flush_backlogs_fallback) + kfree(ptr); + else + mutex_unlock(&flush_backlogs_mutex); } -/* napi->gro_list contains packets ordered by age. - * youngest packets at the head of it. - * Complete skbs in reverse order to reduce latencies. - */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) +static void net_rps_send_ipi(struct softnet_data *remsd) { - struct sk_buff *skb, *prev = NULL; +#ifdef CONFIG_RPS + while (remsd) { + struct softnet_data *next = remsd->rps_ipi_next; - /* scan list and build reverse chain */ - for (skb = napi->gro_list; skb != NULL; skb = skb->next) { - skb->prev = prev; - prev = skb; + if (cpu_online(remsd->cpu)) + smp_call_function_single_async(remsd->cpu, &remsd->csd); + remsd = next; } +#endif +} + +/* + * net_rps_action_and_irq_enable sends any pending IPI's for rps. + * Note: called with local irq disabled, but exits with local irq enabled. + */ +static void net_rps_action_and_irq_enable(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *remsd = sd->rps_ipi_list; - for (skb = prev; skb; skb = prev) { - skb->next = NULL; + if (!use_backlog_threads() && remsd) { + sd->rps_ipi_list = NULL; - if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) - return; + local_irq_enable(); - prev = skb->prev; - napi_gro_complete(skb); - napi->gro_count--; - } + /* Send pending IPI's to kick RPS processing on remote cpus. */ + net_rps_send_ipi(remsd); + } else +#endif + local_irq_enable(); +} - napi->gro_list = NULL; +static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return !use_backlog_threads() && sd->rps_ipi_list; +#else + return false; +#endif } -EXPORT_SYMBOL(napi_gro_flush); -static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) +static int process_backlog(struct napi_struct *napi, int quota) { - struct sk_buff *p; - unsigned int maclen = skb->dev->hard_header_len; + struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); + bool again = true; + int work = 0; + + /* Check if we have pending ipi, its better to send them now, + * not waiting net_rx_action() end. + */ + if (sd_has_rps_ipi_waiting(sd)) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } + + napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); + while (again) { + struct sk_buff *skb; + + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); + while ((skb = __skb_dequeue(&sd->process_queue))) { + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); + rcu_read_lock(); + __netif_receive_skb(skb); + rcu_read_unlock(); + if (++work >= quota) { + rps_input_queue_head_add(sd, work); + return work; + } - for (p = napi->gro_list; p; p = p->next) { - unsigned long diffs; + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); + } + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); - diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= p->vlan_tci ^ skb->vlan_tci; - if (maclen == ETH_HLEN) - diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); - else if (!diffs) - diffs = memcmp(skb_mac_header(p), - skb_gro_mac_header(skb), - maclen); - NAPI_GRO_CB(p)->same_flow = !diffs; - NAPI_GRO_CB(p)->flush = 0; + backlog_lock_irq_disable(sd); + if (skb_queue_empty(&sd->input_pkt_queue)) { + /* + * Inline a custom version of __napi_complete(). + * only current cpu owns and manipulates this napi, + * and NAPI_STATE_SCHED is the only possible flag set + * on backlog. + * We can use a plain write instead of clear_bit(), + * and we dont need an smp_mb() memory barrier. + */ + napi->state &= NAPIF_STATE_THREADED; + again = false; + } else { + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); + } + backlog_unlock_irq_enable(sd); } + + if (work) + rps_input_queue_head_add(sd, work); + return work; } -static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +/** + * __napi_schedule - schedule for receive + * @n: entry to schedule + * + * The entry's receive function will be scheduled to run. + * Consider using __napi_schedule_irqoff() if hard irqs are masked. + */ +void __napi_schedule(struct napi_struct *n) { - struct sk_buff **pp = NULL; - struct packet_offload *ptype; - __be16 type = skb->protocol; - struct list_head *head = &offload_base; - int same_flow; - enum gro_result ret; + unsigned long flags; - if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) - goto normal; + local_irq_save(flags); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__napi_schedule); - if (skb_is_gso(skb) || skb_has_frag_list(skb)) - goto normal; +/** + * napi_schedule_prep - check if napi can be scheduled + * @n: napi context + * + * Test if NAPI routine is already running, and if not mark + * it as running. This is used as a condition variable to + * insure only one NAPI poll instance runs. We also make + * sure there is no pending NAPI disable. + */ +bool napi_schedule_prep(struct napi_struct *n) +{ + unsigned long new, val = READ_ONCE(n->state); - gro_list_prepare(napi, skb); + do { + if (unlikely(val & NAPIF_STATE_DISABLE)) + return false; + new = val | NAPIF_STATE_SCHED; - rcu_read_lock(); - list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || !ptype->callbacks.gro_receive) - continue; + /* Sets STATE_MISSED bit if STATE_SCHED was already set + * This was suggested by Alexander Duyck, as compiler + * emits better code than : + * if (val & NAPIF_STATE_SCHED) + * new |= NAPIF_STATE_MISSED; + */ + new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * + NAPIF_STATE_MISSED; + } while (!try_cmpxchg(&n->state, &val, new)); - skb_set_network_header(skb, skb_gro_offset(skb)); - skb_reset_mac_len(skb); - NAPI_GRO_CB(skb)->same_flow = 0; - NAPI_GRO_CB(skb)->flush = 0; - NAPI_GRO_CB(skb)->free = 0; + return !(val & NAPIF_STATE_SCHED); +} +EXPORT_SYMBOL(napi_schedule_prep); - pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); - break; - } - rcu_read_unlock(); +/** + * __napi_schedule_irqoff - schedule for receive + * @n: entry to schedule + * + * Variant of __napi_schedule() assuming hard irqs are masked. + * + * On PREEMPT_RT enabled kernels this maps to __napi_schedule() + * because the interrupt disabled assumption might not be true + * due to force-threaded interrupts and spinlock substitution. + */ +void __napi_schedule_irqoff(struct napi_struct *n) +{ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + else + __napi_schedule(n); +} +EXPORT_SYMBOL(__napi_schedule_irqoff); - if (&ptype->list == head) - goto normal; +bool napi_complete_done(struct napi_struct *n, int work_done) +{ + unsigned long flags, val, new, timeout = 0; + bool ret = true; - same_flow = NAPI_GRO_CB(skb)->same_flow; - ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; + /* + * 1) Don't let napi dequeue from the cpu poll list + * just in case its running on a different cpu. + * 2) If we are busy polling, do nothing here, we have + * the guarantee we will be called later. + */ + if (unlikely(n->state & (NAPIF_STATE_NPSVC | + NAPIF_STATE_IN_BUSY_POLL))) + return false; + + if (work_done) { + if (n->gro.bitmask) + timeout = napi_get_gro_flush_timeout(n); + n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); + } + if (n->defer_hard_irqs_count > 0) { + n->defer_hard_irqs_count--; + timeout = napi_get_gro_flush_timeout(n); + if (timeout) + ret = false; + } - if (pp) { - struct sk_buff *nskb = *pp; + /* + * When the NAPI instance uses a timeout and keeps postponing + * it, we need to bound somehow the time packets are kept in + * the GRO layer. + */ + gro_flush_normal(&n->gro, !!timeout); - *pp = nskb->next; - nskb->next = NULL; - napi_gro_complete(nskb); - napi->gro_count--; + if (unlikely(!list_empty(&n->poll_list))) { + /* If n->poll_list is not empty, we need to mask irqs */ + local_irq_save(flags); + list_del_init(&n->poll_list); + local_irq_restore(flags); } + WRITE_ONCE(n->list_owner, -1); - if (same_flow) - goto ok; + val = READ_ONCE(n->state); + do { + WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); - if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) - goto normal; + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_SCHED_THREADED | + NAPIF_STATE_PREFER_BUSY_POLL); - napi->gro_count++; - NAPI_GRO_CB(skb)->count = 1; - NAPI_GRO_CB(skb)->age = jiffies; - skb_shinfo(skb)->gso_size = skb_gro_len(skb); - skb->next = napi->gro_list; - napi->gro_list = skb; - ret = GRO_HELD; + /* If STATE_MISSED was set, leave STATE_SCHED set, + * because we will call napi->poll() one more time. + * This C code was suggested by Alexander Duyck to help gcc. + */ + new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * + NAPIF_STATE_SCHED; + } while (!try_cmpxchg(&n->state, &val, new)); -pull: - if (skb_headlen(skb) < skb_gro_offset(skb)) { - int grow = skb_gro_offset(skb) - skb_headlen(skb); + if (unlikely(val & NAPIF_STATE_MISSED)) { + __napi_schedule(n); + return false; + } - BUG_ON(skb->end - skb->tail < grow); + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + return ret; +} +EXPORT_SYMBOL(napi_complete_done); - memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); +static void skb_defer_free_flush(void) +{ + struct llist_node *free_list; + struct sk_buff *skb, *next; + struct skb_defer_node *sdn; + int node; - skb->tail += grow; - skb->data_len -= grow; + for_each_node(node) { + sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node; - skb_shinfo(skb)->frags[0].page_offset += grow; - skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow); + if (llist_empty(&sdn->defer_list)) + continue; + atomic_long_set(&sdn->defer_count, 0); + free_list = llist_del_all(&sdn->defer_list); - if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) { - skb_frag_unref(skb, 0); - memmove(skb_shinfo(skb)->frags, - skb_shinfo(skb)->frags + 1, - --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); + llist_for_each_entry_safe(skb, next, free_list, ll_node) { + prefetch(next); + napi_consume_skb(skb, 1); } } +} -ok: - return ret; +#if defined(CONFIG_NET_RX_BUSY_POLL) + +static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +{ + if (!skip_schedule) { + gro_normal_list(&napi->gro); + __napi_schedule(napi); + return; + } -normal: - ret = GRO_NORMAL; - goto pull; + /* Flush too old packets. If HZ < 1000, flush all packets */ + gro_flush_normal(&napi->gro, HZ >= 1000); + + clear_bit(NAPI_STATE_SCHED, &napi->state); } +enum { + NAPI_F_PREFER_BUSY_POLL = 1, + NAPI_F_END_ON_RESCHED = 2, +}; -static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, + unsigned flags, u16 budget) { - switch (ret) { - case GRO_NORMAL: - if (netif_receive_skb(skb)) - ret = GRO_DROP; - break; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + bool skip_schedule = false; + unsigned long timeout; + int rc; - case GRO_DROP: - kfree_skb(skb); - break; + /* Busy polling means there is a high chance device driver hard irq + * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was + * set in napi_schedule_prep(). + * Since we are about to call napi->poll() once more, we can safely + * clear NAPI_STATE_MISSED. + * + * Note: x86 could use a single "lock and ..." instruction + * to perform these two clear_bit() + */ + clear_bit(NAPI_STATE_MISSED, &napi->state); + clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); + + local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + + if (flags & NAPI_F_PREFER_BUSY_POLL) { + napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); + timeout = napi_get_gro_flush_timeout(napi); + if (napi->defer_hard_irqs_count && timeout) { + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); + skip_schedule = true; + } + } - case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) - kmem_cache_free(skbuff_head_cache, skb); - else - __kfree_skb(skb); - break; + /* All we really want here is to re-enable device interrupts. + * Ideally, a new ndo_busy_poll_stop() could avoid another round. + */ + rc = napi->poll(napi, budget); + /* We can't gro_normal_list() here, because napi->poll() might have + * rearmed the napi (napi_complete_done()) in which case it could + * already be running on another CPU. + */ + trace_napi_poll(napi, rc, budget); + netpoll_poll_unlock(have_poll_lock); + if (rc == budget) + __busy_poll_stop(napi, skip_schedule); + bpf_net_ctx_clear(bpf_net_ctx); + local_bh_enable(); +} + +static void __napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, unsigned flags, u16 budget) +{ + unsigned long start_time = loop_end ? busy_loop_current_time() : 0; + int (*napi_poll)(struct napi_struct *napi, int budget); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + void *have_poll_lock = NULL; + struct napi_struct *napi; - case GRO_HELD: - case GRO_MERGED: - break; + WARN_ON_ONCE(!rcu_read_lock_held()); + +restart: + napi_poll = NULL; + + napi = napi_by_id(napi_id); + if (!napi) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + for (;;) { + int work = 0; + + local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + if (!napi_poll) { + unsigned long val = READ_ONCE(napi->state); + + /* If multiple threads are competing for this napi, + * we avoid dirtying napi->state as much as we can. + */ + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | + NAPIF_STATE_IN_BUSY_POLL)) { + if (flags & NAPI_F_PREFER_BUSY_POLL) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); + goto count; + } + if (cmpxchg(&napi->state, val, + val | NAPIF_STATE_IN_BUSY_POLL | + NAPIF_STATE_SCHED) != val) { + if (flags & NAPI_F_PREFER_BUSY_POLL) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); + goto count; + } + have_poll_lock = netpoll_poll_lock(napi); + napi_poll = napi->poll; + } + work = napi_poll(napi, budget); + trace_napi_poll(napi, work, budget); + gro_normal_list(&napi->gro); +count: + if (work > 0) + __NET_ADD_STATS(dev_net(napi->dev), + LINUX_MIB_BUSYPOLLRXPACKETS, work); + skb_defer_free_flush(); + bpf_net_ctx_clear(bpf_net_ctx); + local_bh_enable(); + + if (!loop_end || loop_end(loop_end_arg, start_time)) + break; + + if (unlikely(need_resched())) { + if (flags & NAPI_F_END_ON_RESCHED) + break; + if (napi_poll) + busy_poll_stop(napi, have_poll_lock, flags, budget); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + if (loop_end(loop_end_arg, start_time)) + return; + goto restart; + } + cpu_relax(); } + if (napi_poll) + busy_poll_stop(napi, have_poll_lock, flags, budget); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); +} - return ret; +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = NAPI_F_END_ON_RESCHED; + + if (prefer_busy_poll) + flags |= NAPI_F_PREFER_BUSY_POLL; + + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); } -static void skb_gro_reset_offset(struct sk_buff *skb) +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) { - const struct skb_shared_info *pinfo = skb_shinfo(skb); - const skb_frag_t *frag0 = &pinfo->frags[0]; + unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0; - NAPI_GRO_CB(skb)->data_offset = 0; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; + rcu_read_lock(); + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); + rcu_read_unlock(); +} +EXPORT_SYMBOL(napi_busy_loop); + +void napi_suspend_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; - if (skb_mac_header(skb) == skb_tail_pointer(skb) && - pinfo->nr_frags && - !PageHighMem(skb_frag_page(frag0))) { - NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + unsigned long timeout = napi_get_irq_suspend_timeout(napi); + + if (timeout) + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); } + rcu_read_unlock(); } -gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +void napi_resume_irqs(unsigned int napi_id) { - skb_gro_reset_offset(skb); + struct napi_struct *napi; - return napi_skb_finish(dev_gro_receive(napi, skb), skb); + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + /* If irq_suspend_timeout is set to 0 between the call to + * napi_suspend_irqs and now, the original value still + * determines the safety timeout as intended and napi_watchdog + * will resume irq processing. + */ + if (napi_get_irq_suspend_timeout(napi)) { + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); + } + } + rcu_read_unlock(); } -EXPORT_SYMBOL(napi_gro_receive); -static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +static void __napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) { - __skb_pull(skb, skb_headlen(skb)); - /* restore the reserve we had after netdev_alloc_skb_ip_align() */ - skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); - skb->vlan_tci = 0; - skb->dev = napi->dev; - skb->skb_iif = 0; + napi->gro.cached_napi_id = napi_id; - napi->skb = skb; + WRITE_ONCE(napi->napi_id, napi_id); + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); } -struct sk_buff *napi_get_frags(struct napi_struct *napi) +static void napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) { - struct sk_buff *skb = napi->skb; + unsigned long flags; - if (!skb) { - skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); - if (skb) - napi->skb = skb; - } - return skb; + spin_lock_irqsave(&napi_hash_lock, flags); + WARN_ON_ONCE(napi_by_id(napi_id)); + __napi_hash_add_with_id(napi, napi_id); + spin_unlock_irqrestore(&napi_hash_lock, flags); } -EXPORT_SYMBOL(napi_get_frags); -static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, - gro_result_t ret) +static void napi_hash_add(struct napi_struct *napi) { - switch (ret) { - case GRO_NORMAL: - case GRO_HELD: - skb->protocol = eth_type_trans(skb, skb->dev); + unsigned long flags; - if (ret == GRO_HELD) - skb_gro_pull(skb, -ETH_HLEN); - else if (netif_receive_skb(skb)) - ret = GRO_DROP; - break; + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) + return; - case GRO_DROP: - case GRO_MERGED_FREE: - napi_reuse_skb(napi, skb); - break; + spin_lock_irqsave(&napi_hash_lock, flags); - case GRO_MERGED: - break; + /* 0..NR_CPUS range is reserved for sender_cpu use */ + do { + if (unlikely(!napi_id_valid(++napi_gen_id))) + napi_gen_id = MIN_NAPI_ID; + } while (napi_by_id(napi_gen_id)); + + __napi_hash_add_with_id(napi, napi_gen_id); + + spin_unlock_irqrestore(&napi_hash_lock, flags); +} + +/* Warning : caller is responsible to make sure rcu grace period + * is respected before freeing memory containing @napi + */ +static void napi_hash_del(struct napi_struct *napi) +{ + unsigned long flags; + + spin_lock_irqsave(&napi_hash_lock, flags); + + hlist_del_init_rcu(&napi->napi_hash_node); + + spin_unlock_irqrestore(&napi_hash_lock, flags); +} + +static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) +{ + struct napi_struct *napi; + + napi = container_of(timer, struct napi_struct, timer); + + /* Note : we use a relaxed variant of napi_schedule_prep() not setting + * NAPI_STATE_MISSED, since we do not react to a device IRQ. + */ + if (!napi_disable_pending(napi) && + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); + __napi_schedule_irqoff(napi); } - return ret; + return HRTIMER_NORESTART; } -static struct sk_buff *napi_frags_skb(struct napi_struct *napi) +static void napi_stop_kthread(struct napi_struct *napi) { - struct sk_buff *skb = napi->skb; - struct ethhdr *eth; - unsigned int hlen; - unsigned int off; + unsigned long val, new; - napi->skb = NULL; + /* Wait until the napi STATE_THREADED is unset. */ + while (true) { + val = READ_ONCE(napi->state); - skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); - - off = skb_gro_offset(skb); - hlen = off + sizeof(*eth); - eth = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - eth = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!eth)) { - napi_reuse_skb(napi, skb); - skb = NULL; - goto out; + /* If napi kthread own this napi or the napi is idle, + * STATE_THREADED can be unset here. + */ + if ((val & NAPIF_STATE_SCHED_THREADED) || + !(val & NAPIF_STATE_SCHED)) { + new = val & (~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL)); + } else { + msleep(20); + continue; } - } - skb_gro_pull(skb, sizeof(*eth)); + if (try_cmpxchg(&napi->state, &val, new)) + break; + } - /* - * This works because the only protocols we care about don't require - * special handling. We'll fix it up properly at the end. + /* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by + * the kthread. */ - skb->protocol = eth->h_proto; + while (true) { + if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) + break; -out: - return skb; + msleep(20); + } + + kthread_stop(napi->thread); + napi->thread = NULL; } -gro_result_t napi_gro_frags(struct napi_struct *napi) +static void napi_set_threaded_state(struct napi_struct *napi, + enum netdev_napi_threaded threaded_mode) { - struct sk_buff *skb = napi_frags_skb(napi); - - if (!skb) - return GRO_DROP; + bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED; + bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL; - return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll); } -EXPORT_SYMBOL(napi_gro_frags); -/* - * net_rps_action sends any pending IPI's for rps. - * Note: called with local irq disabled, but exits with local irq enabled. - */ -static void net_rps_action_and_irq_enable(struct softnet_data *sd) +int napi_set_threaded(struct napi_struct *napi, + enum netdev_napi_threaded threaded) { -#ifdef CONFIG_RPS - struct softnet_data *remsd = sd->rps_ipi_list; + if (threaded) { + if (!napi->thread) { + int err = napi_kthread_create(napi); - if (remsd) { - sd->rps_ipi_list = NULL; + if (err) + return err; + } + } - local_irq_enable(); + if (napi->config) + napi->config->threaded = threaded; - /* Send pending IPI's to kick RPS processing on remote cpus. */ - while (remsd) { - struct softnet_data *next = remsd->rps_ipi_next; + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. + */ + if (!threaded && napi->thread) { + napi_stop_kthread(napi); + } else { + /* Make sure kthread is created before THREADED bit is set. */ + smp_mb__before_atomic(); + napi_set_threaded_state(napi, threaded); + } - if (cpu_online(remsd->cpu)) - __smp_call_function_single(remsd->cpu, - &remsd->csd, 0); - remsd = next; - } - } else -#endif - local_irq_enable(); + return 0; } -static int process_backlog(struct napi_struct *napi, int quota) +int netif_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded) { - int work = 0; - struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); + struct napi_struct *napi; + int i, err = 0; -#ifdef CONFIG_RPS - /* Check if we have pending ipi, its better to send them now, - * not waiting net_rx_action() end. - */ - if (sd->rps_ipi_list) { - local_irq_disable(); - net_rps_action_and_irq_enable(sd); - } -#endif - napi->weight = weight_p; - local_irq_disable(); - while (work < quota) { - struct sk_buff *skb; - unsigned int qlen; + netdev_assert_locked_or_invisible(dev); - while ((skb = __skb_dequeue(&sd->process_queue))) { - local_irq_enable(); - __netif_receive_skb(skb); - local_irq_disable(); - input_queue_head_incr(sd); - if (++work >= quota) { - local_irq_enable(); - return work; + if (threaded) { + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (!napi->thread) { + err = napi_kthread_create(napi); + if (err) { + threaded = NETDEV_NAPI_THREADED_DISABLED; + break; + } } } + } - rps_lock(sd); - qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen) - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); + WRITE_ONCE(dev->threaded, threaded); - if (qlen < quota - work) { - /* - * Inline a custom version of __napi_complete(). - * only current cpu owns and manipulates this napi, - * and NAPI_STATE_SCHED is the only possible flag set on backlog. - * we can use a plain write instead of clear_bit(), - * and we dont need an smp_mb() memory barrier. - */ - list_del(&napi->poll_list); - napi->state = 0; + /* The error should not occur as the kthreads are already created. */ + list_for_each_entry(napi, &dev->napi_list, dev_list) + WARN_ON_ONCE(napi_set_threaded(napi, threaded)); - quota = work + qlen; - } - rps_unlock(sd); - } - local_irq_enable(); + /* Override the config for all NAPIs even if currently not listed */ + for (i = 0; i < dev->num_napi_configs; i++) + dev->napi_config[i].threaded = threaded; - return work; + return err; } /** - * __napi_schedule - schedule for receive - * @n: entry to schedule + * netif_threaded_enable() - enable threaded NAPIs + * @dev: net_device instance * - * The entry's receive function will be scheduled to run + * Enable threaded mode for the NAPI instances of the device. This may be useful + * for devices where multiple NAPI instances get scheduled by a single + * interrupt. Threaded NAPI allows moving the NAPI processing to cores other + * than the core where IRQ is mapped. + * + * This function should be called before @dev is registered. */ -void __napi_schedule(struct napi_struct *n) +void netif_threaded_enable(struct net_device *dev) { - unsigned long flags; + WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED)); +} +EXPORT_SYMBOL(netif_threaded_enable); - local_irq_save(flags); - ____napi_schedule(&__get_cpu_var(softnet_data), n); - local_irq_restore(flags); +/** + * netif_queue_set_napi - Associate queue with the napi + * @dev: device to which NAPI and queue belong + * @queue_index: Index of queue + * @type: queue type as RX or TX + * @napi: NAPI context, pass NULL to clear previously set NAPI + * + * Set queue with its corresponding napi context. This should be done after + * registering the NAPI handler for the queue-vector and the queues have been + * mapped to the corresponding interrupt vector. + */ +void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, + enum netdev_queue_type type, struct napi_struct *napi) +{ + struct netdev_rx_queue *rxq; + struct netdev_queue *txq; + + if (WARN_ON_ONCE(napi && !napi->dev)) + return; + netdev_ops_assert_locked_or_invisible(dev); + + switch (type) { + case NETDEV_QUEUE_TYPE_RX: + rxq = __netif_get_rx_queue(dev, queue_index); + rxq->napi = napi; + return; + case NETDEV_QUEUE_TYPE_TX: + txq = netdev_get_tx_queue(dev, queue_index); + txq->napi = napi; + return; + default: + return; + } } -EXPORT_SYMBOL(__napi_schedule); +EXPORT_SYMBOL(netif_queue_set_napi); -void __napi_complete(struct napi_struct *n) +static void +netif_napi_irq_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) { - BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - BUG_ON(n->gro_list); + struct napi_struct *napi = + container_of(notify, struct napi_struct, notify); +#ifdef CONFIG_RFS_ACCEL + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + int err; +#endif + + if (napi->config && napi->dev->irq_affinity_auto) + cpumask_copy(&napi->config->affinity_mask, mask); - list_del(&n->poll_list); - smp_mb__before_clear_bit(); - clear_bit(NAPI_STATE_SCHED, &n->state); +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); + if (err) + netdev_warn(napi->dev, "RMAP update failed (%d)\n", + err); + } +#endif } -EXPORT_SYMBOL(__napi_complete); -void napi_complete(struct napi_struct *n) +#ifdef CONFIG_RFS_ACCEL +static void netif_napi_affinity_release(struct kref *ref) { - unsigned long flags; + struct napi_struct *napi = + container_of(ref, struct napi_struct, notify.kref); + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; - /* - * don't let napi dequeue from the cpu poll list - * just in case its running on a different cpu - */ - if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) + netdev_assert_locked(napi->dev); + WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, + &napi->state)); + + if (!napi->dev->rx_cpu_rmap_auto) return; + rmap->obj[napi->napi_rmap_idx] = NULL; + napi->napi_rmap_idx = -1; + cpu_rmap_put(rmap); +} - napi_gro_flush(n, false); - local_irq_save(flags); - __napi_complete(n); - local_irq_restore(flags); +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + if (dev->rx_cpu_rmap_auto) + return 0; + + dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs); + if (!dev->rx_cpu_rmap) + return -ENOMEM; + + dev->rx_cpu_rmap_auto = true; + return 0; } -EXPORT_SYMBOL(napi_complete); +EXPORT_SYMBOL(netif_enable_cpu_rmap); -/* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) +static void netif_del_cpu_rmap(struct net_device *dev) { - unsigned int hash = napi_id % HASH_SIZE(napi_hash); - struct napi_struct *napi; + struct cpu_rmap *rmap = dev->rx_cpu_rmap; - hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) - if (napi->napi_id == napi_id) - return napi; + if (!dev->rx_cpu_rmap_auto) + return; - return NULL; + /* Free the rmap */ + cpu_rmap_put(rmap); + dev->rx_cpu_rmap = NULL; + dev->rx_cpu_rmap_auto = false; } -EXPORT_SYMBOL_GPL(napi_by_id); -void napi_hash_add(struct napi_struct *napi) +#else +static void netif_napi_affinity_release(struct kref *ref) { - if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { +} - spin_lock(&napi_hash_lock); +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); - /* 0 is not a valid id, we also skip an id that is taken - * we expect both events to be extremely rare - */ - napi->napi_id = 0; - while (!napi->napi_id) { - napi->napi_id = ++napi_gen_id; - if (napi_by_id(napi->napi_id)) - napi->napi_id = 0; - } +static void netif_del_cpu_rmap(struct net_device *dev) +{ +} +#endif + +void netif_set_affinity_auto(struct net_device *dev) +{ + unsigned int i, maxqs, numa; + + maxqs = max(dev->num_tx_queues, dev->num_rx_queues); + numa = dev_to_node(&dev->dev); - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + for (i = 0; i < maxqs; i++) + cpumask_set_cpu(cpumask_local_spread(i, numa), + &dev->napi_config[i].affinity_mask); + + dev->irq_affinity_auto = true; +} +EXPORT_SYMBOL(netif_set_affinity_auto); - spin_unlock(&napi_hash_lock); +void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) +{ + int rc; + + netdev_assert_locked_or_invisible(napi->dev); + + if (napi->irq == irq) + return; + + /* Remove existing resources */ + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + + napi->irq = irq; + if (irq < 0 || + (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto)) + return; + + /* Abort for buggy drivers */ + if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config)) + return; + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi); + if (rc < 0) + return; + + cpu_rmap_get(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = rc; } +#endif + + /* Use core IRQ notifier */ + napi->notify.notify = netif_napi_irq_notify; + napi->notify.release = netif_napi_affinity_release; + rc = irq_set_affinity_notifier(irq, &napi->notify); + if (rc) { + netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", + rc); + goto put_rmap; + } + + set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); + return; + +put_rmap: +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL; + cpu_rmap_put(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = -1; + } +#endif + napi->notify.notify = NULL; + napi->notify.release = NULL; } -EXPORT_SYMBOL_GPL(napi_hash_add); +EXPORT_SYMBOL(netif_napi_set_irq_locked); -/* Warning : caller is responsible to make sure rcu grace period - * is respected before freeing memory containing @napi - */ -void napi_hash_del(struct napi_struct *napi) +static void napi_restore_config(struct napi_struct *n) { - spin_lock(&napi_hash_lock); + n->defer_hard_irqs = n->config->defer_hard_irqs; + n->gro_flush_timeout = n->config->gro_flush_timeout; + n->irq_suspend_timeout = n->config->irq_suspend_timeout; - if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) - hlist_del_rcu(&napi->napi_hash_node); + if (n->dev->irq_affinity_auto && + test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state)) + irq_set_affinity(n->irq, &n->config->affinity_mask); + + /* a NAPI ID might be stored in the config, if so use it. if not, use + * napi_hash_add to generate one for us. + */ + if (n->config->napi_id) { + napi_hash_add_with_id(n, n->config->napi_id); + } else { + napi_hash_add(n); + n->config->napi_id = n->napi_id; + } - spin_unlock(&napi_hash_lock); + WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded)); } -EXPORT_SYMBOL_GPL(napi_hash_del); -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) +static void napi_save_config(struct napi_struct *n) { + n->config->defer_hard_irqs = n->defer_hard_irqs; + n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->irq_suspend_timeout = n->irq_suspend_timeout; + napi_hash_del(n); +} + +/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will + * inherit an existing ID try to insert it at the right position. + */ +static void +netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) +{ + unsigned int new_id, pos_id; + struct list_head *higher; + struct napi_struct *pos; + + new_id = UINT_MAX; + if (napi->config && napi->config->napi_id) + new_id = napi->config->napi_id; + + higher = &dev->napi_list; + list_for_each_entry(pos, &dev->napi_list, dev_list) { + if (napi_id_valid(pos->napi_id)) + pos_id = pos->napi_id; + else if (pos->config) + pos_id = pos->config->napi_id; + else + pos_id = UINT_MAX; + + if (pos_id <= new_id) + break; + higher = &pos->dev_list; + } + list_add_rcu(&napi->dev_list, higher); /* adds after higher */ +} + +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +static void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + +void netif_napi_add_weight_locked(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) +{ + netdev_assert_locked(dev); + if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) + return; + INIT_LIST_HEAD(&napi->poll_list); - napi->gro_count = 0; - napi->gro_list = NULL; + INIT_HLIST_NODE(&napi->napi_hash_node); + hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + gro_init(&napi->gro); napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) - pr_err_once("netif_napi_add() called with weight %d on device %s\n", - weight, dev->name); + netdev_err_once(dev, "%s() called with weight %d\n", __func__, + weight); napi->weight = weight; - list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; #ifdef CONFIG_NETPOLL - spin_lock_init(&napi->poll_lock); napi->poll_owner = -1; #endif + napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); + set_bit(NAPI_STATE_NPSVC, &napi->state); + netif_napi_dev_list_add(dev, napi); + + /* default settings from sysfs are applied to all NAPIs. any per-NAPI + * configuration will be loaded in napi_enable + */ + napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); + napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); + + napi_get_frags_check(napi); + /* Create kthread for this napi if dev->threaded is set. + * Clear dev->threaded if kthread creation failed so that + * threaded mode will not be enabled in napi_enable(). + */ + if (napi_get_threaded_config(dev, napi)) + if (napi_kthread_create(napi)) + dev->threaded = NETDEV_NAPI_THREADED_DISABLED; + netif_napi_set_irq_locked(napi, -1); } -EXPORT_SYMBOL(netif_napi_add); +EXPORT_SYMBOL(netif_napi_add_weight_locked); -void netif_napi_del(struct napi_struct *napi) +void napi_disable_locked(struct napi_struct *n) { - struct sk_buff *skb, *next; + unsigned long val, new; + + might_sleep(); + netdev_assert_locked(n->dev); + + set_bit(NAPI_STATE_DISABLE, &n->state); + + val = READ_ONCE(n->state); + do { + while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { + usleep_range(20, 200); + val = READ_ONCE(n->state); + } + + new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; + new &= ~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL | + NAPIF_STATE_PREFER_BUSY_POLL); + } while (!try_cmpxchg(&n->state, &val, new)); + + hrtimer_cancel(&n->timer); + + if (n->config) + napi_save_config(n); + else + napi_hash_del(n); + + clear_bit(NAPI_STATE_DISABLE, &n->state); +} +EXPORT_SYMBOL(napi_disable_locked); + +/** + * napi_disable() - prevent NAPI from scheduling + * @n: NAPI context + * + * Stop NAPI from being scheduled on this context. + * Waits till any outstanding processing completes. + * Takes netdev_lock() for associated net_device. + */ +void napi_disable(struct napi_struct *n) +{ + netdev_lock(n->dev); + napi_disable_locked(n); + netdev_unlock(n->dev); +} +EXPORT_SYMBOL(napi_disable); + +void napi_enable_locked(struct napi_struct *n) +{ + unsigned long new, val = READ_ONCE(n->state); - list_del_init(&napi->dev_list); + if (n->config) + napi_restore_config(n); + else + napi_hash_add(n); + + do { + BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); + + new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); + if (n->dev->threaded && n->thread) + new |= NAPIF_STATE_THREADED; + } while (!try_cmpxchg(&n->state, &val, new)); +} +EXPORT_SYMBOL(napi_enable_locked); + +/** + * napi_enable() - enable NAPI scheduling + * @n: NAPI context + * + * Enable scheduling of a NAPI instance. + * Must be paired with napi_disable(). + * Takes netdev_lock() for associated net_device. + */ +void napi_enable(struct napi_struct *n) +{ + netdev_lock(n->dev); + napi_enable_locked(n); + netdev_unlock(n->dev); +} +EXPORT_SYMBOL(napi_enable); + +/* Must be called in process context */ +void __netif_napi_del_locked(struct napi_struct *napi) +{ + netdev_assert_locked(napi->dev); + + if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) + return; + + /* Make sure NAPI is disabled (or was never enabled). */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + + if (napi->config) { + napi->index = -1; + napi->config = NULL; + } + + list_del_rcu(&napi->dev_list); napi_free_frags(napi); - for (skb = napi->gro_list; skb; skb = next) { - next = skb->next; - skb->next = NULL; - kfree_skb(skb); + gro_cleanup(&napi->gro); + + if (napi->thread) { + kthread_stop(napi->thread); + napi->thread = NULL; } +} +EXPORT_SYMBOL(__netif_napi_del_locked); - napi->gro_list = NULL; - napi->gro_count = 0; +static int __napi_poll(struct napi_struct *n, bool *repoll) +{ + int work, weight; + + weight = n->weight; + + /* This NAPI_STATE_SCHED test is for avoiding a race + * with netpoll's poll_napi(). Only the entity which + * obtains the lock and sees NAPI_STATE_SCHED set will + * actually make the ->poll() call. Therefore we avoid + * accidentally calling ->poll() when NAPI is not scheduled. + */ + work = 0; + if (napi_is_scheduled(n)) { + work = n->poll(n, weight); + trace_napi_poll(n, work, weight); + + xdp_do_check_flushed(n); + } + + if (unlikely(work > weight)) + netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", + n->poll, work, weight); + + if (likely(work < weight)) + return work; + + /* Drivers must not modify the NAPI state if they + * consume the entire weight. In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(napi_disable_pending(n))) { + napi_complete(n); + return work; + } + + /* The NAPI context has more processing work, but busy-polling + * is preferred. Exit early. + */ + if (napi_prefer_busy_poll(n)) { + if (napi_complete_done(n, work)) { + /* If timeout is not set, we need to make sure + * that the NAPI is re-scheduled. + */ + napi_schedule(n); + } + return work; + } + + /* Flush too old packets. If HZ < 1000, flush all packets */ + gro_flush_normal(&n->gro, HZ >= 1000); + + /* Some drivers may have called napi_schedule + * prior to exhausting their budget. + */ + if (unlikely(!list_empty(&n->poll_list))) { + pr_warn_once("%s: Budget exhausted after napi rescheduled\n", + n->dev ? n->dev->name : "backlog"); + return work; + } + + *repoll = true; + + return work; } -EXPORT_SYMBOL(netif_napi_del); -static void net_rx_action(struct softirq_action *h) +static int napi_poll(struct napi_struct *n, struct list_head *repoll) { - struct softnet_data *sd = &__get_cpu_var(softnet_data); - unsigned long time_limit = jiffies + 2; - int budget = netdev_budget; + bool do_repoll = false; void *have; + int work; - local_irq_disable(); - - while (!list_empty(&sd->poll_list)) { - struct napi_struct *n; - int work, weight; + list_del_init(&n->poll_list); - /* If softirq window is exhuasted then punt. - * Allow this to run for 2 jiffies since which will allow - * an average latency of 1.5/HZ. - */ - if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) - goto softnet_break; + have = netpoll_poll_lock(n); - local_irq_enable(); + work = __napi_poll(n, &do_repoll); - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); + if (do_repoll) { +#if defined(CONFIG_DEBUG_NET) + if (unlikely(!napi_is_scheduled(n))) + pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n", + n->dev->name, n->poll); +#endif + list_add_tail(&n->poll_list, repoll); + } + netpoll_poll_unlock(have); - have = netpoll_poll_lock(n); + return work; +} - weight = n->weight; +static int napi_thread_wait(struct napi_struct *napi) +{ + set_current_state(TASK_INTERRUPTIBLE); - /* This NAPI_STATE_SCHED test is for avoiding a race - * with netpoll's poll_napi(). Only the entity which - * obtains the lock and sees NAPI_STATE_SCHED set will - * actually make the ->poll() call. Therefore we avoid - * accidentally calling ->poll() when NAPI is not scheduled. + while (!kthread_should_stop()) { + /* Testing SCHED_THREADED bit here to make sure the current + * kthread owns this napi and could poll on this napi. + * Testing SCHED bit is not enough because SCHED bit might be + * set by some other busy poll thread or by napi_disable(). */ - work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) { - work = n->poll(n, weight); - trace_napi_poll(n); + if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { + WARN_ON(!list_empty(&napi->poll_list)); + __set_current_state(TASK_RUNNING); + return 0; } - WARN_ON_ONCE(work > weight); + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); - budget -= work; + return -1; +} - local_irq_disable(); +static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) +{ + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + struct softnet_data *sd; + unsigned long last_qs = jiffies; - /* Drivers must not modify the NAPI state if they - * consume the entire weight. In such cases this code - * still "owns" the NAPI instance and therefore can - * move the instance around on the list at-will. + for (;;) { + bool repoll = false; + void *have; + + local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + + sd = this_cpu_ptr(&softnet_data); + sd->in_napi_threaded_poll = true; + + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); + + sd->in_napi_threaded_poll = false; + barrier(); + + if (sd_has_rps_ipi_waiting(sd)) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } + skb_defer_free_flush(); + bpf_net_ctx_clear(bpf_net_ctx); + + /* When busy poll is enabled, the old packets are not flushed in + * napi_complete_done. So flush them here. */ - if (unlikely(work == weight)) { - if (unlikely(napi_disable_pending(n))) { - local_irq_enable(); - napi_complete(n); - local_irq_disable(); - } else { - if (n->gro_list) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - local_irq_enable(); - napi_gro_flush(n, HZ >= 1000); - local_irq_disable(); - } - list_move_tail(&n->poll_list, &sd->poll_list); - } + if (busy_poll) + gro_flush_normal(&napi->gro, HZ >= 1000); + local_bh_enable(); + + /* Call cond_resched here to avoid watchdog warnings. */ + if (repoll || busy_poll) { + rcu_softirq_qs_periodic(last_qs); + cond_resched(); } - netpoll_poll_unlock(have); + if (!repoll) + break; } -out: - net_rps_action_and_irq_enable(sd); +} -#ifdef CONFIG_NET_DMA - /* - * There may not be any more sk_buffs coming right now, so push - * any pending DMA copies to hardware - */ - dma_issue_pending_all(); -#endif +static int napi_threaded_poll(void *data) +{ + struct napi_struct *napi = data; + bool want_busy_poll; + bool in_busy_poll; + unsigned long val; - return; + while (!napi_thread_wait(napi)) { + val = READ_ONCE(napi->state); -softnet_break: - sd->time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - goto out; -} + want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL; + in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL; -struct netdev_upper { - struct net_device *dev; - bool master; - struct list_head list; - struct rcu_head rcu; - struct list_head search_list; -}; + if (unlikely(val & NAPIF_STATE_DISABLE)) + want_busy_poll = false; -static void __append_search_uppers(struct list_head *search_list, - struct net_device *dev) -{ - struct netdev_upper *upper; + if (want_busy_poll != in_busy_poll) + assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state, + want_busy_poll); - list_for_each_entry(upper, &dev->upper_dev_list, list) { - /* check if this upper is not already in search list */ - if (list_empty(&upper->search_list)) - list_add_tail(&upper->search_list, search_list); + napi_threaded_poll_loop(napi, want_busy_poll); } + + return 0; } -static bool __netdev_search_upper_dev(struct net_device *dev, - struct net_device *upper_dev) +static __latent_entropy void net_rx_action(void) { - LIST_HEAD(search_list); - struct netdev_upper *upper; - struct netdev_upper *tmp; - bool ret = false; + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + unsigned long time_limit = jiffies + + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + int budget = READ_ONCE(net_hotdata.netdev_budget); + LIST_HEAD(list); + LIST_HEAD(repoll); + + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); +start: + sd->in_net_rx_action = true; + local_irq_disable(); + list_splice_init(&sd->poll_list, &list); + local_irq_enable(); - __append_search_uppers(&search_list, dev); - list_for_each_entry(upper, &search_list, search_list) { - if (upper->dev == upper_dev) { - ret = true; + for (;;) { + struct napi_struct *n; + + skb_defer_free_flush(); + + if (list_empty(&list)) { + if (list_empty(&repoll)) { + sd->in_net_rx_action = false; + barrier(); + /* We need to check if ____napi_schedule() + * had refilled poll_list while + * sd->in_net_rx_action was true. + */ + if (!list_empty(&sd->poll_list)) + goto start; + if (!sd_has_rps_ipi_waiting(sd)) + goto end; + } + break; + } + + n = list_first_entry(&list, struct napi_struct, poll_list); + budget -= napi_poll(n, &repoll); + + /* If softirq window is exhausted then punt. + * Allow this to run for 2 jiffies since which will allow + * an average latency of 1.5/HZ. + */ + if (unlikely(budget <= 0 || + time_after_eq(jiffies, time_limit))) { + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1); break; } - __append_search_uppers(&search_list, upper->dev); } - list_for_each_entry_safe(upper, tmp, &search_list, search_list) - INIT_LIST_HEAD(&upper->search_list); - return ret; + + local_irq_disable(); + + list_splice_tail_init(&sd->poll_list, &list); + list_splice_tail(&repoll, &list); + list_splice(&list, &sd->poll_list); + if (!list_empty(&sd->poll_list)) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + else + sd->in_net_rx_action = false; + + net_rps_action_and_irq_enable(sd); +end: + bpf_net_ctx_clear(bpf_net_ctx); } -static struct netdev_upper *__netdev_find_upper(struct net_device *dev, - struct net_device *upper_dev) +struct netdev_adjacent { + struct net_device *dev; + netdevice_tracker dev_tracker; + + /* upper master flag, there can only be one master device per list */ + bool master; + + /* lookup ignore flag */ + bool ignore; + + /* counter for the number of times this device was added to us */ + u16 ref_nr; + + /* private field for the users */ + void *private; + + struct list_head list; + struct rcu_head rcu; +}; + +static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, + struct list_head *adj_list) { - struct netdev_upper *upper; + struct netdev_adjacent *adj; - list_for_each_entry(upper, &dev->upper_dev_list, list) { - if (upper->dev == upper_dev) - return upper; + list_for_each_entry(adj, adj_list, list) { + if (adj->dev == adj_dev) + return adj; } return NULL; } +static int ____netdev_has_upper_dev(struct net_device *upper_dev, + struct netdev_nested_priv *priv) +{ + struct net_device *dev = (struct net_device *)priv->data; + + return upper_dev == dev; +} + /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device @@ -4425,13 +7959,40 @@ static struct netdev_upper *__netdev_find_upper(struct net_device *dev, bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_nested_priv priv = { + .data = (void *)upper_dev, + }; + ASSERT_RTNL(); - return __netdev_find_upper(dev, upper_dev); + return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, + &priv); } EXPORT_SYMBOL(netdev_has_upper_dev); /** + * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device + * @dev: device + * @upper_dev: upper device to check + * + * Find out if a device is linked to specified upper device and return true + * in case it is. Note that this checks the entire upper device chain. + * The caller must hold rcu lock. + */ + +bool netdev_has_upper_dev_all_rcu(struct net_device *dev, + struct net_device *upper_dev) +{ + struct netdev_nested_priv priv = { + .data = (void *)upper_dev, + }; + + return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, + &priv); +} +EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); + +/** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device * @@ -4442,7 +8003,7 @@ bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); - return !list_empty(&dev->upper_dev_list); + return !list_empty(&dev->adj_list.upper); } EXPORT_SYMBOL(netdev_has_any_upper_dev); @@ -4455,21 +8016,597 @@ EXPORT_SYMBOL(netdev_has_any_upper_dev); */ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); - if (list_empty(&dev->upper_dev_list)) + if (list_empty(&dev->adj_list.upper)) return NULL; - upper = list_first_entry(&dev->upper_dev_list, - struct netdev_upper, list); + upper = list_first_entry(&dev->adj_list.upper, + struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get); +static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) +{ + struct netdev_adjacent *upper; + + ASSERT_RTNL(); + + if (list_empty(&dev->adj_list.upper)) + return NULL; + + upper = list_first_entry(&dev->adj_list.upper, + struct netdev_adjacent, list); + if (likely(upper->master) && !upper->ignore) + return upper->dev; + return NULL; +} + +/** + * netdev_has_any_lower_dev - Check if device is linked to some device + * @dev: device + * + * Find out if a device is linked to a lower device and return true in case + * it is. The caller must hold the RTNL lock. + */ +static bool netdev_has_any_lower_dev(struct net_device *dev) +{ + ASSERT_RTNL(); + + return !list_empty(&dev->adj_list.lower); +} + +void *netdev_adjacent_get_private(struct list_head *adj_list) +{ + struct netdev_adjacent *adj; + + adj = list_entry(adj_list, struct netdev_adjacent, list); + + return adj->private; +} +EXPORT_SYMBOL(netdev_adjacent_get_private); + +/** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + +static struct net_device *__netdev_next_upper_dev(struct net_device *dev, + struct list_head **iter, + bool *ignore) +{ + struct netdev_adjacent *upper; + + upper = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + *ignore = upper->ignore; + + return upper->dev; +} + +static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} + +static int __netdev_walk_all_upper_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + bool ignore; + + now = dev; + iter = &dev->adj_list.upper; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + udev = __netdev_next_upper_dev(now, &iter, &ignore); + if (!udev) + break; + if (ignore) + continue; + + next = udev; + niter = &udev->adj_list.upper; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} + +int netdev_walk_all_upper_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + + now = dev; + iter = &dev->adj_list.upper; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + udev = netdev_next_upper_dev_rcu(now, &iter); + if (!udev) + break; + + next = udev; + niter = &udev->adj_list.upper; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); + +static bool __netdev_has_upper_dev(struct net_device *dev, + struct net_device *upper_dev) +{ + struct netdev_nested_priv priv = { + .flags = 0, + .data = (void *)upper_dev, + }; + + ASSERT_RTNL(); + + return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, + &priv); +} + +/** + * netdev_lower_get_next_private - Get the next ->private from the + * lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold either hold the + * RTNL lock or its own locking that guarantees that the neighbour lower + * list will remain unchanged. + */ +void *netdev_lower_get_next_private(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = lower->list.next; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private); + +/** + * netdev_lower_get_next_private_rcu - Get the next ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +void *netdev_lower_get_next_private_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); + +/** + * netdev_lower_get_next - Get the next device from the lower neighbour + * list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchanged. + */ +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = lower->list.next; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_lower_get_next); + +static struct net_device *netdev_next_lower_dev(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} + +static struct net_device *__netdev_next_lower_dev(struct net_device *dev, + struct list_head **iter, + bool *ignore) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + *ignore = lower->ignore; + + return lower->dev; +} + +int netdev_walk_all_lower_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = netdev_next_lower_dev(now, &iter); + if (!ldev) + break; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); + +static int __netdev_walk_all_lower_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + bool ignore; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = __netdev_next_lower_dev(now, &iter, &ignore); + if (!ldev) + break; + if (ignore) + continue; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} + +struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_next_lower_dev_rcu); + +static u8 __netdev_upper_depth(struct net_device *dev) +{ + struct net_device *udev; + struct list_head *iter; + u8 max_depth = 0; + bool ignore; + + for (iter = &dev->adj_list.upper, + udev = __netdev_next_upper_dev(dev, &iter, &ignore); + udev; + udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { + if (ignore) + continue; + if (max_depth < udev->upper_level) + max_depth = udev->upper_level; + } + + return max_depth; +} + +static u8 __netdev_lower_depth(struct net_device *dev) +{ + struct net_device *ldev; + struct list_head *iter; + u8 max_depth = 0; + bool ignore; + + for (iter = &dev->adj_list.lower, + ldev = __netdev_next_lower_dev(dev, &iter, &ignore); + ldev; + ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { + if (ignore) + continue; + if (max_depth < ldev->lower_level) + max_depth = ldev->lower_level; + } + + return max_depth; +} + +static int __netdev_update_upper_level(struct net_device *dev, + struct netdev_nested_priv *__unused) +{ + dev->upper_level = __netdev_upper_depth(dev) + 1; + return 0; +} + +#ifdef CONFIG_LOCKDEP +static LIST_HEAD(net_unlink_list); + +static void net_unlink_todo(struct net_device *dev) +{ + if (list_empty(&dev->unlink_list)) + list_add_tail(&dev->unlink_list, &net_unlink_list); +} +#endif + +static int __netdev_update_lower_level(struct net_device *dev, + struct netdev_nested_priv *priv) +{ + dev->lower_level = __netdev_lower_depth(dev) + 1; + +#ifdef CONFIG_LOCKDEP + if (!priv) + return 0; + + if (priv->flags & NESTED_SYNC_IMM) + dev->nested_level = dev->lower_level - 1; + if (priv->flags & NESTED_SYNC_TODO) + net_unlink_todo(dev); +#endif + return 0; +} + +int netdev_walk_all_lower_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = netdev_next_lower_dev_rcu(now, &iter); + if (!ldev) + break; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); + +/** + * netdev_lower_get_first_private_rcu - Get the first ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * + * Gets the first netdev_adjacent->private from the dev's lower neighbour + * list. The caller must hold RCU read lock. + */ +void *netdev_lower_get_first_private_rcu(struct net_device *dev) +{ + struct netdev_adjacent *lower; + + lower = list_first_or_null_rcu(&dev->adj_list.lower, + struct netdev_adjacent, list); + if (lower) + return lower->private; + return NULL; +} +EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); + /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device @@ -4479,20 +8616,218 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get); */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; - upper = list_first_or_null_rcu(&dev->upper_dev_list, - struct netdev_upper, list); + upper = list_first_or_null_rcu(&dev->adj_list.upper, + struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); -static int __netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev, bool master) +static int netdev_adjacent_sysfs_add(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) { - struct netdev_upper *upper; + char linkname[IFNAMSIZ+7]; + + sprintf(linkname, dev_list == &dev->adj_list.upper ? + "upper_%s" : "lower_%s", adj_dev->name); + return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), + linkname); +} +static void netdev_adjacent_sysfs_del(struct net_device *dev, + char *name, + struct list_head *dev_list) +{ + char linkname[IFNAMSIZ+7]; + + sprintf(linkname, dev_list == &dev->adj_list.upper ? + "upper_%s" : "lower_%s", name); + sysfs_remove_link(&(dev->dev.kobj), linkname); +} + +static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) +{ + return (dev_list == &dev->adj_list.upper || + dev_list == &dev->adj_list.lower) && + net_eq(dev_net(dev), dev_net(adj_dev)); +} + +static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list, + void *private, bool master) +{ + struct netdev_adjacent *adj; + int ret; + + adj = __netdev_find_adj(adj_dev, dev_list); + + if (adj) { + adj->ref_nr += 1; + pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", + dev->name, adj_dev->name, adj->ref_nr); + + return 0; + } + + adj = kmalloc(sizeof(*adj), GFP_KERNEL); + if (!adj) + return -ENOMEM; + + adj->dev = adj_dev; + adj->master = master; + adj->ref_nr = 1; + adj->private = private; + adj->ignore = false; + netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); + + pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", + dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); + + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { + ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); + if (ret) + goto free_adj; + } + + /* Ensure that master link is always the first item in list. */ + if (master) { + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), "master"); + if (ret) + goto remove_symlinks; + + list_add_rcu(&adj->list, dev_list); + } else { + list_add_tail_rcu(&adj->list, dev_list); + } + + return 0; + +remove_symlinks: + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); +free_adj: + netdev_put(adj_dev, &adj->dev_tracker); + kfree(adj); + + return ret; +} + +static void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, + u16 ref_nr, + struct list_head *dev_list) +{ + struct netdev_adjacent *adj; + + pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", + dev->name, adj_dev->name, ref_nr); + + adj = __netdev_find_adj(adj_dev, dev_list); + + if (!adj) { + pr_err("Adjacency does not exist for device %s from %s\n", + dev->name, adj_dev->name); + WARN_ON(1); + return; + } + + if (adj->ref_nr > ref_nr) { + pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", + dev->name, adj_dev->name, ref_nr, + adj->ref_nr - ref_nr); + adj->ref_nr -= ref_nr; + return; + } + + if (adj->master) + sysfs_remove_link(&(dev->dev.kobj), "master"); + + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); + + list_del_rcu(&adj->list); + pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); + netdev_put(adj_dev, &adj->dev_tracker); + kfree_rcu(adj, rcu); +} + +static int __netdev_adjacent_dev_link_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list, + void *private, bool master) +{ + int ret; + + ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, + private, master); + if (ret) + return ret; + + ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, + private, false); + if (ret) { + __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); + return ret; + } + + return 0; +} + +static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, + struct net_device *upper_dev, + u16 ref_nr, + struct list_head *up_list, + struct list_head *down_list) +{ + __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); + __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); +} + +static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *upper_dev, + void *private, bool master) +{ + return __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + private, master); +} + +static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, + &dev->adj_list.upper, + &upper_dev->adj_list.lower); +} + +static int __netdev_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev, bool master, + void *upper_priv, void *upper_info, + struct netdev_nested_priv *priv, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + .extack = extack, + }, + .upper_dev = upper_dev, + .master = master, + .linking = true, + .upper_info = upper_info, + }; + struct net_device *master_dev; + int ret = 0; ASSERT_RTNL(); @@ -4500,37 +8835,58 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (__netdev_search_upper_dev(upper_dev, dev)) + if (__netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; - if (__netdev_find_upper(dev, upper_dev)) - return -EEXIST; + if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) + return -EMLINK; - if (master && netdev_master_upper_dev_get(dev)) - return -EBUSY; + if (!master) { + if (__netdev_has_upper_dev(dev, upper_dev)) + return -EEXIST; + } else { + master_dev = __netdev_master_upper_dev_get(dev); + if (master_dev) + return master_dev == upper_dev ? -EEXIST : -EBUSY; + } - upper = kmalloc(sizeof(*upper), GFP_KERNEL); - if (!upper) - return -ENOMEM; + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + return ret; - upper->dev = upper_dev; - upper->master = master; - INIT_LIST_HEAD(&upper->search_list); + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, + master); + if (ret) + return ret; + + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + goto rollback; + + __netdev_update_upper_level(dev, NULL); + __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); + + __netdev_update_lower_level(upper_dev, priv); + __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, + priv); - /* Ensure that master upper link is always the first item in list. */ - if (master) - list_add_rcu(&upper->list, &dev->upper_dev_list); - else - list_add_tail_rcu(&upper->list, &dev->upper_dev_list); - dev_hold(upper_dev); - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); return 0; + +rollback: + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); + + return ret; } /** * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device + * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. @@ -4538,9 +8894,16 @@ static int __netdev_upper_dev_link(struct net_device *dev, * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + struct netlink_ext_ack *extack) { - return __netdev_upper_dev_link(dev, upper_dev, false); + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + + return __netdev_upper_dev_link(dev, upper_dev, false, + NULL, NULL, &priv, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -4548,6 +8911,9 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device + * @upper_priv: upper device private + * @upper_info: upper info to be passed down via notifier + * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices @@ -4556,12 +8922,52 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * counts are adjusted and the function returns zero. */ int netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) { - return __netdev_upper_dev_link(dev, upper_dev, true); + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + + return __netdev_upper_dev_link(dev, upper_dev, true, + upper_priv, upper_info, &priv, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); +static void __netdev_upper_dev_unlink(struct net_device *dev, + struct net_device *upper_dev, + struct netdev_nested_priv *priv) +{ + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .linking = false, + }; + + ASSERT_RTNL(); + + changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; + + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, + &changeupper_info.info); + + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); + + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, + &changeupper_info.info); + + __netdev_update_upper_level(dev, NULL); + __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); + + __netdev_update_lower_level(upper_dev, priv); + __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, + priv); +} + /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device @@ -4573,141 +8979,647 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_upper *upper; + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_TODO, + .data = NULL, + }; + + __netdev_upper_dev_unlink(dev, upper_dev, &priv); +} +EXPORT_SYMBOL(netdev_upper_dev_unlink); + +static void __netdev_adjacent_dev_set(struct net_device *upper_dev, + struct net_device *lower_dev, + bool val) +{ + struct netdev_adjacent *adj; + + adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); + if (adj) + adj->ignore = val; + + adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); + if (adj) + adj->ignore = val; +} + +static void netdev_adjacent_dev_disable(struct net_device *upper_dev, + struct net_device *lower_dev) +{ + __netdev_adjacent_dev_set(upper_dev, lower_dev, true); +} + +static void netdev_adjacent_dev_enable(struct net_device *upper_dev, + struct net_device *lower_dev) +{ + __netdev_adjacent_dev_set(upper_dev, lower_dev, false); +} + +int netdev_adjacent_change_prepare(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_nested_priv priv = { + .flags = 0, + .data = NULL, + }; + int err; + + if (!new_dev) + return 0; + + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_disable(dev, old_dev); + err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, + extack); + if (err) { + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_enable(dev, old_dev); + return err; + } + + return 0; +} +EXPORT_SYMBOL(netdev_adjacent_change_prepare); + +void netdev_adjacent_change_commit(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev) +{ + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + + if (!new_dev || !old_dev) + return; + + if (new_dev == old_dev) + return; + + netdev_adjacent_dev_enable(dev, old_dev); + __netdev_upper_dev_unlink(old_dev, dev, &priv); +} +EXPORT_SYMBOL(netdev_adjacent_change_commit); + +void netdev_adjacent_change_abort(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev) +{ + struct netdev_nested_priv priv = { + .flags = 0, + .data = NULL, + }; + + if (!new_dev) + return; + + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_enable(dev, old_dev); + + __netdev_upper_dev_unlink(new_dev, dev, &priv); +} +EXPORT_SYMBOL(netdev_adjacent_change_abort); + +/** + * netdev_bonding_info_change - Dispatch event about slave change + * @dev: device + * @bonding_info: info to dispatch + * + * Send NETDEV_BONDING_INFO to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_bonding_info_change(struct net_device *dev, + struct netdev_bonding_info *bonding_info) +{ + struct netdev_notifier_bonding_info info = { + .info.dev = dev, + }; + + memcpy(&info.bonding_info, bonding_info, + sizeof(struct netdev_bonding_info)); + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, + &info.info); +} +EXPORT_SYMBOL(netdev_bonding_info_change); + +static int netdev_offload_xstats_enable_l3(struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + int err; + int rc; + + dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), + GFP_KERNEL); + if (!dev->offload_xstats_l3) + return -ENOMEM; + + rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, + NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + err = notifier_to_errno(rc); + if (err) + goto free_stats; + + return 0; + +free_stats: + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; + return err; +} + +int netdev_offload_xstats_enable(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return netdev_offload_xstats_enable_l3(dev, extack); + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_enable); + +static void netdev_offload_xstats_disable_l3(struct net_device *dev) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + + call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; +} + +int netdev_offload_xstats_disable(struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + if (!netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + netdev_offload_xstats_disable_l3(dev); + return 0; + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_disable); + +static void netdev_offload_xstats_disable_all(struct net_device *dev) +{ + netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); +} + +static struct rtnl_hw_stats64 * +netdev_offload_xstats_get_ptr(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return dev->offload_xstats_l3; + } + + WARN_ON(1); + return NULL; +} + +bool netdev_offload_xstats_enabled(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + return netdev_offload_xstats_get_ptr(dev, type); +} +EXPORT_SYMBOL(netdev_offload_xstats_enabled); + +struct netdev_notifier_offload_xstats_ru { + bool used; +}; + +struct netdev_notifier_offload_xstats_rd { + struct rtnl_hw_stats64 stats; + bool used; +}; + +static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, + const struct rtnl_hw_stats64 *src) +{ + dest->rx_packets += src->rx_packets; + dest->tx_packets += src->tx_packets; + dest->rx_bytes += src->rx_bytes; + dest->tx_bytes += src->tx_bytes; + dest->rx_errors += src->rx_errors; + dest->tx_errors += src->tx_errors; + dest->rx_dropped += src->rx_dropped; + dest->tx_dropped += src->tx_dropped; + dest->multicast += src->multicast; +} + +static int netdev_offload_xstats_get_used(struct net_device *dev, + enum netdev_offload_xstats_type type, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_ru report_used = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_used = &report_used, + }; + int rc; + + WARN_ON(!netdev_offload_xstats_enabled(dev, type)); + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, + &info.info); + *p_used = report_used.used; + return notifier_to_errno(rc); +} + +static int netdev_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_rd report_delta = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_delta = &report_delta, + }; + struct rtnl_hw_stats64 *stats; + int rc; + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return -EINVAL; + + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, + &info.info); + + /* Cache whatever we got, even if there was an error, otherwise the + * successful stats retrievals would get lost. + */ + netdev_hw_stats64_add(stats, &report_delta.stats); + + if (p_stats) + *p_stats = *stats; + *p_used = report_delta.used; + + return notifier_to_errno(rc); +} + +int netdev_offload_xstats_get(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, bool *p_used, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (p_stats) + return netdev_offload_xstats_get_stats(dev, type, p_stats, + p_used, extack); + else + return netdev_offload_xstats_get_used(dev, type, p_used, + extack); +} +EXPORT_SYMBOL(netdev_offload_xstats_get); + +void +netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, + const struct rtnl_hw_stats64 *stats) +{ + report_delta->used = true; + netdev_hw_stats64_add(&report_delta->stats, stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_report_delta); + +void +netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) +{ + report_used->used = true; +} +EXPORT_SYMBOL(netdev_offload_xstats_report_used); + +void netdev_offload_xstats_push_delta(struct net_device *dev, + enum netdev_offload_xstats_type type, + const struct rtnl_hw_stats64 *p_stats) +{ + struct rtnl_hw_stats64 *stats; ASSERT_RTNL(); - upper = __netdev_find_upper(dev, upper_dev); - if (!upper) + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) return; - list_del_rcu(&upper->list); - dev_put(upper_dev); - kfree_rcu(upper, rcu); - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + + netdev_hw_stats64_add(stats, p_stats); } -EXPORT_SYMBOL(netdev_upper_dev_unlink); +EXPORT_SYMBOL(netdev_offload_xstats_push_delta); + +/** + * netdev_get_xmit_slave - Get the xmit slave of master device + * @dev: device + * @skb: The packet + * @all_slaves: assume all the slaves are active + * + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + * %NULL is returned if no slave is found. + */ + +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_xmit_slave) + return NULL; + return ops->ndo_get_xmit_slave(dev, skb, all_slaves); +} +EXPORT_SYMBOL(netdev_get_xmit_slave); + +static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, + struct sock *sk) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_sk_get_lower_dev) + return NULL; + return ops->ndo_sk_get_lower_dev(dev, sk); +} + +/** + * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket + * @dev: device + * @sk: the socket + * + * %NULL is returned if no lower device is found. + */ + +struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, + struct sock *sk) +{ + struct net_device *lower; + + lower = netdev_sk_get_lower_dev(dev, sk); + while (lower) { + dev = lower; + lower = netdev_sk_get_lower_dev(dev, sk); + } + + return dev; +} +EXPORT_SYMBOL(netdev_sk_get_lowest_dev); + +static void netdev_adjacent_add_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.lower); + } +} + +static void netdev_adjacent_del_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.lower); + } +} + +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + } +} + +void *netdev_lower_dev_get_private(struct net_device *dev, + struct net_device *lower_dev) +{ + struct netdev_adjacent *lower; + + if (!lower_dev) + return NULL; + lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); + if (!lower) + return NULL; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private); + + +/** + * netdev_lower_state_changed - Dispatch event about lower device state change + * @lower_dev: device + * @lower_state_info: state to dispatch + * + * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info) +{ + struct netdev_notifier_changelowerstate_info changelowerstate_info = { + .info.dev = lower_dev, + }; + + ASSERT_RTNL(); + changelowerstate_info.lower_state_info = lower_state_info; + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, + &changelowerstate_info.info); +} +EXPORT_SYMBOL(netdev_lower_state_changed); static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; - if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) + if (ops->ndo_change_rx_flags) ops->ndo_change_rx_flags(dev, flags); } -static int __dev_set_promiscuity(struct net_device *dev, int inc) +static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; + unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); - dev->flags |= IFF_PROMISC; - dev->promiscuity += inc; - if (dev->promiscuity == 0) { + promiscuity = dev->promiscuity + inc; + if (promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. */ - if (inc < 0) - dev->flags &= ~IFF_PROMISC; - else { - dev->promiscuity -= inc; - pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", - dev->name); + if (unlikely(inc > 0)) { + netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_PROMISC; + } else { + flags = old_flags | IFF_PROMISC; } - if (dev->flags != old_flags) { - pr_info("device %s %s promiscuous mode\n", - dev->name, - dev->flags & IFF_PROMISC ? "entered" : "left"); + WRITE_ONCE(dev->promiscuity, promiscuity); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); + netdev_info(dev, "%s promiscuous mode\n", + dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { current_uid_gid(&uid, &gid); - audit_log(current->audit_context, GFP_ATOMIC, - AUDIT_ANOM_PROMISCUOUS, - "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", - dev->name, (dev->flags & IFF_PROMISC), - (old_flags & IFF_PROMISC), - from_kuid(&init_user_ns, audit_get_loginuid(current)), - from_kuid(&init_user_ns, uid), - from_kgid(&init_user_ns, gid), - audit_get_sessionid(current)); + audit_log(audit_context(), GFP_ATOMIC, + AUDIT_ANOM_PROMISCUOUS, + "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", + dev->name, (dev->flags & IFF_PROMISC), + (old_flags & IFF_PROMISC), + from_kuid(&init_user_ns, audit_get_loginuid(current)), + from_kuid(&init_user_ns, uid), + from_kgid(&init_user_ns, gid), + audit_get_sessionid(current)); } dev_change_rx_flags(dev, IFF_PROMISC); } + if (notify) { + /* The ops lock is only required to ensure consistent locking + * for `NETDEV_CHANGE` notifiers. This function is sometimes + * called without the lock, even for devices that are ops + * locked, such as in `dev_uc_sync_multiple` when using + * bonding or teaming. + */ + netdev_ops_assert_locked(dev); + __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); + } return 0; } -/** - * dev_set_promiscuity - update promiscuity count on a device - * @dev: device - * @inc: modifier - * - * Add or remove promiscuity from a device. While the count in the device - * remains above zero the interface remains promiscuous. Once it hits zero - * the device reverts back to normal filtering operation. A negative inc - * value is used to drop promiscuity on the device. - * Return 0 if successful or a negative errno code on error. - */ -int dev_set_promiscuity(struct net_device *dev, int inc) +int netif_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; - err = __dev_set_promiscuity(dev, inc); + err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); return err; } -EXPORT_SYMBOL(dev_set_promiscuity); - -/** - * dev_set_allmulti - update allmulti count on a device - * @dev: device - * @inc: modifier - * - * Add or remove reception of all multicast frames to a device. While the - * count in the device remains above zero the interface remains listening - * to all interfaces. Once it hits zero the device reverts back to normal - * filtering operation. A negative @inc value is used to drop the counter - * when releasing a resource needing all multicasts. - * Return 0 if successful or a negative errno code on error. - */ -int dev_set_allmulti(struct net_device *dev, int inc) +int netif_set_allmulti(struct net_device *dev, int inc, bool notify) { - unsigned int old_flags = dev->flags; + unsigned int old_flags = dev->flags, old_gflags = dev->gflags; + unsigned int allmulti, flags; ASSERT_RTNL(); - dev->flags |= IFF_ALLMULTI; - dev->allmulti += inc; - if (dev->allmulti == 0) { + allmulti = dev->allmulti + inc; + if (allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ - if (inc < 0) - dev->flags &= ~IFF_ALLMULTI; - else { - dev->allmulti -= inc; - pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n", - dev->name); + if (unlikely(inc > 0)) { + netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_ALLMULTI; + } else { + flags = old_flags | IFF_ALLMULTI; } - if (dev->flags ^ old_flags) { + WRITE_ONCE(dev->allmulti, allmulti); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); + netdev_info(dev, "%s allmulticast mode\n", + dev->flags & IFF_ALLMULTI ? "entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); + if (notify) + __dev_notify_flags(dev, old_flags, + dev->gflags ^ old_gflags, 0, NULL); } return 0; } -EXPORT_SYMBOL(dev_set_allmulti); /* * Upload unicast and multicast address lists to device and @@ -4731,10 +9643,10 @@ void __dev_set_rx_mode(struct net_device *dev) * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { - __dev_set_promiscuity(dev, 1); + __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { - __dev_set_promiscuity(dev, -1); + __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } @@ -4751,21 +9663,21 @@ void dev_set_rx_mode(struct net_device *dev) } /** - * dev_get_flags - get flags reported to userspace - * @dev: device + * netif_get_flags() - get flags reported to userspace + * @dev: device * - * Get the combination of flag bits exported through APIs to userspace. + * Get the combination of flag bits exported through APIs to userspace. */ -unsigned int dev_get_flags(const struct net_device *dev) +unsigned int netif_get_flags(const struct net_device *dev) { unsigned int flags; - flags = (dev->flags & ~(IFF_PROMISC | + flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | - (dev->gflags & (IFF_PROMISC | + (READ_ONCE(dev->gflags) & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { @@ -4779,9 +9691,10 @@ unsigned int dev_get_flags(const struct net_device *dev) return flags; } -EXPORT_SYMBOL(dev_get_flags); +EXPORT_SYMBOL(netif_get_flags); -int __dev_change_flags(struct net_device *dev, unsigned int flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; @@ -4814,38 +9727,47 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) */ ret = 0; - if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ - ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); - - if (!ret) - dev_set_rx_mode(dev); + if ((old_flags ^ flags) & IFF_UP) { + if (old_flags & IFF_UP) + __dev_close(dev); + else + ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; + old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; - dev_set_promiscuity(dev, inc); + + if (__dev_set_promiscuity(dev, inc, false) >= 0) + if (dev->flags != old_flags) + dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI - is important. Some (broken) drivers set IFF_PROMISC, when - IFF_ALLMULTI is requested not asking us and not reporting. + * is important. Some (broken) drivers set IFF_PROMISC, when + * IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; - dev_set_allmulti(dev, inc); + netif_set_allmulti(dev, inc, false); } return ret; } -void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges, u32 portid, + const struct nlmsghdr *nlh) { unsigned int changes = dev->flags ^ old_flags; + if (gchanges) + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh); + if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); @@ -4855,242 +9777,1081 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { - struct netdev_notifier_change_info change_info; + struct netdev_notifier_change_info change_info = { + .info = { + .dev = dev, + }, + .flags_changed = changes, + }; - change_info.flags_changed = changes; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, - &change_info.info); + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } -/** - * dev_change_flags - change device settings - * @dev: device - * @flags: device state flags - * - * Change settings on device based state flags. The flags are - * in the userspace exported format. - */ -int dev_change_flags(struct net_device *dev, unsigned int flags) +int netif_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; - unsigned int changes, old_flags = dev->flags; + unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; - ret = __dev_change_flags(dev, flags); + ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; - changes = old_flags ^ dev->flags; - if (changes) - rtmsg_ifinfo(RTM_NEWLINK, dev, changes); - - __dev_notify_flags(dev, old_flags); + changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); + __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } -EXPORT_SYMBOL(dev_change_flags); + +int __netif_set_mtu(struct net_device *dev, int new_mtu) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_change_mtu) + return ops->ndo_change_mtu(dev, new_mtu); + + /* Pairs with all the lockless reads of dev->mtu in the stack */ + WRITE_ONCE(dev->mtu, new_mtu); + return 0; +} +EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL"); + +int dev_validate_mtu(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) +{ + /* MTU must be positive, and in range */ + if (new_mtu < 0 || new_mtu < dev->min_mtu) { + NL_SET_ERR_MSG(extack, "mtu less than device minimum"); + return -EINVAL; + } + + if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { + NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); + return -EINVAL; + } + return 0; +} /** - * dev_set_mtu - Change maximum transfer unit - * @dev: device - * @new_mtu: new transfer unit + * netif_set_mtu_ext() - Change maximum transfer unit + * @dev: device + * @new_mtu: new transfer unit + * @extack: netlink extended ack * - * Change the maximum transfer size of the network device. + * Change the maximum transfer size of the network device. + * + * Return: 0 on success, -errno on failure. */ -int dev_set_mtu(struct net_device *dev, int new_mtu) +int netif_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) { - const struct net_device_ops *ops = dev->netdev_ops; - int err; + int err, orig_mtu; + + netdev_ops_assert_locked(dev); if (new_mtu == dev->mtu) return 0; - /* MTU must be positive. */ - if (new_mtu < 0) - return -EINVAL; + err = dev_validate_mtu(dev, new_mtu, extack); + if (err) + return err; if (!netif_device_present(dev)) return -ENODEV; - err = 0; - if (ops->ndo_change_mtu) - err = ops->ndo_change_mtu(dev, new_mtu); - else - dev->mtu = new_mtu; + err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); + err = notifier_to_errno(err); + if (err) + return err; - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + orig_mtu = dev->mtu; + err = __netif_set_mtu(dev, new_mtu); + + if (!err) { + err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, + orig_mtu); + err = notifier_to_errno(err); + if (err) { + /* setting mtu back and notifying everyone again, + * so that they have a chance to revert changes. + */ + __netif_set_mtu(dev, orig_mtu); + call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, + new_mtu); + } + } return err; } -EXPORT_SYMBOL(dev_set_mtu); -/** - * dev_set_group - Change group this device belongs to - * @dev: device - * @new_group: group this device should belong to - */ -void dev_set_group(struct net_device *dev, int new_group) +int netif_set_mtu(struct net_device *dev, int new_mtu) +{ + struct netlink_ext_ack extack; + int err; + + memset(&extack, 0, sizeof(extack)); + err = netif_set_mtu_ext(dev, new_mtu, &extack); + if (err && extack._msg) + net_err_ratelimited("%s: %s\n", dev->name, extack._msg); + return err; +} +EXPORT_SYMBOL(netif_set_mtu); + +int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +{ + unsigned int orig_len = dev->tx_queue_len; + int res; + + if (new_len != (unsigned int)new_len) + return -ERANGE; + + if (new_len != orig_len) { + WRITE_ONCE(dev->tx_queue_len, new_len); + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); + res = notifier_to_errno(res); + if (res) + goto err_rollback; + res = dev_qdisc_change_tx_queue_len(dev); + if (res) + goto err_rollback; + } + + return 0; + +err_rollback: + netdev_err(dev, "refused to change device tx_queue_len\n"); + WRITE_ONCE(dev->tx_queue_len, orig_len); + return res; +} + +void netif_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } -EXPORT_SYMBOL(dev_set_group); /** - * dev_set_mac_address - Change Media Access Control Address - * @dev: device - * @sa: new address + * netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR. + * @dev: device + * @addr: new address + * @extack: netlink extended ack * - * Change the hardware (MAC) address of the device + * Return: 0 on success, -errno on failure. */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_pre_changeaddr_info info = { + .info.dev = dev, + .info.extack = extack, + .dev_addr = addr, + }; + int rc; + + rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); + return notifier_to_errno(rc); +} +EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL"); + +int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; - if (sa->sa_family != dev->type) + if (ss->ss_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - err = ops->ndo_set_mac_address(dev, sa); + err = netif_pre_changeaddr_notify(dev, ss->__data, extack); if (err) return err; + if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) { + err = ops->ndo_set_mac_address(dev, ss); + if (err) + return err; + } dev->addr_assign_type = NET_ADDR_SET; call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } -EXPORT_SYMBOL(dev_set_mac_address); + +DECLARE_RWSEM(dev_addr_sem); + +/* "sa" is a true struct sockaddr with limited "sa_data" member. */ +int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) +{ + size_t size = sizeof(sa->sa_data); + struct net_device *dev; + int ret = 0; + + down_read(&dev_addr_sem); + rcu_read_lock(); + + dev = dev_get_by_name_rcu(net, dev_name); + if (!dev) { + ret = -ENODEV; + goto unlock; + } + if (!dev->addr_len) + memset(sa->sa_data, 0, size); + else + memcpy(sa->sa_data, dev->dev_addr, + min_t(size_t, size, dev->addr_len)); + sa->sa_family = dev->type; + +unlock: + rcu_read_unlock(); + up_read(&dev_addr_sem); + return ret; +} +EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL"); + +int netif_change_carrier(struct net_device *dev, bool new_carrier) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_change_carrier) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_carrier(dev, new_carrier); +} /** - * dev_change_carrier - Change device carrier + * dev_get_phys_port_id - Get device physical port ID * @dev: device - * @new_carrier: new value + * @ppid: port ID * - * Change device carrier + * Get device physical port ID */ -int dev_change_carrier(struct net_device *dev, bool new_carrier) +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; - if (!ops->ndo_change_carrier) + if (!ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_id(dev, ppid); +} + +/** + * dev_get_phys_port_name - Get device physical port name + * @dev: device + * @name: port name + * @len: limit of bytes to copy to name + * + * Get device physical port name + */ +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + if (ops->ndo_get_phys_port_name) { + err = ops->ndo_get_phys_port_name(dev, name, len); + if (err != -EOPNOTSUPP) + return err; + } + return devlink_compat_phys_port_name_get(dev, name, len); +} + +/** + * netif_get_port_parent_id() - Get the device's port parent identifier + * @dev: network device + * @ppid: pointer to a storage for the port's parent identifier + * @recurse: allow/disallow recursion to lower devices + * + * Get the devices's port parent identifier. + * + * Return: 0 on success, -errno on failure. + */ +int netif_get_port_parent_id(struct net_device *dev, + struct netdev_phys_item_id *ppid, bool recurse) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct netdev_phys_item_id first = { }; + struct net_device *lower_dev; + struct list_head *iter; + int err; + + if (ops->ndo_get_port_parent_id) { + err = ops->ndo_get_port_parent_id(dev, ppid); + if (err != -EOPNOTSUPP) + return err; + } + + err = devlink_compat_switch_id_get(dev, ppid); + if (!recurse || err != -EOPNOTSUPP) + return err; + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = netif_get_port_parent_id(lower_dev, ppid, true); + if (err) + break; + if (!first.id_len) + first = *ppid; + else if (memcmp(&first, ppid, sizeof(*ppid))) + return -EOPNOTSUPP; + } + + return err; +} +EXPORT_SYMBOL(netif_get_port_parent_id); + +/** + * netdev_port_same_parent_id - Indicate if two network devices have + * the same port parent identifier + * @a: first network device + * @b: second network device + */ +bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) +{ + struct netdev_phys_item_id a_id = { }; + struct netdev_phys_item_id b_id = { }; + + if (netif_get_port_parent_id(a, &a_id, true) || + netif_get_port_parent_id(b, &b_id, true)) + return false; + + return netdev_phys_item_id_same(&a_id, &b_id); +} +EXPORT_SYMBOL(netdev_port_same_parent_id); + +int netif_change_proto_down(struct net_device *dev, bool proto_down) +{ + if (!dev->change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; - return ops->ndo_change_carrier(dev, new_carrier); + if (proto_down) + netif_carrier_off(dev); + else + netif_carrier_on(dev); + WRITE_ONCE(dev->proto_down, proto_down); + return 0; } -EXPORT_SYMBOL(dev_change_carrier); /** - * dev_new_index - allocate an ifindex - * @net: the applicable net namespace + * netdev_change_proto_down_reason_locked - proto down reason * - * Returns a suitable unique value for a new device interface - * number. The caller must hold the rtnl semaphore or the - * dev_base_lock to be sure it remains unique. + * @dev: device + * @mask: proto down mask + * @value: proto down value */ -static int dev_new_index(struct net *net) +void netdev_change_proto_down_reason_locked(struct net_device *dev, + unsigned long mask, u32 value) { - int ifindex = net->ifindex; - for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(net, ifindex)) - return net->ifindex = ifindex; + u32 proto_down_reason; + int b; + + if (!mask) { + proto_down_reason = value; + } else { + proto_down_reason = dev->proto_down_reason; + for_each_set_bit(b, &mask, 32) { + if (value & (1 << b)) + proto_down_reason |= BIT(b); + else + proto_down_reason &= ~BIT(b); + } } + WRITE_ONCE(dev->proto_down_reason, proto_down_reason); } -/* Delayed registration/unregisteration */ -static LIST_HEAD(net_todo_list); +struct bpf_xdp_link { + struct bpf_link link; + struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ + int flags; +}; -static void net_set_todo(struct net_device *dev) +static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) { - list_add_tail(&dev->todo_list, &net_todo_list); + if (flags & XDP_FLAGS_HW_MODE) + return XDP_MODE_HW; + if (flags & XDP_FLAGS_DRV_MODE) + return XDP_MODE_DRV; + if (flags & XDP_FLAGS_SKB_MODE) + return XDP_MODE_SKB; + return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB; } -static void rollback_registered_many(struct list_head *head) +static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) { - struct net_device *dev, *tmp; + switch (mode) { + case XDP_MODE_SKB: + return generic_xdp_install; + case XDP_MODE_DRV: + case XDP_MODE_HW: + return dev->netdev_ops->ndo_bpf; + default: + return NULL; + } +} + +static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + return dev->xdp_state[mode].link; +} + +static struct bpf_prog *dev_xdp_prog(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + struct bpf_xdp_link *link = dev_xdp_link(dev, mode); + + if (link) + return link->link.prog; + return dev->xdp_state[mode].prog; +} + +u8 dev_xdp_prog_count(struct net_device *dev) +{ + u8 count = 0; + int i; + + for (i = 0; i < __MAX_XDP_MODE; i++) + if (dev->xdp_state[i].prog || dev->xdp_state[i].link) + count++; + return count; +} +EXPORT_SYMBOL_GPL(dev_xdp_prog_count); + +u8 dev_xdp_sb_prog_count(struct net_device *dev) +{ + u8 count = 0; + int i; + + for (i = 0; i < __MAX_XDP_MODE; i++) + if (dev->xdp_state[i].prog && + !dev->xdp_state[i].prog->aux->xdp_has_frags) + count++; + return count; +} + +int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +{ + if (!dev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + bpf->command == XDP_SETUP_PROG && + bpf->prog && !bpf->prog->aux->xdp_has_frags) { + NL_SET_ERR_MSG(bpf->extack, + "unable to propagate XDP to device using tcp-data-split"); + return -EBUSY; + } + + if (dev_get_min_mp_channel_count(dev)) { + NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); + return -EBUSY; + } + + return dev->netdev_ops->ndo_bpf(dev, bpf); +} +EXPORT_SYMBOL_GPL(netif_xdp_propagate); + +u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) +{ + struct bpf_prog *prog = dev_xdp_prog(dev, mode); + + return prog ? prog->aux->id : 0; +} + +static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_xdp_link *link) +{ + dev->xdp_state[mode].link = link; + dev->xdp_state[mode].prog = NULL; +} + +static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_prog *prog) +{ + dev->xdp_state[mode].link = NULL; + dev->xdp_state[mode].prog = prog; +} + +static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, + bpf_op_t bpf_op, struct netlink_ext_ack *extack, + u32 flags, struct bpf_prog *prog) +{ + struct netdev_bpf xdp; + int err; + + netdev_ops_assert_locked(dev); + + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + prog && !prog->aux->xdp_has_frags) { + NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); + return -EBUSY; + } + + if (dev_get_min_mp_channel_count(dev)) { + NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); + return -EBUSY; + } + + memset(&xdp, 0, sizeof(xdp)); + xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; + xdp.extack = extack; + xdp.flags = flags; + xdp.prog = prog; + + /* Drivers assume refcnt is already incremented (i.e, prog pointer is + * "moved" into driver), so they don't increment it on their own, but + * they do decrement refcnt when program is detached or replaced. + * Given net_device also owns link/prog, we need to bump refcnt here + * to prevent drivers from underflowing it. + */ + if (prog) + bpf_prog_inc(prog); + err = bpf_op(dev, &xdp); + if (err) { + if (prog) + bpf_prog_put(prog); + return err; + } + + if (mode != XDP_MODE_HW) + bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); + + return 0; +} + +static void dev_xdp_uninstall(struct net_device *dev) +{ + struct bpf_xdp_link *link; + struct bpf_prog *prog; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; - BUG_ON(dev_boot_phase); ASSERT_RTNL(); - list_for_each_entry_safe(dev, tmp, head, unreg_list) { - /* Some devices call without registering - * for initialization unwind. Remove those - * devices and proceed with the remaining. - */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - pr_debug("unregister_netdevice: device %s/%p never was registered\n", - dev->name, dev); + for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { + prog = dev_xdp_prog(dev, mode); + if (!prog) + continue; - WARN_ON(1); - list_del(&dev->unreg_list); + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) continue; + + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + + /* auto-detach link from net device */ + link = dev_xdp_link(dev, mode); + if (link) + link->dev = NULL; + else + bpf_prog_put(prog); + + dev_xdp_set_link(dev, mode, NULL); + } +} + +static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, + struct bpf_xdp_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog, u32 flags) +{ + unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); + struct bpf_prog *cur_prog; + struct net_device *upper; + struct list_head *iter; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + int err; + + ASSERT_RTNL(); + + /* either link or prog attachment, never both */ + if (link && (new_prog || old_prog)) + return -EINVAL; + /* link supports only XDP mode flags */ + if (link && (flags & ~XDP_FLAGS_MODES)) { + NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); + return -EINVAL; + } + /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ + if (num_modes > 1) { + NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); + return -EINVAL; + } + /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ + if (!num_modes && dev_xdp_prog_count(dev) > 1) { + NL_SET_ERR_MSG(extack, + "More than one program loaded, unset mode is ambiguous"); + return -EINVAL; + } + /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ + if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { + NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); + return -EINVAL; + } + + mode = dev_xdp_mode(dev, flags); + /* can't replace attached link */ + if (dev_xdp_link(dev, mode)) { + NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); + return -EBUSY; + } + + /* don't allow if an upper device already has a program */ + netdev_for_each_upper_dev_rcu(dev, upper, iter) { + if (dev_xdp_prog_count(upper) > 0) { + NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); + return -EEXIST; } - dev->dismantle = true; - BUG_ON(dev->reg_state != NETREG_REGISTERED); } - /* If device is running, close it first. */ - dev_close_many(head); + cur_prog = dev_xdp_prog(dev, mode); + /* can't replace attached prog with link */ + if (link && cur_prog) { + NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); + return -EBUSY; + } + if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { + NL_SET_ERR_MSG(extack, "Active program does not match expected"); + return -EEXIST; + } - list_for_each_entry(dev, head, unreg_list) { - /* And unlink it from device chain. */ - unlist_netdevice(dev); + /* put effective new program into new_prog */ + if (link) + new_prog = link->link.prog; - dev->reg_state = NETREG_UNREGISTERING; + if (new_prog) { + bool offload = mode == XDP_MODE_HW; + enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB + ? XDP_MODE_DRV : XDP_MODE_SKB; + + if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { + NL_SET_ERR_MSG(extack, "XDP program already attached"); + return -EBUSY; + } + if (!offload && dev_xdp_prog(dev, other_mode)) { + NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); + return -EEXIST; + } + if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { + NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); + return -EINVAL; + } + if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { + NL_SET_ERR_MSG(extack, "Program bound to different device"); + return -EINVAL; + } + if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { + NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); + return -EINVAL; + } + if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + return -EINVAL; + } + if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); + return -EINVAL; + } } - synchronize_net(); + /* don't call drivers if the effective program didn't change */ + if (new_prog != cur_prog) { + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) { + NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); + return -EOPNOTSUPP; + } - list_for_each_entry(dev, head, unreg_list) { - /* Shutdown queueing discipline. */ - dev_shutdown(dev); + err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); + if (err) + return err; + } + if (link) + dev_xdp_set_link(dev, mode, link); + else + dev_xdp_set_prog(dev, mode, new_prog); + if (cur_prog) + bpf_prog_put(cur_prog); - /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + return 0; +} - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); +static int dev_xdp_attach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); +} - /* - * Flush the unicast and multicast chains - */ - dev_uc_flush(dev); - dev_mc_flush(dev); +static int dev_xdp_detach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; - if (dev->netdev_ops->ndo_uninit) - dev->netdev_ops->ndo_uninit(dev); + ASSERT_RTNL(); - /* Notifier chain MUST detach us all upper devices. */ - WARN_ON(netdev_has_any_upper_dev(dev)); + mode = dev_xdp_mode(dev, link->flags); + if (dev_xdp_link(dev, mode) != link) + return -EINVAL; - /* Remove entries from kobject tree */ - netdev_unregister_kobject(dev); -#ifdef CONFIG_XPS - /* Remove XPS queueing entries */ - netif_reset_xps_queues_gt(dev, 0); + bpf_op = dev_xdp_bpf_op(dev, mode); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + dev_xdp_set_link(dev, mode, NULL); + return 0; +} + +static void bpf_xdp_link_release(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + rtnl_lock(); + + /* if racing with net_device's tear down, xdp_link->dev might be + * already NULL, in which case link was already auto-detached + */ + if (xdp_link->dev) { + netdev_lock_ops(xdp_link->dev); + WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + netdev_unlock_ops(xdp_link->dev); + xdp_link->dev = NULL; + } + + rtnl_unlock(); +} + +static int bpf_xdp_link_detach(struct bpf_link *link) +{ + bpf_xdp_link_release(link); + return 0; +} + +static void bpf_xdp_link_dealloc(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + kfree(xdp_link); +} + +static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); +} + +static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + info->xdp.ifindex = ifindex; + return 0; +} + +static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + int err = 0; + + rtnl_lock(); + + /* link might have been auto-released already, so fail */ + if (!xdp_link->dev) { + err = -ENOLINK; + goto out_unlock; + } + + if (old_prog && link->prog != old_prog) { + err = -EPERM; + goto out_unlock; + } + old_prog = link->prog; + if (old_prog->type != new_prog->type || + old_prog->expected_attach_type != new_prog->expected_attach_type) { + err = -EINVAL; + goto out_unlock; + } + + if (old_prog == new_prog) { + /* no-op, don't disturb drivers */ + bpf_prog_put(new_prog); + goto out_unlock; + } + + netdev_lock_ops(xdp_link->dev); + mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); + bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); + err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, + xdp_link->flags, new_prog); + netdev_unlock_ops(xdp_link->dev); + if (err) + goto out_unlock; + + old_prog = xchg(&link->prog, new_prog); + bpf_prog_put(old_prog); + +out_unlock: + rtnl_unlock(); + return err; +} + +static const struct bpf_link_ops bpf_xdp_link_lops = { + .release = bpf_xdp_link_release, + .dealloc = bpf_xdp_link_dealloc, + .detach = bpf_xdp_link_detach, + .show_fdinfo = bpf_xdp_link_show_fdinfo, + .fill_link_info = bpf_xdp_link_fill_link_info, + .update_prog = bpf_xdp_link_update, +}; + +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct netlink_ext_ack extack = {}; + struct bpf_xdp_link *link; + struct net_device *dev; + int err, fd; + + rtnl_lock(); + dev = dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) { + rtnl_unlock(); + return -EINVAL; + } + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto unlock; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog, + attr->link_create.attach_type); + link->dev = dev; + link->flags = attr->link_create.flags; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto unlock; + } + + netdev_lock_ops(dev); + err = dev_xdp_attach_link(dev, &extack, link); + netdev_unlock_ops(dev); + rtnl_unlock(); + + if (err) { + link->dev = NULL; + bpf_link_cleanup(&link_primer); + trace_bpf_xdp_link_attach_failed(extack._msg); + goto out_put_dev; + } + + fd = bpf_link_settle(&link_primer); + /* link itself doesn't hold dev's refcnt to not complicate shutdown */ + dev_put(dev); + return fd; + +unlock: + rtnl_unlock(); + +out_put_dev: + dev_put(dev); + return err; +} + +/** + * dev_change_xdp_fd - set or clear a bpf program for a device rx path + * @dev: device + * @extack: netlink extended ack + * @fd: new program fd or negative value to clear + * @expected_fd: old program fd that userspace expects to replace or clear + * @flags: xdp-related flags + * + * Set or clear a bpf program for a device + */ +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, int expected_fd, u32 flags) +{ + enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); + struct bpf_prog *new_prog = NULL, *old_prog = NULL; + int err; + + ASSERT_RTNL(); + + if (fd >= 0) { + new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + mode != XDP_MODE_SKB); + if (IS_ERR(new_prog)) + return PTR_ERR(new_prog); + } + + if (expected_fd >= 0) { + old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, + mode != XDP_MODE_SKB); + if (IS_ERR(old_prog)) { + err = PTR_ERR(old_prog); + old_prog = NULL; + goto err_out; + } + } + + err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); + +err_out: + if (err && new_prog) + bpf_prog_put(new_prog); + if (old_prog) + bpf_prog_put(old_prog); + return err; +} + +u32 dev_get_min_mp_channel_count(const struct net_device *dev) +{ + int i; + + netdev_ops_assert_locked(dev); + + for (i = dev->real_num_rx_queues - 1; i >= 0; i--) + if (dev->_rx[i].mp_params.mp_priv) + /* The channel count is the idx plus 1. */ + return i + 1; + + return 0; +} + +/** + * dev_index_reserve() - allocate an ifindex in a namespace + * @net: the applicable net namespace + * @ifindex: requested ifindex, pass %0 to get one allocated + * + * Allocate a ifindex for a new device. Caller must either use the ifindex + * to store the device (via list_netdevice()) or call dev_index_release() + * to give the index up. + * + * Return: a suitable unique value for a new device interface number or -errno. + */ +static int dev_index_reserve(struct net *net, u32 ifindex) +{ + int err; + + if (ifindex > INT_MAX) { + DEBUG_NET_WARN_ON_ONCE(1); + return -EINVAL; + } + + if (!ifindex) + err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, + xa_limit_31b, &net->ifindex, GFP_KERNEL); + else + err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); + if (err < 0) + return err; + + return ifindex; +} + +static void dev_index_release(struct net *net, int ifindex) +{ + /* Expect only unused indexes, unlist_netdevice() removes the used */ + WARN_ON(xa_erase(&net->dev_by_index, ifindex)); +} + +static bool from_cleanup_net(void) +{ +#ifdef CONFIG_NET_NS + return current == READ_ONCE(cleanup_net_task); +#else + return false; #endif +} + +/* Delayed registration/unregisteration */ +LIST_HEAD(net_todo_list); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); +atomic_t dev_unreg_count = ATOMIC_INIT(0); + +static void net_set_todo(struct net_device *dev) +{ + list_add_tail(&dev->todo_list, &net_todo_list); +} + +static netdev_features_t netdev_sync_upper_features(struct net_device *lower, + struct net_device *upper, netdev_features_t features) +{ + netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; + netdev_features_t feature; + int feature_bit; + + for_each_netdev_feature(upper_disables, feature_bit) { + feature = __NETIF_F_BIT(feature_bit); + if (!(upper->wanted_features & feature) + && (features & feature)) { + netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", + &feature, upper->name); + features &= ~feature; + } } - synchronize_net(); + return features; +} - list_for_each_entry(dev, head, unreg_list) - dev_put(dev); +static void netdev_sync_lower_features(struct net_device *upper, + struct net_device *lower, netdev_features_t features) +{ + netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; + netdev_features_t feature; + int feature_bit; + + for_each_netdev_feature(upper_disables, feature_bit) { + feature = __NETIF_F_BIT(feature_bit); + if (!(features & feature) && (lower->features & feature)) { + netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", + &feature, lower->name); + netdev_lock_ops(lower); + lower->wanted_features &= ~feature; + __netdev_update_features(lower); + + if (unlikely(lower->features & feature)) + netdev_WARN(upper, "failed to disable %pNF on %s!\n", + &feature, lower->name); + else + netdev_features_change(lower); + netdev_unlock_ops(lower); + } + } } -static void rollback_registered(struct net_device *dev) +static bool netdev_has_ip_or_hw_csum(netdev_features_t features) { - LIST_HEAD(single); + netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + bool ip_csum = (features & ip_csum_mask) == ip_csum_mask; + bool hw_csum = features & NETIF_F_HW_CSUM; - list_add(&dev->unreg_list, &single); - rollback_registered_many(&single); - list_del(&single); + return ip_csum || hw_csum; } static netdev_features_t netdev_fix_features(struct net_device *dev, @@ -5122,6 +10883,10 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_TSO6; } + /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ + if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) + features &= ~NETIF_F_TSO_MANGLEID; + /* TSO ECN requires that TSO is present as well. */ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; @@ -5132,33 +10897,71 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_GSO; } - /* UFO needs SG and checksumming */ - if (features & NETIF_F_UFO) { - /* maybe split UFO into V4 and V6? */ - if (!((features & NETIF_F_GEN_CSUM) || - (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) - == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - netdev_dbg(dev, - "Dropping NETIF_F_UFO since no checksum offload features.\n"); - features &= ~NETIF_F_UFO; + /* GSO partial features require GSO partial be set */ + if ((features & dev->gso_partial_features) && + !(features & NETIF_F_GSO_PARTIAL)) { + netdev_dbg(dev, + "Dropping partially supported GSO features since no GSO partial.\n"); + features &= ~dev->gso_partial_features; + } + + if (!(features & NETIF_F_RXCSUM)) { + /* NETIF_F_GRO_HW implies doing RXCSUM since every packet + * successfully merged by hardware must also have the + * checksum verified by hardware. If the user does not + * want to enable RXCSUM, logically, we should disable GRO_HW. + */ + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); + features &= ~NETIF_F_GRO_HW; + } + } + + /* LRO/HW-GRO features cannot be combined with RX-FCS */ + if (features & NETIF_F_RXFCS) { + if (features & NETIF_F_LRO) { + netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); + features &= ~NETIF_F_LRO; } - if (!(features & NETIF_F_SG)) { - netdev_dbg(dev, - "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); - features &= ~NETIF_F_UFO; + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); + features &= ~NETIF_F_GRO_HW; } } + if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { + netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); + features &= ~NETIF_F_LRO; + } + + if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; + } + + if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { + netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_RX; + } + + if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) { + netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n"); + features &= ~NETIF_F_GSO_UDP_L4; + } + return features; } int __netdev_update_features(struct net_device *dev) { + struct net_device *upper, *lower; netdev_features_t features; - int err = 0; + struct list_head *iter; + int err = -1; ASSERT_RTNL(); + netdev_ops_assert_locked(dev); features = netdev_get_wanted_features(dev); @@ -5168,26 +10971,81 @@ int __netdev_update_features(struct net_device *dev) /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); + /* some features can't be enabled if they're off on an upper device */ + netdev_for_each_upper_dev_rcu(dev, upper, iter) + features = netdev_sync_upper_features(dev, upper, features); + if (dev->features == features) - return 0; + goto sync_lower; netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", &dev->features, &features); if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); + else + err = 0; if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); + /* return non-0 since some features might have changed and + * it's better to fire a spurious notification than miss it + */ return -1; } - if (!err) +sync_lower: + /* some features must be disabled on lower devices when disabled + * on an upper device (think: bonding master or bridge) + */ + netdev_for_each_lower_dev(dev, lower, iter) + netdev_sync_lower_features(dev, lower, features); + + if (!err) { + netdev_features_t diff = features ^ dev->features; + + if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { + /* udp_tunnel_{get,drop}_rx_info both need + * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the + * device, or they won't do anything. + * Thus we need to update dev->features + * *before* calling udp_tunnel_get_rx_info, + * but *after* calling udp_tunnel_drop_rx_info. + */ + udp_tunnel_nic_lock(dev); + if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { + dev->features = features; + udp_tunnel_get_rx_info(dev); + } else { + udp_tunnel_drop_rx_info(dev); + } + udp_tunnel_nic_unlock(dev); + } + + if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { + if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { + dev->features = features; + err |= vlan_get_rx_ctag_filter_info(dev); + } else { + vlan_drop_rx_ctag_filter_info(dev); + } + } + + if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { + if (features & NETIF_F_HW_VLAN_STAG_FILTER) { + dev->features = features; + err |= vlan_get_rx_stag_filter_info(dev); + } else { + vlan_drop_rx_stag_filter_info(dev); + } + } + dev->features = features; + } - return 1; + return err < 0 ? 0 : 1; } /** @@ -5239,35 +11097,65 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, else netif_dormant_off(dev); - if (netif_carrier_ok(rootdev)) { - if (!netif_carrier_ok(dev)) - netif_carrier_on(dev); - } else { - if (netif_carrier_ok(dev)) - netif_carrier_off(dev); - } + if (rootdev->operstate == IF_OPER_TESTING) + netif_testing_on(dev); + else + netif_testing_off(dev); + + if (netif_carrier_ok(rootdev)) + netif_carrier_on(dev); + else + netif_carrier_off(dev); } EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_RPS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; + size_t sz = count * sizeof(*rx); + int err = 0; BUG_ON(count < 1); - rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; dev->_rx = rx; - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { rx[i].dev = dev; + + /* XDP RX-queue setup */ + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); + if (err < 0) + goto err_rxq_info; + } return 0; + +err_rxq_info: + /* Rollback successful reg's and free other resources */ + while (i--) + xdp_rxq_info_unreg(&rx[i].xdp_rxq); + kvfree(dev->_rx); + dev->_rx = NULL; + return err; +} + +static void netif_free_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + + /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ + if (!dev->_rx) + return; + + for (i = 0; i < count; i++) + xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); + + kvfree(dev->_rx); } -#endif static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) @@ -5285,10 +11173,7 @@ static void netdev_init_one_queue(struct net_device *dev, static void netif_free_tx_queues(struct net_device *dev) { - if (is_vmalloc_addr(dev->_tx)) - vfree(dev->_tx); - else - kfree(dev->_tx); + kvfree(dev->_tx); } static int netif_alloc_netdev_queues(struct net_device *dev) @@ -5297,14 +11182,13 @@ static int netif_alloc_netdev_queues(struct net_device *dev) struct netdev_queue *tx; size_t sz = count * sizeof(*tx); - BUG_ON(count < 1 || count > 0xffff); + if (count < 1 || count > 0xffff) + return -EINVAL; + + tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); + if (!tx) + return -ENOMEM; - tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!tx) { - tx = vzalloc(sz); - if (!tx) - return -ENOMEM; - } dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); @@ -5313,28 +11197,93 @@ static int netif_alloc_netdev_queues(struct net_device *dev) return 0; } +void netif_tx_stop_all_queues(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + + netif_tx_stop_queue(txq); + } +} +EXPORT_SYMBOL(netif_tx_stop_all_queues); + +static int netdev_do_alloc_pcpu_stats(struct net_device *dev) +{ + void __percpu *v; + + /* Drivers implementing ndo_get_peer_dev must support tstat + * accounting, so that skb_do_redirect() can bump the dev's + * RX stats upon network namespace switch. + */ + if (dev->netdev_ops->ndo_get_peer_dev && + dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) + return -EOPNOTSUPP; + + switch (dev->pcpu_stat_type) { + case NETDEV_PCPU_STAT_NONE: + return 0; + case NETDEV_PCPU_STAT_LSTATS: + v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); + break; + case NETDEV_PCPU_STAT_TSTATS: + v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + break; + case NETDEV_PCPU_STAT_DSTATS: + v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); + break; + default: + return -EINVAL; + } + + return v ? 0 : -ENOMEM; +} + +static void netdev_do_free_pcpu_stats(struct net_device *dev) +{ + switch (dev->pcpu_stat_type) { + case NETDEV_PCPU_STAT_NONE: + return; + case NETDEV_PCPU_STAT_LSTATS: + free_percpu(dev->lstats); + break; + case NETDEV_PCPU_STAT_TSTATS: + free_percpu(dev->tstats); + break; + case NETDEV_PCPU_STAT_DSTATS: + free_percpu(dev->dstats); + break; + } +} + +static void netdev_free_phy_link_topology(struct net_device *dev) +{ + struct phy_link_topology *topo = dev->link_topo; + + if (IS_ENABLED(CONFIG_PHYLIB) && topo) { + xa_destroy(&topo->phys); + kfree(topo); + dev->link_topo = NULL; + } +} + /** - * register_netdevice - register a network device - * @dev: device to register + * register_netdevice() - register a network device + * @dev: device to register * - * Take a completed network device structure and add it to the kernel - * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier - * chain. 0 is returned on success. A negative errno code is returned - * on a failure to set up the device, or if the name is a duplicate. - * - * Callers must hold the rtnl semaphore. You may want - * register_netdev() instead of this. - * - * BUGS: - * The locking appears insufficient to guarantee two parallel registers - * will not get the same name. + * Take a prepared network device structure and make it externally accessible. + * A %NETDEV_REGISTER message is sent to the netdev notifier chain. + * Callers must hold the rtnl lock - you may want register_netdev() + * instead of this. */ - int register_netdevice(struct net_device *dev) { int ret; struct net *net = dev_net(dev); + BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < + NETDEV_FEATURE_COUNT); BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -5344,22 +11293,33 @@ int register_netdevice(struct net_device *dev) BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); + ret = ethtool_check_ops(dev->ethtool_ops); + if (ret) + return ret; + + /* rss ctx ID 0 is reserved for the default context, start from 1 */ + xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1); + mutex_init(&dev->ethtool->rss_lock); + spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); - dev->iflink = -1; - ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; + ret = -ENOMEM; + dev->name_node = netdev_name_node_head_alloc(dev); + if (!dev->name_node) + goto out; + /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); if (ret) { if (ret > 0) ret = -EIO; - goto out; + goto err_free_name; } } @@ -5372,30 +11332,44 @@ int register_netdevice(struct net_device *dev) goto err_uninit; } - ret = -EBUSY; - if (!dev->ifindex) - dev->ifindex = dev_new_index(net); - else if (__dev_get_by_index(net, dev->ifindex)) + ret = netdev_do_alloc_pcpu_stats(dev); + if (ret) goto err_uninit; - if (dev->iflink == -1) - dev->iflink = dev->ifindex; + ret = dev_index_reserve(net, dev->ifindex); + if (ret < 0) + goto err_free_pcpu; + dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ - dev->hw_features |= NETIF_F_SOFT_FEATURES; + dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; + + if (dev->udp_tunnel_nic_info) { + dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; + dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; + } + dev->wanted_features = dev->features & dev->hw_features; - /* Turn on no cache copy if HW is doing checksum */ - if (!(dev->flags & IFF_LOOPBACK)) { + if (!(dev->flags & IFF_LOOPBACK)) dev->hw_features |= NETIF_F_NOCACHE_COPY; - if (dev->features & NETIF_F_ALL_CSUM) { - dev->wanted_features |= NETIF_F_NOCACHE_COPY; - dev->features |= NETIF_F_NOCACHE_COPY; - } - } + + /* If IPv4 TCP segmentation offload is supported we should also + * allow the device to enable segmenting the frame with the option + * of ignoring a static IP ID value. This doesn't enable the + * feature itself but allows the user to enable it later. + */ + if (dev->hw_features & NETIF_F_TSO) + dev->hw_features |= NETIF_F_TSO_MANGLEID; + if (dev->vlan_features & NETIF_F_TSO) + dev->vlan_features |= NETIF_F_TSO_MANGLEID; + if (dev->mpls_features & NETIF_F_TSO) + dev->mpls_features |= NETIF_F_TSO_MANGLEID; + if (dev->hw_enc_features & NETIF_F_TSO) + dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. */ @@ -5403,7 +11377,7 @@ int register_netdevice(struct net_device *dev) /* Make NETIF_F_SG inheritable to tunnel devices. */ - dev->hw_enc_features |= NETIF_F_SG; + dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; /* Make NETIF_F_SG inheritable to MPLS. */ @@ -5412,14 +11386,20 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) - goto err_uninit; + goto err_ifindex_release; ret = netdev_register_kobject(dev); + + netdev_lock(dev); + WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); + netdev_unlock(dev); + if (ret) - goto err_uninit; - dev->reg_state = NETREG_REGISTERED; + goto err_uninit_notify; + netdev_lock_ops(dev); __netdev_update_features(dev); + netdev_unlock_ops(dev); /* * Default initial state at registry is that the @@ -5431,8 +11411,10 @@ int register_netdevice(struct net_device *dev) linkwatch_init_dev(dev); dev_init_scheduler(dev); - dev_hold(dev); + + netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); list_netdevice(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); /* If the device has permanent device address, driver should @@ -5443,57 +11425,54 @@ int register_netdevice(struct net_device *dev) memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ + netdev_lock_ops(dev); ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); + netdev_unlock_ops(dev); ret = notifier_to_errno(ret); if (ret) { - rollback_registered(dev); - dev->reg_state = NETREG_UNREGISTERED; + /* Expect explicit free_netdev() on failure */ + dev->needs_free_netdev = false; + unregister_netdevice_queue(dev, NULL); + goto out; } /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: return ret; +err_uninit_notify: + call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); +err_ifindex_release: + dev_index_release(net, dev->ifindex); +err_free_pcpu: + netdev_do_free_pcpu_stats(dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); + if (dev->priv_destructor) + dev->priv_destructor(dev); +err_free_name: + netdev_name_node_free(dev->name_node); goto out; } EXPORT_SYMBOL(register_netdevice); -/** - * init_dummy_netdev - init a dummy network device for NAPI - * @dev: device to init - * - * This takes a network device structure and initialize the minimum - * amount of fields so it can be used to schedule NAPI polls without - * registering a full blown interface. This is to be used by drivers - * that need to tie several hardware interfaces to a single NAPI - * poll scheduler due to HW limitations. +/* Initialize the core of a dummy net device. + * The setup steps dummy netdevs need which normal netdevs get by going + * through register_netdevice(). */ -int init_dummy_netdev(struct net_device *dev) +static void init_dummy_netdev(struct net_device *dev) { - /* Clear everything. Note we don't initialize spinlocks - * are they aren't supposed to be taken by any of the - * NAPI code and this dummy netdev is supposed to be - * only ever used for NAPI polls - */ - memset(dev, 0, sizeof(struct net_device)); - /* make sure we BUG if trying to hit standard * register/unregister code path */ dev->reg_state = NETREG_DUMMY; - /* NAPI wants this */ - INIT_LIST_HEAD(&dev->napi_list); - /* a dummy interface is started by default */ set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); @@ -5502,11 +11481,7 @@ int init_dummy_netdev(struct net_device *dev) * because users of this 'device' dont need to change * its refcount. */ - - return 0; } -EXPORT_SYMBOL_GPL(init_dummy_netdev); - /** * register_netdev - register a network device @@ -5523,28 +11498,41 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev); */ int register_netdev(struct net_device *dev) { + struct net *net = dev_net(dev); int err; - rtnl_lock(); + if (rtnl_net_lock_killable(net)) + return -EINTR; + err = register_netdevice(dev); - rtnl_unlock(); + + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { +#ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; +#else + return refcount_read(&dev->dev_refcnt); +#endif } EXPORT_SYMBOL(netdev_refcnt_read); +int netdev_unregister_timeout_secs __read_mostly = 10; + +#define WAIT_REFS_MIN_MSECS 1 +#define WAIT_REFS_MAX_MSECS 250 /** - * netdev_wait_allrefs - wait until all references are gone. - * @dev: target net_device + * netdev_wait_allrefs_any - wait until all references are gone. + * @list: list of net_devices to wait on * * This is called when unregistering network devices. * @@ -5554,51 +11542,69 @@ EXPORT_SYMBOL(netdev_refcnt_read); * We can get stuck here if buggy protocols don't correctly * call dev_put. */ -static void netdev_wait_allrefs(struct net_device *dev) +static struct net_device *netdev_wait_allrefs_any(struct list_head *list) { unsigned long rebroadcast_time, warning_time; - int refcnt; - - linkwatch_forget_dev(dev); + struct net_device *dev; + int wait = 0; rebroadcast_time = warning_time = jiffies; - refcnt = netdev_refcnt_read(dev); - while (refcnt != 0) { + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; + + while (true) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + list_for_each_entry(dev, list, todo_list) + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - if (test_bit(__LINK_STATE_LINKWATCH_PENDING, - &dev->state)) { - /* We must not have linkwatch events - * pending on unregister. If this - * happens, we simply run the queue - * unscheduled, resulting in a noop - * for this device. - */ - linkwatch_run_queue(); - } + list_for_each_entry(dev, list, todo_list) + if (test_bit(__LINK_STATE_LINKWATCH_PENDING, + &dev->state)) { + /* We must not have linkwatch events + * pending on unregister. If this + * happens, we simply run the queue + * unscheduled, resulting in a noop + * for this device. + */ + linkwatch_run_queue(); + break; + } __rtnl_unlock(); rebroadcast_time = jiffies; } - msleep(250); + rcu_barrier(); - refcnt = netdev_refcnt_read(dev); + if (!wait) { + wait = WAIT_REFS_MIN_MSECS; + } else { + msleep(wait); + wait = min(wait << 1, WAIT_REFS_MAX_MSECS); + } + + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; + + if (time_after(jiffies, warning_time + + READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { + list_for_each_entry(dev, list, todo_list) { + pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", + dev->name, netdev_refcnt_read(dev)); + ref_tracker_dir_print(&dev->refcnt_tracker, 10); + } - if (time_after(jiffies, warning_time + 10 * HZ)) { - pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", - dev->name, refcnt); warning_time = jiffies; } } @@ -5630,76 +11636,172 @@ static void netdev_wait_allrefs(struct net_device *dev) */ void netdev_run_todo(void) { + struct net_device *dev, *tmp; struct list_head list; + int cnt; +#ifdef CONFIG_LOCKDEP + struct list_head unlink_list; + + list_replace_init(&net_unlink_list, &unlink_list); + + while (!list_empty(&unlink_list)) { + dev = list_first_entry(&unlink_list, struct net_device, + unlink_list); + list_del_init(&dev->unlink_list); + dev->nested_level = dev->lower_level - 1; + } +#endif /* Snapshot list, allow later requests */ list_replace_init(&net_todo_list, &list); __rtnl_unlock(); - /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); - while (!list_empty(&list)) { - struct net_device *dev - = list_first_entry(&list, struct net_device, todo_list); - list_del(&dev->todo_list); - - rtnl_lock(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - __rtnl_unlock(); - + list_for_each_entry_safe(dev, tmp, &list, todo_list) { if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { - pr_err("network todo '%s' but state %d\n", - dev->name, dev->reg_state); - dump_stack(); + netdev_WARN(dev, "run_todo but not unregistering\n"); + list_del(&dev->todo_list); continue; } - dev->reg_state = NETREG_UNREGISTERED; - - on_each_cpu(flush_backlog, dev, 1); + netdev_lock(dev); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); + netdev_unlock(dev); + linkwatch_sync_dev(dev); + } - netdev_wait_allrefs(dev); + cnt = 0; + while (!list_empty(&list)) { + dev = netdev_wait_allrefs_any(&list); + list_del(&dev->todo_list); /* paranoia */ - BUG_ON(netdev_refcnt_read(dev)); + BUG_ON(netdev_refcnt_read(dev) != 1); + BUG_ON(!list_empty(&dev->ptype_all)); + BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); - WARN_ON(dev->dn_ptr); - if (dev->destructor) - dev->destructor(dev); + netdev_do_free_pcpu_stats(dev); + if (dev->priv_destructor) + dev->priv_destructor(dev); + if (dev->needs_free_netdev) + free_netdev(dev); + + cnt++; /* Free network device */ kobject_put(&dev->dev.kobj); } + if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count)) + wake_up(&netdev_unregistering_wq); +} + +/* Collate per-cpu network dstats statistics + * + * Read per-cpu network statistics from dev->dstats and populate the related + * fields in @s. + */ +static void dev_fetch_dstats(struct rtnl_link_stats64 *s, + const struct pcpu_dstats __percpu *dstats) +{ + int cpu; + + for_each_possible_cpu(cpu) { + u64 rx_packets, rx_bytes, rx_drops; + u64 tx_packets, tx_bytes, tx_drops; + const struct pcpu_dstats *stats; + unsigned int start; + + stats = per_cpu_ptr(dstats, cpu); + do { + start = u64_stats_fetch_begin(&stats->syncp); + rx_packets = u64_stats_read(&stats->rx_packets); + rx_bytes = u64_stats_read(&stats->rx_bytes); + rx_drops = u64_stats_read(&stats->rx_drops); + tx_packets = u64_stats_read(&stats->tx_packets); + tx_bytes = u64_stats_read(&stats->tx_bytes); + tx_drops = u64_stats_read(&stats->tx_drops); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + s->rx_packets += rx_packets; + s->rx_bytes += rx_bytes; + s->rx_dropped += rx_drops; + s->tx_packets += tx_packets; + s->tx_bytes += tx_bytes; + s->tx_dropped += tx_drops; + } +} + +/* ndo_get_stats64 implementation for dtstats-based accounting. + * + * Populate @s from dev->stats and dev->dstats. This is used internally by the + * core for NETDEV_PCPU_STAT_DSTAT-type stats collection. + */ +static void dev_get_dstats64(const struct net_device *dev, + struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_dstats(s, dev->dstats); } -/* Convert net_device_stats to rtnl_link_stats64. They have the same - * fields in the same order, with only the type differing. +/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has + * all the same fields in the same order as net_device_stats, with only + * the type differing, but rtnl_link_stats64 may have additional fields + * at the end for newer counters. */ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { -#if BITS_PER_LONG == 64 - BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); - memcpy(stats64, netdev_stats, sizeof(*stats64)); -#else - size_t i, n = sizeof(*stats64) / sizeof(u64); - const unsigned long *src = (const unsigned long *)netdev_stats; + size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); + const atomic_long_t *src = (atomic_long_t *)netdev_stats; u64 *dst = (u64 *)stats64; - BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != - sizeof(*stats64) / sizeof(u64)); + BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) - dst[i] = src[i]; -#endif + dst[i] = (unsigned long)atomic_long_read(&src[i]); + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + n * sizeof(u64), 0, + sizeof(*stats64) - n * sizeof(u64)); } EXPORT_SYMBOL(netdev_stats_to_stats64); +static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc( + struct net_device *dev) +{ + struct net_device_core_stats __percpu *p; + + p = alloc_percpu_gfp(struct net_device_core_stats, + GFP_ATOMIC | __GFP_NOWARN); + + if (p && cmpxchg(&dev->core_stats, NULL, p)) + free_percpu(p); + + /* This READ_ONCE() pairs with the cmpxchg() above */ + return READ_ONCE(dev->core_stats); +} + +noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset) +{ + /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ + struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); + unsigned long __percpu *field; + + if (unlikely(!p)) { + p = netdev_core_stats_alloc(dev); + if (!p) + return; + } + + field = (unsigned long __percpu *)((void __percpu *)p + offset); + this_cpu_inc(*field); +} +EXPORT_SYMBOL_GPL(netdev_core_stats_inc); + /** * dev_get_stats - get network device statistics * @dev: device to get statistics from @@ -5714,20 +11816,102 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage) { const struct net_device_ops *ops = dev->netdev_ops; + const struct net_device_core_stats __percpu *p; + + /* + * IPv{4,6} and udp tunnels share common stat helpers and use + * different stat type (NETDEV_PCPU_STAT_TSTATS vs + * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent. + */ + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) != + offsetof(struct pcpu_dstats, rx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) != + offsetof(struct pcpu_dstats, rx_packets)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) != + offsetof(struct pcpu_dstats, tx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) != + offsetof(struct pcpu_dstats, tx_packets)); if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); + } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { + dev_get_tstats64(dev, storage); + } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) { + dev_get_dstats64(dev, storage); } else { netdev_stats_to_stats64(storage, &dev->stats); } - storage->rx_dropped += atomic_long_read(&dev->rx_dropped); + + /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ + p = READ_ONCE(dev->core_stats); + if (p) { + const struct net_device_core_stats *core_stats; + int i; + + for_each_possible_cpu(i) { + core_stats = per_cpu_ptr(p, i); + storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); + storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); + storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); + storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); + } + } return storage; } EXPORT_SYMBOL(dev_get_stats); +/** + * dev_fetch_sw_netstats - get per-cpu network device statistics + * @s: place to store stats + * @netstats: per-cpu network stats to read from + * + * Read per-cpu network statistics and populate the related fields in @s. + */ +void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, + const struct pcpu_sw_netstats __percpu *netstats) +{ + int cpu; + + for_each_possible_cpu(cpu) { + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + const struct pcpu_sw_netstats *stats; + unsigned int start; + + stats = per_cpu_ptr(netstats, cpu); + do { + start = u64_stats_fetch_begin(&stats->syncp); + rx_packets = u64_stats_read(&stats->rx_packets); + rx_bytes = u64_stats_read(&stats->rx_bytes); + tx_packets = u64_stats_read(&stats->tx_packets); + tx_bytes = u64_stats_read(&stats->tx_bytes); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + s->rx_packets += rx_packets; + s->rx_bytes += rx_bytes; + s->tx_packets += tx_packets; + s->tx_bytes += tx_bytes; + } +} +EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); + +/** + * dev_get_tstats64 - ndo_get_stats64 implementation + * @dev: device to get statistics from + * @s: place to store stats + * + * Populate @s from dev->stats and dev->tstats. Can be used as + * ndo_get_stats64() callback. + */ +void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_sw_netstats(s, dev->tstats); +} +EXPORT_SYMBOL_GPL(dev_get_tstats64); + struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); @@ -5739,8 +11923,8 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); - queue->qdisc = &noop_qdisc; - queue->qdisc_sleeping = &noop_qdisc; + RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); + RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc); rcu_assign_pointer(dev->ingress_queue, queue); #endif return queue; @@ -5757,24 +11941,44 @@ void netdev_set_default_ethtool_ops(struct net_device *dev, EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); /** - * alloc_netdev_mqs - allocate network device - * @sizeof_priv: size of private data to allocate space for - * @name: device name format string - * @setup: callback to initialize device - * @txqs: the number of TX subqueues to allocate - * @rxqs: the number of RX subqueues to allocate + * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default + * @dev: netdev to enable the IRQ coalescing on * - * Allocates a struct net_device with private data area for driver use - * and performs basic initialization. Also allocates subquue structs - * for each queue on the device. + * Sets a conservative default for SW IRQ coalescing. Users can use + * sysfs attributes to override the default values. + */ +void netdev_sw_irq_coalesce_default_on(struct net_device *dev) +{ + WARN_ON(dev->reg_state == NETREG_REGISTERED); + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + netdev_set_gro_flush_timeout(dev, 20000); + netdev_set_defer_hard_irqs(dev, 1); + } +} +EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); + +/** + * alloc_netdev_mqs - allocate network device + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @name_assign_type: origin of device name + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate + * + * Allocates a struct net_device with private data area for driver use + * and performs basic initialization. Also allocates subqueue structs + * for each queue on the device. */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { struct net_device *dev; - size_t alloc_size; - struct net_device *p; + size_t napi_config_sz; + unsigned int maxqs; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -5783,32 +11987,29 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } -#ifdef CONFIG_RPS if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } -#endif - alloc_size = sizeof(struct net_device); - if (sizeof_priv) { - /* ensure 32-byte alignment of private area */ - alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); - alloc_size += sizeof_priv; - } - /* ensure 32-byte alignment of whole construct */ - alloc_size += NETDEV_ALIGN - 1; + maxqs = max(txqs, rxqs); - p = kzalloc(alloc_size, GFP_KERNEL); - if (!p) + dev = kvzalloc(struct_size(dev, priv, sizeof_priv), + GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); + if (!dev) return NULL; - dev = PTR_ALIGN(p, NETDEV_ALIGN); - dev->padded = (char *)dev - (char *)p; + dev->priv_len = sizeof_priv; + ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev"); +#ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) - goto free_p; + goto free_dev; + __dev_hold(dev); +#else + refcount_set(&dev->dev_refcnt, 1); +#endif if (dev_addr_init(dev)) goto free_pcpu; @@ -5818,32 +12019,76 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_size = GSO_LEGACY_MAX_SIZE; + dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; + dev->gro_max_size = GRO_LEGACY_MAX_SIZE; + dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; + dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE; + dev->tso_max_size = TSO_LEGACY_MAX_SIZE; + dev->tso_max_segs = TSO_MAX_SEGS; + dev->upper_level = 1; + dev->lower_level = 1; +#ifdef CONFIG_LOCKDEP + dev->nested_level = 0; + INIT_LIST_HEAD(&dev->unlink_list); +#endif INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); - INIT_LIST_HEAD(&dev->upper_dev_list); - dev->priv_flags = IFF_XMIT_DST_RELEASE; + INIT_LIST_HEAD(&dev->adj_list.upper); + INIT_LIST_HEAD(&dev->adj_list.lower); + INIT_LIST_HEAD(&dev->ptype_all); + INIT_LIST_HEAD(&dev->ptype_specific); + INIT_LIST_HEAD(&dev->net_notifier_list); +#ifdef CONFIG_NET_SCHED + hash_init(dev->qdisc_hash); +#endif + + mutex_init(&dev->lock); + + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); + if (!dev->tx_queue_len) { + dev->priv_flags |= IFF_NO_QUEUE; + dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; + } + dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) goto free_all; -#ifdef CONFIG_RPS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; -#endif + dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT); + if (!dev->ethtool) + goto free_all; + + dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); + if (!dev->cfg) + goto free_all; + dev->cfg_pending = dev->cfg; - strcpy(dev->name, name); + dev->num_napi_configs = maxqs; + napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); + dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); + if (!dev->napi_config) + goto free_all; + + strscpy(dev->name, name); + dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; + + nf_hook_netdev_init(dev); + return dev; free_all: @@ -5851,56 +12096,92 @@ free_all: return NULL; free_pcpu: +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); - netif_free_tx_queues(dev); -#ifdef CONFIG_RPS - kfree(dev->_rx); +free_dev: #endif - -free_p: - kfree(p); + kvfree(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); +static void netdev_napi_exit(struct net_device *dev) +{ + if (!list_empty(&dev->napi_list)) { + struct napi_struct *p, *n; + + netdev_lock(dev); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) + __netif_napi_del_locked(p); + netdev_unlock(dev); + + synchronize_net(); + } + + kvfree(dev->napi_config); +} + /** - * free_netdev - free network device - * @dev: device + * free_netdev - free network device + * @dev: device * - * This function does the last stage of destroying an allocated device - * interface. The reference to the device object is released. - * If this is the last reference then it will be freed. + * This function does the last stage of destroying an allocated device + * interface. The reference to the device object is released. If this + * is the last reference then it will be freed.Must be called in process + * context. */ void free_netdev(struct net_device *dev) { - struct napi_struct *p, *n; + might_sleep(); - release_net(dev_net(dev)); + /* When called immediately after register_netdevice() failed the unwind + * handling may still be dismantling the device. Handle that case by + * deferring the free. + */ + if (dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + dev->needs_free_netdev = true; + return; + } + WARN_ON(dev->cfg != dev->cfg_pending); + kfree(dev->cfg); + kfree(dev->ethtool); netif_free_tx_queues(dev); -#ifdef CONFIG_RPS - kfree(dev->_rx); -#endif + netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); /* Flush device addresses */ dev_addr_flush(dev); - list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) - netif_napi_del(p); + netdev_napi_exit(dev); + netif_del_cpu_rmap(dev); + + ref_tracker_dir_exit(&dev->refcnt_tracker); +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; +#endif + free_percpu(dev->core_stats); + dev->core_stats = NULL; + free_percpu(dev->xdp_bulkq); + dev->xdp_bulkq = NULL; + + netdev_free_phy_link_topology(dev); + + mutex_destroy(&dev->lock); /* Compatibility with error handling in drivers */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - kfree((char *)dev - dev->padded); + if (dev->reg_state == NETREG_UNINITIALIZED || + dev->reg_state == NETREG_DUMMY) { + kvfree(dev); return; } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); - dev->reg_state = NETREG_RELEASED; + WRITE_ONCE(dev->reg_state, NETREG_RELEASED); /* will free via device release */ put_device(&dev->dev); @@ -5908,6 +12189,19 @@ void free_netdev(struct net_device *dev) EXPORT_SYMBOL(free_netdev); /** + * alloc_netdev_dummy - Allocate and initialize a dummy net device. + * @sizeof_priv: size of private data to allocate space for + * + * Return: the allocated net_device on success, NULL otherwise + */ +struct net_device *alloc_netdev_dummy(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, + init_dummy_netdev); +} +EXPORT_SYMBOL_GPL(alloc_netdev_dummy); + +/** * synchronize_net - Synchronize with packet receive processing * * Wait for packets currently being received to be done. @@ -5916,13 +12210,28 @@ EXPORT_SYMBOL(free_netdev); void synchronize_net(void) { might_sleep(); - if (rtnl_is_locked()) + if (from_cleanup_net() || rtnl_is_locked()) synchronize_rcu_expedited(); else synchronize_rcu(); } EXPORT_SYMBOL(synchronize_net); +static void netdev_rss_contexts_free(struct net_device *dev) +{ + struct ethtool_rxfh_context *ctx; + unsigned long context; + + mutex_lock(&dev->ethtool->rss_lock); + xa_for_each(&dev->ethtool->rss_ctx, context, ctx) { + xa_erase(&dev->ethtool->rss_ctx, context); + dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL); + kfree(ctx); + } + xa_destroy(&dev->ethtool->rss_ctx); + mutex_unlock(&dev->ethtool->rss_lock); +} + /** * unregister_netdevice_queue - remove device from the kernel * @dev: device @@ -5943,26 +12252,196 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) if (head) { list_move_tail(&dev->unreg_list, head); } else { - rollback_registered(dev); - /* Finish processing unregister after unlock */ - net_set_todo(dev); + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + unregister_netdevice_many(&single); } } EXPORT_SYMBOL(unregister_netdevice_queue); +static void dev_memory_provider_uninstall(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct netdev_rx_queue *rxq = &dev->_rx[i]; + struct pp_memory_provider_params *p = &rxq->mp_params; + + if (p->mp_ops && p->mp_ops->uninstall) + p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); + } +} + +/* devices must be UP and netdev_lock()'d */ +static void netif_close_many_and_unlock(struct list_head *close_head) +{ + struct net_device *dev, *tmp; + + netif_close_many(close_head, false); + + /* ... now unlock them */ + list_for_each_entry_safe(dev, tmp, close_head, close_list) { + netdev_unlock(dev); + list_del_init(&dev->close_list); + } +} + +static void netif_close_many_and_unlock_cond(struct list_head *close_head) +{ +#ifdef CONFIG_LOCKDEP + /* We can only track up to MAX_LOCK_DEPTH locks per task. + * + * Reserve half the available slots for additional locks possibly + * taken by notifiers and (soft)irqs. + */ + unsigned int limit = MAX_LOCK_DEPTH / 2; + + if (lockdep_depth(current) > limit) + netif_close_many_and_unlock(close_head); +#endif +} + +void unregister_netdevice_many_notify(struct list_head *head, + u32 portid, const struct nlmsghdr *nlh) +{ + struct net_device *dev, *tmp; + LIST_HEAD(close_head); + int cnt = 0; + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + if (list_empty(head)) + return; + + list_for_each_entry_safe(dev, tmp, head, unreg_list) { + /* Some devices call without registering + * for initialization unwind. Remove those + * devices and proceed with the remaining. + */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never was registered\n", + dev->name, dev); + + WARN_ON(1); + list_del(&dev->unreg_list); + continue; + } + dev->dismantle = true; + BUG_ON(dev->reg_state != NETREG_REGISTERED); + } + + /* If device is running, close it first. Start with ops locked... */ + list_for_each_entry(dev, head, unreg_list) { + if (!(dev->flags & IFF_UP)) + continue; + if (netdev_need_ops_lock(dev)) { + list_add_tail(&dev->close_list, &close_head); + netdev_lock(dev); + } + netif_close_many_and_unlock_cond(&close_head); + } + netif_close_many_and_unlock(&close_head); + /* ... now go over the rest. */ + list_for_each_entry(dev, head, unreg_list) { + if (!netdev_need_ops_lock(dev)) + list_add_tail(&dev->close_list, &close_head); + } + netif_close_many(&close_head, true); + + list_for_each_entry(dev, head, unreg_list) { + /* And unlink it from device chain. */ + unlist_netdevice(dev); + netdev_lock(dev); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); + netdev_unlock(dev); + } + flush_all_backlogs(); + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + + /* Shutdown queueing discipline. */ + netdev_lock_ops(dev); + dev_shutdown(dev); + dev_tcx_uninstall(dev); + dev_xdp_uninstall(dev); + dev_memory_provider_uninstall(dev); + netdev_unlock_ops(dev); + bpf_dev_bound_netdev_unregister(dev); + + netdev_offload_xstats_disable_all(dev); + + /* Notify protocols, that we are about to destroy + * this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, + GFP_KERNEL, NULL, 0, + portid, nlh); + + /* + * Flush the unicast and multicast chains + */ + dev_uc_flush(dev); + dev_mc_flush(dev); + + netdev_name_node_alt_flush(dev); + netdev_name_node_free(dev->name_node); + + netdev_rss_contexts_free(dev); + + call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); + + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + + mutex_destroy(&dev->ethtool->rss_lock); + + net_shaper_flush_netdev(dev); + + if (skb) + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); + + /* Notifier chain MUST detach us all upper devices. */ + WARN_ON(netdev_has_any_upper_dev(dev)); + WARN_ON(netdev_has_any_lower_dev(dev)); + + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); +#ifdef CONFIG_XPS + /* Remove XPS queueing entries */ + netif_reset_xps_queues_gt(dev, 0); +#endif + } + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + netdev_put(dev, &dev->dev_registered_tracker); + net_set_todo(dev); + cnt++; + } + atomic_add(cnt, &dev_unreg_count); + + list_del(head); +} + /** * unregister_netdevice_many - unregister many devices * @head: list of devices + * + * Note: As most callers use a stack allocated list_head, + * we force a list_del() to make sure stack won't be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { - struct net_device *dev; - - if (!list_empty(head)) { - rollback_registered_many(head); - list_for_each_entry(dev, head, unreg_list) - net_set_todo(dev); - } + unregister_netdevice_many_notify(head, 0, NULL); } EXPORT_SYMBOL(unregister_netdevice_many); @@ -5979,85 +12458,129 @@ EXPORT_SYMBOL(unregister_netdevice_many); */ void unregister_netdev(struct net_device *dev) { - rtnl_lock(); + rtnl_net_dev_lock(dev); unregister_netdevice(dev); - rtnl_unlock(); + rtnl_net_dev_unlock(dev); } EXPORT_SYMBOL(unregister_netdev); -/** - * dev_change_net_namespace - move device to different nethost namespace - * @dev: device - * @net: network namespace - * @pat: If not NULL name pattern to try if the current device name - * is already taken in the destination network namespace. - * - * This function shuts down a device interface and moves it - * to a new network namespace. On success 0 is returned, on - * a failure a netagive errno code is returned. - * - * Callers must hold the rtnl semaphore. - */ - -int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +int __dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex, + struct netlink_ext_ack *extack) { - int err; + struct netdev_name_node *name_node; + struct net *net_old = dev_net(dev); + char new_name[IFNAMSIZ] = {}; + int err, new_nsid; ASSERT_RTNL(); /* Don't allow namespace local devices to be moved. */ err = -EINVAL; - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_immutable) { + NL_SET_ERR_MSG(extack, "The interface netns is immutable"); goto out; + } - /* Ensure the device has been registrered */ - if (dev->reg_state != NETREG_REGISTERED) + /* Ensure the device has been registered */ + if (dev->reg_state != NETREG_REGISTERED) { + NL_SET_ERR_MSG(extack, "The interface isn't registered"); goto out; + } /* Get out if there is nothing todo */ err = 0; - if (net_eq(dev_net(dev), net)) + if (net_eq(net_old, net)) goto out; /* Pick the destination device name, and ensure * we can use it in the destination network namespace. */ err = -EEXIST; - if (__dev_get_by_name(net, dev->name)) { + if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ - if (!pat) + if (!pat) { + NL_SET_ERR_MSG(extack, + "An interface with the same name exists in the target netns"); goto out; - if (dev_get_valid_name(net, dev, pat) < 0) + } + err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "Unable to use '%s' for the new interface name in the target netns", + pat); goto out; + } + } + /* Check that none of the altnames conflicts. */ + err = -EEXIST; + netdev_for_each_altname(dev, name_node) { + if (netdev_name_in_use(net, name_node->name)) { + NL_SET_ERR_MSG_FMT(extack, + "An interface with the altname %s exists in the target netns", + name_node->name); + goto out; + } + } + + /* Check that new_ifindex isn't used yet. */ + if (new_ifindex) { + err = dev_index_reserve(net, new_ifindex); + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "The ifindex %d is not available in the target netns", + new_ifindex); + goto out; + } + } else { + /* If there is an ifindex conflict assign a new one */ + err = dev_index_reserve(net, dev->ifindex); + if (err == -EBUSY) + err = dev_index_reserve(net, 0); + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Unable to allocate a new ifindex in the target netns"); + goto out; + } + new_ifindex = err; } /* * And now a mini version of register_netdevice unregister_netdevice. */ + netdev_lock_ops(dev); /* If device is running close it first. */ - dev_close(dev); - + netif_close(dev); /* And unlink it from device chain */ - err = -ENODEV; unlist_netdevice(dev); + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->moving_ns = true; + netdev_unlock(dev); + synchronize_net(); /* Shutdown queueing discipline. */ + netdev_lock_ops(dev); dev_shutdown(dev); + netdev_unlock_ops(dev); /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - - Note that dev->reg_state stays at NETREG_REGISTERED. - This is wanted because this way 8021q and macvlan know - the device is just moving and can keep their slaves up. - */ + * this device. They should clean all the things. + * + * Note that dev->reg_state stays at NETREG_REGISTERED. + * This is wanted because this way 8021q and macvlan know + * the device is just moving and can keep their slaves up. + */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + + new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); + + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, + new_ifindex); /* * Flush the unicast and multicast chains @@ -6067,55 +12590,69 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Send a netdev-removed uevent to the old namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); + netdev_adjacent_del_links(dev); + + /* Move per-net netdevice notifiers that are following the netdevice */ + move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ + netdev_lock(dev); dev_net_set(dev, net); + netdev_unlock(dev); + dev->ifindex = new_ifindex; - /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) { - int iflink = (dev->iflink == dev->ifindex); - dev->ifindex = dev_new_index(net); - if (iflink) - dev->iflink = dev->ifindex; + if (new_name[0]) { + /* Rename the netdev to prepared name */ + write_seqlock_bh(&netdev_rename_lock); + strscpy(dev->name, new_name, IFNAMSIZ); + write_sequnlock_bh(&netdev_rename_lock); } + /* Fixup kobjects */ + dev_set_uevent_suppress(&dev->dev, 1); + err = device_rename(&dev->dev, dev->name); + dev_set_uevent_suppress(&dev->dev, 0); + WARN_ON(err); + /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + netdev_adjacent_add_links(dev); - /* Fixup kobjects */ - err = device_rename(&dev->dev, dev->name); + /* Adapt owner in case owning user namespace of target network + * namespace is different from the original one. + */ + err = netdev_change_owner(dev, net_old, net); WARN_ON(err); + netdev_lock(dev); + dev->moving_ns = false; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); + /* Add the device back in the hashes */ list_netdevice(dev); - /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); + netdev_unlock_ops(dev); /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); synchronize_net(); err = 0; out: return err; } -EXPORT_SYMBOL_GPL(dev_change_net_namespace); -static int dev_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *ocpu) +static int dev_cpu_dead(unsigned int oldcpu) { struct sk_buff **list_skb; struct sk_buff *skb; - unsigned int cpu, oldcpu = (unsigned long)ocpu; - struct softnet_data *sd, *oldsd; - - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return NOTIFY_OK; + unsigned int cpu; + struct softnet_data *sd, *oldsd, *remsd = NULL; local_irq_disable(); cpu = smp_processor_id(); @@ -6137,29 +12674,47 @@ static int dev_cpu_callback(struct notifier_block *nfb, oldsd->output_queue = NULL; oldsd->output_queue_tailp = &oldsd->output_queue; } - /* Append NAPI poll list from offline CPU. */ - if (!list_empty(&oldsd->poll_list)) { - list_splice_init(&oldsd->poll_list, &sd->poll_list); - raise_softirq_irqoff(NET_RX_SOFTIRQ); + /* Append NAPI poll list from offline CPU, with one exception : + * process_backlog() must be called by cpu owning percpu backlog. + * We properly handle process_queue & input_pkt_queue later. + */ + while (!list_empty(&oldsd->poll_list)) { + struct napi_struct *napi = list_first_entry(&oldsd->poll_list, + struct napi_struct, + poll_list); + + list_del_init(&napi->poll_list); + if (napi->poll == process_backlog) + napi->state &= NAPIF_STATE_THREADED; + else + ____napi_schedule(sd, napi); } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + if (!use_backlog_threads()) { +#ifdef CONFIG_RPS + remsd = oldsd->rps_ipi_list; + oldsd->rps_ipi_list = NULL; +#endif + /* send out pending IPI's on offline CPU */ + net_rps_send_ipi(remsd); + } + /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } - while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { + while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } - return NOTIFY_OK; + return 0; } - /** * netdev_increment_features - increment feature set by one * @all: current feature set @@ -6173,27 +12728,115 @@ static int dev_cpu_callback(struct notifier_block *nfb, netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { - if (mask & NETIF_F_GEN_CSUM) - mask |= NETIF_F_ALL_CSUM; + if (mask & NETIF_F_HW_CSUM) + mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; - all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; + all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ - if (all & NETIF_F_GEN_CSUM) - all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + if (all & NETIF_F_HW_CSUM) + all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); return all; } EXPORT_SYMBOL(netdev_increment_features); +/** + * netdev_compute_master_upper_features - compute feature from lowers + * @dev: the upper device + * @update_header: whether to update upper device's header_len/headroom/tailroom + * + * Recompute the upper device's feature based on all lower devices. + */ +void netdev_compute_master_upper_features(struct net_device *dev, bool update_header) +{ + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; + netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES; + netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES; + netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES; + netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES; + netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES; + unsigned short max_header_len = ETH_HLEN; + unsigned int tso_max_size = TSO_MAX_SIZE; + unsigned short max_headroom = 0; + unsigned short max_tailroom = 0; + u16 tso_max_segs = TSO_MAX_SEGS; + struct net_device *lower_dev; + struct list_head *iter; + + mpls_features = netdev_base_features(mpls_features); + vlan_features = netdev_base_features(vlan_features); + enc_features = netdev_base_features(enc_features); + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + gso_partial_features = netdev_increment_features(gso_partial_features, + lower_dev->gso_partial_features, + MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES); + + vlan_features = netdev_increment_features(vlan_features, + lower_dev->vlan_features, + MASTER_UPPER_DEV_VLAN_FEATURES); + + enc_features = netdev_increment_features(enc_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_ENC_FEATURES); + + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + xfrm_features = netdev_increment_features(xfrm_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_XFRM_FEATURES); + + mpls_features = netdev_increment_features(mpls_features, + lower_dev->mpls_features, + MASTER_UPPER_DEV_MPLS_FEATURES); + + dst_release_flag &= lower_dev->priv_flags; + + if (update_header) { + max_header_len = max(max_header_len, lower_dev->hard_header_len); + max_headroom = max(max_headroom, lower_dev->needed_headroom); + max_tailroom = max(max_tailroom, lower_dev->needed_tailroom); + } + + tso_max_size = min(tso_max_size, lower_dev->tso_max_size); + tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs); + } + + dev->gso_partial_features = gso_partial_features; + dev->vlan_features = vlan_features; + dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + dev->hw_enc_features |= xfrm_features; + dev->mpls_features = mpls_features; + + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && + dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + dev->priv_flags |= IFF_XMIT_DST_RELEASE; + + if (update_header) { + dev->hard_header_len = max_header_len; + dev->needed_headroom = max_headroom; + dev->needed_tailroom = max_tailroom; + } + + netif_set_tso_max_segs(dev, tso_max_segs); + netif_set_tso_max_size(dev, tso_max_size); + + netdev_change_features(dev); +} +EXPORT_SYMBOL(netdev_compute_master_upper_features); + static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; - hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); + hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); @@ -6204,8 +12847,10 @@ static struct hlist_head * __net_init netdev_create_hash(void) /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { - if (net != &init_net) - INIT_LIST_HEAD(&net->dev_base_head); + BUILD_BUG_ON(GRO_HASH_BUCKETS > + BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask)); + + INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) @@ -6215,6 +12860,10 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); + + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); + return 0; err_idx: @@ -6245,51 +12894,45 @@ const char *netdev_drivername(const struct net_device *dev) return empty; } -static int __netdev_printk(const char *level, const struct net_device *dev, - struct va_format *vaf) +static void __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) { - int r; - if (dev && dev->dev.parent) { - r = dev_printk_emit(level[1] - '0', - dev->dev.parent, - "%s %s %s: %pV", - dev_driver_string(dev->dev.parent), - dev_name(dev->dev.parent), - netdev_name(dev), vaf); + dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s%s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s: %pV", level, netdev_name(dev), vaf); + printk("%s%s%s: %pV", + level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { - r = printk("%s(NULL net_device): %pV", level, vaf); + printk("%s(NULL net_device): %pV", level, vaf); } - - return r; } -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...) +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = __netdev_printk(level, dev, &vaf); + __netdev_printk(level, dev, &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ -int func(const struct net_device *dev, const char *fmt, ...) \ +void func(const struct net_device *dev, const char *fmt, ...) \ { \ - int r; \ struct va_format vaf; \ va_list args; \ \ @@ -6298,11 +12941,9 @@ int func(const struct net_device *dev, const char *fmt, ...) \ vaf.fmt = fmt; \ vaf.va = &args; \ \ - r = __netdev_printk(level, dev, &vaf); \ + __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ - \ - return r; \ } \ EXPORT_SYMBOL(func); @@ -6318,6 +12959,9 @@ static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); + xa_destroy(&net->dev_by_index); + if (net != &init_net) + WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } static struct pernet_operations __net_initdata netdev_net_ops = { @@ -6325,28 +12969,36 @@ static struct pernet_operations __net_initdata netdev_net_ops = { .exit = netdev_exit, }; -static void __net_exit default_device_exit(struct net *net) +static void __net_exit default_device_exit_net(struct net *net) { + struct netdev_name_node *name_node, *tmp; struct net_device *dev, *aux; /* * Push all migratable network devices back to the * initial network namespace */ - rtnl_lock(); + ASSERT_RTNL(); for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_immutable) continue; /* Leave virtual devices for the generic cleanup */ - if (dev->rtnl_link_ops) + if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) continue; /* Push remaining network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); + if (netdev_name_in_use(&init_net, fb_name)) + snprintf(fb_name, IFNAMSIZ, "dev%%d"); + + netdev_for_each_altname_safe(dev, name_node, tmp) + if (netdev_name_in_use(&init_net, name_node->name)) + __netdev_name_node_alt_destroy(name_node); + err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", @@ -6354,7 +13006,6 @@ static void __net_exit default_device_exit(struct net *net) BUG(); } } - rtnl_unlock(); } static void __net_exit default_device_exit_batch(struct list_head *net_list) @@ -6370,23 +13021,81 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { + default_device_exit_net(net); + cond_resched(); + } + + list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { - if (dev->rtnl_link_ops) + if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); } } unregister_netdevice_many(&dev_kill_list); - list_del(&dev_kill_list); rtnl_unlock(); } static struct pernet_operations __net_initdata default_device_ops = { - .exit = default_device_exit, .exit_batch = default_device_exit_batch, }; +static void __init net_dev_struct_check(void) +{ + /* TX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq); +#ifdef CONFIG_XPS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps); +#endif +#ifdef CONFIG_NETFILTER_EGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress); +#endif +#ifdef CONFIG_NET_XGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress); +#endif + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160); + + /* TXRX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46); + + /* RX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net); +#ifdef CONFIG_NETPOLL + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo); +#endif +#ifdef CONFIG_NET_XGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); +#endif + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92); +} + /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not @@ -6394,6 +13103,67 @@ static struct pernet_operations __net_initdata default_device_ops = { * */ +/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */ +#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE) + +static int net_page_pool_create(int cpuid) +{ +#if IS_ENABLED(CONFIG_PAGE_POOL) + struct page_pool_params page_pool_params = { + .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, + .flags = PP_FLAG_SYSTEM_POOL, + .nid = cpu_to_mem(cpuid), + }; + struct page_pool *pp_ptr; + int err; + + pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); + if (IS_ERR(pp_ptr)) + return -ENOMEM; + + err = xdp_reg_page_pool(pp_ptr); + if (err) { + page_pool_destroy(pp_ptr); + return err; + } + + per_cpu(system_page_pool.pool, cpuid) = pp_ptr; +#endif + return 0; +} + +static int backlog_napi_should_run(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); +} + +static void run_backlog_napi(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + + napi_threaded_poll_loop(&sd->backlog, false); +} + +static void backlog_napi_setup(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + napi->thread = this_cpu_read(backlog_napi); + set_bit(NAPI_STATE_THREADED, &napi->state); +} + +static struct smp_hotplug_thread backlog_threads = { + .store = &backlog_napi, + .thread_should_run = backlog_napi_should_run, + .thread_fn = run_backlog_napi, + .thread_comm = "backlog_napi/%u", + .setup = backlog_napi_setup, +}; + /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. @@ -6404,18 +13174,17 @@ static int __init net_dev_init(void) BUG_ON(!dev_boot_phase); + net_dev_struct_check(); + if (dev_proc_init()) goto out; if (netdev_kobject_init()) goto out; - INIT_LIST_HEAD(&ptype_all); for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); - INIT_LIST_HEAD(&offload_base); - if (register_pernet_subsys(&netdev_net_ops)) goto out; @@ -6423,32 +13192,41 @@ static int __init net_dev_init(void) * Initialise the packet receive queues. */ + flush_backlogs_fallback = flush_backlogs_alloc(); + if (!flush_backlogs_fallback) + goto out; + for_each_possible_cpu(i) { struct softnet_data *sd = &per_cpu(softnet_data, i); - memset(sd, 0, sizeof(*sd)); skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); - sd->completion_queue = NULL; +#ifdef CONFIG_XFRM_OFFLOAD + skb_queue_head_init(&sd->xfrm_backlog); +#endif INIT_LIST_HEAD(&sd->poll_list); - sd->output_queue = NULL; sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS - sd->csd.func = rps_trigger_softirq; - sd->csd.info = sd; - sd->csd.flags = 0; + INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif + INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); + gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; - sd->backlog.gro_list = NULL; - sd->backlog.gro_count = 0; + INIT_LIST_HEAD(&sd->backlog.poll_list); -#ifdef CONFIG_NET_FLOW_LIMIT - sd->flow_limit = NULL; -#endif + if (net_page_pool_create(i)) + goto out; } + net_hotdata.skb_defer_nodes = + __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids, + __alignof__(struct skb_defer_node)); + if (!net_hotdata.skb_defer_nodes) + goto out; + if (use_backlog_threads()) + smpboot_register_percpu_thread(&backlog_threads); dev_boot_phase = 0; @@ -6470,10 +13248,29 @@ static int __init net_dev_init(void) open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); - hotcpu_notifier(dev_cpu_callback, 0); - dst_init(); + rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", + NULL, dev_cpu_dead); + WARN_ON(rc < 0); rc = 0; + + /* avoid static key IPIs to isolated CPUs */ + if (housekeeping_enabled(HK_TYPE_MISC)) + net_enable_timestamp(); out: + if (rc < 0) { + for_each_possible_cpu(i) { + struct page_pool *pp_ptr; + + pp_ptr = per_cpu(system_page_pool.pool, i); + if (!pp_ptr) + continue; + + xdp_unreg_page_pool(pp_ptr); + page_pool_destroy(pp_ptr); + per_cpu(system_page_pool.pool, i) = NULL; + } + } + return rc; } diff --git a/net/core/dev.h b/net/core/dev.h new file mode 100644 index 000000000000..da18536cbd35 --- /dev/null +++ b/net/core/dev.h @@ -0,0 +1,406 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_CORE_DEV_H +#define _NET_CORE_DEV_H + +#include <linux/cleanup.h> +#include <linux/types.h> +#include <linux/rwsem.h> +#include <linux/netdevice.h> +#include <net/netdev_lock.h> + +struct net; +struct netlink_ext_ack; +struct cpumask; + +/* Random bits of netdevice that don't need to be exposed */ +#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ +struct sd_flow_limit { + struct rcu_head rcu; + unsigned int count; + u8 log_buckets; + unsigned int history_head; + u16 history[FLOW_LIMIT_HISTORY]; + u8 buckets[]; +}; + +extern int netdev_flow_limit_table_len; + +struct napi_struct * +netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); +struct net_device *dev_get_by_napi_id(unsigned int napi_id); + +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); +struct net_device * +netdev_xa_find_lock(struct net *net, struct net_device *dev, + unsigned long *index); + +DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T)); + +#define for_each_netdev_lock_scoped(net, var_name, ifindex) \ + for (struct net_device *var_name __free(netdev_unlock) = NULL; \ + (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \ + ifindex++) + +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex); +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index); + +DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *, + if (_T) netdev_unlock_ops_compat(_T)); + +#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \ + for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \ + (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \ + &ifindex)); \ + ifindex++) + +#ifdef CONFIG_PROC_FS +int __init dev_proc_init(void); +#else +#define dev_proc_init() 0 +#endif + +void linkwatch_init_dev(struct net_device *dev); +void linkwatch_run_queue(void); + +void dev_addr_flush(struct net_device *dev); +int dev_addr_init(struct net_device *dev); +void dev_addr_check(struct net_device *dev); + +#if IS_ENABLED(CONFIG_NET_SHAPER) +void net_shaper_flush_netdev(struct net_device *dev); +void net_shaper_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq); +#else +static inline void net_shaper_flush_netdev(struct net_device *dev) {} +static inline void net_shaper_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq) {} +#endif + +/* sysctls not referred to from outside net/core/ */ +extern int netdev_unregister_timeout_secs; +extern int weight_p; +extern int dev_weight_rx_bias; +extern int dev_weight_tx_bias; + +extern struct rw_semaphore dev_addr_sem; + +/* rtnl helpers */ +extern struct list_head net_todo_list; +void netdev_run_todo(void); + +/* netdev management, shared between various uAPI entry points */ +struct netdev_name_node { + struct hlist_node hlist; + struct list_head list; + struct net_device *dev; + const char *name; + struct rcu_head rcu; +}; + +int netdev_get_name(struct net *net, char *name, int ifindex); +int netif_change_name(struct net_device *dev, const char *newname); +int dev_change_name(struct net_device *dev, const char *newname); + +#define netdev_for_each_altname(dev, namenode) \ + list_for_each_entry((namenode), &(dev)->name_node->list, list) +#define netdev_for_each_altname_safe(dev, namenode, next) \ + list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \ + list) + +int netdev_name_node_alt_create(struct net_device *dev, const char *name); +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); + +int dev_validate_mtu(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); +int netif_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack); + +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid); +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len); + +int netif_change_proto_down(struct net_device *dev, bool proto_down); +int dev_change_proto_down(struct net_device *dev, bool proto_down); +void netdev_change_proto_down_reason_locked(struct net_device *dev, + unsigned long mask, u32 value); + +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, int expected_fd, u32 flags); + +int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len); +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len); +void netif_set_group(struct net_device *dev, int new_group); +void dev_set_group(struct net_device *dev, int new_group); +int netif_change_carrier(struct net_device *dev, bool new_carrier); +int dev_change_carrier(struct net_device *dev, bool new_carrier); + +void __dev_set_rx_mode(struct net_device *dev); + +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges, u32 portid, + const struct nlmsghdr *nlh); + +void unregister_netdevice_many_notify(struct list_head *head, + u32 portid, const struct nlmsghdr *nlh); + +static inline void netif_set_up(struct net_device *dev, bool value) +{ + if (value) + dev->flags |= IFF_UP; + else + dev->flags &= ~IFF_UP; + + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->up = value; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); +} + +static inline void netif_set_gso_max_size(struct net_device *dev, + unsigned int size) +{ + /* dev->gso_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_size, size); + if (size <= GSO_LEGACY_MAX_SIZE) + WRITE_ONCE(dev->gso_ipv4_max_size, size); +} + +static inline void netif_set_gso_max_segs(struct net_device *dev, + unsigned int segs) +{ + /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_segs, segs); +} + +static inline void netif_set_gro_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_max_size, size); + if (size <= GRO_LEGACY_MAX_SIZE) + WRITE_ONCE(dev->gro_ipv4_max_size, size); +} + +static inline void netif_set_gso_ipv4_max_size(struct net_device *dev, + unsigned int size) +{ + /* dev->gso_ipv4_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_ipv4_max_size, size); +} + +static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_ipv4_max_size, size); +} + +/** + * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs + * @n: napi struct to get the defer_hard_irqs field from + * + * Return: the per-NAPI value of the defar_hard_irqs field. + */ +static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n) +{ + return READ_ONCE(n->defer_hard_irqs); +} + +/** + * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi + * @n: napi_struct to set the defer_hard_irqs field + * @defer: the value the field should be set to + */ +static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer) +{ + WRITE_ONCE(n->defer_hard_irqs, defer); +} + +/** + * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev + * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set + * @defer: the defer_hard_irqs value to set + */ +static inline void netdev_set_defer_hard_irqs(struct net_device *netdev, + u32 defer) +{ + unsigned int count = max(netdev->num_rx_queues, + netdev->num_tx_queues); + struct napi_struct *napi; + int i; + + WRITE_ONCE(netdev->napi_defer_hard_irqs, defer); + list_for_each_entry(napi, &netdev->napi_list, dev_list) + napi_set_defer_hard_irqs(napi, defer); + + for (i = 0; i < count; i++) + netdev->napi_config[i].defer_hard_irqs = defer; +} + +/** + * napi_get_gro_flush_timeout - get the gro_flush_timeout + * @n: napi struct to get the gro_flush_timeout from + * + * Return: the per-NAPI value of the gro_flush_timeout field. + */ +static inline unsigned long +napi_get_gro_flush_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->gro_flush_timeout); +} + +/** + * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi + * @n: napi struct to set the gro_flush_timeout + * @timeout: timeout value to set + * + * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout + */ +static inline void napi_set_gro_flush_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->gro_flush_timeout, timeout); +} + +/** + * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs + * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set + * @timeout: the timeout value to set + */ +static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, + unsigned long timeout) +{ + unsigned int count = max(netdev->num_rx_queues, + netdev->num_tx_queues); + struct napi_struct *napi; + int i; + + WRITE_ONCE(netdev->gro_flush_timeout, timeout); + list_for_each_entry(napi, &netdev->napi_list, dev_list) + napi_set_gro_flush_timeout(napi, timeout); + + for (i = 0; i < count; i++) + netdev->napi_config[i].gro_flush_timeout = timeout; +} + +/** + * napi_get_irq_suspend_timeout - get the irq_suspend_timeout + * @n: napi struct to get the irq_suspend_timeout from + * + * Return: the per-NAPI value of the irq_suspend_timeout field. + */ +static inline unsigned long +napi_get_irq_suspend_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->irq_suspend_timeout); +} + +/** + * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi + * @n: napi struct to set the irq_suspend_timeout + * @timeout: timeout value to set + * + * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout + */ +static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->irq_suspend_timeout, timeout); +} + +static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) +{ + if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state)) + return NETDEV_NAPI_THREADED_BUSY_POLL; + + if (test_bit(NAPI_STATE_THREADED, &n->state)) + return NETDEV_NAPI_THREADED_ENABLED; + + return NETDEV_NAPI_THREADED_DISABLED; +} + +static inline enum netdev_napi_threaded +napi_get_threaded_config(struct net_device *dev, struct napi_struct *n) +{ + if (n->config) + return n->config->threaded; + return dev->threaded; +} + +int napi_set_threaded(struct napi_struct *n, + enum netdev_napi_threaded threaded); + +int netif_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded); + +int rps_cpumask_housekeeping(struct cpumask *mask); + +#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) +void xdp_do_check_flushed(struct napi_struct *napi); +#else +static inline void xdp_do_check_flushed(struct napi_struct *napi) { } +#endif + +/* Best effort check that NAPI is not idle (can't be scheduled to run) */ +static inline void napi_assert_will_not_race(const struct napi_struct *napi) +{ + /* uninitialized instance, can't race */ + if (!napi->poll_list.next) + return; + + /* SCHED bit is set on disabled instances */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + WARN_ON(READ_ONCE(napi->list_owner) != -1); +} + +void kick_defer_list_purge(unsigned int cpu); + +#define XMIT_RECURSION_LIMIT 8 + +#ifndef CONFIG_PREEMPT_RT +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} +#else +static inline bool dev_xmit_recursion(void) +{ + return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + current->net_xmit.recursion++; +} + +static inline void dev_xmit_recursion_dec(void) +{ + current->net_xmit.recursion--; +} +#endif + +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); +int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg); +int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg); + +#endif diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 6cda4e2c2132..76c91f224886 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/dev_addr_lists.c - Functions for handling net device lists * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com> * * This file contains functions for working with unicast, multicast and device * addresses lists. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. */ #include <linux/netdevice.h> @@ -16,14 +12,44 @@ #include <linux/export.h> #include <linux/list.h> +#include "dev.h" + /* * General list handling functions */ -static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, - const unsigned char *addr, int addr_len, - unsigned char addr_type, bool global, - bool sync) +static int __hw_addr_insert(struct netdev_hw_addr_list *list, + struct netdev_hw_addr *new, int addr_len) +{ + struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; + struct netdev_hw_addr *ha; + + while (*ins_point) { + int diff; + + ha = rb_entry(*ins_point, struct netdev_hw_addr, node); + diff = memcmp(new->addr, ha->addr, addr_len); + if (diff == 0) + diff = memcmp(&new->type, &ha->type, sizeof(new->type)); + + parent = *ins_point; + if (diff < 0) + ins_point = &parent->rb_left; + else if (diff > 0) + ins_point = &parent->rb_right; + else + return -EEXIST; + } + + rb_link_node_rcu(&new->node, parent, ins_point); + rb_insert_color(&new->node, &list->tree); + + return 0; +} + +static struct netdev_hw_addr* +__hw_addr_create(const unsigned char *addr, int addr_len, + unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha; int alloc_size; @@ -33,31 +59,44 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, alloc_size = L1_CACHE_BYTES; ha = kmalloc(alloc_size, GFP_ATOMIC); if (!ha) - return -ENOMEM; + return NULL; memcpy(ha->addr, addr, addr_len); ha->type = addr_type; ha->refcount = 1; ha->global_use = global; - ha->synced = sync; + ha->synced = sync ? 1 : 0; ha->sync_cnt = 0; - list_add_tail_rcu(&ha->list, &list->list); - list->count++; - return 0; + return ha; } static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, - unsigned char addr_type, bool global, bool sync) + unsigned char addr_type, bool global, bool sync, + int sync_count, bool exclusive) { + struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; struct netdev_hw_addr *ha; if (addr_len > MAX_ADDR_LEN) return -EINVAL; - list_for_each_entry(ha, &list->list, list) { - if (!memcmp(ha->addr, addr, addr_len) && - ha->type == addr_type) { + while (*ins_point) { + int diff; + + ha = rb_entry(*ins_point, struct netdev_hw_addr, node); + diff = memcmp(addr, ha->addr, addr_len); + if (diff == 0) + diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); + + parent = *ins_point; + if (diff < 0) { + ins_point = &parent->rb_left; + } else if (diff > 0) { + ins_point = &parent->rb_right; + } else { + if (exclusive) + return -EEXIST; if (global) { /* check if addr is already used as global */ if (ha->global_use) @@ -66,25 +105,35 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, ha->global_use = true; } if (sync) { - if (ha->synced) + if (ha->synced && sync_count) return -EEXIST; else - ha->synced = true; + ha->synced++; } ha->refcount++; return 0; } } - return __hw_addr_create_ex(list, addr, addr_len, addr_type, global, - sync); + ha = __hw_addr_create(addr, addr_len, addr_type, global, sync); + if (!ha) + return -ENOMEM; + + rb_link_node(&ha->node, parent, ins_point); + rb_insert_color(&ha->node, &list->tree); + + list_add_tail_rcu(&ha->list, &list->list); + list->count++; + + return 0; } static int __hw_addr_add(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { - return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false); + return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, + 0, false); } static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, @@ -101,28 +150,54 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, ha->global_use = false; if (sync) - ha->synced = false; + ha->synced--; if (--ha->refcount) return 0; + + rb_erase(&ha->node, &list->tree); + list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); list->count--; return 0; } +static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list, + const unsigned char *addr, int addr_len, + unsigned char addr_type) +{ + struct rb_node *node; + + node = list->tree.rb_node; + + while (node) { + struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node); + int diff = memcmp(addr, ha->addr, addr_len); + + if (diff == 0 && addr_type) + diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); + + if (diff < 0) + node = node->rb_left; + else if (diff > 0) + node = node->rb_right; + else + return ha; + } + + return NULL; +} + static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) { - struct netdev_hw_addr *ha; + struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type); - list_for_each_entry(ha, &list->list, list) { - if (!memcmp(ha->addr, addr, addr_len) && - (ha->type == addr_type || !addr_type)) - return __hw_addr_del_entry(list, ha, global, sync); - } - return -ENOENT; + if (!ha) + return -ENOENT; + return __hw_addr_del_entry(list, ha, global, sync); } static int __hw_addr_del(struct netdev_hw_addr_list *list, @@ -139,7 +214,7 @@ static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, int err; err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, - false, true); + false, true, ha->sync_cnt, false); if (err && err != -EEXIST) return err; @@ -167,9 +242,9 @@ static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, __hw_addr_del_entry(from_list, ha, false, false); } -static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len) +int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; @@ -185,50 +260,10 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, } return err; } - -int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type) -{ - int err; - struct netdev_hw_addr *ha, *ha2; - unsigned char type; - - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - err = __hw_addr_add(to_list, ha->addr, addr_len, type); - if (err) - goto unroll; - } - return 0; - -unroll: - list_for_each_entry(ha2, &from_list->list, list) { - if (ha2 == ha) - break; - type = addr_type ? addr_type : ha2->type; - __hw_addr_del(to_list, ha2->addr, addr_len, type); - } - return err; -} -EXPORT_SYMBOL(__hw_addr_add_multiple); - -void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type) -{ - struct netdev_hw_addr *ha; - unsigned char type; - - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - __hw_addr_del(to_list, ha->addr, addr_len, type); - } -} -EXPORT_SYMBOL(__hw_addr_del_multiple); +EXPORT_SYMBOL(__hw_addr_sync_multiple); /* This function only works where there is a strict 1-1 relationship - * between source and destionation of they synch. If you ever need to + * between source and destination of they synch. If you ever need to * sync addresses to more then 1 destination, you need to use * __hw_addr_sync_multiple(). */ @@ -264,22 +299,205 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, } EXPORT_SYMBOL(__hw_addr_unsync); -void __hw_addr_flush(struct netdev_hw_addr_list *list) +/** + * __hw_addr_sync_dev - Synchronize device's multicast list + * @list: address list to synchronize + * @dev: device to sync + * @sync: function to call if address should be added + * @unsync: function to call if address should be removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address add/remove + * notifications. The unsync function may be NULL in which case + * the addresses requiring removal will simply be removed without + * any notification to the device. + **/ +int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + int err; + + /* first go through and flush out any stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt || ha->refcount != 1) + continue; + + /* if unsync is defined and fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr)) + continue; + + ha->sync_cnt--; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (ha->sync_cnt) + continue; + + err = sync(dev, ha->addr); + if (err) + return err; + + ha->sync_cnt++; + ha->refcount++; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_sync_dev); + +/** + * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking + * into account references + * @list: address list to synchronize + * @dev: device to sync + * @sync: function to call if address or reference on it should be added + * @unsync: function to call if address or some reference on it should removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address or references on it + * add/remove notifications. The unsync function may be NULL in which case + * the addresses or references on it requiring removal will simply be + * removed without any notification to the device. That is responsibility of + * the driver to identify and distribute address or references on it between + * internal address tables. + **/ +int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, + const unsigned char *, int), + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + int err, ref_cnt; + + /* first go through and flush out any unsynced/stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address is not used */ + if ((ha->sync_cnt << 1) <= ha->refcount) + continue; + + /* if fails defer unsyncing address */ + ref_cnt = ha->refcount - ha->sync_cnt; + if (unsync && unsync(dev, ha->addr, ref_cnt)) + continue; + + ha->refcount = (ref_cnt << 1) + 1; + ha->sync_cnt = ref_cnt; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync updated/new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address added or reused */ + if ((ha->sync_cnt << 1) >= ha->refcount) + continue; + + ref_cnt = ha->refcount - ha->sync_cnt; + err = sync(dev, ha->addr, ref_cnt); + if (err) + return err; + + ha->refcount = ref_cnt << 1; + ha->sync_cnt = ref_cnt; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_ref_sync_dev); + +/** + * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on + * it from device + * @list: address list to remove synchronized addresses (references on it) from + * @dev: device to sync + * @unsync: function to call if address and references on it should be removed + * + * Remove all addresses that were added to the device by + * __hw_addr_ref_sync_dev(). This function is intended to be called from the + * ndo_stop or ndo_open functions on devices that require explicit address (or + * references on it) add/remove notifications. If the unsync function pointer + * is NULL then this function can be used to just reset the sync_cnt for the + * addresses in the list. + **/ +void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) + continue; + + ha->refcount -= ha->sync_cnt - 1; + ha->sync_cnt = 0; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); + +/** + * __hw_addr_unsync_dev - Remove synchronized addresses from device + * @list: address list to remove synchronized addresses from + * @dev: device to sync + * @unsync: function to call if address should be removed + * + * Remove all addresses that were added to the device by __hw_addr_sync_dev(). + * This function is intended to be called from the ndo_stop or ndo_open + * functions on devices that require explicit address add/remove + * notifications. If the unsync function pointer is NULL then this function + * can be used to just reset the sync_cnt for the addresses in the list. + **/ +void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if unsync is defined and fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr)) + continue; + + ha->sync_cnt--; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_unsync_dev); + +static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; + list->tree = RB_ROOT; list_for_each_entry_safe(ha, tmp, &list->list, list) { list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); } list->count = 0; } -EXPORT_SYMBOL(__hw_addr_flush); void __hw_addr_init(struct netdev_hw_addr_list *list) { INIT_LIST_HEAD(&list->list); list->count = 0; + list->tree = RB_ROOT; } EXPORT_SYMBOL(__hw_addr_init); @@ -287,6 +505,21 @@ EXPORT_SYMBOL(__hw_addr_init); * Device addresses handling functions */ +/* Check that netdev->dev_addr is not written to directly as this would + * break the rbtree layout. All changes should go thru dev_addr_set() and co. + * Remove this check in mid-2024. + */ +void dev_addr_check(struct net_device *dev) +{ + if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN)) + return; + + netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr); + netdev_warn(dev, "Expected addr: %*ph\n", + MAX_ADDR_LEN, dev->dev_addr_shadow); + netdev_WARN(dev, "Incorrect netdev->dev_addr\n"); +} + /** * dev_addr_flush - Flush device address list * @dev: device @@ -298,11 +531,11 @@ EXPORT_SYMBOL(__hw_addr_init); void dev_addr_flush(struct net_device *dev) { /* rtnl_mutex must be held here */ + dev_addr_check(dev); __hw_addr_flush(&dev->dev_addrs); dev->dev_addr = NULL; } -EXPORT_SYMBOL(dev_addr_flush); /** * dev_addr_init - Init device address list @@ -336,7 +569,21 @@ int dev_addr_init(struct net_device *dev) } return err; } -EXPORT_SYMBOL(dev_addr_init); + +void dev_addr_mod(struct net_device *dev, unsigned int offset, + const void *addr, size_t len) +{ + struct netdev_hw_addr *ha; + + dev_addr_check(dev); + + ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]); + rb_erase(&ha->node, &dev->dev_addrs.tree); + memcpy(&ha->addr[offset], addr, len); + memcpy(&dev->dev_addr_shadow[offset], addr, len); + WARN_ON(__hw_addr_insert(&dev->dev_addrs, ha, dev->addr_len)); +} +EXPORT_SYMBOL(dev_addr_mod); /** * dev_addr_add - Add a device address @@ -356,6 +603,9 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, ASSERT_RTNL(); + err = netif_pre_changeaddr_notify(dev, addr, NULL); + if (err) + return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); @@ -400,59 +650,6 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr, } EXPORT_SYMBOL(dev_addr_del); -/** - * dev_addr_add_multiple - Add device addresses from another device - * @to_dev: device to which addresses will be added - * @from_dev: device from which addresses will be added - * @addr_type: address type - 0 means type will be used from from_dev - * - * Add device addresses of the one device to another. - ** - * The caller must hold the rtnl_mutex. - */ -int dev_addr_add_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - int err; - - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return err; -} -EXPORT_SYMBOL(dev_addr_add_multiple); - -/** - * dev_addr_del_multiple - Delete device addresses by another device - * @to_dev: device where the addresses will be deleted - * @from_dev: device supplying the addresses to be deleted - * @addr_type: address type - 0 means type will be used from from_dev - * - * Deletes addresses in to device by the list of addresses in from device. - * - * The caller must hold the rtnl_mutex. - */ -int dev_addr_del_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return 0; -} -EXPORT_SYMBOL(dev_addr_del_multiple); - /* * Unicast list handling functions */ @@ -464,22 +661,14 @@ EXPORT_SYMBOL(dev_addr_del_multiple); */ int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr) { - struct netdev_hw_addr *ha; int err; netif_addr_lock_bh(dev); - list_for_each_entry(ha, &dev->uc.list, list) { - if (!memcmp(ha->addr, addr, dev->addr_len) && - ha->type == NETDEV_HW_ADDR_T_UNICAST) { - err = -EEXIST; - goto out; - } - } - err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_UNICAST, true, false); + err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST, true, false, + 0, true); if (!err) __dev_set_rx_mode(dev); -out: netif_addr_unlock_bh(dev); return err; } @@ -549,7 +738,7 @@ int dev_uc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -579,7 +768,7 @@ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -602,8 +791,17 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return; + /* netif_addr_lock_bh() uses lockdep subclass 0, this is okay for two + * reasons: + * 1) This is always called without any addr_list_lock, so as the + * outermost one here, it must be 0. + * 2) This is called by some callers after unlinking the upper device, + * so the dev->lower_level becomes 1 again. + * Therefore, the subclass for 'from' is 0, for 'to' is either 1 or + * larger. + */ netif_addr_lock_bh(from); - netif_addr_lock_nested(to); + netif_addr_lock(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); @@ -626,7 +824,7 @@ void dev_uc_flush(struct net_device *dev) EXPORT_SYMBOL(dev_uc_flush); /** - * dev_uc_flush - Init unicast address list + * dev_uc_init - Init unicast address list * @dev: device * * Init unicast address list. @@ -648,22 +846,14 @@ EXPORT_SYMBOL(dev_uc_init); */ int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) { - struct netdev_hw_addr *ha; int err; netif_addr_lock_bh(dev); - list_for_each_entry(ha, &dev->mc.list, list) { - if (!memcmp(ha->addr, addr, dev->addr_len) && - ha->type == NETDEV_HW_ADDR_T_MULTICAST) { - err = -EEXIST; - goto out; - } - } - err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, true, false); + err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, true, false, + 0, true); if (!err) __dev_set_rx_mode(dev); -out: netif_addr_unlock_bh(dev); return err; } @@ -676,7 +866,8 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, global, false); + NETDEV_HW_ADDR_T_MULTICAST, global, false, + 0, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); @@ -752,7 +943,7 @@ int dev_mc_del_global(struct net_device *dev, const unsigned char *addr) EXPORT_SYMBOL(dev_mc_del_global); /** - * dev_mc_sync - Synchronize device's unicast list to another device + * dev_mc_sync - Synchronize device's multicast list to another device * @to: destination device * @from: source device * @@ -770,7 +961,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -780,7 +971,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) EXPORT_SYMBOL(dev_mc_sync); /** - * dev_mc_sync_multiple - Synchronize device's unicast list to another + * dev_mc_sync_multiple - Synchronize device's multicast list to another * device, but allow for multiple calls to sync to multiple devices. * @to: destination device * @from: source device @@ -800,7 +991,7 @@ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -823,8 +1014,9 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return; + /* See the above comments inside dev_uc_unsync(). */ netif_addr_lock_bh(from); - netif_addr_lock_nested(to); + netif_addr_lock(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); @@ -847,7 +1039,7 @@ void dev_mc_flush(struct net_device *dev) EXPORT_SYMBOL(dev_mc_flush); /** - * dev_mc_flush - Init multicast address list + * dev_mc_init - Init multicast address list * @dev: device * * Init multicast address list. diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c new file mode 100644 index 000000000000..8e1dba825e94 --- /dev/null +++ b/net/core/dev_addr_lists_test.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <kunit/test.h> +#include <linux/etherdevice.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> + +static const struct net_device_ops dummy_netdev_ops = { +}; + +struct dev_addr_test_priv { + u32 addr_seen; +}; + +static int dev_addr_test_sync(struct net_device *netdev, const unsigned char *a) +{ + struct dev_addr_test_priv *datp = netdev_priv(netdev); + + if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) + datp->addr_seen |= 1 << a[0]; + return 0; +} + +static int dev_addr_test_unsync(struct net_device *netdev, + const unsigned char *a) +{ + struct dev_addr_test_priv *datp = netdev_priv(netdev); + + if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) + datp->addr_seen &= ~(1 << a[0]); + return 0; +} + +static int dev_addr_test_init(struct kunit *test) +{ + struct dev_addr_test_priv *datp; + struct net_device *netdev; + int err; + + netdev = alloc_etherdev(sizeof(*datp)); + KUNIT_ASSERT_TRUE(test, !!netdev); + + test->priv = netdev; + netdev->netdev_ops = &dummy_netdev_ops; + + err = register_netdev(netdev); + if (err) { + free_netdev(netdev); + KUNIT_FAIL(test, "Can't register netdev %d", err); + } + + return 0; +} + +static void dev_addr_test_exit(struct kunit *test) +{ + struct net_device *netdev = test->priv; + + unregister_netdev(netdev); + free_netdev(netdev); +} + +static void dev_addr_test_basic(struct kunit *test) +{ + struct net_device *netdev = test->priv; + u8 addr[ETH_ALEN]; + + rtnl_lock(); + KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr); + + memset(addr, 2, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr)); + + memset(addr, 3, sizeof(addr)); + dev_addr_set(netdev, addr); + KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr)); + rtnl_unlock(); +} + +static void dev_addr_test_sync_one(struct kunit *test) +{ + struct net_device *netdev = test->priv; + struct dev_addr_test_priv *datp; + u8 addr[ETH_ALEN]; + + datp = netdev_priv(netdev); + + rtnl_lock(); + memset(addr, 1, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 2, datp->addr_seen); + + memset(addr, 2, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + datp->addr_seen = 0; + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + /* It's not going to sync anything because the main address is + * considered synced and we overwrite in place. + */ + KUNIT_EXPECT_EQ(test, 0, datp->addr_seen); + rtnl_unlock(); +} + +static void dev_addr_test_add_del(struct kunit *test) +{ + struct net_device *netdev = test->priv; + struct dev_addr_test_priv *datp; + u8 addr[ETH_ALEN]; + int i; + + datp = netdev_priv(netdev); + + rtnl_lock(); + for (i = 1; i < 4; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + } + /* Add 3 again */ + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen); + + KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen); + + for (i = 1; i < 4; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + } + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 1, datp->addr_seen); + rtnl_unlock(); +} + +static void dev_addr_test_del_main(struct kunit *test) +{ + struct net_device *netdev = test->priv; + u8 addr[ETH_ALEN]; + + rtnl_lock(); + memset(addr, 1, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + rtnl_unlock(); +} + +static void dev_addr_test_add_set(struct kunit *test) +{ + struct net_device *netdev = test->priv; + struct dev_addr_test_priv *datp; + u8 addr[ETH_ALEN]; + int i; + + datp = netdev_priv(netdev); + + rtnl_lock(); + /* There is no external API like dev_addr_add_excl(), + * so shuffle the tree a little bit and exploit aliasing. + */ + for (i = 1; i < 16; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + } + + memset(addr, i, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + memset(addr, 0, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen); + rtnl_unlock(); +} + +static void dev_addr_test_add_excl(struct kunit *test) +{ + struct net_device *netdev = test->priv; + u8 addr[ETH_ALEN]; + int i; + + rtnl_lock(); + for (i = 0; i < 10; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr)); + } + KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); + + for (i = 0; i < 10; i += 2) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr)); + } + for (i = 1; i < 10; i += 2) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); + } + rtnl_unlock(); +} + +static struct kunit_case dev_addr_test_cases[] = { + KUNIT_CASE(dev_addr_test_basic), + KUNIT_CASE(dev_addr_test_sync_one), + KUNIT_CASE(dev_addr_test_add_del), + KUNIT_CASE(dev_addr_test_del_main), + KUNIT_CASE(dev_addr_test_add_set), + KUNIT_CASE(dev_addr_test_add_excl), + {} +}; + +static struct kunit_suite dev_addr_test_suite = { + .name = "dev-addr-list-test", + .test_cases = dev_addr_test_cases, + .init = dev_addr_test_init, + .exit = dev_addr_test_exit, +}; +kunit_test_suite(dev_addr_test_suite); + +MODULE_DESCRIPTION("KUnit tests for struct netdev_hw_addr_list"); +MODULE_LICENSE("GPL"); diff --git a/net/core/dev_api.c b/net/core/dev_api.c new file mode 100644 index 000000000000..f28852078aa6 --- /dev/null +++ b/net/core/dev_api.c @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/netdevice.h> +#include <net/netdev_lock.h> + +#include "dev.h" + +/** + * dev_change_name() - change name of a device + * @dev: device + * @newname: name (or format string) must be at least IFNAMSIZ + * + * Change name of a device, can pass format strings "eth%d". + * for wildcarding. + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_name(struct net_device *dev, const char *newname) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_name(dev, newname); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_set_alias() - change ifalias of a device + * @dev: device + * @alias: name up to IFALIASZ + * @len: limit of bytes to copy from info + * + * Set ifalias for a device. + * + * Return: 0 on success, -errno on failure. + */ +int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_alias(dev, alias, len); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_alias); + +/** + * dev_change_flags() - change device settings + * @dev: device + * @flags: device state flags + * @extack: netlink extended ack + * + * Change settings on device based state flags. The flags are + * in the userspace exported format. + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_flags(dev, flags, extack); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_change_flags); + +/** + * dev_set_group() - change group this device belongs to + * @dev: device + * @new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ + netdev_lock_ops(dev); + netif_set_group(dev, new_group); + netdev_unlock_ops(dev); +} + +int dev_set_mac_address_user(struct net_device *dev, + struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) +{ + int ret; + + down_write(&dev_addr_sem); + netdev_lock_ops(dev); + ret = netif_set_mac_address(dev, ss, extack); + netdev_unlock_ops(dev); + up_write(&dev_addr_sem); + + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address_user); + +/** + * dev_change_net_namespace() - move device to different nethost namespace + * @dev: device + * @net: network namespace + * @pat: If not NULL name pattern to try if the current device name + * is already taken in the destination network namespace. + * + * This function shuts down a device interface and moves it + * to a new network namespace. On success 0 is returned, on + * a failure a netagive errno code is returned. + * + * Callers must hold the rtnl semaphore. + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat) +{ + return __dev_change_net_namespace(dev, net, pat, 0, NULL); +} +EXPORT_SYMBOL_GPL(dev_change_net_namespace); + +/** + * dev_change_carrier() - change device carrier + * @dev: device + * @new_carrier: new value + * + * Change device carrier + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_carrier(struct net_device *dev, bool new_carrier) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_carrier(dev, new_carrier); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_change_tx_queue_len() - change TX queue length of a netdevice + * @dev: device + * @new_len: new tx queue length + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_tx_queue_len(dev, new_len); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_change_proto_down() - set carrier according to proto_down + * @dev: device + * @proto_down: new value + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_proto_down(struct net_device *dev, bool proto_down) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_proto_down(dev, proto_down); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_open() - prepare an interface for use + * @dev: device to open + * @extack: netlink extended ack + * + * Takes a device from down to up state. The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. + * + * Return: 0 on success, -errno on failure. + */ +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_open(dev, extack); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_open); + +/** + * dev_close() - shutdown an interface + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. + */ +void dev_close(struct net_device *dev) +{ + netdev_lock_ops(dev); + netif_close(dev); + netdev_unlock_ops(dev); +} +EXPORT_SYMBOL(dev_close); + +int dev_eth_ioctl(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int ret = -ENODEV; + + if (!ops->ndo_eth_ioctl) + return -EOPNOTSUPP; + + netdev_lock_ops(dev); + if (netif_device_present(dev)) + ret = ops->ndo_eth_ioctl(dev, ifr, cmd); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_eth_ioctl); + +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_mtu(dev, new_mtu); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_mtu); + +/** + * dev_disable_lro() - disable Large Receive Offload on a device + * @dev: device + * + * Disable Large Receive Offload (LRO) on a net device. Must be + * called under RTNL. This is needed if received packets may be + * forwarded to another interface. + */ +void dev_disable_lro(struct net_device *dev) +{ + netdev_lock_ops(dev); + netif_disable_lro(dev); + netdev_unlock_ops(dev); +} +EXPORT_SYMBOL(dev_disable_lro); + +/** + * dev_set_promiscuity() - update promiscuity count on a device + * @dev: device + * @inc: modifier + * + * Add or remove promiscuity from a device. While the count in the device + * remains above zero the interface remains promiscuous. Once it hits zero + * the device reverts back to normal filtering operation. A negative inc + * value is used to drop promiscuity on the device. + * Return 0 if successful or a negative errno code on error. + */ +int dev_set_promiscuity(struct net_device *dev, int inc) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_promiscuity(dev, inc); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_promiscuity); + +/** + * dev_set_allmulti() - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * + * Return: 0 on success, -errno on failure. + */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_allmulti(dev, inc, true); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_allmulti); + +/** + * dev_set_mac_address() - change Media Access Control Address + * @dev: device + * @ss: new address + * @extack: netlink extended ack + * + * Change the hardware (MAC) address of the device + * + * Return: 0 on success, -errno on failure. + */ +int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_mac_address(dev, ss, extack); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address); + +int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_xdp_propagate(dev, bpf); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL_GPL(dev_xdp_propagate); + +/** + * netdev_state_change() - device changes state + * @dev: device to cause notification + * + * Called to indicate a device has changed state. This function calls + * the notifier chains for netdev_chain and sends a NEWLINK message + * to the routing socket. + */ +void netdev_state_change(struct net_device *dev) +{ + netdev_lock_ops(dev); + netif_state_change(dev); + netdev_unlock_ops(dev); +} +EXPORT_SYMBOL(netdev_state_change); + +int dev_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded) +{ + int ret; + + netdev_lock(dev); + ret = netif_set_threaded(dev, threaded); + netdev_unlock(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_threaded); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 5b7d0e1d0664..53a53357cfef 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -1,11 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kmod.h> #include <linux/netdevice.h> +#include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> +#include <linux/phylib_stubs.h> +#include <linux/ptp_clock_kernel.h> #include <linux/wireless.h> +#include <linux/if_bridge.h> +#include <net/dsa_stubs.h> +#include <net/netdev_lock.h> #include <net/wext.h> +#include "dev.h" + /* * Map an interface index to its name (SIOCGIFNAME) */ @@ -17,102 +26,112 @@ * match. --pb */ -static int dev_ifname(struct net *net, struct ifreq __user *arg) -{ - struct ifreq ifr; - int error; - - /* - * Fetch the caller's info block. - */ - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - - error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex); - if (error) - return error; - - if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - return 0; -} - -static gifconf_func_t *gifconf_list[NPROTO]; - -/** - * register_gifconf - register a SIOCGIF handler - * @family: Address family - * @gifconf: Function handler - * - * Register protocol dependent address dumping routines. The handler - * that is passed must not be freed or reused until it has been replaced - * by another handler. - */ -int register_gifconf(unsigned int family, gifconf_func_t *gifconf) +static int dev_ifname(struct net *net, struct ifreq *ifr) { - if (family >= NPROTO) - return -EINVAL; - gifconf_list[family] = gifconf; - return 0; + ifr->ifr_name[IFNAMSIZ-1] = 0; + return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex); } -EXPORT_SYMBOL(register_gifconf); /* * Perform a SIOCGIFCONF call. This structure will change * size eventually, and there is nothing I can do about it. * Thus we will need a 'compatibility mode'. */ - -static int dev_ifconf(struct net *net, char __user *arg) +int dev_ifconf(struct net *net, struct ifconf __user *uifc) { - struct ifconf ifc; struct net_device *dev; - char __user *pos; - int len; - int total; - int i; + void __user *pos; + size_t size; + int len, total = 0, done; - /* - * Fetch the caller's info block. - */ + /* both the ifconf and the ifreq structures are slightly different */ + if (in_compat_syscall()) { + struct compat_ifconf ifc32; - if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) - return -EFAULT; + if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf))) + return -EFAULT; - pos = ifc.ifc_buf; - len = ifc.ifc_len; + pos = compat_ptr(ifc32.ifcbuf); + len = ifc32.ifc_len; + size = sizeof(struct compat_ifreq); + } else { + struct ifconf ifc; - /* - * Loop over the interfaces, and write an info block for each. - */ + if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) + return -EFAULT; + + pos = ifc.ifc_buf; + len = ifc.ifc_len; + size = sizeof(struct ifreq); + } - total = 0; + /* Loop over the interfaces, and write an info block for each. */ + rtnl_net_lock(net); for_each_netdev(net, dev) { - for (i = 0; i < NPROTO; i++) { - if (gifconf_list[i]) { - int done; - if (!pos) - done = gifconf_list[i](dev, NULL, 0); - else - done = gifconf_list[i](dev, pos + total, - len - total); - if (done < 0) - return -EFAULT; - total += done; - } + if (!pos) + done = inet_gifconf(dev, NULL, 0, size); + else + done = inet_gifconf(dev, pos + total, + len - total, size); + if (done < 0) { + rtnl_net_unlock(net); + return -EFAULT; } + total += done; } + rtnl_net_unlock(net); - /* - * All done. Write the updated control block back to the caller. - */ - ifc.ifc_len = total; + return put_user(total, &uifc->ifc_len); +} - /* - * Both BSD and Solaris return 0 here, so we do too. - */ - return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; +static int dev_getifmap(struct net_device *dev, struct ifreq *ifr) +{ + struct ifmap *ifmap = &ifr->ifr_map; + + if (in_compat_syscall()) { + struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap; + + cifmap->mem_start = dev->mem_start; + cifmap->mem_end = dev->mem_end; + cifmap->base_addr = dev->base_addr; + cifmap->irq = dev->irq; + cifmap->dma = dev->dma; + cifmap->port = dev->if_port; + + return 0; + } + + ifmap->mem_start = dev->mem_start; + ifmap->mem_end = dev->mem_end; + ifmap->base_addr = dev->base_addr; + ifmap->irq = dev->irq; + ifmap->dma = dev->dma; + ifmap->port = dev->if_port; + + return 0; +} + +static int netif_setifmap(struct net_device *dev, struct ifreq *ifr) +{ + struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map; + + if (!dev->netdev_ops->ndo_set_config) + return -EOPNOTSUPP; + + if (in_compat_syscall()) { + struct ifmap ifmap = { + .mem_start = cifmap->mem_start, + .mem_end = cifmap->mem_end, + .base_addr = cifmap->base_addr, + .irq = cifmap->irq, + .dma = cifmap->dma, + .port = cifmap->port, + }; + + return dev->netdev_ops->ndo_set_config(dev, &ifmap); + } + + return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map); } /* @@ -128,7 +147,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm switch (cmd) { case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = (short) dev_get_flags(dev); + ifr->ifr_flags = (short)netif_get_flags(dev); return 0; case SIOCGIFMETRIC: /* Get the metric on the interface @@ -140,27 +159,12 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm ifr->ifr_mtu = dev->mtu; return 0; - case SIOCGIFHWADDR: - if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); - else - memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - ifr->ifr_hwaddr.sa_family = dev->type; - return 0; - case SIOCGIFSLAVE: err = -EINVAL; break; case SIOCGIFMAP: - ifr->ifr_map.mem_start = dev->mem_start; - ifr->ifr_map.mem_end = dev->mem_end; - ifr->ifr_map.base_addr = dev->base_addr; - ifr->ifr_map.irq = dev->irq; - ifr->ifr_map.dma = dev->dma; - ifr->ifr_map.port = dev->if_port; - return 0; + return dev_getifmap(dev, ifr); case SIOCGIFINDEX: ifr->ifr_ifindex = dev->ifindex; @@ -182,29 +186,29 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm return err; } -static int net_hwtstamp_validate(struct ifreq *ifr) +int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) { - struct hwtstamp_config cfg; enum hwtstamp_tx_types tx_type; enum hwtstamp_rx_filters rx_filter; int tx_type_valid = 0; int rx_filter_valid = 0; - if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) - return -EFAULT; - - if (cfg.flags) /* reserved for future extensions */ + if (cfg->flags & ~HWTSTAMP_FLAG_MASK) return -EINVAL; - tx_type = cfg.tx_type; - rx_filter = cfg.rx_filter; + tx_type = cfg->tx_type; + rx_filter = cfg->rx_filter; switch (tx_type) { case HWTSTAMP_TX_OFF: case HWTSTAMP_TX_ON: case HWTSTAMP_TX_ONESTEP_SYNC: + case HWTSTAMP_TX_ONESTEP_P2P: tx_type_valid = 1; break; + case __HWTSTAMP_TX_CNT: + /* not a real value */ + break; } switch (rx_filter) { @@ -223,8 +227,12 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: rx_filter_valid = 1; break; + case __HWTSTAMP_FILTER_CNT: + /* not a real value */ + break; } if (!tx_type_valid || !rx_filter_valid) @@ -233,10 +241,333 @@ static int net_hwtstamp_validate(struct ifreq *ifr) return 0; } +/** + * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC + * or of attached phylib PHY + * @dev: Network device + * @cfg: Timestamping configuration structure + * + * Helper for calling the default hardware provider timestamping. + * + * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), but + * phydev->mii_ts has both hwtstamp_get() and hwtstamp_set() methods. So this + * will return -EOPNOTSUPP for phylib only if hwtstamp_get() is not + * implemented for now, which is still more accurate than letting the netdev + * handle the GET request. + */ +int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) +{ + struct hwtstamp_provider *hwprov; + + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + cfg->qualifier = hwprov->desc.qualifier; + if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && + hwprov->phydev) + return phy_hwtstamp_get(hwprov->phydev, cfg); + + if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); + + return -EOPNOTSUPP; + } + + if (phy_is_default_hwtstamp(dev->phydev)) + return phy_hwtstamp_get(dev->phydev, cfg); + + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); +} + +static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config kernel_cfg = {}; + struct hwtstamp_config cfg; + int err; + + if (!ops->ndo_hwtstamp_get) + return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */ + + if (!netif_device_present(dev)) + return -ENODEV; + + kernel_cfg.ifr = ifr; + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, &kernel_cfg); + netdev_unlock_ops(dev); + if (err) + return err; + + /* If the request was resolved through an unconverted driver, omit + * the copy_to_user(), since the implementation has already done that + */ + if (!kernel_cfg.copied_to_user) { + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + } + + return 0; +} + +/** + * dev_set_hwtstamp_phylib() - Change hardware timestamping of NIC + * or of attached phylib PHY + * @dev: Network device + * @cfg: Timestamping configuration structure + * @extack: Netlink extended ack message structure, for error reporting + * + * Helper for enforcing a common policy that phylib timestamping, if available, + * should take precedence in front of hardware timestamping provided by the + * netdev. If the netdev driver needs to perform specific actions even for PHY + * timestamping to work properly (a switch port must trap the timestamped + * frames and not forward them), it must set dev->see_all_hwtstamp_requests. + */ +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config old_cfg = {}; + struct hwtstamp_provider *hwprov; + struct phy_device *phydev; + bool changed = false; + bool phy_ts; + int err; + + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && + hwprov->phydev) { + phy_ts = true; + phydev = hwprov->phydev; + } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) { + phy_ts = false; + } else { + return -EOPNOTSUPP; + } + + cfg->qualifier = hwprov->desc.qualifier; + } else { + phy_ts = phy_is_default_hwtstamp(dev->phydev); + if (phy_ts) + phydev = dev->phydev; + } + + cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; + + if (phy_ts && dev->see_all_hwtstamp_requests) { + err = ops->ndo_hwtstamp_get(dev, &old_cfg); + if (err) + return err; + } + + if (!phy_ts || dev->see_all_hwtstamp_requests) { + err = ops->ndo_hwtstamp_set(dev, cfg, extack); + if (err) { + if (extack->_msg) + netdev_err(dev, "%s\n", extack->_msg); + return err; + } + } + + if (phy_ts && dev->see_all_hwtstamp_requests) + changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); + + if (phy_ts) { + err = phy_hwtstamp_set(phydev, cfg, extack); + if (err) { + if (changed) + ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); + return err; + } + } + + return 0; +} + +static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config kernel_cfg = {}; + struct netlink_ext_ack extack = {}; + struct hwtstamp_config cfg; + int err; + + if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) + return -EFAULT; + + hwtstamp_config_to_kernel(&kernel_cfg, &cfg); + kernel_cfg.ifr = ifr; + + err = net_hwtstamp_validate(&kernel_cfg); + if (err) + return err; + + err = dsa_conduit_hwtstamp_validate(dev, &kernel_cfg, &extack); + if (err) { + if (extack._msg) + netdev_err(dev, "%s\n", extack._msg); + return err; + } + + if (!ops->ndo_hwtstamp_set) + return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */ + + if (!netif_device_present(dev)) + return -ENODEV; + + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack); + netdev_unlock_ops(dev); + if (err) + return err; + + /* The driver may have modified the configuration, so copy the + * updated version of it back to user space + */ + if (!kernel_cfg.copied_to_user) { + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + } + + return 0; +} + +static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, + struct kernel_hwtstamp_config *kernel_cfg) +{ + struct ifreq ifrr; + int err; + + if (!kernel_cfg->ifr) + return -EINVAL; + + strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); + ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; + + err = dev_eth_ioctl(dev, &ifrr, cmd); + if (err) + return err; + + kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru; + kernel_cfg->copied_to_user = true; + + return 0; +} + +int generic_hwtstamp_get_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!netif_device_present(dev)) + return -ENODEV; + + if (ops->ndo_hwtstamp_get) { + int err; + + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, kernel_cfg); + netdev_unlock_ops(dev); + + return err; + } + + /* Legacy path: unconverted lower driver */ + return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); +} +EXPORT_SYMBOL(generic_hwtstamp_get_lower); + +int generic_hwtstamp_set_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!netif_device_present(dev)) + return -ENODEV; + + if (ops->ndo_hwtstamp_set) { + int err; + + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + netdev_unlock_ops(dev); + + return err; + } + + /* Legacy path: unconverted lower driver */ + return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); +} +EXPORT_SYMBOL(generic_hwtstamp_set_lower); + +static int dev_siocbond(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocbond) { + int ret = -ENODEV; + + netdev_lock_ops(dev); + if (netif_device_present(dev)) + ret = ops->ndo_siocbond(dev, ifr, cmd); + netdev_unlock_ops(dev); + + return ret; + } + + return -EOPNOTSUPP; +} + +static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocdevprivate) { + int ret = -ENODEV; + + netdev_lock_ops(dev); + if (netif_device_present(dev)) + ret = ops->ndo_siocdevprivate(dev, ifr, data, cmd); + netdev_unlock_ops(dev); + + return ret; + } + + return -EOPNOTSUPP; +} + +static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocwandev) { + int ret = -ENODEV; + + netdev_lock_ops(dev); + if (netif_device_present(dev)) + ret = ops->ndo_siocwandev(dev, ifs); + netdev_unlock_ops(dev); + + return ret; + } + + return -EOPNOTSUPP; +} + /* - * Perform the SIOCxIFxxx calls, inside rtnl_lock() + * Perform the SIOCxIFxxx calls, inside rtnl_net_lock() */ -static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, + unsigned int cmd) { int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); @@ -249,7 +580,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) switch (cmd) { case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); + return dev_change_flags(dev, ifr->ifr_flags, NULL); case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ @@ -259,23 +590,28 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return dev_set_mtu(dev, ifr->ifr_mtu); case SIOCSIFHWADDR: - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + if (dev->addr_len > sizeof(ifr->ifr_hwaddr)) + return -EINVAL; + return dev_set_mac_address_user(dev, + (struct sockaddr_storage *)&ifr->ifr_hwaddr, + NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + min(sizeof(ifr->ifr_hwaddr.sa_data), + (size_t)dev->addr_len)); + netdev_lock_ops(dev); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + netdev_unlock_ops(dev); return 0; case SIOCSIFMAP: - if (ops->ndo_set_config) { - if (!netif_device_present(dev)) - return -ENODEV; - return ops->ndo_set_config(dev, &ifr->ifr_map); - } - return -EOPNOTSUPP; + netdev_lock_ops(dev); + err = netif_setifmap(dev, ifr); + netdev_unlock_ops(dev); + return err; case SIOCADDMULTI: if (!ops->ndo_set_rx_mode || @@ -283,7 +619,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_lock_ops(dev); + err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_unlock_ops(dev); + return err; case SIOCDELMULTI: if (!ops->ndo_set_rx_mode || @@ -291,53 +630,48 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_lock_ops(dev); + err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_unlock_ops(dev); + return err; case SIOCSIFTXQLEN: if (ifr->ifr_qlen < 0) return -EINVAL; - dev->tx_queue_len = ifr->ifr_qlen; - return 0; + return dev_change_tx_queue_len(dev, ifr->ifr_qlen); case SIOCSIFNAME: ifr->ifr_newname[IFNAMSIZ-1] = '\0'; return dev_change_name(dev, ifr->ifr_newname); + case SIOCWANDEV: + return dev_siocwandev(dev, &ifr->ifr_settings); + + case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15: + return dev_siocdevprivate(dev, ifr, data, cmd); + case SIOCSHWTSTAMP: - err = net_hwtstamp_validate(ifr); - if (err) - return err; - /* fall through */ + return dev_set_hwtstamp(dev, ifr); - /* - * Unknown or private ioctl - */ - default: - if ((cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) || - cmd == SIOCBONDENSLAVE || - cmd == SIOCBONDRELEASE || - cmd == SIOCBONDSETHWADDR || - cmd == SIOCBONDSLAVEINFOQUERY || - cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCBRADDIF || - cmd == SIOCBRDELIF || - cmd == SIOCSHWTSTAMP || - cmd == SIOCWANDEV) { - err = -EOPNOTSUPP; - if (ops->ndo_do_ioctl) { - if (netif_device_present(dev)) - err = ops->ndo_do_ioctl(dev, ifr, cmd); - else - err = -ENODEV; - } - } else - err = -EINVAL; + case SIOCGHWTSTAMP: + return dev_get_hwtstamp(dev, ifr); + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSMIIREG: + return dev_eth_ioctl(dev, ifr, cmd); + + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + case SIOCBONDCHANGEACTIVE: + return dev_siocbond(dev, ifr, cmd); + + /* Unknown ioctl */ + default: + err = -EINVAL; } return err; } @@ -364,11 +698,8 @@ void dev_load(struct net *net, const char *name) no_module = !dev; if (no_module && capable(CAP_NET_ADMIN)) no_module = request_module("netdev-%s", name); - if (no_module && capable(CAP_SYS_MODULE)) { - if (!request_module("%s", name)) - pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n", - name); - } + if (no_module && capable(CAP_SYS_MODULE)) + request_module("%s", name); } EXPORT_SYMBOL(dev_load); @@ -381,7 +712,9 @@ EXPORT_SYMBOL(dev_load); * dev_ioctl - network device ioctl * @net: the applicable net namespace * @cmd: command to issue - * @arg: pointer to a struct ifreq in user space + * @ifr: pointer to a struct ifreq in user space + * @data: data exchanged with userspace + * @need_copyout: whether or not copy_to_user() should be called * * Issue ioctl functions to devices. This is normally called by the * user space syscall interfaces but can sometimes be useful for @@ -389,32 +722,20 @@ EXPORT_SYMBOL(dev_load); * positive or a negative errno code on error. */ -int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, + void __user *data, bool *need_copyout) { - struct ifreq ifr; int ret; char *colon; - /* One special case: SIOCGIFCONF takes ifconf argument - and requires shared lock, because it sleeps writing - to user space. - */ - - if (cmd == SIOCGIFCONF) { - rtnl_lock(); - ret = dev_ifconf(net, (char __user *) arg); - rtnl_unlock(); - return ret; - } + if (need_copyout) + *need_copyout = true; if (cmd == SIOCGIFNAME) - return dev_ifname(net, (struct ifreq __user *)arg); - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; + return dev_ifname(net, ifr); - ifr.ifr_name[IFNAMSIZ-1] = 0; + ifr->ifr_name[IFNAMSIZ-1] = 0; - colon = strchr(ifr.ifr_name, ':'); + colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; @@ -423,6 +744,13 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) */ switch (cmd) { + case SIOCGIFHWADDR: + dev_load(net, ifr->ifr_name); + ret = netif_get_mac_address(&ifr->ifr_hwaddr, net, + ifr->ifr_name); + if (colon) + *colon = ':'; + return ret; /* * These ioctl calls: * - can be done by all. @@ -432,36 +760,23 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFMTU: - case SIOCGIFHWADDR: case SIOCGIFSLAVE: case SIOCGIFMAP: case SIOCGIFINDEX: case SIOCGIFTXQLEN: - dev_load(net, ifr.ifr_name); + dev_load(net, ifr->ifr_name); rcu_read_lock(); - ret = dev_ifsioc_locked(net, &ifr, cmd); + ret = dev_ifsioc_locked(net, ifr, cmd); rcu_read_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } + if (colon) + *colon = ':'; return ret; case SIOCETHTOOL: - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ethtool(net, &ifr); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } + dev_load(net, ifr->ifr_name); + ret = dev_ethtool(net, ifr, data); + if (colon) + *colon = ':'; return ret; /* @@ -473,19 +788,16 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSIFNAME: + dev_load(net, ifr->ifr_name); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } + + rtnl_net_lock(net); + ret = dev_ifsioc(net, ifr, data, cmd); + rtnl_net_unlock(net); + + if (colon) + *colon = ':'; return ret; /* @@ -498,7 +810,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFTXQLEN: if (!capable(CAP_NET_ADMIN)) return -EPERM; - /* fall through */ + fallthrough; /* * These ioctl calls: * - require local superuser power. @@ -518,18 +830,20 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: case SIOCSHWTSTAMP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - /* fall through */ + fallthrough; case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); + dev_load(net, ifr->ifr_name); + + rtnl_net_lock(net); + ret = dev_ifsioc(net, ifr, data, cmd); + rtnl_net_unlock(net); + + if (need_copyout) + *need_copyout = false; return ret; case SIOCGIFMEM: @@ -546,20 +860,16 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) */ default: if (cmd == SIOCWANDEV || + cmd == SIOCGHWTSTAMP || (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15)) { - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - if (!ret && copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; + dev_load(net, ifr->ifr_name); + + rtnl_net_lock(net); + ret = dev_ifsioc(net, ifr, data, cmd); + rtnl_net_unlock(net); return ret; } - /* Take care of Wireless Extensions */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) - return wext_handle_ioctl(net, &ifr, cmd, arg); return -ENOTTY; } } diff --git a/net/core/devmem.c b/net/core/devmem.c new file mode 100644 index 000000000000..ec4217d6c0b4 --- /dev/null +++ b/net/core/devmem.c @@ -0,0 +1,522 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Devmem TCP + * + * Authors: Mina Almasry <almasrymina@google.com> + * Willem de Bruijn <willemdebruijn.kernel@gmail.com> + * Kaiyuan Zhang <kaiyuanz@google.com + */ + +#include <linux/dma-buf.h> +#include <linux/genalloc.h> +#include <linux/mm.h> +#include <linux/netdevice.h> +#include <linux/types.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <trace/events/page_pool.h> + +#include "devmem.h" +#include "mp_dmabuf_devmem.h" +#include "page_pool_priv.h" + +/* Device memory support */ + +static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); + +static const struct memory_provider_ops dmabuf_devmem_ops; + +bool net_is_devmem_iov(struct net_iov *niov) +{ + return niov->type == NET_IOV_DMABUF; +} + +static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, + struct gen_pool_chunk *chunk, + void *not_used) +{ + struct dmabuf_genpool_chunk_owner *owner = chunk->owner; + + kvfree(owner->area.niovs); + kfree(owner); +} + +static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) +{ + struct dmabuf_genpool_chunk_owner *owner; + + owner = net_devmem_iov_to_chunk_owner(niov); + return owner->base_dma_addr + + ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); +} + +void __net_devmem_dmabuf_binding_free(struct work_struct *wq) +{ + struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); + + size_t size, avail; + + gen_pool_for_each_chunk(binding->chunk_pool, + net_devmem_dmabuf_free_chunk_owner, NULL); + + size = gen_pool_size(binding->chunk_pool); + avail = gen_pool_avail(binding->chunk_pool); + + if (!WARN(size != avail, "can't destroy genpool. size=%zu, avail=%zu", + size, avail)) + gen_pool_destroy(binding->chunk_pool); + + dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, + binding->direction); + dma_buf_detach(binding->dmabuf, binding->attachment); + dma_buf_put(binding->dmabuf); + xa_destroy(&binding->bound_rxqs); + kvfree(binding->tx_vec); + kfree(binding); +} + +struct net_iov * +net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + struct dmabuf_genpool_chunk_owner *owner; + unsigned long dma_addr; + struct net_iov *niov; + ssize_t offset; + ssize_t index; + + dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE, + (void **)&owner); + if (!dma_addr) + return NULL; + + offset = dma_addr - owner->base_dma_addr; + index = offset / PAGE_SIZE; + niov = &owner->area.niovs[index]; + + niov->desc.pp_magic = 0; + niov->desc.pp = NULL; + atomic_long_set(&niov->desc.pp_ref_count, 0); + + return niov; +} + +void net_devmem_free_dmabuf(struct net_iov *niov) +{ + struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov); + unsigned long dma_addr = net_devmem_get_dma_addr(niov); + + if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr, + PAGE_SIZE))) + return; + + gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE); +} + +void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + struct netdev_rx_queue *rxq; + unsigned long xa_idx; + unsigned int rxq_idx; + + xa_erase(&net_devmem_dmabuf_bindings, binding->id); + + /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the + * erase. + */ + synchronize_net(); + + if (binding->list.next) + list_del(&binding->list); + + xa_for_each(&binding->bound_rxqs, xa_idx, rxq) { + const struct pp_memory_provider_params mp_params = { + .mp_priv = binding, + .mp_ops = &dmabuf_devmem_ops, + }; + + rxq_idx = get_netdev_rx_queue_index(rxq); + + __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); + } + + net_devmem_dmabuf_binding_put(binding); +} + +int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack) +{ + struct pp_memory_provider_params mp_params = { + .mp_priv = binding, + .mp_ops = &dmabuf_devmem_ops, + }; + struct netdev_rx_queue *rxq; + u32 xa_idx; + int err; + + err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack); + if (err) + return err; + + rxq = __netif_get_rx_queue(dev, rxq_idx); + err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b, + GFP_KERNEL); + if (err) + goto err_close_rxq; + + return 0; + +err_close_rxq: + __net_mp_close_rxq(dev, rxq_idx, &mp_params); + return err; +} + +struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netdev_nl_sock *priv, + struct netlink_ext_ack *extack) +{ + struct net_devmem_dmabuf_binding *binding; + static u32 id_alloc_next; + struct scatterlist *sg; + struct dma_buf *dmabuf; + unsigned int sg_idx, i; + unsigned long virtual; + int err; + + if (!dma_dev) { + NL_SET_ERR_MSG(extack, "Device doesn't support DMA"); + return ERR_PTR(-EOPNOTSUPP); + } + + dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR(dmabuf)) + return ERR_CAST(dmabuf); + + binding = kzalloc_node(sizeof(*binding), GFP_KERNEL, + dev_to_node(&dev->dev)); + if (!binding) { + err = -ENOMEM; + goto err_put_dmabuf; + } + + binding->dev = dev; + xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); + + refcount_set(&binding->ref, 1); + + mutex_init(&binding->lock); + + binding->dmabuf = dmabuf; + binding->direction = direction; + + binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev); + if (IS_ERR(binding->attachment)) { + err = PTR_ERR(binding->attachment); + NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); + goto err_free_binding; + } + + binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, + direction); + if (IS_ERR(binding->sgt)) { + err = PTR_ERR(binding->sgt); + NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); + goto err_detach; + } + + if (direction == DMA_TO_DEVICE) { + binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE, + sizeof(struct net_iov *), + GFP_KERNEL); + if (!binding->tx_vec) { + err = -ENOMEM; + goto err_unmap; + } + } + + /* For simplicity we expect to make PAGE_SIZE allocations, but the + * binding can be much more flexible than that. We may be able to + * allocate MTU sized chunks here. Leave that for future work... + */ + binding->chunk_pool = gen_pool_create(PAGE_SHIFT, + dev_to_node(&dev->dev)); + if (!binding->chunk_pool) { + err = -ENOMEM; + goto err_tx_vec; + } + + virtual = 0; + for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) { + dma_addr_t dma_addr = sg_dma_address(sg); + struct dmabuf_genpool_chunk_owner *owner; + size_t len = sg_dma_len(sg); + struct net_iov *niov; + + owner = kzalloc_node(sizeof(*owner), GFP_KERNEL, + dev_to_node(&dev->dev)); + if (!owner) { + err = -ENOMEM; + goto err_free_chunks; + } + + owner->area.base_virtual = virtual; + owner->base_dma_addr = dma_addr; + owner->area.num_niovs = len / PAGE_SIZE; + owner->binding = binding; + + err = gen_pool_add_owner(binding->chunk_pool, dma_addr, + dma_addr, len, dev_to_node(&dev->dev), + owner); + if (err) { + kfree(owner); + err = -EINVAL; + goto err_free_chunks; + } + + owner->area.niovs = kvmalloc_array(owner->area.num_niovs, + sizeof(*owner->area.niovs), + GFP_KERNEL); + if (!owner->area.niovs) { + err = -ENOMEM; + goto err_free_chunks; + } + + for (i = 0; i < owner->area.num_niovs; i++) { + niov = &owner->area.niovs[i]; + niov->type = NET_IOV_DMABUF; + niov->owner = &owner->area; + page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), + net_devmem_get_dma_addr(niov)); + if (direction == DMA_TO_DEVICE) + binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov; + } + + virtual += len; + } + + err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, + binding, xa_limit_32b, &id_alloc_next, + GFP_KERNEL); + if (err < 0) + goto err_free_chunks; + + list_add(&binding->list, &priv->bindings); + + return binding; + +err_free_chunks: + gen_pool_for_each_chunk(binding->chunk_pool, + net_devmem_dmabuf_free_chunk_owner, NULL); + gen_pool_destroy(binding->chunk_pool); +err_tx_vec: + kvfree(binding->tx_vec); +err_unmap: + dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, + direction); +err_detach: + dma_buf_detach(dmabuf, binding->attachment); +err_free_binding: + kfree(binding); +err_put_dmabuf: + dma_buf_put(dmabuf); + return ERR_PTR(err); +} + +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + struct net_devmem_dmabuf_binding *binding; + + rcu_read_lock(); + binding = xa_load(&net_devmem_dmabuf_bindings, id); + if (binding) { + if (!net_devmem_dmabuf_binding_get(binding)) + binding = NULL; + } + rcu_read_unlock(); + + return binding; +} + +void net_devmem_get_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov)); +} + +void net_devmem_put_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov)); +} + +struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, + unsigned int dmabuf_id) +{ + struct net_devmem_dmabuf_binding *binding; + struct net_device *dst_dev; + struct dst_entry *dst; + int err = 0; + + binding = net_devmem_lookup_dmabuf(dmabuf_id); + if (!binding || !binding->tx_vec) { + err = -EINVAL; + goto out_err; + } + + rcu_read_lock(); + dst = __sk_dst_get(sk); + /* If dst is NULL (route expired), attempt to rebuild it. */ + if (unlikely(!dst)) { + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) { + err = -EHOSTUNREACH; + goto out_unlock; + } + dst = __sk_dst_get(sk); + if (unlikely(!dst)) { + err = -ENODEV; + goto out_unlock; + } + } + + /* The dma-addrs in this binding are only reachable to the corresponding + * net_device. + */ + dst_dev = dst_dev_rcu(dst); + if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) { + err = -ENODEV; + goto out_unlock; + } + + rcu_read_unlock(); + return binding; + +out_unlock: + rcu_read_unlock(); +out_err: + if (binding) + net_devmem_dmabuf_binding_put(binding); + + return ERR_PTR(err); +} + +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, + size_t virt_addr, size_t *off, size_t *size) +{ + if (virt_addr >= binding->dmabuf->size) + return NULL; + + *off = virt_addr % PAGE_SIZE; + *size = PAGE_SIZE - *off; + + return binding->tx_vec[virt_addr / PAGE_SIZE]; +} + +/*** "Dmabuf devmem memory provider" ***/ + +int mp_dmabuf_devmem_init(struct page_pool *pool) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + + if (!binding) + return -EINVAL; + + /* dma-buf dma addresses do not need and should not be used with + * dma_sync_for_cpu/device. Force disable dma_sync. + */ + pool->dma_sync = false; + pool->dma_sync_for_cpu = false; + + if (pool->p.order != 0) + return -E2BIG; + + net_devmem_dmabuf_binding_get(binding); + return 0; +} + +netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + struct net_iov *niov; + netmem_ref netmem; + + niov = net_devmem_alloc_dmabuf(binding); + if (!niov) + return 0; + + netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); + return netmem; +} + +void mp_dmabuf_devmem_destroy(struct page_pool *pool) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + + net_devmem_dmabuf_binding_put(binding); +} + +bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) +{ + long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem)); + + if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + return false; + + if (WARN_ON_ONCE(refcount != 1)) + return false; + + page_pool_clear_pp_info(netmem); + + net_devmem_free_dmabuf(netmem_to_net_iov(netmem)); + + /* We don't want the page pool put_page()ing our net_iovs. */ + return false; +} + +static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + const struct net_devmem_dmabuf_binding *binding = mp_priv; + int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF; + + return nla_put_u32(rsp, type, binding->id); +} + +static void mp_dmabuf_devmem_uninstall(void *mp_priv, + struct netdev_rx_queue *rxq) +{ + struct net_devmem_dmabuf_binding *binding = mp_priv; + struct netdev_rx_queue *bound_rxq; + unsigned long xa_idx; + + xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) { + if (bound_rxq == rxq) { + xa_erase(&binding->bound_rxqs, xa_idx); + if (xa_empty(&binding->bound_rxqs)) { + mutex_lock(&binding->lock); + binding->dev = NULL; + mutex_unlock(&binding->lock); + } + break; + } + } +} + +static const struct memory_provider_ops dmabuf_devmem_ops = { + .init = mp_dmabuf_devmem_init, + .destroy = mp_dmabuf_devmem_destroy, + .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, + .release_netmem = mp_dmabuf_devmem_release_page, + .nl_fill = mp_dmabuf_devmem_nl_fill, + .uninstall = mp_dmabuf_devmem_uninstall, +}; diff --git a/net/core/devmem.h b/net/core/devmem.h new file mode 100644 index 000000000000..0b43a648cd2e --- /dev/null +++ b/net/core/devmem.h @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Device memory TCP support + * + * Authors: Mina Almasry <almasrymina@google.com> + * Willem de Bruijn <willemb@google.com> + * Kaiyuan Zhang <kaiyuanz@google.com> + * + */ +#ifndef _NET_DEVMEM_H +#define _NET_DEVMEM_H + +#include <net/netmem.h> +#include <net/netdev_netlink.h> + +struct netlink_ext_ack; + +struct net_devmem_dmabuf_binding { + struct dma_buf *dmabuf; + struct dma_buf_attachment *attachment; + struct sg_table *sgt; + struct net_device *dev; + struct gen_pool *chunk_pool; + /* Protect dev */ + struct mutex lock; + + /* The user holds a ref (via the netlink API) for as long as they want + * the binding to remain alive. Each page pool using this binding holds + * a ref to keep the binding alive. The page_pool does not release the + * ref until all the net_iovs allocated from this binding are released + * back to the page_pool. + * + * The binding undos itself and unmaps the underlying dmabuf once all + * those refs are dropped and the binding is no longer desired or in + * use. + * + * net_devmem_get_net_iov() on dmabuf net_iovs will increment this + * reference, making sure that the binding remains alive until all the + * net_iovs are no longer used. net_iovs allocated from this binding + * that are stuck in the TX path for any reason (such as awaiting + * retransmits) hold a reference to the binding until the skb holding + * them is freed. + */ + refcount_t ref; + + /* The list of bindings currently active. Used for netlink to notify us + * of the user dropping the bind. + */ + struct list_head list; + + /* rxq's this binding is active on. */ + struct xarray bound_rxqs; + + /* ID of this binding. Globally unique to all bindings currently + * active. + */ + u32 id; + + /* DMA direction, FROM_DEVICE for Rx binding, TO_DEVICE for Tx. */ + enum dma_data_direction direction; + + /* Array of net_iov pointers for this binding, sorted by virtual + * address. This array is convenient to map the virtual addresses to + * net_iovs in the TX path. + */ + struct net_iov **tx_vec; + + struct work_struct unbind_w; +}; + +#if defined(CONFIG_NET_DEVMEM) +/* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist + * entry from the dmabuf is inserted into the genpool as a chunk, and needs + * this owner struct to keep track of some metadata necessary to create + * allocations from this chunk. + */ +struct dmabuf_genpool_chunk_owner { + struct net_iov_area area; + struct net_devmem_dmabuf_binding *binding; + + /* dma_addr of the start of the chunk. */ + dma_addr_t base_dma_addr; +}; + +void __net_devmem_dmabuf_binding_free(struct work_struct *wq); +struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netdev_nl_sock *priv, + struct netlink_ext_ack *extack); +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id); +void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); +int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack); + +static inline struct dmabuf_genpool_chunk_owner * +net_devmem_iov_to_chunk_owner(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct dmabuf_genpool_chunk_owner, area); +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_iov_binding(const struct net_iov *niov) +{ + return net_devmem_iov_to_chunk_owner(niov)->binding; +} + +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) +{ + return net_devmem_iov_binding(niov)->id; +} + +static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return owner->base_virtual + + ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); +} + +static inline bool +net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) +{ + return refcount_inc_not_zero(&binding->ref); +} + +static inline void +net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ + if (!refcount_dec_and_test(&binding->ref)) + return; + + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); +} + +void net_devmem_get_net_iov(struct net_iov *niov); +void net_devmem_put_net_iov(struct net_iov *niov); + +struct net_iov * +net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); +void net_devmem_free_dmabuf(struct net_iov *ppiov); + +bool net_is_devmem_iov(struct net_iov *niov); +struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size); + +#else +struct net_devmem_dmabuf_binding; + +static inline void +net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline void net_devmem_get_net_iov(struct net_iov *niov) +{ +} + +static inline void net_devmem_put_net_iov(struct net_iov *niov) +{ +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, + struct netdev_nl_sock *priv, + struct netlink_ext_ack *extack) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + return NULL; +} + +static inline void +net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline int +net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack) + +{ + return -EOPNOTSUPP; +} + +static inline struct net_iov * +net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + return NULL; +} + +static inline void net_devmem_free_dmabuf(struct net_iov *ppiov) +{ +} + +static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) +{ + return 0; +} + +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) +{ + return 0; +} + +static inline bool net_is_devmem_iov(struct net_iov *niov) +{ + return false; +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size) +{ + return NULL; +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_iov_binding(const struct net_iov *niov) +{ + return NULL; +} +#endif + +#endif /* _NET_DEVMEM_H */ diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 5e78d44333b9..60d31c2feed3 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Monitoring code for network dropped packet alerts * @@ -20,6 +21,7 @@ #include <linux/workqueue.h> #include <linux/netlink.h> #include <linux/net_dropmon.h> +#include <linux/bitfield.h> #include <linux/percpu.h> #include <linux/timer.h> #include <linux/bitops.h> @@ -27,11 +29,15 @@ #include <linux/module.h> #include <net/genetlink.h> #include <net/netevent.h> +#include <net/flow_offload.h> +#include <net/dropreason.h> +#include <net/devlink.h> #include <trace/events/skb.h> #include <trace/events/napi.h> +#include <trace/events/devlink.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #define TRACE_ON 1 #define TRACE_OFF 0 @@ -42,37 +48,87 @@ * netlink alerts */ static int trace_state = TRACE_OFF; -static DEFINE_MUTEX(trace_state_mutex); +static bool monitor_hw; + +/* net_dm_mutex + * + * An overall lock guarding every operation coming from userspace. + */ +static DEFINE_MUTEX(net_dm_mutex); + +struct net_dm_stats { + u64_stats_t dropped; + struct u64_stats_sync syncp; +}; + +#define NET_DM_MAX_HW_TRAP_NAME_LEN 40 + +struct net_dm_hw_entry { + char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN]; + u32 count; +}; + +struct net_dm_hw_entries { + u32 num_entries; + struct net_dm_hw_entry entries[]; +}; struct per_cpu_dm_data { - spinlock_t lock; - struct sk_buff *skb; + raw_spinlock_t lock; /* Protects 'skb', 'hw_entries' and + * 'send_timer' + */ + union { + struct sk_buff *skb; + struct net_dm_hw_entries *hw_entries; + }; + struct sk_buff_head drop_queue; struct work_struct dm_alert_work; struct timer_list send_timer; + struct net_dm_stats stats; }; struct dm_hw_stat_delta { - struct net_device *dev; unsigned long last_rx; - struct list_head list; - struct rcu_head rcu; unsigned long last_drop_val; + struct rcu_head rcu; }; -static struct genl_family net_drop_monitor_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = "NET_DM", - .version = 2, - .maxattr = NET_DM_CMD_MAX, -}; +static struct genl_family net_drop_monitor_family; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); +static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; -static LIST_HEAD(hw_stats_list); + +static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; +static u32 net_dm_trunc_len; +static u32 net_dm_queue_len = 1000; + +struct net_dm_alert_ops { + void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, + void *location, + enum skb_drop_reason reason, + struct sock *rx_sk); + void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, + int work, int budget); + void (*work_item_func)(struct work_struct *work); + void (*hw_work_item_func)(struct work_struct *work); + void (*hw_trap_probe)(void *ignore, const struct devlink *devlink, + struct sk_buff *skb, + const struct devlink_trap_metadata *metadata); +}; + +struct net_dm_skb_cb { + union { + struct devlink_trap_metadata *hw_metadata; + void *pc; + }; + enum skb_drop_reason reason; +}; + +#define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) { @@ -81,6 +137,7 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) struct nlattr *nla; struct sk_buff *skb; unsigned long flags; + void *msg_header; al = sizeof(struct net_dm_alert_msg); al += dm_hit_limit * sizeof(struct net_dm_drop_point); @@ -88,24 +145,48 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) skb = genlmsg_new(al, GFP_KERNEL); - if (skb) { - genlmsg_put(skb, 0, 0, &net_drop_monitor_family, - 0, NET_DM_CMD_ALERT); - nla = nla_reserve(skb, NLA_UNSPEC, - sizeof(struct net_dm_alert_msg)); - msg = nla_data(nla); - memset(msg, 0, al); - } else { - mod_timer(&data->send_timer, jiffies + HZ / 10); + if (!skb) + goto err; + + msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family, + 0, NET_DM_CMD_ALERT); + if (!msg_header) { + nlmsg_free(skb); + skb = NULL; + goto err; } + nla = nla_reserve(skb, NLA_UNSPEC, + sizeof(struct net_dm_alert_msg)); + if (!nla) { + nlmsg_free(skb); + skb = NULL; + goto err; + } + msg = nla_data(nla); + memset(msg, 0, al); + goto out; - spin_lock_irqsave(&data->lock, flags); +err: + mod_timer(&data->send_timer, jiffies + HZ / 10); +out: + raw_spin_lock_irqsave(&data->lock, flags); swap(data->skb, skb); - spin_unlock_irqrestore(&data->lock, flags); + raw_spin_unlock_irqrestore(&data->lock, flags); + + if (skb) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; + struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh); + + genlmsg_end(skb, genlmsg_data(gnlh)); + } return skb; } +static const struct genl_multicast_group dropmon_mcgrps[] = { + { .name = "events", .flags = GENL_MCAST_CAP_SYS_ADMIN, }, +}; + static void send_dm_alert(struct work_struct *work) { struct sk_buff *skb; @@ -116,7 +197,8 @@ static void send_dm_alert(struct work_struct *work) skb = reset_per_cpu_data(data); if (skb) - genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); + genlmsg_multicast(&net_drop_monitor_family, skb, 0, + 0, GFP_KERNEL); } /* @@ -124,9 +206,9 @@ static void send_dm_alert(struct work_struct *work) * in the event that more drops will arrive during the * hysteresis period. */ -static void sched_send_work(unsigned long _data) +static void sched_send_work(struct timer_list *t) { - struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data; + struct per_cpu_dm_data *data = timer_container_of(data, t, send_timer); schedule_work(&data->dm_alert_work); } @@ -134,6 +216,7 @@ static void sched_send_work(unsigned long _data) static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; + struct net_dm_drop_point *point; struct nlmsghdr *nlh; struct nlattr *nla; int i; @@ -142,8 +225,8 @@ static void trace_drop_common(struct sk_buff *skb, void *location) unsigned long flags; local_irq_save(flags); - data = &__get_cpu_var(dm_cpu_data); - spin_lock(&data->lock); + data = this_cpu_ptr(&dm_cpu_data); + raw_spin_lock(&data->lock); dskb = data->skb; if (!dskb) @@ -152,11 +235,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location) nlh = (struct nlmsghdr *)dskb->data; nla = genlmsg_data(nlmsg_data(nlh)); msg = nla_data(nla); + point = msg->points; for (i = 0; i < msg->entries; i++) { - if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) { - msg->points[i].count++; + if (!memcmp(&location, &point->pc, sizeof(void *))) { + point->count++; goto out; } + point++; } if (msg->entries == dm_hit_limit) goto out; @@ -165,8 +250,8 @@ static void trace_drop_common(struct sk_buff *skb, void *location) */ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point)); nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); - memcpy(msg->points[msg->entries].pc, &location, sizeof(void *)); - msg->points[msg->entries].count = 1; + memcpy(point->pc, &location, sizeof(void *)); + point->count = 1; msg->entries++; if (!timer_pending(&data->send_timer)) { @@ -175,86 +260,981 @@ static void trace_drop_common(struct sk_buff *skb, void *location) } out: - spin_unlock_irqrestore(&data->lock, flags); + raw_spin_unlock_irqrestore(&data->lock, flags); } -static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location) +static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, + void *location, + enum skb_drop_reason reason, + struct sock *rx_sk) { trace_drop_common(skb, location); } -static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi) +static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, + int work, int budget) { - struct dm_hw_stat_delta *new_stat; - + struct net_device *dev = napi->dev; + struct dm_hw_stat_delta *stat; /* * Don't check napi structures with no associated device */ - if (!napi->dev) + if (!dev) return; rcu_read_lock(); - list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + stat = rcu_dereference(dev->dm_private); + if (stat) { /* * only add a note to our monitor buffer if: - * 1) this is the dev we received on - * 2) its after the last_rx delta - * 3) our rx_dropped count has gone up + * 1) its after the last_rx delta + * 2) our rx_dropped count has gone up */ - if ((new_stat->dev == napi->dev) && - (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && - (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) && + (dev->stats.rx_dropped != stat->last_drop_val)) { trace_drop_common(NULL, NULL); - new_stat->last_drop_val = napi->dev->stats.rx_dropped; - new_stat->last_rx = jiffies; - break; + stat->last_drop_val = dev->stats.rx_dropped; + stat->last_rx = jiffies; } } rcu_read_unlock(); } -static int set_all_monitor_traces(int state) +static struct net_dm_hw_entries * +net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) { - int rc = 0; - struct dm_hw_stat_delta *new_stat = NULL; - struct dm_hw_stat_delta *temp; + struct net_dm_hw_entries *hw_entries; + unsigned long flags; - mutex_lock(&trace_state_mutex); + hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit), + GFP_KERNEL); + if (!hw_entries) { + /* If the memory allocation failed, we try to perform another + * allocation in 1/10 second. Otherwise, the probe function + * will constantly bail out. + */ + mod_timer(&hw_data->send_timer, jiffies + HZ / 10); + } - if (state == trace_state) { - rc = -EAGAIN; - goto out_unlock; + raw_spin_lock_irqsave(&hw_data->lock, flags); + swap(hw_data->hw_entries, hw_entries); + raw_spin_unlock_irqrestore(&hw_data->lock, flags); + + return hw_entries; +} + +static int net_dm_hw_entry_put(struct sk_buff *msg, + const struct net_dm_hw_entry *hw_entry) +{ + struct nlattr *attr; + + attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRY); + if (!attr) + return -EMSGSIZE; + + if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_entry->trap_name)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int net_dm_hw_entries_put(struct sk_buff *msg, + const struct net_dm_hw_entries *hw_entries) +{ + struct nlattr *attr; + int i; + + attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRIES); + if (!attr) + return -EMSGSIZE; + + for (i = 0; i < hw_entries->num_entries; i++) { + int rc; + + rc = net_dm_hw_entry_put(msg, &hw_entries->entries[i]); + if (rc) + goto nla_put_failure; } - switch (state) { - case TRACE_ON: - if (!try_module_get(THIS_MODULE)) { - rc = -ENODEV; - break; + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int +net_dm_hw_summary_report_fill(struct sk_buff *msg, + const struct net_dm_hw_entries *hw_entries) +{ + struct net_dm_alert_msg anc_hdr = { 0 }; + void *hdr; + int rc; + + hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, + NET_DM_CMD_ALERT); + if (!hdr) + return -EMSGSIZE; + + /* We need to put the ancillary header in order not to break user + * space. + */ + if (nla_put(msg, NLA_UNSPEC, sizeof(anc_hdr), &anc_hdr)) + goto nla_put_failure; + + rc = net_dm_hw_entries_put(msg, hw_entries); + if (rc) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void net_dm_hw_summary_work(struct work_struct *work) +{ + struct net_dm_hw_entries *hw_entries; + struct per_cpu_dm_data *hw_data; + struct sk_buff *msg; + int rc; + + hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); + if (!hw_entries) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + goto out; + + rc = net_dm_hw_summary_report_fill(msg, hw_entries); + if (rc) { + nlmsg_free(msg); + goto out; + } + + genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); + +out: + kfree(hw_entries); +} + +static void +net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, + struct sk_buff *skb, + const struct devlink_trap_metadata *metadata) +{ + struct net_dm_hw_entries *hw_entries; + struct net_dm_hw_entry *hw_entry; + struct per_cpu_dm_data *hw_data; + unsigned long flags; + int i; + + if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) + return; + + hw_data = this_cpu_ptr(&dm_hw_cpu_data); + raw_spin_lock_irqsave(&hw_data->lock, flags); + hw_entries = hw_data->hw_entries; + + if (!hw_entries) + goto out; + + for (i = 0; i < hw_entries->num_entries; i++) { + hw_entry = &hw_entries->entries[i]; + if (!strncmp(hw_entry->trap_name, metadata->trap_name, + NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) { + hw_entry->count++; + goto out; } + } + if (WARN_ON_ONCE(hw_entries->num_entries == dm_hit_limit)) + goto out; - rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL); - rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL); - break; + hw_entry = &hw_entries->entries[hw_entries->num_entries]; + strscpy(hw_entry->trap_name, metadata->trap_name, + NET_DM_MAX_HW_TRAP_NAME_LEN - 1); + hw_entry->count = 1; + hw_entries->num_entries++; - case TRACE_OFF: - rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL); - rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL); + if (!timer_pending(&hw_data->send_timer)) { + hw_data->send_timer.expires = jiffies + dm_delay * HZ; + add_timer(&hw_data->send_timer); + } - tracepoint_synchronize_unregister(); +out: + raw_spin_unlock_irqrestore(&hw_data->lock, flags); +} - /* - * Clean the device list - */ - list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { - if (new_stat->dev == NULL) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - } +static const struct net_dm_alert_ops net_dm_alert_summary_ops = { + .kfree_skb_probe = trace_kfree_skb_hit, + .napi_poll_probe = trace_napi_poll_hit, + .work_item_func = send_dm_alert, + .hw_work_item_func = net_dm_hw_summary_work, + .hw_trap_probe = net_dm_hw_trap_summary_probe, +}; + +static void net_dm_packet_trace_kfree_skb_hit(void *ignore, + struct sk_buff *skb, + void *location, + enum skb_drop_reason reason, + struct sock *rx_sk) +{ + ktime_t tstamp = ktime_get_real(); + struct per_cpu_dm_data *data; + struct net_dm_skb_cb *cb; + struct sk_buff *nskb; + unsigned long flags; + + if (!skb_mac_header_was_set(skb)) + return; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return; + + cb = NET_DM_SKB_CB(nskb); + cb->reason = reason; + cb->pc = location; + /* Override the timestamp because we care about the time when the + * packet was dropped. + */ + nskb->tstamp = tstamp; + + data = this_cpu_ptr(&dm_cpu_data); + + spin_lock_irqsave(&data->drop_queue.lock, flags); + if (skb_queue_len(&data->drop_queue) < net_dm_queue_len) + __skb_queue_tail(&data->drop_queue, nskb); + else + goto unlock_free; + spin_unlock_irqrestore(&data->drop_queue.lock, flags); + + schedule_work(&data->dm_alert_work); + + return; + +unlock_free: + spin_unlock_irqrestore(&data->drop_queue.lock, flags); + u64_stats_update_begin(&data->stats.syncp); + u64_stats_inc(&data->stats.dropped); + u64_stats_update_end(&data->stats.syncp); + consume_skb(nskb); +} + +static void net_dm_packet_trace_napi_poll_hit(void *ignore, + struct napi_struct *napi, + int work, int budget) +{ +} + +static size_t net_dm_in_port_size(void) +{ + /* NET_DM_ATTR_IN_PORT nest */ + return nla_total_size(0) + + /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */ + nla_total_size(sizeof(u32)) + + /* NET_DM_ATTR_PORT_NETDEV_NAME */ + nla_total_size(IFNAMSIZ + 1); +} + +#define NET_DM_MAX_SYMBOL_LEN 40 +#define NET_DM_MAX_REASON_LEN 50 + +static size_t net_dm_packet_report_size(size_t payload_len) +{ + size_t size; + + size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); + + return NLMSG_ALIGN(size) + + /* NET_DM_ATTR_ORIGIN */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_PC */ + nla_total_size(sizeof(u64)) + + /* NET_DM_ATTR_SYMBOL */ + nla_total_size(NET_DM_MAX_SYMBOL_LEN + 1) + + /* NET_DM_ATTR_IN_PORT */ + net_dm_in_port_size() + + /* NET_DM_ATTR_TIMESTAMP */ + nla_total_size(sizeof(u64)) + + /* NET_DM_ATTR_ORIG_LEN */ + nla_total_size(sizeof(u32)) + + /* NET_DM_ATTR_PROTO */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_REASON */ + nla_total_size(NET_DM_MAX_REASON_LEN + 1) + + /* NET_DM_ATTR_PAYLOAD */ + nla_total_size(payload_len); +} + +static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex, + const char *name) +{ + struct nlattr *attr; + + attr = nla_nest_start(msg, NET_DM_ATTR_IN_PORT); + if (!attr) + return -EMSGSIZE; + + if (ifindex && + nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex)) + goto nla_put_failure; + + if (name && nla_put_string(msg, NET_DM_ATTR_PORT_NETDEV_NAME, name)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, + size_t payload_len) +{ + struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb); + const struct drop_reason_list *list = NULL; + unsigned int subsys, subsys_reason; + char buf[NET_DM_MAX_SYMBOL_LEN]; + struct nlattr *attr; + void *hdr; + int rc; + + hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, + NET_DM_CMD_PACKET_ALERT); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc, + NET_DM_ATTR_PAD)) + goto nla_put_failure; + + rcu_read_lock(); + subsys = u32_get_bits(cb->reason, SKB_DROP_REASON_SUBSYS_MASK); + if (subsys < SKB_DROP_REASON_SUBSYS_NUM) + list = rcu_dereference(drop_reasons_by_subsys[subsys]); + subsys_reason = cb->reason & ~SKB_DROP_REASON_SUBSYS_MASK; + if (!list || + subsys_reason >= list->n_reasons || + !list->reasons[subsys_reason] || + strlen(list->reasons[subsys_reason]) > NET_DM_MAX_REASON_LEN) { + list = rcu_dereference(drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_CORE]); + subsys_reason = SKB_DROP_REASON_NOT_SPECIFIED; + } + if (nla_put_string(msg, NET_DM_ATTR_REASON, + list->reasons[subsys_reason])) { + rcu_read_unlock(); + goto nla_put_failure; + } + rcu_read_unlock(); + + snprintf(buf, sizeof(buf), "%pS", cb->pc); + if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) + goto nla_put_failure; + + rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif, NULL); + if (rc) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, + ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) + goto nla_put_failure; + + if (!payload_len) + goto out; + + if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) + goto nla_put_failure; + + attr = skb_put(msg, nla_total_size(payload_len)); + attr->nla_type = NET_DM_ATTR_PAYLOAD; + attr->nla_len = nla_attr_size(payload_len); + if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +#define NET_DM_MAX_PACKET_SIZE (0xffff - NLA_HDRLEN - NLA_ALIGNTO) + +static void net_dm_packet_report(struct sk_buff *skb) +{ + struct sk_buff *msg; + size_t payload_len; + int rc; + + /* Make sure we start copying the packet from the MAC header */ + if (skb->data > skb_mac_header(skb)) + skb_push(skb, skb->data - skb_mac_header(skb)); + else + skb_pull(skb, skb_mac_header(skb) - skb->data); + + /* Ensure packet fits inside a single netlink attribute */ + payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); + if (net_dm_trunc_len) + payload_len = min_t(size_t, net_dm_trunc_len, payload_len); + + msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL); + if (!msg) + goto out; + + rc = net_dm_packet_report_fill(msg, skb, payload_len); + if (rc) { + nlmsg_free(msg); + goto out; + } + + genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); + +out: + consume_skb(skb); +} + +static void net_dm_packet_work(struct work_struct *work) +{ + struct per_cpu_dm_data *data; + struct sk_buff_head list; + struct sk_buff *skb; + unsigned long flags; + + data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + __skb_queue_head_init(&list); + + spin_lock_irqsave(&data->drop_queue.lock, flags); + skb_queue_splice_tail_init(&data->drop_queue, &list); + spin_unlock_irqrestore(&data->drop_queue.lock, flags); + + while ((skb = __skb_dequeue(&list))) + net_dm_packet_report(skb); +} + +static size_t +net_dm_flow_action_cookie_size(const struct devlink_trap_metadata *hw_metadata) +{ + return hw_metadata->fa_cookie ? + nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0; +} + +static size_t +net_dm_hw_packet_report_size(size_t payload_len, + const struct devlink_trap_metadata *hw_metadata) +{ + size_t size; + + size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); + + return NLMSG_ALIGN(size) + + /* NET_DM_ATTR_ORIGIN */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_HW_TRAP_GROUP_NAME */ + nla_total_size(strlen(hw_metadata->trap_group_name) + 1) + + /* NET_DM_ATTR_HW_TRAP_NAME */ + nla_total_size(strlen(hw_metadata->trap_name) + 1) + + /* NET_DM_ATTR_IN_PORT */ + net_dm_in_port_size() + + /* NET_DM_ATTR_FLOW_ACTION_COOKIE */ + net_dm_flow_action_cookie_size(hw_metadata) + + /* NET_DM_ATTR_TIMESTAMP */ + nla_total_size(sizeof(u64)) + + /* NET_DM_ATTR_ORIG_LEN */ + nla_total_size(sizeof(u32)) + + /* NET_DM_ATTR_PROTO */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_PAYLOAD */ + nla_total_size(payload_len); +} + +static int net_dm_hw_packet_report_fill(struct sk_buff *msg, + struct sk_buff *skb, size_t payload_len) +{ + struct devlink_trap_metadata *hw_metadata; + struct nlattr *attr; + void *hdr; + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + + hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, + NET_DM_CMD_PACKET_ALERT); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_HW)) + goto nla_put_failure; + + if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_GROUP_NAME, + hw_metadata->trap_group_name)) + goto nla_put_failure; + + if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, + hw_metadata->trap_name)) + goto nla_put_failure; + + if (hw_metadata->input_dev) { + struct net_device *dev = hw_metadata->input_dev; + int rc; + + rc = net_dm_packet_report_in_port_put(msg, dev->ifindex, + dev->name); + if (rc) + goto nla_put_failure; + } + + if (hw_metadata->fa_cookie && + nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE, + hw_metadata->fa_cookie->cookie_len, + hw_metadata->fa_cookie->cookie)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, + ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) + goto nla_put_failure; + + if (!payload_len) + goto out; + + if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) + goto nla_put_failure; + + attr = skb_put(msg, nla_total_size(payload_len)); + attr->nla_type = NET_DM_ATTR_PAYLOAD; + attr->nla_len = nla_attr_size(payload_len); + if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static struct devlink_trap_metadata * +net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) +{ + const struct flow_action_cookie *fa_cookie; + struct devlink_trap_metadata *hw_metadata; + const char *trap_group_name; + const char *trap_name; + + hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC); + if (!hw_metadata) + return NULL; + + trap_group_name = kstrdup(metadata->trap_group_name, GFP_ATOMIC); + if (!trap_group_name) + goto free_hw_metadata; + hw_metadata->trap_group_name = trap_group_name; + + trap_name = kstrdup(metadata->trap_name, GFP_ATOMIC); + if (!trap_name) + goto free_trap_group; + hw_metadata->trap_name = trap_name; + + if (metadata->fa_cookie) { + size_t cookie_size = sizeof(*fa_cookie) + + metadata->fa_cookie->cookie_len; + + fa_cookie = kmemdup(metadata->fa_cookie, cookie_size, + GFP_ATOMIC); + if (!fa_cookie) + goto free_trap_name; + hw_metadata->fa_cookie = fa_cookie; + } + + hw_metadata->input_dev = metadata->input_dev; + netdev_hold(hw_metadata->input_dev, &hw_metadata->dev_tracker, + GFP_ATOMIC); + + return hw_metadata; + +free_trap_name: + kfree(trap_name); +free_trap_group: + kfree(trap_group_name); +free_hw_metadata: + kfree(hw_metadata); + return NULL; +} + +static void +net_dm_hw_metadata_free(struct devlink_trap_metadata *hw_metadata) +{ + netdev_put(hw_metadata->input_dev, &hw_metadata->dev_tracker); + kfree(hw_metadata->fa_cookie); + kfree(hw_metadata->trap_name); + kfree(hw_metadata->trap_group_name); + kfree(hw_metadata); +} + +static void net_dm_hw_packet_report(struct sk_buff *skb) +{ + struct devlink_trap_metadata *hw_metadata; + struct sk_buff *msg; + size_t payload_len; + int rc; + + if (skb->data > skb_mac_header(skb)) + skb_push(skb, skb->data - skb_mac_header(skb)); + else + skb_pull(skb, skb_mac_header(skb) - skb->data); + + payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); + if (net_dm_trunc_len) + payload_len = min_t(size_t, net_dm_trunc_len, payload_len); + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + msg = nlmsg_new(net_dm_hw_packet_report_size(payload_len, hw_metadata), + GFP_KERNEL); + if (!msg) + goto out; + + rc = net_dm_hw_packet_report_fill(msg, skb, payload_len); + if (rc) { + nlmsg_free(msg); + goto out; + } + + genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); + +out: + net_dm_hw_metadata_free(NET_DM_SKB_CB(skb)->hw_metadata); + consume_skb(skb); +} + +static void net_dm_hw_packet_work(struct work_struct *work) +{ + struct per_cpu_dm_data *hw_data; + struct sk_buff_head list; + struct sk_buff *skb; + unsigned long flags; + + hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + __skb_queue_head_init(&list); + + spin_lock_irqsave(&hw_data->drop_queue.lock, flags); + skb_queue_splice_tail_init(&hw_data->drop_queue, &list); + spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); + + while ((skb = __skb_dequeue(&list))) + net_dm_hw_packet_report(skb); +} + +static void +net_dm_hw_trap_packet_probe(void *ignore, const struct devlink *devlink, + struct sk_buff *skb, + const struct devlink_trap_metadata *metadata) +{ + struct devlink_trap_metadata *n_hw_metadata; + ktime_t tstamp = ktime_get_real(); + struct per_cpu_dm_data *hw_data; + struct sk_buff *nskb; + unsigned long flags; + + if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) + return; + + if (!skb_mac_header_was_set(skb)) + return; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return; + + n_hw_metadata = net_dm_hw_metadata_copy(metadata); + if (!n_hw_metadata) + goto free; + + NET_DM_SKB_CB(nskb)->hw_metadata = n_hw_metadata; + nskb->tstamp = tstamp; + + hw_data = this_cpu_ptr(&dm_hw_cpu_data); + + spin_lock_irqsave(&hw_data->drop_queue.lock, flags); + if (skb_queue_len(&hw_data->drop_queue) < net_dm_queue_len) + __skb_queue_tail(&hw_data->drop_queue, nskb); + else + goto unlock_free; + spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); + + schedule_work(&hw_data->dm_alert_work); + + return; + +unlock_free: + spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); + u64_stats_update_begin(&hw_data->stats.syncp); + u64_stats_inc(&hw_data->stats.dropped); + u64_stats_update_end(&hw_data->stats.syncp); + net_dm_hw_metadata_free(n_hw_metadata); +free: + consume_skb(nskb); +} + +static const struct net_dm_alert_ops net_dm_alert_packet_ops = { + .kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit, + .napi_poll_probe = net_dm_packet_trace_napi_poll_hit, + .work_item_func = net_dm_packet_work, + .hw_work_item_func = net_dm_hw_packet_work, + .hw_trap_probe = net_dm_hw_trap_packet_probe, +}; + +static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = { + [NET_DM_ALERT_MODE_SUMMARY] = &net_dm_alert_summary_ops, + [NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops, +}; + +#if IS_ENABLED(CONFIG_NET_DEVLINK) +static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) +{ + return register_trace_devlink_trap_report(ops->hw_trap_probe, NULL); +} + +static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) +{ + unregister_trace_devlink_trap_report(ops->hw_trap_probe, NULL); + tracepoint_synchronize_unregister(); +} +#else +static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) +{ + return -EOPNOTSUPP; +} + +static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) +{ +} +#endif + +static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack) +{ + const struct net_dm_alert_ops *ops; + int cpu, rc; + + if (monitor_hw) { + NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled"); + return -EAGAIN; + } + + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + if (!try_module_get(THIS_MODULE)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); + return -ENODEV; + } + + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct net_dm_hw_entries *hw_entries; + + INIT_WORK(&hw_data->dm_alert_work, ops->hw_work_item_func); + timer_setup(&hw_data->send_timer, sched_send_work, 0); + hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); + kfree(hw_entries); + } + + rc = net_dm_hw_probe_register(ops); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to devlink_trap_probe() tracepoint"); + goto err_module_put; + } + + monitor_hw = true; + + return 0; + +err_module_put: + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct sk_buff *skb; + + timer_delete_sync(&hw_data->send_timer); + cancel_work_sync(&hw_data->dm_alert_work); + while ((skb = __skb_dequeue(&hw_data->drop_queue))) { + struct devlink_trap_metadata *hw_metadata; + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + net_dm_hw_metadata_free(hw_metadata); + consume_skb(skb); + } + } + module_put(THIS_MODULE); + return rc; +} + +static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) +{ + const struct net_dm_alert_ops *ops; + int cpu; + + if (!monitor_hw) { + NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled"); + return; + } + + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + monitor_hw = false; + + net_dm_hw_probe_unregister(ops); + + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct sk_buff *skb; + + timer_delete_sync(&hw_data->send_timer); + cancel_work_sync(&hw_data->dm_alert_work); + while ((skb = __skb_dequeue(&hw_data->drop_queue))) { + struct devlink_trap_metadata *hw_metadata; + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + net_dm_hw_metadata_free(hw_metadata); + consume_skb(skb); } + } + + module_put(THIS_MODULE); +} + +static int net_dm_trace_on_set(struct netlink_ext_ack *extack) +{ + const struct net_dm_alert_ops *ops; + int cpu, rc; + + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + if (!try_module_get(THIS_MODULE)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); + return -ENODEV; + } + + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct sk_buff *skb; + + INIT_WORK(&data->dm_alert_work, ops->work_item_func); + timer_setup(&data->send_timer, sched_send_work, 0); + /* Allocate a new per-CPU skb for the summary alert message and + * free the old one which might contain stale data from + * previous tracing. + */ + skb = reset_per_cpu_data(data); + consume_skb(skb); + } + + rc = register_trace_kfree_skb(ops->kfree_skb_probe, NULL); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to kfree_skb() tracepoint"); + goto err_module_put; + } + + rc = register_trace_napi_poll(ops->napi_poll_probe, NULL); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to napi_poll() tracepoint"); + goto err_unregister_trace; + } + + return 0; + +err_unregister_trace: + unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); +err_module_put: + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct sk_buff *skb; + + timer_delete_sync(&data->send_timer); + cancel_work_sync(&data->dm_alert_work); + while ((skb = __skb_dequeue(&data->drop_queue))) + consume_skb(skb); + } + module_put(THIS_MODULE); + return rc; +} + +static void net_dm_trace_off_set(void) +{ + const struct net_dm_alert_ops *ops; + int cpu; + + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + unregister_trace_napi_poll(ops->napi_poll_probe, NULL); + unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); + + tracepoint_synchronize_unregister(); - module_put(THIS_MODULE); + /* Make sure we do not send notifications to user space after request + * to stop tracing returns. + */ + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct sk_buff *skb; + + timer_delete_sync(&data->send_timer); + cancel_work_sync(&data->dm_alert_work); + while ((skb = __skb_dequeue(&data->drop_queue))) + consume_skb(skb); + } + + module_put(THIS_MODULE); +} + +static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack) +{ + int rc = 0; + if (state == trace_state) { + NL_SET_ERR_MSG_MOD(extack, "Trace state already set to requested state"); + return -EAGAIN; + } + + switch (state) { + case TRACE_ON: + rc = net_dm_trace_on_set(extack); + break; + case TRACE_OFF: + net_dm_trace_off_set(); break; default: rc = 1; @@ -266,95 +1246,485 @@ static int set_all_monitor_traces(int state) else rc = -EINPROGRESS; -out_unlock: - mutex_unlock(&trace_state_mutex); - return rc; } +static bool net_dm_is_monitoring(void) +{ + return trace_state == TRACE_ON || monitor_hw; +} + +static int net_dm_alert_mode_get_from_info(struct genl_info *info, + enum net_dm_alert_mode *p_alert_mode) +{ + u8 val; + + val = nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]); + + switch (val) { + case NET_DM_ALERT_MODE_SUMMARY: + case NET_DM_ALERT_MODE_PACKET: + *p_alert_mode = val; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int net_dm_alert_mode_set(struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + enum net_dm_alert_mode alert_mode; + int rc; + + if (!info->attrs[NET_DM_ATTR_ALERT_MODE]) + return 0; + + rc = net_dm_alert_mode_get_from_info(info, &alert_mode); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Invalid alert mode"); + return -EINVAL; + } + + net_dm_alert_mode = alert_mode; + + return 0; +} + +static void net_dm_trunc_len_set(struct genl_info *info) +{ + if (!info->attrs[NET_DM_ATTR_TRUNC_LEN]) + return; + + net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]); +} + +static void net_dm_queue_len_set(struct genl_info *info) +{ + if (!info->attrs[NET_DM_ATTR_QUEUE_LEN]) + return; + + net_dm_queue_len = nla_get_u32(info->attrs[NET_DM_ATTR_QUEUE_LEN]); +} static int net_dm_cmd_config(struct sk_buff *skb, struct genl_info *info) { - return -ENOTSUPP; + struct netlink_ext_ack *extack = info->extack; + int rc; + + if (net_dm_is_monitoring()) { + NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor during monitoring"); + return -EBUSY; + } + + rc = net_dm_alert_mode_set(info); + if (rc) + return rc; + + net_dm_trunc_len_set(info); + + net_dm_queue_len_set(info); + + return 0; +} + +static int net_dm_monitor_start(bool set_sw, bool set_hw, + struct netlink_ext_ack *extack) +{ + bool sw_set = false; + int rc; + + if (set_sw) { + rc = set_all_monitor_traces(TRACE_ON, extack); + if (rc) + return rc; + sw_set = true; + } + + if (set_hw) { + rc = net_dm_hw_monitor_start(extack); + if (rc) + goto err_monitor_hw; + } + + return 0; + +err_monitor_hw: + if (sw_set) + set_all_monitor_traces(TRACE_OFF, extack); + return rc; +} + +static void net_dm_monitor_stop(bool set_sw, bool set_hw, + struct netlink_ext_ack *extack) +{ + if (set_hw) + net_dm_hw_monitor_stop(extack); + if (set_sw) + set_all_monitor_traces(TRACE_OFF, extack); } static int net_dm_cmd_trace(struct sk_buff *skb, struct genl_info *info) { + bool set_sw = !!info->attrs[NET_DM_ATTR_SW_DROPS]; + bool set_hw = !!info->attrs[NET_DM_ATTR_HW_DROPS]; + struct netlink_ext_ack *extack = info->extack; + + /* To maintain backward compatibility, we start / stop monitoring of + * software drops if no flag is specified. + */ + if (!set_sw && !set_hw) + set_sw = true; + switch (info->genlhdr->cmd) { case NET_DM_CMD_START: - return set_all_monitor_traces(TRACE_ON); - break; + return net_dm_monitor_start(set_sw, set_hw, extack); case NET_DM_CMD_STOP: - return set_all_monitor_traces(TRACE_OFF); - break; + net_dm_monitor_stop(set_sw, set_hw, extack); + return 0; } - return -ENOTSUPP; + return -EOPNOTSUPP; +} + +static int net_dm_config_fill(struct sk_buff *msg, struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &net_drop_monitor_family, 0, NET_DM_CMD_CONFIG_NEW); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u8(msg, NET_DM_ATTR_ALERT_MODE, net_dm_alert_mode)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_TRUNC_LEN, net_dm_trunc_len)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_QUEUE_LEN, net_dm_queue_len)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int net_dm_cmd_config_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + int rc; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rc = net_dm_config_fill(msg, info); + if (rc) + goto free_msg; + + return genlmsg_reply(msg, info); + +free_msg: + nlmsg_free(msg); + return rc; +} + +static void net_dm_stats_read(struct net_dm_stats *stats) +{ + int cpu; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct net_dm_stats *cpu_stats = &data->stats; + unsigned int start; + u64 dropped; + + do { + start = u64_stats_fetch_begin(&cpu_stats->syncp); + dropped = u64_stats_read(&cpu_stats->dropped); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); + + u64_stats_add(&stats->dropped, dropped); + } +} + +static int net_dm_stats_put(struct sk_buff *msg) +{ + struct net_dm_stats stats; + struct nlattr *attr; + + net_dm_stats_read(&stats); + + attr = nla_nest_start(msg, NET_DM_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, + u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static void net_dm_hw_stats_read(struct net_dm_stats *stats) +{ + int cpu; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct net_dm_stats *cpu_stats = &hw_data->stats; + unsigned int start; + u64 dropped; + + do { + start = u64_stats_fetch_begin(&cpu_stats->syncp); + dropped = u64_stats_read(&cpu_stats->dropped); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); + + u64_stats_add(&stats->dropped, dropped); + } +} + +static int net_dm_hw_stats_put(struct sk_buff *msg) +{ + struct net_dm_stats stats; + struct nlattr *attr; + + net_dm_hw_stats_read(&stats); + + attr = nla_nest_start(msg, NET_DM_ATTR_HW_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, + u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info) +{ + void *hdr; + int rc; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &net_drop_monitor_family, 0, NET_DM_CMD_STATS_NEW); + if (!hdr) + return -EMSGSIZE; + + rc = net_dm_stats_put(msg); + if (rc) + goto nla_put_failure; + + rc = net_dm_hw_stats_put(msg); + if (rc) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int net_dm_cmd_stats_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + int rc; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rc = net_dm_stats_fill(msg, info); + if (rc) + goto free_msg; + + return genlmsg_reply(msg, info); + +free_msg: + nlmsg_free(msg); + return rc; } static int dropmon_net_event(struct notifier_block *ev_block, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct dm_hw_stat_delta *new_stat = NULL; - struct dm_hw_stat_delta *tmp; + struct dm_hw_stat_delta *stat; switch (event) { case NETDEV_REGISTER: - new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private))) + break; + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (!stat) + break; - if (!new_stat) - goto out; + stat->last_rx = jiffies; + rcu_assign_pointer(dev->dm_private, stat); - new_stat->dev = dev; - new_stat->last_rx = jiffies; - mutex_lock(&trace_state_mutex); - list_add_rcu(&new_stat->list, &hw_stats_list); - mutex_unlock(&trace_state_mutex); break; case NETDEV_UNREGISTER: - mutex_lock(&trace_state_mutex); - list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { - if (new_stat->dev == dev) { - new_stat->dev = NULL; - if (trace_state == TRACE_OFF) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - break; - } - } + stat = rtnl_dereference(dev->dm_private); + if (stat) { + rcu_assign_pointer(dev->dm_private, NULL); + kfree_rcu(stat, rcu); } - mutex_unlock(&trace_state_mutex); break; } -out: return NOTIFY_DONE; } -static struct genl_ops dropmon_ops[] = { +static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = { + [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 }, + [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 }, + [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 }, + [NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 }, + [NET_DM_ATTR_SW_DROPS] = {. type = NLA_FLAG }, + [NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG }, +}; + +static const struct genl_small_ops dropmon_ops[] = { { .cmd = NET_DM_CMD_CONFIG, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_config, + .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_START, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, + .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_STOP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NET_DM_CMD_CONFIG_GET, + .doit = net_dm_cmd_config_get, + }, + { + .cmd = NET_DM_CMD_STATS_GET, + .doit = net_dm_cmd_stats_get, }, }; +static int net_dm_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + mutex_lock(&net_dm_mutex); + + return 0; +} + +static void net_dm_nl_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + mutex_unlock(&net_dm_mutex); +} + +static struct genl_family net_drop_monitor_family __ro_after_init = { + .hdrsize = 0, + .name = "NET_DM", + .version = 2, + .maxattr = NET_DM_ATTR_MAX, + .policy = net_dm_nl_policy, + .pre_doit = net_dm_nl_pre_doit, + .post_doit = net_dm_nl_post_doit, + .module = THIS_MODULE, + .small_ops = dropmon_ops, + .n_small_ops = ARRAY_SIZE(dropmon_ops), + .resv_start_op = NET_DM_CMD_STATS_GET + 1, + .mcgrps = dropmon_mcgrps, + .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), +}; + static struct notifier_block dropmon_net_notifier = { .notifier_call = dropmon_net_event }; -static int __init init_net_drop_monitor(void) +static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data) +{ + raw_spin_lock_init(&data->lock); + skb_queue_head_init(&data->drop_queue); + u64_stats_init(&data->stats.syncp); +} + +static void __net_dm_cpu_data_fini(struct per_cpu_dm_data *data) +{ + WARN_ON(!skb_queue_empty(&data->drop_queue)); +} + +static void net_dm_cpu_data_init(int cpu) { struct per_cpu_dm_data *data; + + data = &per_cpu(dm_cpu_data, cpu); + __net_dm_cpu_data_init(data); +} + +static void net_dm_cpu_data_fini(int cpu) +{ + struct per_cpu_dm_data *data; + + data = &per_cpu(dm_cpu_data, cpu); + /* At this point, we should have exclusive access + * to this struct and can free the skb inside it. + */ + consume_skb(data->skb); + __net_dm_cpu_data_fini(data); +} + +static void net_dm_hw_cpu_data_init(int cpu) +{ + struct per_cpu_dm_data *hw_data; + + hw_data = &per_cpu(dm_hw_cpu_data, cpu); + __net_dm_cpu_data_init(hw_data); +} + +static void net_dm_hw_cpu_data_fini(int cpu) +{ + struct per_cpu_dm_data *hw_data; + + hw_data = &per_cpu(dm_hw_cpu_data, cpu); + kfree(hw_data->hw_entries); + __net_dm_cpu_data_fini(hw_data); +} + +static int __init init_net_drop_monitor(void) +{ int cpu, rc; pr_info("Initializing network drop monitor service\n"); @@ -364,67 +1734,50 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family_with_ops(&net_drop_monitor_family, - dropmon_ops, - ARRAY_SIZE(dropmon_ops)); - if (rc) { - pr_err("Could not create drop monitor netlink family\n"); - return rc; + for_each_possible_cpu(cpu) { + net_dm_cpu_data_init(cpu); + net_dm_hw_cpu_data_init(cpu); } rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); - goto out_unreg; + return rc; } - rc = 0; - - for_each_possible_cpu(cpu) { - data = &per_cpu(dm_cpu_data, cpu); - INIT_WORK(&data->dm_alert_work, send_dm_alert); - init_timer(&data->send_timer); - data->send_timer.data = (unsigned long)data; - data->send_timer.function = sched_send_work; - spin_lock_init(&data->lock); - reset_per_cpu_data(data); + rc = genl_register_family(&net_drop_monitor_family); + if (rc) { + pr_err("Could not create drop monitor netlink family\n"); + goto out_unreg; } + WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); + rc = 0; goto out; out_unreg: - genl_unregister_family(&net_drop_monitor_family); + WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); out: return rc; } static void exit_net_drop_monitor(void) { - struct per_cpu_dm_data *data; int cpu; - BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); - /* * Because of the module_get/put we do in the trace state change path - * we are guarnateed not to have any current users when we get here - * all we need to do is make sure that we don't have any running timers - * or pending schedule calls + * we are guaranteed not to have any current users when we get here */ + BUG_ON(genl_unregister_family(&net_drop_monitor_family)); + + BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); for_each_possible_cpu(cpu) { - data = &per_cpu(dm_cpu_data, cpu); - del_timer_sync(&data->send_timer); - cancel_work_sync(&data->dm_alert_work); - /* - * At this point, we should have exclusive access - * to this struct and can free the skb inside it - */ - kfree_skb(data->skb); + net_dm_hw_cpu_data_fini(cpu); + net_dm_cpu_data_fini(cpu); } - - BUG_ON(genl_unregister_family(&net_drop_monitor_family)); } module_init(init_net_drop_monitor); @@ -433,3 +1786,4 @@ module_exit(exit_net_drop_monitor); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>"); MODULE_ALIAS_GENL_FAMILY("NET_DM"); +MODULE_DESCRIPTION("Monitoring code for network dropped packet alerts"); diff --git a/net/core/dst.c b/net/core/dst.c index ca4231ec7347..e9d35f49c9e7 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * net/core/dst.c Protocol independent destination cache. * @@ -20,171 +21,43 @@ #include <net/net_namespace.h> #include <linux/sched.h> #include <linux/prefetch.h> +#include <net/lwtunnel.h> +#include <net/xfrm.h> #include <net/dst.h> +#include <net/dst_metadata.h> -/* - * Theory of operations: - * 1) We use a list, protected by a spinlock, to add - * new entries from both BH and non-BH context. - * 2) In order to keep spinlock held for a small delay, - * we use a second list where are stored long lived - * entries, that are handled by the garbage collect thread - * fired by a workqueue. - * 3) This list is guarded by a mutex, - * so that the gc_task and dst_dev_event() can be synchronized. - */ - -/* - * We want to keep lock & list close together - * to dirty as few cache lines as possible in __dst_free(). - * As this is not a very strong hint, we dont force an alignment on SMP. - */ -static struct { - spinlock_t lock; - struct dst_entry *list; - unsigned long timer_inc; - unsigned long timer_expires; -} dst_garbage = { - .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), - .timer_inc = DST_GC_MAX, -}; -static void dst_gc_task(struct work_struct *work); -static void ___dst_free(struct dst_entry *dst); - -static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); - -static DEFINE_MUTEX(dst_gc_mutex); -/* - * long lived entries are maintained in this list, guarded by dst_gc_mutex - */ -static struct dst_entry *dst_busy_list; - -static void dst_gc_task(struct work_struct *work) -{ - int delayed = 0; - int work_performed = 0; - unsigned long expires = ~0L; - struct dst_entry *dst, *next, head; - struct dst_entry *last = &head; - - mutex_lock(&dst_gc_mutex); - next = dst_busy_list; - -loop: - while ((dst = next) != NULL) { - next = dst->next; - prefetch(&next->next); - cond_resched(); - if (likely(atomic_read(&dst->__refcnt))) { - last->next = dst; - last = dst; - delayed++; - continue; - } - work_performed++; - - dst = dst_destroy(dst); - if (dst) { - /* NOHASH and still referenced. Unless it is already - * on gc list, invalidate it and add to gc list. - * - * Note: this is temporary. Actually, NOHASH dst's - * must be obsoleted when parent is obsoleted. - * But we do not have state "obsoleted, but - * referenced by parent", so it is right. - */ - if (dst->obsolete > 0) - continue; - - ___dst_free(dst); - dst->next = next; - next = dst; - } - } - - spin_lock_bh(&dst_garbage.lock); - next = dst_garbage.list; - if (next) { - dst_garbage.list = NULL; - spin_unlock_bh(&dst_garbage.lock); - goto loop; - } - last->next = NULL; - dst_busy_list = head.next; - if (!dst_busy_list) - dst_garbage.timer_inc = DST_GC_MAX; - else { - /* - * if we freed less than 1/10 of delayed entries, - * we can sleep longer. - */ - if (work_performed <= delayed/10) { - dst_garbage.timer_expires += dst_garbage.timer_inc; - if (dst_garbage.timer_expires > DST_GC_MAX) - dst_garbage.timer_expires = DST_GC_MAX; - dst_garbage.timer_inc += DST_GC_INC; - } else { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - } - expires = dst_garbage.timer_expires; - /* - * if the next desired timer is more than 4 seconds in the - * future then round the timer to whole seconds - */ - if (expires > 4*HZ) - expires = round_jiffies_relative(expires); - schedule_delayed_work(&dst_gc_work, expires); - } - - spin_unlock_bh(&dst_garbage.lock); - mutex_unlock(&dst_gc_mutex); -} - -int dst_discard(struct sk_buff *skb) +int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } -EXPORT_SYMBOL(dst_discard); +EXPORT_SYMBOL(dst_discard_out); -const u32 dst_default_metrics[RTAX_MAX + 1] = { +const struct dst_metrics dst_default_metrics = { /* This initializer is needed to force linker to place this variable * into const section. Otherwise it might end into bss section. * We really want to avoid false sharing on this variable, and catch * any writes on it. */ - [RTAX_MAX] = 0xdeadbeef, + .refcnt = REFCOUNT_INIT(1), }; +EXPORT_SYMBOL(dst_default_metrics); - -void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, unsigned short flags) +void dst_init(struct dst_entry *dst, struct dst_ops *ops, + struct net_device *dev, int initial_obsolete, + unsigned short flags) { - struct dst_entry *dst; - - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) - return NULL; - } - dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); - if (!dst) - return NULL; - dst->child = NULL; dst->dev = dev; - if (dev) - dev_hold(dev); + netdev_hold(dev, &dst->dev_tracker, GFP_ATOMIC); dst->ops = ops; - dst_init_metrics(dst, dst_default_metrics, true); + dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; - dst->path = dst; - dst->from = NULL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif dst->input = dst_discard; - dst->output = dst_discard; + dst->output = dst_discard_out; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; @@ -192,118 +65,155 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = 0; #endif - atomic_set(&dst->__refcnt, initial_ref); + dst->lwtstate = NULL; + rcuref_init(&dst->__rcuref, 1); + INIT_LIST_HEAD(&dst->rt_uncached); dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; - dst->pending_confirm = 0; - dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); - return dst; } -EXPORT_SYMBOL(dst_alloc); +EXPORT_SYMBOL(dst_init); -static void ___dst_free(struct dst_entry *dst) +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, + int initial_obsolete, unsigned short flags) { - /* The first case (dev==NULL) is required, when - protocol module is unloaded. - */ - if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) - dst->input = dst->output = dst_discard; - dst->obsolete = DST_OBSOLETE_DEAD; + struct dst_entry *dst; + + if (ops->gc && + !(flags & DST_NOCOUNT) && + dst_entries_get_fast(ops) > ops->gc_thresh) + ops->gc(ops); + + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); + if (!dst) + return NULL; + + dst_init(dst, ops, dev, initial_obsolete, flags); + + return dst; } +EXPORT_SYMBOL(dst_alloc); -void __dst_free(struct dst_entry *dst) +static void dst_destroy(struct dst_entry *dst) { - spin_lock_bh(&dst_garbage.lock); - ___dst_free(dst); - dst->next = dst_garbage.list; - dst_garbage.list = dst; - if (dst_garbage.timer_inc > DST_GC_INC) { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - mod_delayed_work(system_wq, &dst_gc_work, - dst_garbage.timer_expires); + struct dst_entry *child = NULL; + + smp_rmb(); + +#ifdef CONFIG_XFRM + if (dst->xfrm) { + struct xfrm_dst *xdst = (struct xfrm_dst *) dst; + + child = xdst->child; } - spin_unlock_bh(&dst_garbage.lock); +#endif + if (dst->ops->destroy) + dst->ops->destroy(dst); + netdev_put(dst->dev, &dst->dev_tracker); + + lwtstate_put(dst->lwtstate); + + if (dst->flags & DST_METADATA) + metadata_dst_free((struct metadata_dst *)dst); + else + kmem_cache_free(dst->ops->kmem_cachep, dst); + + dst = child; + if (dst) + dst_release_immediate(dst); } -EXPORT_SYMBOL(__dst_free); -struct dst_entry *dst_destroy(struct dst_entry * dst) +static void dst_destroy_rcu(struct rcu_head *head) { - struct dst_entry *child; + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - smp_rmb(); + dst_destroy(dst); +} -again: - child = dst->child; +/* Operations to mark dst as DEAD and clean up the net device referenced + * by dst: + * 1. put the dst under blackhole interface and discard all tx/rx packets + * on this route. + * 2. release the net_device + * This function should be called when removing routes from the fib tree + * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to + * make the next dst_ops->check() fail. + */ +void dst_dev_put(struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + + WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD); + if (dst->ops->ifdown) + dst->ops->ifdown(dst, dev); + WRITE_ONCE(dst->input, dst_discard); + WRITE_ONCE(dst->output, dst_discard_out); + rcu_assign_pointer(dst->dev_rcu, blackhole_netdev); + netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, + GFP_ATOMIC); +} +EXPORT_SYMBOL(dst_dev_put); +static void dst_count_dec(struct dst_entry *dst) +{ if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); +} - if (dst->ops->destroy) - dst->ops->destroy(dst); - if (dst->dev) - dev_put(dst->dev); - kmem_cache_free(dst->ops->kmem_cachep, dst); +void dst_release(struct dst_entry *dst) +{ + if (dst && rcuref_put(&dst->__rcuref)) { +#ifdef CONFIG_DST_CACHE + if (dst->flags & DST_METADATA) { + struct metadata_dst *md_dst = (struct metadata_dst *)dst; - dst = child; - if (dst) { - int nohash = dst->flags & DST_NOHASH; - - if (atomic_dec_and_test(&dst->__refcnt)) { - /* We were real parent of this dst, so kill child. */ - if (nohash) - goto again; - } else { - /* Child is still referenced, return it for freeing. */ - if (nohash) - return dst; - /* Child is still in his hash table */ + if (md_dst->type == METADATA_IP_TUNNEL) + dst_cache_reset_now(&md_dst->u.tun_info.dst_cache); } +#endif + dst_count_dec(dst); + call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } - return NULL; } -EXPORT_SYMBOL(dst_destroy); +EXPORT_SYMBOL(dst_release); -void dst_release(struct dst_entry *dst) +void dst_release_immediate(struct dst_entry *dst) { - if (dst) { - int newrefcnt; - - newrefcnt = atomic_dec_return(&dst->__refcnt); - WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); - } + if (dst && rcuref_put(&dst->__rcuref)) { + dst_count_dec(dst); + dst_destroy(dst); } } -EXPORT_SYMBOL(dst_release); +EXPORT_SYMBOL(dst_release_immediate); u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { - u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); + struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); if (p) { - u32 *old_p = __DST_METRICS_PTR(old); + struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + refcount_set(&p->refcnt, 1); + memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { kfree(p); - p = __DST_METRICS_PTR(prev); + p = (struct dst_metrics *)__DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; + } else if (prev & DST_METRICS_REFCOUNTED) { + if (refcount_dec_and_test(&old_p->refcnt)) + kfree(old_p); } } - return p; + BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); + return (u32 *)p; } EXPORT_SYMBOL(dst_cow_metrics_generic); @@ -312,101 +222,134 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) { unsigned long prev, new; - new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; + new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; prev = cmpxchg(&dst->_metrics, old, new); if (prev == old) kfree(__DST_METRICS_PTR(old)); } EXPORT_SYMBOL(__dst_destroy_metrics_generic); -/** - * __skb_dst_set_noref - sets skb dst, without a reference - * @skb: buffer - * @dst: dst entry - * @force: if force is set, use noref version even for DST_NOCACHE entries - * - * Sets skb dst, assuming a reference was not taken on dst - * skb_dst_drop() should not dst_release() this dst - */ -void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force) +struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie) { - WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); - /* If dst not in cache, we must take a reference, because - * dst_release() will destroy dst as soon as its refcount becomes zero - */ - if (unlikely((dst->flags & DST_NOCACHE) && !force)) { - dst_hold(dst); - skb_dst_set(skb, dst); - } else { - skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; - } + return NULL; } -EXPORT_SYMBOL(__skb_dst_set_noref); -/* Dirty hack. We did it in 2.2 (in __dst_free), - * we have _very_ good reasons not to repeat - * this mistake in 2.3, but we have no choice - * now. _It_ _is_ _explicit_ _deliberate_ - * _race_ _condition_. - * - * Commented and originally written by Alexey. - */ -static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) +u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old) { - if (dst->ops->ifdown) - dst->ops->ifdown(dst, dev, unregister); - - if (dev != dst->dev) - return; + return NULL; +} - if (!unregister) { - dst->input = dst->output = dst_discard; - } else { - dst->dev = dev_net(dst->dev)->loopback_dev; - dev_hold(dst->dev); - dev_put(dev); - } +struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + return NULL; } -static int dst_dev_event(struct notifier_block *this, unsigned long event, - void *ptr) +void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct dst_entry *dst, *last = NULL; - - switch (event) { - case NETDEV_UNREGISTER_FINAL: - case NETDEV_DOWN: - mutex_lock(&dst_gc_mutex); - for (dst = dst_busy_list; dst; dst = dst->next) { - last = dst; - dst_ifdown(dst, dev, event != NETDEV_DOWN); - } +} +EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu); - spin_lock_bh(&dst_garbage.lock); - dst = dst_garbage.list; - dst_garbage.list = NULL; - spin_unlock_bh(&dst_garbage.lock); - - if (last) - last->next = dst; - else - dst_busy_list = dst; - for (; dst; dst = dst->next) - dst_ifdown(dst, dev, event != NETDEV_DOWN); - mutex_unlock(&dst_gc_mutex); - break; - } - return NOTIFY_DONE; +void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ } +EXPORT_SYMBOL_GPL(dst_blackhole_redirect); + +unsigned int dst_blackhole_mtu(const struct dst_entry *dst) +{ + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); -static struct notifier_block dst_dev_notifier = { - .notifier_call = dst_dev_event, - .priority = -10, /* must be called after other network notifiers */ + return mtu ? : dst_dev(dst)->mtu; +} +EXPORT_SYMBOL_GPL(dst_blackhole_mtu); + +static struct dst_ops dst_blackhole_ops = { + .family = AF_UNSPEC, + .neigh_lookup = dst_blackhole_neigh_lookup, + .check = dst_blackhole_check, + .cow_metrics = dst_blackhole_cow_metrics, + .update_pmtu = dst_blackhole_update_pmtu, + .redirect = dst_blackhole_redirect, + .mtu = dst_blackhole_mtu, }; -void __init dst_init(void) +static void __metadata_dst_init(struct metadata_dst *md_dst, + enum metadata_type type, u8 optslen) +{ + struct dst_entry *dst; + + dst = &md_dst->dst; + dst_init(dst, &dst_blackhole_ops, NULL, DST_OBSOLETE_NONE, + DST_METADATA | DST_NOCOUNT); + memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); + md_dst->type = type; +} + +struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, + gfp_t flags) +{ + struct metadata_dst *md_dst; + + md_dst = kmalloc(struct_size(md_dst, u.tun_info.options, optslen), + flags); + if (!md_dst) + return NULL; + + __metadata_dst_init(md_dst, type, optslen); + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc); + +void metadata_dst_free(struct metadata_dst *md_dst) { - register_netdevice_notifier(&dst_dev_notifier); +#ifdef CONFIG_DST_CACHE + if (md_dst->type == METADATA_IP_TUNNEL) + dst_cache_destroy(&md_dst->u.tun_info.dst_cache); +#endif + if (md_dst->type == METADATA_XFRM) + dst_release(md_dst->u.xfrm_info.dst_orig); + kfree(md_dst); +} +EXPORT_SYMBOL_GPL(metadata_dst_free); + +struct metadata_dst __percpu * +metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) +{ + int cpu; + struct metadata_dst __percpu *md_dst; + + md_dst = __alloc_percpu_gfp(struct_size(md_dst, u.tun_info.options, + optslen), + __alignof__(struct metadata_dst), flags); + if (!md_dst) + return NULL; + + for_each_possible_cpu(cpu) + __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen); + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); + +void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); + +#ifdef CONFIG_DST_CACHE + if (one_md_dst->type == METADATA_IP_TUNNEL) + dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); +#endif + if (one_md_dst->type == METADATA_XFRM) + dst_release(one_md_dst->u.xfrm_info.dst_orig); + } + free_percpu(md_dst); } +EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c new file mode 100644 index 000000000000..9ab4902324e1 --- /dev/null +++ b/net/core/dst_cache.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * net/core/dst_cache.c - dst entry cache + * + * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <net/dst_cache.h> +#include <net/route.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ip6_fib.h> +#endif +#include <uapi/linux/in.h> + +struct dst_cache_pcpu { + unsigned long refresh_ts; + struct dst_entry *dst; + local_lock_t bh_lock; + u32 cookie; + union { + struct in_addr in_saddr; + struct in6_addr in6_saddr; + }; +}; + +static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, + struct dst_entry *dst, u32 cookie) +{ + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + dst_release(dst_cache->dst); + if (dst) + dst_hold(dst); + + dst_cache->cookie = cookie; + dst_cache->dst = dst; +} + +static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, + struct dst_cache_pcpu *idst) +{ + struct dst_entry *dst; + + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + dst = idst->dst; + if (!dst) + goto fail; + + /* the cache already hold a dst reference; it can't go away */ + dst_hold(dst); + + if (unlikely(!time_after(idst->refresh_ts, + READ_ONCE(dst_cache->reset_ts)) || + (READ_ONCE(dst->obsolete) && !dst->ops->check(dst, idst->cookie)))) { + dst_cache_per_cpu_dst_set(idst, NULL, 0); + dst_release(dst); + goto fail; + } + return dst; + +fail: + idst->refresh_ts = jiffies; + return NULL; +} + +struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) +{ + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + local_lock_nested_bh(&dst_cache->cache->bh_lock); + dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst; +} +EXPORT_SYMBOL_GPL(dst_cache_get); + +struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return NULL; + } + + *saddr = idst->in_saddr.s_addr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst_rtable(dst); +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip4); + +void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(idst, dst, 0); + idst->in_saddr.s_addr = saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip4); + +#if IS_ENABLED(CONFIG_IPV6) +void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *saddr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + local_lock_nested_bh(&dst_cache->cache->bh_lock); + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(idst, dst, + rt6_get_cookie(dst_rt6_info(dst))); + idst->in6_saddr = *saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip6); + +struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, + struct in6_addr *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + local_lock_nested_bh(&dst_cache->cache->bh_lock); + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return NULL; + } + + *saddr = idst->in6_saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst; +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip6); +#endif + +int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) +{ + unsigned int i; + + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, + gfp | __GFP_ZERO); + if (!dst_cache->cache) + return -ENOMEM; + for_each_possible_cpu(i) + local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock); + + dst_cache_reset(dst_cache); + return 0; +} +EXPORT_SYMBOL_GPL(dst_cache_init); + +void dst_cache_destroy(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + for_each_possible_cpu(i) + dst_release(per_cpu_ptr(dst_cache->cache, i)->dst); + + free_percpu(dst_cache->cache); +} +EXPORT_SYMBOL_GPL(dst_cache_destroy); + +void dst_cache_reset_now(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + dst_cache_reset(dst_cache); + for_each_possible_cpu(i) { + struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i); + struct dst_entry *dst = idst->dst; + + idst->cookie = 0; + idst->dst = NULL; + dst_release(dst); + } +} +EXPORT_SYMBOL_GPL(dst_cache_reset_now); diff --git a/net/core/ethtool.c b/net/core/ethtool.c deleted file mode 100644 index ab5fa6336c84..000000000000 --- a/net/core/ethtool.c +++ /dev/null @@ -1,1676 +0,0 @@ -/* - * net/core/ethtool.c - Ethtool ioctl handler - * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx> - * - * This file is where we call all the ethtool_ops commands to get - * the information ethtool needs. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/ethtool.h> -#include <linux/netdevice.h> -#include <linux/net_tstamp.h> -#include <linux/phy.h> -#include <linux/bitops.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> -#include <linux/rtnetlink.h> -#include <linux/sched.h> - -/* - * Some useful ethtool_ops methods that're device independent. - * If we find that all drivers want to do the same thing here, - * we can turn these into dev_() function calls. - */ - -u32 ethtool_op_get_link(struct net_device *dev) -{ - return netif_carrier_ok(dev) ? 1 : 0; -} -EXPORT_SYMBOL(ethtool_op_get_link); - -int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) -{ - info->so_timestamping = - SOF_TIMESTAMPING_TX_SOFTWARE | - SOF_TIMESTAMPING_RX_SOFTWARE | - SOF_TIMESTAMPING_SOFTWARE; - info->phc_index = -1; - return 0; -} -EXPORT_SYMBOL(ethtool_op_get_ts_info); - -/* Handlers for each ethtool command */ - -#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32) - -static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { - [NETIF_F_SG_BIT] = "tx-scatter-gather", - [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", - [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", - [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", - [NETIF_F_HIGHDMA_BIT] = "highdma", - [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", - [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert", - - [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse", - [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter", - [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert", - [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse", - [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", - [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", - [NETIF_F_GSO_BIT] = "tx-generic-segmentation", - [NETIF_F_LLTX_BIT] = "tx-lockless", - [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", - [NETIF_F_GRO_BIT] = "rx-gro", - [NETIF_F_LRO_BIT] = "rx-lro", - - [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", - [NETIF_F_UFO_BIT] = "tx-udp-fragmentation", - [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", - [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", - [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", - [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", - [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", - [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", - [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", - - [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", - [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", - [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", - [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", - [NETIF_F_RXHASH_BIT] = "rx-hashing", - [NETIF_F_RXCSUM_BIT] = "rx-checksum", - [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", - [NETIF_F_LOOPBACK_BIT] = "loopback", - [NETIF_F_RXFCS_BIT] = "rx-fcs", - [NETIF_F_RXALL_BIT] = "rx-all", -}; - -static int ethtool_get_features(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_gfeatures cmd = { - .cmd = ETHTOOL_GFEATURES, - .size = ETHTOOL_DEV_FEATURE_WORDS, - }; - struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; - u32 __user *sizeaddr; - u32 copy_size; - int i; - - /* in case feature bits run out again */ - BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t)); - - for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { - features[i].available = (u32)(dev->hw_features >> (32 * i)); - features[i].requested = (u32)(dev->wanted_features >> (32 * i)); - features[i].active = (u32)(dev->features >> (32 * i)); - features[i].never_changed = - (u32)(NETIF_F_NEVER_CHANGE >> (32 * i)); - } - - sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); - if (get_user(copy_size, sizeaddr)) - return -EFAULT; - - if (copy_size > ETHTOOL_DEV_FEATURE_WORDS) - copy_size = ETHTOOL_DEV_FEATURE_WORDS; - - if (copy_to_user(useraddr, &cmd, sizeof(cmd))) - return -EFAULT; - useraddr += sizeof(cmd); - if (copy_to_user(useraddr, features, copy_size * sizeof(*features))) - return -EFAULT; - - return 0; -} - -static int ethtool_set_features(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_sfeatures cmd; - struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; - netdev_features_t wanted = 0, valid = 0; - int i, ret = 0; - - if (copy_from_user(&cmd, useraddr, sizeof(cmd))) - return -EFAULT; - useraddr += sizeof(cmd); - - if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS) - return -EINVAL; - - if (copy_from_user(features, useraddr, sizeof(features))) - return -EFAULT; - - for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { - valid |= (netdev_features_t)features[i].valid << (32 * i); - wanted |= (netdev_features_t)features[i].requested << (32 * i); - } - - if (valid & ~NETIF_F_ETHTOOL_BITS) - return -EINVAL; - - if (valid & ~dev->hw_features) { - valid &= dev->hw_features; - ret |= ETHTOOL_F_UNSUPPORTED; - } - - dev->wanted_features &= ~valid; - dev->wanted_features |= wanted & valid; - __netdev_update_features(dev); - - if ((dev->wanted_features ^ dev->features) & valid) - ret |= ETHTOOL_F_WISH; - - return ret; -} - -static int __ethtool_get_sset_count(struct net_device *dev, int sset) -{ - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (sset == ETH_SS_FEATURES) - return ARRAY_SIZE(netdev_features_strings); - - if (ops->get_sset_count && ops->get_strings) - return ops->get_sset_count(dev, sset); - else - return -EOPNOTSUPP; -} - -static void __ethtool_get_strings(struct net_device *dev, - u32 stringset, u8 *data) -{ - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (stringset == ETH_SS_FEATURES) - memcpy(data, netdev_features_strings, - sizeof(netdev_features_strings)); - else - /* ops->get_strings is valid because checked earlier */ - ops->get_strings(dev, stringset, data); -} - -static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) -{ - /* feature masks of legacy discrete ethtool ops */ - - switch (eth_cmd) { - case ETHTOOL_GTXCSUM: - case ETHTOOL_STXCSUM: - return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM; - case ETHTOOL_GRXCSUM: - case ETHTOOL_SRXCSUM: - return NETIF_F_RXCSUM; - case ETHTOOL_GSG: - case ETHTOOL_SSG: - return NETIF_F_SG; - case ETHTOOL_GTSO: - case ETHTOOL_STSO: - return NETIF_F_ALL_TSO; - case ETHTOOL_GUFO: - case ETHTOOL_SUFO: - return NETIF_F_UFO; - case ETHTOOL_GGSO: - case ETHTOOL_SGSO: - return NETIF_F_GSO; - case ETHTOOL_GGRO: - case ETHTOOL_SGRO: - return NETIF_F_GRO; - default: - BUG(); - } -} - -static int ethtool_get_one_feature(struct net_device *dev, - char __user *useraddr, u32 ethcmd) -{ - netdev_features_t mask = ethtool_get_feature_mask(ethcmd); - struct ethtool_value edata = { - .cmd = ethcmd, - .data = !!(dev->features & mask), - }; - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - -static int ethtool_set_one_feature(struct net_device *dev, - void __user *useraddr, u32 ethcmd) -{ - struct ethtool_value edata; - netdev_features_t mask; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - mask = ethtool_get_feature_mask(ethcmd); - mask &= dev->hw_features; - if (!mask) - return -EOPNOTSUPP; - - if (edata.data) - dev->wanted_features |= mask; - else - dev->wanted_features &= ~mask; - - __netdev_update_features(dev); - - return 0; -} - -#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ - ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) -#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ - NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ - NETIF_F_RXHASH) - -static u32 __ethtool_get_flags(struct net_device *dev) -{ - u32 flags = 0; - - if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO; - if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) flags |= ETH_FLAG_RXVLAN; - if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) flags |= ETH_FLAG_TXVLAN; - if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE; - if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH; - - return flags; -} - -static int __ethtool_set_flags(struct net_device *dev, u32 data) -{ - netdev_features_t features = 0, changed; - - if (data & ~ETH_ALL_FLAGS) - return -EINVAL; - - if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO; - if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_CTAG_RX; - if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_CTAG_TX; - if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE; - if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH; - - /* allow changing only bits set in hw_features */ - changed = (features ^ dev->features) & ETH_ALL_FEATURES; - if (changed & ~dev->hw_features) - return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; - - dev->wanted_features = - (dev->wanted_features & ~changed) | (features & changed); - - __netdev_update_features(dev); - - return 0; -} - -int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) -{ - ASSERT_RTNL(); - - if (!dev->ethtool_ops->get_settings) - return -EOPNOTSUPP; - - memset(cmd, 0, sizeof(struct ethtool_cmd)); - cmd->cmd = ETHTOOL_GSET; - return dev->ethtool_ops->get_settings(dev, cmd); -} -EXPORT_SYMBOL(__ethtool_get_settings); - -static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) -{ - int err; - struct ethtool_cmd cmd; - - err = __ethtool_get_settings(dev, &cmd); - if (err < 0) - return err; - - if (copy_to_user(useraddr, &cmd, sizeof(cmd))) - return -EFAULT; - return 0; -} - -static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_cmd cmd; - - if (!dev->ethtool_ops->set_settings) - return -EOPNOTSUPP; - - if (copy_from_user(&cmd, useraddr, sizeof(cmd))) - return -EFAULT; - - return dev->ethtool_ops->set_settings(dev, &cmd); -} - -static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_drvinfo info; - const struct ethtool_ops *ops = dev->ethtool_ops; - - memset(&info, 0, sizeof(info)); - info.cmd = ETHTOOL_GDRVINFO; - if (ops->get_drvinfo) { - ops->get_drvinfo(dev, &info); - } else if (dev->dev.parent && dev->dev.parent->driver) { - strlcpy(info.bus_info, dev_name(dev->dev.parent), - sizeof(info.bus_info)); - strlcpy(info.driver, dev->dev.parent->driver->name, - sizeof(info.driver)); - } else { - return -EOPNOTSUPP; - } - - /* - * this method of obtaining string set info is deprecated; - * Use ETHTOOL_GSSET_INFO instead. - */ - if (ops->get_sset_count) { - int rc; - - rc = ops->get_sset_count(dev, ETH_SS_TEST); - if (rc >= 0) - info.testinfo_len = rc; - rc = ops->get_sset_count(dev, ETH_SS_STATS); - if (rc >= 0) - info.n_stats = rc; - rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); - if (rc >= 0) - info.n_priv_flags = rc; - } - if (ops->get_regs_len) - info.regdump_len = ops->get_regs_len(dev); - if (ops->get_eeprom_len) - info.eedump_len = ops->get_eeprom_len(dev); - - if (copy_to_user(useraddr, &info, sizeof(info))) - return -EFAULT; - return 0; -} - -static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_sset_info info; - u64 sset_mask; - int i, idx = 0, n_bits = 0, ret, rc; - u32 *info_buf = NULL; - - if (copy_from_user(&info, useraddr, sizeof(info))) - return -EFAULT; - - /* store copy of mask, because we zero struct later on */ - sset_mask = info.sset_mask; - if (!sset_mask) - return 0; - - /* calculate size of return buffer */ - n_bits = hweight64(sset_mask); - - memset(&info, 0, sizeof(info)); - info.cmd = ETHTOOL_GSSET_INFO; - - info_buf = kzalloc(n_bits * sizeof(u32), GFP_USER); - if (!info_buf) - return -ENOMEM; - - /* - * fill return buffer based on input bitmask and successful - * get_sset_count return - */ - for (i = 0; i < 64; i++) { - if (!(sset_mask & (1ULL << i))) - continue; - - rc = __ethtool_get_sset_count(dev, i); - if (rc >= 0) { - info.sset_mask |= (1ULL << i); - info_buf[idx++] = rc; - } - } - - ret = -EFAULT; - if (copy_to_user(useraddr, &info, sizeof(info))) - goto out; - - useraddr += offsetof(struct ethtool_sset_info, data); - if (copy_to_user(useraddr, info_buf, idx * sizeof(u32))) - goto out; - - ret = 0; - -out: - kfree(info_buf); - return ret; -} - -static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, - u32 cmd, void __user *useraddr) -{ - struct ethtool_rxnfc info; - size_t info_size = sizeof(info); - int rc; - - if (!dev->ethtool_ops->set_rxnfc) - return -EOPNOTSUPP; - - /* struct ethtool_rxnfc was originally defined for - * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data - * members. User-space might still be using that - * definition. */ - if (cmd == ETHTOOL_SRXFH) - info_size = (offsetof(struct ethtool_rxnfc, data) + - sizeof(info.data)); - - if (copy_from_user(&info, useraddr, info_size)) - return -EFAULT; - - rc = dev->ethtool_ops->set_rxnfc(dev, &info); - if (rc) - return rc; - - if (cmd == ETHTOOL_SRXCLSRLINS && - copy_to_user(useraddr, &info, info_size)) - return -EFAULT; - - return 0; -} - -static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, - u32 cmd, void __user *useraddr) -{ - struct ethtool_rxnfc info; - size_t info_size = sizeof(info); - const struct ethtool_ops *ops = dev->ethtool_ops; - int ret; - void *rule_buf = NULL; - - if (!ops->get_rxnfc) - return -EOPNOTSUPP; - - /* struct ethtool_rxnfc was originally defined for - * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data - * members. User-space might still be using that - * definition. */ - if (cmd == ETHTOOL_GRXFH) - info_size = (offsetof(struct ethtool_rxnfc, data) + - sizeof(info.data)); - - if (copy_from_user(&info, useraddr, info_size)) - return -EFAULT; - - if (info.cmd == ETHTOOL_GRXCLSRLALL) { - if (info.rule_cnt > 0) { - if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) - rule_buf = kzalloc(info.rule_cnt * sizeof(u32), - GFP_USER); - if (!rule_buf) - return -ENOMEM; - } - } - - ret = ops->get_rxnfc(dev, &info, rule_buf); - if (ret < 0) - goto err_out; - - ret = -EFAULT; - if (copy_to_user(useraddr, &info, info_size)) - goto err_out; - - if (rule_buf) { - useraddr += offsetof(struct ethtool_rxnfc, rule_locs); - if (copy_to_user(useraddr, rule_buf, - info.rule_cnt * sizeof(u32))) - goto err_out; - } - ret = 0; - -err_out: - kfree(rule_buf); - - return ret; -} - -static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, - void __user *useraddr) -{ - u32 user_size, dev_size; - u32 *indir; - int ret; - - if (!dev->ethtool_ops->get_rxfh_indir_size || - !dev->ethtool_ops->get_rxfh_indir) - return -EOPNOTSUPP; - dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); - if (dev_size == 0) - return -EOPNOTSUPP; - - if (copy_from_user(&user_size, - useraddr + offsetof(struct ethtool_rxfh_indir, size), - sizeof(user_size))) - return -EFAULT; - - if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size), - &dev_size, sizeof(dev_size))) - return -EFAULT; - - /* If the user buffer size is 0, this is just a query for the - * device table size. Otherwise, if it's smaller than the - * device table size it's an error. - */ - if (user_size < dev_size) - return user_size == 0 ? 0 : -EINVAL; - - indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); - if (!indir) - return -ENOMEM; - - ret = dev->ethtool_ops->get_rxfh_indir(dev, indir); - if (ret) - goto out; - - if (copy_to_user(useraddr + - offsetof(struct ethtool_rxfh_indir, ring_index[0]), - indir, dev_size * sizeof(indir[0]))) - ret = -EFAULT; - -out: - kfree(indir); - return ret; -} - -static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_rxnfc rx_rings; - u32 user_size, dev_size, i; - u32 *indir; - const struct ethtool_ops *ops = dev->ethtool_ops; - int ret; - - if (!ops->get_rxfh_indir_size || !ops->set_rxfh_indir || - !ops->get_rxnfc) - return -EOPNOTSUPP; - - dev_size = ops->get_rxfh_indir_size(dev); - if (dev_size == 0) - return -EOPNOTSUPP; - - if (copy_from_user(&user_size, - useraddr + offsetof(struct ethtool_rxfh_indir, size), - sizeof(user_size))) - return -EFAULT; - - if (user_size != 0 && user_size != dev_size) - return -EINVAL; - - indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); - if (!indir) - return -ENOMEM; - - rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = ops->get_rxnfc(dev, &rx_rings, NULL); - if (ret) - goto out; - - if (user_size == 0) { - for (i = 0; i < dev_size; i++) - indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); - } else { - if (copy_from_user(indir, - useraddr + - offsetof(struct ethtool_rxfh_indir, - ring_index[0]), - dev_size * sizeof(indir[0]))) { - ret = -EFAULT; - goto out; - } - - /* Validate ring indices */ - for (i = 0; i < dev_size; i++) { - if (indir[i] >= rx_rings.data) { - ret = -EINVAL; - goto out; - } - } - } - - ret = ops->set_rxfh_indir(dev, indir); - -out: - kfree(indir); - return ret; -} - -static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_regs regs; - const struct ethtool_ops *ops = dev->ethtool_ops; - void *regbuf; - int reglen, ret; - - if (!ops->get_regs || !ops->get_regs_len) - return -EOPNOTSUPP; - - if (copy_from_user(®s, useraddr, sizeof(regs))) - return -EFAULT; - - reglen = ops->get_regs_len(dev); - if (regs.len > reglen) - regs.len = reglen; - - regbuf = vzalloc(reglen); - if (reglen && !regbuf) - return -ENOMEM; - - ops->get_regs(dev, ®s, regbuf); - - ret = -EFAULT; - if (copy_to_user(useraddr, ®s, sizeof(regs))) - goto out; - useraddr += offsetof(struct ethtool_regs, data); - if (regbuf && copy_to_user(useraddr, regbuf, regs.len)) - goto out; - ret = 0; - - out: - vfree(regbuf); - return ret; -} - -static int ethtool_reset(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value reset; - int ret; - - if (!dev->ethtool_ops->reset) - return -EOPNOTSUPP; - - if (copy_from_user(&reset, useraddr, sizeof(reset))) - return -EFAULT; - - ret = dev->ethtool_ops->reset(dev, &reset.data); - if (ret) - return ret; - - if (copy_to_user(useraddr, &reset, sizeof(reset))) - return -EFAULT; - return 0; -} - -static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; - - if (!dev->ethtool_ops->get_wol) - return -EOPNOTSUPP; - - dev->ethtool_ops->get_wol(dev, &wol); - - if (copy_to_user(useraddr, &wol, sizeof(wol))) - return -EFAULT; - return 0; -} - -static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_wolinfo wol; - - if (!dev->ethtool_ops->set_wol) - return -EOPNOTSUPP; - - if (copy_from_user(&wol, useraddr, sizeof(wol))) - return -EFAULT; - - return dev->ethtool_ops->set_wol(dev, &wol); -} - -static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_eee edata; - int rc; - - if (!dev->ethtool_ops->get_eee) - return -EOPNOTSUPP; - - memset(&edata, 0, sizeof(struct ethtool_eee)); - edata.cmd = ETHTOOL_GEEE; - rc = dev->ethtool_ops->get_eee(dev, &edata); - - if (rc) - return rc; - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - - return 0; -} - -static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_eee edata; - - if (!dev->ethtool_ops->set_eee) - return -EOPNOTSUPP; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - return dev->ethtool_ops->set_eee(dev, &edata); -} - -static int ethtool_nway_reset(struct net_device *dev) -{ - if (!dev->ethtool_ops->nway_reset) - return -EOPNOTSUPP; - - return dev->ethtool_ops->nway_reset(dev); -} - -static int ethtool_get_link(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; - - if (!dev->ethtool_ops->get_link) - return -EOPNOTSUPP; - - edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev); - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - -static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr, - int (*getter)(struct net_device *, - struct ethtool_eeprom *, u8 *), - u32 total_len) -{ - struct ethtool_eeprom eeprom; - void __user *userbuf = useraddr + sizeof(eeprom); - u32 bytes_remaining; - u8 *data; - int ret = 0; - - if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) - return -EFAULT; - - /* Check for wrap and zero */ - if (eeprom.offset + eeprom.len <= eeprom.offset) - return -EINVAL; - - /* Check for exceeding total eeprom len */ - if (eeprom.offset + eeprom.len > total_len) - return -EINVAL; - - data = kmalloc(PAGE_SIZE, GFP_USER); - if (!data) - return -ENOMEM; - - bytes_remaining = eeprom.len; - while (bytes_remaining > 0) { - eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); - - ret = getter(dev, &eeprom, data); - if (ret) - break; - if (copy_to_user(userbuf, data, eeprom.len)) { - ret = -EFAULT; - break; - } - userbuf += eeprom.len; - eeprom.offset += eeprom.len; - bytes_remaining -= eeprom.len; - } - - eeprom.len = userbuf - (useraddr + sizeof(eeprom)); - eeprom.offset -= eeprom.len; - if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) - ret = -EFAULT; - - kfree(data); - return ret; -} - -static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) -{ - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops->get_eeprom || !ops->get_eeprom_len) - return -EOPNOTSUPP; - - return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom, - ops->get_eeprom_len(dev)); -} - -static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_eeprom eeprom; - const struct ethtool_ops *ops = dev->ethtool_ops; - void __user *userbuf = useraddr + sizeof(eeprom); - u32 bytes_remaining; - u8 *data; - int ret = 0; - - if (!ops->set_eeprom || !ops->get_eeprom_len) - return -EOPNOTSUPP; - - if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) - return -EFAULT; - - /* Check for wrap and zero */ - if (eeprom.offset + eeprom.len <= eeprom.offset) - return -EINVAL; - - /* Check for exceeding total eeprom len */ - if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) - return -EINVAL; - - data = kmalloc(PAGE_SIZE, GFP_USER); - if (!data) - return -ENOMEM; - - bytes_remaining = eeprom.len; - while (bytes_remaining > 0) { - eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); - - if (copy_from_user(data, userbuf, eeprom.len)) { - ret = -EFAULT; - break; - } - ret = ops->set_eeprom(dev, &eeprom, data); - if (ret) - break; - userbuf += eeprom.len; - eeprom.offset += eeprom.len; - bytes_remaining -= eeprom.len; - } - - kfree(data); - return ret; -} - -static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; - - if (!dev->ethtool_ops->get_coalesce) - return -EOPNOTSUPP; - - dev->ethtool_ops->get_coalesce(dev, &coalesce); - - if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) - return -EFAULT; - return 0; -} - -static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_coalesce coalesce; - - if (!dev->ethtool_ops->set_coalesce) - return -EOPNOTSUPP; - - if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) - return -EFAULT; - - return dev->ethtool_ops->set_coalesce(dev, &coalesce); -} - -static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM }; - - if (!dev->ethtool_ops->get_ringparam) - return -EOPNOTSUPP; - - dev->ethtool_ops->get_ringparam(dev, &ringparam); - - if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) - return -EFAULT; - return 0; -} - -static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_ringparam ringparam; - - if (!dev->ethtool_ops->set_ringparam) - return -EOPNOTSUPP; - - if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) - return -EFAULT; - - return dev->ethtool_ops->set_ringparam(dev, &ringparam); -} - -static noinline_for_stack int ethtool_get_channels(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; - - if (!dev->ethtool_ops->get_channels) - return -EOPNOTSUPP; - - dev->ethtool_ops->get_channels(dev, &channels); - - if (copy_to_user(useraddr, &channels, sizeof(channels))) - return -EFAULT; - return 0; -} - -static noinline_for_stack int ethtool_set_channels(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_channels channels; - - if (!dev->ethtool_ops->set_channels) - return -EOPNOTSUPP; - - if (copy_from_user(&channels, useraddr, sizeof(channels))) - return -EFAULT; - - return dev->ethtool_ops->set_channels(dev, &channels); -} - -static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; - - if (!dev->ethtool_ops->get_pauseparam) - return -EOPNOTSUPP; - - dev->ethtool_ops->get_pauseparam(dev, &pauseparam); - - if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) - return -EFAULT; - return 0; -} - -static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_pauseparam pauseparam; - - if (!dev->ethtool_ops->set_pauseparam) - return -EOPNOTSUPP; - - if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) - return -EFAULT; - - return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); -} - -static int ethtool_self_test(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_test test; - const struct ethtool_ops *ops = dev->ethtool_ops; - u64 *data; - int ret, test_len; - - if (!ops->self_test || !ops->get_sset_count) - return -EOPNOTSUPP; - - test_len = ops->get_sset_count(dev, ETH_SS_TEST); - if (test_len < 0) - return test_len; - WARN_ON(test_len == 0); - - if (copy_from_user(&test, useraddr, sizeof(test))) - return -EFAULT; - - test.len = test_len; - data = kmalloc(test_len * sizeof(u64), GFP_USER); - if (!data) - return -ENOMEM; - - ops->self_test(dev, &test, data); - - ret = -EFAULT; - if (copy_to_user(useraddr, &test, sizeof(test))) - goto out; - useraddr += sizeof(test); - if (copy_to_user(useraddr, data, test.len * sizeof(u64))) - goto out; - ret = 0; - - out: - kfree(data); - return ret; -} - -static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_gstrings gstrings; - u8 *data; - int ret; - - if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) - return -EFAULT; - - ret = __ethtool_get_sset_count(dev, gstrings.string_set); - if (ret < 0) - return ret; - - gstrings.len = ret; - - data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); - if (!data) - return -ENOMEM; - - __ethtool_get_strings(dev, gstrings.string_set, data); - - ret = -EFAULT; - if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) - goto out; - useraddr += sizeof(gstrings); - if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) - goto out; - ret = 0; - -out: - kfree(data); - return ret; -} - -static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_value id; - static bool busy; - const struct ethtool_ops *ops = dev->ethtool_ops; - int rc; - - if (!ops->set_phys_id) - return -EOPNOTSUPP; - - if (busy) - return -EBUSY; - - if (copy_from_user(&id, useraddr, sizeof(id))) - return -EFAULT; - - rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); - if (rc < 0) - return rc; - - /* Drop the RTNL lock while waiting, but prevent reentry or - * removal of the device. - */ - busy = true; - dev_hold(dev); - rtnl_unlock(); - - if (rc == 0) { - /* Driver will handle this itself */ - schedule_timeout_interruptible( - id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT); - } else { - /* Driver expects to be called at twice the frequency in rc */ - int n = rc * 2, i, interval = HZ / n; - - /* Count down seconds */ - do { - /* Count down iterations per second */ - i = n; - do { - rtnl_lock(); - rc = ops->set_phys_id(dev, - (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON); - rtnl_unlock(); - if (rc) - break; - schedule_timeout_interruptible(interval); - } while (!signal_pending(current) && --i != 0); - } while (!signal_pending(current) && - (id.data == 0 || --id.data != 0)); - } - - rtnl_lock(); - dev_put(dev); - busy = false; - - (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); - return rc; -} - -static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_stats stats; - const struct ethtool_ops *ops = dev->ethtool_ops; - u64 *data; - int ret, n_stats; - - if (!ops->get_ethtool_stats || !ops->get_sset_count) - return -EOPNOTSUPP; - - n_stats = ops->get_sset_count(dev, ETH_SS_STATS); - if (n_stats < 0) - return n_stats; - WARN_ON(n_stats == 0); - - if (copy_from_user(&stats, useraddr, sizeof(stats))) - return -EFAULT; - - stats.n_stats = n_stats; - data = kmalloc(n_stats * sizeof(u64), GFP_USER); - if (!data) - return -ENOMEM; - - ops->get_ethtool_stats(dev, &stats, data); - - ret = -EFAULT; - if (copy_to_user(useraddr, &stats, sizeof(stats))) - goto out; - useraddr += sizeof(stats); - if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) - goto out; - ret = 0; - - out: - kfree(data); - return ret; -} - -static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_perm_addr epaddr; - - if (copy_from_user(&epaddr, useraddr, sizeof(epaddr))) - return -EFAULT; - - if (epaddr.size < dev->addr_len) - return -ETOOSMALL; - epaddr.size = dev->addr_len; - - if (copy_to_user(useraddr, &epaddr, sizeof(epaddr))) - return -EFAULT; - useraddr += sizeof(epaddr); - if (copy_to_user(useraddr, dev->perm_addr, epaddr.size)) - return -EFAULT; - return 0; -} - -static int ethtool_get_value(struct net_device *dev, char __user *useraddr, - u32 cmd, u32 (*actor)(struct net_device *)) -{ - struct ethtool_value edata = { .cmd = cmd }; - - if (!actor) - return -EOPNOTSUPP; - - edata.data = actor(dev); - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - -static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr, - void (*actor)(struct net_device *, u32)) -{ - struct ethtool_value edata; - - if (!actor) - return -EOPNOTSUPP; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - actor(dev, edata.data); - return 0; -} - -static int ethtool_set_value(struct net_device *dev, char __user *useraddr, - int (*actor)(struct net_device *, u32)) -{ - struct ethtool_value edata; - - if (!actor) - return -EOPNOTSUPP; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - return actor(dev, edata.data); -} - -static noinline_for_stack int ethtool_flash_device(struct net_device *dev, - char __user *useraddr) -{ - struct ethtool_flash efl; - - if (copy_from_user(&efl, useraddr, sizeof(efl))) - return -EFAULT; - - if (!dev->ethtool_ops->flash_device) - return -EOPNOTSUPP; - - efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; - - return dev->ethtool_ops->flash_device(dev, &efl); -} - -static int ethtool_set_dump(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_dump dump; - - if (!dev->ethtool_ops->set_dump) - return -EOPNOTSUPP; - - if (copy_from_user(&dump, useraddr, sizeof(dump))) - return -EFAULT; - - return dev->ethtool_ops->set_dump(dev, &dump); -} - -static int ethtool_get_dump_flag(struct net_device *dev, - void __user *useraddr) -{ - int ret; - struct ethtool_dump dump; - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops->get_dump_flag) - return -EOPNOTSUPP; - - if (copy_from_user(&dump, useraddr, sizeof(dump))) - return -EFAULT; - - ret = ops->get_dump_flag(dev, &dump); - if (ret) - return ret; - - if (copy_to_user(useraddr, &dump, sizeof(dump))) - return -EFAULT; - return 0; -} - -static int ethtool_get_dump_data(struct net_device *dev, - void __user *useraddr) -{ - int ret; - __u32 len; - struct ethtool_dump dump, tmp; - const struct ethtool_ops *ops = dev->ethtool_ops; - void *data = NULL; - - if (!ops->get_dump_data || !ops->get_dump_flag) - return -EOPNOTSUPP; - - if (copy_from_user(&dump, useraddr, sizeof(dump))) - return -EFAULT; - - memset(&tmp, 0, sizeof(tmp)); - tmp.cmd = ETHTOOL_GET_DUMP_FLAG; - ret = ops->get_dump_flag(dev, &tmp); - if (ret) - return ret; - - len = min(tmp.len, dump.len); - if (!len) - return -EFAULT; - - /* Don't ever let the driver think there's more space available - * than it requested with .get_dump_flag(). - */ - dump.len = len; - - /* Always allocate enough space to hold the whole thing so that the - * driver does not need to check the length and bother with partial - * dumping. - */ - data = vzalloc(tmp.len); - if (!data) - return -ENOMEM; - ret = ops->get_dump_data(dev, &dump, data); - if (ret) - goto out; - - /* There are two sane possibilities: - * 1. The driver's .get_dump_data() does not touch dump.len. - * 2. Or it may set dump.len to how much it really writes, which - * should be tmp.len (or len if it can do a partial dump). - * In any case respond to userspace with the actual length of data - * it's receiving. - */ - WARN_ON(dump.len != len && dump.len != tmp.len); - dump.len = len; - - if (copy_to_user(useraddr, &dump, sizeof(dump))) { - ret = -EFAULT; - goto out; - } - useraddr += offsetof(struct ethtool_dump, data); - if (copy_to_user(useraddr, data, len)) - ret = -EFAULT; -out: - vfree(data); - return ret; -} - -static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) -{ - int err = 0; - struct ethtool_ts_info info; - const struct ethtool_ops *ops = dev->ethtool_ops; - struct phy_device *phydev = dev->phydev; - - memset(&info, 0, sizeof(info)); - info.cmd = ETHTOOL_GET_TS_INFO; - - if (phydev && phydev->drv && phydev->drv->ts_info) { - err = phydev->drv->ts_info(phydev, &info); - } else if (ops->get_ts_info) { - err = ops->get_ts_info(dev, &info); - } else { - info.so_timestamping = - SOF_TIMESTAMPING_RX_SOFTWARE | - SOF_TIMESTAMPING_SOFTWARE; - info.phc_index = -1; - } - - if (err) - return err; - - if (copy_to_user(useraddr, &info, sizeof(info))) - err = -EFAULT; - - return err; -} - -static int ethtool_get_module_info(struct net_device *dev, - void __user *useraddr) -{ - int ret; - struct ethtool_modinfo modinfo; - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops->get_module_info) - return -EOPNOTSUPP; - - if (copy_from_user(&modinfo, useraddr, sizeof(modinfo))) - return -EFAULT; - - ret = ops->get_module_info(dev, &modinfo); - if (ret) - return ret; - - if (copy_to_user(useraddr, &modinfo, sizeof(modinfo))) - return -EFAULT; - - return 0; -} - -static int ethtool_get_module_eeprom(struct net_device *dev, - void __user *useraddr) -{ - int ret; - struct ethtool_modinfo modinfo; - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops->get_module_info || !ops->get_module_eeprom) - return -EOPNOTSUPP; - - ret = ops->get_module_info(dev, &modinfo); - if (ret) - return ret; - - return ethtool_get_any_eeprom(dev, useraddr, ops->get_module_eeprom, - modinfo.eeprom_len); -} - -/* The main entry point in this file. Called from net/core/dev_ioctl.c */ - -int dev_ethtool(struct net *net, struct ifreq *ifr) -{ - struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); - void __user *useraddr = ifr->ifr_data; - u32 ethcmd; - int rc; - netdev_features_t old_features; - - if (!dev || !netif_device_present(dev)) - return -ENODEV; - - if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) - return -EFAULT; - - /* Allow some commands to be done by anyone */ - switch (ethcmd) { - case ETHTOOL_GSET: - case ETHTOOL_GDRVINFO: - case ETHTOOL_GMSGLVL: - case ETHTOOL_GLINK: - case ETHTOOL_GCOALESCE: - case ETHTOOL_GRINGPARAM: - case ETHTOOL_GPAUSEPARAM: - case ETHTOOL_GRXCSUM: - case ETHTOOL_GTXCSUM: - case ETHTOOL_GSG: - case ETHTOOL_GSSET_INFO: - case ETHTOOL_GSTRINGS: - case ETHTOOL_GSTATS: - case ETHTOOL_GTSO: - case ETHTOOL_GPERMADDR: - case ETHTOOL_GUFO: - case ETHTOOL_GGSO: - case ETHTOOL_GGRO: - case ETHTOOL_GFLAGS: - case ETHTOOL_GPFLAGS: - case ETHTOOL_GRXFH: - case ETHTOOL_GRXRINGS: - case ETHTOOL_GRXCLSRLCNT: - case ETHTOOL_GRXCLSRULE: - case ETHTOOL_GRXCLSRLALL: - case ETHTOOL_GRXFHINDIR: - case ETHTOOL_GFEATURES: - case ETHTOOL_GCHANNELS: - case ETHTOOL_GET_TS_INFO: - case ETHTOOL_GEEE: - break; - default: - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - } - - if (dev->ethtool_ops->begin) { - rc = dev->ethtool_ops->begin(dev); - if (rc < 0) - return rc; - } - old_features = dev->features; - - switch (ethcmd) { - case ETHTOOL_GSET: - rc = ethtool_get_settings(dev, useraddr); - break; - case ETHTOOL_SSET: - rc = ethtool_set_settings(dev, useraddr); - break; - case ETHTOOL_GDRVINFO: - rc = ethtool_get_drvinfo(dev, useraddr); - break; - case ETHTOOL_GREGS: - rc = ethtool_get_regs(dev, useraddr); - break; - case ETHTOOL_GWOL: - rc = ethtool_get_wol(dev, useraddr); - break; - case ETHTOOL_SWOL: - rc = ethtool_set_wol(dev, useraddr); - break; - case ETHTOOL_GMSGLVL: - rc = ethtool_get_value(dev, useraddr, ethcmd, - dev->ethtool_ops->get_msglevel); - break; - case ETHTOOL_SMSGLVL: - rc = ethtool_set_value_void(dev, useraddr, - dev->ethtool_ops->set_msglevel); - break; - case ETHTOOL_GEEE: - rc = ethtool_get_eee(dev, useraddr); - break; - case ETHTOOL_SEEE: - rc = ethtool_set_eee(dev, useraddr); - break; - case ETHTOOL_NWAY_RST: - rc = ethtool_nway_reset(dev); - break; - case ETHTOOL_GLINK: - rc = ethtool_get_link(dev, useraddr); - break; - case ETHTOOL_GEEPROM: - rc = ethtool_get_eeprom(dev, useraddr); - break; - case ETHTOOL_SEEPROM: - rc = ethtool_set_eeprom(dev, useraddr); - break; - case ETHTOOL_GCOALESCE: - rc = ethtool_get_coalesce(dev, useraddr); - break; - case ETHTOOL_SCOALESCE: - rc = ethtool_set_coalesce(dev, useraddr); - break; - case ETHTOOL_GRINGPARAM: - rc = ethtool_get_ringparam(dev, useraddr); - break; - case ETHTOOL_SRINGPARAM: - rc = ethtool_set_ringparam(dev, useraddr); - break; - case ETHTOOL_GPAUSEPARAM: - rc = ethtool_get_pauseparam(dev, useraddr); - break; - case ETHTOOL_SPAUSEPARAM: - rc = ethtool_set_pauseparam(dev, useraddr); - break; - case ETHTOOL_TEST: - rc = ethtool_self_test(dev, useraddr); - break; - case ETHTOOL_GSTRINGS: - rc = ethtool_get_strings(dev, useraddr); - break; - case ETHTOOL_PHYS_ID: - rc = ethtool_phys_id(dev, useraddr); - break; - case ETHTOOL_GSTATS: - rc = ethtool_get_stats(dev, useraddr); - break; - case ETHTOOL_GPERMADDR: - rc = ethtool_get_perm_addr(dev, useraddr); - break; - case ETHTOOL_GFLAGS: - rc = ethtool_get_value(dev, useraddr, ethcmd, - __ethtool_get_flags); - break; - case ETHTOOL_SFLAGS: - rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); - break; - case ETHTOOL_GPFLAGS: - rc = ethtool_get_value(dev, useraddr, ethcmd, - dev->ethtool_ops->get_priv_flags); - break; - case ETHTOOL_SPFLAGS: - rc = ethtool_set_value(dev, useraddr, - dev->ethtool_ops->set_priv_flags); - break; - case ETHTOOL_GRXFH: - case ETHTOOL_GRXRINGS: - case ETHTOOL_GRXCLSRLCNT: - case ETHTOOL_GRXCLSRULE: - case ETHTOOL_GRXCLSRLALL: - rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); - break; - case ETHTOOL_SRXFH: - case ETHTOOL_SRXCLSRLDEL: - case ETHTOOL_SRXCLSRLINS: - rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); - break; - case ETHTOOL_FLASHDEV: - rc = ethtool_flash_device(dev, useraddr); - break; - case ETHTOOL_RESET: - rc = ethtool_reset(dev, useraddr); - break; - case ETHTOOL_GSSET_INFO: - rc = ethtool_get_sset_info(dev, useraddr); - break; - case ETHTOOL_GRXFHINDIR: - rc = ethtool_get_rxfh_indir(dev, useraddr); - break; - case ETHTOOL_SRXFHINDIR: - rc = ethtool_set_rxfh_indir(dev, useraddr); - break; - case ETHTOOL_GFEATURES: - rc = ethtool_get_features(dev, useraddr); - break; - case ETHTOOL_SFEATURES: - rc = ethtool_set_features(dev, useraddr); - break; - case ETHTOOL_GTXCSUM: - case ETHTOOL_GRXCSUM: - case ETHTOOL_GSG: - case ETHTOOL_GTSO: - case ETHTOOL_GUFO: - case ETHTOOL_GGSO: - case ETHTOOL_GGRO: - rc = ethtool_get_one_feature(dev, useraddr, ethcmd); - break; - case ETHTOOL_STXCSUM: - case ETHTOOL_SRXCSUM: - case ETHTOOL_SSG: - case ETHTOOL_STSO: - case ETHTOOL_SUFO: - case ETHTOOL_SGSO: - case ETHTOOL_SGRO: - rc = ethtool_set_one_feature(dev, useraddr, ethcmd); - break; - case ETHTOOL_GCHANNELS: - rc = ethtool_get_channels(dev, useraddr); - break; - case ETHTOOL_SCHANNELS: - rc = ethtool_set_channels(dev, useraddr); - break; - case ETHTOOL_SET_DUMP: - rc = ethtool_set_dump(dev, useraddr); - break; - case ETHTOOL_GET_DUMP_FLAG: - rc = ethtool_get_dump_flag(dev, useraddr); - break; - case ETHTOOL_GET_DUMP_DATA: - rc = ethtool_get_dump_data(dev, useraddr); - break; - case ETHTOOL_GET_TS_INFO: - rc = ethtool_get_ts_info(dev, useraddr); - break; - case ETHTOOL_GMODULEINFO: - rc = ethtool_get_module_info(dev, useraddr); - break; - case ETHTOOL_GMODULEEEPROM: - rc = ethtool_get_module_eeprom(dev, useraddr); - break; - default: - rc = -EOPNOTSUPP; - } - - if (dev->ethtool_ops->complete) - dev->ethtool_ops->complete(dev); - - if (old_features != dev->features) - netdev_features_change(dev); - - return rc; -} diff --git a/net/core/failover.c b/net/core/failover.c new file mode 100644 index 000000000000..2a140b3ea669 --- /dev/null +++ b/net/core/failover.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018, Intel Corporation. */ + +/* A common module to handle registrations and notifications for paravirtual + * drivers to enable accelerated datapath and support VF live migration. + * + * The notifier and event handling code is based on netvsc driver. + */ + +#include <linux/module.h> +#include <linux/etherdevice.h> +#include <uapi/linux/if_arp.h> +#include <linux/rtnetlink.h> +#include <linux/if_vlan.h> +#include <net/failover.h> + +static LIST_HEAD(failover_list); +static DEFINE_SPINLOCK(failover_lock); + +static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) +{ + struct net_device *failover_dev; + struct failover *failover; + + spin_lock(&failover_lock); + list_for_each_entry(failover, &failover_list, list) { + failover_dev = rtnl_dereference(failover->failover_dev); + if (ether_addr_equal(failover_dev->perm_addr, mac)) { + *ops = rtnl_dereference(failover->ops); + spin_unlock(&failover_lock); + return failover_dev; + } + } + spin_unlock(&failover_lock); + return NULL; +} + +/** + * failover_slave_register - Register a slave netdev + * + * @slave_dev: slave netdev that is being registered + * + * Registers a slave device to a failover instance. Only ethernet devices + * are supported. + */ +static int failover_slave_register(struct net_device *slave_dev) +{ + struct netdev_lag_upper_info lag_upper_info; + struct net_device *failover_dev; + struct failover_ops *fops; + int err; + + if (slave_dev->type != ARPHRD_ETHER) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (fops && fops->slave_pre_register && + fops->slave_pre_register(slave_dev, failover_dev)) + goto done; + + err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, + failover_dev); + if (err) { + netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", + err); + goto done; + } + + lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; + err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, + &lag_upper_info, NULL); + if (err) { + netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", + failover_dev->name, err); + goto err_upper_link; + } + + slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); + + if (fops && fops->slave_register && + !fops->slave_register(slave_dev, failover_dev)) + return NOTIFY_OK; + + netdev_upper_dev_unlink(slave_dev, failover_dev); + slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); +err_upper_link: + netdev_rx_handler_unregister(slave_dev); +done: + return NOTIFY_DONE; +} + +/** + * failover_slave_unregister - Unregister a slave netdev + * + * @slave_dev: slave netdev that is being unregistered + * + * Unregisters a slave device from a failover instance. + */ +int failover_slave_unregister(struct net_device *slave_dev) +{ + struct net_device *failover_dev; + struct failover_ops *fops; + + if (!netif_is_failover_slave(slave_dev)) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (fops && fops->slave_pre_unregister && + fops->slave_pre_unregister(slave_dev, failover_dev)) + goto done; + + netdev_rx_handler_unregister(slave_dev); + netdev_upper_dev_unlink(slave_dev, failover_dev); + slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); + + if (fops && fops->slave_unregister && + !fops->slave_unregister(slave_dev, failover_dev)) + return NOTIFY_OK; + +done: + return NOTIFY_DONE; +} +EXPORT_SYMBOL_GPL(failover_slave_unregister); + +static int failover_slave_link_change(struct net_device *slave_dev) +{ + struct net_device *failover_dev; + struct failover_ops *fops; + + if (!netif_is_failover_slave(slave_dev)) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (!netif_running(failover_dev)) + goto done; + + if (fops && fops->slave_link_change && + !fops->slave_link_change(slave_dev, failover_dev)) + return NOTIFY_OK; + +done: + return NOTIFY_DONE; +} + +static int failover_slave_name_change(struct net_device *slave_dev) +{ + struct net_device *failover_dev; + struct failover_ops *fops; + + if (!netif_is_failover_slave(slave_dev)) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (!netif_running(failover_dev)) + goto done; + + if (fops && fops->slave_name_change && + !fops->slave_name_change(slave_dev, failover_dev)) + return NOTIFY_OK; + +done: + return NOTIFY_DONE; +} + +static int +failover_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + + /* Skip parent events */ + if (netif_is_failover(event_dev)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + return failover_slave_register(event_dev); + case NETDEV_UNREGISTER: + return failover_slave_unregister(event_dev); + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + return failover_slave_link_change(event_dev); + case NETDEV_CHANGENAME: + return failover_slave_name_change(event_dev); + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block failover_notifier = { + .notifier_call = failover_event, +}; + +static void +failover_existing_slave_register(struct net_device *failover_dev) +{ + struct net *net = dev_net(failover_dev); + struct net_device *dev; + + rtnl_lock(); + for_each_netdev(net, dev) { + if (netif_is_failover(dev)) + continue; + if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) + failover_slave_register(dev); + } + rtnl_unlock(); +} + +/** + * failover_register - Register a failover instance + * + * @dev: failover netdev + * @ops: failover ops + * + * Allocate and register a failover instance for a failover netdev. ops + * provides handlers for slave device register/unregister/link change/ + * name change events. + * + * Return: pointer to failover instance + */ +struct failover *failover_register(struct net_device *dev, + struct failover_ops *ops) +{ + struct failover *failover; + + if (dev->type != ARPHRD_ETHER) + return ERR_PTR(-EINVAL); + + failover = kzalloc(sizeof(*failover), GFP_KERNEL); + if (!failover) + return ERR_PTR(-ENOMEM); + + rcu_assign_pointer(failover->ops, ops); + netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL); + dev->priv_flags |= IFF_FAILOVER; + rcu_assign_pointer(failover->failover_dev, dev); + + spin_lock(&failover_lock); + list_add_tail(&failover->list, &failover_list); + spin_unlock(&failover_lock); + + netdev_info(dev, "failover master:%s registered\n", dev->name); + + failover_existing_slave_register(dev); + + return failover; +} +EXPORT_SYMBOL_GPL(failover_register); + +/** + * failover_unregister - Unregister a failover instance + * + * @failover: pointer to failover instance + * + * Unregisters and frees a failover instance. + */ +void failover_unregister(struct failover *failover) +{ + struct net_device *failover_dev; + + failover_dev = rcu_dereference(failover->failover_dev); + + netdev_info(failover_dev, "failover master:%s unregistered\n", + failover_dev->name); + + failover_dev->priv_flags &= ~IFF_FAILOVER; + netdev_put(failover_dev, &failover->dev_tracker); + + spin_lock(&failover_lock); + list_del(&failover->list); + spin_unlock(&failover_lock); + + kfree(failover); +} +EXPORT_SYMBOL_GPL(failover_unregister); + +static __init int +failover_init(void) +{ + register_netdevice_notifier(&failover_notifier); + + return 0; +} +module_init(failover_init); + +static __exit +void failover_exit(void) +{ + unregister_netdevice_notifier(&failover_notifier); +} +module_exit(failover_exit); + +MODULE_DESCRIPTION("Generic failover infrastructure/interface"); +MODULE_LICENSE("GPL v2"); diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c new file mode 100644 index 000000000000..5cdca49b1d7c --- /dev/null +++ b/net/core/fib_notifier.c @@ -0,0 +1,197 @@ +#include <linux/rtnetlink.h> +#include <linux/notifier.h> +#include <linux/rcupdate.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/fib_notifier.h> + +static unsigned int fib_notifier_net_id; + +struct fib_notifier_net { + struct list_head fib_notifier_ops; + struct atomic_notifier_head fib_chain; +}; + +int call_fib_notifier(struct notifier_block *nb, + enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + int err; + + err = nb->notifier_call(nb, event_type, info); + return notifier_to_errno(err); +} +EXPORT_SYMBOL(call_fib_notifier); + +int call_fib_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + int err; + + err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info); + return notifier_to_errno(err); +} +EXPORT_SYMBOL(call_fib_notifiers); + +static unsigned int fib_seq_sum(struct net *net) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + struct fib_notifier_ops *ops; + unsigned int fib_seq = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { + if (!try_module_get(ops->owner)) + continue; + fib_seq += ops->fib_seq_read(net); + module_put(ops->owner); + } + rcu_read_unlock(); + + return fib_seq; +} + +static int fib_net_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + struct fib_notifier_ops *ops; + int err = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { + if (!try_module_get(ops->owner)) + continue; + err = ops->fib_dump(net, nb, extack); + module_put(ops->owner); + if (err) + goto unlock; + } + +unlock: + rcu_read_unlock(); + + return err; +} + +static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + unsigned int fib_seq) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + atomic_notifier_chain_register(&fn_net->fib_chain, nb); + if (fib_seq == fib_seq_sum(net)) + return true; + atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); + if (cb) + cb(nb); + return false; +} + +#define FIB_DUMP_MAX_RETRIES 5 +int register_fib_notifier(struct net *net, struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + struct netlink_ext_ack *extack) +{ + int retries = 0; + int err; + + do { + unsigned int fib_seq = fib_seq_sum(net); + + err = fib_net_dump(net, nb, extack); + if (err) + return err; + + if (fib_dump_is_consistent(net, nb, cb, fib_seq)) + return 0; + } while (++retries < FIB_DUMP_MAX_RETRIES); + + return -EBUSY; +} +EXPORT_SYMBOL(register_fib_notifier); + +int unregister_fib_notifier(struct net *net, struct notifier_block *nb) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); +} +EXPORT_SYMBOL(unregister_fib_notifier); + +static int __fib_notifier_ops_register(struct fib_notifier_ops *ops, + struct net *net) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + struct fib_notifier_ops *o; + + list_for_each_entry(o, &fn_net->fib_notifier_ops, list) + if (ops->family == o->family) + return -EEXIST; + list_add_tail_rcu(&ops->list, &fn_net->fib_notifier_ops); + return 0; +} + +struct fib_notifier_ops * +fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net) +{ + struct fib_notifier_ops *ops; + int err; + + ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); + if (!ops) + return ERR_PTR(-ENOMEM); + + err = __fib_notifier_ops_register(ops, net); + if (err) + goto err_register; + + return ops; + +err_register: + kfree(ops); + return ERR_PTR(err); +} +EXPORT_SYMBOL(fib_notifier_ops_register); + +void fib_notifier_ops_unregister(struct fib_notifier_ops *ops) +{ + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); +} +EXPORT_SYMBOL(fib_notifier_ops_unregister); + +static int __net_init fib_notifier_net_init(struct net *net) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + INIT_LIST_HEAD(&fn_net->fib_notifier_ops); + ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain); + return 0; +} + +static void __net_exit fib_notifier_net_exit(struct net *net) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + WARN_ON_ONCE(!list_empty(&fn_net->fib_notifier_ops)); +} + +static struct pernet_operations fib_notifier_net_ops = { + .init = fib_notifier_net_init, + .exit = fib_notifier_net_exit, + .id = &fib_notifier_net_id, + .size = sizeof(struct fib_notifier_net), +}; + +static int __init fib_notifier_init(void) +{ + return register_pernet_subsys(&fib_notifier_net_ops); +} + +subsys_initcall(fib_notifier_init); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 21735440c44a..8ca634964e36 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1,10 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * net/core/fib_rules.c Generic Routing Rules * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2. - * * Authors: Thomas Graf <tgraf@suug.ch> */ @@ -14,33 +11,76 @@ #include <linux/list.h> #include <linux/module.h> #include <net/net_namespace.h> +#include <net/inet_dscp.h> #include <net/sock.h> #include <net/fib_rules.h> +#include <net/ip_tunnels.h> +#include <linux/indirect_call_wrapper.h> + +#if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES) +#ifdef CONFIG_IP_MULTIPLE_TABLES +#define INDIRECT_CALL_MT(f, f2, f1, ...) \ + INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) +#endif +#elif defined(CONFIG_IP_MULTIPLE_TABLES) +#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__) +#endif + +static const struct fib_kuid_range fib_kuid_range_unset = { + KUIDT_INIT(0), + KUIDT_INIT(~0), +}; + +bool fib_rule_matchall(const struct fib_rule *rule) +{ + if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) || + rule->mark || rule->tun_id || rule->flags) + return false; + if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) + return false; + if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || + !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) + return false; + if (fib_rule_port_range_set(&rule->sport_range)) + return false; + if (fib_rule_port_range_set(&rule->dport_range)) + return false; + return true; +} +EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, - u32 pref, u32 table, u32 flags) + u32 pref, u32 table) { struct fib_rule *r; - r = kzalloc(ops->rule_size, GFP_KERNEL); + r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; - atomic_set(&r->refcnt, 1); + refcount_set(&r->refcnt, 1); r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; - r->flags = flags; - r->fr_net = hold_net(ops->fro_net); + r->proto = RTPROT_KERNEL; + r->fr_net = ops->fro_net; + r->uid_range = fib_kuid_range_unset; + + r->suppress_prefixlen = -1; + r->suppress_ifgroup = -1; - /* The lock is not required here, the list in unreacheable + /* The lock is not required here, the list in unreachable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); return 0; } EXPORT_SYMBOL(fib_default_rule_add); -u32 fib_default_rule_pref(struct fib_rules_ops *ops) +static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; @@ -56,13 +96,13 @@ u32 fib_default_rule_pref(struct fib_rules_ops *ops) return 0; } -EXPORT_SYMBOL(fib_default_rule_pref); static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); -static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) +static struct fib_rules_ops *lookup_rules_ops(const struct net *net, + int family) { struct fib_rules_ops *ops; @@ -113,7 +153,6 @@ static int __fib_rules_register(struct fib_rules_ops *ops) if (ops->family == o->family) goto errout; - hold_net(net); list_add_tail_rcu(&ops->list, &net->rules_ops); err = 0; errout: @@ -157,43 +196,116 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) } } -static void fib_rules_put_rcu(struct rcu_head *head) -{ - struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu); - struct net *net = ops->fro_net; - - release_net(net); - kfree(ops); -} - void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; spin_lock(&net->rules_mod_lock); list_del_rcu(&ops->list); - fib_rules_cleanup_ops(ops); spin_unlock(&net->rules_mod_lock); - call_rcu(&ops->rcu, fib_rules_put_rcu); + fib_rules_cleanup_ops(ops); + kfree_rcu(ops, rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); +static int uid_range_set(struct fib_kuid_range *range) +{ + return uid_valid(range->start) && uid_valid(range->end); +} + +static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) +{ + struct fib_rule_uid_range *in; + struct fib_kuid_range out; + + in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); + + out.start = make_kuid(current_user_ns(), in->start); + out.end = make_kuid(current_user_ns(), in->end); + + return out; +} + +static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) +{ + struct fib_rule_uid_range out = { + from_kuid_munged(current_user_ns(), range->start), + from_kuid_munged(current_user_ns(), range->end) + }; + + return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); +} + +static int nla_get_port_range(struct nlattr *pattr, + struct fib_rule_port_range *port_range) +{ + const struct fib_rule_port_range *pr = nla_data(pattr); + + if (!fib_rule_port_range_valid(pr)) + return -EINVAL; + + port_range->start = pr->start; + port_range->end = pr->end; + + return 0; +} + +static int nla_put_port_range(struct sk_buff *skb, int attrtype, + struct fib_rule_port_range *range) +{ + return nla_put(skb, attrtype, sizeof(*range), range); +} + +static bool fib_rule_iif_match(const struct fib_rule *rule, int iifindex, + const struct flowi *fl) +{ + u8 iif_is_l3_master = READ_ONCE(rule->iif_is_l3_master); + + return iif_is_l3_master ? l3mdev_fib_rule_iif_match(fl, iifindex) : + fl->flowi_iif == iifindex; +} + +static bool fib_rule_oif_match(const struct fib_rule *rule, int oifindex, + const struct flowi *fl) +{ + u8 oif_is_l3_master = READ_ONCE(rule->oif_is_l3_master); + + return oif_is_l3_master ? l3mdev_fib_rule_oif_match(fl, oifindex) : + fl->flowi_oif == oifindex; +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, - struct flowi *fl, int flags) + struct flowi *fl, int flags, + struct fib_lookup_arg *arg) { - int ret = 0; + int iifindex, oifindex, ret = 0; - if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) + iifindex = READ_ONCE(rule->iifindex); + if (iifindex && !fib_rule_iif_match(rule, iifindex, fl)) goto out; - if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) + oifindex = READ_ONCE(rule->oifindex); + if (oifindex && !fib_rule_oif_match(rule, oifindex, fl)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; - ret = ops->match(rule, fl, flags); + if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) + goto out; + + if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) + goto out; + + if (uid_lt(fl->flowi_uid, rule->uid_range.start) || + uid_gt(fl->flowi_uid, rule->uid_range.end)) + goto out; + + ret = INDIRECT_CALL_MT(ops->match, + fib6_rule_match, + fib4_rule_match, + rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; } @@ -208,7 +320,7 @@ int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: - if (!fib_rule_match(rule, ops, fl, flags)) + if (!fib_rule_match(rule, ops, fl, flags, arg)) continue; if (rule->action == FR_ACT_GOTO) { @@ -224,11 +336,20 @@ jumped: } else if (rule->action == FR_ACT_NOP) continue; else - err = ops->action(rule, fl, flags, arg); + err = INDIRECT_CALL_MT(ops->action, + fib6_rule_action, + fib4_rule_action, + rule, fl, flags, arg); + + if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, + fib6_rule_suppress, + fib4_rule_suppress, + rule, flags, arg)) + continue; if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || - likely(atomic_inc_not_zero(&rule->refcnt))) { + likely(refcount_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } @@ -244,137 +365,583 @@ out: } EXPORT_SYMBOL_GPL(fib_rules_lookup); -static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb, - struct fib_rules_ops *ops) +static int call_fib_rule_notifier(struct notifier_block *nb, + enum fib_event_type event_type, + struct fib_rule *rule, int family, + struct netlink_ext_ack *extack) { - int err = -EINVAL; + struct fib_rule_notifier_info info = { + .info.family = family, + .info.extack = extack, + .rule = rule, + }; - if (frh->src_len) - if (tb[FRA_SRC] == NULL || - frh->src_len > (ops->addr_size * 8) || - nla_len(tb[FRA_SRC]) != ops->addr_size) - goto errout; + return call_fib_notifier(nb, event_type, &info.info); +} - if (frh->dst_len) - if (tb[FRA_DST] == NULL || - frh->dst_len > (ops->addr_size * 8) || - nla_len(tb[FRA_DST]) != ops->addr_size) - goto errout; +static int call_fib_rule_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib_rule *rule, + struct fib_rules_ops *ops, + struct netlink_ext_ack *extack) +{ + struct fib_rule_notifier_info info = { + .info.family = ops->family, + .info.extack = extack, + .rule = rule, + }; + + ASSERT_RTNL_NET(net); + + /* Paired with READ_ONCE() in fib_rules_seq() */ + WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1); + return call_fib_notifiers(net, event_type, &info.info); +} + +/* Called with rcu_read_lock() */ +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, + struct netlink_ext_ack *extack) +{ + struct fib_rules_ops *ops; + struct fib_rule *rule; + int err = 0; + + ops = lookup_rules_ops(net, family); + if (!ops) + return -EAFNOSUPPORT; + list_for_each_entry_rcu(rule, &ops->rules_list, list) { + err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, + rule, family, extack); + if (err) + break; + } + rules_ops_put(ops); - err = 0; -errout: return err; } +EXPORT_SYMBOL_GPL(fib_rules_dump); -static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) +unsigned int fib_rules_seq_read(const struct net *net, int family) { - struct net *net = sock_net(skb->sk); - struct fib_rule_hdr *frh = nlmsg_data(nlh); - struct fib_rules_ops *ops = NULL; - struct fib_rule *rule, *r, *last = NULL; - struct nlattr *tb[FRA_MAX+1]; - int err = -EINVAL, unresolved = 0; + unsigned int fib_rules_seq; + struct fib_rules_ops *ops; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) - goto errout; + ops = lookup_rules_ops(net, family); + if (!ops) + return 0; + /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */ + fib_rules_seq = READ_ONCE(ops->fib_rules_seq); + rules_ops_put(ops); - ops = lookup_rules_ops(net, frh->family); - if (ops == NULL) { - err = -EAFNOSUPPORT; - goto errout; + return fib_rules_seq; +} +EXPORT_SYMBOL_GPL(fib_rules_seq_read); + +static struct fib_rule *rule_find(struct fib_rules_ops *ops, + struct fib_rule_hdr *frh, + struct nlattr **tb, + struct fib_rule *rule, + bool user_priority) +{ + struct fib_rule *r; + + list_for_each_entry(r, &ops->rules_list, list) { + if (rule->action && r->action != rule->action) + continue; + + if (rule->table && r->table != rule->table) + continue; + + if (user_priority && r->pref != rule->pref) + continue; + + if (rule->iifname[0] && + memcmp(r->iifname, rule->iifname, IFNAMSIZ)) + continue; + + if (rule->oifname[0] && + memcmp(r->oifname, rule->oifname, IFNAMSIZ)) + continue; + + if (rule->mark && r->mark != rule->mark) + continue; + + if (rule->suppress_ifgroup != -1 && + r->suppress_ifgroup != rule->suppress_ifgroup) + continue; + + if (rule->suppress_prefixlen != -1 && + r->suppress_prefixlen != rule->suppress_prefixlen) + continue; + + if (rule->mark_mask && r->mark_mask != rule->mark_mask) + continue; + + if (rule->tun_id && r->tun_id != rule->tun_id) + continue; + + if (rule->l3mdev && r->l3mdev != rule->l3mdev) + continue; + + if (uid_range_set(&rule->uid_range) && + (!uid_eq(r->uid_range.start, rule->uid_range.start) || + !uid_eq(r->uid_range.end, rule->uid_range.end))) + continue; + + if (rule->ip_proto && r->ip_proto != rule->ip_proto) + continue; + + if (rule->proto && r->proto != rule->proto) + continue; + + if (fib_rule_port_range_set(&rule->sport_range) && + !fib_rule_port_range_compare(&r->sport_range, + &rule->sport_range)) + continue; + + if (rule->sport_mask && r->sport_mask != rule->sport_mask) + continue; + + if (fib_rule_port_range_set(&rule->dport_range) && + !fib_rule_port_range_compare(&r->dport_range, + &rule->dport_range)) + continue; + + if (rule->dport_mask && r->dport_mask != rule->dport_mask) + continue; + + if (!ops->compare(r, frh, tb)) + continue; + return r; } - err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy); - if (err < 0) - goto errout; + return NULL; +} - err = validate_rulemsg(frh, tb, ops); - if (err < 0) - goto errout; +#ifdef CONFIG_NET_L3_MASTER_DEV +static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, + struct netlink_ext_ack *extack) +{ + nlrule->l3mdev = nla_get_u8(nla); + if (nlrule->l3mdev != 1) { + NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); + return -1; + } + + return 0; +} +#else +static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); + return -1; +} +#endif + +static int fib_nl2rule_port_mask(const struct nlattr *mask_attr, + const struct fib_rule_port_range *range, + u16 *port_mask, + struct netlink_ext_ack *extack) +{ + if (!fib_rule_port_range_valid(range)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, + "Cannot specify port mask without port value"); + return -EINVAL; + } + + if (fib_rule_port_is_range(range)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, + "Cannot specify port mask for port range"); + return -EINVAL; + } + + if (range->start & ~nla_get_u16(mask_attr)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask"); + return -EINVAL; + } + + *port_mask = nla_get_u16(mask_attr); + + return 0; +} - rule = kzalloc(ops->rule_size, GFP_KERNEL); - if (rule == NULL) { +static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct fib_rules_ops *ops, + struct nlattr *tb[], + struct fib_rule **rule, + bool *user_priority) +{ + struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rule *nlrule = NULL; + int err = -EINVAL; + + if (frh->src_len) + if (!tb[FRA_SRC] || + frh->src_len > (ops->addr_size * 8) || + nla_len(tb[FRA_SRC]) != ops->addr_size) { + NL_SET_ERR_MSG(extack, "Invalid source address"); + goto errout; + } + + if (frh->dst_len) + if (!tb[FRA_DST] || + frh->dst_len > (ops->addr_size * 8) || + nla_len(tb[FRA_DST]) != ops->addr_size) { + NL_SET_ERR_MSG(extack, "Invalid dst address"); + goto errout; + } + + nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); + if (!nlrule) { err = -ENOMEM; goto errout; } - rule->fr_net = hold_net(net); + refcount_set(&nlrule->refcnt, 1); + nlrule->fr_net = net; - if (tb[FRA_PRIORITY]) - rule->pref = nla_get_u32(tb[FRA_PRIORITY]); + if (tb[FRA_PRIORITY]) { + nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); + *user_priority = true; + } - if (tb[FRA_IIFNAME]) { - struct net_device *dev; + nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC); - rule->iifindex = -1; - nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, rule->iifname); - if (dev) - rule->iifindex = dev->ifindex; + if (tb[FRA_IIFNAME]) { + nlrule->iifindex = -1; + nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); } if (tb[FRA_OIFNAME]) { - struct net_device *dev; - - rule->oifindex = -1; - nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, rule->oifname); - if (dev) - rule->oifindex = dev->ifindex; + nlrule->oifindex = -1; + nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); } if (tb[FRA_FWMARK]) { - rule->mark = nla_get_u32(tb[FRA_FWMARK]); - if (rule->mark) + nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); + if (nlrule->mark) /* compatibility: if the mark value is non-zero all bits * are compared unless a mask is explicitly specified. */ - rule->mark_mask = 0xFFFFFFFF; + nlrule->mark_mask = 0xFFFFFFFF; } if (tb[FRA_FWMASK]) - rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); + nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); - rule->action = frh->action; - rule->flags = frh->flags; - rule->table = frh_get_table(frh, tb); + if (tb[FRA_TUN_ID]) + nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); - if (!tb[FRA_PRIORITY] && ops->default_pref) - rule->pref = ops->default_pref(ops); + if (tb[FRA_L3MDEV] && + fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) + goto errout_free; + + nlrule->action = frh->action; + nlrule->flags = frh->flags; + nlrule->table = frh_get_table(frh, tb); + if (tb[FRA_SUPPRESS_PREFIXLEN]) + nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); + else + nlrule->suppress_prefixlen = -1; + + if (tb[FRA_SUPPRESS_IFGROUP]) + nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); + else + nlrule->suppress_ifgroup = -1; - err = -EINVAL; if (tb[FRA_GOTO]) { - if (rule->action != FR_ACT_GOTO) + if (nlrule->action != FR_ACT_GOTO) { + NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; + } - rule->target = nla_get_u32(tb[FRA_GOTO]); - /* Backward jumps are prohibited to avoid endless loops */ - if (rule->target <= rule->pref) + nlrule->target = nla_get_u32(tb[FRA_GOTO]); + } else if (nlrule->action == FR_ACT_GOTO) { + NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); + goto errout_free; + } + + if (nlrule->l3mdev && nlrule->table) { + NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); + goto errout_free; + } + + if (tb[FRA_UID_RANGE]) { + if (current_user_ns() != net->user_ns) { + err = -EPERM; + NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; + } - list_for_each_entry(r, &ops->rules_list, list) { - if (r->pref == rule->target) { - RCU_INIT_POINTER(rule->ctarget, r); - break; - } + nlrule->uid_range = nla_get_kuid_range(tb); + + if (!uid_range_set(&nlrule->uid_range) || + !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { + NL_SET_ERR_MSG(extack, "Invalid uid range"); + goto errout_free; + } + } else { + nlrule->uid_range = fib_kuid_range_unset; + } + + if (tb[FRA_IP_PROTO]) + nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); + + if (tb[FRA_SPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_SPORT_RANGE], + &nlrule->sport_range); + if (err) { + NL_SET_ERR_MSG(extack, "Invalid sport range"); + goto errout_free; + } + if (!fib_rule_port_is_range(&nlrule->sport_range)) + nlrule->sport_mask = U16_MAX; + } + + if (tb[FRA_SPORT_MASK]) { + err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK], + &nlrule->sport_range, + &nlrule->sport_mask, extack); + if (err) + goto errout_free; + } + + if (tb[FRA_DPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_DPORT_RANGE], + &nlrule->dport_range); + if (err) { + NL_SET_ERR_MSG(extack, "Invalid dport range"); + goto errout_free; + } + if (!fib_rule_port_is_range(&nlrule->dport_range)) + nlrule->dport_mask = U16_MAX; + } + + if (tb[FRA_DPORT_MASK]) { + err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK], + &nlrule->dport_range, + &nlrule->dport_mask, extack); + if (err) + goto errout_free; + } + + *rule = nlrule; + + return 0; + +errout_free: + kfree(nlrule); +errout: + return err; +} + +static int fib_nl2rule_rtnl(struct fib_rule *nlrule, + struct fib_rules_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + if (!tb[FRA_PRIORITY]) + nlrule->pref = fib_default_rule_pref(ops); + + /* Backward jumps are prohibited to avoid endless loops */ + if (tb[FRA_GOTO] && nlrule->target <= nlrule->pref) { + NL_SET_ERR_MSG(extack, "Backward goto not supported"); + return -EINVAL; + } + + if (tb[FRA_IIFNAME]) { + struct net_device *dev; + + dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname); + if (dev) { + nlrule->iifindex = dev->ifindex; + nlrule->iif_is_l3_master = netif_is_l3_master(dev); } + } + + if (tb[FRA_OIFNAME]) { + struct net_device *dev; + + dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname); + if (dev) { + nlrule->oifindex = dev->ifindex; + nlrule->oif_is_l3_master = netif_is_l3_master(dev); + } + } + + return 0; +} + +static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, + struct nlattr **tb, struct fib_rule *rule) +{ + struct fib_rule *r; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->action != rule->action) + continue; + + if (r->table != rule->table) + continue; + + if (r->pref != rule->pref) + continue; + + if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) + continue; + + if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) + continue; + + if (r->mark != rule->mark) + continue; + + if (r->suppress_ifgroup != rule->suppress_ifgroup) + continue; + + if (r->suppress_prefixlen != rule->suppress_prefixlen) + continue; + + if (r->mark_mask != rule->mark_mask) + continue; + + if (r->tun_id != rule->tun_id) + continue; - if (rcu_dereference_protected(rule->ctarget, 1) == NULL) - unresolved = 1; - } else if (rule->action == FR_ACT_GOTO) + if (r->l3mdev != rule->l3mdev) + continue; + + if (!uid_eq(r->uid_range.start, rule->uid_range.start) || + !uid_eq(r->uid_range.end, rule->uid_range.end)) + continue; + + if (r->ip_proto != rule->ip_proto) + continue; + + if (r->proto != rule->proto) + continue; + + if (!fib_rule_port_range_compare(&r->sport_range, + &rule->sport_range)) + continue; + + if (r->sport_mask != rule->sport_mask) + continue; + + if (!fib_rule_port_range_compare(&r->dport_range, + &rule->dport_range)) + continue; + + if (r->dport_mask != rule->dport_mask) + continue; + + if (!ops->compare(r, frh, tb)) + continue; + return 1; + } + return 0; +} + +static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { + [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, + [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, + [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, + [FRA_PRIORITY] = { .type = NLA_U32 }, + [FRA_FWMARK] = { .type = NLA_U32 }, + [FRA_FLOW] = { .type = NLA_U32 }, + [FRA_TUN_ID] = { .type = NLA_U64 }, + [FRA_FWMASK] = { .type = NLA_U32 }, + [FRA_TABLE] = { .type = NLA_U32 }, + [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, + [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, + [FRA_GOTO] = { .type = NLA_U32 }, + [FRA_L3MDEV] = { .type = NLA_U8 }, + [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, + [FRA_PROTOCOL] = { .type = NLA_U8 }, + [FRA_IP_PROTO] = { .type = NLA_U8 }, + [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, + [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, + [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), + [FRA_FLOWLABEL] = { .type = NLA_BE32 }, + [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 }, + [FRA_SPORT_MASK] = { .type = NLA_U16 }, + [FRA_DPORT_MASK] = { .type = NLA_U16 }, + [FRA_DSCP_MASK] = NLA_POLICY_MASK(NLA_U8, INET_DSCP_MASK >> 2), +}; + +int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held) +{ + struct fib_rule *rule = NULL, *r, *last = NULL; + int err = -EINVAL, unresolved = 0; + struct fib_rules_ops *ops = NULL; + struct nlattr *tb[FRA_MAX + 1]; + bool user_priority = false; + struct fib_rule_hdr *frh; + + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { + NL_SET_ERR_MSG(extack, "Invalid msg length"); + goto errout; + } + + ops = lookup_rules_ops(net, frh->family); + if (!ops) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Rule family not supported"); + goto errout; + } + + err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, + fib_rule_policy, extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Error parsing msg"); + goto errout; + } + + err = fib_nl2rule(net, nlh, extack, ops, tb, &rule, &user_priority); + if (err) + goto errout; + + if (!rtnl_held) + rtnl_net_lock(net); + + err = fib_nl2rule_rtnl(rule, ops, tb, extack); + if (err) goto errout_free; - err = ops->configure(rule, skb, frh, tb); + if ((nlh->nlmsg_flags & NLM_F_EXCL) && + rule_exists(ops, frh, tb, rule)) { + err = -EEXIST; + goto errout_free; + } + + err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; + err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, + extack); + if (err < 0) + goto errout_free; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->pref == rule->target) { + RCU_INIT_POINTER(rule->ctarget, r); + break; + } + } + + if (rcu_dereference_protected(rule->ctarget, 1) == NULL) + unresolved = 1; + list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; last = r; } - fib_rule_get(rule); - if (last) list_add_rcu(&rule->list, &last->list); else @@ -402,118 +969,154 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (unresolved) ops->unresolved_rules++; + if (rule->tun_id) + ip_tunnel_need_metadata(); + + fib_rule_get(rule); + + if (!rtnl_held) + rtnl_net_unlock(net); + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); + fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: - release_net(rule->fr_net); + if (!rtnl_held) + rtnl_net_unlock(net); kfree(rule); errout: rules_ops_put(ops); return err; } +EXPORT_SYMBOL_GPL(fib_newrule); -static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) +static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); - struct fib_rule_hdr *frh = nlmsg_data(nlh); + return fib_newrule(sock_net(skb->sk), skb, nlh, extack, false); +} + +int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held) +{ + struct fib_rule *rule = NULL, *nlrule = NULL; struct fib_rules_ops *ops = NULL; - struct fib_rule *rule, *tmp; struct nlattr *tb[FRA_MAX+1]; + bool user_priority = false; + struct fib_rule_hdr *frh; int err = -EINVAL; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { + NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; + } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } - err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy); - if (err < 0) + err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, + fib_rule_policy, extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; + } - err = validate_rulemsg(frh, tb, ops); - if (err < 0) + err = fib_nl2rule(net, nlh, extack, ops, tb, &nlrule, &user_priority); + if (err) goto errout; - list_for_each_entry(rule, &ops->rules_list, list) { - if (frh->action && (frh->action != rule->action)) - continue; + if (!rtnl_held) + rtnl_net_lock(net); - if (frh->table && (frh_get_table(frh, tb) != rule->table)) - continue; + err = fib_nl2rule_rtnl(nlrule, ops, tb, extack); + if (err) + goto errout_free; - if (tb[FRA_PRIORITY] && - (rule->pref != nla_get_u32(tb[FRA_PRIORITY]))) - continue; + rule = rule_find(ops, frh, tb, nlrule, user_priority); + if (!rule) { + err = -ENOENT; + goto errout_free; + } - if (tb[FRA_IIFNAME] && - nla_strcmp(tb[FRA_IIFNAME], rule->iifname)) - continue; + if (rule->flags & FIB_RULE_PERMANENT) { + err = -EPERM; + goto errout_free; + } - if (tb[FRA_OIFNAME] && - nla_strcmp(tb[FRA_OIFNAME], rule->oifname)) - continue; + if (ops->delete) { + err = ops->delete(rule); + if (err) + goto errout_free; + } - if (tb[FRA_FWMARK] && - (rule->mark != nla_get_u32(tb[FRA_FWMARK]))) - continue; + if (rule->tun_id) + ip_tunnel_unneed_metadata(); - if (tb[FRA_FWMASK] && - (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK]))) - continue; + list_del_rcu(&rule->list); - if (!ops->compare(rule, frh, tb)) - continue; + if (rule->action == FR_ACT_GOTO) { + ops->nr_goto_rules--; + if (rtnl_dereference(rule->ctarget) == NULL) + ops->unresolved_rules--; + } - if (rule->flags & FIB_RULE_PERMANENT) { - err = -EPERM; - goto errout; + /* + * Check if this rule is a target to any of them. If so, + * adjust to the next one with the same preference or + * disable them. As this operation is eventually very + * expensive, it is only performed if goto rules, except + * current if it is goto rule, have actually been added. + */ + if (ops->nr_goto_rules > 0) { + struct fib_rule *n, *r; + + n = list_next_entry(rule, list); + if (&n->list == &ops->rules_list || n->pref != rule->pref) + n = NULL; + list_for_each_entry(r, &ops->rules_list, list) { + if (rtnl_dereference(r->ctarget) != rule) + continue; + rcu_assign_pointer(r->ctarget, n); + if (!n) + ops->unresolved_rules++; } + } - list_del_rcu(&rule->list); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); - if (rule->action == FR_ACT_GOTO) { - ops->nr_goto_rules--; - if (rtnl_dereference(rule->ctarget) == NULL) - ops->unresolved_rules--; - } + if (!rtnl_held) + rtnl_net_unlock(net); - /* - * Check if this rule is a target to any of them. If so, - * disable them. As this operation is eventually very - * expensive, it is only performed if goto rules have - * actually been added. - */ - if (ops->nr_goto_rules > 0) { - list_for_each_entry(tmp, &ops->rules_list, list) { - if (rtnl_dereference(tmp->ctarget) == rule) { - RCU_INIT_POINTER(tmp->ctarget, NULL); - ops->unresolved_rules++; - } - } - } - - notify_rule_change(RTM_DELRULE, rule, ops, nlh, - NETLINK_CB(skb).portid); - if (ops->delete) - ops->delete(rule); - fib_rule_put(rule); - flush_route_cache(ops); - rules_ops_put(ops); - return 0; - } + notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); + fib_rule_put(rule); + flush_route_cache(ops); + rules_ops_put(ops); + kfree(nlrule); + return 0; - err = -ENOENT; +errout_free: + if (!rtnl_held) + rtnl_net_unlock(net); + kfree(nlrule); errout: rules_ops_put(ops); return err; } +EXPORT_SYMBOL_GPL(fib_delrule); + +static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return fib_delrule(sock_net(skb->sk), skb, nlh, extack, false); +} static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) @@ -523,8 +1126,18 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ - + nla_total_size(4); /* FRA_FWMASK */ + + nla_total_size(4) /* FRA_FWMASK */ + + nla_total_size_64bit(8) /* FRA_TUN_ID */ + + nla_total_size(sizeof(struct fib_kuid_range)) + + nla_total_size(1) /* FRA_PROTOCOL */ + + nla_total_size(1) /* FRA_IP_PROTO */ + + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */ + + nla_total_size(2) /* FRA_SPORT_MASK */ + + nla_total_size(2); /* FRA_DPORT_MASK */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -545,14 +1158,19 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh = nlmsg_data(nlh); frh->family = ops->family; - frh->table = rule->table; + frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; + if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) + goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; + if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) + goto nla_put_failure; + if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; @@ -560,14 +1178,14 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; - if (rule->iifindex == -1) + if (READ_ONCE(rule->iifindex) == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; - if (rule->oifindex == -1) + if (READ_ONCE(rule->oifindex) == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } @@ -578,12 +1196,34 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && - nla_put_u32(skb, FRA_GOTO, rule->target))) + nla_put_u32(skb, FRA_GOTO, rule->target)) || + (rule->tun_id && + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || + (rule->l3mdev && + nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || + (uid_range_set(&rule->uid_range) && + nla_put_uid_range(skb, &rule->uid_range)) || + (fib_rule_port_range_set(&rule->sport_range) && + nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || + (rule->sport_mask && nla_put_u16(skb, FRA_SPORT_MASK, + rule->sport_mask)) || + (fib_rule_port_range_set(&rule->dport_range) && + nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || + (rule->dport_mask && nla_put_u16(skb, FRA_DPORT_MASK, + rule->dport_mask)) || + (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; + + if (rule->suppress_ifgroup != -1) { + if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) + goto nla_put_failure; + } + if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -595,15 +1235,17 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, { int idx = 0; struct fib_rule *rule; + int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; - if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWRULE, - NLM_F_MULTI, ops) < 0) + err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWRULE, + NLM_F_MULTI, ops); + if (err) break; skip: idx++; @@ -612,16 +1254,50 @@ skip: cb->args[1] = idx; rules_ops_put(ops); - return skb->len; + return err; +} + +static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct fib_rule_hdr *frh; + + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { + NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); + return -EINVAL; + } + + if (frh->dst_len || frh->src_len || frh->tos || frh->table || + frh->res1 || frh->res2 || frh->action || frh->flags) { + NL_SET_ERR_MSG(extack, + "Invalid values in header for fib rule dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); + return -EINVAL; + } + + return 0; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; - int idx = 0, family; + int err, idx = 0, family; + + if (cb->strict_check) { + err = fib_valid_dumprule_req(nlh, cb->extack); - family = rtnl_msg_family(cb->nlh); + if (err < 0) + return err; + } + + family = rtnl_msg_family(nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ ops = lookup_rules_ops(net, family); @@ -631,12 +1307,14 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) return dump_rules(skb, cb, ops); } + err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; - if (dump_rules(skb, cb, ops) < 0) + err = dump_rules(skb, cb, ops); + if (err < 0) break; cb->args[1] = 0; @@ -646,7 +1324,7 @@ skip: rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static void notify_rule_change(int event, struct fib_rule *rule, @@ -655,7 +1333,7 @@ static void notify_rule_change(int event, struct fib_rule *rule, { struct net *net; struct sk_buff *skb; - int err = -ENOBUFS; + int err = -ENOMEM; net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); @@ -673,8 +1351,7 @@ static void notify_rule_change(int event, struct fib_rule *rule, rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); return; errout: - if (err < 0) - rtnl_set_sk_err(net, ops->nlgroup, err); + rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) @@ -683,11 +1360,17 @@ static void attach_rules(struct list_head *rules, struct net_device *dev) list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && - strcmp(dev->name, rule->iifname) == 0) - rule->iifindex = dev->ifindex; + strcmp(dev->name, rule->iifname) == 0) { + WRITE_ONCE(rule->iifindex, dev->ifindex); + WRITE_ONCE(rule->iif_is_l3_master, + netif_is_l3_master(dev)); + } if (rule->oifindex == -1 && - strcmp(dev->name, rule->oifname) == 0) - rule->oifindex = dev->ifindex; + strcmp(dev->name, rule->oifname) == 0) { + WRITE_ONCE(rule->oifindex, dev->ifindex); + WRITE_ONCE(rule->oif_is_l3_master, + netif_is_l3_master(dev)); + } } } @@ -696,10 +1379,14 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) struct fib_rule *rule; list_for_each_entry(rule, rules, list) { - if (rule->iifindex == dev->ifindex) - rule->iifindex = -1; - if (rule->oifindex == dev->ifindex) - rule->oifindex = -1; + if (rule->iifindex == dev->ifindex) { + WRITE_ONCE(rule->iifindex, -1); + WRITE_ONCE(rule->iif_is_l3_master, false); + } + if (rule->oifindex == dev->ifindex) { + WRITE_ONCE(rule->oifindex, -1); + WRITE_ONCE(rule->oif_is_l3_master, false); + } } } @@ -719,6 +1406,13 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, attach_rules(&ops->rules_list, dev); break; + case NETDEV_CHANGENAME: + list_for_each_entry(ops, &net->rules_ops, list) { + detach_rules(&ops->rules_list, dev); + attach_rules(&ops->rules_list, dev); + } + break; + case NETDEV_UNREGISTER: list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); @@ -739,16 +1433,30 @@ static int __net_init fib_rules_net_init(struct net *net) return 0; } +static void __net_exit fib_rules_net_exit(struct net *net) +{ + WARN_ON_ONCE(!list_empty(&net->rules_ops)); +} + static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, + .exit = fib_rules_net_exit, +}; + +static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; static int __init fib_rules_init(void) { int err; - rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, NULL); + + rtnl_register_many(fib_rules_rtnl_msg_handlers); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) @@ -763,9 +1471,7 @@ static int __init fib_rules_init(void) fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: - rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); - rtnl_unregister(PF_UNSPEC, RTM_DELRULE); - rtnl_unregister(PF_UNSPEC, RTM_GETRULE); + rtnl_unregister_many(fib_rules_rtnl_msg_handlers); return err; } diff --git a/net/core/filter.c b/net/core/filter.c index 6438f29ff266..616e0520a0bb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1,85 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * - * Author: - * Jay Schulist <jschlst@samba.org> + * Based on the design of the Berkeley Packet Filter. The new + * internal format has been designed by PLUMgrid: * - * Based on the design of: - * - The Berkeley Packet Filter + * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. + * Authors: + * + * Jay Schulist <jschlst@samba.org> + * Alexei Starovoitov <ast@plumgrid.com> + * Daniel Borkmann <dborkman@redhat.com> * * Andi Kleen - Fix a few bad bugs and races. - * Kris Katterjohn - Added many additional checks in sk_chk_filter() + * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ +#include <linux/atomic.h> +#include <linux/bpf_verifier.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/fcntl.h> #include <linux/socket.h> +#include <linux/sock_diag.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> +#include <linux/if_arp.h> #include <linux/gfp.h> +#include <net/inet_common.h> #include <net/ip.h> #include <net/protocol.h> #include <net/netlink.h> #include <linux/skbuff.h> +#include <linux/skmsg.h> #include <net/sock.h> +#include <net/flow_dissector.h> #include <linux/errno.h> #include <linux/timer.h> -#include <asm/uaccess.h> -#include <asm/unaligned.h> +#include <linux/uaccess.h> +#include <linux/unaligned.h> #include <linux/filter.h> -#include <linux/reciprocal_div.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> #include <linux/if_vlan.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <net/sch_generic.h> +#include <net/cls_cgroup.h> +#include <net/dst_metadata.h> +#include <net/dst.h> +#include <net/sock_reuseport.h> +#include <net/busy_poll.h> +#include <net/tcp.h> +#include <net/xfrm.h> +#include <net/udp.h> +#include <linux/bpf_trace.h> +#include <net/xdp_sock.h> +#include <linux/inetdevice.h> +#include <net/inet_hashtables.h> +#include <net/inet6_hashtables.h> +#include <net/ip_fib.h> +#include <net/nexthop.h> +#include <net/flow.h> +#include <net/arp.h> +#include <net/ipv6.h> +#include <net/net_namespace.h> +#include <linux/seg6_local.h> +#include <net/seg6.h> +#include <net/seg6_local.h> +#include <net/lwtunnel.h> +#include <net/ipv6_stubs.h> +#include <net/bpf_sk_storage.h> +#include <net/transp_v6.h> +#include <linux/btf_ids.h> +#include <net/tls.h> +#include <net/xdp.h> +#include <net/mptcp.h> +#include <net/netfilter/nf_conntrack_bpf.h> +#include <net/netkit.h> +#include <linux/un.h> +#include <net/xdp_sock_drv.h> +#include <net/inet_dscp.h> -/* No hurry in this branch - * - * Exported for the bpf jit load helper. - */ -void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) -{ - u8 *ptr = NULL; +#include "dev.h" - if (k >= SKF_NET_OFF) - ptr = skb_network_header(skb) + k - SKF_NET_OFF; - else if (k >= SKF_LL_OFF) - ptr = skb_mac_header(skb) + k - SKF_LL_OFF; +/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ +static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); - if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) - return ptr; - return NULL; -} +static const struct bpf_func_proto * +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); -static inline void *load_pointer(const struct sk_buff *skb, int k, - unsigned int size, void *buffer) +int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { - if (k >= 0) - return skb_header_pointer(skb, k, size, buffer); - return bpf_internal_load_pointer_neg_helper(skb, k, size); + if (in_compat_syscall()) { + struct compat_sock_fprog f32; + + if (len != sizeof(f32)) + return -EINVAL; + if (copy_from_sockptr(&f32, src, sizeof(f32))) + return -EFAULT; + memset(dst, 0, sizeof(*dst)); + dst->len = f32.len; + dst->filter = compat_ptr(f32.filter); + } else { + if (len != sizeof(*dst)) + return -EINVAL; + if (copy_from_sockptr(dst, src, sizeof(*dst))) + return -EFAULT; + } + + return 0; } +EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); /** - * sk_filter - run a packet through a socket filter + * sk_filter_trim_cap - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter + * @cap: limit on how short the eBPF program may trim the packet + * @reason: record drop reason on errors (negative return value) * - * Run the filter code and then cut skb->data to correct size returned by - * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller + * Run the eBPF program and then cut skb->data to correct size returned by + * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to sk_run_filter. It returns 0 if the packet should + * wrapper to bpf_prog_run. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ -int sk_filter(struct sock *sk, struct sk_buff *skb) +int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, + unsigned int cap, enum skb_drop_reason *reason) { int err; struct sk_filter *filter; @@ -89,369 +142,840 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ - if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) + if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); + *reason = SKB_DROP_REASON_PFMEMALLOC; return -ENOMEM; + } + err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); + if (err) { + *reason = SKB_DROP_REASON_SOCKET_FILTER; + return err; + } err = security_sock_rcv_skb(sk, skb); - if (err) + if (err) { + *reason = SKB_DROP_REASON_SECURITY_HOOK; return err; + } rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { - unsigned int pkt_len = SK_RUN_FILTER(filter, skb); + struct sock *save_sk = skb->sk; + unsigned int pkt_len; - err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; + skb->sk = sk; + pkt_len = bpf_prog_run_save_cb(filter->prog, skb); + skb->sk = save_sk; + err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; + if (err) + *reason = SKB_DROP_REASON_SOCKET_FILTER; } rcu_read_unlock(); return err; } -EXPORT_SYMBOL(sk_filter); +EXPORT_SYMBOL(sk_filter_trim_cap); + +BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) +{ + return skb_get_poff(skb); +} + +BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) +{ + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + + if (skb->len < sizeof(struct nlattr)) + return 0; + + if (a > skb->len - sizeof(struct nlattr)) + return 0; + + nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); + if (nla) + return (void *) nla - (void *) skb->data; + + return 0; +} + +BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) +{ + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + + if (skb->len < sizeof(struct nlattr)) + return 0; + + if (a > skb->len - sizeof(struct nlattr)) + return 0; + + nla = (struct nlattr *) &skb->data[a]; + if (!nla_ok(nla, skb->len - a)) + return 0; + + nla = nla_find_nested(nla, x); + if (nla) + return (void *) nla - (void *) skb->data; + + return 0; +} + +static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset) +{ + if (likely(offset >= 0)) + return offset; + + if (offset >= SKF_NET_OFF) + return offset - SKF_NET_OFF + skb_network_offset(skb); + + if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb)) + return offset - SKF_LL_OFF + skb_mac_offset(skb); + + return INT_MIN; +} + +BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u8 tmp; + const int len = sizeof(tmp); + + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; + + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + else + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + __be16 tmp; + const int len = sizeof(tmp); + + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; + + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + else + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + __be32 tmp; + const int len = sizeof(tmp); + + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; + + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + else + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, + offset); +} + +static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (skb_field) { + case SKF_AD_MARK: + BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + break; + + case SKF_AD_PKTTYPE: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); +#endif + break; + + case SKF_AD_QUEUE: + BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, queue_mapping)); + break; + + case SKF_AD_VLAN_TAG: + BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2); + + /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, vlan_tci)); + break; + case SKF_AD_VLAN_TAG_PRESENT: + BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, vlan_all)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); + *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1); + break; + } + + return insn - insn_buf; +} + +static bool convert_bpf_extensions(struct sock_filter *fp, + struct bpf_insn **insnp) +{ + struct bpf_insn *insn = *insnp; + u32 cnt; + + switch (fp->k) { + case SKF_AD_OFF + SKF_AD_PROTOCOL: + BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2); + + /* A = *(u16 *) (CTX + offsetof(protocol)) */ + *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, protocol)); + /* A = ntohs(A) [emitting a nop or swap16] */ + *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); + break; + + case SKF_AD_OFF + SKF_AD_PKTTYPE: + cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; + + case SKF_AD_OFF + SKF_AD_IFINDEX: + case SKF_AD_OFF + SKF_AD_HATYPE: + BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); + BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), + BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, dev)); + /* if (tmp != 0) goto pc + 1 */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); + *insn++ = BPF_EXIT_INSN(); + if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, + offsetof(struct net_device, ifindex)); + else + *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, + offsetof(struct net_device, type)); + break; + + case SKF_AD_OFF + SKF_AD_MARK: + cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; + + case SKF_AD_OFF + SKF_AD_RXHASH: + BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4); + + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, hash)); + break; + + case SKF_AD_OFF + SKF_AD_QUEUE: + cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; + + case SKF_AD_OFF + SKF_AD_VLAN_TAG: + cnt = convert_skb_access(SKF_AD_VLAN_TAG, + BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; + + case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: + cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, + BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; + + case SKF_AD_OFF + SKF_AD_VLAN_TPID: + BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2); + + /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ + *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, vlan_proto)); + /* A = ntohs(A) [emitting a nop or swap16] */ + *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); + break; + + case SKF_AD_OFF + SKF_AD_PAY_OFFSET: + case SKF_AD_OFF + SKF_AD_NLATTR: + case SKF_AD_OFF + SKF_AD_NLATTR_NEST: + case SKF_AD_OFF + SKF_AD_CPU: + case SKF_AD_OFF + SKF_AD_RANDOM: + /* arg1 = CTX */ + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + /* arg2 = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); + /* arg3 = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); + /* Emit call(arg1=CTX, arg2=A, arg3=X) */ + switch (fp->k) { + case SKF_AD_OFF + SKF_AD_PAY_OFFSET: + *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); + break; + case SKF_AD_OFF + SKF_AD_NLATTR: + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); + break; + case SKF_AD_OFF + SKF_AD_NLATTR_NEST: + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); + break; + case SKF_AD_OFF + SKF_AD_CPU: + *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); + break; + case SKF_AD_OFF + SKF_AD_RANDOM: + *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); + bpf_user_rnd_init_once(); + break; + } + break; + + case SKF_AD_OFF + SKF_AD_ALU_XOR_X: + /* A ^= X */ + *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); + break; + + default: + /* This is just a dummy call to avoid letting the compiler + * evict __bpf_call_base() as an optimization. Placed here + * where no-one bothers. + */ + BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); + return false; + } + + *insnp = insn; + return true; +} + +static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) +{ + const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); + int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); + bool endian = BPF_SIZE(fp->code) == BPF_H || + BPF_SIZE(fp->code) == BPF_W; + bool indirect = BPF_MODE(fp->code) == BPF_IND; + const int ip_align = NET_IP_ALIGN; + struct bpf_insn *insn = *insnp; + int offset = fp->k; + + if (!indirect && + ((unaligned_ok && offset >= 0) || + (!unaligned_ok && offset >= 0 && + offset + ip_align >= 0 && + offset + ip_align % size == 0))) { + bool ldx_off_ok = offset <= S16_MAX; + + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); + if (offset) + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, + size, 2 + endian + (!ldx_off_ok * 2)); + if (ldx_off_ok) { + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, + BPF_REG_D, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset); + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, + BPF_REG_TMP, 0); + } + if (endian) + *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); + *insn++ = BPF_JMP_A(8); + } + + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); + if (fp->k) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); + } + + switch (BPF_SIZE(fp->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); + break; + default: + return false; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn = BPF_EXIT_INSN(); + + *insnp = insn; + return true; +} /** - * sk_run_filter - run a filter on a socket - * @skb: buffer to run the filter on - * @fentry: filter to apply - * - * Decode and apply filter instructions to the skb->data. - * Return length to keep, 0 for none. @skb is the data we are - * filtering, @filter is the array of filter instructions. - * Because all jumps are guaranteed to be before last instruction, - * and last instruction guaranteed to be a RET, we dont need to check - * flen. (We used to pass to this function the length of filter) + * bpf_convert_filter - convert filter program + * @prog: the user passed filter program + * @len: the length of the user passed filter program + * @new_prog: allocated 'struct bpf_prog' or NULL + * @new_len: pointer to store length of converted program + * @seen_ld_abs: bool whether we've seen ld_abs/ind + * + * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' + * style extended BPF (eBPF). + * Conversion workflow: + * + * 1) First pass for calculating the new program length: + * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) + * + * 2) 2nd pass to remap in two passes: 1st pass finds new + * jump offsets, 2nd pass remapping: + * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ -unsigned int sk_run_filter(const struct sk_buff *skb, - const struct sock_filter *fentry) +static int bpf_convert_filter(struct sock_filter *prog, int len, + struct bpf_prog *new_prog, int *new_len, + bool *seen_ld_abs) { - void *ptr; - u32 A = 0; /* Accumulator */ - u32 X = 0; /* Index Register */ - u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ - u32 tmp; - int k; + int new_flen = 0, pass = 0, target, i, stack_off; + struct bpf_insn *new_insn, *first_insn = NULL; + struct sock_filter *fp; + int *addrs = NULL; + u8 bpf_src; - /* - * Process array of filter instructions. - */ - for (;; fentry++) { -#if defined(CONFIG_X86_32) -#define K (fentry->k) -#else - const u32 K = fentry->k; -#endif + BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); + BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - switch (fentry->code) { - case BPF_S_ALU_ADD_X: - A += X; - continue; - case BPF_S_ALU_ADD_K: - A += K; - continue; - case BPF_S_ALU_SUB_X: - A -= X; - continue; - case BPF_S_ALU_SUB_K: - A -= K; - continue; - case BPF_S_ALU_MUL_X: - A *= X; - continue; - case BPF_S_ALU_MUL_K: - A *= K; - continue; - case BPF_S_ALU_DIV_X: - if (X == 0) - return 0; - A /= X; - continue; - case BPF_S_ALU_DIV_K: - A = reciprocal_divide(A, K); - continue; - case BPF_S_ALU_MOD_X: - if (X == 0) - return 0; - A %= X; - continue; - case BPF_S_ALU_MOD_K: - A %= K; - continue; - case BPF_S_ALU_AND_X: - A &= X; - continue; - case BPF_S_ALU_AND_K: - A &= K; - continue; - case BPF_S_ALU_OR_X: - A |= X; - continue; - case BPF_S_ALU_OR_K: - A |= K; - continue; - case BPF_S_ANC_ALU_XOR_X: - case BPF_S_ALU_XOR_X: - A ^= X; - continue; - case BPF_S_ALU_XOR_K: - A ^= K; - continue; - case BPF_S_ALU_LSH_X: - A <<= X; - continue; - case BPF_S_ALU_LSH_K: - A <<= K; - continue; - case BPF_S_ALU_RSH_X: - A >>= X; - continue; - case BPF_S_ALU_RSH_K: - A >>= K; - continue; - case BPF_S_ALU_NEG: - A = -A; - continue; - case BPF_S_JMP_JA: - fentry += K; - continue; - case BPF_S_JMP_JGT_K: - fentry += (A > K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JGE_K: - fentry += (A >= K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JEQ_K: - fentry += (A == K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JSET_K: - fentry += (A & K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JGT_X: - fentry += (A > X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JGE_X: - fentry += (A >= X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JEQ_X: - fentry += (A == X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JSET_X: - fentry += (A & X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_LD_W_ABS: - k = K; -load_w: - ptr = load_pointer(skb, k, 4, &tmp); - if (ptr != NULL) { - A = get_unaligned_be32(ptr); - continue; + if (len <= 0 || len > BPF_MAXINSNS) + return -EINVAL; + + if (new_prog) { + first_insn = new_prog->insnsi; + addrs = kcalloc(len, sizeof(*addrs), + GFP_KERNEL | __GFP_NOWARN); + if (!addrs) + return -ENOMEM; + } + +do_pass: + new_insn = first_insn; + fp = prog; + + /* Classic BPF related prologue emission. */ + if (new_prog) { + /* Classic BPF expects A and X to be reset first. These need + * to be guaranteed to be the first two instructions. + */ + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + + /* All programs must keep CTX in callee saved BPF_REG_CTX. + * In eBPF case it's done by the compiler, here we need to + * do this ourself. Initial CTX is present in BPF_REG_ARG1. + */ + *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + if (*seen_ld_abs) { + /* For packet access in classic BPF, cache skb->data + * in callee-saved BPF R8 and skb->len - skb->data_len + * (headlen) in BPF R9. Since classic BPF is read-only + * on CTX, we only need to cache it once. + */ + *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + BPF_REG_D, BPF_REG_CTX, + offsetof(struct sk_buff, data)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, data_len)); + *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); + } + } else { + new_insn += 3; + } + + for (i = 0; i < len; fp++, i++) { + struct bpf_insn tmp_insns[32] = { }; + struct bpf_insn *insn = tmp_insns; + + if (addrs) + addrs[i] = new_insn - first_insn; + + switch (fp->code) { + /* All arithmetic insns and skb loads map as-is. */ + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_X: + case BPF_ALU | BPF_MOD | BPF_K: + case BPF_ALU | BPF_NEG: + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + /* Check for overloaded BPF extension and + * directly convert it if found, otherwise + * just move on with mapping. + */ + if (BPF_CLASS(fp->code) == BPF_LD && + BPF_MODE(fp->code) == BPF_ABS && + convert_bpf_extensions(fp, &insn)) + break; + if (BPF_CLASS(fp->code) == BPF_LD && + convert_bpf_ld_abs(fp, &insn)) { + *seen_ld_abs = true; + break; } - return 0; - case BPF_S_LD_H_ABS: - k = K; -load_h: - ptr = load_pointer(skb, k, 2, &tmp); - if (ptr != NULL) { - A = get_unaligned_be16(ptr); - continue; + + if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { + *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + /* Error with exception code on div/mod by 0. + * For cBPF programs, this was always return 0. + */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn++ = BPF_EXIT_INSN(); } - return 0; - case BPF_S_LD_B_ABS: - k = K; -load_b: - ptr = load_pointer(skb, k, 1, &tmp); - if (ptr != NULL) { - A = *(u8 *)ptr; - continue; + + *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); + break; + + /* Jump transformation cannot use BPF block macros + * everywhere as offset calculation and target updates + * require a bit more work than the rest, i.e. jump + * opcodes map as-is, but offsets need adjustment. + */ + +#define BPF_EMIT_JMP \ + do { \ + const s32 off_min = S16_MIN, off_max = S16_MAX; \ + s32 off; \ + \ + if (target >= len || target < 0) \ + goto err; \ + off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ + /* Adjust pc relative offset for 2nd or 3rd insn. */ \ + off -= insn - tmp_insns; \ + /* Reject anything not fitting into insn->off. */ \ + if (off < off_min || off > off_max) \ + goto err; \ + insn->off = off; \ + } while (0) + + case BPF_JMP | BPF_JA: + target = i + fp->k + 1; + insn->code = fp->code; + BPF_EMIT_JMP; + break; + + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { + /* BPF immediates are signed, zero extend + * immediate into tmp register and use it + * in compare insn. + */ + *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); + + insn->dst_reg = BPF_REG_A; + insn->src_reg = BPF_REG_TMP; + bpf_src = BPF_X; + } else { + insn->dst_reg = BPF_REG_A; + insn->imm = fp->k; + bpf_src = BPF_SRC(fp->code); + insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } - return 0; - case BPF_S_LD_W_LEN: - A = skb->len; - continue; - case BPF_S_LDX_W_LEN: - X = skb->len; - continue; - case BPF_S_LD_W_IND: - k = X + K; - goto load_w; - case BPF_S_LD_H_IND: - k = X + K; - goto load_h; - case BPF_S_LD_B_IND: - k = X + K; - goto load_b; - case BPF_S_LDX_B_MSH: - ptr = load_pointer(skb, K, 1, &tmp); - if (ptr != NULL) { - X = (*(u8 *)ptr & 0xf) << 2; - continue; + + /* Common case where 'jump_false' is next insn. */ + if (fp->jf == 0) { + insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; + target = i + fp->jt + 1; + BPF_EMIT_JMP; + break; } - return 0; - case BPF_S_LD_IMM: - A = K; - continue; - case BPF_S_LDX_IMM: - X = K; - continue; - case BPF_S_LD_MEM: - A = mem[K]; - continue; - case BPF_S_LDX_MEM: - X = mem[K]; - continue; - case BPF_S_MISC_TAX: - X = A; - continue; - case BPF_S_MISC_TXA: - A = X; - continue; - case BPF_S_RET_K: - return K; - case BPF_S_RET_A: - return A; - case BPF_S_ST: - mem[K] = A; - continue; - case BPF_S_STX: - mem[K] = X; - continue; - case BPF_S_ANC_PROTOCOL: - A = ntohs(skb->protocol); - continue; - case BPF_S_ANC_PKTTYPE: - A = skb->pkt_type; - continue; - case BPF_S_ANC_IFINDEX: - if (!skb->dev) - return 0; - A = skb->dev->ifindex; - continue; - case BPF_S_ANC_MARK: - A = skb->mark; - continue; - case BPF_S_ANC_QUEUE: - A = skb->queue_mapping; - continue; - case BPF_S_ANC_HATYPE: - if (!skb->dev) - return 0; - A = skb->dev->type; - continue; - case BPF_S_ANC_RXHASH: - A = skb->rxhash; - continue; - case BPF_S_ANC_CPU: - A = raw_smp_processor_id(); - continue; - case BPF_S_ANC_VLAN_TAG: - A = vlan_tx_tag_get(skb); - continue; - case BPF_S_ANC_VLAN_TAG_PRESENT: - A = !!vlan_tx_tag_present(skb); - continue; - case BPF_S_ANC_PAY_OFFSET: - A = __skb_get_poff(skb); - continue; - case BPF_S_ANC_NLATTR: { - struct nlattr *nla; - - if (skb_is_nonlinear(skb)) - return 0; - if (A > skb->len - sizeof(struct nlattr)) - return 0; - - nla = nla_find((struct nlattr *)&skb->data[A], - skb->len - A, X); - if (nla) - A = (void *)nla - (void *)skb->data; - else - A = 0; - continue; - } - case BPF_S_ANC_NLATTR_NEST: { - struct nlattr *nla; - if (skb_is_nonlinear(skb)) - return 0; - if (A > skb->len - sizeof(struct nlattr)) - return 0; + /* Convert some jumps when 'jump_true' is next insn. */ + if (fp->jt == 0) { + switch (BPF_OP(fp->code)) { + case BPF_JEQ: + insn->code = BPF_JMP | BPF_JNE | bpf_src; + break; + case BPF_JGT: + insn->code = BPF_JMP | BPF_JLE | bpf_src; + break; + case BPF_JGE: + insn->code = BPF_JMP | BPF_JLT | bpf_src; + break; + default: + goto jmp_rest; + } - nla = (struct nlattr *)&skb->data[A]; - if (nla->nla_len > A - skb->len) - return 0; + target = i + fp->jf + 1; + BPF_EMIT_JMP; + break; + } +jmp_rest: + /* Other jumps are mapped into two insns: Jxx and JA. */ + target = i + fp->jt + 1; + insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; + BPF_EMIT_JMP; + insn++; - nla = nla_find_nested(nla, X); - if (nla) - A = (void *)nla - (void *)skb->data; - else - A = 0; - continue; + insn->code = BPF_JMP | BPF_JA; + target = i + fp->jf + 1; + BPF_EMIT_JMP; + break; + + /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ + case BPF_LDX | BPF_MSH | BPF_B: { + struct sock_filter tmp = { + .code = BPF_LD | BPF_ABS | BPF_B, + .k = fp->k, + }; + + *seen_ld_abs = true; + + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); + /* A = BPF_R0 = *(u8 *) (skb->data + K) */ + convert_bpf_ld_abs(&tmp, &insn); + insn++; + /* A &= 0xf */ + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); + /* A <<= 2 */ + *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); + /* tmp = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); + /* A = tmp */ + *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); + break; } -#ifdef CONFIG_SECCOMP_FILTER - case BPF_S_ANC_SECCOMP_LD_W: - A = seccomp_bpf_load(fentry->k); - continue; -#endif + /* RET_K is remapped into 2 insns. RET_A case doesn't need an + * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. + */ + case BPF_RET | BPF_A: + case BPF_RET | BPF_K: + if (BPF_RVAL(fp->code) == BPF_K) + *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, + 0, fp->k); + *insn = BPF_EXIT_INSN(); + break; + + /* Store to stack. */ + case BPF_ST: + case BPF_STX: + stack_off = fp->k * 4 + 4; + *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == + BPF_ST ? BPF_REG_A : BPF_REG_X, + -stack_off); + /* check_load_and_stores() verifies that classic BPF can + * load from stack only after write, so tracking + * stack_depth for ST|STX insns is enough + */ + if (new_prog && new_prog->aux->stack_depth < stack_off) + new_prog->aux->stack_depth = stack_off; + break; + + /* Load from stack. */ + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + stack_off = fp->k * 4 + 4; + *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? + BPF_REG_A : BPF_REG_X, BPF_REG_FP, + -stack_off); + break; + + /* A = K or X = K */ + case BPF_LD | BPF_IMM: + case BPF_LDX | BPF_IMM: + *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? + BPF_REG_A : BPF_REG_X, fp->k); + break; + + /* X = A */ + case BPF_MISC | BPF_TAX: + *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); + break; + + /* A = X */ + case BPF_MISC | BPF_TXA: + *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); + break; + + /* A = skb->len or X = skb->len */ + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LDX | BPF_W | BPF_LEN: + *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? + BPF_REG_A : BPF_REG_X, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + break; + + /* Access seccomp_data fields. */ + case BPF_LDX | BPF_ABS | BPF_W: + /* A = *(u32 *) (ctx + K) */ + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); + break; + + /* Unknown instruction. */ default: - WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n", - fentry->code, fentry->jt, - fentry->jf, fentry->k); - return 0; + goto err; } + + insn++; + if (new_prog) + memcpy(new_insn, tmp_insns, + sizeof(*insn) * (insn - tmp_insns)); + new_insn += insn - tmp_insns; + } + + if (!new_prog) { + /* Only calculating new length. */ + *new_len = new_insn - first_insn; + if (*seen_ld_abs) + *new_len += 4; /* Prologue bits. */ + return 0; + } + + pass++; + if (new_flen != new_insn - first_insn) { + new_flen = new_insn - first_insn; + if (pass > 2) + goto err; + goto do_pass; } + kfree(addrs); + BUG_ON(*new_len != new_flen); return 0; +err: + kfree(addrs); + return -EINVAL; } -EXPORT_SYMBOL(sk_run_filter); -/* - * Security : - * A BPF program is able to use 16 cells of memory to store intermediate - * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()) +/* Security: + * * As we dont want to clear mem[] array for each packet going through - * sk_run_filter(), we check that filter loaded by user never try to read + * __bpf_prog_run(), we check that filter loaded by user never try to read * a cell if not previously written, and we check all branches to be sure * a malicious user doesn't try to abuse us. */ -static int check_load_and_stores(struct sock_filter *filter, int flen) +static int check_load_and_stores(const struct sock_filter *filter, int flen) { - u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */ + u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; BUILD_BUG_ON(BPF_MEMWORDS > 16); - masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); + + masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; + memset(masks, 0xff, flen * sizeof(*masks)); for (pc = 0; pc < flen; pc++) { memvalid &= masks[pc]; switch (filter[pc].code) { - case BPF_S_ST: - case BPF_S_STX: + case BPF_ST: + case BPF_STX: memvalid |= (1 << filter[pc].k); break; - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: if (!(memvalid & (1 << filter[pc].k))) { ret = -EINVAL; goto error; } break; - case BPF_S_JMP_JA: - /* a jump must set masks on target */ + case BPF_JMP | BPF_JA: + /* A jump must set masks on target */ masks[pc + 1 + filter[pc].k] &= memvalid; memvalid = ~0; break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_X: - case BPF_S_JMP_JSET_K: - /* a jump must set masks on targets */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + /* A jump must set masks on targets */ masks[pc + 1 + filter[pc].jt] &= memvalid; masks[pc + 1 + filter[pc].jf] &= memvalid; memvalid = ~0; @@ -463,8 +987,85 @@ error: return ret; } +static bool chk_code_allowed(u16 code_to_probe) +{ + static const bool codes[] = { + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_K] = true, + [BPF_ALU | BPF_ADD | BPF_X] = true, + [BPF_ALU | BPF_SUB | BPF_K] = true, + [BPF_ALU | BPF_SUB | BPF_X] = true, + [BPF_ALU | BPF_MUL | BPF_K] = true, + [BPF_ALU | BPF_MUL | BPF_X] = true, + [BPF_ALU | BPF_DIV | BPF_K] = true, + [BPF_ALU | BPF_DIV | BPF_X] = true, + [BPF_ALU | BPF_MOD | BPF_K] = true, + [BPF_ALU | BPF_MOD | BPF_X] = true, + [BPF_ALU | BPF_AND | BPF_K] = true, + [BPF_ALU | BPF_AND | BPF_X] = true, + [BPF_ALU | BPF_OR | BPF_K] = true, + [BPF_ALU | BPF_OR | BPF_X] = true, + [BPF_ALU | BPF_XOR | BPF_K] = true, + [BPF_ALU | BPF_XOR | BPF_X] = true, + [BPF_ALU | BPF_LSH | BPF_K] = true, + [BPF_ALU | BPF_LSH | BPF_X] = true, + [BPF_ALU | BPF_RSH | BPF_K] = true, + [BPF_ALU | BPF_RSH | BPF_X] = true, + [BPF_ALU | BPF_NEG] = true, + /* Load instructions */ + [BPF_LD | BPF_W | BPF_ABS] = true, + [BPF_LD | BPF_H | BPF_ABS] = true, + [BPF_LD | BPF_B | BPF_ABS] = true, + [BPF_LD | BPF_W | BPF_LEN] = true, + [BPF_LD | BPF_W | BPF_IND] = true, + [BPF_LD | BPF_H | BPF_IND] = true, + [BPF_LD | BPF_B | BPF_IND] = true, + [BPF_LD | BPF_IMM] = true, + [BPF_LD | BPF_MEM] = true, + [BPF_LDX | BPF_W | BPF_LEN] = true, + [BPF_LDX | BPF_B | BPF_MSH] = true, + [BPF_LDX | BPF_IMM] = true, + [BPF_LDX | BPF_MEM] = true, + /* Store instructions */ + [BPF_ST] = true, + [BPF_STX] = true, + /* Misc instructions */ + [BPF_MISC | BPF_TAX] = true, + [BPF_MISC | BPF_TXA] = true, + /* Return instructions */ + [BPF_RET | BPF_K] = true, + [BPF_RET | BPF_A] = true, + /* Jump instructions */ + [BPF_JMP | BPF_JA] = true, + [BPF_JMP | BPF_JEQ | BPF_K] = true, + [BPF_JMP | BPF_JEQ | BPF_X] = true, + [BPF_JMP | BPF_JGE | BPF_K] = true, + [BPF_JMP | BPF_JGE | BPF_X] = true, + [BPF_JMP | BPF_JGT | BPF_K] = true, + [BPF_JMP | BPF_JGT | BPF_X] = true, + [BPF_JMP | BPF_JSET | BPF_K] = true, + [BPF_JMP | BPF_JSET | BPF_X] = true, + }; + + if (code_to_probe >= ARRAY_SIZE(codes)) + return false; + + return codes[code_to_probe]; +} + +static bool bpf_check_basics_ok(const struct sock_filter *filter, + unsigned int flen) +{ + if (filter == NULL) + return false; + if (flen == 0 || flen > BPF_MAXINSNS) + return false; + + return true; +} + /** - * sk_chk_filter - verify socket filter code + * bpf_check_classic - verify socket filter code * @filter: filter to verify * @flen: length of filter * @@ -477,237 +1078,468 @@ error: * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int sk_chk_filter(struct sock_filter *filter, unsigned int flen) +static int bpf_check_classic(const struct sock_filter *filter, + unsigned int flen) { - /* - * Valid instructions are initialized to non-0. - * Invalid instructions are initialized to 0. - */ - static const u8 codes[] = { - [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K, - [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X, - [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K, - [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X, - [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K, - [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X, - [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X, - [BPF_ALU|BPF_MOD|BPF_K] = BPF_S_ALU_MOD_K, - [BPF_ALU|BPF_MOD|BPF_X] = BPF_S_ALU_MOD_X, - [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K, - [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X, - [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K, - [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X, - [BPF_ALU|BPF_XOR|BPF_K] = BPF_S_ALU_XOR_K, - [BPF_ALU|BPF_XOR|BPF_X] = BPF_S_ALU_XOR_X, - [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K, - [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X, - [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K, - [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X, - [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG, - [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS, - [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS, - [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS, - [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN, - [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND, - [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND, - [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND, - [BPF_LD|BPF_IMM] = BPF_S_LD_IMM, - [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN, - [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH, - [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM, - [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX, - [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA, - [BPF_RET|BPF_K] = BPF_S_RET_K, - [BPF_RET|BPF_A] = BPF_S_RET_A, - [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K, - [BPF_LD|BPF_MEM] = BPF_S_LD_MEM, - [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM, - [BPF_ST] = BPF_S_ST, - [BPF_STX] = BPF_S_STX, - [BPF_JMP|BPF_JA] = BPF_S_JMP_JA, - [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K, - [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X, - [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K, - [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X, - [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K, - [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X, - [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K, - [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X, - }; - int pc; bool anc_found; + int pc; - if (flen == 0 || flen > BPF_MAXINSNS) - return -EINVAL; - - /* check the filter code now */ + /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { - struct sock_filter *ftest = &filter[pc]; - u16 code = ftest->code; + const struct sock_filter *ftest = &filter[pc]; - if (code >= ARRAY_SIZE(codes)) - return -EINVAL; - code = codes[code]; - if (!code) + /* May we actually operate on this code? */ + if (!chk_code_allowed(ftest->code)) return -EINVAL; + /* Some instructions need special checks */ - switch (code) { - case BPF_S_ALU_DIV_K: - /* check for division by zero */ + switch (ftest->code) { + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: + /* Check for division by zero */ if (ftest->k == 0) return -EINVAL; - ftest->k = reciprocal_value(ftest->k); break; - case BPF_S_ALU_MOD_K: - /* check for division by zero */ - if (ftest->k == 0) + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_K: + if (ftest->k >= 32) return -EINVAL; break; - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: - case BPF_S_ST: - case BPF_S_STX: - /* check for invalid memory addresses */ + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + case BPF_ST: + case BPF_STX: + /* Check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; - case BPF_S_JMP_JA: - /* - * Note, the large ftest->k might cause loops. + case BPF_JMP | BPF_JA: + /* Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, * where offsets are limited. --ANK (981016) */ - if (ftest->k >= (unsigned int)(flen-pc-1)) + if (ftest->k >= (unsigned int)(flen - pc - 1)) return -EINVAL; break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_X: - case BPF_S_JMP_JSET_K: - /* for conditionals both must be safe */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + /* Both conditionals must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: anc_found = false; -#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ - code = BPF_S_ANC_##CODE; \ - anc_found = true; \ - break - switch (ftest->k) { - ANCILLARY(PROTOCOL); - ANCILLARY(PKTTYPE); - ANCILLARY(IFINDEX); - ANCILLARY(NLATTR); - ANCILLARY(NLATTR_NEST); - ANCILLARY(MARK); - ANCILLARY(QUEUE); - ANCILLARY(HATYPE); - ANCILLARY(RXHASH); - ANCILLARY(CPU); - ANCILLARY(ALU_XOR_X); - ANCILLARY(VLAN_TAG); - ANCILLARY(VLAN_TAG_PRESENT); - ANCILLARY(PAY_OFFSET); - } - - /* ancillary operation unknown or unsupported */ + if (bpf_anc_helper(ftest) & BPF_ANC) + anc_found = true; + /* Ancillary operation unknown or unsupported */ if (anc_found == false && ftest->k >= SKF_AD_OFF) return -EINVAL; } - ftest->code = code; } - /* last instruction must be a RET code */ + /* Last instruction must be a RET code */ switch (filter[flen - 1].code) { - case BPF_S_RET_K: - case BPF_S_RET_A: + case BPF_RET | BPF_K: + case BPF_RET | BPF_A: return check_load_and_stores(filter, flen); } + return -EINVAL; } -EXPORT_SYMBOL(sk_chk_filter); + +static int bpf_prog_store_orig_filter(struct bpf_prog *fp, + const struct sock_fprog *fprog) +{ + unsigned int fsize = bpf_classic_proglen(fprog); + struct sock_fprog_kern *fkprog; + + fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); + if (!fp->orig_prog) + return -ENOMEM; + + fkprog = fp->orig_prog; + fkprog->len = fprog->len; + + fkprog->filter = kmemdup(fp->insns, fsize, + GFP_KERNEL | __GFP_NOWARN); + if (!fkprog->filter) { + kfree(fp->orig_prog); + return -ENOMEM; + } + + return 0; +} + +static void bpf_release_orig_filter(struct bpf_prog *fp) +{ + struct sock_fprog_kern *fprog = fp->orig_prog; + + if (fprog) { + kfree(fprog->filter); + kfree(fprog); + } +} + +static void __bpf_prog_release(struct bpf_prog *prog) +{ + if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(prog); + } else { + bpf_release_orig_filter(prog); + bpf_prog_free(prog); + } +} + +static void __sk_filter_release(struct sk_filter *fp) +{ + __bpf_prog_release(fp->prog); + kfree(fp); +} /** * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ -void sk_filter_release_rcu(struct rcu_head *rcu) +static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); - bpf_jit_free(fp); - kfree(fp); + __sk_filter_release(fp); } -EXPORT_SYMBOL(sk_filter_release_rcu); -static int __sk_prepare_filter(struct sk_filter *fp) +/** + * sk_filter_release - release a socket filter + * @fp: filter to remove + * + * Remove a filter from a socket and release its resources. + */ +static void sk_filter_release(struct sk_filter *fp) { - int err; + if (refcount_dec_and_test(&fp->refcnt)) + call_rcu(&fp->rcu, sk_filter_release_rcu); +} - fp->bpf_func = sk_run_filter; +void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) +{ + u32 filter_size = bpf_prog_size(fp->prog->len); - err = sk_chk_filter(fp->insns, fp->len); + atomic_sub(filter_size, &sk->sk_omem_alloc); + sk_filter_release(fp); +} + +/* try to charge the socket memory if there is space available + * return true on success + */ +static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) +{ + int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + u32 filter_size = bpf_prog_size(fp->prog->len); + + /* same check as in sock_kmalloc() */ + if (filter_size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) { + atomic_add(filter_size, &sk->sk_omem_alloc); + return true; + } + return false; +} + +bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) +{ + if (!refcount_inc_not_zero(&fp->refcnt)) + return false; + + if (!__sk_filter_charge(sk, fp)) { + sk_filter_release(fp); + return false; + } + return true; +} + +static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) +{ + struct sock_filter *old_prog; + struct bpf_prog *old_fp; + int err, new_len, old_len = fp->len; + bool seen_ld_abs = false; + + /* We are free to overwrite insns et al right here as it won't be used at + * this point in time anymore internally after the migration to the eBPF + * instruction representation. + */ + BUILD_BUG_ON(sizeof(struct sock_filter) != + sizeof(struct bpf_insn)); + + /* Conversion cannot happen on overlapping memory areas, + * so we need to keep the user BPF around until the 2nd + * pass. At this time, the user BPF is stored in fp->insns. + */ + old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter), + GFP_KERNEL | __GFP_NOWARN); + if (!old_prog) { + err = -ENOMEM; + goto out_err; + } + + /* 1st pass: calculate the new program length. */ + err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, + &seen_ld_abs); if (err) - return err; + goto out_err_free; + + /* Expand fp for appending the new filter representation. */ + old_fp = fp; + fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); + if (!fp) { + /* The old_fp is still around in case we couldn't + * allocate new memory, so uncharge on that one. + */ + fp = old_fp; + err = -ENOMEM; + goto out_err_free; + } + + fp->len = new_len; + + /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ + err = bpf_convert_filter(old_prog, old_len, fp, &new_len, + &seen_ld_abs); + if (err) + /* 2nd bpf_convert_filter() can fail only if it fails + * to allocate memory, remapping must succeed. Note, + * that at this time old_fp has already been released + * by krealloc(). + */ + goto out_err_free; + + fp = bpf_prog_select_runtime(fp, &err); + if (err) + goto out_err_free; + + kfree(old_prog); + return fp; + +out_err_free: + kfree(old_prog); +out_err: + __bpf_prog_release(fp); + return ERR_PTR(err); +} +static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans) +{ + int err; + + fp->bpf_func = NULL; + fp->jited = 0; + + err = bpf_check_classic(fp->insns, fp->len); + if (err) { + __bpf_prog_release(fp); + return ERR_PTR(err); + } + + /* There might be additional checks and transformations + * needed on classic filters, f.e. in case of seccomp. + */ + if (trans) { + err = trans(fp->insns, fp->len); + if (err) { + __bpf_prog_release(fp); + return ERR_PTR(err); + } + } + + /* Probe if we can JIT compile the filter and if so, do + * the compilation of the filter. + */ bpf_jit_compile(fp); - return 0; + + /* JIT compiler couldn't process this filter, so do the eBPF translation + * for the optimized interpreter. + */ + if (!fp->jited) + fp = bpf_migrate_filter(fp); + + return fp; } /** - * sk_unattached_filter_create - create an unattached filter - * @fprog: the filter program + * bpf_prog_create - create an unattached filter * @pfp: the unattached filter that is created + * @fprog: the filter program * * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. */ -int sk_unattached_filter_create(struct sk_filter **pfp, - struct sock_fprog *fprog) +int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) { - struct sk_filter *fp; - unsigned int fsize = sizeof(struct sock_filter) * fprog->len; - int err; + unsigned int fsize = bpf_classic_proglen(fprog); + struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ - if (fprog->filter == NULL) + if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; - fp = kmalloc(fsize + sizeof(*fp), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; + memcpy(fp->insns, fprog->filter, fsize); - atomic_set(&fp->refcnt, 1); fp->len = fprog->len; + /* Since unattached filters are not copied back to user + * space through sk_get_filter(), we do not need to hold + * a copy here, and can spare us the work. + */ + fp->orig_prog = NULL; - err = __sk_prepare_filter(fp); - if (err) - goto free_mem; + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. + */ + fp = bpf_prepare_filter(fp, NULL); + if (IS_ERR(fp)) + return PTR_ERR(fp); *pfp = fp; return 0; -free_mem: - kfree(fp); - return err; } -EXPORT_SYMBOL_GPL(sk_unattached_filter_create); +EXPORT_SYMBOL_GPL(bpf_prog_create); -void sk_unattached_filter_destroy(struct sk_filter *fp) +/** + * bpf_prog_create_from_user - create an unattached filter from user buffer + * @pfp: the unattached filter that is created + * @fprog: the filter program + * @trans: post-classic verifier transformation handler + * @save_orig: save classic BPF program + * + * This function effectively does the same as bpf_prog_create(), only + * that it builds up its insns buffer from user space provided buffer. + * It also allows for passing a bpf_aux_classic_check_t handler. + */ +int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, + bpf_aux_classic_check_t trans, bool save_orig) { - sk_filter_release(fp); + unsigned int fsize = bpf_classic_proglen(fprog); + struct bpf_prog *fp; + int err; + + /* Make sure new filter is there and in the right amounts. */ + if (!bpf_check_basics_ok(fprog->filter, fprog->len)) + return -EINVAL; + + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); + if (!fp) + return -ENOMEM; + + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + __bpf_prog_free(fp); + return -EFAULT; + } + + fp->len = fprog->len; + fp->orig_prog = NULL; + + if (save_orig) { + err = bpf_prog_store_orig_filter(fp, fprog); + if (err) { + __bpf_prog_free(fp); + return -ENOMEM; + } + } + + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. + */ + fp = bpf_prepare_filter(fp, trans); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + *pfp = fp; + return 0; +} +EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); + +void bpf_prog_destroy(struct bpf_prog *fp) +{ + __bpf_prog_release(fp); +} +EXPORT_SYMBOL_GPL(bpf_prog_destroy); + +static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) +{ + struct sk_filter *fp, *old_fp; + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + fp->prog = prog; + + if (!__sk_filter_charge(sk, fp)) { + kfree(fp); + return -ENOMEM; + } + refcount_set(&fp->refcnt, 1); + + old_fp = rcu_dereference_protected(sk->sk_filter, + lockdep_sock_is_held(sk)); + rcu_assign_pointer(sk->sk_filter, fp); + + if (old_fp) + sk_filter_uncharge(sk, old_fp); + + return 0; +} + +static +struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) +{ + unsigned int fsize = bpf_classic_proglen(fprog); + struct bpf_prog *prog; + int err; + + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return ERR_PTR(-EPERM); + + /* Make sure new filter is there and in the right amounts. */ + if (!bpf_check_basics_ok(fprog->filter, fprog->len)) + return ERR_PTR(-EINVAL); + + prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); + if (!prog) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(prog->insns, fprog->filter, fsize)) { + __bpf_prog_free(prog); + return ERR_PTR(-EFAULT); + } + + prog->len = fprog->len; + + err = bpf_prog_store_orig_filter(prog, fprog); + if (err) { + __bpf_prog_free(prog); + return ERR_PTR(-ENOMEM); + } + + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. + */ + return bpf_prepare_filter(prog, NULL); } -EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); /** * sk_attach_filter - attach a socket filter @@ -721,43 +1553,9795 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); */ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { - struct sk_filter *fp, *old_fp; - unsigned int fsize = sizeof(struct sock_filter) * fprog->len; + struct bpf_prog *prog = __get_filter(fprog, sk); + int err; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = __sk_attach_prog(prog, sk); + if (err < 0) { + __bpf_prog_release(prog); + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(sk_attach_filter); + +int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct bpf_prog *prog = __get_filter(fprog, sk); + int err, optmem_max; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + if (bpf_prog_size(prog->len) > optmem_max) + err = -ENOMEM; + else + err = reuseport_attach_prog(sk, prog); + + if (err) + __bpf_prog_release(prog); + + return err; +} + +static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) +{ + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return ERR_PTR(-EPERM); + + return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); +} + +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog = __get_bpf(ufd, sk); int err; + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = __sk_attach_prog(prog, sk); + if (err < 0) { + bpf_prog_put(prog); + return err; + } + + return 0; +} + +int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog; + int err, optmem_max; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; - /* Make sure new filter is there and in the right amounts. */ - if (fprog->filter == NULL) + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); + if (PTR_ERR(prog) == -EINVAL) + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { + /* Like other non BPF_PROG_TYPE_SOCKET_FILTER + * bpf prog (e.g. sockmap). It depends on the + * limitation imposed by bpf_prog_load(). + * Hence, sysctl_optmem_max is not checked. + */ + if ((sk->sk_type != SOCK_STREAM && + sk->sk_type != SOCK_DGRAM) || + (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_TCP) || + (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6)) { + err = -ENOTSUPP; + goto err_prog_put; + } + } else { + /* BPF_PROG_TYPE_SOCKET_FILTER */ + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + if (bpf_prog_size(prog->len) > optmem_max) { + err = -ENOMEM; + goto err_prog_put; + } + } + + err = reuseport_attach_prog(sk, prog); +err_prog_put: + if (err) + bpf_prog_put(prog); + + return err; +} + +void sk_reuseport_prog_free(struct bpf_prog *prog) +{ + if (!prog) + return; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + bpf_prog_put(prog); + else + bpf_prog_destroy(prog); +} + +static inline int __bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ +#ifdef CONFIG_DEBUG_NET + /* Avoid a splat in pskb_may_pull_reason() */ + if (write_len > INT_MAX) return -EINVAL; +#endif + return skb_ensure_writable(skb, write_len); +} - fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); - if (!fp) +static inline int bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + int err = __bpf_try_make_writable(skb, write_len); + + bpf_compute_data_pointers(skb); + return err; +} + +static int bpf_try_make_head_writable(struct sk_buff *skb) +{ + return bpf_try_make_writable(skb, skb_headlen(skb)); +} + +static inline void bpf_push_mac_rcsum(struct sk_buff *skb) +{ + if (skb_at_tc_ingress(skb)) + skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); +} + +static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) +{ + if (skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); +} + +BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len, u64, flags) +{ + void *ptr; + + if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) + return -EINVAL; + if (unlikely(offset > INT_MAX)) + return -EFAULT; + if (unlikely(bpf_try_make_writable(skb, offset + len))) + return -EFAULT; + + ptr = skb->data + offset; + if (flags & BPF_F_RECOMPUTE_CSUM) + __skb_postpull_rcsum(skb, ptr, len, offset); + + memcpy(ptr, from, len); + + if (flags & BPF_F_RECOMPUTE_CSUM) + __skb_postpush_rcsum(skb, ptr, len, offset); + if (flags & BPF_F_INVALIDATE_HASH) + skb_clear_hash(skb); + + return 0; +} + +static const struct bpf_func_proto bpf_skb_store_bytes_proto = { + .func = bpf_skb_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, + u32 len, u64 flags) +{ + return ____bpf_skb_store_bytes(skb, offset, from, len, flags); +} + +BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, + void *, to, u32, len) +{ + void *ptr; + + if (unlikely(offset > INT_MAX)) + goto err_clear; + + ptr = skb_header_pointer(skb, offset, len, to); + if (unlikely(!ptr)) + goto err_clear; + if (ptr != to) + memcpy(to, ptr, len); + + return 0; +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_skb_load_bytes_proto = { + .func = bpf_skb_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) +{ + return ____bpf_skb_load_bytes(skb, offset, to, len); +} + +BPF_CALL_4(bpf_flow_dissector_load_bytes, + const struct bpf_flow_dissector *, ctx, u32, offset, + void *, to, u32, len) +{ + void *ptr; + + if (unlikely(offset > 0xffff)) + goto err_clear; + + if (unlikely(!ctx->skb)) + goto err_clear; + + ptr = skb_header_pointer(ctx->skb, offset, len, to); + if (unlikely(!ptr)) + goto err_clear; + if (ptr != to) + memcpy(to, ptr, len); + + return 0; +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = { + .func = bpf_flow_dissector_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, + u32, offset, void *, to, u32, len, u32, start_header) +{ + u8 *end = skb_tail_pointer(skb); + u8 *start, *ptr; + + if (unlikely(offset > 0xffff)) + goto err_clear; + + switch (start_header) { + case BPF_HDR_START_MAC: + if (unlikely(!skb_mac_header_was_set(skb))) + goto err_clear; + start = skb_mac_header(skb); + break; + case BPF_HDR_START_NET: + start = skb_network_header(skb); + break; + default: + goto err_clear; + } + + ptr = start + offset; + + if (likely(ptr + len <= end)) { + memcpy(to, ptr, len); + return 0; + } + +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { + .func = bpf_skb_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto bpf_skb_pull_data_proto = { + .func = bpf_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) +{ + return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sk_fullsock_proto = { + .func = bpf_sk_fullsock, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + +static inline int sk_skb_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + return __bpf_try_make_writable(skb, write_len); +} + +BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto sk_skb_pull_data_proto = { + .func = sk_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) +{ + __sum16 *ptr; + + if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) + return -EINVAL; + if (unlikely(offset > 0xffff || offset & 1)) + return -EFAULT; + if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) + return -EFAULT; + + ptr = (__sum16 *)(skb->data + offset); + switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + csum_replace_by_diff(ptr, to); + break; + case 2: + csum_replace2(ptr, from, to); + break; + case 4: + csum_replace4(ptr, from, to); + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_l3_csum_replace_proto = { + .func = bpf_l3_csum_replace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) +{ + bool is_pseudo = flags & BPF_F_PSEUDO_HDR; + bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; + bool do_mforce = flags & BPF_F_MARK_ENFORCE; + bool is_ipv6 = flags & BPF_F_IPV6; + __sum16 *ptr; + + if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | + BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6))) + return -EINVAL; + if (unlikely(offset > 0xffff || offset & 1)) + return -EFAULT; + if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) + return -EFAULT; + + ptr = (__sum16 *)(skb->data + offset); + if (is_mmzero && !do_mforce && !*ptr) + return 0; + + switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6); + break; + case 2: + inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); + break; + case 4: + inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); + break; + default: + return -EINVAL; + } + + if (is_mmzero && !*ptr) + *ptr = CSUM_MANGLED_0; + return 0; +} + +static const struct bpf_func_proto bpf_l4_csum_replace_proto = { + .func = bpf_l4_csum_replace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, + __be32 *, to, u32, to_size, __wsum, seed) +{ + /* This is quite flexible, some examples: + * + * from_size == 0, to_size > 0, seed := csum --> pushing data + * from_size > 0, to_size == 0, seed := csum --> pulling data + * from_size > 0, to_size > 0, seed := 0 --> diffing data + * + * Even for diffing, from_size and to_size don't need to be equal. + */ + + __wsum ret = seed; + + if (from_size && to_size) + ret = csum_sub(csum_partial(to, to_size, ret), + csum_partial(from, from_size, 0)); + else if (to_size) + ret = csum_partial(to, to_size, ret); + + else if (from_size) + ret = ~csum_partial(from, from_size, ~ret); + + return csum_from32to16((__force unsigned int)ret); +} + +static const struct bpf_func_proto bpf_csum_diff_proto = { + .func = bpf_csum_diff, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) +{ + /* The interface is to be used in combination with bpf_csum_diff() + * for direct packet writes. csum rotation for alignment as well + * as emulating csum_sub() can be done from the eBPF program. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + return (skb->csum = csum_add(skb->csum, csum)); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_csum_update_proto = { + .func = bpf_csum_update, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) +{ + /* The interface is to be used in combination with bpf_skb_adjust_room() + * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET + * is passed as flags, for example. + */ + switch (level) { + case BPF_CSUM_LEVEL_INC: + __skb_incr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_DEC: + __skb_decr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_RESET: + __skb_reset_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_QUERY: + return skb->ip_summed == CHECKSUM_UNNECESSARY ? + skb->csum_level : -EACCES; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_csum_level_proto = { + .func = bpf_csum_level, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) +{ + return dev_forward_skb_nomtu(dev, skb); +} + +static inline int __bpf_rx_skb_no_mac(struct net_device *dev, + struct sk_buff *skb) +{ + int ret = ____dev_forward_skb(dev, skb, false); + + if (likely(!ret)) { + skb->dev = dev; + ret = netif_rx(skb); + } + + return ret; +} + +static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) +{ + int ret; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + kfree_skb(skb); + return -ENETDOWN; + } + + skb->dev = dev; + skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb)); + skb_clear_tstamp(skb); + + dev_xmit_recursion_inc(); + ret = dev_queue_xmit(skb); + dev_xmit_recursion_dec(); + + return ret; +} + +static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + unsigned int mlen = skb_network_offset(skb); + + if (unlikely(skb->len <= mlen)) { + kfree_skb(skb); + return -ERANGE; + } + + if (mlen) { + __skb_pull(skb, mlen); + + /* At ingress, the mac header has already been pulled once. + * At egress, skb_pospull_rcsum has to be done in case that + * the skb is originated from ingress (i.e. a forwarded skb) + * to ensure that rcsum starts at net header. + */ + if (!skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); + } + skb_pop_mac_header(skb); + skb_reset_mac_len(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + /* Verify that a link layer header is carried */ + if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) { + kfree_skb(skb); + return -ERANGE; + } + + bpf_push_mac_rcsum(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + if (dev_is_mac_header_xmit(dev)) + return __bpf_redirect_common(skb, dev, flags); + else + return __bpf_redirect_no_mac(skb, dev, flags); +} + +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) +{ + u32 hh_len = LL_RESERVED_SPACE(dev); + const struct in6_addr *nexthop; + struct dst_entry *dst = NULL; + struct neighbour *neigh; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + goto out_drop; + } + + skb->dev = dev; + skb_clear_tstamp(skb); + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + skb = skb_expand_head(skb, hh_len); + if (!skb) + return -ENOMEM; + } + + rcu_read_lock(); + if (!nh) { + dst = skb_dst(skb); + nexthop = rt6_nexthop(dst_rt6_info(dst), + &ipv6_hdr(skb)->daddr); + } else { + nexthop = &nh->ipv6_nh; + } + neigh = ip_neigh_gw6(dev, nexthop); + if (likely(!IS_ERR(neigh))) { + int ret; + + sock_confirm_neigh(skb, neigh); + local_bh_disable(); + dev_xmit_recursion_inc(); + ret = neigh_output(neigh, skb, false); + dev_xmit_recursion_dec(); + local_bh_enable(); + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); + if (dst) + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); +out_drop: + kfree_skb(skb); + return -ENETDOWN; +} + +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct net *net = dev_net(dev); + int err, ret = NET_XMIT_DROP; + + if (!nh) { + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowi6_mark = skb->mark, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_oif = dev->ifindex, + .flowi6_proto = ip6h->nexthdr, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + }; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + goto out_drop; + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + } else if (nh->nh_family != AF_INET6) { + goto out_drop; + } + + err = bpf_out_neigh_v6(net, skb, dev, nh); + if (unlikely(net_xmit_eval(err))) + DEV_STATS_INC(dev, tx_errors); + else + ret = NET_XMIT_SUCCESS; + goto out_xmit; +out_drop: + DEV_STATS_INC(dev, tx_errors); + kfree_skb(skb); +out_xmit: + return ret; +} +#else +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} +#endif /* CONFIG_IPV6 */ + +#if IS_ENABLED(CONFIG_INET) +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) +{ + u32 hh_len = LL_RESERVED_SPACE(dev); + struct neighbour *neigh; + bool is_v6gw = false; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + goto out_drop; + } + + skb->dev = dev; + skb_clear_tstamp(skb); + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + skb = skb_expand_head(skb, hh_len); + if (!skb) + return -ENOMEM; + } + + rcu_read_lock(); + if (!nh) { + struct rtable *rt = skb_rtable(skb); + + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + } else if (nh->nh_family == AF_INET6) { + neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); + is_v6gw = true; + } else if (nh->nh_family == AF_INET) { + neigh = ip_neigh_gw4(dev, nh->ipv4_nh); + } else { + rcu_read_unlock(); + goto out_drop; + } + + if (likely(!IS_ERR(neigh))) { + int ret; + + sock_confirm_neigh(skb, neigh); + local_bh_disable(); + dev_xmit_recursion_inc(); + ret = neigh_output(neigh, skb, is_v6gw); + dev_xmit_recursion_dec(); + local_bh_enable(); + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); +out_drop: + kfree_skb(skb); + return -ENETDOWN; +} + +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + const struct iphdr *ip4h = ip_hdr(skb); + struct net *net = dev_net(dev); + int err, ret = NET_XMIT_DROP; + + if (!nh) { + struct flowi4 fl4 = { + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .flowi4_dscp = ip4h_dscp(ip4h), + .flowi4_oif = dev->ifindex, + .flowi4_proto = ip4h->protocol, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + struct rtable *rt; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto out_drop; + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto out_drop; + } + + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } + + err = bpf_out_neigh_v4(net, skb, dev, nh); + if (unlikely(net_xmit_eval(err))) + DEV_STATS_INC(dev, tx_errors); + else + ret = NET_XMIT_SUCCESS; + goto out_xmit; +out_drop: + DEV_STATS_INC(dev, tx_errors); + kfree_skb(skb); +out_xmit: + return ret; +} +#else +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} +#endif /* CONFIG_INET */ + +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + struct ethhdr *ethh = eth_hdr(skb); + + if (unlikely(skb->mac_header >= skb->network_header)) + goto out; + bpf_push_mac_rcsum(skb); + if (is_multicast_ether_addr(ethh->h_dest)) + goto out; + + skb_pull(skb, sizeof(*ethh)); + skb_unset_mac_header(skb); + skb_reset_network_header(skb); + + if (skb->protocol == htons(ETH_P_IP)) + return __bpf_redirect_neigh_v4(skb, dev, nh); + else if (skb->protocol == htons(ETH_P_IPV6)) + return __bpf_redirect_neigh_v6(skb, dev, nh); +out: + kfree_skb(skb); + return -ENOTSUPP; +} + +/* Internal, non-exposed redirect flags. */ +enum { + BPF_F_NEIGH = (1ULL << 16), + BPF_F_PEER = (1ULL << 17), + BPF_F_NEXTHOP = (1ULL << 18), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) +}; + +BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) +{ + struct net_device *dev; + struct sk_buff *clone; + int ret; + + BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS); + + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) + return -EINVAL; + + /* BPF test infra's convert___skb_to_skb() can create type-less + * GSO packets. gso_features_check() will detect this as a bad + * offload. However, lets not leak them out in the first place. + */ + if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type)) + return -EBADMSG; + + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); + if (unlikely(!dev)) + return -EINVAL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!clone)) return -ENOMEM; - if (copy_from_user(fp->insns, fprog->filter, fsize)) { - sock_kfree_s(sk, fp, fsize+sizeof(*fp)); + + /* For direct write, we need to keep the invariant that the skbs + * we're dealing with need to be uncloned. Should uncloning fail + * here, we need to free the just generated clone to unclone once + * again. + */ + ret = bpf_try_make_head_writable(skb); + if (unlikely(ret)) { + kfree_skb(clone); + return -ENOMEM; + } + + return __bpf_redirect(clone, dev, flags); +} + +static const struct bpf_func_proto bpf_clone_redirect_proto = { + .func = bpf_clone_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static struct net_device *skb_get_peer_dev(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (likely(ops->ndo_get_peer_dev)) + return INDIRECT_CALL_1(ops->ndo_get_peer_dev, + netkit_peer_dev, dev); + return NULL; +} + +int skb_do_redirect(struct sk_buff *skb) +{ + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); + struct net *net = dev_net(skb->dev); + struct net_device *dev; + u32 flags = ri->flags; + + dev = dev_get_by_index_rcu(net, ri->tgt_index); + ri->tgt_index = 0; + ri->flags = 0; + if (unlikely(!dev)) + goto out_drop; + if (flags & BPF_F_PEER) { + if (unlikely(!skb_at_tc_ingress(skb))) + goto out_drop; + dev = skb_get_peer_dev(dev); + if (unlikely(!dev || + !(dev->flags & IFF_UP) || + net_eq(net, dev_net(dev)))) + goto out_drop; + skb->dev = dev; + dev_sw_netstats_rx_add(dev, skb->len); + skb_scrub_packet(skb, false); + return -EAGAIN; + } + return flags & BPF_F_NEIGH ? + __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? + &ri->nh : NULL) : + __bpf_redirect(skb, dev, flags); +out_drop: + kfree_skb(skb); + return -EINVAL; +} + +BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); + + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) + return TC_ACT_SHOT; + + ri->flags = flags; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_proto = { + .func = bpf_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); + + if (unlikely(flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_PEER; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_peer_proto = { + .func = bpf_redirect_peer, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, + int, plen, u64, flags) +{ + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); + + if (unlikely((plen && plen < sizeof(*params)) || flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); + ri->tgt_index = ifindex; + + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); + if (plen) + memcpy(&ri->nh, params, sizeof(ri->nh)); + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_neigh_proto = { + .func = bpf_redirect_neigh, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) +{ + msg->apply_bytes = bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { + .func = bpf_msg_apply_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) +{ + msg->cork_bytes = bytes; + return 0; +} + +static void sk_msg_reset_curr(struct sk_msg *msg) +{ + if (!msg->sg.size) { + msg->sg.curr = msg->sg.start; + msg->sg.copybreak = 0; + } else { + u32 i = msg->sg.end; + + sk_msg_iter_var_prev(i); + msg->sg.curr = i; + msg->sg.copybreak = msg->sg.data[i].length; + } +} + +static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { + .func = bpf_msg_cork_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, + u32, end, u64, flags) +{ + u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start; + u32 first_sge, last_sge, i, shift, bytes_sg_total; + struct scatterlist *sge; + u8 *raw, *to, *from; + struct page *page; + + if (unlikely(flags || end <= start)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + offset += len; + len = sk_msg_elem(msg, i)->length; + if (start < offset + len) + break; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + if (unlikely(start >= offset + len)) + return -EINVAL; + + first_sge = i; + /* The start may point into the sg element so we need to also + * account for the headroom. + */ + bytes_sg_total = start - offset + bytes; + if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) + goto out; + + /* At this point we need to linearize multiple scatterlist + * elements or a single shared page. Either way we need to + * copy into a linear buffer exclusively owned by BPF. Then + * place the buffer in the scatterlist and fixup the original + * entries by removing the entries now in the linear buffer + * and shifting the remaining entries. For now we do not try + * to copy partial entries to avoid complexity of running out + * of sg_entry slots. The downside is reading a single byte + * will copy the entire sg entry. + */ + do { + copy += sk_msg_elem(msg, i)->length; + sk_msg_iter_var_next(i); + if (bytes_sg_total <= copy) + break; + } while (i != msg->sg.end); + last_sge = i; + + if (unlikely(bytes_sg_total > copy)) + return -EINVAL; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, + get_order(copy)); + if (unlikely(!page)) + return -ENOMEM; + + raw = page_address(page); + i = first_sge; + do { + sge = sk_msg_elem(msg, i); + from = sg_virt(sge); + len = sge->length; + to = raw + poffset; + + memcpy(to, from, len); + poffset += len; + sge->length = 0; + put_page(sg_page(sge)); + + sk_msg_iter_var_next(i); + } while (i != last_sge); + + sg_set_page(&msg->sg.data[first_sge], page, copy, 0); + + /* To repair sg ring we need to shift entries. If we only + * had a single entry though we can just replace it and + * be done. Otherwise walk the ring and shift the entries. + */ + WARN_ON_ONCE(last_sge == first_sge); + shift = last_sge > first_sge ? + last_sge - first_sge - 1 : + NR_MSG_FRAG_IDS - first_sge + last_sge - 1; + if (!shift) + goto out; + + i = first_sge; + sk_msg_iter_var_next(i); + do { + u32 move_from; + + if (i + shift >= NR_MSG_FRAG_IDS) + move_from = i + shift - NR_MSG_FRAG_IDS; + else + move_from = i + shift; + if (move_from == msg->sg.end) + break; + + msg->sg.data[i] = msg->sg.data[move_from]; + msg->sg.data[move_from].length = 0; + msg->sg.data[move_from].page_link = 0; + msg->sg.data[move_from].offset = 0; + sk_msg_iter_var_next(i); + } while (1); + + msg->sg.end = msg->sg.end - shift > msg->sg.end ? + msg->sg.end - shift + NR_MSG_FRAG_IDS : + msg->sg.end - shift; +out: + sk_msg_reset_curr(msg); + msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; + msg->data_end = msg->data + bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_pull_data_proto = { + .func = bpf_msg_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; + u32 new, i = 0, l = 0, space, copy = 0, offset = 0; + u8 *raw, *to, *from; + struct page *page; + + if (unlikely(flags)) + return -EINVAL; + + if (unlikely(len == 0)) + return 0; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + offset += l; + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + if (start > offset + l) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + /* If no space available will fallback to copy, we need at + * least one scatterlist elem available to push data into + * when start aligns to the beginning of an element or two + * when it falls inside an element. We handle the start equals + * offset case because its the common case for inserting a + * header. + */ + if (!space || (space == 1 && start != offset)) + copy = msg->sg.data[i].length; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, + get_order(copy + len)); + if (unlikely(!page)) + return -ENOMEM; + + if (copy) { + int front, back; + + raw = page_address(page); + + if (i == msg->sg.end) + sk_msg_iter_var_prev(i); + psge = sk_msg_elem(msg, i); + front = start - offset; + back = psge->length - front; + from = sg_virt(psge); + + if (front) + memcpy(raw, from, front); + + if (back) { + from += front; + to = raw + front + len; + + memcpy(to, from, back); + } + + put_page(sg_page(psge)); + new = i; + goto place_new; + } + + if (start - offset) { + if (i == msg->sg.end) + sk_msg_iter_var_prev(i); + psge = sk_msg_elem(msg, i); + rsge = sk_msg_elem_cpy(msg, i); + + psge->length = start - offset; + rsge.length -= psge->length; + rsge.offset += start; + + sk_msg_iter_var_next(i); + sg_unmark_end(psge); + sg_unmark_end(&rsge); + } + + /* Slot(s) to place newly allocated data */ + sk_msg_iter_next(msg, end); + new = i; + sk_msg_iter_var_next(i); + + if (i == msg->sg.end) { + if (!rsge.length) + goto place_new; + sk_msg_iter_next(msg, end); + goto place_new; + } + + /* Shift one or two slots as needed */ + sge = sk_msg_elem_cpy(msg, new); + sg_unmark_end(&sge); + + nsge = sk_msg_elem_cpy(msg, i); + if (rsge.length) { + sk_msg_iter_var_next(i); + nnsge = sk_msg_elem_cpy(msg, i); + sk_msg_iter_next(msg, end); + } + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sge = nsge; + sk_msg_iter_var_next(i); + if (rsge.length) { + nsge = nnsge; + nnsge = sk_msg_elem_cpy(msg, i); + } else { + nsge = sk_msg_elem_cpy(msg, i); + } + } + +place_new: + /* Place newly allocated data buffer */ + sk_mem_charge(msg->sk, len); + msg->sg.size += len; + __clear_bit(new, msg->sg.copy); + sg_set_page(&msg->sg.data[new], page, len + copy, 0); + if (rsge.length) { + get_page(sg_page(&rsge)); + sk_msg_iter_var_next(new); + msg->sg.data[new] = rsge; + } + + sk_msg_reset_curr(msg); + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_push_data_proto = { + .func = bpf_msg_push_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +static void sk_msg_shift_left(struct sk_msg *msg, int i) +{ + struct scatterlist *sge = sk_msg_elem(msg, i); + int prev; + + put_page(sg_page(sge)); + do { + prev = i; + sk_msg_iter_var_next(i); + msg->sg.data[prev] = msg->sg.data[i]; + } while (i != msg->sg.end); + + sk_msg_iter_prev(msg, end); +} + +static void sk_msg_shift_right(struct sk_msg *msg, int i) +{ + struct scatterlist tmp, sge; + + sk_msg_iter_next(msg, end); + sge = sk_msg_elem_cpy(msg, i); + sk_msg_iter_var_next(i); + tmp = sk_msg_elem_cpy(msg, i); + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sk_msg_iter_var_next(i); + sge = tmp; + tmp = sk_msg_elem_cpy(msg, i); + } +} + +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + u32 i = 0, l = 0, space, offset = 0; + u64 last = start + len; + int pop; + + if (unlikely(flags)) + return -EINVAL; + + if (unlikely(len == 0)) + return 0; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + offset += l; + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + /* Bounds checks: start and pop must be inside message */ + if (start >= offset + l || last > msg->sg.size) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + pop = len; + /* --------------| offset + * -| start |-------- len -------| + * + * |----- a ----|-------- pop -------|----- b ----| + * |______________________________________________| length + * + * + * a: region at front of scatter element to save + * b: region at back of scatter element to save when length > A + pop + * pop: region to pop from element, same as input 'pop' here will be + * decremented below per iteration. + * + * Two top-level cases to handle when start != offset, first B is non + * zero and second B is zero corresponding to when a pop includes more + * than one element. + * + * Then if B is non-zero AND there is no space allocate space and + * compact A, B regions into page. If there is space shift ring to + * the right free'ing the next element in ring to place B, leaving + * A untouched except to reduce length. + */ + if (start != offset) { + struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); + int a = start - offset; + int b = sge->length - pop - a; + + sk_msg_iter_var_next(i); + + if (b > 0) { + if (space) { + sge->length = a; + sk_msg_shift_right(msg, i); + nsge = sk_msg_elem(msg, i); + get_page(sg_page(sge)); + sg_set_page(nsge, + sg_page(sge), + b, sge->offset + pop + a); + } else { + struct page *page, *orig; + u8 *to, *from; + + page = alloc_pages(__GFP_NOWARN | + __GFP_COMP | GFP_ATOMIC, + get_order(a + b)); + if (unlikely(!page)) + return -ENOMEM; + + orig = sg_page(sge); + from = sg_virt(sge); + to = page_address(page); + memcpy(to, from, a); + memcpy(to + a, from + a + pop, b); + sg_set_page(sge, page, a + b, 0); + put_page(orig); + } + pop = 0; + } else { + pop -= (sge->length - a); + sge->length = a; + } + } + + /* From above the current layout _must_ be as follows, + * + * -| offset + * -| start + * + * |---- pop ---|---------------- b ------------| + * |____________________________________________| length + * + * Offset and start of the current msg elem are equal because in the + * previous case we handled offset != start and either consumed the + * entire element and advanced to the next element OR pop == 0. + * + * Two cases to handle here are first pop is less than the length + * leaving some remainder b above. Simply adjust the element's layout + * in this case. Or pop >= length of the element so that b = 0. In this + * case advance to next element decrementing pop. + */ + while (pop) { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (pop < sge->length) { + sge->length -= pop; + sge->offset += pop; + pop = 0; + } else { + pop -= sge->length; + sk_msg_shift_left(msg, i); + } + } + + sk_mem_uncharge(msg->sk, len - pop); + msg->sg.size -= (len - pop); + sk_msg_reset_curr(msg); + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_pop_data_proto = { + .func = bpf_msg_pop_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +#ifdef CONFIG_CGROUP_NET_CLASSID +BPF_CALL_0(bpf_get_cgroup_classid_curr) +{ + return __task_get_classid(current); +} + +const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { + .func = bpf_get_cgroup_classid_curr, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb) +{ + struct sock *sk = skb_to_full_sk(skb); + + if (!sk || !sk_fullsock(sk)) + return 0; + + return sock_cgroup_classid(&sk->sk_cgrp_data); +} + +static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = { + .func = bpf_skb_cgroup_classid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; +#endif + +BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) +{ + return task_get_classid(skb); +} + +static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { + .func = bpf_get_cgroup_classid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) +{ + return dst_tclassid(skb); +} + +static const struct bpf_func_proto bpf_get_route_realm_proto = { + .func = bpf_get_route_realm, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) +{ + /* If skb_clear_hash() was called due to mangling, we can + * trigger SW recalculation here. Later access to hash + * can then use the inline skb->hash via context directly + * instead of calling this helper again. + */ + return skb_get_hash(skb); +} + +static const struct bpf_func_proto bpf_get_hash_recalc_proto = { + .func = bpf_get_hash_recalc, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) +{ + /* After all direct packet write, this can be used once for + * triggering a lazy recalc on next skb_get_hash() invocation. + */ + skb_clear_hash(skb); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_invalid_proto = { + .func = bpf_set_hash_invalid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) +{ + /* Set user specified hash as L4(+), so that it gets returned + * on skb_get_hash() call unless BPF prog later on triggers a + * skb_clear_hash(). + */ + __skb_set_sw_hash(skb, hash, true); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_proto = { + .func = bpf_set_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, + u16, vlan_tci) +{ + int ret; + + if (unlikely(vlan_proto != htons(ETH_P_8021Q) && + vlan_proto != htons(ETH_P_8021AD))) + vlan_proto = htons(ETH_P_8021Q); + + bpf_push_mac_rcsum(skb); + ret = skb_vlan_push(skb, vlan_proto, vlan_tci); + bpf_pull_mac_rcsum(skb); + skb_reset_mac_len(skb); + + bpf_compute_data_pointers(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_vlan_push_proto = { + .func = bpf_skb_vlan_push, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) +{ + int ret; + + bpf_push_mac_rcsum(skb); + ret = skb_vlan_pop(skb); + bpf_pull_mac_rcsum(skb); + + bpf_compute_data_pointers(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { + .func = bpf_skb_vlan_pop, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto) +{ + skb->protocol = htons(proto); + if (skb_valid_dst(skb)) + skb_dst_drop(skb); +} + +static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) +{ + /* Caller already did skb_cow() with meta_len+len as headroom, + * so no need to do it here. + */ + skb_push(skb, len); + skb_postpush_data_move(skb, len, off); + memset(skb->data + off, 0, len); + + /* No skb_postpush_rcsum(skb, skb->data + off, len) + * needed here as it does not change the skb->csum + * result for checksum complete when summing over + * zeroed blocks. + */ + return 0; +} + +static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) +{ + void *old_data; + + /* skb_ensure_writable() is not needed here, as we're + * already working on an uncloned skb. + */ + if (unlikely(!pskb_may_pull(skb, off + len))) + return -ENOMEM; + + old_data = skb->data; + __skb_pull(skb, len); + skb_postpull_rcsum(skb, old_data + off, len); + skb_postpull_data_move(skb, len, off); + + return 0; +} + +static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) +{ + bool trans_same = skb->transport_header == skb->network_header; + int ret; + + /* There's no need for __skb_push()/__skb_pull() pair to + * get to the start of the mac header as we're guaranteed + * to always start from here under eBPF. + */ + ret = bpf_skb_generic_push(skb, off, len); + if (likely(!ret)) { + skb->mac_header -= len; + skb->network_header -= len; + if (trans_same) + skb->transport_header = skb->network_header; + } + + return ret; +} + +static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) +{ + bool trans_same = skb->transport_header == skb->network_header; + int ret; + + /* Same here, __skb_push()/__skb_pull() pair not needed. */ + ret = bpf_skb_generic_pop(skb, off, len); + if (likely(!ret)) { + skb->mac_header += len; + skb->network_header += len; + if (trans_same) + skb->transport_header = skb->network_header; + } + + return ret; +} + +static int bpf_skb_proto_4_to_6(struct sk_buff *skb) +{ + const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + const u8 meta_len = skb_metadata_len(skb); + u32 off = skb_mac_header_len(skb); + int ret; + + ret = skb_cow(skb, meta_len + len_diff); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */ + if (shinfo->gso_type & SKB_GSO_TCPV4) { + shinfo->gso_type &= ~SKB_GSO_TCPV4; + shinfo->gso_type |= SKB_GSO_TCPV6; + } + } + + bpf_skb_change_protocol(skb, ETH_P_IPV6); + skb_clear_hash(skb); + + return 0; +} + +static int bpf_skb_proto_6_to_4(struct sk_buff *skb) +{ + const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + u32 off = skb_mac_header_len(skb); + int ret; + + ret = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_pop(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */ + if (shinfo->gso_type & SKB_GSO_TCPV6) { + shinfo->gso_type &= ~SKB_GSO_TCPV6; + shinfo->gso_type |= SKB_GSO_TCPV4; + } + } + + bpf_skb_change_protocol(skb, ETH_P_IP); + skb_clear_hash(skb); + + return 0; +} + +static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) +{ + __be16 from_proto = skb->protocol; + + if (from_proto == htons(ETH_P_IP) && + to_proto == htons(ETH_P_IPV6)) + return bpf_skb_proto_4_to_6(skb); + + if (from_proto == htons(ETH_P_IPV6) && + to_proto == htons(ETH_P_IP)) + return bpf_skb_proto_6_to_4(skb); + + return -ENOTSUPP; +} + +BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, + u64, flags) +{ + int ret; + + if (unlikely(flags)) + return -EINVAL; + + /* General idea is that this helper does the basic groundwork + * needed for changing the protocol, and eBPF program fills the + * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() + * and other helpers, rather than passing a raw buffer here. + * + * The rationale is to keep this minimal and without a need to + * deal with raw packet data. F.e. even if we would pass buffers + * here, the program still needs to call the bpf_lX_csum_replace() + * helpers anyway. Plus, this way we keep also separation of + * concerns, since f.e. bpf_skb_store_bytes() should only take + * care of stores. + * + * Currently, additional options and extension header space are + * not supported, but flags register is reserved so we can adapt + * that. For offloads, we mark packet as dodgy, so that headers + * need to be verified first. + */ + ret = bpf_skb_proto_xlat(skb, proto); + bpf_compute_data_pointers(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_proto_proto = { + .func = bpf_skb_change_proto, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) +{ + /* We only allow a restricted subset to be changed for now. */ + if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || + !skb_pkt_type_ok(pkt_type))) + return -EINVAL; + + skb->pkt_type = pkt_type; + return 0; +} + +static const struct bpf_func_proto bpf_skb_change_type_proto = { + .func = bpf_skb_change_type, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +static u32 bpf_skb_net_base_len(const struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return sizeof(struct iphdr); + case htons(ETH_P_IPV6): + return sizeof(struct ipv6hdr); + default: + return ~0U; + } +} + +#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + +#define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + +#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ + BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ + BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ + BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ + BPF_F_ADJ_ROOM_ENCAP_L2( \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ + BPF_F_ADJ_ROOM_DECAP_L3_MASK) + +static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, + u64 flags) +{ + u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; + bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; + u16 mac_len = 0, inner_net = 0, inner_trans = 0; + const u8 meta_len = skb_metadata_len(skb); + unsigned int gso_type = SKB_GSO_DODGY; + int ret; + + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { + /* udp gso_size delineates datagrams, only allow if fixed */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || + !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + return -ENOTSUPP; + } + + ret = skb_cow_head(skb, meta_len + len_diff); + if (unlikely(ret < 0)) + return ret; + + if (encap) { + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -ENOTSUPP; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 && + flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + return -EINVAL; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE && + flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + return -EINVAL; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && + inner_mac_len < ETH_HLEN) + return -EINVAL; + + if (skb->encapsulation) + return -EALREADY; + + mac_len = skb->network_header - skb->mac_header; + inner_net = skb->network_header; + if (inner_mac_len > len_diff) + return -EINVAL; + inner_trans = skb->transport_header; + } + + ret = bpf_skb_net_hdr_push(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (encap) { + skb->inner_mac_header = inner_net - inner_mac_len; + skb->inner_network_header = inner_net; + skb->inner_transport_header = inner_trans; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + else + skb_set_inner_protocol(skb, skb->protocol); + + skb->encapsulation = 1; + skb_set_network_header(skb, mac_len); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + gso_type |= SKB_GSO_UDP_TUNNEL; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) + gso_type |= SKB_GSO_GRE; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + gso_type |= SKB_GSO_IPXIP6; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) + gso_type |= SKB_GSO_IPXIP4; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || + flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) { + int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ? + sizeof(struct ipv6hdr) : + sizeof(struct iphdr); + + skb_set_transport_header(skb, mac_len + nh_len); + } + + /* Match skb->protocol to new outer l3 protocol */ + if (skb->protocol == htons(ETH_P_IP) && + flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + bpf_skb_change_protocol(skb, ETH_P_IPV6); + else if (skb->protocol == htons(ETH_P_IPV6) && + flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) + bpf_skb_change_protocol(skb, ETH_P_IP); + } + + if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + /* Header must be checked, and gso_segs recomputed. */ + shinfo->gso_type |= gso_type; + shinfo->gso_segs = 0; + + /* Due to header growth, MSS needs to be downgraded. + * There is a BUG_ON() when segmenting the frag_list with + * head_frag true, so linearize the skb after downgrading + * the MSS. + */ + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { + skb_decrease_gso_size(shinfo, len_diff); + if (shinfo->frag_list) + return skb_linearize(skb); + } + } + + return 0; +} + +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, + u64 flags) +{ + int ret; + + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_DECAP_L3_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) + return -EINVAL; + + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { + /* udp gso_size delineates datagrams, only allow if fixed */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || + !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + return -ENOTSUPP; + } + + ret = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_pop(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + /* Match skb->protocol to new outer l3 protocol */ + if (skb->protocol == htons(ETH_P_IP) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + bpf_skb_change_protocol(skb, ETH_P_IPV6); + else if (skb->protocol == htons(ETH_P_IPV6) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) + bpf_skb_change_protocol(skb, ETH_P_IP); + + if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + /* Due to header shrink, MSS can be upgraded. */ + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_increase_gso_size(shinfo, len_diff); + + /* Header must be checked, and gso_segs recomputed. */ + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; + } + + return 0; +} + +#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC + +BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) +{ + u32 len_diff_abs = abs(len_diff); + bool shrink = len_diff < 0; + int ret = 0; + + if (unlikely(flags || mode)) + return -EINVAL; + if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; + + if (!shrink) { + ret = skb_cow(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + __skb_push(skb, len_diff_abs); + memset(skb->data, 0, len_diff_abs); + } else { + if (unlikely(!pskb_may_pull(skb, len_diff_abs))) + return -ENOMEM; + __skb_pull(skb, len_diff_abs); } + if (tls_sw_has_ctx_rx(skb->sk)) { + struct strp_msg *rxm = strp_msg(skb); - atomic_set(&fp->refcnt, 1); - fp->len = fprog->len; + rxm->full_len += len_diff; + } + return ret; +} + +static const struct bpf_func_proto sk_skb_adjust_room_proto = { + .func = sk_skb_adjust_room, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) +{ + u32 len_cur, len_diff_abs = abs(len_diff); + u32 len_min = bpf_skb_net_base_len(skb); + u32 len_max = BPF_SKB_MAX_LEN; + __be16 proto = skb->protocol; + bool shrink = len_diff < 0; + u32 off; + int ret; + + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) + return -EINVAL; + if (unlikely(len_diff_abs > 0xfffU)) + return -EFAULT; + if (unlikely(proto != htons(ETH_P_IP) && + proto != htons(ETH_P_IPV6))) + return -ENOTSUPP; + + off = skb_mac_header_len(skb); + switch (mode) { + case BPF_ADJ_ROOM_NET: + off += bpf_skb_net_base_len(skb); + break; + case BPF_ADJ_ROOM_MAC: + break; + default: + return -ENOTSUPP; + } + + if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + if (!shrink) + return -EINVAL; + + switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: + len_min = sizeof(struct iphdr); + break; + case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: + len_min = sizeof(struct ipv6hdr); + break; + default: + return -EINVAL; + } + } + + len_cur = skb->len - skb_network_offset(skb); + if ((shrink && (len_diff_abs >= len_cur || + len_cur - len_diff_abs < len_min)) || + (!shrink && (skb->len + len_diff_abs > len_max && + !skb_is_gso(skb)))) + return -ENOTSUPP; + + ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : + bpf_skb_net_grow(skb, off, len_diff_abs, flags); + if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) + __skb_reset_checksum_unnecessary(skb); + + bpf_compute_data_pointers(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_adjust_room_proto = { + .func = bpf_skb_adjust_room, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +static u32 __bpf_skb_min_len(const struct sk_buff *skb) +{ + int offset = skb_network_offset(skb); + u32 min_len = 0; + + if (offset > 0) + min_len = offset; + if (skb_transport_header_was_set(skb)) { + offset = skb_transport_offset(skb); + if (offset > 0) + min_len = offset; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) { + offset = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + } + return min_len; +} + +static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + unsigned int old_len = skb->len; + int ret; + + ret = __skb_grow_rcsum(skb, new_len); + if (!ret) + memset(skb->data + old_len, 0, new_len - old_len); + return ret; +} + +static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + return __skb_trim_rcsum(skb, new_len); +} + +static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, + u64 flags) +{ + u32 max_len = BPF_SKB_MAX_LEN; + u32 min_len = __bpf_skb_min_len(skb); + int ret; + + if (unlikely(flags || new_len > max_len || new_len < min_len)) + return -EINVAL; + if (skb->encapsulation) + return -ENOTSUPP; + + /* The basic idea of this helper is that it's performing the + * needed work to either grow or trim an skb, and eBPF program + * rewrites the rest via helpers like bpf_skb_store_bytes(), + * bpf_lX_csum_replace() and others rather than passing a raw + * buffer here. This one is a slow path helper and intended + * for replies with control messages. + * + * Like in bpf_skb_change_proto(), we want to keep this rather + * minimal and without protocol specifics so that we are able + * to separate concerns as in bpf_skb_store_bytes() should only + * be the one responsible for writing buffers. + * + * It's really expected to be a slow path operation here for + * control message replies, so we're implicitly linearizing, + * uncloning and drop offloads from the skb by this. + */ + ret = __bpf_try_make_writable(skb, skb->len); + if (!ret) { + if (new_len > skb->len) + ret = bpf_skb_grow_rcsum(skb, new_len); + else if (new_len < skb->len) + ret = bpf_skb_trim_rcsum(skb, new_len); + if (!ret && skb_is_gso(skb)) + skb_gso_reset(skb); + } + return ret; +} + +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); + + bpf_compute_data_pointers(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_tail_proto = { + .func = bpf_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + return __bpf_skb_change_tail(skb, new_len, flags); +} + +static const struct bpf_func_proto sk_skb_change_tail_proto = { + .func = sk_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, + u64 flags) +{ + const u8 meta_len = skb_metadata_len(skb); + u32 max_len = BPF_SKB_MAX_LEN; + u32 new_len = skb->len + head_room; + int ret; + + if (unlikely(flags || (int)head_room < 0 || + (!skb_is_gso(skb) && new_len > max_len) || + new_len < skb->len)) + return -EINVAL; + + ret = skb_cow(skb, meta_len + head_room); + if (likely(!ret)) { + /* Idea for this helper is that we currently only + * allow to expand on mac header. This means that + * skb->protocol network header, etc, stay as is. + * Compared to bpf_skb_change_tail(), we're more + * flexible due to not needing to linearize or + * reset GSO. Intention for this helper is to be + * used by an L3 skb that needs to push mac header + * for redirection into L2 device. + */ + __skb_push(skb, head_room); + skb_postpush_data_move(skb, head_room, 0); + memset(skb->data, 0, head_room); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + } + + return ret; +} + +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + + bpf_compute_data_pointers(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_head_proto = { + .func = bpf_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + return __bpf_skb_change_head(skb, head_room, flags); +} + +static const struct bpf_func_proto sk_skb_change_head_proto = { + .func = sk_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) +{ + return xdp_get_buff_len(xdp); +} + +static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { + .func = bpf_xdp_get_buff_len, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff) + +const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = { + .func = bpf_xdp_get_buff_len, + .gpl_only = false, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0], +}; + +static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) +{ + return xdp_data_meta_unsupported(xdp) ? 0 : + xdp->data - xdp->data_meta; +} + +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) +{ + void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); + unsigned long metalen = xdp_get_metalen(xdp); + void *data_start = xdp_frame_end + metalen; + void *data = xdp->data + offset; + + if (unlikely(data < data_start || + data > xdp->data_end - ETH_HLEN)) + return -EINVAL; + + if (metalen) + memmove(xdp->data_meta + offset, + xdp->data_meta, metalen); + xdp->data_meta += offset; + xdp->data = data; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { + .func = bpf_xdp_adjust_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, + void *buf, unsigned long len, bool flush) +{ + unsigned long ptr_len, ptr_off = 0; + skb_frag_t *next_frag, *end_frag; + struct skb_shared_info *sinfo; + void *src, *dst; + u8 *ptr_buf; + + if (likely(xdp->data_end - xdp->data >= off + len)) { + src = flush ? buf : xdp->data + off; + dst = flush ? xdp->data + off : buf; + memcpy(dst, src, len); + return; + } + + sinfo = xdp_get_shared_info_from_buff(xdp); + end_frag = &sinfo->frags[sinfo->nr_frags]; + next_frag = &sinfo->frags[0]; + + ptr_len = xdp->data_end - xdp->data; + ptr_buf = xdp->data; + + while (true) { + if (off < ptr_off + ptr_len) { + unsigned long copy_off = off - ptr_off; + unsigned long copy_len = min(len, ptr_len - copy_off); + + src = flush ? buf : ptr_buf + copy_off; + dst = flush ? ptr_buf + copy_off : buf; + memcpy(dst, src, copy_len); + + off += copy_len; + len -= copy_len; + buf += copy_len; + } + + if (!len || next_frag == end_frag) + break; + + ptr_off += ptr_len; + ptr_buf = skb_frag_address(next_frag); + ptr_len = skb_frag_size(next_frag); + next_frag++; + } +} + +void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) +{ + u32 size = xdp->data_end - xdp->data; + struct skb_shared_info *sinfo; + void *addr = xdp->data; + int i; + + if (unlikely(offset > 0xffff || len > 0xffff)) + return ERR_PTR(-EFAULT); + + if (unlikely(offset + len > xdp_get_buff_len(xdp))) + return ERR_PTR(-EINVAL); + + if (likely(offset < size)) /* linear area */ + goto out; + + sinfo = xdp_get_shared_info_from_buff(xdp); + offset -= size; + for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ + u32 frag_size = skb_frag_size(&sinfo->frags[i]); + + if (offset < frag_size) { + addr = skb_frag_address(&sinfo->frags[i]); + size = frag_size; + break; + } + offset -= frag_size; + } +out: + return offset + len <= size ? addr + offset : NULL; +} + +BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, + void *, buf, u32, len) +{ + void *ptr; + + ptr = bpf_xdp_pointer(xdp, offset, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (!ptr) + bpf_xdp_copy_buf(xdp, offset, buf, len, false); + else + memcpy(buf, ptr, len); + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { + .func = bpf_xdp_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) +{ + return ____bpf_xdp_load_bytes(xdp, offset, buf, len); +} + +BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, + void *, buf, u32, len) +{ + void *ptr; + + ptr = bpf_xdp_pointer(xdp, offset, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (!ptr) + bpf_xdp_copy_buf(xdp, offset, buf, len, true); + else + memcpy(ptr, buf, len); + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { + .func = bpf_xdp_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) +{ + return ____bpf_xdp_store_bytes(xdp, offset, buf, len); +} + +static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; + struct xdp_rxq_info *rxq = xdp->rxq; + unsigned int tailroom; + + if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) + return -EOPNOTSUPP; + + tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); + if (unlikely(offset > tailroom)) + return -EINVAL; + + memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); + skb_frag_size_add(frag, offset); + sinfo->xdp_frags_size += offset; + if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) + xsk_buff_get_tail(xdp)->data_end += offset; + + return 0; +} + +static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, + bool tail, bool release) +{ + struct xdp_buff *zc_frag = tail ? xsk_buff_get_tail(xdp) : + xsk_buff_get_head(xdp); + + if (release) { + xsk_buff_del_frag(zc_frag); + } else { + if (tail) + zc_frag->data_end -= shrink; + else + zc_frag->data += shrink; + } + + return zc_frag; +} + +static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, + int shrink, bool tail) +{ + enum xdp_mem_type mem_type = xdp->rxq->mem.type; + bool release = skb_frag_size(frag) == shrink; + netmem_ref netmem = skb_frag_netmem(frag); + struct xdp_buff *zc_frag = NULL; + + if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { + netmem = 0; + zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release); + } + + if (release) { + __xdp_return(netmem, mem_type, false, zc_frag); + } else { + if (!tail) + skb_frag_off_add(frag, shrink); + skb_frag_size_sub(frag, shrink); + } + + return release; +} + +static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i, n_frags_free = 0, len_free = 0; + + if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN)) + return -EINVAL; + + for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) { + skb_frag_t *frag = &sinfo->frags[i]; + int shrink = min_t(int, offset, skb_frag_size(frag)); + + len_free += shrink; + offset -= shrink; + if (bpf_xdp_shrink_data(xdp, frag, shrink, true)) + n_frags_free++; + } + sinfo->nr_frags -= n_frags_free; + sinfo->xdp_frags_size -= len_free; + + if (unlikely(!sinfo->nr_frags)) { + xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); + xdp->data_end -= offset; + } + + return 0; +} + +BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) +{ + void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ + void *data_end = xdp->data_end + offset; + + if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */ + if (offset < 0) + return bpf_xdp_frags_shrink_tail(xdp, -offset); + + return bpf_xdp_frags_increase_tail(xdp, offset); + } + + /* Notice that xdp_data_hard_end have reserved some tailroom */ + if (unlikely(data_end > data_hard_end)) + return -EINVAL; + + if (unlikely(data_end < xdp->data + ETH_HLEN)) + return -EINVAL; + + /* Clear memory area on grow, can contain uninit kernel memory */ + if (offset > 0) + memset(xdp->data_end, 0, offset); + + xdp->data_end = data_end; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { + .func = bpf_xdp_adjust_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) +{ + void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); + void *meta = xdp->data_meta + offset; + unsigned long metalen = xdp->data - meta; + + if (xdp_data_meta_unsupported(xdp)) + return -ENOTSUPP; + if (unlikely(meta < xdp_frame_end || + meta > xdp->data)) + return -EINVAL; + if (unlikely(xdp_metalen_invalid(metalen))) + return -EACCES; + + xdp->data_meta = meta; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { + .func = bpf_xdp_adjust_meta, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +/** + * DOC: xdp redirect + * + * XDP_REDIRECT works by a three-step process, implemented in the functions + * below: + * + * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target + * of the redirect and store it (along with some other metadata) in a per-CPU + * struct bpf_redirect_info. + * + * 2. When the program returns the XDP_REDIRECT return code, the driver will + * call xdp_do_redirect() which will use the information in struct + * bpf_redirect_info to actually enqueue the frame into a map type-specific + * bulk queue structure. + * + * 3. Before exiting its NAPI poll loop, the driver will call + * xdp_do_flush(), which will flush all the different bulk queues, + * thus completing the redirect. Note that xdp_do_flush() must be + * called before napi_complete_done() in the driver, as the + * XDP_REDIRECT logic relies on being inside a single NAPI instance + * through to the xdp_do_flush() call for RCU protection of all + * in-kernel data structures. + */ +/* + * Pointers to the map entries will be kept around for this whole sequence of + * steps, protected by RCU. However, there is no top-level rcu_read_lock() in + * the core code; instead, the RCU protection relies on everything happening + * inside a single NAPI poll sequence, which means it's between a pair of calls + * to local_bh_disable()/local_bh_enable(). + * + * The map entries are marked as __rcu and the map code makes sure to + * dereference those pointers with rcu_dereference_check() in a way that works + * for both sections that to hold an rcu_read_lock() and sections that are + * called from NAPI without a separate rcu_read_lock(). The code below does not + * use RCU annotations, but relies on those in the map code. + */ +void xdp_do_flush(void) +{ + struct list_head *lh_map, *lh_dev, *lh_xsk; + + bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); + if (lh_dev) + __dev_flush(lh_dev); + if (lh_map) + __cpu_map_flush(lh_map); + if (lh_xsk) + __xsk_map_flush(lh_xsk); +} +EXPORT_SYMBOL_GPL(xdp_do_flush); + +#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) +void xdp_do_check_flushed(struct napi_struct *napi) +{ + struct list_head *lh_map, *lh_dev, *lh_xsk; + bool missed = false; + + bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); + if (lh_dev) { + __dev_flush(lh_dev); + missed = true; + } + if (lh_map) { + __cpu_map_flush(lh_map); + missed = true; + } + if (lh_xsk) { + __xsk_map_flush(lh_xsk); + missed = true; + } + + WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n", + napi->poll); +} +#endif + +DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); +EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); + +u32 xdp_master_redirect(struct xdp_buff *xdp) +{ + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); + struct net_device *master, *slave; + + master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); + slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); + if (slave && slave != xdp->rxq->dev) { + /* The target device is different from the receiving device, so + * redirect it to the new device. + * Using XDP_REDIRECT gets the correct behaviour from XDP enabled + * drivers to unmap the packet from their rx ring. + */ + ri->tgt_index = slave->ifindex; + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; + return XDP_REDIRECT; + } + return XDP_TX; +} +EXPORT_SYMBOL_GPL(xdp_master_redirect); + +static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, + const struct net_device *dev, + struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) +{ + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; + + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + err = __xsk_map_redirect(fwd, xdp); + if (unlikely(err)) + goto err; + + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); + return err; +} + +static __always_inline int +__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) +{ + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + u32 flags = ri->flags; + struct bpf_map *map; + int err; + + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->flags = 0; + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + if (unlikely(!xdpf)) { + err = -EOVERFLOW; + goto err; + } + + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + if (unlikely(flags & BPF_F_BROADCAST)) { + map = READ_ONCE(ri->map); + + /* The map pointer is cleared when the map is being torn + * down by dev_map_free() + */ + if (unlikely(!map)) { + err = -ENOENT; + break; + } + + WRITE_ONCE(ri->map, NULL); + err = dev_map_enqueue_multi(xdpf, dev, map, + flags & BPF_F_EXCLUDE_INGRESS); + } else { + err = dev_map_enqueue(fwd, xdpf, dev); + } + break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_enqueue(fwd, xdpf, dev); + break; + case BPF_MAP_TYPE_UNSPEC: + if (map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + break; + } + err = dev_xdp_enqueue(fwd, xdpf, dev); + break; + } + fallthrough; + default: + err = -EBADRQC; + } + + if (unlikely(err)) + goto err; + + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); + return err; +} + +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); + enum bpf_map_type map_type = ri->map_type; + + if (map_type == BPF_MAP_TYPE_XSKMAP) + return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + + return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), + xdp_prog); +} +EXPORT_SYMBOL_GPL(xdp_do_redirect); + +int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); + enum bpf_map_type map_type = ri->map_type; + + if (map_type == BPF_MAP_TYPE_XSKMAP) + return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + + return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog); +} +EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); + +static int xdp_do_generic_redirect_map(struct net_device *dev, + struct sk_buff *skb, + struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog, + void *fwd, enum bpf_map_type map_type, + u32 map_id, u32 flags) +{ + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); + struct bpf_map *map; + int err; + + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + if (unlikely(flags & BPF_F_BROADCAST)) { + map = READ_ONCE(ri->map); + + /* The map pointer is cleared when the map is being torn + * down by dev_map_free() + */ + if (unlikely(!map)) { + err = -ENOENT; + break; + } + + WRITE_ONCE(ri->map, NULL); + err = dev_map_redirect_multi(dev, skb, xdp_prog, map, + flags & BPF_F_EXCLUDE_INGRESS); + } else { + err = dev_map_generic_redirect(fwd, skb, xdp_prog); + } + if (unlikely(err)) + goto err; + break; + case BPF_MAP_TYPE_XSKMAP: + err = xsk_generic_rcv(fwd, xdp); + if (err) + goto err; + consume_skb(skb); + break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_generic_redirect(fwd, skb); + if (unlikely(err)) + goto err; + break; + default: + err = -EBADRQC; + goto err; + } + + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); + return err; +} + +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + u32 flags = ri->flags; + int err; + + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->flags = 0; + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) + goto err; + + skb->dev = fwd; + _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); + generic_xdp_tx(skb, xdp_prog); + return 0; + } + + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags); +err: + _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); + return err; +} + +BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); + + if (unlikely(flags)) + return XDP_ABORTED; + + /* NB! Map type UNSPEC and map_id == INT_MAX (never generated + * by map_idr) is used for ifindex based XDP redirect. + */ + ri->tgt_index = ifindex; + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + return XDP_REDIRECT; +} + +static const struct bpf_func_proto bpf_xdp_redirect_proto = { + .func = bpf_xdp_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key, + u64, flags) +{ + return map->ops->map_redirect(map, key, flags); +} + +static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { + .func = bpf_xdp_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, + unsigned long off, unsigned long len) +{ + void *ptr = skb_header_pointer(skb, off, len, dst_buff); + + if (unlikely(!ptr)) + return len; + if (ptr != dst_buff) + memcpy(dst_buff, ptr, len); + + return 0; +} + +BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) +{ + u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + if (unlikely(!skb || skb_size > skb->len)) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, + bpf_skb_copy); +} + +static const struct bpf_func_proto bpf_skb_event_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff) + +const struct bpf_func_proto bpf_skb_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_skb_output_btf_ids[0], + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +static unsigned short bpf_tunnel_key_af(u64 flags) +{ + return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; +} + +BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, + u32, size, u64, flags) +{ + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + u8 compat[sizeof(struct bpf_tunnel_key)]; + void *to_orig = to; + int err; + + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | + BPF_F_TUNINFO_FLAGS)))) { + err = -EINVAL; + goto err_clear; + } + if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { + err = -EPROTO; + goto err_clear; + } + if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + err = -EINVAL; + switch (size) { + case offsetof(struct bpf_tunnel_key, local_ipv6[0]): + case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): + goto set_compat; + case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): + /* Fixup deprecated structure layouts here, so we have + * a common path later on. + */ + if (ip_tunnel_info_af(info) != AF_INET) + goto err_clear; +set_compat: + to = (struct bpf_tunnel_key *)compat; + break; + default: + goto err_clear; + } + } + + to->tunnel_id = be64_to_cpu(info->key.tun_id); + to->tunnel_tos = info->key.tos; + to->tunnel_ttl = info->key.ttl; + if (flags & BPF_F_TUNINFO_FLAGS) + to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags); + else + to->tunnel_ext = 0; + + if (flags & BPF_F_TUNINFO_IPV6) { + memcpy(to->remote_ipv6, &info->key.u.ipv6.src, + sizeof(to->remote_ipv6)); + memcpy(to->local_ipv6, &info->key.u.ipv6.dst, + sizeof(to->local_ipv6)); + to->tunnel_label = be32_to_cpu(info->key.label); + } else { + to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst); + memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3); + to->tunnel_label = 0; + } + + if (unlikely(size != sizeof(struct bpf_tunnel_key))) + memcpy(to_orig, to, size); + + return 0; +err_clear: + memset(to_orig, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { + .func = bpf_skb_get_tunnel_key, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) +{ + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + int err; + + if (unlikely(!info || + !ip_tunnel_is_options_present(info->key.tun_flags))) { + err = -ENOENT; + goto err_clear; + } + if (unlikely(size < info->options_len)) { + err = -ENOMEM; + goto err_clear; + } + + ip_tunnel_info_opts_get(to, info); + if (size > info->options_len) + memset(to + info->options_len, 0, size - info->options_len); + + return info->options_len; +err_clear: + memset(to, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { + .func = bpf_skb_get_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +static struct metadata_dst __percpu *md_dst; + +BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, + const struct bpf_tunnel_key *, from, u32, size, u64, flags) +{ + struct metadata_dst *md = this_cpu_ptr(md_dst); + u8 compat[sizeof(struct bpf_tunnel_key)]; + struct ip_tunnel_info *info; + + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | + BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER | + BPF_F_NO_TUNNEL_KEY))) + return -EINVAL; + if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + switch (size) { + case offsetof(struct bpf_tunnel_key, local_ipv6[0]): + case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): + case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): + /* Fixup deprecated structure layouts here, so we have + * a common path later on. + */ + memcpy(compat, from, size); + memset(compat + size, 0, sizeof(compat) - size); + from = (const struct bpf_tunnel_key *) compat; + break; + default: + return -EINVAL; + } + } + if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || + from->tunnel_ext)) + return -EINVAL; + + skb_dst_drop(skb); + dst_hold((struct dst_entry *) md); + skb_dst_set(skb, (struct dst_entry *) md); + + info = &md->u.tun_info; + memset(info, 0, sizeof(*info)); + info->mode = IP_TUNNEL_INFO_TX; + + __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); + __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, + flags & BPF_F_DONT_FRAGMENT); + __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, + !(flags & BPF_F_ZERO_CSUM_TX)); + __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, + flags & BPF_F_SEQ_NUMBER); + __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, + !(flags & BPF_F_NO_TUNNEL_KEY)); + + info->key.tun_id = cpu_to_be64(from->tunnel_id); + info->key.tos = from->tunnel_tos; + info->key.ttl = from->tunnel_ttl; + + if (flags & BPF_F_TUNINFO_IPV6) { + info->mode |= IP_TUNNEL_INFO_IPV6; + memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, + sizeof(from->remote_ipv6)); + memcpy(&info->key.u.ipv6.src, from->local_ipv6, + sizeof(from->local_ipv6)); + info->key.label = cpu_to_be32(from->tunnel_label) & + IPV6_FLOWLABEL_MASK; + } else { + info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4); + info->key.flow_flags = FLOWI_FLAG_ANYSRC; + } + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { + .func = bpf_skb_set_tunnel_key, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, + const u8 *, from, u32, size) +{ + struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct metadata_dst *md = this_cpu_ptr(md_dst); + IP_TUNNEL_DECLARE_FLAGS(present) = { }; + + if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) + return -EINVAL; + if (unlikely(size > IP_TUNNEL_OPTS_MAX)) + return -ENOMEM; + + ip_tunnel_set_options_present(present); + ip_tunnel_info_opts_set(info, from, size, present); + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { + .func = bpf_skb_set_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, +}; + +static const struct bpf_func_proto * +bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) +{ + if (!md_dst) { + struct metadata_dst __percpu *tmp; + + tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, + METADATA_IP_TUNNEL, + GFP_KERNEL); + if (!tmp) + return NULL; + if (cmpxchg(&md_dst, NULL, tmp)) + metadata_dst_free_percpu(tmp); + } + + switch (which) { + case BPF_FUNC_skb_set_tunnel_key: + return &bpf_skb_set_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return &bpf_skb_set_tunnel_opt_proto; + default: + return NULL; + } +} + +BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, + u32, idx) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + struct sock *sk; + + sk = skb_to_full_sk(skb); + if (!sk || !sk_fullsock(sk)) + return -ENOENT; + if (unlikely(idx >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[idx]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return sk_under_cgroup_hierarchy(sk, cgrp); +} + +static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { + .func = bpf_skb_under_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +#ifdef CONFIG_SOCK_CGROUP_DATA +static inline u64 __bpf_sk_cgroup_id(struct sock *sk) +{ + struct cgroup *cgrp; + + sk = sk_to_full_sk(sk); + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgroup_id(cgrp); +} + +BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) +{ + return __bpf_sk_cgroup_id(skb->sk); +} + +static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { + .func = bpf_skb_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, + int ancestor_level) +{ + struct cgroup *ancestor; + struct cgroup *cgrp; + + sk = sk_to_full_sk(sk); + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ancestor = cgroup_ancestor(cgrp, ancestor_level); + if (!ancestor) + return 0; + + return cgroup_id(ancestor); +} + +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level); +} + +static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { + .func = bpf_skb_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) +{ + return __bpf_sk_cgroup_id(sk); +} + +static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { + .func = bpf_sk_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + +BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) +{ + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + +static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { + .func = bpf_sk_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, +}; +#endif + +static unsigned long bpf_xdp_copy(void *dst, const void *ctx, + unsigned long off, unsigned long len) +{ + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + + bpf_xdp_copy_buf(xdp, off, dst, len, false); + return 0; +} + +BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) +{ + u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + + if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp))) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, xdp, + xdp_size, bpf_xdp_copy); +} + +static const struct bpf_func_proto bpf_xdp_event_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff) + +const struct bpf_func_proto bpf_xdp_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_xdp_output_btf_ids[0], + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) +{ + return skb->sk ? __sock_gen_cookie(skb->sk) : 0; +} + +static const struct bpf_func_proto bpf_get_socket_cookie_proto = { + .func = bpf_get_socket_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) +{ + return __sock_gen_cookie(ctx->sk); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { + .func = bpf_get_socket_cookie_sock_addr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx) +{ + return __sock_gen_cookie(ctx); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { + .func = bpf_get_socket_cookie_sock, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) +{ + return sk ? sock_gen_cookie(sk) : 0; +} + +const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { + .func = bpf_get_socket_ptr_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL, +}; + +BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) +{ + return __sock_gen_cookie(ctx->sk); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { + .func = bpf_get_socket_cookie_sock_ops, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static u64 __bpf_get_netns_cookie(struct sock *sk) +{ + const struct net *net = sk ? sock_net(sk) : &init_net; + + return net->net_cookie; +} + +BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) +{ + return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_proto = { + .func = bpf_get_netns_cookie, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) +{ + return __bpf_get_netns_cookie(ctx); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { + .func = bpf_get_netns_cookie_sock, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { + .func = bpf_get_netns_cookie_sock_addr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { + .func = bpf_get_netns_cookie_sock_ops, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = { + .func = bpf_get_netns_cookie_sk_msg, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) +{ + struct sock *sk = sk_to_full_sk(skb->sk); + kuid_t kuid; + + if (!sk || !sk_fullsock(sk)) + return overflowuid; + kuid = sock_net_uid(sock_net(sk), sk); + return from_kuid_munged(sock_net(sk)->user_ns, kuid); +} + +static const struct bpf_func_proto bpf_get_socket_uid_proto = { + .func = bpf_get_socket_uid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt) +{ + u32 sk_bpf_cb_flags; + + if (getopt) { + *(u32 *)optval = sk->sk_bpf_cb_flags; + return 0; + } + + sk_bpf_cb_flags = *(u32 *)optval; + + if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK) + return -EINVAL; + + sk->sk_bpf_cb_flags = sk_bpf_cb_flags; + + return 0; +} + +static int sol_socket_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + switch (optname) { + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_RCVBUF: + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_REUSEPORT: + case SO_RCVLOWAT: + case SO_MARK: + case SO_MAX_PACING_RATE: + case SO_BINDTOIFINDEX: + case SO_TXREHASH: + case SK_BPF_CB_FLAGS: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + case SO_BINDTODEVICE: + break; + default: + return -EINVAL; + } + + if (optname == SK_BPF_CB_FLAGS) + return sk_bpf_set_get_cb_flags(sk, optval, getopt); + + if (getopt) { + if (optname == SO_BINDTODEVICE) + return -EINVAL; + return sk_getsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } + + return sk_setsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), *optlen); +} + +static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname, + char *optval, int optlen) +{ + if (optlen != sizeof(int)) + return -EINVAL; + + switch (optname) { + case TCP_BPF_SOCK_OPS_CB_FLAGS: { + int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags; + + memcpy(optval, &cb_flags, optlen); + break; + } + case TCP_BPF_RTO_MIN: { + int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min); + + memcpy(optval, &rto_min_us, optlen); + break; + } + case TCP_BPF_DELACK_MAX: { + int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max); + + memcpy(optval, &delack_max_us, optlen); + break; + } + default: + return -EINVAL; + } + + return 0; +} + +static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, + char *optval, int optlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; + int val; + + if (optlen != sizeof(int)) + return -EINVAL; + + val = *(int *)optval; + + /* Only some options are supported */ + switch (optname) { + case TCP_BPF_IW: + if (val <= 0 || tp->data_segs_out > tp->syn_data) + return -EINVAL; + tcp_snd_cwnd_set(tp, val); + break; + case TCP_BPF_SNDCWND_CLAMP: + if (val <= 0) + return -EINVAL; + tp->snd_cwnd_clamp = val; + tp->snd_ssthresh = val; + break; + case TCP_BPF_DELACK_MAX: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_DELACK_MAX || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_delack_max = timeout; + break; + case TCP_BPF_RTO_MIN: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_RTO_MIN || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_rto_min = timeout; + break; + case TCP_BPF_SOCK_OPS_CB_FLAGS: + if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS)) + return -EINVAL; + tp->bpf_sock_ops_cb_flags = val; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, + int *optlen, bool getopt) +{ + struct tcp_sock *tp; + int ret; + + if (*optlen < 2) + return -EINVAL; + + if (getopt) { + if (!inet_csk(sk)->icsk_ca_ops) + return -EINVAL; + /* BPF expects NULL-terminated tcp-cc string */ + optval[--(*optlen)] = '\0'; + return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } + + /* "cdg" is the only cc that alloc a ptr + * in inet_csk_ca area. The bpf-tcp-cc may + * overwrite this ptr after switching to cdg. + */ + if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) + return -ENOTSUPP; + + /* It stops this looping + * + * .init => bpf_setsockopt(tcp_cc) => .init => + * bpf_setsockopt(tcp_cc)" => .init => .... + * + * The second bpf_setsockopt(tcp_cc) is not allowed + * in order to break the loop when both .init + * are the same bpf prog. + * + * This applies even the second bpf_setsockopt(tcp_cc) + * does not cause a loop. This limits only the first + * '.init' can call bpf_setsockopt(TCP_CONGESTION) to + * pick a fallback cc (eg. peer does not support ECN) + * and the second '.init' cannot fallback to + * another. + */ + tp = tcp_sk(sk); + if (tp->bpf_chg_cc_inprogress) + return -EBUSY; + + tp->bpf_chg_cc_inprogress = 1; + ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), *optlen); + tp->bpf_chg_cc_inprogress = 0; + return ret; +} + +static int sol_tcp_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + if (sk->sk_protocol != IPPROTO_TCP) + return -EINVAL; + + switch (optname) { + case TCP_NODELAY: + case TCP_MAXSEG: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + case TCP_THIN_LINEAR_TIMEOUTS: + case TCP_USER_TIMEOUT: + case TCP_NOTSENT_LOWAT: + case TCP_SAVE_SYN: + case TCP_RTO_MAX_MS: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + case TCP_CONGESTION: + return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); + case TCP_SAVED_SYN: + if (*optlen < 1) + return -EINVAL; + break; + default: + if (getopt) + return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen); + return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); + } + + if (getopt) { + if (optname == TCP_SAVED_SYN) { + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->saved_syn || + *optlen > tcp_saved_syn_len(tp->saved_syn)) + return -EINVAL; + memcpy(optval, tp->saved_syn->data, *optlen); + /* It cannot free tp->saved_syn here because it + * does not know if the user space still needs it. + */ + return 0; + } + + return do_tcp_getsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } + + return do_tcp_setsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), *optlen); +} + +static int sol_ip_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + if (sk->sk_family != AF_INET) + return -EINVAL; + + switch (optname) { + case IP_TOS: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (getopt) + return do_ip_getsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + + return do_ip_setsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), *optlen); +} + +static int sol_ipv6_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + if (sk->sk_family != AF_INET6) + return -EINVAL; + + switch (optname) { + case IPV6_TCLASS: + case IPV6_AUTOFLOWLABEL: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (getopt) + return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + + return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), *optlen); +} + +static int __bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (!sk_fullsock(sk)) + return -EINVAL; + + if (level == SOL_SOCKET) + return sol_socket_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) + return sol_ip_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + return sol_ipv6_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) + return sol_tcp_sockopt(sk, optname, optval, &optlen, false); + + return -EINVAL; +} + +static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock) +{ + return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB; +} + +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +static int __bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + int err, saved_optlen = optlen; + + if (!sk_fullsock(sk)) { + err = -EINVAL; + goto done; + } + + if (level == SOL_SOCKET) + err = sol_socket_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) + err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) + err = sol_ip_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); + else + err = -EINVAL; + +done: + if (err) + optlen = 0; + if (optlen < saved_optlen) + memset(optval + optlen, 0, saved_optlen - optlen); + return err; +} + +static int _bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + +BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_proto = { + .func = bpf_sk_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_getsockopt_proto = { + .func = bpf_sk_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = { + .func = bpf_unlocked_sk_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = { + .func = bpf_unlocked_sk_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { + .func = bpf_sock_addr_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { + .func = bpf_sock_addr_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk, + char *optval, int optlen, + bool getopt) +{ + int val; + + if (optlen != sizeof(int)) + return -EINVAL; + + if (!sk_has_account(sk)) + return -EOPNOTSUPP; + + if (getopt) { + *(int *)optval = sk->sk_bypass_prot_mem; + return 0; + } + + val = *(int *)optval; + if (val < 0 || val > 1) + return -EINVAL; + + sk->sk_bypass_prot_mem = val; + return 0; +} + +BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) + return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false); + + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = { + .func = bpf_sock_create_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) { + int err = sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true); + + if (err) + memset(optval, 0, optlen); + + return err; + } + + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = { + .func = bpf_sock_create_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { + .func = bpf_sock_ops_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, + int optname, const u8 **start) +{ + struct sk_buff *syn_skb = bpf_sock->syn_skb; + const u8 *hdr_start; + int ret; + + if (syn_skb) { + /* sk is a request_sock here */ + + if (optname == TCP_BPF_SYN) { + hdr_start = syn_skb->data; + ret = tcp_hdrlen(syn_skb); + } else if (optname == TCP_BPF_SYN_IP) { + hdr_start = skb_network_header(syn_skb); + ret = skb_network_header_len(syn_skb) + + tcp_hdrlen(syn_skb); + } else { + /* optname == TCP_BPF_SYN_MAC */ + hdr_start = skb_mac_header(syn_skb); + ret = skb_mac_header_len(syn_skb) + + skb_network_header_len(syn_skb) + + tcp_hdrlen(syn_skb); + } + } else { + struct sock *sk = bpf_sock->sk; + struct saved_syn *saved_syn; + + if (sk->sk_state == TCP_NEW_SYN_RECV) + /* synack retransmit. bpf_sock->syn_skb will + * not be available. It has to resort to + * saved_syn (if it is saved). + */ + saved_syn = inet_reqsk(sk)->saved_syn; + else + saved_syn = tcp_sk(sk)->saved_syn; + + if (!saved_syn) + return -ENOENT; + + if (optname == TCP_BPF_SYN) { + hdr_start = saved_syn->data + + saved_syn->mac_hdrlen + + saved_syn->network_hdrlen; + ret = saved_syn->tcp_hdrlen; + } else if (optname == TCP_BPF_SYN_IP) { + hdr_start = saved_syn->data + + saved_syn->mac_hdrlen; + ret = saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; + } else { + /* optname == TCP_BPF_SYN_MAC */ + + /* TCP_SAVE_SYN may not have saved the mac hdr */ + if (!saved_syn->mac_hdrlen) + return -ENOENT; + + hdr_start = saved_syn->data; + ret = saved_syn->mac_hdrlen + + saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; + } + } + + *start = hdr_start; + return ret; +} + +BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + + if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && + optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { + int ret, copy_len = 0; + const u8 *start; + + ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start); + if (ret > 0) { + copy_len = ret; + if (optlen < copy_len) { + copy_len = optlen; + ret = -ENOSPC; + } + + memcpy(optval, start, copy_len); + } + + /* Zero out unused buffer at the end */ + memset(optval + copy_len, 0, optlen - copy_len); + + return ret; + } + + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { + .func = bpf_sock_ops_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, + int, argval) +{ + struct sock *sk = bpf_sock->sk; + int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + + if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) + return -EINVAL; + + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + + return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); +} + +static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { + .func = bpf_sock_ops_cb_flags_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; +EXPORT_SYMBOL_GPL(ipv6_bpf_stub); + +BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, + int, addr_len) +{ +#ifdef CONFIG_INET + struct sock *sk = ctx->sk; + u32 flags = BIND_FROM_BPF; + int err; + + err = -EINVAL; + if (addr_len < offsetofend(struct sockaddr, sa_family)) + return err; + if (addr->sa_family == AF_INET) { + if (addr_len < sizeof(struct sockaddr_in)) + return err; + if (((struct sockaddr_in *)addr)->sin_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; + return __inet_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags); +#if IS_ENABLED(CONFIG_IPV6) + } else if (addr->sa_family == AF_INET6) { + if (addr_len < SIN6_LEN_RFC2133) + return err; + if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; + /* ipv6_bpf_stub cannot be NULL, since it's called from + * bpf_cgroup_inet6_connect hook and ipv6 is already loaded + */ + return ipv6_bpf_stub->inet6_bind(sk, (struct sockaddr_unsized *)addr, + addr_len, flags); +#endif /* CONFIG_IPV6 */ + } +#endif /* CONFIG_INET */ + + return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_bind_proto = { + .func = bpf_bind, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, +}; + +#ifdef CONFIG_XFRM + +#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +struct metadata_dst __percpu *xfrm_bpf_md_dst; +EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst); + +#endif + +BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, + struct bpf_xfrm_state *, to, u32, size, u64, flags) +{ + const struct sec_path *sp = skb_sec_path(skb); + const struct xfrm_state *x; + + if (!sp || unlikely(index >= sp->len || flags)) + goto err_clear; + + x = sp->xvec[index]; + + if (unlikely(size != sizeof(struct bpf_xfrm_state))) + goto err_clear; + + to->reqid = x->props.reqid; + to->spi = x->id.spi; + to->family = x->props.family; + to->ext = 0; + + if (to->family == AF_INET6) { + memcpy(to->remote_ipv6, x->props.saddr.a6, + sizeof(to->remote_ipv6)); + } else { + to->remote_ipv4 = x->props.saddr.a4; + memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + } + + return 0; +err_clear: + memset(to, 0, size); + return -EINVAL; +} + +static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { + .func = bpf_skb_get_xfrm_state, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; +#endif + +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu) +{ + params->h_vlan_TCI = 0; + params->h_vlan_proto = 0; + if (mtu) + params->mtu_result = mtu; /* union with tot_len */ + + return 0; +} +#endif + +#if IS_ENABLED(CONFIG_INET) +static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags, bool check_mtu) +{ + struct fib_nh_common *nhc; + struct in_device *in_dev; + struct neighbour *neigh; + struct net_device *dev; + struct fib_result res; + struct flowi4 fl4; + u32 mtu = 0; + int err; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + /* verify forwarding is enabled on this interface */ + in_dev = __in_dev_get_rcu(dev); + if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) + return BPF_FIB_LKUP_RET_FWD_DISABLED; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl4.flowi4_iif = 1; + fl4.flowi4_oif = params->ifindex; + } else { + fl4.flowi4_iif = params->ifindex; + fl4.flowi4_oif = 0; + } + fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos); + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; + + fl4.flowi4_proto = params->l4_protocol; + fl4.daddr = params->ipv4_dst; + fl4.saddr = params->ipv4_src; + fl4.fl4_sport = params->sport; + fl4.fl4_dport = params->dport; + fl4.flowi4_multipath_hash = 0; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib_table *tb; + + if (flags & BPF_FIB_LOOKUP_TBID) { + tbid = params->tbid; + /* zero out for vlan output */ + params->tbid = 0; + } + + tb = fib_get_table(net, tbid); + if (unlikely(!tb)) + return BPF_FIB_LKUP_RET_NOT_FWDED; + + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); + } else { + if (flags & BPF_FIB_LOOKUP_MARK) + fl4.flowi4_mark = params->mark; + else + fl4.flowi4_mark = 0; + fl4.flowi4_secid = 0; + fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_uid = sock_net_uid(net, NULL); + + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); + } - err = __sk_prepare_filter(fp); if (err) { - sk_filter_uncharge(sk, fp); + /* map fib lookup errors to RTN_ type */ + if (err == -EINVAL) + return BPF_FIB_LKUP_RET_BLACKHOLE; + if (err == -EHOSTUNREACH) + return BPF_FIB_LKUP_RET_UNREACHABLE; + if (err == -EACCES) + return BPF_FIB_LKUP_RET_PROHIBIT; + + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + + if (res.type != RTN_UNICAST) + return BPF_FIB_LKUP_RET_NOT_FWDED; + + if (fib_info_num_path(res.fi) > 1) + fib_select_path(net, &res, &fl4, NULL); + + if (check_mtu) { + mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ + return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } + } + + nhc = res.nhc; + + /* do not handle lwt encaps right now */ + if (nhc->nhc_lwtstate) + return BPF_FIB_LKUP_RET_UNSUPP_LWT; + + dev = nhc->nhc_dev; + + params->rt_metric = res.fi->fib_priority; + params->ifindex = dev->ifindex; + + if (flags & BPF_FIB_LOOKUP_SRC) + params->ipv4_src = fib_result_prefsrc(net, &res); + + /* xdp and cls_bpf programs are run in RCU-bh so + * rcu_read_lock_bh is not needed here + */ + if (likely(nhc->nhc_gw_family != AF_INET6)) { + if (nhc->nhc_gw_family) + params->ipv4_dst = nhc->nhc_gw.ipv4; + } else { + struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst; + + params->family = AF_INET6; + *dst = nhc->nhc_gw.ipv6; + } + + if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) + goto set_fwd_params; + + if (likely(nhc->nhc_gw_family != AF_INET6)) + neigh = __ipv4_neigh_lookup_noref(dev, + (__force u32)params->ipv4_dst); + else + neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst); + + if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) + return BPF_FIB_LKUP_RET_NO_NEIGH; + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); + +set_fwd_params: + return bpf_fib_set_fwd_params(params, mtu); +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags, bool check_mtu) +{ + struct in6_addr *src = (struct in6_addr *) params->ipv6_src; + struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; + struct fib6_result res = {}; + struct neighbour *neigh; + struct net_device *dev; + struct inet6_dev *idev; + struct flowi6 fl6; + int strict = 0; + int oif, err; + u32 mtu = 0; + + /* link local addresses are never forwarded */ + if (rt6_need_strict(dst) || rt6_need_strict(src)) + return BPF_FIB_LKUP_RET_NOT_FWDED; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + idev = __in6_dev_get_safely(dev); + if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding))) + return BPF_FIB_LKUP_RET_FWD_DISABLED; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl6.flowi6_iif = 1; + oif = fl6.flowi6_oif = params->ifindex; + } else { + oif = fl6.flowi6_iif = params->ifindex; + fl6.flowi6_oif = 0; + strict = RT6_LOOKUP_F_HAS_SADDR; + } + fl6.flowlabel = params->flowinfo; + fl6.flowi6_scope = 0; + fl6.flowi6_flags = 0; + fl6.mp_hash = 0; + + fl6.flowi6_proto = params->l4_protocol; + fl6.daddr = *dst; + fl6.saddr = *src; + fl6.fl6_sport = params->sport; + fl6.fl6_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib6_table *tb; + + if (flags & BPF_FIB_LOOKUP_TBID) { + tbid = params->tbid; + /* zero out for vlan output */ + params->tbid = 0; + } + + tb = ipv6_stub->fib6_get_table(net, tbid); + if (unlikely(!tb)) + return BPF_FIB_LKUP_RET_NOT_FWDED; + + err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, + strict); + } else { + if (flags & BPF_FIB_LOOKUP_MARK) + fl6.flowi6_mark = params->mark; + else + fl6.flowi6_mark = 0; + fl6.flowi6_secid = 0; + fl6.flowi6_tun_key.tun_id = 0; + fl6.flowi6_uid = sock_net_uid(net, NULL); + + err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict); + } + + if (unlikely(err || IS_ERR_OR_NULL(res.f6i) || + res.f6i == net->ipv6.fib6_null_entry)) + return BPF_FIB_LKUP_RET_NOT_FWDED; + + switch (res.fib6_type) { + /* only unicast is forwarded */ + case RTN_UNICAST: + break; + case RTN_BLACKHOLE: + return BPF_FIB_LKUP_RET_BLACKHOLE; + case RTN_UNREACHABLE: + return BPF_FIB_LKUP_RET_UNREACHABLE; + case RTN_PROHIBIT: + return BPF_FIB_LKUP_RET_PROHIBIT; + default: + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + + ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif, + fl6.flowi6_oif != 0, NULL, strict); + + if (check_mtu) { + mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src); + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ + return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } + } + + if (res.nh->fib_nh_lws) + return BPF_FIB_LKUP_RET_UNSUPP_LWT; + + if (res.nh->fib_nh_gw_family) + *dst = res.nh->fib_nh_gw6; + + dev = res.nh->fib_nh_dev; + params->rt_metric = res.f6i->fib6_metric; + params->ifindex = dev->ifindex; + + if (flags & BPF_FIB_LOOKUP_SRC) { + if (res.f6i->fib6_prefsrc.plen) { + *src = res.f6i->fib6_prefsrc.addr; + } else { + err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev, + &fl6.daddr, 0, + src); + if (err) + return BPF_FIB_LKUP_RET_NO_SRC_ADDR; + } + } + + if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) + goto set_fwd_params; + + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is + * not needed here. + */ + neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); + if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) + return BPF_FIB_LKUP_RET_NO_NEIGH; + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); + +set_fwd_params: + return bpf_fib_set_fwd_params(params, mtu); +} +#endif + +#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ + BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ + BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) + +BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + if (plen < sizeof(*params)) + return -EINVAL; + + if (flags & ~BPF_FIB_LOOKUP_MASK) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, + flags, true); +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, + flags, true); +#endif + } + return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { + .func = bpf_xdp_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + struct net *net = dev_net(skb->dev); + int rc = -EAFNOSUPPORT; + bool check_mtu = false; + + if (plen < sizeof(*params)) + return -EINVAL; + + if (flags & ~BPF_FIB_LOOKUP_MASK) + return -EINVAL; + + if (params->tot_len) + check_mtu = true; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); + break; +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); + break; +#endif + } + + if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { + struct net_device *dev; + + /* When tot_len isn't provided by user, check skb + * against MTU of FIB lookup resulting net_device + */ + dev = dev_get_by_index_rcu(net, params->ifindex); + if (!is_skb_forwardable(dev, skb)) + rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; + + params->mtu_result = dev->mtu; /* union with tot_len */ + } + + return rc; +} + +static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { + .func = bpf_skb_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +static struct net_device *__dev_via_ifindex(struct net_device *dev_curr, + u32 ifindex) +{ + struct net *netns = dev_net(dev_curr); + + /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */ + if (ifindex == 0) + return dev_curr; + + return dev_get_by_index_rcu(netns, ifindex); +} + +BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + struct net_device *dev = skb->dev; + int mtu, dev_len, skb_len; + + if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) + return -EINVAL; + if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len))) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + dev_len = mtu + dev->hard_header_len; + + /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ + skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len; + + skb_len += len_diff; /* minus result pass check */ + if (skb_len <= dev_len) { + ret = BPF_MTU_CHK_RET_SUCCESS; + goto out; + } + /* At this point, skb->len exceed MTU, but as it include length of all + * segments, it can still be below MTU. The SKB can possibly get + * re-segmented in transmit path (see validate_xmit_skb). Thus, user + * must choose if segs are to be MTU checked. + */ + if (skb_is_gso(skb)) { + ret = BPF_MTU_CHK_RET_SUCCESS; + if (flags & BPF_MTU_CHK_SEGS) { + if (!skb_transport_header_was_set(skb)) + return -EINVAL; + if (!skb_gso_validate_network_len(skb, mtu)) + ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + } + } +out: + *mtu_len = mtu; + return ret; +} + +BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + struct net_device *dev = xdp->rxq->dev; + int xdp_len = xdp->data_end - xdp->data; + int ret = BPF_MTU_CHK_RET_SUCCESS; + int mtu, dev_len; + + /* XDP variant doesn't support multi-buffer segment check (yet) */ + if (unlikely(flags)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + dev_len = mtu + dev->hard_header_len; + + /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ + if (*mtu_len) + xdp_len = *mtu_len + dev->hard_header_len; + + xdp_len += len_diff; /* minus result pass check */ + if (xdp_len > dev_len) + ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + + *mtu_len = mtu; + return ret; +} + +static const struct bpf_func_proto bpf_skb_check_mtu_proto = { + .func = bpf_skb_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, + .arg3_size = sizeof(u32), + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { + .func = bpf_xdp_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, + .arg3_size = sizeof(u32), + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) +{ + int err; + struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; + + if (!seg6_validate_srh(srh, len, false)) + return -EINVAL; + + switch (type) { + case BPF_LWT_ENCAP_SEG6_INLINE: + if (skb->protocol != htons(ETH_P_IPV6)) + return -EBADMSG; + + err = seg6_do_srh_inline(skb, srh); + break; + case BPF_LWT_ENCAP_SEG6: + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); + break; + default: + return -EINVAL; + } + + bpf_compute_data_pointers(skb); + if (err) return err; + + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + return seg6_lookup_nexthop(skb, NULL, 0); +} +#endif /* CONFIG_IPV6_SEG6_BPF */ + +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) +static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, + bool ingress) +{ + return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); +} +#endif + +BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, + u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + case BPF_LWT_ENCAP_SEG6: + case BPF_LWT_ENCAP_SEG6_INLINE: + return bpf_push_seg6_encap(skb, type, hdr, len); +#endif +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) + case BPF_LWT_ENCAP_IP: + return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); +#endif + default: + return -EINVAL; } +} - old_fp = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); - rcu_assign_pointer(sk->sk_filter, fp); +BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, + void *, hdr, u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) + case BPF_LWT_ENCAP_IP: + return bpf_push_ip_encap(skb, hdr, len, false /* egress */); +#endif + default: + return -EINVAL; + } +} - if (old_fp) - sk_filter_uncharge(sk, old_fp); +static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { + .func = bpf_lwt_in_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE +}; + +static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { + .func = bpf_lwt_xmit_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE +}; + +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; + void *srh_tlvs, *srh_end, *ptr; + int srhoff = 0; + + lockdep_assert_held(&srh_state->bh_lock); + if (srh == NULL) + return -EINVAL; + + srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); + + ptr = skb->data + offset; + if (ptr >= srh_tlvs && ptr + len <= srh_end) + srh_state->valid = false; + else if (ptr < (void *)&srh->flags || + ptr + len > (void *)&srh->segments) + return -EFAULT; + + if (unlikely(bpf_try_make_writable(skb, offset + len))) + return -EFAULT; + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + memcpy(skb->data + offset, from, len); return 0; } -EXPORT_SYMBOL_GPL(sk_attach_filter); + +static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { + .func = bpf_lwt_seg6_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE +}; + +static void bpf_update_srh_state(struct sk_buff *skb) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + int srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { + srh_state->srh = NULL; + } else { + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_state->hdrlen = srh_state->srh->hdrlen << 3; + srh_state->valid = true; + } +} + +BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, + u32, action, void *, param, u32, param_len) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + int hdroff = 0; + int err; + + lockdep_assert_held(&srh_state->bh_lock); + switch (action) { + case SEG6_LOCAL_ACTION_END_X: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + if (param_len != sizeof(struct in6_addr)) + return -EINVAL; + return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); + case SEG6_LOCAL_ACTION_END_T: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + if (param_len != sizeof(int)) + return -EINVAL; + return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_DT6: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + if (param_len != sizeof(int)) + return -EINVAL; + + if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) + return -EBADMSG; + if (!pskb_pull(skb, hdroff)) + return -EBADMSG; + + skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->encapsulation = 0; + + bpf_compute_data_pointers(skb); + bpf_update_srh_state(skb); + return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_B6: + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, + param, param_len); + if (!err) + bpf_update_srh_state(skb); + + return err; + case SEG6_LOCAL_ACTION_END_B6_ENCAP: + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, + param, param_len); + if (!err) + bpf_update_srh_state(skb); + + return err; + default: + return -EINVAL; + } +} + +static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { + .func = bpf_lwt_seg6_action, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, + s32, len) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; + void *srh_end, *srh_tlvs, *ptr; + struct ipv6hdr *hdr; + int srhoff = 0; + int ret; + + lockdep_assert_held(&srh_state->bh_lock); + if (unlikely(srh == NULL)) + return -EINVAL; + + srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + + srh_state->hdrlen); + ptr = skb->data + offset; + + if (unlikely(ptr < srh_tlvs || ptr > srh_end)) + return -EFAULT; + if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) + return -EFAULT; + + if (len > 0) { + ret = skb_cow_head(skb, len); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, offset, len); + } else { + ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); + } + + bpf_compute_data_pointers(skb); + if (unlikely(ret < 0)) + return ret; + + hdr = (struct ipv6hdr *)skb->data; + hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_state->hdrlen += len; + srh_state->valid = false; + return 0; +} + +static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { + .func = bpf_lwt_seg6_adjust_srh, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; +#endif /* CONFIG_IPV6_SEG6_BPF */ + +#ifdef CONFIG_INET +static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, + int dif, int sdif, u8 family, u8 proto) +{ + bool refcounted = false; + struct sock *sk = NULL; + + if (family == AF_INET) { + __be32 src4 = tuple->ipv4.saddr; + __be32 dst4 = tuple->ipv4.daddr; + + if (proto == IPPROTO_TCP) + sk = __inet_lookup(net, NULL, 0, + src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &refcounted); + else + sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, net->ipv4.udp_table, NULL); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; + struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; + + if (proto == IPPROTO_TCP) + sk = __inet6_lookup(net, NULL, 0, + src6, tuple->ipv6.sport, + dst6, ntohs(tuple->ipv6.dport), + dif, sdif, &refcounted); + else if (likely(ipv6_bpf_stub)) + sk = ipv6_bpf_stub->udp6_lib_lookup(net, + src6, tuple->ipv6.sport, + dst6, tuple->ipv6.dport, + dif, sdif, + net->ipv4.udp_table, NULL); +#endif + } + + if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + sk = NULL; + } + return sk; +} + +/* bpf_skc_lookup performs the core lookup for different types of sockets, + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. + */ +static struct sock * +__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags, int sdif) +{ + struct sock *sk = NULL; + struct net *net; + u8 family; + + if (len == sizeof(tuple->ipv4)) + family = AF_INET; + else if (len == sizeof(tuple->ipv6)) + family = AF_INET6; + else + return NULL; + + if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) + goto out; + + if (sdif < 0) { + if (family == AF_INET) + sdif = inet_sdif(skb); + else + sdif = inet6_sdif(skb); + } + + if ((s32)netns_id < 0) { + net = caller_net; + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); + } else { + net = get_net_ns_by_id(caller_net, netns_id); + if (unlikely(!net)) + goto out; + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); + put_net(net); + } + +out: + return sk; +} + +static struct sock * +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags, int sdif) +{ + struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, + ifindex, proto, netns_id, flags, + sdif); + + if (sk) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (sk2 != sk) { + sock_gen_put(sk); + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; + } + } + + return sk; +} + +static struct sock * +bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + int ifindex; + + if (skb->dev) { + caller_net = dev_net(skb->dev); + ifindex = skb->dev->ifindex; + } else { + caller_net = sock_net(skb->sk); + ifindex = 0; + } + + return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, + netns_id, flags, -1); +} + +static struct sock * +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, + flags); + + if (sk) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (sk2 != sk) { + sock_gen_put(sk); + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; + } + } + + return sk; +} + +BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { + .func = bpf_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { + .func = bpf_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { + .func = bpf_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + struct net_device *dev = skb->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { + .func = bpf_tc_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + struct net_device *dev = skb->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { + .func = bpf_tc_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + struct net_device *dev = skb->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_UDP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { + .func = bpf_tc_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_sk_release, struct sock *, sk) +{ + if (sk && sk_is_refcounted(sk)) + sock_gen_put(sk); + return 0; +} + +static const struct bpf_func_proto bpf_sk_release_proto = { + .func = bpf_sk_release, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE, +}; + +BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net_device *dev = ctx->rxq->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_UDP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { + .func = bpf_xdp_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net_device *dev = ctx->rxq->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { + .func = bpf_xdp_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net_device *dev = ctx->rxq->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { + .func = bpf_xdp_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, + IPPROTO_TCP, netns_id, flags, + -1); +} + +static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { + .func = bpf_sock_addr_skc_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, IPPROTO_TCP, + netns_id, flags, -1); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { + .func = bpf_sock_addr_sk_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, IPPROTO_UDP, + netns_id, flags, -1); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { + .func = bpf_sock_addr_sk_lookup_udp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, + icsk_retransmits)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + case offsetof(struct bpf_tcp_sock, bytes_received): + case offsetof(struct bpf_tcp_sock, bytes_acked): + return size == sizeof(__u64); + default: + return size == sizeof(__u32); + } +} + +u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_TCP_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \ + sizeof_field(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct tcp_sock, FIELD)); \ + } while (0) + +#define BPF_INET_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \ + FIELD) > \ + sizeof_field(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct inet_connection_sock, \ + FIELD), \ + si->dst_reg, si->src_reg, \ + offsetof( \ + struct inet_connection_sock, \ + FIELD)); \ + } while (0) + + BTF_TYPE_EMIT(struct bpf_tcp_sock); + + switch (si->off) { + case offsetof(struct bpf_tcp_sock, rtt_min): + BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct tcp_sock, rtt_min) + + offsetof(struct minmax_sample, v)); + break; + case offsetof(struct bpf_tcp_sock, snd_cwnd): + BPF_TCP_SOCK_GET_COMMON(snd_cwnd); + break; + case offsetof(struct bpf_tcp_sock, srtt_us): + BPF_TCP_SOCK_GET_COMMON(srtt_us); + break; + case offsetof(struct bpf_tcp_sock, snd_ssthresh): + BPF_TCP_SOCK_GET_COMMON(snd_ssthresh); + break; + case offsetof(struct bpf_tcp_sock, rcv_nxt): + BPF_TCP_SOCK_GET_COMMON(rcv_nxt); + break; + case offsetof(struct bpf_tcp_sock, snd_nxt): + BPF_TCP_SOCK_GET_COMMON(snd_nxt); + break; + case offsetof(struct bpf_tcp_sock, snd_una): + BPF_TCP_SOCK_GET_COMMON(snd_una); + break; + case offsetof(struct bpf_tcp_sock, mss_cache): + BPF_TCP_SOCK_GET_COMMON(mss_cache); + break; + case offsetof(struct bpf_tcp_sock, ecn_flags): + BPF_TCP_SOCK_GET_COMMON(ecn_flags); + break; + case offsetof(struct bpf_tcp_sock, rate_delivered): + BPF_TCP_SOCK_GET_COMMON(rate_delivered); + break; + case offsetof(struct bpf_tcp_sock, rate_interval_us): + BPF_TCP_SOCK_GET_COMMON(rate_interval_us); + break; + case offsetof(struct bpf_tcp_sock, packets_out): + BPF_TCP_SOCK_GET_COMMON(packets_out); + break; + case offsetof(struct bpf_tcp_sock, retrans_out): + BPF_TCP_SOCK_GET_COMMON(retrans_out); + break; + case offsetof(struct bpf_tcp_sock, total_retrans): + BPF_TCP_SOCK_GET_COMMON(total_retrans); + break; + case offsetof(struct bpf_tcp_sock, segs_in): + BPF_TCP_SOCK_GET_COMMON(segs_in); + break; + case offsetof(struct bpf_tcp_sock, data_segs_in): + BPF_TCP_SOCK_GET_COMMON(data_segs_in); + break; + case offsetof(struct bpf_tcp_sock, segs_out): + BPF_TCP_SOCK_GET_COMMON(segs_out); + break; + case offsetof(struct bpf_tcp_sock, data_segs_out): + BPF_TCP_SOCK_GET_COMMON(data_segs_out); + break; + case offsetof(struct bpf_tcp_sock, lost_out): + BPF_TCP_SOCK_GET_COMMON(lost_out); + break; + case offsetof(struct bpf_tcp_sock, sacked_out): + BPF_TCP_SOCK_GET_COMMON(sacked_out); + break; + case offsetof(struct bpf_tcp_sock, bytes_received): + BPF_TCP_SOCK_GET_COMMON(bytes_received); + break; + case offsetof(struct bpf_tcp_sock, bytes_acked): + BPF_TCP_SOCK_GET_COMMON(bytes_acked); + break; + case offsetof(struct bpf_tcp_sock, dsack_dups): + BPF_TCP_SOCK_GET_COMMON(dsack_dups); + break; + case offsetof(struct bpf_tcp_sock, delivered): + BPF_TCP_SOCK_GET_COMMON(delivered); + break; + case offsetof(struct bpf_tcp_sock, delivered_ce): + BPF_TCP_SOCK_GET_COMMON(delivered_ce); + break; + case offsetof(struct bpf_tcp_sock, icsk_retransmits): + BPF_INET_SOCK_GET_COMMON(icsk_retransmits); + break; + } + + return insn - insn_buf; +} + +BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) +{ + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_tcp_sock_proto = { + .func = bpf_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + +BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) +{ + sk = sk_to_full_sk(sk); + + if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_get_listener_sock_proto = { + .func = bpf_get_listener_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) +{ + unsigned int iphdr_len; + + switch (skb_protocol(skb, true)) { + case cpu_to_be16(ETH_P_IP): + iphdr_len = sizeof(struct iphdr); + break; + case cpu_to_be16(ETH_P_IPV6): + iphdr_len = sizeof(struct ipv6hdr); + break; + default: + return 0; + } + + if (skb_headlen(skb) < iphdr_len) + return 0; + + if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) + return 0; + + return INET_ECN_set_ce(skb); +} + +bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + default: + return size == sizeof(__u32); + } +} + +u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_XDP_SOCK_GET(FIELD) \ + do { \ + BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \ + sizeof_field(struct bpf_xdp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct xdp_sock, FIELD)); \ + } while (0) + + BTF_TYPE_EMIT(struct bpf_xdp_sock); + + switch (si->off) { + case offsetof(struct bpf_xdp_sock, queue_id): + BPF_XDP_SOCK_GET(queue_id); + break; + } + + return insn - insn_buf; +} + +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { + .func = bpf_skb_ecn_set_ce, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len, + struct tcphdr *, th, u32, th_len) +{ +#ifdef CONFIG_SYN_COOKIES + int ret; + + if (unlikely(!sk || th_len < sizeof(*th))) + return -EINVAL; + + /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */ + if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) + return -EINVAL; + + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) + return -EINVAL; + + if (!th->ack || th->rst || th->syn) + return -ENOENT; + + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + + if (tcp_synq_no_recent_overflow(sk)) + return -ENOENT; + + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). + */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) + return -EINVAL; + + ret = __cookie_v4_check((struct iphdr *)iph, th); + break; + +#if IS_BUILTIN(CONFIG_IPV6) + case 6: + if (unlikely(iph_len < sizeof(struct ipv6hdr))) + return -EINVAL; + + if (sk->sk_family != AF_INET6) + return -EINVAL; + + ret = __cookie_v6_check((struct ipv6hdr *)iph, th); + break; +#endif /* CONFIG_IPV6 */ + + default: + return -EPROTONOSUPPORT; + } + + if (ret > 0) + return 0; + + return -ENOENT; +#else + return -ENOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { + .func = bpf_tcp_check_syncookie, + .gpl_only = true, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, + struct tcphdr *, th, u32, th_len) +{ +#ifdef CONFIG_SYN_COOKIES + u32 cookie; + u16 mss; + + if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) + return -EINVAL; + + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) + return -ENOENT; + + if (!th->syn || th->ack || th->fin || th->rst) + return -EINVAL; + + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). + */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) + return -EINVAL; + + mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); + break; + +#if IS_BUILTIN(CONFIG_IPV6) + case 6: + if (unlikely(iph_len < sizeof(struct ipv6hdr))) + return -EINVAL; + + if (sk->sk_family != AF_INET6) + return -EINVAL; + + mss = tcp_v6_get_syncookie(sk, iph, th, &cookie); + break; +#endif /* CONFIG_IPV6 */ + + default: + return -EPROTONOSUPPORT; + } + if (mss == 0) + return -ENOENT; + + return cookie | ((u64)mss << 32); +#else + return -EOPNOTSUPP; +#endif /* CONFIG_SYN_COOKIES */ +} + +static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { + .func = bpf_tcp_gen_syncookie, + .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) +{ + if (!sk || flags != 0) + return -EINVAL; + if (!skb_at_tc_ingress(skb)) + return -EOPNOTSUPP; + if (unlikely(dev_net(skb->dev) != sock_net(sk))) + return -ENETUNREACH; + if (sk_unhashed(sk)) + return -EOPNOTSUPP; + if (sk_is_refcounted(sk) && + unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) + return -ENOENT; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_pfree; + + return 0; +} + +static const struct bpf_func_proto bpf_sk_assign_proto = { + .func = bpf_sk_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg3_type = ARG_ANYTHING, +}; + +static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend, + u8 search_kind, const u8 *magic, + u8 magic_len, bool *eol) +{ + u8 kind, kind_len; + + *eol = false; + + while (op < opend) { + kind = op[0]; + + if (kind == TCPOPT_EOL) { + *eol = true; + return ERR_PTR(-ENOMSG); + } else if (kind == TCPOPT_NOP) { + op++; + continue; + } + + if (opend - op < 2 || opend - op < op[1] || op[1] < 2) + /* Something is wrong in the received header. + * Follow the TCP stack's tcp_parse_options() + * and just bail here. + */ + return ERR_PTR(-EFAULT); + + kind_len = op[1]; + if (search_kind == kind) { + if (!magic_len) + return op; + + if (magic_len > kind_len - 2) + return ERR_PTR(-ENOMSG); + + if (!memcmp(&op[2], magic, magic_len)) + return op; + } + + op += kind_len; + } + + return ERR_PTR(-ENOMSG); +} + +BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + void *, search_res, u32, len, u64, flags) +{ + bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN; + const u8 *op, *opend, *magic, *search = search_res; + u8 search_kind, search_len, copy_len, magic_len; + int ret; + + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + + /* 2 byte is the minimal option len except TCPOPT_NOP and + * TCPOPT_EOL which are useless for the bpf prog to learn + * and this helper disallow loading them also. + */ + if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN) + return -EINVAL; + + search_kind = search[0]; + search_len = search[1]; + + if (search_len > len || search_kind == TCPOPT_NOP || + search_kind == TCPOPT_EOL) + return -EINVAL; + + if (search_kind == TCPOPT_EXP || search_kind == 253) { + /* 16 or 32 bit magic. +2 for kind and kind length */ + if (search_len != 4 && search_len != 6) + return -EINVAL; + magic = &search[2]; + magic_len = search_len - 2; + } else { + if (search_len) + return -EINVAL; + magic = NULL; + magic_len = 0; + } + + if (load_syn) { + ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op); + if (ret < 0) + return ret; + + opend = op + ret; + op += sizeof(struct tcphdr); + } else { + if (!bpf_sock->skb || + bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) + /* This bpf_sock->op cannot call this helper */ + return -EPERM; + + opend = bpf_sock->skb_data_end; + op = bpf_sock->skb->data + sizeof(struct tcphdr); + } + + op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len, + &eol); + if (IS_ERR(op)) + return PTR_ERR(op); + + copy_len = op[1]; + ret = copy_len; + if (copy_len > len) { + ret = -ENOSPC; + copy_len = len; + } + + memcpy(search_res, op, copy_len); + return ret; +} + +static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { + .func = bpf_sock_ops_load_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + const void *, from, u32, len, u64, flags) +{ + u8 new_kind, new_kind_len, magic_len = 0, *opend; + const u8 *op, *new_op, *magic = NULL; + struct sk_buff *skb; + bool eol; + + if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB) + return -EPERM; + + if (len < 2 || flags) + return -EINVAL; + + new_op = from; + new_kind = new_op[0]; + new_kind_len = new_op[1]; + + if (new_kind_len > len || new_kind == TCPOPT_NOP || + new_kind == TCPOPT_EOL) + return -EINVAL; + + if (new_kind_len > bpf_sock->remaining_opt_len) + return -ENOSPC; + + /* 253 is another experimental kind */ + if (new_kind == TCPOPT_EXP || new_kind == 253) { + if (new_kind_len < 4) + return -EINVAL; + /* Match for the 2 byte magic also. + * RFC 6994: the magic could be 2 or 4 bytes. + * Hence, matching by 2 byte only is on the + * conservative side but it is the right + * thing to do for the 'search-for-duplication' + * purpose. + */ + magic = &new_op[2]; + magic_len = 2; + } + + /* Check for duplication */ + skb = bpf_sock->skb; + op = skb->data + sizeof(struct tcphdr); + opend = bpf_sock->skb_data_end; + + op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len, + &eol); + if (!IS_ERR(op)) + return -EEXIST; + + if (PTR_ERR(op) != -ENOMSG) + return PTR_ERR(op); + + if (eol) + /* The option has been ended. Treat it as no more + * header option can be written. + */ + return -ENOSPC; + + /* No duplication found. Store the header option. */ + memcpy(opend, from, new_kind_len); + + bpf_sock->remaining_opt_len -= new_kind_len; + bpf_sock->skb_data_end += new_kind_len; + + return 0; +} + +static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { + .func = bpf_sock_ops_store_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + u32, len, u64, flags) +{ + if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB) + return -EPERM; + + if (flags || len < 2) + return -EINVAL; + + if (len > bpf_sock->remaining_opt_len) + return -ENOSPC; + + bpf_sock->remaining_opt_len -= len; + + return 0; +} + +static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { + .func = bpf_sock_ops_reserve_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, + u64, tstamp, u32, tstamp_type) +{ + /* skb_clear_delivery_time() is done for inet protocol */ + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -EOPNOTSUPP; + + switch (tstamp_type) { + case BPF_SKB_CLOCK_REALTIME: + skb->tstamp = tstamp; + skb->tstamp_type = SKB_CLOCK_REALTIME; + break; + case BPF_SKB_CLOCK_MONOTONIC: + if (!tstamp) + return -EINVAL; + skb->tstamp = tstamp; + skb->tstamp_type = SKB_CLOCK_MONOTONIC; + break; + case BPF_SKB_CLOCK_TAI: + if (!tstamp) + return -EINVAL; + skb->tstamp = tstamp; + skb->tstamp_type = SKB_CLOCK_TAI; + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { + .func = bpf_skb_set_tstamp, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +#ifdef CONFIG_SYN_COOKIES +BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph, + struct tcphdr *, th, u32, th_len) +{ + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT; + cookie = __cookie_v4_init_sequence(iph, th, &mss); + + return cookie | ((u64)mss << 32); +} + +static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { + .func = bpf_tcp_raw_gen_syncookie_ipv4, + .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct iphdr), + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, + struct tcphdr *, th, u32, th_len) +{ +#if IS_BUILTIN(CONFIG_IPV6) + const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - + sizeof(struct ipv6hdr); + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + mss = tcp_parse_mss_option(th, 0) ?: mss_clamp; + cookie = __cookie_v6_init_sequence(iph, th, &mss); + + return cookie | ((u64)mss << 32); +#else + return -EPROTONOSUPPORT; +#endif +} + +static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { + .func = bpf_tcp_raw_gen_syncookie_ipv6, + .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct ipv6hdr), + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, + struct tcphdr *, th) +{ + if (__cookie_v4_check(iph, th) > 0) + return 0; + + return -EACCES; +} + +static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { + .func = bpf_tcp_raw_check_syncookie_ipv4, + .gpl_only = true, /* __cookie_v4_check is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct iphdr), + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_size = sizeof(struct tcphdr), +}; + +BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, + struct tcphdr *, th) +{ +#if IS_BUILTIN(CONFIG_IPV6) + if (__cookie_v6_check(iph, th) > 0) + return 0; + + return -EACCES; +#else + return -EPROTONOSUPPORT; +#endif +} + +static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { + .func = bpf_tcp_raw_check_syncookie_ipv6, + .gpl_only = true, /* __cookie_v6_check is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct ipv6hdr), + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_size = sizeof(struct tcphdr), +}; +#endif /* CONFIG_SYN_COOKIES */ + +#endif /* CONFIG_INET */ + +bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_clone_redirect: + case BPF_FUNC_l3_csum_replace: + case BPF_FUNC_l4_csum_replace: + case BPF_FUNC_lwt_push_encap: + case BPF_FUNC_lwt_seg6_action: + case BPF_FUNC_lwt_seg6_adjust_srh: + case BPF_FUNC_lwt_seg6_store_bytes: + case BPF_FUNC_msg_pop_data: + case BPF_FUNC_msg_pull_data: + case BPF_FUNC_msg_push_data: + case BPF_FUNC_skb_adjust_room: + case BPF_FUNC_skb_change_head: + case BPF_FUNC_skb_change_proto: + case BPF_FUNC_skb_change_tail: + case BPF_FUNC_skb_pull_data: + case BPF_FUNC_skb_store_bytes: + case BPF_FUNC_skb_vlan_pop: + case BPF_FUNC_skb_vlan_push: + case BPF_FUNC_store_hdr_opt: + case BPF_FUNC_xdp_adjust_head: + case BPF_FUNC_xdp_adjust_meta: + case BPF_FUNC_xdp_adjust_tail: + /* tail-called program could call any of the above */ + case BPF_FUNC_tail_call: + return true; + default: + return false; + } +} + +const struct bpf_func_proto bpf_event_output_data_proto __weak; +const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; + +static const struct bpf_func_proto * +sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + switch (func_id) { + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_cg_sock_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + return &bpf_sock_create_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + return &bpf_sock_create_getsockopt_proto; + default: + return NULL; + } + default: + return bpf_base_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + switch (func_id) { + case BPF_FUNC_bind: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_bind_proto; + default: + return NULL; + } + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_addr_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_addr_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sock_addr_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sock_addr_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_sock_addr_skc_lookup_tcp_proto; +#endif /* CONFIG_INET */ + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: + return &bpf_sock_addr_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: + return &bpf_sock_addr_getsockopt_proto; + default: + return NULL; + } + default: + return bpf_sk_base_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &bpf_skb_load_bytes_relative_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + default: + return bpf_sk_base_func_proto(func_id, prog); + } +} + +const struct bpf_func_proto bpf_sk_storage_get_proto __weak; +const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; + +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + switch (func_id) { + case BPF_FUNC_sk_fullsock: + return &bpf_sk_fullsock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_cgroup_id: + return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; + case BPF_FUNC_sk_cgroup_id: + return &bpf_sk_cgroup_id_proto; + case BPF_FUNC_sk_ancestor_cgroup_id: + return &bpf_sk_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; + case BPF_FUNC_get_listener_sock: + return &bpf_get_listener_sock_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; +#endif + default: + return sk_filter_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &bpf_skb_load_bytes_relative_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_skb_vlan_push: + return &bpf_skb_vlan_push_proto; + case BPF_FUNC_skb_vlan_pop: + return &bpf_skb_vlan_pop_proto; + case BPF_FUNC_skb_change_proto: + return &bpf_skb_change_proto_proto; + case BPF_FUNC_skb_change_type: + return &bpf_skb_change_type_proto; + case BPF_FUNC_skb_adjust_room: + return &bpf_skb_adjust_room_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_redirect_neigh: + return &bpf_redirect_neigh_proto; + case BPF_FUNC_redirect_peer: + return &bpf_redirect_peer_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; + case BPF_FUNC_set_hash: + return &bpf_set_hash_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; + case BPF_FUNC_fib_lookup: + return &bpf_skb_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_skb_check_mtu_proto; + case BPF_FUNC_sk_fullsock: + return &bpf_sk_fullsock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_XFRM + case BPF_FUNC_skb_get_xfrm_state: + return &bpf_skb_get_xfrm_state_proto; +#endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_skb_cgroup_classid: + return &bpf_skb_cgroup_classid_proto; +#endif +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_cgroup_id: + return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_tc_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_tc_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; + case BPF_FUNC_get_listener_sock: + return &bpf_get_listener_sock_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_tc_skc_lookup_tcp_proto; + case BPF_FUNC_tcp_check_syncookie: + return &bpf_tcp_check_syncookie_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; + case BPF_FUNC_tcp_gen_syncookie: + return &bpf_tcp_gen_syncookie_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_assign_proto; + case BPF_FUNC_skb_set_tstamp: + return &bpf_skb_set_tstamp_proto; +#ifdef CONFIG_SYN_COOKIES + case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: + return &bpf_tcp_raw_gen_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: + return &bpf_tcp_raw_gen_syncookie_ipv6_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv4: + return &bpf_tcp_raw_check_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv6: + return &bpf_tcp_raw_check_syncookie_ipv6_proto; +#endif +#endif + default: + return bpf_sk_base_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_xdp_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_xdp_adjust_head: + return &bpf_xdp_adjust_head_proto; + case BPF_FUNC_xdp_adjust_meta: + return &bpf_xdp_adjust_meta_proto; + case BPF_FUNC_redirect: + return &bpf_xdp_redirect_proto; + case BPF_FUNC_redirect_map: + return &bpf_xdp_redirect_map_proto; + case BPF_FUNC_xdp_adjust_tail: + return &bpf_xdp_adjust_tail_proto; + case BPF_FUNC_xdp_get_buff_len: + return &bpf_xdp_get_buff_len_proto; + case BPF_FUNC_xdp_load_bytes: + return &bpf_xdp_load_bytes_proto; + case BPF_FUNC_xdp_store_bytes: + return &bpf_xdp_store_bytes_proto; + case BPF_FUNC_fib_lookup: + return &bpf_xdp_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_xdp_check_mtu_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_udp: + return &bpf_xdp_sk_lookup_udp_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_xdp_sk_lookup_tcp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_xdp_skc_lookup_tcp_proto; + case BPF_FUNC_tcp_check_syncookie: + return &bpf_tcp_check_syncookie_proto; + case BPF_FUNC_tcp_gen_syncookie: + return &bpf_tcp_gen_syncookie_proto; +#ifdef CONFIG_SYN_COOKIES + case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: + return &bpf_tcp_raw_gen_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: + return &bpf_tcp_raw_gen_syncookie_ipv6_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv4: + return &bpf_tcp_raw_check_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv6: + return &bpf_tcp_raw_check_syncookie_ipv6_proto; +#endif +#endif + default: + return bpf_sk_base_func_proto(func_id, prog); + } + +#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) + /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The + * kfuncs are defined in two different modules, and we want to be able + * to use them interchangeably with the same BTF type ID. Because modules + * can't de-duplicate BTF IDs between each other, we need the type to be + * referenced in the vmlinux BTF or the verifier will get confused about + * the different types. So we add this dummy type reference which will + * be included in vmlinux BTF, allowing both modules to refer to the + * same type ID. + */ + BTF_TYPE_EMIT(struct nf_conn___init); +#endif +} + +const struct bpf_func_proto bpf_sock_map_update_proto __weak; +const struct bpf_func_proto bpf_sock_hash_update_proto __weak; + +static const struct bpf_func_proto * +sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_sock_ops_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_sock_ops_getsockopt_proto; + case BPF_FUNC_sock_ops_cb_flags_set: + return &bpf_sock_ops_cb_flags_set_proto; + case BPF_FUNC_sock_map_update: + return &bpf_sock_map_update_proto; + case BPF_FUNC_sock_hash_update: + return &bpf_sock_hash_update_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_ops_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_ops_proto; +#ifdef CONFIG_INET + case BPF_FUNC_load_hdr_opt: + return &bpf_sock_ops_load_hdr_opt_proto; + case BPF_FUNC_store_hdr_opt: + return &bpf_sock_ops_store_hdr_opt_proto; + case BPF_FUNC_reserve_hdr_opt: + return &bpf_sock_ops_reserve_hdr_opt_proto; + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif /* CONFIG_INET */ + default: + return bpf_sk_base_func_proto(func_id, prog); + } +} + +const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; +const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; + +static const struct bpf_func_proto * +sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_msg_redirect_map: + return &bpf_msg_redirect_map_proto; + case BPF_FUNC_msg_redirect_hash: + return &bpf_msg_redirect_hash_proto; + case BPF_FUNC_msg_apply_bytes: + return &bpf_msg_apply_bytes_proto; + case BPF_FUNC_msg_cork_bytes: + return &bpf_msg_cork_bytes_proto; + case BPF_FUNC_msg_pull_data: + return &bpf_msg_pull_data_proto; + case BPF_FUNC_msg_push_data: + return &bpf_msg_push_data_proto; + case BPF_FUNC_msg_pop_data: + return &bpf_msg_pop_data_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sk_msg_proto; + default: + return bpf_sk_base_func_proto(func_id, prog); + } +} + +const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; +const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; + +static const struct bpf_func_proto * +sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &sk_skb_pull_data_proto; + case BPF_FUNC_skb_change_tail: + return &sk_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &sk_skb_change_head_proto; + case BPF_FUNC_skb_adjust_room: + return &sk_skb_adjust_room_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; + case BPF_FUNC_sk_redirect_map: + return &bpf_sk_redirect_map_proto; + case BPF_FUNC_sk_redirect_hash: + return &bpf_sk_redirect_hash_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; +#endif + default: + return bpf_sk_base_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_flow_dissector_load_bytes_proto; + default: + return bpf_sk_base_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return bpf_sk_base_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_in_push_encap_proto; + default: + return lwt_out_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_xmit_push_encap_proto; + default: + return lwt_out_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + case BPF_FUNC_lwt_seg6_store_bytes: + return &bpf_lwt_seg6_store_bytes_proto; + case BPF_FUNC_lwt_seg6_action: + return &bpf_lwt_seg6_action_proto; + case BPF_FUNC_lwt_seg6_adjust_srh: + return &bpf_lwt_seg6_adjust_srh_proto; +#endif + default: + return lwt_out_func_proto(func_id, prog); + } +} + +static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct __sk_buff)) + return false; + + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + + switch (off) { + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + if (off + size > offsetofend(struct __sk_buff, cb[4])) + return false; + break; + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, data_end): + if (info->is_ldsx || size != size_default) + return false; + break; + case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): + case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): + case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): + if (size != size_default) + return false; + break; + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + return false; + case bpf_ctx_range(struct __sk_buff, hwtstamp): + if (type == BPF_WRITE || size != sizeof(__u64)) + return false; + break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (size != sizeof(__u64)) + return false; + break; + case bpf_ctx_range_ptr(struct __sk_buff, sk): + if (type == BPF_WRITE || size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + break; + case offsetof(struct __sk_buff, tstamp_type): + return false; + case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: + /* Explicitly prohibit access to padding in __sk_buff. */ + return false; + default: + /* Only narrow read access allowed for now. */ + if (type == BPF_WRITE) { + if (size != size_default) + return false; + } else { + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + } + } + + return true; +} + +static bool sk_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, data_end): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): + case bpf_ctx_range(struct __sk_buff, hwtstamp): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + default: + return false; + } + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + +static bool cg_skb_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, wire_len): + return false; + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) + return false; + break; + } + + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) + return false; + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + +static bool lwt_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): + case bpf_ctx_range(struct __sk_buff, hwtstamp): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + +/* Attach type specific accesses */ +static bool __sock_filter_check_attach_type(int off, + enum bpf_access_type access_type, + enum bpf_attach_type attach_type) +{ + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + case offsetof(struct bpf_sock, mark): + case offsetof(struct bpf_sock, priority): + switch (attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: + goto full_access; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_ip4): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + switch (attach_type) { + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_port): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; + default: + return false; + } + } +read_only: + return access_type == BPF_READ; +full_access: + return true; +} + +bool bpf_sock_common_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range_till(struct bpf_sock, type, priority): + return false; + default: + return bpf_sock_is_valid_access(off, size, type, info); + } +} + +bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + int field_size; + + if (off < 0 || off >= sizeof(struct bpf_sock)) + return false; + if (off % size != 0) + return false; + + switch (off) { + case offsetof(struct bpf_sock, state): + case offsetof(struct bpf_sock, family): + case offsetof(struct bpf_sock, type): + case offsetof(struct bpf_sock, protocol): + case offsetof(struct bpf_sock, src_port): + case offsetof(struct bpf_sock, rx_queue_mapping): + case bpf_ctx_range(struct bpf_sock, src_ip4): + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + case bpf_ctx_range(struct bpf_sock, dst_ip4): + case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + case bpf_ctx_range(struct bpf_sock, dst_port): + field_size = size == size_default ? + size_default : sizeof_field(struct bpf_sock, dst_port); + bpf_ctx_record_field_size(info, field_size); + return bpf_ctx_narrow_access_ok(off, size, field_size); + case offsetofend(struct bpf_sock, dst_port) ... + offsetof(struct bpf_sock, dst_ip4) - 1: + return false; + } + + return size == size_default; +} + +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (!bpf_sock_is_valid_access(off, size, type, info)) + return false; + return __sock_filter_check_attach_type(off, type, + prog->expected_attach_type); +} + +static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + /* Neither direct read nor direct write requires any preliminary + * action. + */ + return 0; +} + +static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog, int drop_verdict) +{ + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) + */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return TC_ACT_SHOT; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict); + *insn++ = BPF_EXIT_INSN(); + + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +static int bpf_gen_ld_abs(const struct bpf_insn *orig, + struct bpf_insn *insn_buf) +{ + bool indirect = BPF_MODE(orig->code) == BPF_IND; + struct bpf_insn *insn = insn_buf; + + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); + if (orig->imm) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); + } + /* We're guaranteed here that CTX is in R6. */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); + + switch (BPF_SIZE(orig->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); + break; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); +} + +static bool tc_cls_act_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, tc_index): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, queue_mapping): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + return false; + case offsetof(struct __sk_buff, tstamp_type): + /* The convert_ctx_access() on reading and writing + * __sk_buff->tstamp depends on whether the bpf prog + * has used __sk_buff->tstamp_type or not. + * Thus, we need to set prog->tstamp_type_access + * earlier during is_valid_access() here. + */ + ((struct bpf_prog *)prog)->tstamp_type_access = 1; + return size == sizeof(__u8); + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + +DEFINE_MUTEX(nf_conn_btf_access_lock); +EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); + +int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size); +EXPORT_SYMBOL_GPL(nfct_btf_struct_access); + +static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + int ret = -EACCES; + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, reg, off, size); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + +static bool __is_valid_xdp_access(int off, int size) +{ + if (off < 0 || off >= sizeof(struct xdp_md)) + return false; + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +static bool xdp_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (prog->expected_attach_type != BPF_XDP_DEVMAP) { + switch (off) { + case offsetof(struct xdp_md, egress_ifindex): + return false; + } + } + + if (type == BPF_WRITE) { + if (bpf_prog_is_offloaded(prog->aux)) { + switch (off) { + case offsetof(struct xdp_md, rx_queue_index): + return __is_valid_xdp_access(off, size); + } + } + return false; + } else { + switch (off) { + case offsetof(struct xdp_md, data_meta): + case offsetof(struct xdp_md, data): + case offsetof(struct xdp_md, data_end): + if (info->is_ldsx) + return false; + } + } + + switch (off) { + case offsetof(struct xdp_md, data): + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct xdp_md, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; + case offsetof(struct xdp_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_xdp_access(off, size); +} + +void bpf_warn_invalid_xdp_action(const struct net_device *dev, + const struct bpf_prog *prog, u32 act) +{ + const u32 act_max = XDP_REDIRECT; + + pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n", + act > act_max ? "Illegal" : "Driver unsupported", + act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A"); +} +EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); + +static int xdp_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + int ret = -EACCES; + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, reg, off, size); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + +static bool sock_addr_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sock_addr)) + return false; + if (off % size != 0) + return false; + + /* Disallow access to fields not belonging to the attach type's address + * family. + */ + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + break; + default: + return false; + } + break; + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP6_RECVMSG: + break; + default: + return false; + } + break; + case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): + switch (prog->expected_attach_type) { + case BPF_CGROUP_UDP4_SENDMSG: + break; + default: + return false; + } + break; + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], + msg_src_ip6[3]): + switch (prog->expected_attach_type) { + case BPF_CGROUP_UDP6_SENDMSG: + break; + default: + return false; + } + break; + } + + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], + msg_src_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, user_port): + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + } else { + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + + if (size != size_default) + return false; + } + break; + case bpf_ctx_range_ptr(struct bpf_sock_addr, sk): + if (type != BPF_READ) + return false; + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; + case bpf_ctx_range(struct bpf_sock_addr, user_family): + case bpf_ctx_range(struct bpf_sock_addr, family): + case bpf_ctx_range(struct bpf_sock_addr, type): + case bpf_ctx_range(struct bpf_sock_addr, protocol): + if (type != BPF_READ) + return false; + if (size != size_default) + return false; + break; + default: + return false; + } + + return true; +} + +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) + return false; + + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock_ops, reply): + case offsetof(struct bpf_sock_ops, sk_txhash): + if (size != size_default) + return false; + break; + default: + return false; + } + } else { + switch (off) { + case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, + bytes_acked): + if (size != sizeof(__u64)) + return false; + break; + case bpf_ctx_range_ptr(struct bpf_sock_ops, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET_OR_NULL; + break; + case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sock_ops, skb_tcp_flags): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, + size_default); + case bpf_ctx_range(struct bpf_sock_ops, skb_hwtstamp): + if (size != sizeof(__u64)) + return false; + break; + default: + if (size != size_default) + return false; + break; + } + } + + return true; +} + +static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); +} + +static bool sk_skb_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): + case bpf_ctx_range(struct __sk_buff, hwtstamp): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_index): + case bpf_ctx_range(struct __sk_buff, priority): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + return false; + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + +static bool sk_msg_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + if (off % size != 0) + return false; + + switch (off) { + case bpf_ctx_range_ptr(struct sk_msg_md, data): + info->reg_type = PTR_TO_PACKET; + if (size != sizeof(__u64)) + return false; + break; + case bpf_ctx_range_ptr(struct sk_msg_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + if (size != sizeof(__u64)) + return false; + break; + case bpf_ctx_range_ptr(struct sk_msg_md, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; + case bpf_ctx_range(struct sk_msg_md, family): + case bpf_ctx_range(struct sk_msg_md, remote_ip4): + case bpf_ctx_range(struct sk_msg_md, local_ip4): + case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct sk_msg_md, remote_port): + case bpf_ctx_range(struct sk_msg_md, local_port): + case bpf_ctx_range(struct sk_msg_md, size): + if (size != sizeof(__u32)) + return false; + break; + default: + return false; + } + return true; +} + +static bool flow_dissector_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct __sk_buff)) + return false; + + if (off % size != 0) + return false; + + if (type == BPF_WRITE) + return false; + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + if (info->is_ldsx || size != size_default) + return false; + info->reg_type = PTR_TO_PACKET; + return true; + case bpf_ctx_range(struct __sk_buff, data_end): + if (info->is_ldsx || size != size_default) + return false; + info->reg_type = PTR_TO_PACKET_END; + return true; + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_FLOW_KEYS; + return true; + default: + return false; + } +} + +static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) + +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct __sk_buff, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data), + si->dst_reg, si->src_reg, + offsetof(struct bpf_flow_dissector, data)); + break; + + case offsetof(struct __sk_buff, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end), + si->dst_reg, si->src_reg, + offsetof(struct bpf_flow_dissector, data_end)); + break; + + case offsetof(struct __sk_buff, flow_keys): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys), + si->dst_reg, si->src_reg, + offsetof(struct bpf_flow_dissector, flow_keys)); + break; + } + + return insn - insn_buf; +} + +static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); + BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); + BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); + BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); + *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); +#else + BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); +#endif + + return insn; +} + +static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb_shinfo(SKB); */ +#ifdef NET_SKBUFF_DATA_USES_OFFSET + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + BPF_REG_AX, skb_reg, + offsetof(struct sk_buff, end)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), + dst_reg, skb_reg, + offsetof(struct sk_buff, head)); + *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX); +#else + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + dst_reg, skb_reg, + offsetof(struct sk_buff, end)); +#endif + + return insn; +} + +static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, + const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + +#ifdef CONFIG_NET_XGRESS + /* If the tstamp_type is read, + * the bpf prog is aware the tstamp could have delivery time. + * Thus, read skb->tstamp as is if tstamp_type_access is true. + */ + if (!prog->tstamp_type_access) { + /* AX is needed because src_reg and dst_reg could be the same */ + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); + /* check if ingress mask bits is set */ + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); + *insn++ = BPF_JMP_A(4); + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); + *insn++ = BPF_JMP_A(2); + /* skb->tc_at_ingress && skb->tstamp_type, + * read 0 as the (rcv) timestamp. + */ + *insn++ = BPF_MOV64_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); + } +#endif + + *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, + offsetof(struct sk_buff, tstamp)); + return insn; +} + +static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, + const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->src_reg; + __u8 skb_reg = si->dst_reg; + +#ifdef CONFIG_NET_XGRESS + /* If the tstamp_type is read, + * the bpf prog is aware the tstamp could have delivery time. + * Thus, write skb->tstamp as is if tstamp_type_access is true. + * Otherwise, writing at ingress will have to clear the + * skb->tstamp_type bit also. + */ + if (!prog->tstamp_type_access) { + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); + /* Writing __sk_buff->tstamp as ingress, goto <clear> */ + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); + /* goto <store> */ + *insn++ = BPF_JMP_A(2); + /* <clear>: skb->tstamp_type */ + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); + *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); + } +#endif + + /* <store>: skb->tstamp = tstamp */ + *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM, + skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm); + return insn; +} + +#define BPF_EMIT_STORE(size, si, off) \ + BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM, \ + (si)->dst_reg, (si)->src_reg, (off), (si)->imm) + +static u32 bpf_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct __sk_buff, len): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, len, 4, + target_size)); + break; + + case offsetof(struct __sk_buff, protocol): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, protocol, 2, + target_size)); + break; + + case offsetof(struct __sk_buff, vlan_proto): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, vlan_proto, 2, + target_size)); + break; + + case offsetof(struct __sk_buff, priority): + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_W, si, + bpf_target_off(struct sk_buff, priority, 4, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, priority, 4, + target_size)); + break; + + case offsetof(struct __sk_buff, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, skb_iif, 4, + target_size)); + break; + + case offsetof(struct __sk_buff, ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct net_device, ifindex, 4, + target_size)); + break; + + case offsetof(struct __sk_buff, hash): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, hash, 4, + target_size)); + break; + + case offsetof(struct __sk_buff, mark): + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_W, si, + bpf_target_off(struct sk_buff, mark, 4, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, mark, 4, + target_size)); + break; + + case offsetof(struct __sk_buff, pkt_type): + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_TYPE_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); +#endif + break; + + case offsetof(struct __sk_buff, queue_mapping): + if (type == BPF_WRITE) { + u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); + + if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { + *insn++ = BPF_JMP_A(0); /* noop */ + break; + } + + if (BPF_CLASS(si->code) == BPF_STX) + *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); + *insn++ = BPF_EMIT_STORE(BPF_H, si, offset); + } else { + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + queue_mapping, + 2, target_size)); + } + break; + + case offsetof(struct __sk_buff, vlan_present): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + vlan_all, 4, target_size)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1); + break; + + case offsetof(struct __sk_buff, vlan_tci): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, vlan_tci, 2, + target_size)); + break; + + case offsetof(struct __sk_buff, cb[0]) ... + offsetofend(struct __sk_buff, cb[4]) - 1: + BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct qdisc_skb_cb, data)) % + sizeof(__u64)); + + prog->cb_access = 1; + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); + else + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + break; + + case offsetof(struct __sk_buff, tc_classid): + BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2); + + off = si->off; + off -= offsetof(struct __sk_buff, tc_classid); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, tc_classid); + *target_size = 2; + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_H, si, off); + else + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); + break; + + case offsetof(struct __sk_buff, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, data)); + break; + + case offsetof(struct __sk_buff, data_meta): + off = si->off; + off -= offsetof(struct __sk_buff, data_meta); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_meta); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + + case offsetof(struct __sk_buff, data_end): + off = si->off; + off -= offsetof(struct __sk_buff, data_end); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_end); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + + case offsetof(struct __sk_buff, tc_index): +#ifdef CONFIG_NET_SCHED + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_H, si, + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); +#else + *target_size = 2; + if (type == BPF_WRITE) + *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); + else + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct __sk_buff, napi_id): +#if defined(CONFIG_NET_RX_BUSY_POLL) + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, napi_id, 4, + target_size)); + *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#else + *target_size = 4; + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#endif + break; + case offsetof(struct __sk_buff, family): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_family, + 2, target_size)); + break; + case offsetof(struct __sk_buff, remote_ip4): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_daddr, + 4, target_size)); + break; + case offsetof(struct __sk_buff, local_ip4): + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_rcv_saddr, + 4, target_size)); + break; + case offsetof(struct __sk_buff, remote_ip6[0]) ... + offsetof(struct __sk_buff, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, remote_ip6[0]); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + case offsetof(struct __sk_buff, local_ip6[0]) ... + offsetof(struct __sk_buff, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, local_ip6[0]); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct __sk_buff, remote_port): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_dport, + 2, target_size)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct __sk_buff, local_port): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_num, 2, target_size)); + break; + + case offsetof(struct __sk_buff, tstamp): + BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); + + if (type == BPF_WRITE) + insn = bpf_convert_tstamp_write(prog, si, insn); + else + insn = bpf_convert_tstamp_read(prog, si, insn); + break; + + case offsetof(struct __sk_buff, tstamp_type): + insn = bpf_convert_tstamp_type_read(si, insn); + break; + + case offsetof(struct __sk_buff, gso_segs): + insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + gso_segs, 2, + target_size)); + break; + case offsetof(struct __sk_buff, gso_size): + insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + gso_size, 2, + target_size)); + break; + case offsetof(struct __sk_buff, wire_len): + BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, wire_len); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, pkt_len); + *target_size = 4; + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); + break; + + case offsetof(struct __sk_buff, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + break; + case offsetof(struct __sk_buff, hwtstamp): + BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8); + BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0); + + insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); + *insn++ = BPF_LDX_MEM(BPF_DW, + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + hwtstamps, 8, + target_size)); + break; + } + + return insn - insn_buf; +} + +u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock, bound_dev_if): + BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_W, si, + offsetof(struct sock, sk_bound_dev_if)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_bound_dev_if)); + break; + + case offsetof(struct bpf_sock, mark): + BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_W, si, + offsetof(struct sock, sk_mark)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_mark)); + break; + + case offsetof(struct bpf_sock, priority): + BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_W, si, + offsetof(struct sock, sk_priority)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_priority)); + break; + + case offsetof(struct bpf_sock, family): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_family), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, + skc_family, + sizeof_field(struct sock_common, + skc_family), + target_size)); + break; + + case offsetof(struct bpf_sock, type): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_type), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_type, + sizeof_field(struct sock, sk_type), + target_size)); + break; + + case offsetof(struct bpf_sock, protocol): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_protocol), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_protocol, + sizeof_field(struct sock, sk_protocol), + target_size)); + break; + + case offsetof(struct bpf_sock, src_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_rcv_saddr, + sizeof_field(struct sock_common, + skc_rcv_saddr), + target_size)); + break; + + case offsetof(struct bpf_sock, dst_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_daddr, + sizeof_field(struct sock_common, + skc_daddr), + target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, src_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off( + struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0], + sizeof_field(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]), + target_size) + off); +#else + (void)off; + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, dst_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, + skc_v6_daddr.s6_addr32[0], + sizeof_field(struct sock_common, + skc_v6_daddr.s6_addr32[0]), + target_size) + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); + *target_size = 4; +#endif + break; + + case offsetof(struct bpf_sock, src_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_num), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_num, + sizeof_field(struct sock_common, + skc_num), + target_size)); + break; + + case offsetof(struct bpf_sock, dst_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_dport), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_dport, + sizeof_field(struct sock_common, + skc_dport), + target_size)); + break; + + case offsetof(struct bpf_sock, state): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_state), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_state, + sizeof_field(struct sock_common, + skc_state), + target_size)); + break; + case offsetof(struct bpf_sock, rx_queue_mapping): +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_rx_queue_mapping, + sizeof_field(struct sock, + sk_rx_queue_mapping), + target_size)); + *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, + 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); +#else + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); + *target_size = 2; +#endif + break; + } + + return insn - insn_buf; +} + +static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct __sk_buff, ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct net_device, ifindex, 4, + target_size)); + break; + default: + return bpf_convert_ctx_access(type, si, insn_buf, prog, + target_size); + } + + return insn - insn_buf; +} + +static u32 xdp_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct xdp_md, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data)); + break; + case offsetof(struct xdp_md, data_meta): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data_meta)); + break; + case offsetof(struct xdp_md, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data_end)); + break; + case offsetof(struct xdp_md, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; + case offsetof(struct xdp_md, rx_queue_index): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, + queue_index)); + break; + case offsetof(struct xdp_md, egress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, txq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_txq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; + } + + return insn - insn_buf; +} + +/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of + * context Structure, F is Field in context structure that contains a pointer + * to Nested Structure of type NS that has the field NF. + * + * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make + * sure that SIZE is not greater than actual size of S.F.NF. + * + * If offset OFF is provided, the load happens from that offset relative to + * offset of NF. + */ +#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ + do { \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ + si->src_reg, offsetof(S, F)); \ + *insn++ = BPF_LDX_MEM( \ + SIZE, si->dst_reg, si->dst_reg, \ + bpf_target_off(NS, NF, sizeof_field(NS, NF), \ + target_size) \ + + OFF); \ + } while (0) + +#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ + BPF_FIELD_SIZEOF(NS, NF), 0) + +/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to + * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. + * + * In addition it uses Temporary Field TF (member of struct S) as the 3rd + * "register" since two registers available in convert_ctx_access are not + * enough: we can't override neither SRC, since it contains value to store, nor + * DST since it contains pointer to context that may be used by later + * instructions. But we need a temporary place to save pointer to nested + * structure whose field we want to store to. + */ +#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ + do { \ + int tmp_reg = BPF_REG_9; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ + offsetof(S, TF)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ + si->dst_reg, offsetof(S, F)); \ + *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \ + tmp_reg, si->src_reg, \ + bpf_target_off(NS, NF, sizeof_field(NS, NF), \ + target_size) \ + + OFF, \ + si->imm); \ + *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ + offsetof(S, TF)); \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ + TF) \ + do { \ + if (type == BPF_WRITE) { \ + SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ + OFF, TF); \ + } else { \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, SIZE, OFF); \ + } \ + } while (0) + +static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sock_addr, user_family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr, uaddr, sa_family); + break; + + case offsetof(struct bpf_sock_addr, user_ip4): + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, + sin_addr, BPF_SIZE(si->code), 0, tmp_reg); + break; + + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock_addr, user_ip6[0]); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, + tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, user_port): + /* To get port we need to know sa_family first and then treat + * sockaddr as either sockaddr_in or sockaddr_in6. + * Though we can simplify since port field has same offset and + * size in both structures. + * Here we check this invariant and use just one of the + * structures if it's true. + */ + BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != + offsetof(struct sockaddr_in6, sin6_port)); + BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != + sizeof_field(struct sockaddr_in6, sin6_port)); + /* Account for sin6_port being smaller than user_port. */ + port_size = min(port_size, BPF_LDST_BYTES(si)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_family); + break; + + case offsetof(struct bpf_sock_addr, type): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_type); + break; + + case offsetof(struct bpf_sock_addr, protocol): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_protocol); + break; + + case offsetof(struct bpf_sock_addr, msg_src_ip4): + /* Treat t_ctx as struct in_addr for msg_src_ip4. */ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct in_addr, t_ctx, + s_addr, BPF_SIZE(si->code), 0, tmp_reg); + break; + + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], + msg_src_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); + /* Treat t_ctx as struct in6_addr for msg_src_ip6. */ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct in6_addr, t_ctx, + s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); + break; + case offsetof(struct bpf_sock_addr, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_addr_kern, sk)); + break; + } + + return insn - insn_buf; +} + +static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + +/* Helper macro for adding read access to tcp_sock or sock fields. */ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \ + BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ + sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + fullsock_reg = reg; \ + jmp += 2; \ + } \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_locked_tcp_sock), \ + fullsock_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_locked_tcp_sock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ + if (si->dst_reg == si->src_reg) \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_JMP_A(1); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } \ + } while (0) + +#define SOCK_OPS_GET_SK() \ + do { \ + int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + fullsock_reg = reg; \ + jmp += 2; \ + } \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + fullsock_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ + if (si->dst_reg == si->src_reg) \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_JMP_A(1); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } \ + } while (0) + +#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ + SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) + +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other register. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. + */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ + sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_locked_tcp_sock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_locked_tcp_sock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \ + BPF_MEM | BPF_CLASS(si->code), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD), \ + si->imm); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + } while (0) + + switch (si->off) { + case offsetof(struct bpf_sock_ops, op): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + op), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, op)); + break; + + case offsetof(struct bpf_sock_ops, replylong[0]) ... + offsetof(struct bpf_sock_ops, replylong[3]): + BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) != + sizeof_field(struct bpf_sock_ops_kern, reply)); + BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) != + sizeof_field(struct bpf_sock_ops_kern, replylong)); + off = si->off; + off -= offsetof(struct bpf_sock_ops, replylong[0]); + off += offsetof(struct bpf_sock_ops_kern, replylong[0]); + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_W, si, off); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + break; + + case offsetof(struct bpf_sock_ops, family): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip4): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct bpf_sock_ops, local_ip4): + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... + offsetof(struct bpf_sock_ops, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_ip6[0]) ... + offsetof(struct bpf_sock_ops, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, remote_port): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_port): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; + + case offsetof(struct bpf_sock_ops, is_fullsock): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + break; + + case offsetof(struct bpf_sock_ops, state): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_state)); + break; + + case offsetof(struct bpf_sock_ops, rtt_min): + BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct tcp_sock, rtt_min) + + sizeof_field(struct minmax_sample, t)); + break; + + case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): + SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sk_txhash): + SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, + struct sock, type); + break; + case offsetof(struct bpf_sock_ops, snd_cwnd): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd); + break; + case offsetof(struct bpf_sock_ops, srtt_us): + SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us); + break; + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh); + break; + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt); + break; + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt); + break; + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una); + break; + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache); + break; + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags); + break; + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered); + break; + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us); + break; + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out); + break; + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out); + break; + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans); + break; + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in); + break; + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in); + break; + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out); + break; + case offsetof(struct bpf_sock_ops, data_segs_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out); + break; + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out); + break; + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out); + break; + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received); + break; + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); + break; + case offsetof(struct bpf_sock_ops, sk): + SOCK_OPS_GET_SK(); + break; + case offsetof(struct bpf_sock_ops, skb_data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb_data_end), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb_data_end)); + break; + case offsetof(struct bpf_sock_ops, skb_data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->dst_reg, + offsetof(struct sk_buff, data)); + break; + case offsetof(struct bpf_sock_ops, skb_len): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + si->dst_reg, si->dst_reg, + offsetof(struct sk_buff, len)); + break; + case offsetof(struct bpf_sock_ops, skb_tcp_flags): + off = offsetof(struct sk_buff, cb); + off += offsetof(struct tcp_skb_cb, tcp_flags); + *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, + tcp_flags), + si->dst_reg, si->dst_reg, off); + break; + case offsetof(struct bpf_sock_ops, skb_hwtstamp): { + struct bpf_insn *jmp_on_null_skb; + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + /* Reserve one insn to test skb == NULL */ + jmp_on_null_skb = insn++; + insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn); + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + hwtstamps, 8, + target_size)); + *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, + insn - jmp_on_null_skb - 1); + break; + } + } + return insn - insn_buf; +} + +/* data_end = skb->data + skb_headlen() */ +static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + int reg; + int temp_reg_off = offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, temp_reg); + + if (si->src_reg == si->dst_reg) { + /* We need an extra register, choose and save a register. */ + reg = BPF_REG_9; + if (si->src_reg == reg || si->dst_reg == reg) + reg--; + if (si->src_reg == reg || si->dst_reg == reg) + reg--; + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off); + } else { + reg = si->dst_reg; + } + + /* reg = skb->data */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + reg, si->src_reg, + offsetof(struct sk_buff, data)); + /* AX = skb->len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, len)); + /* reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX); + /* AX = skb->data_len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, data_len)); + + /* reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX); + + if (si->src_reg == si->dst_reg) { + /* Restore the saved register */ + *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg); + *insn++ = BPF_MOV64_REG(si->dst_reg, reg); + *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off); + } + + return insn; +} + +static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct __sk_buff, data_end): + insn = bpf_convert_data_end_access(si, insn); + break; + case offsetof(struct __sk_buff, cb[0]) ... + offsetofend(struct __sk_buff, cb[4]) - 1: + BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, data)) % + sizeof(__u64)); + + prog->cb_access = 1; + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct sk_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); + else + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + break; + + + default: + return bpf_convert_ctx_access(type, si, insn_buf, prog, + target_size); + } + + return insn - insn_buf; +} + +static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; +#if IS_ENABLED(CONFIG_IPV6) + int off; +#endif + + /* convert ctx uses the fact sg element is first in struct */ + BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); + + switch (si->off) { + case offsetof(struct sk_msg_md, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, data)); + break; + case offsetof(struct sk_msg_md, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, data_end)); + break; + case offsetof(struct sk_msg_md, family): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct sk_msg_md, remote_ip4): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct sk_msg_md, local_ip4): + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct sk_msg_md, remote_ip6[0]) ... + offsetof(struct sk_msg_md, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, local_ip6[0]) ... + offsetof(struct sk_msg_md, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, remote_port): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct sk_msg_md, local_port): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; + + case offsetof(struct sk_msg_md, size): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_sg, size)); + break; + + case offsetof(struct sk_msg_md, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops sk_filter_verifier_ops = { + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, + .gen_ld_abs = bpf_gen_ld_abs, +}; + +const struct bpf_prog_ops sk_filter_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops tc_cls_act_verifier_ops = { + .get_func_proto = tc_cls_act_func_proto, + .is_valid_access = tc_cls_act_is_valid_access, + .convert_ctx_access = tc_cls_act_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, + .gen_ld_abs = bpf_gen_ld_abs, + .btf_struct_access = tc_cls_act_btf_struct_access, +}; + +const struct bpf_prog_ops tc_cls_act_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops xdp_verifier_ops = { + .get_func_proto = xdp_func_proto, + .is_valid_access = xdp_is_valid_access, + .convert_ctx_access = xdp_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, + .btf_struct_access = xdp_btf_struct_access, +}; + +const struct bpf_prog_ops xdp_prog_ops = { + .test_run = bpf_prog_test_run_xdp, +}; + +const struct bpf_verifier_ops cg_skb_verifier_ops = { + .get_func_proto = cg_skb_func_proto, + .is_valid_access = cg_skb_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_skb_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_in_verifier_ops = { + .get_func_proto = lwt_in_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_in_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_out_verifier_ops = { + .get_func_proto = lwt_out_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_out_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_xmit_verifier_ops = { + .get_func_proto = lwt_xmit_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, +}; + +const struct bpf_prog_ops lwt_xmit_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { + .get_func_proto = lwt_seg6local_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_seg6local_prog_ops = { +}; + +const struct bpf_verifier_ops cg_sock_verifier_ops = { + .get_func_proto = sock_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, + .convert_ctx_access = bpf_sock_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sock_prog_ops = { +}; + +const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { + .get_func_proto = sock_addr_func_proto, + .is_valid_access = sock_addr_is_valid_access, + .convert_ctx_access = sock_addr_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sock_addr_prog_ops = { +}; + +const struct bpf_verifier_ops sock_ops_verifier_ops = { + .get_func_proto = sock_ops_func_proto, + .is_valid_access = sock_ops_is_valid_access, + .convert_ctx_access = sock_ops_convert_ctx_access, +}; + +const struct bpf_prog_ops sock_ops_prog_ops = { +}; + +const struct bpf_verifier_ops sk_skb_verifier_ops = { + .get_func_proto = sk_skb_func_proto, + .is_valid_access = sk_skb_is_valid_access, + .convert_ctx_access = sk_skb_convert_ctx_access, + .gen_prologue = sk_skb_prologue, +}; + +const struct bpf_prog_ops sk_skb_prog_ops = { +}; + +const struct bpf_verifier_ops sk_msg_verifier_ops = { + .get_func_proto = sk_msg_func_proto, + .is_valid_access = sk_msg_is_valid_access, + .convert_ctx_access = sk_msg_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, +}; + +const struct bpf_prog_ops sk_msg_prog_ops = { +}; + +const struct bpf_verifier_ops flow_dissector_verifier_ops = { + .get_func_proto = flow_dissector_func_proto, + .is_valid_access = flow_dissector_is_valid_access, + .convert_ctx_access = flow_dissector_convert_ctx_access, +}; + +const struct bpf_prog_ops flow_dissector_prog_ops = { + .test_run = bpf_prog_test_run_flow_dissector, +}; int sk_detach_filter(struct sock *sk) { @@ -768,143 +11352,1227 @@ int sk_detach_filter(struct sock *sk) return -EPERM; filter = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); if (filter) { RCU_INIT_POINTER(sk->sk_filter, NULL); sk_filter_uncharge(sk, filter); ret = 0; } + return ret; } EXPORT_SYMBOL_GPL(sk_detach_filter); -void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) -{ - static const u16 decodes[] = { - [BPF_S_ALU_ADD_K] = BPF_ALU|BPF_ADD|BPF_K, - [BPF_S_ALU_ADD_X] = BPF_ALU|BPF_ADD|BPF_X, - [BPF_S_ALU_SUB_K] = BPF_ALU|BPF_SUB|BPF_K, - [BPF_S_ALU_SUB_X] = BPF_ALU|BPF_SUB|BPF_X, - [BPF_S_ALU_MUL_K] = BPF_ALU|BPF_MUL|BPF_K, - [BPF_S_ALU_MUL_X] = BPF_ALU|BPF_MUL|BPF_X, - [BPF_S_ALU_DIV_X] = BPF_ALU|BPF_DIV|BPF_X, - [BPF_S_ALU_MOD_K] = BPF_ALU|BPF_MOD|BPF_K, - [BPF_S_ALU_MOD_X] = BPF_ALU|BPF_MOD|BPF_X, - [BPF_S_ALU_AND_K] = BPF_ALU|BPF_AND|BPF_K, - [BPF_S_ALU_AND_X] = BPF_ALU|BPF_AND|BPF_X, - [BPF_S_ALU_OR_K] = BPF_ALU|BPF_OR|BPF_K, - [BPF_S_ALU_OR_X] = BPF_ALU|BPF_OR|BPF_X, - [BPF_S_ALU_XOR_K] = BPF_ALU|BPF_XOR|BPF_K, - [BPF_S_ALU_XOR_X] = BPF_ALU|BPF_XOR|BPF_X, - [BPF_S_ALU_LSH_K] = BPF_ALU|BPF_LSH|BPF_K, - [BPF_S_ALU_LSH_X] = BPF_ALU|BPF_LSH|BPF_X, - [BPF_S_ALU_RSH_K] = BPF_ALU|BPF_RSH|BPF_K, - [BPF_S_ALU_RSH_X] = BPF_ALU|BPF_RSH|BPF_X, - [BPF_S_ALU_NEG] = BPF_ALU|BPF_NEG, - [BPF_S_LD_W_ABS] = BPF_LD|BPF_W|BPF_ABS, - [BPF_S_LD_H_ABS] = BPF_LD|BPF_H|BPF_ABS, - [BPF_S_LD_B_ABS] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PROTOCOL] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PKTTYPE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_IFINDEX] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_NLATTR] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_NLATTR_NEST] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_MARK] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_QUEUE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_HATYPE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_RXHASH] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_CPU] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_ALU_XOR_X] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN, - [BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND, - [BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND, - [BPF_S_LD_B_IND] = BPF_LD|BPF_B|BPF_IND, - [BPF_S_LD_IMM] = BPF_LD|BPF_IMM, - [BPF_S_LDX_W_LEN] = BPF_LDX|BPF_W|BPF_LEN, - [BPF_S_LDX_B_MSH] = BPF_LDX|BPF_B|BPF_MSH, - [BPF_S_LDX_IMM] = BPF_LDX|BPF_IMM, - [BPF_S_MISC_TAX] = BPF_MISC|BPF_TAX, - [BPF_S_MISC_TXA] = BPF_MISC|BPF_TXA, - [BPF_S_RET_K] = BPF_RET|BPF_K, - [BPF_S_RET_A] = BPF_RET|BPF_A, - [BPF_S_ALU_DIV_K] = BPF_ALU|BPF_DIV|BPF_K, - [BPF_S_LD_MEM] = BPF_LD|BPF_MEM, - [BPF_S_LDX_MEM] = BPF_LDX|BPF_MEM, - [BPF_S_ST] = BPF_ST, - [BPF_S_STX] = BPF_STX, - [BPF_S_JMP_JA] = BPF_JMP|BPF_JA, - [BPF_S_JMP_JEQ_K] = BPF_JMP|BPF_JEQ|BPF_K, - [BPF_S_JMP_JEQ_X] = BPF_JMP|BPF_JEQ|BPF_X, - [BPF_S_JMP_JGE_K] = BPF_JMP|BPF_JGE|BPF_K, - [BPF_S_JMP_JGE_X] = BPF_JMP|BPF_JGE|BPF_X, - [BPF_S_JMP_JGT_K] = BPF_JMP|BPF_JGT|BPF_K, - [BPF_S_JMP_JGT_X] = BPF_JMP|BPF_JGT|BPF_X, - [BPF_S_JMP_JSET_K] = BPF_JMP|BPF_JSET|BPF_K, - [BPF_S_JMP_JSET_X] = BPF_JMP|BPF_JSET|BPF_X, - }; - u16 code; - - code = filt->code; - - to->code = decodes[code]; - to->jt = filt->jt; - to->jf = filt->jf; - - if (code == BPF_S_ALU_DIV_K) { - /* - * When loaded this rule user gave us X, which was - * translated into R = r(X). Now we calculate the - * RR = r(R) and report it back. If next time this - * value is loaded and RRR = r(RR) is calculated - * then the R == RRR will be true. - * - * One exception. X == 1 translates into R == 0 and - * we can't calculate RR out of it with r(). - */ - - if (filt->k == 0) - to->k = 1; - else - to->k = reciprocal_value(filt->k); - - BUG_ON(reciprocal_value(to->k) != filt->k); - } else - to->k = filt->k; -} - -int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) +int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) { + struct sock_fprog_kern *fprog; struct sk_filter *filter; - int i, ret; + int ret = 0; - lock_sock(sk); + sockopt_lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); - ret = 0; + lockdep_sock_is_held(sk)); if (!filter) goto out; - ret = filter->len; + + /* We're copying the filter that has been originally attached, + * so no conversion/decode needed anymore. eBPF programs that + * have no original program cannot be dumped through this. + */ + ret = -EACCES; + fprog = filter->prog->orig_prog; + if (!fprog) + goto out; + + ret = fprog->len; if (!len) + /* User space only enquires number of filter blocks. */ goto out; + ret = -EINVAL; - if (len < filter->len) + if (len < fprog->len) goto out; ret = -EFAULT; - for (i = 0; i < filter->len; i++) { - struct sock_filter fb; - - sk_decode_filter(&filter->insns[i], &fb); - if (copy_to_user(&ubuf[i], &fb, sizeof(fb))) - goto out; - } + if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) + goto out; - ret = filter->len; + /* Instead of bytes, the API requests to return the number + * of filter blocks. + */ + ret = fprog->len; out: - release_sock(sk); + sockopt_release_sock(sk); return ret; } + +#ifdef CONFIG_INET +static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, + struct sock_reuseport *reuse, + struct sock *sk, struct sk_buff *skb, + struct sock *migrating_sk, + u32 hash) +{ + reuse_kern->skb = skb; + reuse_kern->sk = sk; + reuse_kern->selected_sk = NULL; + reuse_kern->migrating_sk = migrating_sk; + reuse_kern->data_end = skb->data + skb_headlen(skb); + reuse_kern->hash = hash; + reuse_kern->reuseport_id = reuse->reuseport_id; + reuse_kern->bind_inany = reuse->bind_inany; +} + +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + struct sock *migrating_sk, + u32 hash) +{ + struct sk_reuseport_kern reuse_kern; + enum sk_action action; + + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); + action = bpf_prog_run(prog, &reuse_kern); + + if (action == SK_PASS) + return reuse_kern.selected_sk; + else + return ERR_PTR(-ECONNREFUSED); +} + +BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, + struct bpf_map *, map, void *, key, u32, flags) +{ + bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; + struct sock_reuseport *reuse; + struct sock *selected_sk; + int err; + + selected_sk = map->ops->map_lookup_elem(map, key); + if (!selected_sk) + return -ENOENT; + + reuse = rcu_dereference(selected_sk->sk_reuseport_cb); + if (!reuse) { + /* reuseport_array has only sk with non NULL sk_reuseport_cb. + * The only (!reuse) case here is - the sk has already been + * unhashed (e.g. by close()), so treat it as -ENOENT. + * + * Other maps (e.g. sock_map) do not provide this guarantee and + * the sk may never be in the reuseport group to begin with. + */ + err = is_sockarray ? -ENOENT : -EINVAL; + goto error; + } + + if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { + struct sock *sk = reuse_kern->sk; + + if (sk->sk_protocol != selected_sk->sk_protocol) { + err = -EPROTOTYPE; + } else if (sk->sk_family != selected_sk->sk_family) { + err = -EAFNOSUPPORT; + } else { + /* Catch all. Likely bound to a different sockaddr. */ + err = -EBADFD; + } + goto error; + } + + reuse_kern->selected_sk = selected_sk; + + return 0; +error: + /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ + if (sk_is_refcounted(selected_sk)) + sock_put(selected_sk); + + return err; +} + +static const struct bpf_func_proto sk_select_reuseport_proto = { + .func = sk_select_reuseport, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(sk_reuseport_load_bytes, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len) +{ + return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { + .func = sk_reuseport_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(sk_reuseport_load_bytes_relative, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len, u32, start_header) +{ + return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, + len, start_header); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { + .func = sk_reuseport_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_reuseport_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_select_reuseport: + return &sk_select_reuseport_proto; + case BPF_FUNC_skb_load_bytes: + return &sk_reuseport_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &sk_reuseport_load_bytes_relative_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; + default: + return bpf_base_func_proto(func_id, prog); + } +} + +static bool +sk_reuseport_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const u32 size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct sk_reuseport_md) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct sk_reuseport_md, data): + info->reg_type = PTR_TO_PACKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, hash): + return size == size_default; + + case offsetof(struct sk_reuseport_md, sk): + info->reg_type = PTR_TO_SOCKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, migrating_sk): + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + return size == sizeof(__u64); + + /* Fields that allow narrowing */ + case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): + if (size < sizeof_field(struct sk_buff, protocol)) + return false; + fallthrough; + case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): + case bpf_ctx_range(struct sk_reuseport_md, bind_inany): + case bpf_ctx_range(struct sk_reuseport_md, len): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + + default: + return false; + } +} + +#define SK_REUSEPORT_LOAD_FIELD(F) ({ \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + si->dst_reg, si->src_reg, \ + bpf_target_off(struct sk_reuseport_kern, F, \ + sizeof_field(struct sk_reuseport_kern, F), \ + target_size)); \ + }) + +#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sk_buff, \ + skb, \ + SKB_FIELD) + +#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD) + +static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_reuseport_md, data): + SK_REUSEPORT_LOAD_SKB_FIELD(data); + break; + + case offsetof(struct sk_reuseport_md, len): + SK_REUSEPORT_LOAD_SKB_FIELD(len); + break; + + case offsetof(struct sk_reuseport_md, eth_protocol): + SK_REUSEPORT_LOAD_SKB_FIELD(protocol); + break; + + case offsetof(struct sk_reuseport_md, ip_protocol): + SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol); + break; + + case offsetof(struct sk_reuseport_md, data_end): + SK_REUSEPORT_LOAD_FIELD(data_end); + break; + + case offsetof(struct sk_reuseport_md, hash): + SK_REUSEPORT_LOAD_FIELD(hash); + break; + + case offsetof(struct sk_reuseport_md, bind_inany): + SK_REUSEPORT_LOAD_FIELD(bind_inany); + break; + + case offsetof(struct sk_reuseport_md, sk): + SK_REUSEPORT_LOAD_FIELD(sk); + break; + + case offsetof(struct sk_reuseport_md, migrating_sk): + SK_REUSEPORT_LOAD_FIELD(migrating_sk); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops sk_reuseport_verifier_ops = { + .get_func_proto = sk_reuseport_func_proto, + .is_valid_access = sk_reuseport_is_valid_access, + .convert_ctx_access = sk_reuseport_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_reuseport_prog_ops = { +}; + +DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled); +EXPORT_SYMBOL(bpf_sk_lookup_enabled); + +BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, + struct sock *, sk, u64, flags) +{ + if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | + BPF_SK_LOOKUP_F_NO_REUSEPORT))) + return -EINVAL; + if (unlikely(sk && sk_is_refcounted(sk))) + return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ + if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)) + return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */ + if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)) + return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */ + + /* Check if socket is suitable for packet L3/L4 protocol */ + if (sk && sk->sk_protocol != ctx->protocol) + return -EPROTOTYPE; + if (sk && sk->sk_family != ctx->family && + (sk->sk_family == AF_INET || ipv6_only_sock(sk))) + return -EAFNOSUPPORT; + + if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) + return -EEXIST; + + /* Select socket as lookup result */ + ctx->selected_sk = sk; + ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; + return 0; +} + +static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { + .func = bpf_sk_lookup_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_lookup_assign_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + default: + return bpf_sk_base_func_proto(func_id, prog); + } +} + +static bool sk_lookup_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) + return false; + if (off % size != 0) + return false; + if (type != BPF_READ) + return false; + + switch (off) { + case bpf_ctx_range_ptr(struct bpf_sk_lookup, sk): + info->reg_type = PTR_TO_SOCKET_OR_NULL; + return size == sizeof(__u64); + + case bpf_ctx_range(struct bpf_sk_lookup, family): + case bpf_ctx_range(struct bpf_sk_lookup, protocol): + case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): + case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): + case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct bpf_sk_lookup, local_port): + case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): + bpf_ctx_record_field_size(info, sizeof(__u32)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); + + case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + /* Allow 4-byte access to 2-byte field for backward compatibility */ + if (size == sizeof(__u32)) + return true; + bpf_ctx_record_field_size(info, sizeof(__be16)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16)); + + case offsetofend(struct bpf_sk_lookup, remote_port) ... + offsetof(struct bpf_sk_lookup, local_ip4) - 1: + /* Allow access to zero padding for backward compatibility */ + bpf_ctx_record_field_size(info, sizeof(__u16)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16)); + + default: + return false; + } +} + +static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sk_lookup, sk): + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, selected_sk)); + break; + + case offsetof(struct bpf_sk_lookup, family): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + family, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, protocol): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + protocol, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, remote_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.saddr, 4, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.daddr, 4, target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sk_lookup, + remote_ip6[0], remote_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.saddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case bpf_ctx_range_till(struct bpf_sk_lookup, + local_ip6[0], local_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.daddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case offsetof(struct bpf_sk_lookup, remote_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + sport, 2, target_size)); + break; + + case offsetofend(struct bpf_sk_lookup, remote_port): + *target_size = 2; + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); + break; + + case offsetof(struct bpf_sk_lookup, local_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + dport, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + ingress_ifindex, 4, target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_prog_ops sk_lookup_prog_ops = { + .test_run = bpf_prog_test_run_sk_lookup, +}; + +const struct bpf_verifier_ops sk_lookup_verifier_ops = { + .get_func_proto = sk_lookup_func_proto, + .is_valid_access = sk_lookup_is_valid_access, + .convert_ctx_access = sk_lookup_convert_ctx_access, +}; + +#endif /* CONFIG_INET */ + +DEFINE_BPF_DISPATCHER(xdp) + +void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) +{ + bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); +} + +BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE) +#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) +BTF_SOCK_TYPE_xxx +#undef BTF_SOCK_TYPE + +BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) +{ + /* tcp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct tcp6_sock); + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && + sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { + .func = bpf_skc_to_tcp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], +}; + +BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) +{ + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { + .func = bpf_skc_to_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +}; + +BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) +{ + /* BTF types for tcp_timewait_sock and inet_timewait_sock are not + * generated if CONFIG_INET=n. Trigger an explicit generation here. + */ + BTF_TYPE_EMIT(struct inet_timewait_sock); + BTF_TYPE_EMIT(struct tcp_timewait_sock); + +#ifdef CONFIG_INET + if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; +#endif + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { + .func = bpf_skc_to_tcp_timewait_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], +}; + +BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) +{ +#ifdef CONFIG_INET + if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; +#endif + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { + .func = bpf_skc_to_tcp_request_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], +}; + +BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) +{ + /* udp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct udp6_sock); + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && + sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { + .func = bpf_skc_to_udp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], +}; + +BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk) +{ + /* unix_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct unix_sock); + if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { + .func = bpf_skc_to_unix_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], +}; + +BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) +{ + BTF_TYPE_EMIT(struct mptcp_sock); + return (unsigned long)bpf_mptcp_sock_from_subflow(sk); +} + +const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { + .func = bpf_skc_to_mptcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], +}; + +BPF_CALL_1(bpf_sock_from_file, struct file *, file) +{ + return (unsigned long)sock_from_file(file); +} + +BTF_ID_LIST(bpf_sock_from_file_btf_ids) +BTF_ID(struct, socket) +BTF_ID(struct, file) + +const struct bpf_func_proto bpf_sock_from_file_proto = { + .func = bpf_sock_from_file, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &bpf_sock_from_file_btf_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], +}; + +static const struct bpf_func_proto * +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_func_proto *func; + + switch (func_id) { + case BPF_FUNC_skc_to_tcp6_sock: + func = &bpf_skc_to_tcp6_sock_proto; + break; + case BPF_FUNC_skc_to_tcp_sock: + func = &bpf_skc_to_tcp_sock_proto; + break; + case BPF_FUNC_skc_to_tcp_timewait_sock: + func = &bpf_skc_to_tcp_timewait_sock_proto; + break; + case BPF_FUNC_skc_to_tcp_request_sock: + func = &bpf_skc_to_tcp_request_sock_proto; + break; + case BPF_FUNC_skc_to_udp6_sock: + func = &bpf_skc_to_udp6_sock_proto; + break; + case BPF_FUNC_skc_to_unix_sock: + func = &bpf_skc_to_unix_sock_proto; + break; + case BPF_FUNC_skc_to_mptcp_sock: + func = &bpf_skc_to_mptcp_sock_proto; + break; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; + default: + return bpf_base_func_proto(func_id, prog); + } + + if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) + return NULL; + + return func; +} + +/** + * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area. + * @skb: socket buffer carrying the metadata + * @offset: offset into the metadata area, must be <= skb_metadata_len() + */ +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; +} + +int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, + const void *from, u32 len, u64 flags) +{ + if (unlikely(flags)) + return -EINVAL; + if (unlikely(bpf_try_make_writable(skb, 0))) + return -EFAULT; + + memmove(bpf_skb_meta_pointer(skb, offset), from, len); + return 0; +} + +__bpf_kfunc_start_defs(); +__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct sk_buff *skb = (struct sk_buff *)s; + + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); + + return 0; +} + +/** + * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area. + * @skb_: socket buffer carrying the metadata + * @flags: future use, must be zero + * @ptr__uninit: dynptr to initialize + * + * Set up a dynptr for access to the metadata area earlier allocated from the + * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to + * &__sk_buff->data_meta. + * + * Return: + * * %0 - dynptr ready to use + * * %-EINVAL - invalid flags, dynptr set to null + */ +__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct sk_buff *skb = (struct sk_buff *)skb_; + + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); + + return 0; +} + +__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct xdp_buff *xdp = (struct xdp_buff *)x; + + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); + + return 0; +} + +__bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, + const u8 *sun_path, u32 sun_path__sz) +{ + struct sockaddr_un *un; + + if (sa_kern->sk->sk_family != AF_UNIX) + return -EINVAL; + + /* We do not allow changing the address to unnamed or larger than the + * maximum allowed address size for a unix sockaddr. + */ + if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX) + return -EINVAL; + + un = (struct sockaddr_un *)sa_kern->uaddr; + memcpy(un->sun_path, sun_path, sun_path__sz); + sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz; + + return 0; +} + +__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, + struct bpf_tcp_req_attrs *attrs, int attrs__sz) +{ +#if IS_ENABLED(CONFIG_SYN_COOKIES) + struct sk_buff *skb = (struct sk_buff *)s; + const struct request_sock_ops *ops; + struct inet_request_sock *ireq; + struct tcp_request_sock *treq; + struct request_sock *req; + struct net *net; + __u16 min_mss; + u32 tsoff = 0; + + if (attrs__sz != sizeof(*attrs) || + attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2]) + return -EINVAL; + + if (!skb_at_tc_ingress(skb)) + return -EINVAL; + + net = dev_net(skb->dev); + if (net != sock_net(sk)) + return -ENETUNREACH; + + switch (skb->protocol) { + case htons(ETH_P_IP): + ops = &tcp_request_sock_ops; + min_mss = 536; + break; +#if IS_BUILTIN(CONFIG_IPV6) + case htons(ETH_P_IPV6): + ops = &tcp6_request_sock_ops; + min_mss = IPV6_MIN_MTU - 60; + break; +#endif + default: + return -EINVAL; + } + + if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN || + sk_is_mptcp(sk)) + return -EINVAL; + + if (attrs->mss < min_mss) + return -EINVAL; + + if (attrs->wscale_ok) { + if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) + return -EINVAL; + + if (attrs->snd_wscale > TCP_MAX_WSCALE || + attrs->rcv_wscale > TCP_MAX_WSCALE) + return -EINVAL; + } + + if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) + return -EINVAL; + + if (attrs->tstamp_ok) { + if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) + return -EINVAL; + + tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns()); + } + + req = inet_reqsk_alloc(ops, sk, false); + if (!req) + return -ENOMEM; + + ireq = inet_rsk(req); + treq = tcp_rsk(req); + + req->rsk_listener = sk; + req->syncookie = 1; + req->mss = attrs->mss; + req->ts_recent = attrs->rcv_tsval; + + ireq->snd_wscale = attrs->snd_wscale; + ireq->rcv_wscale = attrs->rcv_wscale; + ireq->tstamp_ok = !!attrs->tstamp_ok; + ireq->sack_ok = !!attrs->sack_ok; + ireq->wscale_ok = !!attrs->wscale_ok; + ireq->ecn_ok = !!attrs->ecn_ok; + + treq->req_usec_ts = !!attrs->usec_ts_ok; + treq->ts_off = tsoff; + + skb_orphan(skb); + skb->sk = req_to_sk(req); + skb->destructor = sock_pfree; + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, + u64 flags) +{ + struct sk_buff *skb; + + if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) + return -EOPNOTSUPP; + + if (flags) + return -EINVAL; + + skb = skops->skb; + skb_shinfo(skb)->tx_flags |= SKBTX_BPF; + TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF; + skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + + return 0; +} + +/** + * bpf_xdp_pull_data() - Pull in non-linear xdp data. + * @x: &xdp_md associated with the XDP buffer + * @len: length of data to be made directly accessible in the linear part + * + * Pull in data in case the XDP buffer associated with @x is non-linear and + * not all @len are in the linear data area. + * + * Direct packet access allows reading and writing linear XDP data through + * packet pointers (i.e., &xdp_md->data + offsets). The amount of data which + * ends up in the linear part of the xdp_buff depends on the NIC and its + * configuration. When a frag-capable XDP program wants to directly access + * headers that may be in the non-linear area, call this kfunc to make sure + * the data is available in the linear area. Alternatively, use dynptr or + * bpf_xdp_{load,store}_bytes() to access data without pulling. + * + * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate + * headers in the non-linear data area. + * + * A call to this kfunc may reduce headroom. If there is not enough tailroom + * in the linear data area, metadata and data will be shifted down. + * + * A call to this kfunc is susceptible to change the buffer geometry. + * Therefore, at load time, all checks on pointers previously done by the + * verifier are invalidated and must be performed again, if the kfunc is used + * in combination with direct packet access. + * + * Return: + * * %0 - success + * * %-EINVAL - invalid len + */ +__bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len) +{ + struct xdp_buff *xdp = (struct xdp_buff *)x; + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i, delta, shift, headroom, tailroom, n_frags_free = 0; + void *data_hard_end = xdp_data_hard_end(xdp); + int data_len = xdp->data_end - xdp->data; + void *start; + + if (len <= data_len) + return 0; + + if (unlikely(len > xdp_get_buff_len(xdp))) + return -EINVAL; + + start = xdp_data_meta_unsupported(xdp) ? xdp->data : xdp->data_meta; + + headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame); + tailroom = data_hard_end - xdp->data_end; + + delta = len - data_len; + if (unlikely(delta > tailroom + headroom)) + return -EINVAL; + + shift = delta - tailroom; + if (shift > 0) { + memmove(start - shift, start, xdp->data_end - start); + + xdp->data_meta -= shift; + xdp->data -= shift; + xdp->data_end -= shift; + } + + for (i = 0; i < sinfo->nr_frags && delta; i++) { + skb_frag_t *frag = &sinfo->frags[i]; + u32 shrink = min_t(u32, delta, skb_frag_size(frag)); + + memcpy(xdp->data_end, skb_frag_address(frag), shrink); + + xdp->data_end += shrink; + sinfo->xdp_frags_size -= shrink; + delta -= shrink; + if (bpf_xdp_shrink_data(xdp, frag, shrink, false)) + n_frags_free++; + } + + if (unlikely(n_frags_free)) { + memmove(sinfo->frags, sinfo->frags + n_frags_free, + (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t)); + + sinfo->nr_frags -= n_frags_free; + + if (!sinfo->nr_frags) { + xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); + } + } + + return 0; +} + +__bpf_kfunc_end_defs(); + +int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + int err; + + err = bpf_dynptr_from_skb(skb, flags, ptr__uninit); + if (err) + return err; + + bpf_dynptr_set_rdonly(ptr); + + return 0; +} + +BTF_KFUNCS_START(bpf_kfunc_check_set_skb) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb) + +BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) + +BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) +BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) +BTF_ID_FLAGS(func, bpf_xdp_pull_data) +BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) + +BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) +BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) +BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) + +BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) +BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) + +BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) +BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) + +static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_skb, +}; + +static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_skb_meta, +}; + +static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_xdp, +}; + +static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_sock_addr, +}; + +static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_tcp_reqsk, +}; + +static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_sock_ops, +}; + +static int __init bpf_kfunc_init(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + &bpf_kfunc_set_sock_addr); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops); +} +late_initcall(bpf_kfunc_init); + +__bpf_kfunc_start_defs(); + +/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. + * + * The function expects a non-NULL pointer to a socket, and invokes the + * protocol specific socket destroy handlers. + * + * The helper can only be called from BPF contexts that have acquired the socket + * locks. + * + * Parameters: + * @sock: Pointer to socket to be destroyed + * + * Return: + * On error, may return EPROTONOSUPPORT, EINVAL. + * EPROTONOSUPPORT if protocol specific destroy handler is not supported. + * 0 otherwise + */ +__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) +{ + struct sock *sk = (struct sock *)sock; + + /* The locking semantics that allow for synchronous execution of the + * destroy handlers are only supported for TCP and UDP. + * Supporting protocols will need to acquire sock lock in the BPF context + * prior to invoking this kfunc. + */ + if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP && + sk->sk_protocol != IPPROTO_UDP)) + return -EOPNOTSUPP; + + return sk->sk_prot->diag_destroy(sk, ECONNABORTED); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) +BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) + +static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) && + prog->expected_attach_type != BPF_TRACE_ITER) + return -EACCES; + return 0; +} + +static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_sk_iter_kfunc_ids, + .filter = tracing_iter_filter, +}; + +static int init_subsystem(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set); +} +late_initcall(init_subsystem); diff --git a/net/core/flow.c b/net/core/flow.c deleted file mode 100644 index 7102f166482d..000000000000 --- a/net/core/flow.c +++ /dev/null @@ -1,497 +0,0 @@ -/* flow.c: Generic flow cache. - * - * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru) - * Copyright (C) 2003 David S. Miller (davem@redhat.com) - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/list.h> -#include <linux/jhash.h> -#include <linux/interrupt.h> -#include <linux/mm.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/smp.h> -#include <linux/completion.h> -#include <linux/percpu.h> -#include <linux/bitops.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/mutex.h> -#include <net/flow.h> -#include <linux/atomic.h> -#include <linux/security.h> - -struct flow_cache_entry { - union { - struct hlist_node hlist; - struct list_head gc_list; - } u; - struct net *net; - u16 family; - u8 dir; - u32 genid; - struct flowi key; - struct flow_cache_object *object; -}; - -struct flow_cache_percpu { - struct hlist_head *hash_table; - int hash_count; - u32 hash_rnd; - int hash_rnd_recalc; - struct tasklet_struct flush_tasklet; -}; - -struct flow_flush_info { - struct flow_cache *cache; - atomic_t cpuleft; - struct completion completion; -}; - -struct flow_cache { - u32 hash_shift; - struct flow_cache_percpu __percpu *percpu; - struct notifier_block hotcpu_notifier; - int low_watermark; - int high_watermark; - struct timer_list rnd_timer; -}; - -atomic_t flow_cache_genid = ATOMIC_INIT(0); -EXPORT_SYMBOL(flow_cache_genid); -static struct flow_cache flow_cache_global; -static struct kmem_cache *flow_cachep __read_mostly; - -static DEFINE_SPINLOCK(flow_cache_gc_lock); -static LIST_HEAD(flow_cache_gc_list); - -#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) -#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) - -static void flow_cache_new_hashrnd(unsigned long arg) -{ - struct flow_cache *fc = (void *) arg; - int i; - - for_each_possible_cpu(i) - per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1; - - fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&fc->rnd_timer); -} - -static int flow_entry_valid(struct flow_cache_entry *fle) -{ - if (atomic_read(&flow_cache_genid) != fle->genid) - return 0; - if (fle->object && !fle->object->ops->check(fle->object)) - return 0; - return 1; -} - -static void flow_entry_kill(struct flow_cache_entry *fle) -{ - if (fle->object) - fle->object->ops->delete(fle->object); - kmem_cache_free(flow_cachep, fle); -} - -static void flow_cache_gc_task(struct work_struct *work) -{ - struct list_head gc_list; - struct flow_cache_entry *fce, *n; - - INIT_LIST_HEAD(&gc_list); - spin_lock_bh(&flow_cache_gc_lock); - list_splice_tail_init(&flow_cache_gc_list, &gc_list); - spin_unlock_bh(&flow_cache_gc_lock); - - list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) - flow_entry_kill(fce); -} -static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task); - -static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, - int deleted, struct list_head *gc_list) -{ - if (deleted) { - fcp->hash_count -= deleted; - spin_lock_bh(&flow_cache_gc_lock); - list_splice_tail(gc_list, &flow_cache_gc_list); - spin_unlock_bh(&flow_cache_gc_lock); - schedule_work(&flow_cache_gc_work); - } -} - -static void __flow_cache_shrink(struct flow_cache *fc, - struct flow_cache_percpu *fcp, - int shrink_to) -{ - struct flow_cache_entry *fle; - struct hlist_node *tmp; - LIST_HEAD(gc_list); - int i, deleted = 0; - - for (i = 0; i < flow_cache_hash_size(fc); i++) { - int saved = 0; - - hlist_for_each_entry_safe(fle, tmp, - &fcp->hash_table[i], u.hlist) { - if (saved < shrink_to && - flow_entry_valid(fle)) { - saved++; - } else { - deleted++; - hlist_del(&fle->u.hlist); - list_add_tail(&fle->u.gc_list, &gc_list); - } - } - } - - flow_cache_queue_garbage(fcp, deleted, &gc_list); -} - -static void flow_cache_shrink(struct flow_cache *fc, - struct flow_cache_percpu *fcp) -{ - int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); - - __flow_cache_shrink(fc, fcp, shrink_to); -} - -static void flow_new_hash_rnd(struct flow_cache *fc, - struct flow_cache_percpu *fcp) -{ - get_random_bytes(&fcp->hash_rnd, sizeof(u32)); - fcp->hash_rnd_recalc = 0; - __flow_cache_shrink(fc, fcp, 0); -} - -static u32 flow_hash_code(struct flow_cache *fc, - struct flow_cache_percpu *fcp, - const struct flowi *key, - size_t keysize) -{ - const u32 *k = (const u32 *) key; - const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32); - - return jhash2(k, length, fcp->hash_rnd) - & (flow_cache_hash_size(fc) - 1); -} - -/* I hear what you're saying, use memcmp. But memcmp cannot make - * important assumptions that we can here, such as alignment. - */ -static int flow_key_compare(const struct flowi *key1, const struct flowi *key2, - size_t keysize) -{ - const flow_compare_t *k1, *k1_lim, *k2; - - k1 = (const flow_compare_t *) key1; - k1_lim = k1 + keysize; - - k2 = (const flow_compare_t *) key2; - - do { - if (*k1++ != *k2++) - return 1; - } while (k1 < k1_lim); - - return 0; -} - -struct flow_cache_object * -flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver, void *ctx) -{ - struct flow_cache *fc = &flow_cache_global; - struct flow_cache_percpu *fcp; - struct flow_cache_entry *fle, *tfle; - struct flow_cache_object *flo; - size_t keysize; - unsigned int hash; - - local_bh_disable(); - fcp = this_cpu_ptr(fc->percpu); - - fle = NULL; - flo = NULL; - - keysize = flow_key_size(family); - if (!keysize) - goto nocache; - - /* Packet really early in init? Making flow_cache_init a - * pre-smp initcall would solve this. --RR */ - if (!fcp->hash_table) - goto nocache; - - if (fcp->hash_rnd_recalc) - flow_new_hash_rnd(fc, fcp); - - hash = flow_hash_code(fc, fcp, key, keysize); - hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) { - if (tfle->net == net && - tfle->family == family && - tfle->dir == dir && - flow_key_compare(key, &tfle->key, keysize) == 0) { - fle = tfle; - break; - } - } - - if (unlikely(!fle)) { - if (fcp->hash_count > fc->high_watermark) - flow_cache_shrink(fc, fcp); - - fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); - if (fle) { - fle->net = net; - fle->family = family; - fle->dir = dir; - memcpy(&fle->key, key, keysize * sizeof(flow_compare_t)); - fle->object = NULL; - hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]); - fcp->hash_count++; - } - } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { - flo = fle->object; - if (!flo) - goto ret_object; - flo = flo->ops->get(flo); - if (flo) - goto ret_object; - } else if (fle->object) { - flo = fle->object; - flo->ops->delete(flo); - fle->object = NULL; - } - -nocache: - flo = NULL; - if (fle) { - flo = fle->object; - fle->object = NULL; - } - flo = resolver(net, key, family, dir, flo, ctx); - if (fle) { - fle->genid = atomic_read(&flow_cache_genid); - if (!IS_ERR(flo)) - fle->object = flo; - else - fle->genid--; - } else { - if (!IS_ERR_OR_NULL(flo)) - flo->ops->delete(flo); - } -ret_object: - local_bh_enable(); - return flo; -} -EXPORT_SYMBOL(flow_cache_lookup); - -static void flow_cache_flush_tasklet(unsigned long data) -{ - struct flow_flush_info *info = (void *)data; - struct flow_cache *fc = info->cache; - struct flow_cache_percpu *fcp; - struct flow_cache_entry *fle; - struct hlist_node *tmp; - LIST_HEAD(gc_list); - int i, deleted = 0; - - fcp = this_cpu_ptr(fc->percpu); - for (i = 0; i < flow_cache_hash_size(fc); i++) { - hlist_for_each_entry_safe(fle, tmp, - &fcp->hash_table[i], u.hlist) { - if (flow_entry_valid(fle)) - continue; - - deleted++; - hlist_del(&fle->u.hlist); - list_add_tail(&fle->u.gc_list, &gc_list); - } - } - - flow_cache_queue_garbage(fcp, deleted, &gc_list); - - if (atomic_dec_and_test(&info->cpuleft)) - complete(&info->completion); -} - -/* - * Return whether a cpu needs flushing. Conservatively, we assume - * the presence of any entries means the core may require flushing, - * since the flow_cache_ops.check() function may assume it's running - * on the same core as the per-cpu cache component. - */ -static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu) -{ - struct flow_cache_percpu *fcp; - int i; - - fcp = per_cpu_ptr(fc->percpu, cpu); - for (i = 0; i < flow_cache_hash_size(fc); i++) - if (!hlist_empty(&fcp->hash_table[i])) - return 0; - return 1; -} - -static void flow_cache_flush_per_cpu(void *data) -{ - struct flow_flush_info *info = data; - struct tasklet_struct *tasklet; - - tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet; - tasklet->data = (unsigned long)info; - tasklet_schedule(tasklet); -} - -void flow_cache_flush(void) -{ - struct flow_flush_info info; - static DEFINE_MUTEX(flow_flush_sem); - cpumask_var_t mask; - int i, self; - - /* Track which cpus need flushing to avoid disturbing all cores. */ - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return; - cpumask_clear(mask); - - /* Don't want cpus going down or up during this. */ - get_online_cpus(); - mutex_lock(&flow_flush_sem); - info.cache = &flow_cache_global; - for_each_online_cpu(i) - if (!flow_cache_percpu_empty(info.cache, i)) - cpumask_set_cpu(i, mask); - atomic_set(&info.cpuleft, cpumask_weight(mask)); - if (atomic_read(&info.cpuleft) == 0) - goto done; - - init_completion(&info.completion); - - local_bh_disable(); - self = cpumask_test_and_clear_cpu(smp_processor_id(), mask); - on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0); - if (self) - flow_cache_flush_tasklet((unsigned long)&info); - local_bh_enable(); - - wait_for_completion(&info.completion); - -done: - mutex_unlock(&flow_flush_sem); - put_online_cpus(); - free_cpumask_var(mask); -} - -static void flow_cache_flush_task(struct work_struct *work) -{ - flow_cache_flush(); -} - -static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task); - -void flow_cache_flush_deferred(void) -{ - schedule_work(&flow_cache_flush_work); -} - -static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) -{ - struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); - size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); - - if (!fcp->hash_table) { - fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); - if (!fcp->hash_table) { - pr_err("NET: failed to allocate flow cache sz %zu\n", sz); - return -ENOMEM; - } - fcp->hash_rnd_recalc = 1; - fcp->hash_count = 0; - tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); - } - return 0; -} - -static int __cpuinit flow_cache_cpu(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); - int res, cpu = (unsigned long) hcpu; - struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - res = flow_cache_cpu_prepare(fc, cpu); - if (res) - return notifier_from_errno(res); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - __flow_cache_shrink(fc, fcp, 0); - break; - } - return NOTIFY_OK; -} - -static int __init flow_cache_init(struct flow_cache *fc) -{ - int i; - - fc->hash_shift = 10; - fc->low_watermark = 2 * flow_cache_hash_size(fc); - fc->high_watermark = 4 * flow_cache_hash_size(fc); - - fc->percpu = alloc_percpu(struct flow_cache_percpu); - if (!fc->percpu) - return -ENOMEM; - - for_each_online_cpu(i) { - if (flow_cache_cpu_prepare(fc, i)) - goto err; - } - fc->hotcpu_notifier = (struct notifier_block){ - .notifier_call = flow_cache_cpu, - }; - register_hotcpu_notifier(&fc->hotcpu_notifier); - - setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, - (unsigned long) fc); - fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&fc->rnd_timer); - - return 0; - -err: - for_each_possible_cpu(i) { - struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); - kfree(fcp->hash_table); - fcp->hash_table = NULL; - } - - free_percpu(fc->percpu); - fc->percpu = NULL; - - return -ENOMEM; -} - -static int __init flow_cache_init_global(void) -{ - flow_cachep = kmem_cache_create("flow_cache", - sizeof(struct flow_cache_entry), - 0, SLAB_PANIC, NULL); - - return flow_cache_init(&flow_cache_global); -} - -module_init(flow_cache_init_global); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 00ee068efc1c..1b61bb25ba0e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1,10 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_vlan.h> +#include <linux/filter.h> +#include <net/dsa.h> +#include <net/dst_metadata.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/gre.h> +#include <net/pptp.h> +#include <net/tipc.h> #include <linux/igmp.h> #include <linux/icmp.h> #include <linux/sctp.h> @@ -12,250 +20,1924 @@ #include <linux/if_tunnel.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> -#include <net/flow_keys.h> +#include <linux/stddef.h> +#include <linux/if_ether.h> +#include <linux/if_hsr.h> +#include <linux/mpls.h> +#include <linux/tcp.h> +#include <linux/ptp_classify.h> +#include <net/flow_dissector.h> +#include <net/pkt_cls.h> +#include <scsi/fc/fc_fcoe.h> +#include <uapi/linux/batadv_packet.h> +#include <linux/bpf.h> +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_labels.h> +#endif +#include <linux/bpf-netns.h> + +static void dissector_set_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + flow_dissector->used_keys |= (1ULL << key_id); +} + +void skb_flow_dissector_init(struct flow_dissector *flow_dissector, + const struct flow_dissector_key *key, + unsigned int key_count) +{ + unsigned int i; + + memset(flow_dissector, 0, sizeof(*flow_dissector)); + + for (i = 0; i < key_count; i++, key++) { + /* User should make sure that every key target offset is within + * boundaries of unsigned short. + */ + BUG_ON(key->offset > USHRT_MAX); + BUG_ON(dissector_uses_key(flow_dissector, + key->key_id)); + + dissector_set_key(flow_dissector, key->key_id); + flow_dissector->offset[key->key_id] = key->offset; + } + + /* Ensure that the dissector always includes control and basic key. + * That way we are able to avoid handling lack of these in fast path. + */ + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL)); + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC)); +} +EXPORT_SYMBOL(skb_flow_dissector_init); + +#ifdef CONFIG_BPF_SYSCALL +int flow_dissector_bpf_prog_attach_check(struct net *net, + struct bpf_prog *prog) +{ + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + + if (net == &init_net) { + /* BPF flow dissector in the root namespace overrides + * any per-net-namespace one. When attaching to root, + * make sure we don't have any BPF program attached + * to the non-root namespaces. + */ + struct net *ns; + + for_each_net(ns) { + if (ns == &init_net) + continue; + if (rcu_access_pointer(ns->bpf.run_array[type])) + return -EEXIST; + } + } else { + /* Make sure root flow dissector is not attached + * when attaching to the non-root namespace. + */ + if (rcu_access_pointer(init_net.bpf.run_array[type])) + return -EEXIST; + } + + return 0; +} +#endif /* CONFIG_BPF_SYSCALL */ + +/** + * skb_flow_get_ports - extract the upper layer ports and return them + * @skb: sk_buff to extract the ports from + * @thoff: transport header offset + * @ip_proto: protocol for which to get port offset + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) + * + * The function will try to retrieve the ports at offset thoff + poff where poff + * is the protocol port offset returned from proto_ports_offset + */ +__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + const void *data, int hlen) +{ + int poff = proto_ports_offset(ip_proto); + + if (!data) { + data = skb->data; + hlen = skb_headlen(skb); + } + + if (poff >= 0) { + __be32 *ports, _ports; + + ports = __skb_header_pointer(skb, thoff + poff, + sizeof(_ports), data, hlen, &_ports); + if (ports) + return *ports; + } + + return 0; +} +EXPORT_SYMBOL(skb_flow_get_ports); + +static bool icmp_has_id(u8 type) +{ + switch (type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + return true; + } + + return false; +} + +/** + * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields + * @skb: sk_buff to extract from + * @key_icmp: struct flow_dissector_key_icmp to fill + * @data: raw buffer pointer to the packet + * @thoff: offset to extract at + * @hlen: packet header length + */ +void skb_flow_get_icmp_tci(const struct sk_buff *skb, + struct flow_dissector_key_icmp *key_icmp, + const void *data, int thoff, int hlen) +{ + struct icmphdr *ih, _ih; + + ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); + if (!ih) + return; + + key_icmp->type = ih->type; + key_icmp->code = ih->code; + + /* As we use 0 to signal that the Id field is not present, + * avoid confusion with packets without such field + */ + if (icmp_has_id(ih->type)) + key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1; + else + key_icmp->id = 0; +} +EXPORT_SYMBOL(skb_flow_get_icmp_tci); + +/* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet + * using skb_flow_get_icmp_tci(). + */ +static void __skb_flow_dissect_icmp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int thoff, int hlen) +{ + struct flow_dissector_key_icmp *key_icmp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) + return; + + key_icmp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP, + target_container); + + skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); +} + +static void __skb_flow_dissect_ah(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_ipsec *key_ah; + struct ip_auth_hdr _hdr, *hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_ah = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPSEC, + target_container); + + key_ah->spi = hdr->spi; +} + +static void __skb_flow_dissect_esp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_ipsec *key_esp; + struct ip_esp_hdr _hdr, *hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_esp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPSEC, + target_container); + + key_esp->spi = hdr->spi; +} + +static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_l2tpv3 *key_l2tpv3; + struct { + __be32 session_id; + } *hdr, _hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_l2tpv3 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_L2TPV3, + target_container); + + key_l2tpv3->session_id = hdr->session_id; +} + +void skb_flow_dissect_meta(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_meta *meta; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META)) + return; + + meta = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_META, + target_container); + meta->ingress_ifindex = skb->skb_iif; +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + if (tc_skb_ext_tc_enabled()) { + struct tc_skb_ext *ext; + + ext = skb_ext_find(skb, TC_SKB_EXT); + if (ext) + meta->l2_miss = ext->l2_miss; + } +#endif +} +EXPORT_SYMBOL(skb_flow_dissect_meta); + +static void +skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type, + u32 ctrl_flags, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_control *ctrl; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) + return; + + ctrl = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + target_container); + ctrl->addr_type = type; + ctrl->flags = ctrl_flags; +} + +void +skb_flow_dissect_ct(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, u16 *ctinfo_map, + size_t mapsize, bool post_ct, u16 zone) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + struct flow_dissector_key_ct *key; + enum ip_conntrack_info ctinfo; + struct nf_conn_labels *cl; + struct nf_conn *ct; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT)) + return; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct && !post_ct) + return; + + key = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CT, + target_container); + + if (!ct) { + key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_INVALID; + key->ct_zone = zone; + return; + } + + if (ctinfo < mapsize) + key->ct_state = ctinfo_map[ctinfo]; +#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) + key->ct_zone = ct->zone.id; +#endif +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + key->ct_mark = READ_ONCE(ct->mark); +#endif + + cl = nf_ct_labels_find(ct); + if (cl) + memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels)); +#endif /* CONFIG_NF_CONNTRACK */ +} +EXPORT_SYMBOL(skb_flow_dissect_ct); + +void +skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct ip_tunnel_info *info; + struct ip_tunnel_key *key; + u32 ctrl_flags = 0; + + /* A quick check to see if there might be something to do. */ + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_OPTS)) + return; + + info = skb_tunnel_info(skb); + if (!info) + return; + + key = &info->key; + + if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) + ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM; + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) + ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT; + if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags)) + ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM; + if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags)) + ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT; + + switch (ip_tunnel_info_af(info)) { + case AF_INET: + skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS, + ctrl_flags, flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { + struct flow_dissector_key_ipv4_addrs *ipv4; + + ipv4 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + target_container); + ipv4->src = key->u.ipv4.src; + ipv4->dst = key->u.ipv4.dst; + } + break; + case AF_INET6: + skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS, + ctrl_flags, flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { + struct flow_dissector_key_ipv6_addrs *ipv6; + + ipv6 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, + target_container); + ipv6->src = key->u.ipv6.src; + ipv6->dst = key->u.ipv6.dst; + } + break; + default: + skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector, + target_container); + break; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct flow_dissector_key_keyid *keyid; + + keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID, + target_container); + keyid->keyid = tunnel_id_to_key32(key->tun_id); + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + struct flow_dissector_key_ports *tp; + + tp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS, + target_container); + tp->src = key->tp_src; + tp->dst = key->tp_dst; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { + struct flow_dissector_key_ip *ip; -/* copy saddr & daddr, possibly using 64bit load/store - * Equivalent to : flow->src = iph->saddr; - * flow->dst = iph->daddr; + ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP, + target_container); + ip->tos = key->tos; + ip->ttl = key->ttl; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { + struct flow_dissector_key_enc_opts *enc_opt; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + u32 val; + + enc_opt = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_OPTS, + target_container); + + if (!info->options_len) + return; + + enc_opt->len = info->options_len; + ip_tunnel_info_opts_get(enc_opt->data, info); + + ip_tunnel_set_options_present(flags); + ip_tunnel_flags_and(flags, info->key.tun_flags, flags); + + val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, + IP_TUNNEL_GENEVE_OPT_BIT); + enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0; + } +} +EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); + +void skb_flow_dissect_hash(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_hash *key; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) + return; + + key = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_HASH, + target_container); + + key->hash = skb_get_hash_raw(skb); +} +EXPORT_SYMBOL(skb_flow_dissect_hash); + +static enum flow_dissect_ret +__skb_flow_dissect_mpls(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, int nhoff, + int hlen, int lse_index, bool *entropy_label) +{ + struct mpls_label *hdr, _hdr; + u32 entry, label, bos; + + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && + !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) + return FLOW_DISSECT_RET_OUT_GOOD; + + if (lse_index >= FLOW_DIS_MPLS_MAX) + return FLOW_DISSECT_RET_OUT_GOOD; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + entry = ntohl(hdr->entry); + label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { + struct flow_dissector_key_mpls *key_mpls; + struct flow_dissector_mpls_lse *lse; + + key_mpls = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS, + target_container); + lse = &key_mpls->ls[lse_index]; + + lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + lse->mpls_bos = bos; + lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; + lse->mpls_label = label; + dissector_set_mpls_lse(key_mpls, lse_index); + } + + if (*entropy_label && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + struct flow_dissector_key_keyid *key_keyid; + + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY, + target_container); + key_keyid->keyid = cpu_to_be32(label); + } + + *entropy_label = label == MPLS_LABEL_ENTROPY; + + return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; +} + +static enum flow_dissect_ret +__skb_flow_dissect_arp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_arp *key_arp; + struct { + unsigned char ar_sha[ETH_ALEN]; + unsigned char ar_sip[4]; + unsigned char ar_tha[ETH_ALEN]; + unsigned char ar_tip[4]; + } *arp_eth, _arp_eth; + const struct arphdr *arp; + struct arphdr _arp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP)) + return FLOW_DISSECT_RET_OUT_GOOD; + + arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, + hlen, &_arp); + if (!arp) + return FLOW_DISSECT_RET_OUT_BAD; + + if (arp->ar_hrd != htons(ARPHRD_ETHER) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_hln != ETH_ALEN || + arp->ar_pln != 4 || + (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST))) + return FLOW_DISSECT_RET_OUT_BAD; + + arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), + sizeof(_arp_eth), data, + hlen, &_arp_eth); + if (!arp_eth) + return FLOW_DISSECT_RET_OUT_BAD; + + key_arp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ARP, + target_container); + + memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip)); + memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip)); + + /* Only store the lower byte of the opcode; + * this covers ARPOP_REPLY and ARPOP_REQUEST. + */ + key_arp->op = ntohs(arp->ar_op) & 0xff; + + ether_addr_copy(key_arp->sha, arp_eth->ar_sha); + ether_addr_copy(key_arp->tha, arp_eth->ar_tha); + + return FLOW_DISSECT_RET_OUT_GOOD; +} + +static enum flow_dissect_ret +__skb_flow_dissect_cfm(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_cfm *key, *hdr, _hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM)) + return FLOW_DISSECT_RET_OUT_GOOD; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM, + target_container); + + key->mdl_ver = hdr->mdl_ver; + key->opcode = hdr->opcode; + + return FLOW_DISSECT_RET_OUT_GOOD; +} + +static enum flow_dissect_ret +__skb_flow_dissect_gre(const struct sk_buff *skb, + struct flow_dissector_key_control *key_control, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + __be16 *p_proto, int *p_nhoff, int *p_hlen, + unsigned int flags) +{ + struct flow_dissector_key_keyid *key_keyid; + struct gre_base_hdr *hdr, _hdr; + int offset = 0; + u16 gre_ver; + + hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), + data, *p_hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + /* Only look inside GRE without routing */ + if (hdr->flags & GRE_ROUTING) + return FLOW_DISSECT_RET_OUT_GOOD; + + /* Only look inside GRE for version 0 and 1 */ + gre_ver = ntohs(hdr->flags & GRE_VERSION); + if (gre_ver > 1) + return FLOW_DISSECT_RET_OUT_GOOD; + + *p_proto = hdr->protocol; + if (gre_ver) { + /* Version1 must be PPTP, and check the flags */ + if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) + return FLOW_DISSECT_RET_OUT_GOOD; + } + + offset += sizeof(struct gre_base_hdr); + + if (hdr->flags & GRE_CSUM) + offset += sizeof_field(struct gre_full_hdr, csum) + + sizeof_field(struct gre_full_hdr, reserved1); + + if (hdr->flags & GRE_KEY) { + const __be32 *keyid; + __be32 _keyid; + + keyid = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_keyid), + data, *p_hlen, &_keyid); + if (!keyid) + return FLOW_DISSECT_RET_OUT_BAD; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID, + target_container); + if (gre_ver == 0) + key_keyid->keyid = *keyid; + else + key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; + } + offset += sizeof_field(struct gre_full_hdr, key); + } + + if (hdr->flags & GRE_SEQ) + offset += sizeof_field(struct pptp_gre_header, seq); + + if (gre_ver == 0) { + if (*p_proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_eth), + data, *p_hlen, &_eth); + if (!eth) + return FLOW_DISSECT_RET_OUT_BAD; + *p_proto = eth->h_proto; + offset += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + *p_hlen = *p_nhoff + offset; + } + } else { /* version 1, must be PPTP */ + u8 _ppp_hdr[PPP_HDRLEN]; + u8 *ppp_hdr; + + if (hdr->flags & GRE_ACK) + offset += sizeof_field(struct pptp_gre_header, ack); + + ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_ppp_hdr), + data, *p_hlen, _ppp_hdr); + if (!ppp_hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + switch (PPP_PROTOCOL(ppp_hdr)) { + case PPP_IP: + *p_proto = htons(ETH_P_IP); + break; + case PPP_IPV6: + *p_proto = htons(ETH_P_IPV6); + break; + default: + /* Could probably catch some more like MPLS */ + break; + } + + offset += PPP_HDRLEN; + } + + *p_nhoff += offset; + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return FLOW_DISSECT_RET_OUT_GOOD; + + return FLOW_DISSECT_RET_PROTO_AGAIN; +} + +/** + * __skb_flow_dissect_batadv() - dissect batman-adv header + * @skb: sk_buff to with the batman-adv header + * @key_control: flow dissectors control key + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @p_proto: pointer used to update the protocol to process next + * @p_nhoff: pointer used to update inner network header offset + * @hlen: packet header length + * @flags: any combination of FLOW_DISSECTOR_F_* + * + * ETH_P_BATMAN packets are tried to be dissected. Only + * &struct batadv_unicast packets are actually processed because they contain an + * inner ethernet header and are usually followed by actual network header. This + * allows the flow dissector to continue processing the packet. + * + * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found, + * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation, + * otherwise FLOW_DISSECT_RET_OUT_BAD + */ +static enum flow_dissect_ret +__skb_flow_dissect_batadv(const struct sk_buff *skb, + struct flow_dissector_key_control *key_control, + const void *data, __be16 *p_proto, int *p_nhoff, + int hlen, unsigned int flags) +{ + struct { + struct batadv_unicast_packet batadv_unicast; + struct ethhdr eth; + } *hdr, _hdr; + + hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen, + &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION) + return FLOW_DISSECT_RET_OUT_BAD; + + if (hdr->batadv_unicast.packet_type != BATADV_UNICAST) + return FLOW_DISSECT_RET_OUT_BAD; + + *p_proto = hdr->eth.h_proto; + *p_nhoff += sizeof(*hdr); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return FLOW_DISSECT_RET_OUT_GOOD; + + return FLOW_DISSECT_RET_PROTO_AGAIN; +} + +static void +__skb_flow_dissect_tcp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int thoff, int hlen) +{ + struct flow_dissector_key_tcp *key_tcp; + struct tcphdr *th, _th; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP)) + return; + + th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th); + if (!th) + return; + + if (unlikely(__tcp_hdrlen(th) < sizeof(_th))) + return; + + key_tcp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_TCP, + target_container); + key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); +} + +static void +__skb_flow_dissect_ports(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, u8 ip_proto, int hlen) +{ + struct flow_dissector_key_ports_range *key_ports_range = NULL; + struct flow_dissector_key_ports *key_ports = NULL; + __be32 ports; + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + + if (!key_ports && !key_ports_range) + return; + + ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); + + if (key_ports) + key_ports->ports = ports; + + if (key_ports_range) + key_ports_range->tp.ports = ports; +} + +static void +__skb_flow_dissect_ipv4(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + const struct iphdr *iph) +{ + struct flow_dissector_key_ip *key_ip; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) + return; + + key_ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IP, + target_container); + key_ip->tos = iph->tos; + key_ip->ttl = iph->ttl; +} + +static void +__skb_flow_dissect_ipv6(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + const struct ipv6hdr *iph) +{ + struct flow_dissector_key_ip *key_ip; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) + return; + + key_ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IP, + target_container); + key_ip->tos = ipv6_get_dsfield(iph); + key_ip->ttl = iph->hop_limit; +} + +/* Maximum number of protocol headers that can be parsed in + * __skb_flow_dissect */ -static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph) +#define MAX_FLOW_DISSECT_HDRS 15 + +static bool skb_flow_dissect_allowed(int *num_hdrs) +{ + ++*num_hdrs; + + return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); +} + +static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_ports_range *key_ports_range = NULL; + struct flow_dissector_key_ports *key_ports = NULL; + struct flow_dissector_key_control *key_control; + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_tags *key_tags; + + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + key_control->thoff = flow_keys->thoff; + if (flow_keys->is_frag) + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + if (flow_keys->is_first_frag) + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (flow_keys->is_encap) + key_control->flags |= FLOW_DIS_ENCAPSULATION; + + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + key_basic->n_proto = flow_keys->n_proto; + key_basic->ip_proto = flow_keys->ip_proto; + + if (flow_keys->addr_proto == ETH_P_IP && + dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); + key_addrs->v4addrs.src = flow_keys->ipv4_src; + key_addrs->v4addrs.dst = flow_keys->ipv4_dst; + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } else if (flow_keys->addr_proto == ETH_P_IPV6 && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); + memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src, + sizeof(key_addrs->v6addrs.src)); + memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst, + sizeof(key_addrs->v6addrs.dst)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->src = flow_keys->sport; + key_ports->dst = flow_keys->dport; + } + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) { + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + key_ports_range->tp.src = flow_keys->sport; + key_ports_range->tp.dst = flow_keys->dport; + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_keys->flow_label); + } +} + +u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen, unsigned int flags) +{ + struct bpf_flow_keys *flow_keys = ctx->flow_keys; + u32 result; + + /* Pass parameters to the BPF program */ + memset(flow_keys, 0, sizeof(*flow_keys)); + flow_keys->n_proto = proto; + flow_keys->nhoff = nhoff; + flow_keys->thoff = flow_keys->nhoff; + + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG != + (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG); + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL != + (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP != + (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); + flow_keys->flags = flags; + + result = bpf_prog_run_pin_on_cpu(prog, ctx); + + flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); + flow_keys->thoff = clamp_t(u16, flow_keys->thoff, + flow_keys->nhoff, hlen); + + return result; +} + +static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) { - BUILD_BUG_ON(offsetof(typeof(*flow), dst) != - offsetof(typeof(*flow), src) + sizeof(flow->src)); - memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); + return hdr->ver == 1 && hdr->type == 1 && hdr->code == 0; } -bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +/** + * __skb_flow_dissect - extract the flow_keys struct and return it + * @net: associated network namespace, derived from @skb if NULL + * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @flow_dissector: list of keys to dissect + * @target_container: target structure to put dissected values into + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol + * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) + * @flags: flags that control the dissection process, e.g. + * FLOW_DISSECTOR_F_STOP_AT_ENCAP. + * + * The function will try to retrieve individual keys into target specified + * by flow_dissector from either the skbuff or a raw buffer specified by the + * rest parameters. + * + * Caller must take care of zeroing target container memory. + */ +bool __skb_flow_dissect(const struct net *net, + const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + __be16 proto, int nhoff, int hlen, unsigned int flags) { - int poff, nhoff = skb_network_offset(skb); - u8 ip_proto; - __be16 proto = skb->protocol; + struct flow_dissector_key_control *key_control; + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_tags *key_tags; + struct flow_dissector_key_vlan *key_vlan; + enum flow_dissect_ret fdret; + enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; + bool mpls_el = false; + int mpls_lse = 0; + int num_hdrs = 0; + u8 ip_proto = 0; + bool ret; + + if (!data) { + data = skb->data; + proto = skb_vlan_tag_present(skb) ? + skb->vlan_proto : skb->protocol; + nhoff = skb_network_offset(skb); + hlen = skb_headlen(skb); +#if IS_ENABLED(CONFIG_NET_DSA) + if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) && + proto == htons(ETH_P_XDSA))) { + struct metadata_dst *md_dst = skb_metadata_dst(skb); + const struct dsa_device_ops *ops; + int offset = 0; + + ops = skb->dev->dsa_ptr->tag_ops; + /* Only DSA header taggers break flow dissection */ + if (ops->needed_headroom && + (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) { + if (ops->flow_dissect) + ops->flow_dissect(skb, &proto, &offset); + else + dsa_tag_generic_flow_dissect(skb, + &proto, + &offset); + hlen -= offset; + nhoff += offset; + } + } +#endif + } + + /* It is ensured by skb_flow_dissector_init() that control key will + * be always present. + */ + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + + /* It is ensured by skb_flow_dissector_init() that basic key will + * be always present. + */ + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + + rcu_read_lock(); + + if (skb) { + if (!net) { + if (skb->dev) + net = dev_net_rcu(skb->dev); + else if (skb->sk) + net = sock_net(skb->sk); + } + } + + DEBUG_NET_WARN_ON_ONCE(!net); + if (net) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + struct bpf_prog_array *run_array; + + run_array = rcu_dereference(init_net.bpf.run_array[type]); + if (!run_array) + run_array = rcu_dereference(net->bpf.run_array[type]); + + if (run_array) { + struct bpf_flow_keys flow_keys; + struct bpf_flow_dissector ctx = { + .flow_keys = &flow_keys, + .data = data, + .data_end = data + hlen, + }; + __be16 n_proto = proto; + struct bpf_prog *prog; + u32 result; + + if (skb) { + ctx.skb = skb; + /* we can't use 'proto' in the skb case + * because it might be set to skb->vlan_proto + * which has been pulled from the data + */ + n_proto = skb->protocol; + } + + prog = READ_ONCE(run_array->items[0].prog); + result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, + hlen, flags); + if (result != BPF_FLOW_DISSECTOR_CONTINUE) { + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + rcu_read_unlock(); + return result == BPF_OK; + } + } + } + + rcu_read_unlock(); + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct ethhdr *eth = eth_hdr(skb); + struct flow_dissector_key_eth_addrs *key_eth_addrs; + + key_eth_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + target_container); + memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs)); + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { + struct flow_dissector_key_num_of_vlans *key_num_of_vlans; - memset(flow, 0, sizeof(*flow)); + key_num_of_vlans = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS, + target_container); + key_num_of_vlans->num_of_vlans = 0; + } + +proto_again: + fdret = FLOW_DISSECT_RET_CONTINUE; -again: switch (proto) { - case __constant_htons(ETH_P_IP): { + case htons(ETH_P_IP): { const struct iphdr *iph; struct iphdr _iph; -ip: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (!iph) - return false; - - if (ip_is_fragment(iph)) - ip_proto = 0; - else - ip_proto = iph->protocol; - iph_to_flow_copy_addrs(flow, iph); + + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); + if (!iph || iph->ihl < 5) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + nhoff += iph->ihl * 4; + + ip_proto = iph->protocol; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); + + memcpy(&key_addrs->v4addrs.src, &iph->saddr, + sizeof(key_addrs->v4addrs.src)); + memcpy(&key_addrs->v4addrs.dst, &iph->daddr, + sizeof(key_addrs->v4addrs.dst)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } + + __skb_flow_dissect_ipv4(skb, flow_dissector, + target_container, data, iph); + + if (ip_is_fragment(iph)) { + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + + if (iph->frag_off & htons(IP_OFFSET)) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } else { + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (!(flags & + FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + } + } + break; } - case __constant_htons(ETH_P_IPV6): { + case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; -ipv6: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (!iph) - return false; + + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); + if (!iph) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } ip_proto = iph->nexthdr; - flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); - flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); nhoff += sizeof(struct ipv6hdr); + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); + + memcpy(&key_addrs->v6addrs.src, &iph->saddr, + sizeof(key_addrs->v6addrs.src)); + memcpy(&key_addrs->v6addrs.dst, &iph->daddr, + sizeof(key_addrs->v6addrs.dst)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } + + if ((dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL) || + (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && + ip6_flowlabel(iph)) { + __be32 flow_label = ip6_flowlabel(iph); + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_label); + } + if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + } + + __skb_flow_dissect_ipv6(skb, flow_dissector, + target_container, data, iph); + break; } - case __constant_htons(ETH_P_8021Q): { - const struct vlan_hdr *vlan; + case htons(ETH_P_8021AD): + case htons(ETH_P_8021Q): { + const struct vlan_hdr *vlan = NULL; struct vlan_hdr _vlan; + __be16 saved_vlan_tpid = proto; + + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX && + skb && skb_vlan_tag_present(skb)) { + proto = skb->protocol; + } else { + vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), + data, hlen, &_vlan); + if (!vlan) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + proto = vlan->h_vlan_encapsulated_proto; + nhoff += sizeof(*vlan); + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) && + !(key_control->flags & FLOW_DIS_ENCAPSULATION)) { + struct flow_dissector_key_num_of_vlans *key_nvs; + + key_nvs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS, + target_container); + key_nvs->num_of_vlans++; + } - vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); - if (!vlan) - return false; + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { + dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; + } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { + dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN; + } else { + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + + if (dissector_uses_key(flow_dissector, dissector_vlan)) { + key_vlan = skb_flow_dissector_target(flow_dissector, + dissector_vlan, + target_container); + + if (!vlan) { + key_vlan->vlan_id = skb_vlan_tag_get_id(skb); + key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); + } else { + key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & + VLAN_VID_MASK; + key_vlan->vlan_priority = + (ntohs(vlan->h_vlan_TCI) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + } + key_vlan->vlan_tpid = saved_vlan_tpid; + key_vlan->vlan_eth_type = proto; + } - proto = vlan->h_vlan_encapsulated_proto; - nhoff += sizeof(*vlan); - goto again; + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; } - case __constant_htons(ETH_P_PPP_SES): { + case htons(ETH_P_PPP_SES): { struct { struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); - if (!hdr) - return false; - proto = hdr->proto; - nhoff += PPPOE_SES_HLEN; - switch (proto) { - case __constant_htons(PPP_IP): - goto ip; - case __constant_htons(PPP_IPV6): - goto ipv6; - default: - return false; + u16 ppp_proto; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + /* least significant bit of the most significant octet + * indicates if protocol field was compressed + */ + ppp_proto = ntohs(hdr->proto); + if (ppp_proto & 0x0100) { + ppp_proto = ppp_proto >> 8; + nhoff += PPPOE_SES_HLEN - 1; + } else { + nhoff += PPPOE_SES_HLEN; + } + + if (ppp_proto == PPP_IP) { + proto = htons(ETH_P_IP); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + } else if (ppp_proto == PPP_IPV6) { + proto = htons(ETH_P_IPV6); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + } else if (ppp_proto == PPP_MPLS_UC) { + proto = htons(ETH_P_MPLS_UC); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + } else if (ppp_proto == PPP_MPLS_MC) { + proto = htons(ETH_P_MPLS_MC); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + } else if (ppp_proto_is_valid(ppp_proto)) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + } else { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PPPOE)) { + struct flow_dissector_key_pppoe *key_pppoe; + + key_pppoe = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PPPOE, + target_container); + key_pppoe->session_id = hdr->hdr.sid; + key_pppoe->ppp_proto = htons(ppp_proto); + key_pppoe->type = htons(ETH_P_PPP_SES); + } + break; + } + case htons(ETH_P_TIPC): { + struct tipc_basic_hdr *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), + data, hlen, &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC, + target_container); + key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); + key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; + } + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + case htons(ETH_P_MPLS_UC): + case htons(ETH_P_MPLS_MC): + fdret = __skb_flow_dissect_mpls(skb, flow_dissector, + target_container, data, + nhoff, hlen, mpls_lse, + &mpls_el); + nhoff += sizeof(struct mpls_label); + mpls_lse++; + break; + case htons(ETH_P_FCOE): + if ((hlen - nhoff) < FCOE_HEADER_LEN) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + nhoff += FCOE_HEADER_LEN; + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + fdret = __skb_flow_dissect_arp(skb, flow_dissector, + target_container, data, + nhoff, hlen); + break; + + case htons(ETH_P_BATMAN): + fdret = __skb_flow_dissect_batadv(skb, key_control, data, + &proto, &nhoff, hlen, flags); + break; + + case htons(ETH_P_1588): { + struct ptp_header *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; } + + nhoff += sizeof(struct ptp_header); + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + case htons(ETH_P_PRP): + case htons(ETH_P_HSR): { + struct hsr_tag *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, + &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + proto = hdr->encap_proto; + nhoff += HSR_HLEN; + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; } + + case htons(ETH_P_CFM): + fdret = __skb_flow_dissect_cfm(skb, flow_dissector, + target_container, data, + nhoff, hlen); + break; + + default: + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + /* Process result of proto processing */ + switch (fdret) { + case FLOW_DISSECT_RET_OUT_GOOD: + goto out_good; + case FLOW_DISSECT_RET_PROTO_AGAIN: + if (skb_flow_dissect_allowed(&num_hdrs)) + goto proto_again; + goto out_good; + case FLOW_DISSECT_RET_CONTINUE: + case FLOW_DISSECT_RET_IPPROTO_AGAIN: + break; + case FLOW_DISSECT_RET_OUT_BAD: default: - return false; + goto out_bad; } +ip_proto_again: + fdret = FLOW_DISSECT_RET_CONTINUE; + switch (ip_proto) { - case IPPROTO_GRE: { - struct gre_hdr { - __be16 flags; - __be16 proto; - } *hdr, _hdr; + case IPPROTO_GRE: + if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); - if (!hdr) - return false; - /* - * Only look inside GRE if version zero and no - * routing - */ - if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) { - proto = hdr->proto; - nhoff += 4; - if (hdr->flags & GRE_CSUM) - nhoff += 4; - if (hdr->flags & GRE_KEY) - nhoff += 4; - if (hdr->flags & GRE_SEQ) - nhoff += 4; - if (proto == htons(ETH_P_TEB)) { - const struct ethhdr *eth; - struct ethhdr _eth; - - eth = skb_header_pointer(skb, nhoff, - sizeof(_eth), &_eth); - if (!eth) - return false; - proto = eth->h_proto; - nhoff += sizeof(*eth); + fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector, + target_container, data, + &proto, &nhoff, &hlen, flags); + break; + + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: { + u8 _opthdr[2], *opthdr; + + if (proto != htons(ETH_P_IPV6)) + break; + + opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), + data, hlen, &_opthdr); + if (!opthdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + ip_proto = opthdr[0]; + nhoff += (opthdr[1] + 1) << 3; + + fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; + break; + } + case NEXTHDR_FRAGMENT: { + struct frag_hdr _fh, *fh; + + if (proto != htons(ETH_P_IPV6)) + break; + + fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), + data, hlen, &_fh); + + if (!fh) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + + nhoff += sizeof(_fh); + ip_proto = fh->nexthdr; + + if (!(fh->frag_off & htons(IP6_OFFSET))) { + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { + fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; + break; } - goto again; } + + fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case IPPROTO_IPIP: - goto again; + if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + proto = htons(ETH_P_IP); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + + case IPPROTO_IPV6: + if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + proto = htons(ETH_P_IPV6); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + + + case IPPROTO_MPLS: + proto = htons(ETH_P_MPLS_UC); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + + case IPPROTO_TCP: + __skb_flow_dissect_tcp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + __skb_flow_dissect_icmp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + case IPPROTO_L2TP: + __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + case IPPROTO_ESP: + __skb_flow_dissect_esp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + case IPPROTO_AH: + __skb_flow_dissect_ah(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; default: break; } - flow->ip_proto = ip_proto; - poff = proto_ports_offset(ip_proto); - if (poff >= 0) { - __be32 *ports, _ports; + if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT)) + __skb_flow_dissect_ports(skb, flow_dissector, target_container, + data, nhoff, ip_proto, hlen); - nhoff += poff; - ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports); - if (ports) - flow->ports = *ports; + /* Process result of IP proto processing */ + switch (fdret) { + case FLOW_DISSECT_RET_PROTO_AGAIN: + if (skb_flow_dissect_allowed(&num_hdrs)) + goto proto_again; + break; + case FLOW_DISSECT_RET_IPPROTO_AGAIN: + if (skb_flow_dissect_allowed(&num_hdrs)) + goto ip_proto_again; + break; + case FLOW_DISSECT_RET_OUT_GOOD: + case FLOW_DISSECT_RET_CONTINUE: + break; + case FLOW_DISSECT_RET_OUT_BAD: + default: + goto out_bad; } - flow->thoff = (u16) nhoff; +out_good: + ret = true; - return true; +out: + key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + + return ret; + +out_bad: + ret = false; + goto out; } -EXPORT_SYMBOL(skb_flow_dissect); +EXPORT_SYMBOL(__skb_flow_dissect); -static u32 hashrnd __read_mostly; +static siphash_aligned_key_t hashrnd; +static __always_inline void __flow_hash_secret_init(void) +{ + net_get_random_once(&hashrnd, sizeof(hashrnd)); +} -/* - * __skb_get_rxhash: calculate a flow hash based on src/dst addresses - * and src/dst port numbers. Sets rxhash in skb to non-zero hash value - * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb - * if hash is a canonical 4-tuple hash over transport ports. - */ -void __skb_get_rxhash(struct sk_buff *skb) +static const void *flow_keys_hash_start(const struct flow_keys *flow) { - struct flow_keys keys; - u32 hash; + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT); + return &flow->FLOW_KEYS_HASH_START_FIELD; +} - if (!skb_flow_dissect(skb, &keys)) - return; +static inline size_t flow_keys_hash_length(const struct flow_keys *flow) +{ + size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); + + BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); + + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + diff -= sizeof(flow->addrs.v4addrs); + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + diff -= sizeof(flow->addrs.v6addrs); + break; + case FLOW_DISSECTOR_KEY_TIPC: + diff -= sizeof(flow->addrs.tipckey); + break; + } + return sizeof(*flow) - diff; +} - if (keys.ports) - skb->l4_rxhash = 1; +__be32 flow_get_u32_src(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.src; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.src); + case FLOW_DISSECTOR_KEY_TIPC: + return flow->addrs.tipckey.key; + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_src); + +__be32 flow_get_u32_dst(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.dst; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.dst); + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_dst); + +/* Sort the source and destination IP and the ports, + * to have consistent hash within the two directions + */ +static inline void __flow_hash_consistentify(struct flow_keys *keys) +{ + int addr_diff, i; - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys.dst < (__force u32)keys.src) || - (((__force u32)keys.dst == (__force u32)keys.src) && - ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { - swap(keys.dst, keys.src); - swap(keys.port16[0], keys.port16[1]); + switch (keys->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + if ((__force u32)keys->addrs.v4addrs.dst < + (__force u32)keys->addrs.v4addrs.src) + swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); + + if ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src) { + swap(keys->ports.src, keys->ports.dst); + } + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + addr_diff = memcmp(&keys->addrs.v6addrs.dst, + &keys->addrs.v6addrs.src, + sizeof(keys->addrs.v6addrs.dst)); + if (addr_diff < 0) { + for (i = 0; i < 4; i++) + swap(keys->addrs.v6addrs.src.s6_addr32[i], + keys->addrs.v6addrs.dst.s6_addr32[i]); + } + if ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src) { + swap(keys->ports.src, keys->ports.dst); + } + break; } +} - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, hashrnd); +static inline u32 __flow_hash_from_keys(struct flow_keys *keys, + const siphash_key_t *keyval) +{ + u32 hash; + + __flow_hash_consistentify(keys); + + hash = siphash(flow_keys_hash_start(keys), + flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; - skb->rxhash = hash; + return hash; +} + +u32 flow_hash_from_keys(struct flow_keys *keys) +{ + __flow_hash_secret_init(); + return __flow_hash_from_keys(keys, &hashrnd); +} +EXPORT_SYMBOL(flow_hash_from_keys); + +u32 flow_hash_from_keys_seed(struct flow_keys *keys, + const siphash_key_t *keyval) +{ + return __flow_hash_from_keys(keys, keyval); +} +EXPORT_SYMBOL(flow_hash_from_keys_seed); + +static inline u32 ___skb_get_hash(const struct sk_buff *skb, + struct flow_keys *keys, + const siphash_key_t *keyval) +{ + skb_flow_dissect_flow_keys(skb, keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + + return __flow_hash_from_keys(keys, keyval); +} + +struct _flow_keys_digest_data { + __be16 n_proto; + u8 ip_proto; + u8 padding; + __be32 ports; + __be32 src; + __be32 dst; +}; + +void make_flow_keys_digest(struct flow_keys_digest *digest, + const struct flow_keys *flow) +{ + struct _flow_keys_digest_data *data = + (struct _flow_keys_digest_data *)digest; + + BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); + + memset(digest, 0, sizeof(*digest)); + + data->n_proto = flow->basic.n_proto; + data->ip_proto = flow->basic.ip_proto; + data->ports = flow->ports.ports; + data->src = flow->addrs.v4addrs.src; + data->dst = flow->addrs.v4addrs.dst; +} +EXPORT_SYMBOL(make_flow_keys_digest); + +static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; + +u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb) +{ + struct flow_keys keys; + + __flow_hash_secret_init(); + + memset(&keys, 0, sizeof(keys)); + __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric, + &keys, NULL, 0, 0, 0, 0); + + return __flow_hash_from_keys(&keys, &hashrnd); } -EXPORT_SYMBOL(__skb_get_rxhash); +EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net); -/* - * Returns a Tx hash based on the given packet descriptor a Tx queues' number - * to be used as a distribution range. +/** + * __skb_get_hash_net: calculate a flow hash + * @net: associated network namespace, derived from @skb if NULL + * @skb: sk_buff to calculate flow hash from + * + * This function calculates a flow hash based on src/dst addresses + * and src/dst port numbers. Sets hash in skb to non-zero hash value + * on success, zero indicates no valid hash. Also, sets l4_hash in skb + * if hash is a canonical 4-tuple hash over transport ports. */ -u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, - unsigned int num_tx_queues) +void __skb_get_hash_net(const struct net *net, struct sk_buff *skb) { + struct flow_keys keys; u32 hash; - u16 qoffset = 0; - u16 qcount = num_tx_queues; - if (skb_rx_queue_recorded(skb)) { - hash = skb_get_rx_queue(skb); - while (unlikely(hash >= num_tx_queues)) - hash -= num_tx_queues; - return hash; - } + memset(&keys, 0, sizeof(keys)); - if (dev->num_tc) { - u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - qoffset = dev->tc_to_txq[tc].offset; - qcount = dev->tc_to_txq[tc].count; - } + __skb_flow_dissect(net, skb, &flow_keys_dissector, + &keys, NULL, 0, 0, 0, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol; - hash = jhash_1word(hash, hashrnd); + __flow_hash_secret_init(); + + hash = __flow_hash_from_keys(&keys, &hashrnd); - return (u16) (((u64) hash * qcount) >> 32) + qoffset; + __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } -EXPORT_SYMBOL(__skb_tx_hash); +EXPORT_SYMBOL(__skb_get_hash_net); -/* __skb_get_poff() returns the offset to the payload as far as it could - * be dissected. The main user is currently BPF, so that we can dynamically - * truncate packets without needing to push actual payload to the user - * space and can analyze headers only, instead. - */ -u32 __skb_get_poff(const struct sk_buff *skb) +__u32 skb_get_hash_perturb(const struct sk_buff *skb, + const siphash_key_t *perturb) { struct flow_keys keys; - u32 poff = 0; - if (!skb_flow_dissect(skb, &keys)) - return 0; + return ___skb_get_hash(skb, &keys, perturb); +} +EXPORT_SYMBOL(skb_get_hash_perturb); + +u32 __skb_get_poff(const struct sk_buff *skb, const void *data, + const struct flow_keys_basic *keys, int hlen) +{ + u32 poff = keys->control.thoff; - poff += keys.thoff; - switch (keys.ip_proto) { + /* skip L4 headers for fragments after the first */ + if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && + !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) + return poff; + + switch (keys->basic.ip_proto) { case IPPROTO_TCP: { - const struct tcphdr *tcph; - struct tcphdr _tcph; + /* access doff as u8 to avoid unaligned access */ + const u8 *doff; + u8 _doff; - tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph); - if (!tcph) + doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff), + data, hlen, &_doff); + if (!doff) return poff; - poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4); + poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2); break; } case IPPROTO_UDP: @@ -285,104 +1967,135 @@ u32 __skb_get_poff(const struct sk_buff *skb) return poff; } -static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) -{ - if (unlikely(queue_index >= dev->real_num_tx_queues)) { - net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", - dev->name, queue_index, - dev->real_num_tx_queues); - return 0; - } - return queue_index; -} - -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +/** + * skb_get_poff - get the offset to the payload + * @skb: sk_buff to get the payload offset from + * + * The function will get the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 skb_get_poff(const struct sk_buff *skb) { -#ifdef CONFIG_XPS - struct xps_dev_maps *dev_maps; - struct xps_map *map; - int queue_index = -1; + struct flow_keys_basic keys; - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); - if (dev_maps) { - map = rcu_dereference( - dev_maps->cpu_map[raw_smp_processor_id()]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else { - u32 hash; - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol ^ - skb->rxhash; - hash = jhash_1word(hash, hashrnd); - queue_index = map->queues[ - ((u64)hash * map->len) >> 32]; - } - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; - } - } - rcu_read_unlock(); + if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, + NULL, 0, 0, 0, 0)) + return 0; - return queue_index; -#else - return -1; -#endif + return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } -u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) { - struct sock *sk = skb->sk; - int queue_index = sk_tx_queue_get(sk); - - if (queue_index < 0 || skb->ooo_okay || - queue_index >= dev->real_num_tx_queues) { - int new_index = get_xps_queue(dev, skb); - if (new_index < 0) - new_index = skb_tx_hash(dev, skb); + memset(keys, 0, sizeof(*keys)); - if (queue_index != new_index && sk) { - struct dst_entry *dst = - rcu_dereference_check(sk->sk_dst_cache, 1); + memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, + sizeof(keys->addrs.v6addrs.src)); + memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys->addrs.v6addrs.dst)); + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys->ports.src = fl6->fl6_sport; + keys->ports.dst = fl6->fl6_dport; + keys->keyid.keyid = fl6->fl6_gre_key; + keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); + keys->basic.ip_proto = fl6->flowi6_proto; - if (dst && skb_dst(skb) == dst) - sk_tx_queue_set(sk, queue_index); + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi6); - } +static const struct flow_dissector_key flow_keys_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_TIPC, + .offset = offsetof(struct flow_keys, addrs.tipckey), + }, + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct flow_keys, ports), + }, + { + .key_id = FLOW_DISSECTOR_KEY_VLAN, + .offset = offsetof(struct flow_keys, vlan), + }, + { + .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, + .offset = offsetof(struct flow_keys, tags), + }, + { + .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, + .offset = offsetof(struct flow_keys, keyid), + }, +}; - queue_index = new_index; - } +static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct flow_keys, ports), + }, +}; - return queue_index; -} -EXPORT_SYMBOL(__netdev_pick_tx); +static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, +}; -struct netdev_queue *netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb) -{ - int queue_index = 0; +struct flow_dissector flow_keys_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_dissector); - if (dev->real_num_tx_queues != 1) { - const struct net_device_ops *ops = dev->netdev_ops; - if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb); - else - queue_index = __netdev_pick_tx(dev, skb); - queue_index = dev_cap_txqueue(dev, queue_index); - } - - skb_set_queue_mapping(skb, queue_index); - return netdev_get_tx_queue(dev, queue_index); -} +struct flow_dissector flow_keys_basic_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_basic_dissector); -static int __init initialize_hashrnd(void) +static int __init init_default_flow_dissectors(void) { - get_random_bytes(&hashrnd, sizeof(hashrnd)); + skb_flow_dissector_init(&flow_keys_dissector, + flow_keys_dissector_keys, + ARRAY_SIZE(flow_keys_dissector_keys)); + skb_flow_dissector_init(&flow_keys_dissector_symmetric, + flow_keys_dissector_symmetric_keys, + ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); + skb_flow_dissector_init(&flow_keys_basic_dissector, + flow_keys_basic_dissector_keys, + ARRAY_SIZE(flow_keys_basic_dissector_keys)); return 0; } - -late_initcall_sync(initialize_hashrnd); +core_initcall(init_default_flow_dissectors); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c new file mode 100644 index 000000000000..bc5169482710 --- /dev/null +++ b/net/core/flow_offload.c @@ -0,0 +1,638 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <net/act_api.h> +#include <net/flow_offload.h> +#include <linux/rtnetlink.h> +#include <linux/mutex.h> +#include <linux/rhashtable.h> + +struct flow_rule *flow_rule_alloc(unsigned int num_actions) +{ + struct flow_rule *rule; + int i; + + rule = kzalloc(struct_size(rule, action.entries, num_actions), + GFP_KERNEL); + if (!rule) + return NULL; + + rule->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. + */ + for (i = 0; i < num_actions; i++) + rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; + + return rule; +} +EXPORT_SYMBOL(flow_rule_alloc); + +struct flow_offload_action *offload_action_alloc(unsigned int num_actions) +{ + struct flow_offload_action *fl_action; + int i; + + fl_action = kzalloc(struct_size(fl_action, action.entries, num_actions), + GFP_KERNEL); + if (!fl_action) + return NULL; + + fl_action->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. + */ + for (i = 0; i < num_actions; i++) + fl_action->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; + + return fl_action; +} + +#define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \ + const struct flow_match *__m = &(__rule)->match; \ + struct flow_dissector *__d = (__m)->dissector; \ + \ + (__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \ + (__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \ + +void flow_rule_match_meta(const struct flow_rule *rule, + struct flow_match_meta *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out); +} +EXPORT_SYMBOL(flow_rule_match_meta); + +void flow_rule_match_basic(const struct flow_rule *rule, + struct flow_match_basic *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_BASIC, out); +} +EXPORT_SYMBOL(flow_rule_match_basic); + +void flow_rule_match_control(const struct flow_rule *rule, + struct flow_match_control *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CONTROL, out); +} +EXPORT_SYMBOL(flow_rule_match_control); + +void flow_rule_match_eth_addrs(const struct flow_rule *rule, + struct flow_match_eth_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_eth_addrs); + +void flow_rule_match_vlan(const struct flow_rule *rule, + struct flow_match_vlan *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_VLAN, out); +} +EXPORT_SYMBOL(flow_rule_match_vlan); + +void flow_rule_match_cvlan(const struct flow_rule *rule, + struct flow_match_vlan *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CVLAN, out); +} +EXPORT_SYMBOL(flow_rule_match_cvlan); + +void flow_rule_match_arp(const struct flow_rule *rule, + struct flow_match_arp *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ARP, out); +} +EXPORT_SYMBOL(flow_rule_match_arp); + +void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_ipv4_addrs); + +void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV6_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_ipv6_addrs); + +void flow_rule_match_ip(const struct flow_rule *rule, + struct flow_match_ip *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IP, out); +} +EXPORT_SYMBOL(flow_rule_match_ip); + +void flow_rule_match_ports(const struct flow_rule *rule, + struct flow_match_ports *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS, out); +} +EXPORT_SYMBOL(flow_rule_match_ports); + +void flow_rule_match_ports_range(const struct flow_rule *rule, + struct flow_match_ports_range *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS_RANGE, out); +} +EXPORT_SYMBOL(flow_rule_match_ports_range); + +void flow_rule_match_tcp(const struct flow_rule *rule, + struct flow_match_tcp *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_TCP, out); +} +EXPORT_SYMBOL(flow_rule_match_tcp); + +void flow_rule_match_ipsec(const struct flow_rule *rule, + struct flow_match_ipsec *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPSEC, out); +} +EXPORT_SYMBOL(flow_rule_match_ipsec); + +void flow_rule_match_icmp(const struct flow_rule *rule, + struct flow_match_icmp *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ICMP, out); +} +EXPORT_SYMBOL(flow_rule_match_icmp); + +void flow_rule_match_mpls(const struct flow_rule *rule, + struct flow_match_mpls *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_MPLS, out); +} +EXPORT_SYMBOL(flow_rule_match_mpls); + +void flow_rule_match_enc_control(const struct flow_rule *rule, + struct flow_match_control *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_control); + +void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ipv4_addrs); + +void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ipv6_addrs); + +void flow_rule_match_enc_ip(const struct flow_rule *rule, + struct flow_match_ip *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IP, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ip); + +void flow_rule_match_enc_ports(const struct flow_rule *rule, + struct flow_match_ports *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_PORTS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ports); + +void flow_rule_match_enc_keyid(const struct flow_rule *rule, + struct flow_match_enc_keyid *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_KEYID, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_keyid); + +void flow_rule_match_enc_opts(const struct flow_rule *rule, + struct flow_match_enc_opts *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_OPTS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_opts); + +struct flow_action_cookie *flow_action_cookie_create(void *data, + unsigned int len, + gfp_t gfp) +{ + struct flow_action_cookie *cookie; + + cookie = kmalloc(sizeof(*cookie) + len, gfp); + if (!cookie) + return NULL; + cookie->cookie_len = len; + memcpy(cookie->cookie, data, len); + return cookie; +} +EXPORT_SYMBOL(flow_action_cookie_create); + +void flow_action_cookie_destroy(struct flow_action_cookie *cookie) +{ + kfree(cookie); +} +EXPORT_SYMBOL(flow_action_cookie_destroy); + +void flow_rule_match_ct(const struct flow_rule *rule, + struct flow_match_ct *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CT, out); +} +EXPORT_SYMBOL(flow_rule_match_ct); + +void flow_rule_match_pppoe(const struct flow_rule *rule, + struct flow_match_pppoe *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PPPOE, out); +} +EXPORT_SYMBOL(flow_rule_match_pppoe); + +void flow_rule_match_l2tpv3(const struct flow_rule *rule, + struct flow_match_l2tpv3 *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_L2TPV3, out); +} +EXPORT_SYMBOL(flow_rule_match_l2tpv3); + +struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + void (*release)(void *cb_priv)) +{ + struct flow_block_cb *block_cb; + + block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); + if (!block_cb) + return ERR_PTR(-ENOMEM); + + block_cb->cb = cb; + block_cb->cb_ident = cb_ident; + block_cb->cb_priv = cb_priv; + block_cb->release = release; + + return block_cb; +} +EXPORT_SYMBOL(flow_block_cb_alloc); + +void flow_block_cb_free(struct flow_block_cb *block_cb) +{ + if (block_cb->release) + block_cb->release(block_cb->cb_priv); + + kfree(block_cb); +} +EXPORT_SYMBOL(flow_block_cb_free); + +struct flow_block_cb *flow_block_cb_lookup(struct flow_block *block, + flow_setup_cb_t *cb, void *cb_ident) +{ + struct flow_block_cb *block_cb; + + list_for_each_entry(block_cb, &block->cb_list, list) { + if (block_cb->cb == cb && + block_cb->cb_ident == cb_ident) + return block_cb; + } + + return NULL; +} +EXPORT_SYMBOL(flow_block_cb_lookup); + +void *flow_block_cb_priv(struct flow_block_cb *block_cb) +{ + return block_cb->cb_priv; +} +EXPORT_SYMBOL(flow_block_cb_priv); + +void flow_block_cb_incref(struct flow_block_cb *block_cb) +{ + block_cb->refcnt++; +} +EXPORT_SYMBOL(flow_block_cb_incref); + +unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb) +{ + return --block_cb->refcnt; +} +EXPORT_SYMBOL(flow_block_cb_decref); + +bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident, + struct list_head *driver_block_list) +{ + struct flow_block_cb *block_cb; + + list_for_each_entry(block_cb, driver_block_list, driver_list) { + if (block_cb->cb == cb && + block_cb->cb_ident == cb_ident) + return true; + } + + return false; +} +EXPORT_SYMBOL(flow_block_cb_is_busy); + +int flow_block_cb_setup_simple(struct flow_block_offload *f, + struct list_head *driver_block_list, + flow_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + bool ingress_only) +{ + struct flow_block_cb *block_cb; + + if (ingress_only && + f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + f->driver_block_list = driver_block_list; + + switch (f->command) { + case FLOW_BLOCK_BIND: + if (flow_block_cb_is_busy(cb, cb_ident, driver_block_list)) + return -EBUSY; + + block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, NULL); + if (IS_ERR(block_cb)) + return PTR_ERR(block_cb); + + flow_block_cb_add(block_cb, f); + list_add_tail(&block_cb->driver_list, driver_block_list); + return 0; + case FLOW_BLOCK_UNBIND: + block_cb = flow_block_cb_lookup(f->block, cb, cb_ident); + if (!block_cb) + return -ENOENT; + + flow_block_cb_remove(block_cb, f); + list_del(&block_cb->driver_list); + return 0; + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL(flow_block_cb_setup_simple); + +static DEFINE_MUTEX(flow_indr_block_lock); +static LIST_HEAD(flow_block_indr_list); +static LIST_HEAD(flow_block_indr_dev_list); +static LIST_HEAD(flow_indir_dev_list); + +struct flow_indr_dev { + struct list_head list; + flow_indr_block_bind_cb_t *cb; + void *cb_priv; + refcount_t refcnt; +}; + +static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, + void *cb_priv) +{ + struct flow_indr_dev *indr_dev; + + indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL); + if (!indr_dev) + return NULL; + + indr_dev->cb = cb; + indr_dev->cb_priv = cb_priv; + refcount_set(&indr_dev->refcnt, 1); + + return indr_dev; +} + +struct flow_indir_dev_info { + void *data; + struct net_device *dev; + struct Qdisc *sch; + enum tc_setup_type type; + void (*cleanup)(struct flow_block_cb *block_cb); + struct list_head list; + enum flow_block_command command; + enum flow_block_binder_type binder_type; + struct list_head *cb_list; +}; + +static void existing_qdiscs_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) +{ + struct flow_block_offload bo; + struct flow_indir_dev_info *cur; + + list_for_each_entry(cur, &flow_indir_dev_list, list) { + memset(&bo, 0, sizeof(bo)); + bo.command = cur->command; + bo.binder_type = cur->binder_type; + INIT_LIST_HEAD(&bo.cb_list); + cb(cur->dev, cur->sch, cb_priv, cur->type, &bo, cur->data, cur->cleanup); + list_splice(&bo.cb_list, cur->cb_list); + } +} + +int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) +{ + struct flow_indr_dev *indr_dev; + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) { + if (indr_dev->cb == cb && + indr_dev->cb_priv == cb_priv) { + refcount_inc(&indr_dev->refcnt); + mutex_unlock(&flow_indr_block_lock); + return 0; + } + } + + indr_dev = flow_indr_dev_alloc(cb, cb_priv); + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return -ENOMEM; + } + + list_add(&indr_dev->list, &flow_block_indr_dev_list); + existing_qdiscs_register(cb, cb_priv); + mutex_unlock(&flow_indr_block_lock); + + tcf_action_reoffload_cb(cb, cb_priv, true); + + return 0; +} +EXPORT_SYMBOL(flow_indr_dev_register); + +static void __flow_block_indr_cleanup(void (*release)(void *cb_priv), + void *cb_priv, + struct list_head *cleanup_list) +{ + struct flow_block_cb *this, *next; + + list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) { + if (this->release == release && + this->indr.cb_priv == cb_priv) + list_move(&this->indr.list, cleanup_list); + } +} + +static void flow_block_indr_notify(struct list_head *cleanup_list) +{ + struct flow_block_cb *this, *next; + + list_for_each_entry_safe(this, next, cleanup_list, indr.list) { + list_del(&this->indr.list); + this->indr.cleanup(this); + } +} + +void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, + void (*release)(void *cb_priv)) +{ + struct flow_indr_dev *this, *next, *indr_dev = NULL; + LIST_HEAD(cleanup_list); + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) { + if (this->cb == cb && + this->cb_priv == cb_priv && + refcount_dec_and_test(&this->refcnt)) { + indr_dev = this; + list_del(&indr_dev->list); + break; + } + } + + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return; + } + + __flow_block_indr_cleanup(release, cb_priv, &cleanup_list); + mutex_unlock(&flow_indr_block_lock); + + tcf_action_reoffload_cb(cb, cb_priv, false); + flow_block_indr_notify(&cleanup_list); + kfree(indr_dev); +} +EXPORT_SYMBOL(flow_indr_dev_unregister); + +static void flow_block_indr_init(struct flow_block_cb *flow_block, + struct flow_block_offload *bo, + struct net_device *dev, struct Qdisc *sch, void *data, + void *cb_priv, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + flow_block->indr.binder_type = bo->binder_type; + flow_block->indr.data = data; + flow_block->indr.cb_priv = cb_priv; + flow_block->indr.dev = dev; + flow_block->indr.sch = sch; + flow_block->indr.cleanup = cleanup; +} + +struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + void (*release)(void *cb_priv), + struct flow_block_offload *bo, + struct net_device *dev, + struct Qdisc *sch, void *data, + void *indr_cb_priv, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_block_cb *block_cb; + + block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, release); + if (IS_ERR(block_cb)) + goto out; + + flow_block_indr_init(block_cb, bo, dev, sch, data, indr_cb_priv, cleanup); + list_add(&block_cb->indr.list, &flow_block_indr_list); + +out: + return block_cb; +} +EXPORT_SYMBOL(flow_indr_block_cb_alloc); + +static struct flow_indir_dev_info *find_indir_dev(void *data) +{ + struct flow_indir_dev_info *cur; + + list_for_each_entry(cur, &flow_indir_dev_list, list) { + if (cur->data == data) + return cur; + } + return NULL; +} + +static int indir_dev_add(void *data, struct net_device *dev, struct Qdisc *sch, + enum tc_setup_type type, void (*cleanup)(struct flow_block_cb *block_cb), + struct flow_block_offload *bo) +{ + struct flow_indir_dev_info *info; + + info = find_indir_dev(data); + if (info) + return -EEXIST; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->data = data; + info->dev = dev; + info->sch = sch; + info->type = type; + info->cleanup = cleanup; + info->command = bo->command; + info->binder_type = bo->binder_type; + info->cb_list = bo->cb_list_head; + + list_add(&info->list, &flow_indir_dev_list); + return 0; +} + +static int indir_dev_remove(void *data) +{ + struct flow_indir_dev_info *info; + + info = find_indir_dev(data); + if (!info) + return -ENOENT; + + list_del(&info->list); + + kfree(info); + return 0; +} + +int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, + enum tc_setup_type type, void *data, + struct flow_block_offload *bo, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_indr_dev *this; + u32 count = 0; + int err; + + mutex_lock(&flow_indr_block_lock); + if (bo) { + if (bo->command == FLOW_BLOCK_BIND) + indir_dev_add(data, dev, sch, type, cleanup, bo); + else if (bo->command == FLOW_BLOCK_UNBIND) + indir_dev_remove(data); + } + + list_for_each_entry(this, &flow_block_indr_dev_list, list) { + err = this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); + if (!err) + count++; + } + + mutex_unlock(&flow_indr_block_lock); + + return (bo && list_empty(&bo->cb_list)) ? -EOPNOTSUPP : count; +} +EXPORT_SYMBOL(flow_indr_dev_setup_offload); + +bool flow_indr_dev_exists(void) +{ + return !list_empty(&flow_block_indr_dev_list); +} +EXPORT_SYMBOL(flow_indr_dev_exists); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 6b5b6e7013ca..f112156db587 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -1,19 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/gen_estimator.c Simple rate estimator. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Eric Dumazet <edumazet@google.com> * * Changes: * Jamal Hadi Salim - moved it to net/core and reshulfed * names to make it usable in general net subsystem. */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/module.h> #include <linux/types.h> @@ -30,258 +27,204 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> -#include <linux/rbtree.h> #include <linux/slab.h> +#include <linux/seqlock.h> #include <net/sock.h> #include <net/gen_stats.h> -/* - This code is NOT intended to be used for statistics collection, - its purpose is to provide a base for statistical multiplexing - for controlled load service. - If you need only statistics, run a user level daemon which - periodically reads byte counters. - - Unfortunately, rate estimation is not a very easy task. - F.e. I did not find a simple way to estimate the current peak rate - and even failed to formulate the problem 8)8) - - So I preferred not to built an estimator into the scheduler, - but run this task separately. - Ideally, it should be kernel thread(s), but for now it runs - from timers, which puts apparent top bounds on the number of rated - flows, has minimal overhead on small, but is enough - to handle controlled load service, sets of aggregates. - - We measure rate over A=(1<<interval) seconds and evaluate EWMA: - - avrate = avrate*(1-W) + rate*W - - where W is chosen as negative power of 2: W = 2^(-ewma_log) - - The resulting time constant is: - - T = A/(-ln(1-W)) - - - NOTES. - - * avbps is scaled by 2^5, avpps is scaled by 2^10. - * both values are reported as 32 bit unsigned values. bps can - overflow for fast links : max speed being 34360Mbit/sec - * Minimal interval is HZ/4=250msec (it is the greatest common divisor - for HZ=100 and HZ=1024 8)), maximal interval - is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals - are too expensive, longer ones can be implemented - at user level painlessly. +/* This code is NOT intended to be used for statistics collection, + * its purpose is to provide a base for statistical multiplexing + * for controlled load service. + * If you need only statistics, run a user level daemon which + * periodically reads byte counters. */ -#define EST_MAX_INTERVAL 5 - -struct gen_estimator -{ - struct list_head list; - struct gnet_stats_basic_packed *bstats; - struct gnet_stats_rate_est64 *rate_est; +struct net_rate_estimator { + struct gnet_stats_basic_sync *bstats; spinlock_t *stats_lock; - int ewma_log; + bool running; + struct gnet_stats_basic_sync __percpu *cpu_bstats; + u8 ewma_log; + u8 intvl_log; /* period : (250ms << intvl_log) */ + + seqcount_t seq; + u64 last_packets; u64 last_bytes; + + u64 avpps; u64 avbps; - u32 last_packets; - u32 avpps; - struct rcu_head e_rcu; - struct rb_node node; -}; -struct gen_estimator_head -{ - struct timer_list timer; - struct list_head list; + unsigned long next_jiffies; + struct timer_list timer; + struct rcu_head rcu; }; -static struct gen_estimator_head elist[EST_MAX_INTERVAL+1]; - -/* Protects against NULL dereference */ -static DEFINE_RWLOCK(est_lock); - -/* Protects against soft lockup during large deletion */ -static struct rb_root est_root = RB_ROOT; -static DEFINE_SPINLOCK(est_tree_lock); - -static void est_timer(unsigned long arg) +static void est_fetch_counters(struct net_rate_estimator *e, + struct gnet_stats_basic_sync *b) { - int idx = (int)arg; - struct gen_estimator *e; + gnet_stats_basic_sync_init(b); + if (e->stats_lock) + spin_lock(e->stats_lock); - rcu_read_lock(); - list_for_each_entry_rcu(e, &elist[idx].list, list) { - u64 nbytes; - u64 brate; - u32 npackets; - u32 rate; + gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running); - spin_lock(e->stats_lock); - read_lock(&est_lock); - if (e->bstats == NULL) - goto skip; - - nbytes = e->bstats->bytes; - npackets = e->bstats->packets; - brate = (nbytes - e->last_bytes)<<(7 - idx); - e->last_bytes = nbytes; - e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); - e->rate_est->bps = (e->avbps+0xF)>>5; - - rate = (npackets - e->last_packets)<<(12 - idx); - e->last_packets = npackets; - e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); - e->rate_est->pps = (e->avpps+0x1FF)>>10; -skip: - read_unlock(&est_lock); + if (e->stats_lock) spin_unlock(e->stats_lock); - } - if (!list_empty(&elist[idx].list)) - mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); - rcu_read_unlock(); } -static void gen_add_node(struct gen_estimator *est) +static void est_timer(struct timer_list *t) { - struct rb_node **p = &est_root.rb_node, *parent = NULL; + struct net_rate_estimator *est = timer_container_of(est, t, timer); + struct gnet_stats_basic_sync b; + u64 b_bytes, b_packets; + u64 rate, brate; - while (*p) { - struct gen_estimator *e; + est_fetch_counters(est, &b); + b_bytes = u64_stats_read(&b.bytes); + b_packets = u64_stats_read(&b.packets); - parent = *p; - e = rb_entry(parent, struct gen_estimator, node); + brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log); + brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); - if (est->bstats > e->bstats) - p = &parent->rb_right; - else - p = &parent->rb_left; - } - rb_link_node(&est->node, parent, p); - rb_insert_color(&est->node, &est_root); -} + rate = (b_packets - est->last_packets) << (10 - est->intvl_log); + rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); -static -struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est64 *rate_est) -{ - struct rb_node *p = est_root.rb_node; + preempt_disable_nested(); + write_seqcount_begin(&est->seq); + est->avbps += brate; + est->avpps += rate; + write_seqcount_end(&est->seq); + preempt_enable_nested(); - while (p) { - struct gen_estimator *e; + est->last_bytes = b_bytes; + est->last_packets = b_packets; - e = rb_entry(p, struct gen_estimator, node); + est->next_jiffies += ((HZ/4) << est->intvl_log); - if (bstats > e->bstats) - p = p->rb_right; - else if (bstats < e->bstats || rate_est != e->rate_est) - p = p->rb_left; - else - return e; + if (unlikely(time_after_eq(jiffies, est->next_jiffies))) { + /* Ouch... timer was delayed. */ + est->next_jiffies = jiffies + 1; } - return NULL; + mod_timer(&est->timer, est->next_jiffies); } /** * gen_new_estimator - create a new rate estimator * @bstats: basic statistics + * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics - * @stats_lock: statistics lock + * @lock: lock for statistics and control path + * @running: true if @bstats represents a running qdisc, thus @bstats' + * internal values might change during basic reads. Only used + * if @bstats_cpu is NULL * @opt: rate estimator configuration TLV * * Creates a new rate estimator with &bstats as source and &rate_est * as destination. A new timer with the interval specified in the * configuration TLV is created. Upon each interval, the latest statistics * will be read from &bstats and the estimated rate will be stored in - * &rate_est with the statistics lock grabed during this period. + * &rate_est with the statistics lock grabbed during this period. * * Returns 0 on success or a negative error code. * */ -int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est64 *rate_est, - spinlock_t *stats_lock, +int gen_new_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, + bool running, struct nlattr *opt) { - struct gen_estimator *est; struct gnet_estimator *parm = nla_data(opt); - int idx; + struct net_rate_estimator *old, *est; + struct gnet_stats_basic_sync b; + int intvl_log; if (nla_len(opt) < sizeof(*parm)) return -EINVAL; + /* allowed timer periods are : + * -2 : 250ms, -1 : 500ms, 0 : 1 sec + * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec + */ if (parm->interval < -2 || parm->interval > 3) return -EINVAL; + if (parm->ewma_log == 0 || parm->ewma_log >= 31) + return -EINVAL; + est = kzalloc(sizeof(*est), GFP_KERNEL); - if (est == NULL) + if (!est) return -ENOBUFS; - idx = parm->interval + 2; + seqcount_init(&est->seq); + intvl_log = parm->interval + 2; est->bstats = bstats; - est->rate_est = rate_est; - est->stats_lock = stats_lock; + est->stats_lock = lock; + est->running = running; est->ewma_log = parm->ewma_log; - est->last_bytes = bstats->bytes; - est->avbps = rate_est->bps<<5; - est->last_packets = bstats->packets; - est->avpps = rate_est->pps<<10; - - spin_lock_bh(&est_tree_lock); - if (!elist[idx].timer.function) { - INIT_LIST_HEAD(&elist[idx].list); - setup_timer(&elist[idx].timer, est_timer, idx); + est->intvl_log = intvl_log; + est->cpu_bstats = cpu_bstats; + + if (lock) + local_bh_disable(); + est_fetch_counters(est, &b); + if (lock) + local_bh_enable(); + est->last_bytes = u64_stats_read(&b.bytes); + est->last_packets = u64_stats_read(&b.packets); + + if (lock) + spin_lock_bh(lock); + old = rcu_dereference_protected(*rate_est, 1); + if (old) { + timer_delete_sync(&old->timer); + est->avbps = old->avbps; + est->avpps = old->avpps; } - if (list_empty(&elist[idx].list)) - mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); - - list_add_rcu(&est->list, &elist[idx].list); - gen_add_node(est); - spin_unlock_bh(&est_tree_lock); + est->next_jiffies = jiffies + ((HZ/4) << intvl_log); + timer_setup(&est->timer, est_timer, 0); + mod_timer(&est->timer, est->next_jiffies); + rcu_assign_pointer(*rate_est, est); + if (lock) + spin_unlock_bh(lock); + if (old) + kfree_rcu(old, rcu); return 0; } EXPORT_SYMBOL(gen_new_estimator); /** * gen_kill_estimator - remove a rate estimator - * @bstats: basic statistics - * @rate_est: rate estimator statistics + * @rate_est: rate estimator * - * Removes the rate estimator specified by &bstats and &rate_est. + * Removes the rate estimator. * - * Note : Caller should respect an RCU grace period before freeing stats_lock */ -void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est64 *rate_est) +void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est) { - struct gen_estimator *e; - - spin_lock_bh(&est_tree_lock); - while ((e = gen_find_node(bstats, rate_est))) { - rb_erase(&e->node, &est_root); + struct net_rate_estimator *est; - write_lock(&est_lock); - e->bstats = NULL; - write_unlock(&est_lock); - - list_del_rcu(&e->list); - kfree_rcu(e, e_rcu); + est = unrcu_pointer(xchg(rate_est, NULL)); + if (est) { + timer_shutdown_sync(&est->timer); + kfree_rcu(est, rcu); } - spin_unlock_bh(&est_tree_lock); } EXPORT_SYMBOL(gen_kill_estimator); /** * gen_replace_estimator - replace rate estimator configuration * @bstats: basic statistics + * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics - * @stats_lock: statistics lock + * @lock: lock for statistics and control path + * @running: true if @bstats represents a running qdisc, thus @bstats' + * internal values might change during basic reads. Only used + * if @cpu_bstats is NULL * @opt: rate estimator configuration TLV * * Replaces the configuration of a rate estimator by calling @@ -289,33 +232,49 @@ EXPORT_SYMBOL(gen_kill_estimator); * * Returns 0 on success or a negative error code. */ -int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est64 *rate_est, - spinlock_t *stats_lock, struct nlattr *opt) +int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, + bool running, struct nlattr *opt) { - gen_kill_estimator(bstats, rate_est); - return gen_new_estimator(bstats, rate_est, stats_lock, opt); + return gen_new_estimator(bstats, cpu_bstats, rate_est, + lock, running, opt); } EXPORT_SYMBOL(gen_replace_estimator); /** * gen_estimator_active - test if estimator is currently in use - * @bstats: basic statistics - * @rate_est: rate estimator statistics + * @rate_est: rate estimator * * Returns true if estimator is active, and false if not. */ -bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est64 *rate_est) +bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est) { - bool res; + return !!rcu_access_pointer(*rate_est); +} +EXPORT_SYMBOL(gen_estimator_active); - ASSERT_RTNL(); +bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est, + struct gnet_stats_rate_est64 *sample) +{ + struct net_rate_estimator *est; + unsigned seq; - spin_lock_bh(&est_tree_lock); - res = gen_find_node(bstats, rate_est) != NULL; - spin_unlock_bh(&est_tree_lock); + rcu_read_lock(); + est = rcu_dereference(*rate_est); + if (!est) { + rcu_read_unlock(); + return false; + } + + do { + seq = read_seqcount_begin(&est->seq); + sample->bps = est->avbps >> 8; + sample->pps = est->avpps >> 8; + } while (read_seqcount_retry(&est->seq, seq)); - return res; + rcu_read_unlock(); + return true; } -EXPORT_SYMBOL(gen_estimator_active); +EXPORT_SYMBOL(gen_estimator_read); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 9d3d9e78397b..b71ccaec0991 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -1,16 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/gen_stats.c * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Authors: Thomas Graf <tgraf@suug.ch> * Jamal Hadi Salim * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * - * See Documentation/networking/gen_stats.txt + * See Documentation/networking/gen_stats.rst */ #include <linux/types.h> @@ -22,17 +18,21 @@ #include <linux/gen_stats.h> #include <net/netlink.h> #include <net/gen_stats.h> - +#include <net/sch_generic.h> static inline int -gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size) +gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) { - if (nla_put(d->skb, type, size, buf)) + if (nla_put_64bit(d->skb, type, size, buf, padattr)) goto nla_put_failure; return 0; nla_put_failure: - spin_unlock_bh(d->lock); + if (d->lock) + spin_unlock_bh(d->lock); + kfree(d->xstats); + d->xstats = NULL; + d->xstats_len = 0; return -1; } @@ -44,6 +44,7 @@ nla_put_failure: * @xstats_type: TLV type for backward compatibility xstats TLV * @lock: statistics lock * @d: dumping handle + * @padattr: padding attribute * * Initializes the dumping handle, grabs the statistic lock and appends * an empty TLV header to the socket buffer for use a container for all @@ -56,32 +57,48 @@ nla_put_failure: */ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, - int xstats_type, spinlock_t *lock, struct gnet_dump *d) + int xstats_type, spinlock_t *lock, + struct gnet_dump *d, int padattr) __acquires(lock) { memset(d, 0, sizeof(*d)); - spin_lock_bh(lock); - d->lock = lock; if (type) d->tail = (struct nlattr *)skb_tail_pointer(skb); d->skb = skb; d->compat_tc_stats = tc_stats_type; d->compat_xstats = xstats_type; + d->padattr = padattr; + if (lock) { + d->lock = lock; + spin_lock_bh(lock); + } + if (d->tail) { + int ret = gnet_stats_copy(d, type, NULL, 0, padattr); - if (d->tail) - return gnet_stats_copy(d, type, NULL, 0); + /* The initial attribute added in gnet_stats_copy() may be + * preceded by a padding attribute, in which case d->tail will + * end up pointing at the padding instead of the real attribute. + * Fix this so gnet_stats_finish_copy() adjusts the length of + * the right attribute. + */ + if (ret == 0 && d->tail->nla_type == padattr) + d->tail = (struct nlattr *)((char *)d->tail + + NLA_ALIGN(d->tail->nla_len)); + return ret; + } return 0; } EXPORT_SYMBOL(gnet_stats_start_copy_compat); /** - * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * gnet_stats_start_copy - start dumping procedure in compatibility mode * @skb: socket buffer to put statistics TLVs into * @type: TLV type for top level statistic TLV * @lock: statistics lock * @d: dumping handle + * @padattr: padding attribute * * Initializes the dumping handle, grabs the statistic lock and appends * an empty TLV header to the socket buffer for use a container for all @@ -91,16 +108,147 @@ EXPORT_SYMBOL(gnet_stats_start_copy_compat); */ int gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, - struct gnet_dump *d) + struct gnet_dump *d, int padattr) { - return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d); + return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d, padattr); } EXPORT_SYMBOL(gnet_stats_start_copy); +/* Must not be inlined, due to u64_stats seqcount_t lockdep key */ +void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b) +{ + u64_stats_set(&b->bytes, 0); + u64_stats_set(&b->packets, 0); + u64_stats_init(&b->syncp); +} +EXPORT_SYMBOL(gnet_stats_basic_sync_init); + +static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu) +{ + u64 t_bytes = 0, t_packets = 0; + int i; + + for_each_possible_cpu(i) { + struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + u64 bytes, packets; + + do { + start = u64_stats_fetch_begin(&bcpu->syncp); + bytes = u64_stats_read(&bcpu->bytes); + packets = u64_stats_read(&bcpu->packets); + } while (u64_stats_fetch_retry(&bcpu->syncp, start)); + + t_bytes += bytes; + t_packets += packets; + } + _bstats_update(bstats, t_bytes, t_packets); +} + +void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running) +{ + unsigned int start; + u64 bytes = 0; + u64 packets = 0; + + WARN_ON_ONCE((cpu || running) && in_hardirq()); + + if (cpu) { + gnet_stats_add_basic_cpu(bstats, cpu); + return; + } + do { + if (running) + start = u64_stats_fetch_begin(&b->syncp); + bytes = u64_stats_read(&b->bytes); + packets = u64_stats_read(&b->packets); + } while (running && u64_stats_fetch_retry(&b->syncp, start)); + + _bstats_update(bstats, bytes, packets); +} +EXPORT_SYMBOL(gnet_stats_add_basic); + +static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running) +{ + unsigned int start; + + if (cpu) { + u64 t_bytes = 0, t_packets = 0; + int i; + + for_each_possible_cpu(i) { + struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + u64 bytes, packets; + + do { + start = u64_stats_fetch_begin(&bcpu->syncp); + bytes = u64_stats_read(&bcpu->bytes); + packets = u64_stats_read(&bcpu->packets); + } while (u64_stats_fetch_retry(&bcpu->syncp, start)); + + t_bytes += bytes; + t_packets += packets; + } + *ret_bytes = t_bytes; + *ret_packets = t_packets; + return; + } + do { + if (running) + start = u64_stats_fetch_begin(&b->syncp); + *ret_bytes = u64_stats_read(&b->bytes); + *ret_packets = u64_stats_read(&b->packets); + } while (running && u64_stats_fetch_retry(&b->syncp, start)); +} + +static int +___gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + int type, bool running) +{ + u64 bstats_bytes, bstats_packets; + + gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running); + + if (d->compat_tc_stats && type == TCA_STATS_BASIC) { + d->tc_stats.bytes = bstats_bytes; + d->tc_stats.packets = bstats_packets; + } + + if (d->tail) { + struct gnet_stats_basic sb; + int res; + + memset(&sb, 0, sizeof(sb)); + sb.bytes = bstats_bytes; + sb.packets = bstats_packets; + res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); + if (res < 0 || sb.packets == bstats_packets) + return res; + /* emit 64bit stats only if needed */ + return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets, + sizeof(bstats_packets), TCA_STATS_PAD); + } + return 0; +} + /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV * @d: dumping handle + * @cpu: copy statistic per cpu * @b: basic statistics + * @running: true if @b represents a running qdisc, thus @b's + * internal values might change during basic reads. + * Only used if @cpu is NULL + * + * Context: task; must not be run from IRQ or BH contexts * * Appends the basic statistics to the top level TLV created by * gnet_stats_start_copy(). @@ -109,30 +257,46 @@ EXPORT_SYMBOL(gnet_stats_start_copy); * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + bool running) { - if (d->compat_tc_stats) { - d->tc_stats.bytes = b->bytes; - d->tc_stats.packets = b->packets; - } - - if (d->tail) { - struct gnet_stats_basic sb; - - memset(&sb, 0, sizeof(sb)); - sb.bytes = b->bytes; - sb.packets = b->packets; - return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); - } - return 0; + return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running); } EXPORT_SYMBOL(gnet_stats_copy_basic); /** - * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV + * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV * @d: dumping handle + * @cpu: copy statistic per cpu * @b: basic statistics - * @r: rate estimator statistics + * @running: true if @b represents a running qdisc, thus @b's + * internal values might change during basic reads. + * Only used if @cpu is NULL + * + * Context: task; must not be run from IRQ or BH contexts + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_basic_hw(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + bool running) +{ + return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running); +} +EXPORT_SYMBOL(gnet_stats_copy_basic_hw); + +/** + * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV + * @d: dumping handle + * @rate_est: rate estimator * * Appends the rate estimator statistics to the top level TLV created by * gnet_stats_start_copy(). @@ -142,18 +306,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); */ int gnet_stats_copy_rate_est(struct gnet_dump *d, - const struct gnet_stats_basic_packed *b, - struct gnet_stats_rate_est64 *r) + struct net_rate_estimator __rcu **rate_est) { + struct gnet_stats_rate_est64 sample; struct gnet_stats_rate_est est; int res; - if (b && !gen_estimator_active(b, r)) + if (!gen_estimator_read(rate_est, &sample)) return 0; - - est.bps = min_t(u64, UINT_MAX, r->bps); + est.bps = min_t(u64, UINT_MAX, sample.bps); /* we have some time before reaching 2^32 packets per second */ - est.pps = r->pps; + est.pps = sample.pps; if (d->compat_tc_stats) { d->tc_stats.bps = est.bps; @@ -161,40 +324,86 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, } if (d->tail) { - res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est)); - if (res < 0 || est.bps == r->bps) + res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est), + TCA_STATS_PAD); + if (res < 0 || est.bps == sample.bps) return res; /* emit 64bit stats only if needed */ - return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r)); + return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample, + sizeof(sample), TCA_STATS_PAD); } return 0; } EXPORT_SYMBOL(gnet_stats_copy_rate_est); +static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *q) +{ + int i; + + for_each_possible_cpu(i) { + const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); + + qstats->qlen += qcpu->qlen; + qstats->backlog += qcpu->backlog; + qstats->drops += qcpu->drops; + qstats->requeues += qcpu->requeues; + qstats->overlimits += qcpu->overlimits; + } +} + +void gnet_stats_add_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q) +{ + if (cpu) { + gnet_stats_add_queue_cpu(qstats, cpu); + } else { + qstats->qlen += q->qlen; + qstats->backlog += q->backlog; + qstats->drops += q->drops; + qstats->requeues += q->requeues; + qstats->overlimits += q->overlimits; + } +} +EXPORT_SYMBOL(gnet_stats_add_queue); + /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV * @d: dumping handle + * @cpu_q: per cpu queue statistics * @q: queue statistics + * @qlen: queue length statistics * * Appends the queue statistics to the top level TLV created by - * gnet_stats_start_copy(). + * gnet_stats_start_copy(). Using per cpu queue statistics if + * they are available. * * Returns 0 on success or -1 with the statistic lock released * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q) +gnet_stats_copy_queue(struct gnet_dump *d, + struct gnet_stats_queue __percpu *cpu_q, + struct gnet_stats_queue *q, __u32 qlen) { + struct gnet_stats_queue qstats = {0}; + + gnet_stats_add_queue(&qstats, cpu_q, q); + qstats.qlen = qlen; + if (d->compat_tc_stats) { - d->tc_stats.drops = q->drops; - d->tc_stats.qlen = q->qlen; - d->tc_stats.backlog = q->backlog; - d->tc_stats.overlimits = q->overlimits; + d->tc_stats.drops = qstats.drops; + d->tc_stats.qlen = qstats.qlen; + d->tc_stats.backlog = qstats.backlog; + d->tc_stats.overlimits = qstats.overlimits; } if (d->tail) - return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q)); + return gnet_stats_copy(d, TCA_STATS_QUEUE, + &qstats, sizeof(qstats), + TCA_STATS_PAD); return 0; } @@ -206,7 +415,7 @@ EXPORT_SYMBOL(gnet_stats_copy_queue); * @st: application specific statistics data * @len: length of data * - * Appends the application sepecific statistics to the top level TLV created by + * Appends the application specific statistics to the top level TLV created by * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping * handle is in backward compatibility mode. * @@ -217,14 +426,23 @@ int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) { if (d->compat_xstats) { - d->xstats = st; + d->xstats = kmemdup(st, len, GFP_ATOMIC); + if (!d->xstats) + goto err_out; d->xstats_len = len; } if (d->tail) - return gnet_stats_copy(d, TCA_STATS_APP, st, len); + return gnet_stats_copy(d, TCA_STATS_APP, st, len, + TCA_STATS_PAD); return 0; + +err_out: + if (d->lock) + spin_unlock_bh(d->lock); + d->xstats_len = 0; + return -1; } EXPORT_SYMBOL(gnet_stats_copy_app); @@ -248,16 +466,20 @@ gnet_stats_finish_copy(struct gnet_dump *d) if (d->compat_tc_stats) if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, - sizeof(d->tc_stats)) < 0) + sizeof(d->tc_stats), d->padattr) < 0) return -1; if (d->compat_xstats && d->xstats) { if (gnet_stats_copy(d, d->compat_xstats, d->xstats, - d->xstats_len) < 0) + d->xstats_len, d->padattr) < 0) return -1; } - spin_unlock_bh(d->lock); + if (d->lock) + spin_unlock_bh(d->lock); + kfree(d->xstats); + d->xstats = NULL; + d->xstats_len = 0; return 0; } EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/core/gro.c b/net/core/gro.c new file mode 100644 index 000000000000..76f9c3712422 --- /dev/null +++ b/net/core/gro.c @@ -0,0 +1,835 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <net/psp.h> +#include <net/gro.h> +#include <net/dst_metadata.h> +#include <net/busy_poll.h> +#include <trace/events/net.h> +#include <linux/skbuff_ref.h> + +#define MAX_GRO_SKBS 8 + +static DEFINE_SPINLOCK(offload_lock); + +/** + * dev_add_offload - register offload handlers + * @po: protocol offload declaration + * + * Add protocol offload handlers to the networking stack. The passed + * &proto_offload is linked into kernel lists and may not be freed until + * it has been removed from the kernel lists. + * + * This call does not sleep therefore it can not + * guarantee all CPU's that are in middle of receiving packets + * will see the new offload handlers (until the next received packet). + */ +void dev_add_offload(struct packet_offload *po) +{ + struct packet_offload *elem; + + spin_lock(&offload_lock); + list_for_each_entry(elem, &net_hotdata.offload_base, list) { + if (po->priority < elem->priority) + break; + } + list_add_rcu(&po->list, elem->list.prev); + spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(dev_add_offload); + +/** + * __dev_remove_offload - remove offload handler + * @po: packet offload declaration + * + * Remove a protocol offload handler that was previously added to the + * kernel offload handlers by dev_add_offload(). The passed &offload_type + * is removed from the kernel lists and can be freed or reused once this + * function returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. + */ +static void __dev_remove_offload(struct packet_offload *po) +{ + struct list_head *head = &net_hotdata.offload_base; + struct packet_offload *po1; + + spin_lock(&offload_lock); + + list_for_each_entry(po1, head, list) { + if (po == po1) { + list_del_rcu(&po->list); + goto out; + } + } + + pr_warn("dev_remove_offload: %p not found\n", po); +out: + spin_unlock(&offload_lock); +} + +/** + * dev_remove_offload - remove packet offload handler + * @po: packet offload declaration + * + * Remove a packet offload handler that was previously added to the kernel + * offload handlers by dev_add_offload(). The passed &offload_type is + * removed from the kernel lists and can be freed or reused once this + * function returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. + */ +void dev_remove_offload(struct packet_offload *po) +{ + __dev_remove_offload(po); + + synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_offload); + + +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) +{ + struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); + unsigned int offset = skb_gro_offset(skb); + unsigned int headlen = skb_headlen(skb); + unsigned int len = skb_gro_len(skb); + unsigned int delta_truesize; + unsigned int new_truesize; + struct sk_buff *lp; + int segs; + + /* Do not splice page pool based packets w/ non-page pool + * packets. This can result in reference count issues as page + * pool pages will not decrement the reference count and will + * instead be immediately returned to the pool or have frag + * count decremented. + */ + if (p->pp_recycle != skb->pp_recycle) + return -ETOOMANYREFS; + + if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) || + NAPI_GRO_CB(skb)->flush)) + return -E2BIG; + + if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { + if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || + (p->protocol == htons(ETH_P_IPV6) && + skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || + p->encapsulation) + return -E2BIG; + } + + segs = NAPI_GRO_CB(skb)->count; + lp = NAPI_GRO_CB(p)->last; + pinfo = skb_shinfo(lp); + + if (headlen <= offset) { + skb_frag_t *frag; + skb_frag_t *frag2; + int i = skbinfo->nr_frags; + int nr_frags = pinfo->nr_frags + i; + + if (nr_frags > MAX_SKB_FRAGS) + goto merge; + + offset -= headlen; + pinfo->nr_frags = nr_frags; + skbinfo->nr_frags = 0; + + frag = pinfo->frags + nr_frags; + frag2 = skbinfo->frags + i; + do { + *--frag = *--frag2; + } while (--i); + + skb_frag_off_add(frag, offset); + skb_frag_size_sub(frag, offset); + + /* all fragments truesize : remove (head size + sk_buff) */ + new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); + delta_truesize = skb->truesize - new_truesize; + + skb->truesize = new_truesize; + skb->len -= skb->data_len; + skb->data_len = 0; + + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; + goto done; + } else if (skb->head_frag) { + int nr_frags = pinfo->nr_frags; + skb_frag_t *frag = pinfo->frags + nr_frags; + struct page *page = virt_to_head_page(skb->head); + unsigned int first_size = headlen - offset; + unsigned int first_offset; + + if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) + goto merge; + + first_offset = skb->data - + (unsigned char *)page_address(page) + + offset; + + pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; + + skb_frag_fill_page_desc(frag, page, first_offset, first_size); + + memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); + /* We dont need to clear skbinfo->nr_frags here */ + + new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); + delta_truesize = skb->truesize - new_truesize; + skb->truesize = new_truesize; + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; + goto done; + } + +merge: + /* sk ownership - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; + skb->sk = NULL; + delta_truesize = skb->truesize; + if (offset > headlen) { + unsigned int eat = offset - headlen; + + skb_frag_off_add(&skbinfo->frags[0], eat); + skb_frag_size_sub(&skbinfo->frags[0], eat); + skb->data_len -= eat; + skb->len -= eat; + offset = headlen; + } + + __skb_pull(skb, offset); + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + NAPI_GRO_CB(p)->last = skb; + __skb_header_release(skb); + lp = p; + +done: + NAPI_GRO_CB(p)->count += segs; + p->data_len += len; + p->truesize += delta_truesize; + p->len += len; + if (lp != p) { + lp->data_len += len; + lp->truesize += delta_truesize; + lp->len += len; + } + NAPI_GRO_CB(skb)->same_flow = 1; + return 0; +} + +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) +{ + if (unlikely(p->len + skb->len >= 65536)) + return -E2BIG; + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + + skb_pull(skb, skb_gro_offset(skb)); + + NAPI_GRO_CB(p)->last = skb; + NAPI_GRO_CB(p)->count++; + p->data_len += skb->len; + + /* sk ownership - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; + skb->sk = NULL; + p->truesize += skb->truesize; + p->len += skb->len; + + NAPI_GRO_CB(skb)->same_flow = 1; + + return 0; +} + +static void gro_complete(struct gro_node *gro, struct sk_buff *skb) +{ + struct list_head *head = &net_hotdata.offload_base; + struct packet_offload *ptype; + __be16 type = skb->protocol; + int err = -ENOENT; + + BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); + + if (NAPI_GRO_CB(skb)->count == 1) { + skb_shinfo(skb)->gso_size = 0; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + + err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, 0); + break; + } + rcu_read_unlock(); + + if (err) { + WARN_ON(&ptype->list == head); + kfree_skb(skb); + return; + } + +out: + gro_normal_one(gro, skb, NAPI_GRO_CB(skb)->count); +} + +static void __gro_flush_chain(struct gro_node *gro, u32 index, bool flush_old) +{ + struct list_head *head = &gro->hash[index].list; + struct sk_buff *skb, *p; + + list_for_each_entry_safe_reverse(skb, p, head, list) { + if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) + return; + skb_list_del_init(skb); + gro_complete(gro, skb); + gro->hash[index].count--; + } + + if (!gro->hash[index].count) + __clear_bit(index, &gro->bitmask); +} + +/* + * gro->hash[].list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void __gro_flush(struct gro_node *gro, bool flush_old) +{ + unsigned long bitmask = gro->bitmask; + unsigned int i, base = ~0U; + + while ((i = ffs(bitmask)) != 0) { + bitmask >>= i; + base += i; + __gro_flush_chain(gro, base, flush_old); + } +} +EXPORT_SYMBOL(__gro_flush); + +static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb, + const struct sk_buff *p, + unsigned long diffs) +{ +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct tc_skb_ext *skb_ext; + struct tc_skb_ext *p_ext; + + skb_ext = skb_ext_find(skb, TC_SKB_EXT); + p_ext = skb_ext_find(p, TC_SKB_EXT); + + diffs |= (!!p_ext) ^ (!!skb_ext); + if (!diffs && unlikely(skb_ext)) + diffs |= p_ext->chain ^ skb_ext->chain; +#endif + return diffs; +} + +static void gro_list_prepare(const struct list_head *head, + const struct sk_buff *skb) +{ + unsigned int maclen = skb->dev->hard_header_len; + u32 hash = skb_get_hash_raw(skb); + struct sk_buff *p; + + list_for_each_entry(p, head, list) { + unsigned long diffs; + + if (hash != skb_get_hash_raw(p)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= p->vlan_all ^ skb->vlan_all; + diffs |= skb_metadata_differs(p, skb); + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_mac_header(skb), + maclen); + + /* in most common scenarios 'slow_gro' is 0 + * otherwise we are already on some slower paths + * either skip all the infrequent tests altogether or + * avoid trying too hard to skip each of them individually + */ + if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { + diffs |= p->sk != skb->sk; + diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); + + diffs |= gro_list_prepare_tc_ext(skb, p, diffs); + diffs |= __psp_skb_coalesce_diff(skb, p, diffs); + } + + NAPI_GRO_CB(p)->same_flow = !diffs; + } +} + +static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) +{ + const struct skb_shared_info *pinfo; + const skb_frag_t *frag0; + unsigned int headlen; + + NAPI_GRO_CB(skb)->network_offset = 0; + NAPI_GRO_CB(skb)->data_offset = 0; + headlen = skb_headlen(skb); + NAPI_GRO_CB(skb)->frag0 = skb->data; + NAPI_GRO_CB(skb)->frag0_len = headlen; + if (headlen) + return; + + pinfo = skb_shinfo(skb); + frag0 = &pinfo->frags[0]; + + if (pinfo->nr_frags && skb_frag_page(frag0) && + !PageHighMem(skb_frag_page(frag0)) && + (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, + skb_frag_size(frag0), + skb->end - skb->tail); + } +} + +static void gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ + struct skb_shared_info *pinfo = skb_shinfo(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->data_len -= grow; + skb->tail += grow; + + skb_frag_off_add(&pinfo->frags[0], grow); + skb_frag_size_sub(&pinfo->frags[0], grow); + + if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { + skb_frag_unref(skb, 0); + memmove(pinfo->frags, pinfo->frags + 1, + --pinfo->nr_frags * sizeof(pinfo->frags[0])); + } +} + +static void gro_try_pull_from_frag0(struct sk_buff *skb) +{ + int grow = skb_gro_offset(skb) - skb_headlen(skb); + + if (grow > 0) + gro_pull_from_frag0(skb, grow); +} + +static void gro_flush_oldest(struct gro_node *gro, struct list_head *head) +{ + struct sk_buff *oldest; + + oldest = list_last_entry(head, struct sk_buff, list); + + /* We are called with head length >= MAX_GRO_SKBS, so this is + * impossible. + */ + if (WARN_ON_ONCE(!oldest)) + return; + + /* Do not adjust napi->gro_hash[].count, caller is adding a new + * SKB to the chain. + */ + skb_list_del_init(oldest); + gro_complete(gro, oldest); +} + +static enum gro_result dev_gro_receive(struct gro_node *gro, + struct sk_buff *skb) +{ + u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct list_head *head = &net_hotdata.offload_base; + struct gro_list *gro_list = &gro->hash[bucket]; + struct packet_offload *ptype; + __be16 type = skb->protocol; + struct sk_buff *pp = NULL; + enum gro_result ret; + int same_flow; + + if (netif_elide_gro(skb->dev)) + goto normal; + + gro_list_prepare(&gro_list->list, skb); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type == type && ptype->callbacks.gro_receive) + goto found_ptype; + } + rcu_read_unlock(); + goto normal; + +found_ptype: + skb_set_network_header(skb, skb_gro_offset(skb)); + skb_reset_mac_len(skb); + BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32)); + BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), + sizeof(u32))); /* Avoid slow unaligned acc */ + *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); + NAPI_GRO_CB(skb)->count = 1; + if (unlikely(skb_is_gso(skb))) { + NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; + /* Only support TCP and non DODGY users. */ + if (!skb_is_gso_tcp(skb) || + (skb_shinfo(skb)->gso_type & SKB_GSO_DODGY)) + NAPI_GRO_CB(skb)->flush = 1; + } + + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + break; + } + + pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + &gro_list->list, skb); + + rcu_read_unlock(); + + if (PTR_ERR(pp) == -EINPROGRESS) { + ret = GRO_CONSUMED; + goto ok; + } + + same_flow = NAPI_GRO_CB(skb)->same_flow; + ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; + + if (pp) { + skb_list_del_init(pp); + gro_complete(gro, pp); + gro_list->count--; + } + + if (same_flow) + goto ok; + + if (NAPI_GRO_CB(skb)->flush) + goto normal; + + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) + gro_flush_oldest(gro, &gro_list->list); + else + gro_list->count++; + + /* Must be called before setting NAPI_GRO_CB(skb)->{age|last} */ + gro_try_pull_from_frag0(skb); + NAPI_GRO_CB(skb)->age = jiffies; + NAPI_GRO_CB(skb)->last = skb; + if (!skb_is_gso(skb)) + skb_shinfo(skb)->gso_size = skb_gro_len(skb); + list_add(&skb->list, &gro_list->list); + ret = GRO_HELD; +ok: + if (gro_list->count) { + if (!test_bit(bucket, &gro->bitmask)) + __set_bit(bucket, &gro->bitmask); + } else if (test_bit(bucket, &gro->bitmask)) { + __clear_bit(bucket, &gro->bitmask); + } + + return ret; + +normal: + ret = GRO_NORMAL; + gro_try_pull_from_frag0(skb); + goto ok; +} + +struct packet_offload *gro_find_receive_by_type(__be16 type) +{ + struct list_head *offload_head = &net_hotdata.offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_receive_by_type); + +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ + struct list_head *offload_head = &net_hotdata.offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_complete_by_type); + +static gro_result_t gro_skb_finish(struct gro_node *gro, struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case GRO_NORMAL: + gro_normal_one(gro, skb, 1); + break; + + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __napi_kfree_skb(skb, SKB_CONSUMED); + break; + + case GRO_HELD: + case GRO_MERGED: + case GRO_CONSUMED: + break; + } + + return ret; +} + +gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb) +{ + gro_result_t ret; + + __skb_mark_napi_id(skb, gro); + trace_napi_gro_receive_entry(skb); + + skb_gro_reset_offset(skb, 0); + + ret = gro_skb_finish(gro, skb, dev_gro_receive(gro, skb)); + trace_napi_gro_receive_exit(ret); + + return ret; +} +EXPORT_SYMBOL(gro_receive_skb); + +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +{ + struct skb_shared_info *shinfo; + + if (unlikely(skb->pfmemalloc)) { + consume_skb(skb); + return; + } + __skb_pull(skb, skb_headlen(skb)); + /* restore the reserve we had after netdev_alloc_skb_ip_align() */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); + __vlan_hwaccel_clear_tag(skb); + skb->dev = napi->dev; + skb->skb_iif = 0; + + /* eth_type_trans() assumes pkt_type is PACKET_HOST */ + skb->pkt_type = PACKET_HOST; + + skb->encapsulation = 0; + skb->ip_summed = CHECKSUM_NONE; + + shinfo = skb_shinfo(skb); + shinfo->gso_type = 0; + shinfo->gso_size = 0; + shinfo->hwtstamps.hwtstamp = 0; + + if (unlikely(skb->slow_gro)) { + skb_orphan(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); + skb->slow_gro = 0; + } + + napi->skb = skb; +} + +struct sk_buff *napi_get_frags(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + + if (!skb) { + skb = napi_alloc_skb(napi, GRO_MAX_HEAD); + if (skb) { + napi->skb = skb; + skb_mark_napi_id(skb, napi); + } + } + return skb; +} +EXPORT_SYMBOL(napi_get_frags); + +static gro_result_t napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case GRO_NORMAL: + case GRO_HELD: + __skb_push(skb, ETH_HLEN); + skb->protocol = eth_type_trans(skb, skb->dev); + if (ret == GRO_NORMAL) + gro_normal_one(&napi->gro, skb, 1); + break; + + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else + napi_reuse_skb(napi, skb); + break; + + case GRO_MERGED: + case GRO_CONSUMED: + break; + } + + return ret; +} + +/* Upper GRO stack assumes network header starts at gro_offset=0 + * Drivers could call both napi_gro_frags() and napi_gro_receive() + * We copy ethernet header into skb->data to have a common layout. + */ +static struct sk_buff *napi_frags_skb(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + const struct ethhdr *eth; + unsigned int hlen = sizeof(*eth); + + napi->skb = NULL; + + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb, hlen); + + if (unlikely(!skb_gro_may_pull(skb, hlen))) { + eth = skb_gro_header_slow(skb, hlen, 0); + if (unlikely(!eth)) { + net_warn_ratelimited("%s: dropping impossible skb from %s\n", + __func__, napi->dev->name); + napi_reuse_skb(napi, skb); + return NULL; + } + } else { + eth = (const struct ethhdr *)skb->data; + + if (NAPI_GRO_CB(skb)->frag0 != skb->data) + gro_pull_from_frag0(skb, hlen); + + NAPI_GRO_CB(skb)->frag0 += hlen; + NAPI_GRO_CB(skb)->frag0_len -= hlen; + } + __skb_pull(skb, hlen); + + /* + * This works because the only protocols we care about don't require + * special handling. + * We'll fix it up properly in napi_frags_finish() + */ + skb->protocol = eth->h_proto; + + return skb; +} + +gro_result_t napi_gro_frags(struct napi_struct *napi) +{ + gro_result_t ret; + struct sk_buff *skb = napi_frags_skb(napi); + + trace_napi_gro_frags_entry(skb); + + ret = napi_frags_finish(napi, skb, dev_gro_receive(&napi->gro, skb)); + trace_napi_gro_frags_exit(ret); + + return ret; +} +EXPORT_SYMBOL(napi_gro_frags); + +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. + */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + /* See comments in __skb_checksum_complete(). */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); + +void gro_init(struct gro_node *gro) +{ + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { + INIT_LIST_HEAD(&gro->hash[i].list); + gro->hash[i].count = 0; + } + + gro->bitmask = 0; + gro->cached_napi_id = 0; + + INIT_LIST_HEAD(&gro->rx_list); + gro->rx_count = 0; +} + +void gro_cleanup(struct gro_node *gro) +{ + struct sk_buff *skb, *n; + + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { + list_for_each_entry_safe(skb, n, &gro->hash[i].list, list) + kfree_skb(skb); + + gro->hash[i].count = 0; + } + + gro->bitmask = 0; + gro->cached_napi_id = 0; + + list_for_each_entry_safe(skb, n, &gro->rx_list, list) + kfree_skb(skb); + + gro->rx_count = 0; +} diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c new file mode 100644 index 000000000000..a725d21159a6 --- /dev/null +++ b/net/core/gro_cells.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#include <net/gro_cells.h> +#include <net/hotdata.h> + +struct gro_cell { + struct sk_buff_head napi_skbs; + struct napi_struct napi; + local_lock_t bh_lock; +}; + +int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + bool have_bh_lock = false; + struct gro_cell *cell; + int res; + + rcu_read_lock(); + if (unlikely(!(dev->flags & IFF_UP))) + goto drop; + + if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) { + res = netif_rx(skb); + goto unlock; + } + + local_lock_nested_bh(&gcells->cells->bh_lock); + have_bh_lock = true; + cell = this_cpu_ptr(gcells->cells); + + if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) { +drop: + dev_core_stats_rx_dropped_inc(dev); + kfree_skb(skb); + res = NET_RX_DROP; + goto unlock; + } + + __skb_queue_tail(&cell->napi_skbs, skb); + if (skb_queue_len(&cell->napi_skbs) == 1) + napi_schedule(&cell->napi); + + res = NET_RX_SUCCESS; + +unlock: + if (have_bh_lock) + local_unlock_nested_bh(&gcells->cells->bh_lock); + rcu_read_unlock(); + return res; +} +EXPORT_SYMBOL(gro_cells_receive); + +/* called under BH context */ +static int gro_cell_poll(struct napi_struct *napi, int budget) +{ + struct gro_cell *cell = container_of(napi, struct gro_cell, napi); + struct sk_buff *skb; + int work_done = 0; + + while (work_done < budget) { + __local_lock_nested_bh(&cell->bh_lock); + skb = __skb_dequeue(&cell->napi_skbs); + __local_unlock_nested_bh(&cell->bh_lock); + if (!skb) + break; + napi_gro_receive(napi, skb); + work_done++; + } + + if (work_done < budget) + napi_complete_done(napi, work_done); + return work_done; +} + +int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) +{ + int i; + + gcells->cells = alloc_percpu(struct gro_cell); + if (!gcells->cells) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); + + __skb_queue_head_init(&cell->napi_skbs); + local_lock_init(&cell->bh_lock); + + set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); + + netif_napi_add(dev, &cell->napi, gro_cell_poll); + napi_enable(&cell->napi); + } + return 0; +} +EXPORT_SYMBOL(gro_cells_init); + +struct percpu_free_defer { + struct rcu_head rcu; + void __percpu *ptr; +}; + +static void percpu_free_defer_callback(struct rcu_head *head) +{ + struct percpu_free_defer *defer; + + defer = container_of(head, struct percpu_free_defer, rcu); + free_percpu(defer->ptr); + kfree(defer); +} + +void gro_cells_destroy(struct gro_cells *gcells) +{ + struct percpu_free_defer *defer; + int i; + + if (!gcells->cells) + return; + for_each_possible_cpu(i) { + struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); + + napi_disable(&cell->napi); + __netif_napi_del(&cell->napi); + __skb_queue_purge(&cell->napi_skbs); + } + /* We need to observe an rcu grace period before freeing ->cells, + * because netpoll could access dev->napi_list under rcu protection. + * Try hard using call_rcu() instead of synchronize_rcu(), + * because we might be called from cleanup_net(), and we + * definitely do not want to block this critical task. + */ + defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN); + if (likely(defer)) { + defer->ptr = gcells->cells; + call_rcu(&defer->rcu, percpu_free_defer_callback); + } else { + /* We do not hold RTNL at this point, synchronize_net() + * would not be able to expedite this sync. + */ + synchronize_rcu_expedited(); + free_percpu(gcells->cells); + } + gcells->cells = NULL; +} +EXPORT_SYMBOL(gro_cells_destroy); diff --git a/net/core/gso.c b/net/core/gso.c new file mode 100644 index 000000000000..bcd156372f4d --- /dev/null +++ b/net/core/gso.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/skbuff.h> +#include <linux/sctp.h> +#include <net/gso.h> +#include <net/gro.h> + +/** + * skb_eth_gso_segment - segmentation handler for ethernet protocols. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @type: Ethernet Protocol ID + */ +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + return segs; +} +EXPORT_SYMBOL(skb_eth_gso_segment); + +/** + * skb_mac_gso_segment - mac layer segmentation handler. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); + + if (unlikely(!type)) + return ERR_PTR(-EINVAL); + + __skb_pull(skb, vlan_depth); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + __skb_push(skb, skb->data - skb_mac_header(skb)); + + return segs; +} +EXPORT_SYMBOL(skb_mac_gso_segment); +/* openvswitch calls this on rx path, so we need a different check. + */ +static bool skb_needs_check(const struct sk_buff *skb, bool tx_path) +{ + if (tx_path) + return skb->ip_summed != CHECKSUM_PARTIAL && + skb->ip_summed != CHECKSUM_UNNECESSARY; + + return skb->ip_summed == CHECKSUM_NONE; +} + +/** + * __skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @tx_path: whether it is called in TX path + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. + * + * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. + */ +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, + netdev_features_t features, bool tx_path) +{ + struct sk_buff *segs; + + if (unlikely(skb_needs_check(skb, tx_path))) { + int err; + + /* We're going to init ->check field in TCP or UDP header */ + err = skb_cow_head(skb, 0); + if (err < 0) + return ERR_PTR(err); + } + + /* Only report GSO partial support if it will enable us to + * support segmentation on this frame without needing additional + * work. + */ + if (features & NETIF_F_GSO_PARTIAL) { + netdev_features_t partial_features = NETIF_F_GSO_ROBUST; + struct net_device *dev = skb->dev; + + partial_features |= dev->features & dev->gso_partial_features; + if (!skb_gso_ok(skb, features | partial_features)) + features &= ~NETIF_F_GSO_PARTIAL; + } + + BUILD_BUG_ON(SKB_GSO_CB_OFFSET + + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); + SKB_GSO_CB(skb)->encap_level = 0; + + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + segs = skb_mac_gso_segment(skb, features); + + if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) + skb_warn_bad_offload(skb); + + return segs; +} +EXPORT_SYMBOL(__skb_gso_segment); + +/** + * skb_gso_transport_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_transport_seglen is used to determine the real size of the + * individual segments, including Layer4 headers (TCP/UDP). + * + * The MAC/L2 or network (IP, IPv6) headers are not accounted for. + */ +static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int thlen = 0; + + if (skb->encapsulation) { + thlen = skb_inner_transport_header(skb) - + skb_transport_header(skb); + + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + thlen += inner_tcp_hdrlen(skb); + } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + thlen = tcp_hdrlen(skb); + } else if (unlikely(skb_is_gso_sctp(skb))) { + thlen = sizeof(struct sctphdr); + } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { + thlen = sizeof(struct udphdr); + } + /* UFO sets gso_size to the size of the fragmentation + * payload, i.e. the size of the L4 (UDP) header is already + * accounted for. + */ + return thlen + shinfo->gso_size; +} + +/** + * skb_gso_network_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_network_seglen is used to determine the real size of the + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). + * + * The MAC/L2 header is not accounted for. + */ +static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - + skb_network_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_mac_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_mac_seglen is used to determine the real size of the + * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 + * headers (TCP/UDP). + */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS + * + * There are a couple of instances where we have a GSO skb, and we + * want to determine what size it would be after it is segmented. + * + * We might want to check: + * - L3+L4+payload size (e.g. IP forwarding) + * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) + * + * This is a helper to do that correctly considering GSO_BY_FRAGS. + * + * @skb: GSO skb + * + * @seg_len: The segmented length (from skb_gso_*_seglen). In the + * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. + * + * @max_len: The maximum permissible length. + * + * Returns true if the segmented length <= max length. + */ +static inline bool skb_gso_size_check(const struct sk_buff *skb, + unsigned int seg_len, + unsigned int max_len) { + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const struct sk_buff *iter; + + if (shinfo->gso_size != GSO_BY_FRAGS) + return seg_len <= max_len; + + /* Undo this so we can re-use header sizes */ + seg_len -= GSO_BY_FRAGS; + + skb_walk_frags(skb, iter) { + if (seg_len + skb_headlen(iter) > max_len) + return false; + } + + return true; +} + +/** + * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? + * + * @skb: GSO skb + * @mtu: MTU to validate against + * + * skb_gso_validate_network_len validates if a given skb will fit a + * wanted MTU once split. It considers L3 headers, L4 headers, and the + * payload. + */ +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) +{ + return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); + +/** + * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? + * + * @skb: GSO skb + * @len: length to validate against + * + * skb_gso_validate_mac_len validates if a given skb will fit a wanted + * length once split, including L2, L3 and L4 headers and the payload. + */ +bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) +{ + return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); + diff --git a/net/core/hotdata.c b/net/core/hotdata.c new file mode 100644 index 000000000000..dddd5c287cf0 --- /dev/null +++ b/net/core/hotdata.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/cache.h> +#include <linux/jiffies.h> +#include <linux/list.h> +#include <net/aligned_data.h> +#include <net/hotdata.h> +#include <net/ip.h> +#include <net/proto_memory.h> + +struct net_hotdata net_hotdata __cacheline_aligned = { + .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), + .gro_normal_batch = 8, + + .netdev_budget = 300, + /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ + .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ, + + .tstamp_prequeue = 1, + .max_backlog = 1000, + .dev_tx_weight = 64, + .dev_rx_weight = 64, + .sysctl_max_skb_frags = MAX_SKB_FRAGS, + .sysctl_skb_defer_max = 128, + .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE +}; +EXPORT_SYMBOL(net_hotdata); + +struct net_aligned_data net_aligned_data; +EXPORT_IPV6_MOD(net_aligned_data); diff --git a/net/core/hwbm.c b/net/core/hwbm.c new file mode 100644 index 000000000000..ac1a66df9adc --- /dev/null +++ b/net/core/hwbm.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Support for hardware buffer manager. + * + * Copyright (C) 2016 Marvell + * + * Gregory CLEMENT <gregory.clement@free-electrons.com> + */ +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/skbuff.h> +#include <net/hwbm.h> + +void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf) +{ + if (likely(bm_pool->frag_size <= PAGE_SIZE)) + skb_free_frag(buf); + else + kfree(buf); +} +EXPORT_SYMBOL_GPL(hwbm_buf_free); + +/* Refill processing for HW buffer management */ +int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp) +{ + int frag_size = bm_pool->frag_size; + void *buf; + + if (likely(frag_size <= PAGE_SIZE)) + buf = netdev_alloc_frag(frag_size); + else + buf = kmalloc(frag_size, gfp); + + if (!buf) + return -ENOMEM; + + if (bm_pool->construct) + if (bm_pool->construct(bm_pool, buf)) { + hwbm_buf_free(bm_pool, buf); + return -ENOMEM; + } + + return 0; +} +EXPORT_SYMBOL_GPL(hwbm_pool_refill); + +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num) +{ + int err, i; + + mutex_lock(&bm_pool->buf_lock); + if (bm_pool->buf_num == bm_pool->size) { + pr_warn("pool already filled\n"); + mutex_unlock(&bm_pool->buf_lock); + return bm_pool->buf_num; + } + + if (buf_num + bm_pool->buf_num > bm_pool->size) { + pr_warn("cannot allocate %d buffers for pool\n", + buf_num); + mutex_unlock(&bm_pool->buf_lock); + return 0; + } + + if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) { + pr_warn("Adding %d buffers to the %d current buffers will overflow\n", + buf_num, bm_pool->buf_num); + mutex_unlock(&bm_pool->buf_lock); + return 0; + } + + for (i = 0; i < buf_num; i++) { + err = hwbm_pool_refill(bm_pool, GFP_KERNEL); + if (err < 0) + break; + } + + /* Update BM driver with number of buffers added to pool */ + bm_pool->buf_num += i; + + pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num); + mutex_unlock(&bm_pool->buf_lock); + + return i; +} +EXPORT_SYMBOL_GPL(hwbm_pool_add); diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c new file mode 100644 index 000000000000..669b357b73b2 --- /dev/null +++ b/net/core/ieee8021q_helpers.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> + +#include <linux/array_size.h> +#include <linux/printk.h> +#include <linux/types.h> +#include <net/dscp.h> +#include <net/ieee8021q.h> + +/* verify that table covers all 8 traffic types */ +#define TT_MAP_SIZE_OK(tbl) \ + compiletime_assert(ARRAY_SIZE(tbl) == IEEE8021Q_TT_MAX, \ + #tbl " size mismatch") + +/* The following arrays map Traffic Types (TT) to traffic classes (TC) for + * different number of queues as shown in the example provided by + * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and + * Table I-1 "Traffic type to traffic class mapping". + */ +static const u8 ieee8021q_8queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, + [IEEE8021Q_TT_CA] = 3, + [IEEE8021Q_TT_VI] = 4, + [IEEE8021Q_TT_VO] = 5, + [IEEE8021Q_TT_IC] = 6, + [IEEE8021Q_TT_NC] = 7, +}; + +static const u8 ieee8021q_7queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, + [IEEE8021Q_TT_CA] = 3, + [IEEE8021Q_TT_VI] = 4, [IEEE8021Q_TT_VO] = 4, + [IEEE8021Q_TT_IC] = 5, + [IEEE8021Q_TT_NC] = 6, +}; + +static const u8 ieee8021q_6queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, [IEEE8021Q_TT_CA] = 2, + [IEEE8021Q_TT_VI] = 3, [IEEE8021Q_TT_VO] = 3, + [IEEE8021Q_TT_IC] = 4, + [IEEE8021Q_TT_NC] = 5, +}; + +static const u8 ieee8021q_5queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, + [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, + [IEEE8021Q_TT_IC] = 3, + [IEEE8021Q_TT_NC] = 4, +}; + +static const u8 ieee8021q_4queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, + [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, + [IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3, +}; + +static const u8 ieee8021q_3queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, + [IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2, +}; + +static const u8 ieee8021q_2queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, + [IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1, +}; + +static const u8 ieee8021q_1queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0, + [IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0, +}; + +/** + * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class + * @tt: IEEE 802.1Q Traffic Type + * @num_queues: Number of queues + * + * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based + * on the number of queues configured on the NIC. The mapping is based on the + * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic + * class mapping" and Table I-1 "Traffic type to traffic class mapping". + * + * Return: Traffic Class corresponding to the given Traffic Type or negative + * value in case of error. + */ +int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues) +{ + if (tt < 0 || tt >= IEEE8021Q_TT_MAX) { + pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt, + IEEE8021Q_TT_MAX); + return -EINVAL; + } + + switch (num_queues) { + case 8: + TT_MAP_SIZE_OK(ieee8021q_8queue_tt_tc_map); + return ieee8021q_8queue_tt_tc_map[tt]; + case 7: + TT_MAP_SIZE_OK(ieee8021q_7queue_tt_tc_map); + return ieee8021q_7queue_tt_tc_map[tt]; + case 6: + TT_MAP_SIZE_OK(ieee8021q_6queue_tt_tc_map); + return ieee8021q_6queue_tt_tc_map[tt]; + case 5: + TT_MAP_SIZE_OK(ieee8021q_5queue_tt_tc_map); + return ieee8021q_5queue_tt_tc_map[tt]; + case 4: + TT_MAP_SIZE_OK(ieee8021q_4queue_tt_tc_map); + return ieee8021q_4queue_tt_tc_map[tt]; + case 3: + TT_MAP_SIZE_OK(ieee8021q_3queue_tt_tc_map); + return ieee8021q_3queue_tt_tc_map[tt]; + case 2: + TT_MAP_SIZE_OK(ieee8021q_2queue_tt_tc_map); + return ieee8021q_2queue_tt_tc_map[tt]; + case 1: + TT_MAP_SIZE_OK(ieee8021q_1queue_tt_tc_map); + return ieee8021q_1queue_tt_tc_map[tt]; + } + + pr_err("Invalid number of queues %d\n", num_queues); + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc); + +/** + * ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type + * @dscp: IETF DSCP value + * + * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT). + * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic + * Type, this function is inspired by the RFC8325 documentation which describe + * the mapping between DSCP and 802.11 User Priority (UP) values. + * + * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value + */ +int ietf_dscp_to_ieee8021q_tt(u8 dscp) +{ + switch (dscp) { + case DSCP_CS0: + /* Comment from RFC8325: + * [RFC4594], Section 4.8, recommends High-Throughput Data be marked + * AF1x (that is, AF11, AF12, and AF13, according to the rules defined + * in [RFC2475]). + * + * By default (as described in Section 2.3), High-Throughput Data will + * map to UP 1 and, thus, to the Background Access Category (AC_BK), + * which is contrary to the intent expressed in [RFC4594]. + + * Unfortunately, there really is no corresponding fit for the High- + * Throughput Data service class within the constrained 4 Access + * Category [IEEE.802.11-2016] model. If the High-Throughput Data + * service class is assigned to the Best Effort Access Category (AC_BE), + * then it would contend with Low-Latency Data (while [RFC4594] + * recommends a distinction in servicing between these service classes) + * as well as with the default service class; alternatively, if it is + * assigned to the Background Access Category (AC_BK), then it would + * receive a less-then-best-effort service and contend with Low-Priority + * Data (as discussed in Section 4.2.10). + * + * As such, since there is no directly corresponding fit for the High- + * Throughout Data service class within the [IEEE.802.11-2016] model, it + * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby + * admitting it to the Best Effort Access Category (AC_BE). + * + * Note: The above text is from RFC8325 which is describing the mapping + * between DSCP and 802.11 User Priority (UP) values. The mapping + * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but + * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q + * Traffic Types BE and BK. + */ + case DSCP_AF11: + case DSCP_AF12: + case DSCP_AF13: + return IEEE8021Q_TT_BE; + /* Comment from RFC8325: + * RFC3662 and RFC4594 both recommend Low-Priority Data be marked + * with DSCP CS1. The Low-Priority Data service class loosely + * corresponds to the [IEEE.802.11-2016] Background Access Category + */ + case DSCP_CS1: + return IEEE8021Q_TT_BK; + case DSCP_CS2: + case DSCP_AF21: + case DSCP_AF22: + case DSCP_AF23: + return IEEE8021Q_TT_EE; + case DSCP_CS3: + case DSCP_AF31: + case DSCP_AF32: + case DSCP_AF33: + return IEEE8021Q_TT_CA; + case DSCP_CS4: + case DSCP_AF41: + case DSCP_AF42: + case DSCP_AF43: + return IEEE8021Q_TT_VI; + case DSCP_CS5: + case DSCP_EF: + case DSCP_VOICE_ADMIT: + return IEEE8021Q_TT_VO; + case DSCP_CS6: + return IEEE8021Q_TT_IC; + case DSCP_CS7: + return IEEE8021Q_TT_NC; + } + + return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp); +} +EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt); diff --git a/net/core/iovec.c b/net/core/iovec.c deleted file mode 100644 index de178e462682..000000000000 --- a/net/core/iovec.c +++ /dev/null @@ -1,214 +0,0 @@ -/* - * iovec manipulation routines. - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Fixes: - * Andrew Lunn : Errors in iovec copying. - * Pedro Roque : Added memcpy_fromiovecend and - * csum_..._fromiovecend. - * Andi Kleen : fixed error handling for 2.1 - * Alexey Kuznetsov: 2.1 optimisations - * Andi Kleen : Fix csum*fromiovecend for IPv6. - */ - -#include <linux/errno.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/net.h> -#include <linux/in6.h> -#include <asm/uaccess.h> -#include <asm/byteorder.h> -#include <net/checksum.h> -#include <net/sock.h> - -/* - * Verify iovec. The caller must ensure that the iovec is big enough - * to hold the message iovec. - * - * Save time not doing access_ok. copy_*_user will make this work - * in any case. - */ - -int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode) -{ - int size, ct, err; - - if (m->msg_namelen) { - if (mode == VERIFY_READ) { - void __user *namep; - namep = (void __user __force *) m->msg_name; - err = move_addr_to_kernel(namep, m->msg_namelen, - address); - if (err < 0) - return err; - } - m->msg_name = address; - } else { - m->msg_name = NULL; - } - - size = m->msg_iovlen * sizeof(struct iovec); - if (copy_from_user(iov, (void __user __force *) m->msg_iov, size)) - return -EFAULT; - - m->msg_iov = iov; - err = 0; - - for (ct = 0; ct < m->msg_iovlen; ct++) { - size_t len = iov[ct].iov_len; - - if (len > INT_MAX - err) { - len = INT_MAX - err; - iov[ct].iov_len = len; - } - err += len; - } - - return err; -} - -/* - * Copy kernel to iovec. Returns -EFAULT on error. - */ - -int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, - int offset, int len) -{ - int copy; - for (; len > 0; ++iov) { - /* Skip over the finished iovecs */ - if (unlikely(offset >= iov->iov_len)) { - offset -= iov->iov_len; - continue; - } - copy = min_t(unsigned int, iov->iov_len - offset, len); - if (copy_to_user(iov->iov_base + offset, kdata, copy)) - return -EFAULT; - offset = 0; - kdata += copy; - len -= copy; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_toiovecend); - -/* - * Copy iovec from kernel. Returns -EFAULT on error. - */ - -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, - int offset, int len) -{ - /* Skip over the finished iovecs */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - } - - while (len > 0) { - u8 __user *base = iov->iov_base + offset; - int copy = min_t(unsigned int, len, iov->iov_len - offset); - - offset = 0; - if (copy_from_user(kdata, base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovecend); - -/* - * And now for the all-in-one: copy and checksum from a user iovec - * directly to a datagram - * Calls to csum_partial but the last must be in 32 bit chunks - * - * ip_build_xmit must ensure that when fragmenting only the last - * call to this function will be unaligned also. - */ -int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, - int offset, unsigned int len, __wsum *csump) -{ - __wsum csum = *csump; - int partial_cnt = 0, err = 0; - - /* Skip over the finished iovecs */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - } - - while (len > 0) { - u8 __user *base = iov->iov_base + offset; - int copy = min_t(unsigned int, len, iov->iov_len - offset); - - offset = 0; - - /* There is a remnant from previous iov. */ - if (partial_cnt) { - int par_len = 4 - partial_cnt; - - /* iov component is too short ... */ - if (par_len > copy) { - if (copy_from_user(kdata, base, copy)) - goto out_fault; - kdata += copy; - base += copy; - partial_cnt += copy; - len -= copy; - iov++; - if (len) - continue; - *csump = csum_partial(kdata - partial_cnt, - partial_cnt, csum); - goto out; - } - if (copy_from_user(kdata, base, par_len)) - goto out_fault; - csum = csum_partial(kdata - partial_cnt, 4, csum); - kdata += par_len; - base += par_len; - copy -= par_len; - len -= par_len; - partial_cnt = 0; - } - - if (len > copy) { - partial_cnt = copy % 4; - if (partial_cnt) { - copy -= partial_cnt; - if (copy_from_user(kdata + copy, base + copy, - partial_cnt)) - goto out_fault; - } - } - - if (copy) { - csum = csum_and_copy_from_user(base, kdata, copy, - csum, &err); - if (err) - goto out; - } - len -= copy + partial_cnt; - kdata += copy + partial_cnt; - iov++; - } - *csump = csum; -out: - return err; - -out_fault: - err = -EFAULT; - goto out; -} -EXPORT_SYMBOL(csum_partial_copy_fromiovecend); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 9c3a839322ba..212cde35affa 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -1,14 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux network device link state notification * * Author: * Stefan Rompf <sux@loplof.de> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * */ #include <linux/module.h> @@ -21,8 +16,9 @@ #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/bitops.h> -#include <asm/types.h> +#include <linux/types.h> +#include "dev.h" enum lw_bits { LW_URGENT = 0, @@ -37,11 +33,37 @@ static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); -static unsigned char default_operstate(const struct net_device *dev) +static unsigned int default_operstate(const struct net_device *dev) { - if (!netif_carrier_ok(dev)) - return (dev->ifindex != dev->iflink ? - IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); + if (netif_testing(dev)) + return IF_OPER_TESTING; + + /* Some uppers (DSA) have additional sources for being down, so + * first check whether lower is indeed the source of its down state. + */ + if (!netif_carrier_ok(dev)) { + struct net_device *peer; + int iflink; + + /* If called from netdev_run_todo()/linkwatch_sync_dev(), + * dev_net(dev) can be already freed, and RTNL is not held. + */ + if (dev->reg_state <= NETREG_REGISTERED) + iflink = dev_get_iflink(dev); + else + iflink = dev->ifindex; + + if (iflink == dev->ifindex) + return IF_OPER_DOWN; + + ASSERT_RTNL(); + peer = __dev_get_by_index(dev_net(dev), iflink); + if (!peer) + return IF_OPER_DOWN; + + return netif_carrier_ok(peer) ? IF_OPER_DOWN : + IF_OPER_LOWERLAYERDOWN; + } if (netif_dormant(dev)) return IF_OPER_DORMANT; @@ -49,37 +71,37 @@ static unsigned char default_operstate(const struct net_device *dev) return IF_OPER_UP; } - static void rfc2863_policy(struct net_device *dev) { - unsigned char operstate = default_operstate(dev); + unsigned int operstate = default_operstate(dev); - if (operstate == dev->operstate) + if (operstate == READ_ONCE(dev->operstate)) return; - write_lock_bh(&dev_base_lock); - switch(dev->link_mode) { + case IF_LINK_MODE_TESTING: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_TESTING; + break; + case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; - case IF_LINK_MODE_DEFAULT: default: break; } - dev->operstate = operstate; - - write_unlock_bh(&dev_base_lock); + WRITE_ONCE(dev->operstate, operstate); } void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ - if (!netif_carrier_ok(dev) || netif_dormant(dev)) + if (!netif_carrier_ok(dev) || netif_dormant(dev) || + netif_testing(dev)) rfc2863_policy(dev); } @@ -89,10 +111,10 @@ static bool linkwatch_urgent_event(struct net_device *dev) if (!netif_running(dev)) return false; - if (dev->ifindex != dev->iflink) + if (dev->ifindex != dev_get_iflink(dev)) return true; - if (dev->priv_flags & IFF_TEAM_PORT) + if (netif_is_lag_port(dev) || netif_is_lag_master(dev)) return true; return netif_carrier_ok(dev) && qdisc_tx_changing(dev); @@ -106,7 +128,7 @@ static void linkwatch_add_event(struct net_device *dev) spin_lock_irqsave(&lweventlist_lock, flags); if (list_empty(&dev->link_watch_list)) { list_add_tail(&dev->link_watch_list, &lweventlist); - dev_hold(dev); + netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC); } spin_unlock_irqrestore(&lweventlist_lock, flags); } @@ -135,9 +157,9 @@ static void linkwatch_schedule_work(int urgent) * override the existing timer. */ if (test_bit(LW_URGENT, &linkwatch_flags)) - mod_delayed_work(system_wq, &linkwatch_work, 0); + mod_delayed_work(system_dfl_wq, &linkwatch_work, 0); else - schedule_delayed_work(&linkwatch_work, delay); + queue_delayed_work(system_dfl_wq, &linkwatch_work, delay); } @@ -147,7 +169,7 @@ static void linkwatch_do_dev(struct net_device *dev) * Make sure the above read is complete since it can be * rewritten as soon as we clear the bit below. */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); /* We are about to handle this device, * so new events can be accepted @@ -161,16 +183,29 @@ static void linkwatch_do_dev(struct net_device *dev) else dev_deactivate(dev); - netdev_state_change(dev); + netif_state_change(dev); } - dev_put(dev); + /* Note: our callers are responsible for calling netdev_tracker_free(). + * This is the reason we use __dev_put() instead of dev_put(). + */ + __dev_put(dev); } static void __linkwatch_run_queue(int urgent_only) { - struct net_device *dev; +#define MAX_DO_DEV_PER_LOOP 100 + + int do_dev = MAX_DO_DEV_PER_LOOP; + /* Use a local list here since we add non-urgent + * events back to the global one when called with + * urgent_only=1. + */ LIST_HEAD(wrk); + /* Give urgent case more budget */ + if (urgent_only) + do_dev += MAX_DO_DEV_PER_LOOP; + /* * Limit the number of linkwatch events to one * per second so that a runaway driver does not @@ -189,40 +224,72 @@ static void __linkwatch_run_queue(int urgent_only) spin_lock_irq(&lweventlist_lock); list_splice_init(&lweventlist, &wrk); - while (!list_empty(&wrk)) { + while (!list_empty(&wrk) && do_dev > 0) { + struct net_device *dev; dev = list_first_entry(&wrk, struct net_device, link_watch_list); list_del_init(&dev->link_watch_list); - if (urgent_only && !linkwatch_urgent_event(dev)) { + if (!netif_device_present(dev) || + (urgent_only && !linkwatch_urgent_event(dev))) { list_add_tail(&dev->link_watch_list, &lweventlist); continue; } + /* We must free netdev tracker under + * the spinlock protection. + */ + netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); spin_unlock_irq(&lweventlist_lock); + netdev_lock_ops(dev); linkwatch_do_dev(dev); + netdev_unlock_ops(dev); + do_dev--; spin_lock_irq(&lweventlist_lock); } + /* Add the remaining work back to lweventlist */ + list_splice_init(&wrk, &lweventlist); + if (!list_empty(&lweventlist)) linkwatch_schedule_work(0); spin_unlock_irq(&lweventlist_lock); } -void linkwatch_forget_dev(struct net_device *dev) +static bool linkwatch_clean_dev(struct net_device *dev) { unsigned long flags; - int clean = 0; + bool clean = false; spin_lock_irqsave(&lweventlist_lock, flags); if (!list_empty(&dev->link_watch_list)) { list_del_init(&dev->link_watch_list); - clean = 1; + clean = true; + /* We must release netdev tracker under + * the spinlock protection. + */ + netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); } spin_unlock_irqrestore(&lweventlist_lock, flags); - if (clean) + + return clean; +} + +void __linkwatch_sync_dev(struct net_device *dev) +{ + netdev_ops_assert_locked(dev); + + if (linkwatch_clean_dev(dev)) linkwatch_do_dev(dev); } +void linkwatch_sync_dev(struct net_device *dev) +{ + if (linkwatch_clean_dev(dev)) { + netdev_lock_ops(dev); + linkwatch_do_dev(dev); + netdev_unlock_ops(dev); + } +} /* Must be called with the rtnl semaphore held */ void linkwatch_run_queue(void) diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c new file mode 100644 index 000000000000..9e9fb25314b9 --- /dev/null +++ b/net/core/lock_debug.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright Amazon.com Inc. or its affiliates. */ + +#include <linux/init.h> +#include <linux/netdevice.h> +#include <linux/notifier.h> +#include <linux/rtnetlink.h> +#include <net/net_namespace.h> +#include <net/netdev_lock.h> +#include <net/netns/generic.h> + +int netdev_debug_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + enum netdev_cmd cmd = event; + + /* Keep enum and don't add default to trigger -Werror=switch */ + switch (cmd) { + case NETDEV_XDP_FEAT_CHANGE: + netdev_assert_locked(dev); + fallthrough; + case NETDEV_CHANGE: + case NETDEV_REGISTER: + case NETDEV_UP: + netdev_ops_assert_locked(dev); + fallthrough; + case NETDEV_DOWN: + case NETDEV_REBOOT: + case NETDEV_UNREGISTER: + case NETDEV_CHANGEMTU: + case NETDEV_CHANGEADDR: + case NETDEV_PRE_CHANGEADDR: + case NETDEV_GOING_DOWN: + case NETDEV_FEAT_CHANGE: + case NETDEV_BONDING_FAILOVER: + case NETDEV_PRE_UP: + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_POST_TYPE_CHANGE: + case NETDEV_POST_INIT: + case NETDEV_PRE_UNINIT: + case NETDEV_RELEASE: + case NETDEV_NOTIFY_PEERS: + case NETDEV_JOIN: + case NETDEV_CHANGEUPPER: + case NETDEV_RESEND_IGMP: + case NETDEV_PRECHANGEMTU: + case NETDEV_CHANGEINFODATA: + case NETDEV_BONDING_INFO: + case NETDEV_PRECHANGEUPPER: + case NETDEV_CHANGELOWERSTATE: + case NETDEV_UDP_TUNNEL_PUSH_INFO: + case NETDEV_UDP_TUNNEL_DROP_INFO: + case NETDEV_CHANGE_TX_QUEUE_LEN: + case NETDEV_CVLAN_FILTER_PUSH_INFO: + case NETDEV_CVLAN_FILTER_DROP_INFO: + case NETDEV_SVLAN_FILTER_PUSH_INFO: + case NETDEV_SVLAN_FILTER_DROP_INFO: + case NETDEV_OFFLOAD_XSTATS_ENABLE: + case NETDEV_OFFLOAD_XSTATS_DISABLE: + case NETDEV_OFFLOAD_XSTATS_REPORT_USED: + case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: + ASSERT_RTNL(); + break; + + case NETDEV_CHANGENAME: + ASSERT_RTNL_NET(net); + break; + } + + return NOTIFY_DONE; +} +EXPORT_SYMBOL_NS_GPL(netdev_debug_event, "NETDEV_INTERNAL"); + +static int rtnl_net_debug_net_id; + +static int __net_init rtnl_net_debug_net_init(struct net *net) +{ + struct notifier_block *nb; + + nb = net_generic(net, rtnl_net_debug_net_id); + nb->notifier_call = netdev_debug_event; + + return register_netdevice_notifier_net(net, nb); +} + +static void __net_exit rtnl_net_debug_net_exit(struct net *net) +{ + struct notifier_block *nb; + + nb = net_generic(net, rtnl_net_debug_net_id); + unregister_netdevice_notifier_net(net, nb); +} + +static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = { + .init = rtnl_net_debug_net_init, + .exit = rtnl_net_debug_net_exit, + .id = &rtnl_net_debug_net_id, + .size = sizeof(struct notifier_block), +}; + +static struct notifier_block rtnl_net_debug_block = { + .notifier_call = netdev_debug_event, +}; + +static int __init rtnl_net_debug_init(void) +{ + int ret; + + ret = register_pernet_subsys(&rtnl_net_debug_net_ops); + if (ret) + return ret; + + ret = register_netdevice_notifier(&rtnl_net_debug_block); + if (ret) + unregister_pernet_subsys(&rtnl_net_debug_net_ops); + + return ret; +} + +subsys_initcall(rtnl_net_debug_init); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c new file mode 100644 index 000000000000..9f40be0c3e71 --- /dev/null +++ b/net/core/lwt_bpf.c @@ -0,0 +1,662 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> + */ + +#include <linux/filter.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <linux/bpf.h> +#include <net/flow.h> +#include <net/lwtunnel.h> +#include <net/gre.h> +#include <net/ip.h> +#include <net/ip6_route.h> +#include <net/ipv6_stubs.h> + +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + +struct bpf_lwt { + struct bpf_lwt_prog in; + struct bpf_lwt_prog out; + struct bpf_lwt_prog xmit; + int family; +}; + +#define MAX_PROG_NAME 256 + +static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct bpf_lwt *)lwt->data; +} + +#define NO_REDIRECT false +#define CAN_REDIRECT true + +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, + struct dst_entry *dst, bool can_redirect) +{ + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + int ret; + + /* Disabling BH is needed to protect per-CPU bpf_redirect_info between + * BPF prog and skb_do_redirect(). + */ + local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + bpf_compute_data_pointers(skb); + ret = bpf_prog_run_save_cb(lwt->prog, skb); + + switch (ret) { + case BPF_OK: + case BPF_LWT_REROUTE: + break; + + case BPF_REDIRECT: + if (unlikely(!can_redirect)) { + pr_warn_once("Illegal redirect return code in prog %s\n", + lwt->name ? : "<unknown>"); + ret = BPF_OK; + } else { + skb_reset_mac_header(skb); + skb_do_redirect(skb); + ret = BPF_REDIRECT; + } + break; + + case BPF_DROP: + kfree_skb(skb); + ret = -EPERM; + break; + + default: + pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); + kfree_skb(skb); + ret = -EINVAL; + break; + } + + bpf_net_ctx_clear(bpf_net_ctx); + local_bh_enable(); + + return ret; +} + +static int bpf_lwt_input_reroute(struct sk_buff *skb) +{ + enum skb_drop_reason reason; + int err = -EINVAL; + + if (skb->protocol == htons(ETH_P_IP)) { + struct net_device *dev = skb_dst(skb)->dev; + const struct iphdr *iph = ip_hdr(skb); + + dev_hold(dev); + skb_dst_drop(skb); + reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + err = reason ? -EINVAL : 0; + dev_put(dev); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + skb_dst_drop(skb); + err = ipv6_stub->ipv6_route_input(skb); + } else { + err = -EAFNOSUPPORT; + } + + if (err) + goto err; + return dst_input(skb); + +err: + kfree_skb(skb); + return err; +} + +static int bpf_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->in.prog) { + ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); + if (ret < 0) + return ret; + if (ret == BPF_LWT_REROUTE) + return bpf_lwt_input_reroute(skb); + } + + if (unlikely(!dst->lwtstate->orig_input)) { + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_input(skb); +} + +static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->out.prog) { + ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_output)) { + pr_warn_once("orig_output not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_output(net, sk, skb); +} + +static int xmit_check_hhlen(struct sk_buff *skb, int hh_len) +{ + if (skb_headroom(skb) < hh_len) { + int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); + + if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) + return -ENOMEM; + } + + return 0; +} + +static int bpf_lwt_xmit_reroute(struct sk_buff *skb) +{ + struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); + int oif = l3mdev ? l3mdev->ifindex : 0; + struct dst_entry *dst = NULL; + int err = -EAFNOSUPPORT; + struct sock *sk; + struct net *net; + bool ipv4; + + if (skb->protocol == htons(ETH_P_IP)) + ipv4 = true; + else if (skb->protocol == htons(ETH_P_IPV6)) + ipv4 = false; + else + goto err; + + sk = sk_to_full_sk(skb->sk); + if (sk) { + if (sk->sk_bound_dev_if) + oif = sk->sk_bound_dev_if; + net = sock_net(sk); + } else { + net = dev_net(skb_dst(skb)->dev); + } + + if (ipv4) { + struct iphdr *iph = ip_hdr(skb); + struct flowi4 fl4 = {}; + struct rtable *rt; + + fl4.flowi4_oif = oif; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_uid = sock_net_uid(net, sk); + fl4.flowi4_dscp = ip4h_dscp(iph); + fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; + fl4.flowi4_proto = iph->protocol; + fl4.daddr = iph->daddr; + fl4.saddr = iph->saddr; + + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto err; + } + dst = &rt->dst; + } else { + struct ipv6hdr *iph6 = ipv6_hdr(skb); + struct flowi6 fl6 = {}; + + fl6.flowi6_oif = oif; + fl6.flowi6_mark = skb->mark; + fl6.flowi6_uid = sock_net_uid(net, sk); + fl6.flowlabel = ip6_flowinfo(iph6); + fl6.flowi6_proto = iph6->nexthdr; + fl6.daddr = iph6->daddr; + fl6.saddr = iph6->saddr; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto err; + } + } + if (unlikely(dst->error)) { + err = dst->error; + dst_release(dst); + goto err; + } + + /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it + * was done for the previous dst, so we are doing it here again, in + * case the new dst needs much more space. The call below is a noop + * if there is enough header space in skb. + */ + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto err; + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); + if (unlikely(err)) + return net_xmit_errno(err); + + /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ + return LWTUNNEL_XMIT_DONE; + +err: + kfree_skb(skb); + return err; +} + +static int bpf_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->xmit.prog) { + int hh_len = dst->dev->hard_header_len; + __be16 proto = skb->protocol; + int ret; + + ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); + switch (ret) { + case BPF_OK: + /* If the header changed, e.g. via bpf_lwt_push_encap, + * BPF_LWT_REROUTE below should have been used if the + * protocol was also changed. + */ + if (skb->protocol != proto) { + kfree_skb(skb); + return -EINVAL; + } + /* If the header was expanded, headroom might be too + * small for L2 header to come, expand as needed. + */ + ret = xmit_check_hhlen(skb, hh_len); + if (unlikely(ret)) + return ret; + + return LWTUNNEL_XMIT_CONTINUE; + case BPF_REDIRECT: + return LWTUNNEL_XMIT_DONE; + case BPF_LWT_REROUTE: + return bpf_lwt_xmit_reroute(skb); + default: + return ret; + } + } + + return LWTUNNEL_XMIT_CONTINUE; +} + +static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) +{ + if (prog->prog) + bpf_prog_put(prog->prog); + + kfree(prog->name); +} + +static void bpf_destroy_state(struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + bpf_lwt_prog_destroy(&bpf->in); + bpf_lwt_prog_destroy(&bpf->out); + bpf_lwt_prog_destroy(&bpf->xmit); +} + +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { + [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, + enum bpf_prog_type type) +{ + struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr, + bpf_prog_policy, NULL); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) + return -EINVAL; + + prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC); + if (!prog->name) + return -ENOMEM; + + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); + p = bpf_prog_get_type(fd, type); + if (IS_ERR(p)) + return PTR_ERR(p); + + prog->prog = p; + + return 0; +} + +static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { + [LWT_BPF_IN] = { .type = NLA_NESTED, }, + [LWT_BPF_OUT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, +}; + +static int bpf_build_state(struct net *net, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWT_BPF_MAX + 1]; + struct lwtunnel_state *newts; + struct bpf_lwt *bpf; + int ret; + + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + + ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy, + extack); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*bpf)); + if (!newts) + return -ENOMEM; + + newts->type = LWTUNNEL_ENCAP_BPF; + bpf = bpf_lwt_lwtunnel(newts); + + if (tb[LWT_BPF_IN]) { + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, + BPF_PROG_TYPE_LWT_IN); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_OUT]) { + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, + BPF_PROG_TYPE_LWT_OUT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT]) { + newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, + BPF_PROG_TYPE_LWT_XMIT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT_HEADROOM]) { + u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); + + if (headroom > LWT_BPF_MAX_HEADROOM) { + ret = -ERANGE; + goto errout; + } + + newts->headroom = headroom; + } + + bpf->family = family; + *ts = newts; + + return 0; + +errout: + bpf_destroy_state(newts); + kfree(newts); + return ret; +} + +static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, + struct bpf_lwt_prog *prog) +{ + struct nlattr *nest; + + if (!prog->prog) + return 0; + + nest = nla_nest_start_noflag(skb, attr); + if (!nest) + return -EMSGSIZE; + + if (prog->name && + nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) + return -EMSGSIZE; + + return nla_nest_end(skb, nest); +} + +static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) + return -EMSGSIZE; + + return 0; +} + +static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + int nest_len = nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ + 0; + + return nest_len + /* LWT_BPF_IN */ + nest_len + /* LWT_BPF_OUT */ + nest_len + /* LWT_BPF_XMIT */ + 0; +} + +static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) +{ + /* FIXME: + * The LWT state is currently rebuilt for delete requests which + * results in a new bpf_prog instance. Comparing names for now. + */ + if (!a->name && !b->name) + return 0; + + if (!a->name || !b->name) + return 1; + + return strcmp(a->name, b->name); +} + +static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); + struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); + + return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || + bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || + bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); +} + +static const struct lwtunnel_encap_ops bpf_encap_ops = { + .build_state = bpf_build_state, + .destroy_state = bpf_destroy_state, + .input = bpf_input, + .output = bpf_output, + .xmit = bpf_xmit, + .fill_encap = bpf_fill_encap_info, + .get_encap_size = bpf_encap_nlsize, + .cmp_encap = bpf_encap_cmp, + .owner = THIS_MODULE, +}; + +static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type, + int encap_len) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + gso_type |= SKB_GSO_DODGY; + shinfo->gso_type |= gso_type; + skb_decrease_gso_size(shinfo, encap_len); + shinfo->gso_segs = 0; + return 0; +} + +static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) +{ + int next_hdr_offset; + void *next_hdr; + __u8 protocol; + + /* SCTP and UDP_L4 gso need more nuanced handling than what + * handle_gso_type() does above: skb_decrease_gso_size() is not enough. + * So at the moment only TCP GSO packets are let through. + */ + if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + return -ENOTSUPP; + + if (ipv4) { + protocol = ip_hdr(skb)->protocol; + next_hdr_offset = sizeof(struct iphdr); + next_hdr = skb_network_header(skb) + next_hdr_offset; + } else { + protocol = ipv6_hdr(skb)->nexthdr; + next_hdr_offset = sizeof(struct ipv6hdr); + next_hdr = skb_network_header(skb) + next_hdr_offset; + } + + switch (protocol) { + case IPPROTO_GRE: + next_hdr_offset += sizeof(struct gre_base_hdr); + if (next_hdr_offset > encap_len) + return -EINVAL; + + if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM) + return handle_gso_type(skb, SKB_GSO_GRE_CSUM, + encap_len); + return handle_gso_type(skb, SKB_GSO_GRE, encap_len); + + case IPPROTO_UDP: + next_hdr_offset += sizeof(struct udphdr); + if (next_hdr_offset > encap_len) + return -EINVAL; + + if (((struct udphdr *)next_hdr)->check) + return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM, + encap_len); + return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len); + + case IPPROTO_IP: + case IPPROTO_IPV6: + if (ipv4) + return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len); + else + return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len); + + default: + return -EPROTONOSUPPORT; + } +} + +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) +{ + struct iphdr *iph; + bool ipv4; + int err; + + if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM)) + return -EINVAL; + + /* validate protocol and length */ + iph = (struct iphdr *)hdr; + if (iph->version == 4) { + ipv4 = true; + if (unlikely(len < iph->ihl * 4)) + return -EINVAL; + } else if (iph->version == 6) { + ipv4 = false; + if (unlikely(len < sizeof(struct ipv6hdr))) + return -EINVAL; + } else { + return -EINVAL; + } + + if (ingress) + err = skb_cow_head(skb, len + skb->mac_len); + else + err = skb_cow_head(skb, + len + LL_RESERVED_SPACE(skb_dst(skb)->dev)); + if (unlikely(err)) + return err; + + /* push the encap headers and fix pointers */ + skb_reset_inner_headers(skb); + skb_reset_inner_mac_header(skb); /* mac header is not yet set */ + skb_set_inner_protocol(skb, skb->protocol); + skb->encapsulation = 1; + skb_push(skb, len); + if (ingress) + skb_postpush_rcsum(skb, iph, len); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), hdr, len); + bpf_compute_data_pointers(skb); + skb_clear_hash(skb); + + if (ipv4) { + skb->protocol = htons(ETH_P_IP); + iph = ip_hdr(skb); + + if (!iph->check) + iph->check = ip_fast_csum((unsigned char *)iph, + iph->ihl); + } else { + skb->protocol = htons(ETH_P_IPV6); + } + + if (skb_is_gso(skb)) + return handle_gso_encap(skb, ipv4, len); + + return 0; +} + +static int __init bpf_lwt_init(void) +{ + return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); +} + +subsys_initcall(bpf_lwt_init) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c new file mode 100644 index 000000000000..f9d76d85d04f --- /dev/null +++ b/net/core/lwtunnel.c @@ -0,0 +1,474 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * lwtunnel Infrastructure for light weight tunnels like mpls + * + * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> + */ + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/lwtunnel.h> +#include <linux/in.h> +#include <linux/init.h> +#include <linux/err.h> + +#include <net/lwtunnel.h> +#include <net/rtnetlink.h> +#include <net/ip6_fib.h> +#include <net/rtnh.h> + +#include "dev.h" + +DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); +EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); + +#ifdef CONFIG_MODULES + +static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) +{ + /* Only lwt encaps implemented without using an interface for + * the encap need to return a string here. + */ + switch (encap_type) { + case LWTUNNEL_ENCAP_MPLS: + return "MPLS"; + case LWTUNNEL_ENCAP_ILA: + return "ILA"; + case LWTUNNEL_ENCAP_SEG6: + return "SEG6"; + case LWTUNNEL_ENCAP_BPF: + return "BPF"; + case LWTUNNEL_ENCAP_SEG6_LOCAL: + return "SEG6LOCAL"; + case LWTUNNEL_ENCAP_RPL: + return "RPL"; + case LWTUNNEL_ENCAP_IOAM6: + return "IOAM6"; + case LWTUNNEL_ENCAP_XFRM: + /* module autoload not supported for encap type */ + return NULL; + case LWTUNNEL_ENCAP_IP6: + case LWTUNNEL_ENCAP_IP: + case LWTUNNEL_ENCAP_NONE: + case __LWTUNNEL_ENCAP_MAX: + /* should not have got here */ + WARN_ON(1); + break; + } + return NULL; +} + +#endif /* CONFIG_MODULES */ + +struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) +{ + struct lwtunnel_state *lws; + + lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); + + return lws; +} +EXPORT_SYMBOL_GPL(lwtunnel_state_alloc); + +static const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, + unsigned int num) +{ + if (num > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + return !cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL_GPL(lwtunnel_encap_add_ops); + +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, + unsigned int encap_type) +{ + int ret; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + ret = (cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[encap_type], + ops, NULL) == ops) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops); + +int lwtunnel_build_state(struct net *net, u16 encap_type, + struct nlattr *encap, unsigned int family, + const void *cfg, struct lwtunnel_state **lws, + struct netlink_ext_ack *extack) +{ + const struct lwtunnel_encap_ops *ops; + bool found = false; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) { + NL_SET_ERR_MSG_ATTR(extack, encap, + "Unknown LWT encapsulation type"); + return ret; + } + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + if (likely(ops && ops->build_state && try_module_get(ops->owner))) + found = true; + rcu_read_unlock(); + + if (found) { + ret = ops->build_state(net, encap, family, cfg, lws, extack); + if (ret) + module_put(ops->owner); + } else { + /* don't rely on -EOPNOTSUPP to detect match as build_state + * handlers could return it + */ + NL_SET_ERR_MSG_ATTR(extack, encap, + "LWT encapsulation type not supported"); + } + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_build_state); + +int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) { + NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type"); + return ret; + } + + ops = rcu_access_pointer(lwtun_encaps[encap_type]); +#ifdef CONFIG_MODULES + if (!ops) { + const char *encap_type_str = lwtunnel_encap_str(encap_type); + + if (encap_type_str) { + request_module("rtnl-lwt-%s", encap_type_str); + ops = rcu_access_pointer(lwtun_encaps[encap_type]); + } + } +#endif + ret = ops ? 0 : -EOPNOTSUPP; + if (ret < 0) + NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported"); + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type); + +int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, + struct netlink_ext_ack *extack) +{ + struct rtnexthop *rtnh = (struct rtnexthop *)attr; + struct nlattr *nla_entype; + struct nlattr *attrs; + u16 encap_type; + int attrlen; + + while (rtnh_ok(rtnh, remaining)) { + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + attrs = rtnh_attrs(rtnh); + nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + + if (nla_entype) { + if (nla_len(nla_entype) < sizeof(u16)) { + NL_SET_ERR_MSG(extack, "Invalid RTA_ENCAP_TYPE"); + return -EINVAL; + } + encap_type = nla_get_u16(nla_entype); + + if (lwtunnel_valid_encap_type(encap_type, extack)) + return -EOPNOTSUPP; + } + } + rtnh = rtnh_next(rtnh, &remaining); + } + + return 0; +} +EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type_attr); + +void lwtstate_free(struct lwtunnel_state *lws) +{ + const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type]; + + if (ops->destroy_state) { + ops->destroy_state(lws); + kfree_rcu(lws, rcu); + } else { + kfree(lws); + } + module_put(ops->owner); +} +EXPORT_SYMBOL_GPL(lwtstate_free); + +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, + int encap_attr, int encap_type_attr) +{ + const struct lwtunnel_encap_ops *ops; + struct nlattr *nest; + int ret; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + nest = nla_nest_start_noflag(skb, encap_attr); + if (!nest) + return -EMSGSIZE; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->fill_encap)) + ret = ops->fill_encap(skb, lwtstate); + rcu_read_unlock(); + + if (ret) + goto nla_put_failure; + nla_nest_end(skb, nest); + ret = nla_put_u16(skb, encap_type_attr, lwtstate->type); + if (ret) + goto nla_put_failure; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + + return (ret == -EOPNOTSUPP ? 0 : ret); +} +EXPORT_SYMBOL_GPL(lwtunnel_fill_encap); + +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->get_encap_size)) + ret = nla_total_size(ops->get_encap_size(lwtstate)); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_get_encap_size); + +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!a && !b) + return 0; + + if (!a || !b) + return 1; + + if (a->type != b->type) + return 1; + + if (a->type == LWTUNNEL_ENCAP_NONE || + a->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[a->type]); + if (likely(ops && ops->cmp_encap)) + ret = ops->cmp_encap(a, b); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); + +int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + struct dst_entry *dst; + int ret; + + local_bh_disable(); + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; + goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } + lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) { + ret = 0; + goto out; + } + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->output)) { + dev_xmit_recursion_inc(); + ret = ops->output(net, sk, skb); + dev_xmit_recursion_dec(); + } + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + goto out; + +drop: + kfree_skb(skb); + +out: + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_output); + +int lwtunnel_xmit(struct sk_buff *skb) +{ + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + struct dst_entry *dst; + int ret; + + local_bh_disable(); + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; + goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } + + lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) { + ret = 0; + goto out; + } + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->xmit)) { + dev_xmit_recursion_inc(); + ret = ops->xmit(skb); + dev_xmit_recursion_dec(); + } + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + goto out; + +drop: + kfree_skb(skb); + +out: + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_xmit); + +int lwtunnel_input(struct sk_buff *skb) +{ + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + struct dst_entry *dst; + int ret; + + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; + goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } + lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->input)) { + dev_xmit_recursion_inc(); + ret = ops->input(skb); + dev_xmit_recursion_dec(); + } + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_input); diff --git a/net/core/mp_dmabuf_devmem.h b/net/core/mp_dmabuf_devmem.h new file mode 100644 index 000000000000..67cd0dd7319c --- /dev/null +++ b/net/core/mp_dmabuf_devmem.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Dmabuf device memory provider. + * + * Authors: Mina Almasry <almasrymina@google.com> + * + */ +#ifndef _NET_MP_DMABUF_DEVMEM_H +#define _NET_MP_DMABUF_DEVMEM_H + +#include <net/netmem.h> + +#if defined(CONFIG_NET_DEVMEM) +int mp_dmabuf_devmem_init(struct page_pool *pool); + +netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp); + +void mp_dmabuf_devmem_destroy(struct page_pool *pool); + +bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem); +#else +static inline int mp_dmabuf_devmem_init(struct page_pool *pool) +{ + return -EOPNOTSUPP; +} + +static inline netmem_ref +mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp) +{ + return 0; +} + +static inline void mp_dmabuf_devmem_destroy(struct page_pool *pool) +{ +} + +static inline bool +mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) +{ + return false; +} +#endif + +#endif /* _NET_MP_DMABUF_DEVMEM_H */ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b7de821f98df..96a3b1a93252 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Generic address resolution entity * @@ -5,11 +6,6 @@ * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Fixes: * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. * Harald Welte Add neighbour cache statistics like rtstat @@ -30,7 +26,9 @@ #include <linux/times.h> #include <net/net_namespace.h> #include <net/neighbour.h> +#include <net/arp.h> #include <net/dst.h> +#include <net/ip.h> #include <net/sock.h> #include <net/netevent.h> #include <net/netlink.h> @@ -38,8 +36,11 @@ #include <linux/random.h> #include <linux/string.h> #include <linux/log2.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> + +#include <trace/events/neigh.h> -#define DEBUG #define NEIGH_DEBUG 1 #define neigh_dbg(level, fmt, ...) \ do { \ @@ -49,18 +50,38 @@ do { \ #define PNEIGH_HASHMASK 0xF -static void neigh_timer_handler(unsigned long arg); -static void __neigh_notify(struct neighbour *n, int type, int flags); -static void neigh_update_notify(struct neighbour *neigh); -static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); +static void neigh_timer_handler(struct timer_list *t); +static void __neigh_notify(struct neighbour *n, int type, int flags, + u32 pid); +static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm); -static struct neigh_table *neigh_tables; #ifdef CONFIG_PROC_FS -static const struct file_operations neigh_stat_seq_fops; +static const struct seq_operations neigh_stat_seq_ops; #endif +static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family) +{ + int i; + + switch (family) { + default: + DEBUG_NET_WARN_ON_ONCE(1); + fallthrough; /* to avoid panic by null-ptr-deref */ + case AF_INET: + i = NEIGH_ARP_TABLE; + break; + case AF_INET6: + i = NEIGH_ND_TABLE; + break; + } + + return &dev->neighbours[i]; +} + /* - Neighbour hash table buckets are protected with rwlock tbl->lock. + Neighbour hash table buckets are protected with tbl->lock. - All the scans/updates to hash buckets MUST be made under this lock. - NOTHING clever should be made under this lock: no callbacks @@ -85,13 +106,8 @@ static const struct file_operations neigh_stat_seq_fops; the most complicated procedure, which we allow is dev->hard_header. It is supposed, that dev->hard_header is simplistic and does not make callbacks to neighbour tables. - - The last lock is neigh_tbl_lock. It is pure SMP lock, protecting - list of neighbour tables. This list is used only in process context, */ -static DEFINE_RWLOCK(neigh_tbl_lock); - static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) { kfree_skb(skb); @@ -100,10 +116,9 @@ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) static void neigh_cleanup_and_release(struct neighbour *neigh) { - if (neigh->parms->neigh_cleanup) - neigh->parms->neigh_cleanup(neigh); - - __neigh_notify(neigh, RTM_DELNEIGH, 0); + trace_neigh_cleanup_and_release(neigh, 0); + __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } @@ -115,60 +130,186 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) unsigned long neigh_rand_reach_time(unsigned long base) { - return base ? (net_random() % base) + (base >> 1) : 0; + return base ? get_random_u32_below(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); +static void neigh_mark_dead(struct neighbour *n) +{ + n->dead = 1; + if (!list_empty(&n->gc_list)) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } + if (!list_empty(&n->managed_list)) + list_del_init(&n->managed_list); +} + +static void neigh_update_gc_list(struct neighbour *n) +{ + bool on_gc_list, exempt_from_gc; + + spin_lock_bh(&n->tbl->lock); + write_lock(&n->lock); + if (n->dead) + goto out; + + /* remove from the gc list if new state is permanent or if neighbor is + * externally learned / validated; otherwise entry should be on the gc + * list + */ + exempt_from_gc = n->nud_state & NUD_PERMANENT || + n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); + on_gc_list = !list_empty(&n->gc_list); + + if (exempt_from_gc && on_gc_list) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } else if (!exempt_from_gc && !on_gc_list) { + /* add entries to the tail; cleaning removes from the front */ + list_add_tail(&n->gc_list, &n->tbl->gc_list); + atomic_inc(&n->tbl->gc_entries); + } +out: + write_unlock(&n->lock); + spin_unlock_bh(&n->tbl->lock); +} + +static void neigh_update_managed_list(struct neighbour *n) +{ + bool on_managed_list, add_to_managed; + + spin_lock_bh(&n->tbl->lock); + write_lock(&n->lock); + if (n->dead) + goto out; + + add_to_managed = n->flags & NTF_MANAGED; + on_managed_list = !list_empty(&n->managed_list); + + if (!add_to_managed && on_managed_list) + list_del_init(&n->managed_list); + else if (add_to_managed && !on_managed_list) + list_add_tail(&n->managed_list, &n->tbl->managed_list); +out: + write_unlock(&n->lock); + spin_unlock_bh(&n->tbl->lock); +} + +static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, + bool *gc_update, bool *managed_update) +{ + u32 ndm_flags, old_flags = neigh->flags; + + if (!(flags & NEIGH_UPDATE_F_ADMIN)) + return; + + ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; + ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; + ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? NTF_EXT_VALIDATED : 0; + + if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { + if (ndm_flags & NTF_EXT_LEARNED) + neigh->flags |= NTF_EXT_LEARNED; + else + neigh->flags &= ~NTF_EXT_LEARNED; + *notify = 1; + *gc_update = true; + } + if ((old_flags ^ ndm_flags) & NTF_MANAGED) { + if (ndm_flags & NTF_MANAGED) + neigh->flags |= NTF_MANAGED; + else + neigh->flags &= ~NTF_MANAGED; + *notify = 1; + *managed_update = true; + } + if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) { + if (ndm_flags & NTF_EXT_VALIDATED) + neigh->flags |= NTF_EXT_VALIDATED; + else + neigh->flags &= ~NTF_EXT_VALIDATED; + *notify = 1; + *gc_update = true; + } +} + +bool neigh_remove_one(struct neighbour *n) +{ + bool retval = false; + + write_lock(&n->lock); + if (refcount_read(&n->refcnt) == 1) { + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); + neigh_mark_dead(n); + retval = true; + } + write_unlock(&n->lock); + if (retval) + neigh_cleanup_and_release(n); + return retval; +} static int neigh_forced_gc(struct neigh_table *tbl) { + int max_clean = atomic_read(&tbl->gc_entries) - + READ_ONCE(tbl->gc_thresh2); + u64 tmax = ktime_get_ns() + NSEC_PER_MSEC; + unsigned long tref = jiffies - 5 * HZ; + struct neighbour *n, *tmp; int shrunk = 0; - int i; - struct neigh_hash_table *nht; + int loop = 0; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); - write_lock_bh(&tbl->lock); - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour *n; - struct neighbour __rcu **np; - - np = &nht->hash_buckets[i]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { - /* Neighbour record may be discarded if: - * - nobody refers to it. - * - it is not permanent - */ + spin_lock_bh(&tbl->lock); + + list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { + if (refcount_read(&n->refcnt) == 1) { + bool remove = false; + write_lock(&n->lock); - if (atomic_read(&n->refcnt) == 1 && - !(n->nud_state & NUD_PERMANENT)) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); - n->dead = 1; - shrunk = 1; - write_unlock(&n->lock); - neigh_cleanup_and_release(n); - continue; - } + if ((n->nud_state == NUD_FAILED) || + (n->nud_state == NUD_NOARP) || + (tbl->is_multicast && + tbl->is_multicast(n->primary_key)) || + !time_in_range(n->updated, tref, jiffies)) + remove = true; write_unlock(&n->lock); - np = &n->next; + + if (remove && neigh_remove_one(n)) + shrunk++; + if (shrunk >= max_clean) + break; + if (++loop == 16) { + if (ktime_get_ns() > tmax) + goto unlock; + loop = 0; + } } } - tbl->last_flush = jiffies; - - write_unlock_bh(&tbl->lock); + WRITE_ONCE(tbl->last_flush, jiffies); +unlock: + spin_unlock_bh(&tbl->lock); return shrunk; } static void neigh_add_timer(struct neighbour *n, unsigned long when) { + /* Use safe distance from the jiffies - LONG_MAX point while timer + * is running in DELAY/PROBE state but still show to user space + * large times in the past. + */ + unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); + neigh_hold(n); + if (!time_in_range(n->confirmed, mint, jiffies)) + n->confirmed = mint; + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); @@ -179,109 +320,205 @@ static void neigh_add_timer(struct neighbour *n, unsigned long when) static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && - del_timer(&n->timer)) { + timer_delete(&n->timer)) { neigh_release(n); return 1; } return 0; } -static void pneigh_queue_purge(struct sk_buff_head *list) +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, + int family) +{ + switch (family) { + case AF_INET: + return __in_dev_arp_parms_get_rcu(dev); + case AF_INET6: + return __in6_dev_nd_parms_get_rcu(dev); + } + return NULL; +} + +static void neigh_parms_qlen_dec(struct net_device *dev, int family) { + struct neigh_parms *p; + + rcu_read_lock(); + p = neigh_get_dev_parms_rcu(dev, family); + if (p) + p->qlen--; + rcu_read_unlock(); +} + +static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, + int family) +{ + struct sk_buff_head tmp; + unsigned long flags; struct sk_buff *skb; - while ((skb = skb_dequeue(list)) != NULL) { + skb_queue_head_init(&tmp); + spin_lock_irqsave(&list->lock, flags); + skb = skb_peek(list); + while (skb != NULL) { + struct sk_buff *skb_next = skb_peek_next(skb, list); + struct net_device *dev = skb->dev; + + if (net == NULL || net_eq(dev_net(dev), net)) { + neigh_parms_qlen_dec(dev, family); + __skb_unlink(skb, list); + __skb_queue_tail(&tmp, skb); + } + skb = skb_next; + } + spin_unlock_irqrestore(&list->lock, flags); + + while ((skb = __skb_dequeue(&tmp))) { dev_put(skb->dev); kfree_skb(skb); } } -static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) +static void neigh_flush_one(struct neighbour *n) +{ + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); + + write_lock(&n->lock); + + neigh_del_timer(n); + neigh_mark_dead(n); + + if (refcount_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + * We must destroy neighbour entry, + * but someone still uses it. + * + * The destroy will be delayed until + * the last user releases us, but + * we must kill timers etc. and move + * it to safe state. + */ + __skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; + WRITE_ONCE(n->output, neigh_blackhole); + + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + + neigh_dbg(2, "neigh %p is stray\n", n); + } + + write_unlock(&n->lock); + + neigh_cleanup_and_release(n); +} + +static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) +{ + struct hlist_head *dev_head; + struct hlist_node *tmp; + struct neighbour *n; + + dev_head = neigh_get_dev_table(dev, tbl->family); + + hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { + if (skip_perm && + (n->nud_state & NUD_PERMANENT || + n->flags & NTF_EXT_VALIDATED)) + continue; + + neigh_flush_one(n); + } +} + +static void neigh_flush_table(struct neigh_table *tbl) { - int i; struct neigh_hash_table *nht; + int i; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (i = 0; i < (1 << nht->hash_shift); i++) { + struct hlist_node *tmp; struct neighbour *n; - struct neighbour __rcu **np = &nht->hash_buckets[i]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { - if (dev && n->dev != dev) { - np = &n->next; - continue; - } - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); - write_lock(&n->lock); - neigh_del_timer(n); - n->dead = 1; - - if (atomic_read(&n->refcnt) != 1) { - /* The most unpleasant situation. - We must destroy neighbour entry, - but someone still uses it. - - The destroy will be delayed until - the last user releases us, but - we must kill timers etc. and move - it to safe state. - */ - __skb_queue_purge(&n->arp_queue); - n->arp_queue_len_bytes = 0; - n->output = neigh_blackhole; - if (n->nud_state & NUD_VALID) - n->nud_state = NUD_NOARP; - else - n->nud_state = NUD_NONE; - neigh_dbg(2, "neigh %p is stray\n", n); - } - write_unlock(&n->lock); - neigh_cleanup_and_release(n); - } + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) + neigh_flush_one(n); } } void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { - write_lock_bh(&tbl->lock); - neigh_flush_dev(tbl, dev); - write_unlock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); + neigh_flush_dev(tbl, dev, false); + spin_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); -int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) +{ + spin_lock_bh(&tbl->lock); + if (likely(dev)) { + neigh_flush_dev(tbl, dev, skip_perm); + } else { + DEBUG_NET_WARN_ON_ONCE(skip_perm); + neigh_flush_table(tbl); + } + spin_unlock_bh(&tbl->lock); + + pneigh_ifdown(tbl, dev, skip_perm); + pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, + tbl->family); + if (skb_queue_empty_lockless(&tbl->proxy_queue)) + timer_delete_sync(&tbl->proxy_timer); + return 0; +} + +int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev) { - write_lock_bh(&tbl->lock); - neigh_flush_dev(tbl, dev); - pneigh_ifdown(tbl, dev); - write_unlock_bh(&tbl->lock); + __neigh_ifdown(tbl, dev, true); + return 0; +} +EXPORT_SYMBOL(neigh_carrier_down); - del_timer_sync(&tbl->proxy_timer); - pneigh_queue_purge(&tbl->proxy_queue); +int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + __neigh_ifdown(tbl, dev, false); return 0; } EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, + struct net_device *dev, + u32 flags, bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; - int entries; - - entries = atomic_inc_return(&tbl->entries) - 1; - if (entries >= tbl->gc_thresh3 || - (entries >= tbl->gc_thresh2 && - time_after(now, tbl->last_flush + 5 * HZ))) { - if (!neigh_forced_gc(tbl) && - entries >= tbl->gc_thresh3) + int entries, gc_thresh3; + + if (exempt_from_gc) + goto do_alloc; + + entries = atomic_inc_return(&tbl->gc_entries) - 1; + gc_thresh3 = READ_ONCE(tbl->gc_thresh3); + if (entries >= gc_thresh3 || + (entries >= READ_ONCE(tbl->gc_thresh2) && + time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) { + if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) { + net_info_ratelimited("%s: neighbor table overflow!\n", + tbl->id); + NEIGH_CACHE_STAT_INC(tbl, table_fulls); goto out_entries; + } } +do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; @@ -292,49 +529,50 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; + n->flags = flags; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); - setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n); + timer_setup(&n->timer, neigh_timer_handler, 0); NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; - atomic_set(&n->refcnt, 1); + refcount_set(&n->refcnt, 1); n->dead = 1; + INIT_LIST_HEAD(&n->gc_list); + INIT_LIST_HEAD(&n->managed_list); + + atomic_inc(&tbl->entries); out: return n; out_entries: - atomic_dec(&tbl->entries); + if (!exempt_from_gc) + atomic_dec(&tbl->gc_entries); goto out; } static void neigh_get_hash_rnd(u32 *x) { - get_random_bytes(x, sizeof(*x)); - *x |= 1; + *x = get_random_u32() | 1; } static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { - size_t size = (1 << shift) * sizeof(struct neighbour *); + size_t size = (1 << shift) * sizeof(struct hlist_head); + struct hlist_head *hash_heads; struct neigh_hash_table *ret; - struct neighbour __rcu **buckets; int i; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; - if (size <= PAGE_SIZE) - buckets = kzalloc(size, GFP_ATOMIC); - else - buckets = (struct neighbour __rcu **) - __get_free_pages(GFP_ATOMIC | __GFP_ZERO, - get_order(size)); - if (!buckets) { + + hash_heads = kzalloc(size, GFP_ATOMIC); + if (!hash_heads) { kfree(ret); return NULL; } - ret->hash_buckets = buckets; + ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); @@ -346,13 +584,8 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); - size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); - struct neighbour __rcu **buckets = nht->hash_buckets; - if (size <= PAGE_SIZE) - kfree(buckets); - else - free_pages((unsigned long)buckets, get_order(size)); + kfree(nht->hash_heads); kfree(nht); } @@ -371,24 +604,17 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { - struct neighbour *n, *next; + struct hlist_node *tmp; + struct neighbour *n; - for (n = rcu_dereference_protected(old_nht->hash_buckets[i], - lockdep_is_held(&tbl->lock)); - n != NULL; - n = next) { + neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); - next = rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock)); - rcu_assign_pointer(n->next, - rcu_dereference_protected( - new_nht->hash_buckets[hash], - lockdep_is_held(&tbl->lock))); - rcu_assign_pointer(new_nht->hash_buckets[hash], n); + hlist_del_rcu(&n->hash); + hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } } @@ -401,72 +627,34 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n; - int key_len = tbl->key_len; - u32 hash_val; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); - rcu_read_lock_bh(); - nht = rcu_dereference_bh(tbl->nht); - hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); - - for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); - n != NULL; - n = rcu_dereference_bh(n->next)) { - if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { - if (!atomic_inc_not_zero(&n->refcnt)) - n = NULL; - NEIGH_CACHE_STAT_INC(tbl, hits); - break; - } + rcu_read_lock(); + n = __neigh_lookup_noref(tbl, pkey, dev); + if (n) { + if (!refcount_inc_not_zero(&n->refcnt)) + n = NULL; + NEIGH_CACHE_STAT_INC(tbl, hits); } - rcu_read_unlock_bh(); + rcu_read_unlock(); return n; } EXPORT_SYMBOL(neigh_lookup); -struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, - const void *pkey) +static struct neighbour * +___neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, u32 flags, + bool exempt_from_gc, bool want_ref) { - struct neighbour *n; - int key_len = tbl->key_len; - u32 hash_val; + u32 hash_val, key_len = tbl->key_len; + struct neighbour *n1, *rc, *n; struct neigh_hash_table *nht; - - NEIGH_CACHE_STAT_INC(tbl, lookups); - - rcu_read_lock_bh(); - nht = rcu_dereference_bh(tbl->nht); - hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift); - - for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); - n != NULL; - n = rcu_dereference_bh(n->next)) { - if (!memcmp(n->primary_key, pkey, key_len) && - net_eq(dev_net(n->dev), net)) { - if (!atomic_inc_not_zero(&n->refcnt)) - n = NULL; - NEIGH_CACHE_STAT_INC(tbl, hits); - break; - } - } - - rcu_read_unlock_bh(); - return n; -} -EXPORT_SYMBOL(neigh_lookup_nodev); - -struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev, bool want_ref) -{ - u32 hash_val; - int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); - struct neigh_hash_table *nht; + n = neigh_alloc(tbl, dev, flags, exempt_from_gc); + trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; @@ -474,7 +662,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, memcpy(n->primary_key, pkey, key_len); n->dev = dev; - dev_hold(dev); + netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC); /* Protocol specific setup. */ if (tbl->constructor && (error = tbl->constructor(n)) < 0) { @@ -483,7 +671,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, } if (dev->netdev_ops->ndo_neigh_construct) { - error = dev->netdev_ops->ndo_neigh_construct(n); + error = dev->netdev_ops->ndo_neigh_construct(dev, n); if (error < 0) { rc = ERR_PTR(error); goto out_neigh_release; @@ -497,28 +685,24 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, goto out_neigh_release; } - n->confirmed = jiffies - (n->parms->base_reachable_time << 1); + n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); - hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); + hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } - for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], - lockdep_is_held(&tbl->lock)); - n1 != NULL; - n1 = rcu_dereference_protected(n1->next, - lockdep_is_held(&tbl->lock))) { - if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { + neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) { + if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; @@ -527,26 +711,41 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, } n->dead = 0; + if (!exempt_from_gc) + list_add_tail(&n->gc_list, &n->tbl->gc_list); + if (n->flags & NTF_MANAGED) + list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); - rcu_assign_pointer(n->next, - rcu_dereference_protected(nht->hash_buckets[hash_val], - lockdep_is_held(&tbl->lock))); - rcu_assign_pointer(nht->hash_buckets[hash_val], n); - write_unlock_bh(&tbl->lock); + hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); + + hlist_add_head_rcu(&n->dev_list, + neigh_get_dev_table(dev, tbl->family)); + + spin_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); out_neigh_release: + if (!exempt_from_gc) + atomic_dec(&tbl->gc_entries); neigh_release(n); goto out; } + +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) +{ + bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); + + return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); +} EXPORT_SYMBOL(__neigh_create); -static u32 pneigh_hash(const void *pkey, int key_len) +static u32 pneigh_hash(const void *pkey, unsigned int key_len) { u32 hash_val = *(u32 *)(pkey + key_len - 4); hash_val ^= (hash_val >> 16); @@ -556,137 +755,160 @@ static u32 pneigh_hash(const void *pkey, int key_len) return hash_val; } -static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, - struct net *net, - const void *pkey, - int key_len, - struct net_device *dev) +struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev) { + struct pneigh_entry *n; + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); + n = rcu_dereference_check(tbl->phash_buckets[hash_val], + lockdep_is_held(&tbl->phash_lock)); + while (n) { if (!memcmp(n->key, pkey, key_len) && net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; - n = n->next; - } - return NULL; -} -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, struct net_device *dev) -{ - int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock)); + } - return __pneigh_lookup_1(tbl->phash_buckets[hash_val], - net, pkey, key_len, dev); + return NULL; } -EXPORT_SYMBOL_GPL(__pneigh_lookup); +EXPORT_IPV6_MOD(pneigh_lookup); -struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, - struct net_device *dev, int creat) +int pneigh_create(struct neigh_table *tbl, struct net *net, + const void *pkey, struct net_device *dev, + u32 flags, u8 protocol, bool permanent) { struct pneigh_entry *n; - int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); - - read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], - net, pkey, key_len, dev); - read_unlock_bh(&tbl->lock); + unsigned int key_len; + u32 hash_val; + int err = 0; - if (n || !creat) - goto out; + mutex_lock(&tbl->phash_lock); - ASSERT_RTNL(); + n = pneigh_lookup(tbl, net, pkey, dev); + if (n) + goto update; - n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); - if (!n) + key_len = tbl->key_len; + n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); + if (!n) { + err = -ENOBUFS; goto out; + } - write_pnet(&n->net, hold_net(net)); + write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; - if (dev) - dev_hold(dev); + netdev_hold(dev, &n->dev_tracker, GFP_KERNEL); if (tbl->pconstructor && tbl->pconstructor(n)) { - if (dev) - dev_put(dev); - release_net(net); + netdev_put(dev, &n->dev_tracker); kfree(n); - n = NULL; + err = -ENOBUFS; goto out; } - write_lock_bh(&tbl->lock); + hash_val = pneigh_hash(pkey, key_len); n->next = tbl->phash_buckets[hash_val]; - tbl->phash_buckets[hash_val] = n; - write_unlock_bh(&tbl->lock); + rcu_assign_pointer(tbl->phash_buckets[hash_val], n); +update: + WRITE_ONCE(n->flags, flags); + n->permanent = permanent; + WRITE_ONCE(n->protocol, protocol); out: - return n; + mutex_unlock(&tbl->phash_lock); + return err; } -EXPORT_SYMBOL(pneigh_lookup); +static void pneigh_destroy(struct rcu_head *rcu) +{ + struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu); + + netdev_put(n->dev, &n->dev_tracker); + kfree(n); +} int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { - struct pneigh_entry *n, **np; - int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + struct pneigh_entry *n, __rcu **np; + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); - write_lock_bh(&tbl->lock); - for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + mutex_lock(&tbl->phash_lock); + + for (np = &tbl->phash_buckets[hash_val]; + (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { - *np = n->next; - write_unlock_bh(&tbl->lock); + rcu_assign_pointer(*np, n->next); + + mutex_unlock(&tbl->phash_lock); + if (tbl->pdestructor) tbl->pdestructor(n); - if (n->dev) - dev_put(n->dev); - release_net(pneigh_net(n)); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); return 0; } } - write_unlock_bh(&tbl->lock); + + mutex_unlock(&tbl->phash_lock); return -ENOENT; } -static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { - struct pneigh_entry *n, **np; + struct pneigh_entry *n, __rcu **np; + LIST_HEAD(head); u32 h; + mutex_lock(&tbl->phash_lock); + for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, 1)) != NULL) { + if (skip_perm && n->permanent) + goto skip; if (!dev || n->dev == dev) { - *np = n->next; - if (tbl->pdestructor) - tbl->pdestructor(n); - if (n->dev) - dev_put(n->dev); - release_net(pneigh_net(n)); - kfree(n); + rcu_assign_pointer(*np, n->next); + list_add(&n->free_node, &head); continue; } +skip: np = &n->next; } } - return -ENOENT; -} -static void neigh_parms_destroy(struct neigh_parms *parms); + mutex_unlock(&tbl->phash_lock); + + while (!list_empty(&head)) { + n = list_first_entry(&head, typeof(*n), free_node); + list_del(&n->free_node); + + if (tbl->pdestructor) + tbl->pdestructor(n); + + call_rcu(&n->rcu, pneigh_destroy); + } +} static inline void neigh_parms_put(struct neigh_parms *parms) { - if (atomic_dec_and_test(&parms->refcnt)) - neigh_parms_destroy(parms); + if (refcount_dec_and_test(&parms->refcnt)) + kfree(parms); } /* @@ -714,9 +936,9 @@ void neigh_destroy(struct neighbour *neigh) neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) - dev->netdev_ops->ndo_neigh_destroy(neigh); + dev->netdev_ops->ndo_neigh_destroy(dev, neigh); - dev_put(dev); + netdev_put(dev, &neigh->dev_tracker); neigh_parms_put(neigh->parms); neigh_dbg(2, "neigh %p is destroyed\n", neigh); @@ -735,7 +957,7 @@ static void neigh_suspect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is suspected\n", neigh); - neigh->output = neigh->ops->output; + WRITE_ONCE(neigh->output, neigh->ops->output); } /* Neighbour state is OK; @@ -747,96 +969,95 @@ static void neigh_connect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is connected\n", neigh); - neigh->output = neigh->ops->connected_output; + WRITE_ONCE(neigh->output, neigh->ops->connected_output); } static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); + struct neigh_hash_table *nht; + struct hlist_node *tmp; struct neighbour *n; - struct neighbour __rcu **np; unsigned int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); - if (atomic_read(&tbl->entries) < tbl->gc_thresh1) - goto out; - /* * periodically recompute ReachableTime from random function */ if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; - tbl->last_rand = jiffies; - for (p = &tbl->parms; p; p = p->next) - p->reachable_time = - neigh_rand_reach_time(p->base_reachable_time); + + WRITE_ONCE(tbl->last_rand, jiffies); + list_for_each_entry(p, &tbl->parms_list, list) + neigh_set_reach_time(p); } - for (i = 0 ; i < (1 << nht->hash_shift); i++) { - np = &nht->hash_buckets[i]; + if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) + goto out; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + for (i = 0 ; i < (1 << nht->hash_shift); i++) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; write_lock(&n->lock); state = n->nud_state; - if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { + if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || + (n->flags & + (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) { write_unlock(&n->lock); - goto next_elt; + continue; } - if (time_before(n->used, n->confirmed)) + if (time_before(n->used, n->confirmed) && + time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; - if (atomic_read(&n->refcnt) == 1 && + if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || - time_after(jiffies, n->used + n->parms->gc_staletime))) { - *np = n->next; - n->dead = 1; + !time_in_range_open(jiffies, n->used, + n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); + neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); - -next_elt: - np = &n->next; } /* * It's fine to release lock here, even if hash table * grows while we are preempted. */ - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); cond_resched(); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } out: - /* Cycle through all hash buckets every base_reachable_time/2 ticks. - * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 - * base_reachable_time. + /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. + * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 + * BASE_REACHABLE_TIME. */ - schedule_delayed_work(&tbl->gc_work, - tbl->parms.base_reachable_time >> 1); - write_unlock_bh(&tbl->lock); + queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, + NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); + spin_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; - return (n->nud_state & NUD_PROBE) ? - p->ucast_probes : - p->ucast_probes + p->app_probes + p->mcast_probes; + return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + + (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) : + NEIGH_VAR(p, MCAST_PROBES)); } static void neigh_invalidate(struct neighbour *neigh) @@ -867,22 +1088,23 @@ static void neigh_invalidate(struct neighbour *neigh) static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { - struct sk_buff *skb = skb_peek(&neigh->arp_queue); + struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) - skb = skb_copy(skb, GFP_ATOMIC); + skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); - neigh->ops->solicit(neigh, skb); + if (neigh->ops->solicit) + neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); - kfree_skb(skb); + consume_skb(skb); } /* Called when a timer expires for a neighbour entry. */ -static void neigh_timer_handler(unsigned long arg) +static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; - struct neighbour *neigh = (struct neighbour *)arg; + struct neighbour *neigh = timer_container_of(neigh, t, timer); unsigned int state; int notify = 0; @@ -901,50 +1123,61 @@ static void neigh_timer_handler(unsigned long arg) neigh_dbg(2, "neigh %p is still alive\n", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } else if (time_before_eq(now, - neigh->used + neigh->parms->delay_probe_time)) { + neigh->used + + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is delayed\n", neigh); - neigh->nud_state = NUD_DELAY; + WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_suspect(neigh); - next = now + neigh->parms->delay_probe_time; + next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); } else { neigh_dbg(2, "neigh %p is suspected\n", neigh); - neigh->nud_state = NUD_STALE; + WRITE_ONCE(neigh->nud_state, NUD_STALE); neigh->updated = jiffies; neigh_suspect(neigh); notify = 1; } } else if (state & NUD_DELAY) { if (time_before_eq(now, - neigh->confirmed + neigh->parms->delay_probe_time)) { + neigh->confirmed + + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is now reachable\n", neigh); - neigh->nud_state = NUD_REACHABLE; + WRITE_ONCE(neigh->nud_state, NUD_REACHABLE); neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { neigh_dbg(2, "neigh %p is probed\n", neigh); - neigh->nud_state = NUD_PROBE; + WRITE_ONCE(neigh->nud_state, NUD_PROBE); neigh->updated = jiffies; atomic_set(&neigh->probes, 0); - next = now + neigh->parms->retrans_time; + notify = 1; + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ - next = now + neigh->parms->retrans_time; + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { - neigh->nud_state = NUD_FAILED; + if (neigh->nud_state == NUD_PROBE && + neigh->flags & NTF_EXT_VALIDATED) { + WRITE_ONCE(neigh->nud_state, NUD_STALE); + neigh->updated = jiffies; + } else { + WRITE_ONCE(neigh->nud_state, NUD_FAILED); + neigh_invalidate(neigh); + } notify = 1; - neigh_invalidate(neigh); + goto out; } if (neigh->nud_state & NUD_IN_TIMER) { - if (time_before(next, jiffies + HZ/2)) - next = jiffies + HZ/2; + if (time_before(next, jiffies + HZ/100)) + next = jiffies + HZ/100; if (!mod_timer(&neigh->timer, next)) neigh_hold(neigh); } @@ -956,12 +1189,15 @@ out: } if (notify) - neigh_update_notify(neigh); + neigh_update_notify(neigh, 0); + + trace_neigh_timer_handler(neigh, 0); neigh_release(neigh); } -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok) { int rc; bool immediate_probe = false; @@ -971,44 +1207,56 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) rc = 0; if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; + if (neigh->dead) + goto out_dead; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { - if (neigh->parms->mcast_probes + neigh->parms->app_probes) { + if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + + NEIGH_VAR(neigh->parms, APP_PROBES)) { unsigned long next, now = jiffies; - atomic_set(&neigh->probes, neigh->parms->ucast_probes); - neigh->nud_state = NUD_INCOMPLETE; + atomic_set(&neigh->probes, + NEIGH_VAR(neigh->parms, UCAST_PROBES)); + neigh_del_timer(neigh); + WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); neigh->updated = now; - next = now + max(neigh->parms->retrans_time, HZ/2); + if (!immediate_ok) { + next = now + 1; + } else { + immediate_probe = true; + next = now + max(NEIGH_VAR(neigh->parms, + RETRANS_TIME), + HZ / 100); + } neigh_add_timer(neigh, next); - immediate_probe = true; } else { - neigh->nud_state = NUD_FAILED; + WRITE_ONCE(neigh->nud_state, NUD_FAILED); neigh->updated = jiffies; write_unlock_bh(&neigh->lock); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); return 1; } } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); - neigh->nud_state = NUD_DELAY; + neigh_del_timer(neigh); + WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; - neigh_add_timer(neigh, - jiffies + neigh->parms->delay_probe_time); + neigh_add_timer(neigh, jiffies + + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); } if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { while (neigh->arp_queue_len_bytes + skb->truesize > - neigh->parms->queue_len_bytes) { + NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { struct sk_buff *buff; buff = __skb_dequeue(&neigh->arp_queue); if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; - kfree_skb(buff); + kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); @@ -1023,7 +1271,16 @@ out_unlock_bh: else write_unlock(&neigh->lock); local_bh_enable(); + trace_neigh_event_send_done(neigh, rc); return rc; + +out_dead: + if (neigh->nud_state & NUD_STALE) + goto out_unlock_bh; + write_unlock_bh(&neigh->lock); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD); + trace_neigh_event_send_dead(neigh, 1); + return 1; } EXPORT_SYMBOL(__neigh_event_send); @@ -1038,7 +1295,7 @@ static void neigh_update_hhs(struct neighbour *neigh) if (update) { hh = &neigh->hh; - if (hh->hh_len) { + if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); @@ -1046,8 +1303,6 @@ static void neigh_update_hhs(struct neighbour *neigh) } } - - /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. @@ -1057,26 +1312,29 @@ static void neigh_update_hhs(struct neighbour *neigh) NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" lladdr instead of overriding it if it is different. - It also allows to retain current state - if lladdr is unchanged. NEIGH_UPDATE_F_ADMIN means that the change is administrative. - + NEIGH_UPDATE_F_USE means that the entry is user triggered. + NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. + NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed + or invalidated. Caller MUST hold reference count on the entry. */ - -int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, - u32 flags) +static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, + u8 new, u32 flags, u32 nlmsg_pid, + struct netlink_ext_ack *extack) { - u8 old; - int err; - int notify = 0; - struct net_device *dev; + bool gc_update = false, managed_update = false; int update_isrouter = 0; + struct net_device *dev; + int err, notify = 0; + u8 old; + + trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); write_lock_bh(&neigh->lock); @@ -1084,15 +1342,28 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, old = neigh->nud_state; err = -EPERM; + if (neigh->dead) { + NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); + new = old; + goto out; + } if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; + neigh_update_flags(neigh, flags, ¬ify, &gc_update, &managed_update); + if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) { + new = old & ~NUD_PERMANENT; + WRITE_ONCE(neigh->nud_state, new); + err = 0; + goto out; + } + if (!(new & NUD_VALID)) { neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); - neigh->nud_state = new; + WRITE_ONCE(neigh->nud_state, new); err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1121,14 +1392,18 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, use it, otherwise discard the request. */ err = -EINVAL; - if (!(old & NUD_VALID)) + if (!(old & NUD_VALID)) { + NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; + } lladdr = neigh->ha; } + /* Update confirmed timestamp for neighbour entry after we + * received ARP packet even if it doesn't change IP to MAC binding. + */ if (new & NUD_CONNECTED) neigh->confirmed = jiffies; - neigh->updated = jiffies; /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. @@ -1146,21 +1421,29 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, goto out; } else { if (lladdr == neigh->ha && new == NUD_STALE && - ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) || - (old & NUD_CONNECTED)) - ) + !(flags & NEIGH_UPDATE_F_ADMIN)) new = old; } } + /* Update timestamp only once we know we will make a change to the + * neighbour entry. Otherwise we risk to move the locktime window with + * noop updates and ignore relevant ARP updates. + */ + if (new != old || lladdr != neigh->ha) + neigh->updated = jiffies; + if (new != old) { neigh_del_timer(neigh); + if (new & NUD_PROBE) + atomic_set(&neigh->probes, 0); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); - neigh->nud_state = new; + WRITE_ONCE(neigh->nud_state, new); + notify = 1; } if (lladdr != neigh->ha) { @@ -1170,7 +1453,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - - (neigh->parms->base_reachable_time << 1); + (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1); notify = 1; } if (new == old) @@ -1200,12 +1483,13 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, * we can reinject the packet there. */ n2 = NULL; - if (dst) { + if (dst && + READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; } - n1->output(n1, skb); + READ_ONCE(n1->output)(n1, skb); if (n2) neigh_release(n2); rcu_read_unlock(); @@ -1216,20 +1500,44 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh->arp_queue_len_bytes = 0; } out: - if (update_isrouter) { - neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ? - (neigh->flags | NTF_ROUTER) : - (neigh->flags & ~NTF_ROUTER); - } + if (update_isrouter) + neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); - + if (((new ^ old) & NUD_PERMANENT) || gc_update) + neigh_update_gc_list(neigh); + if (managed_update) + neigh_update_managed_list(neigh); if (notify) - neigh_update_notify(neigh); - + neigh_update_notify(neigh, nlmsg_pid); + trace_neigh_update_done(neigh, err); return err; } + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags, u32 nlmsg_pid) +{ + return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); +} EXPORT_SYMBOL(neigh_update); +/* Update the neigh to listen temporarily for probe responses, even if it is + * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. + */ +void __neigh_set_probe_once(struct neighbour *neigh) +{ + if (neigh->dead) + return; + neigh->updated = jiffies; + if (!(neigh->nud_state & NUD_FAILED)) + return; + WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); + atomic_set(&neigh->probes, neigh_max_probes(neigh)); + neigh_add_timer(neigh, + jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100)); +} +EXPORT_SYMBOL(__neigh_set_probe_once); + struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev) @@ -1238,16 +1546,16 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl, lladdr || !dev->addr_len); if (neigh) neigh_update(neigh, lladdr, NUD_STALE, - NEIGH_UPDATE_F_OVERRIDE); + NEIGH_UPDATE_F_OVERRIDE, 0); return neigh; } EXPORT_SYMBOL(neigh_event_ns); /* called with read_lock_bh(&n->lock); */ -static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst) +static void neigh_hh_init(struct neighbour *n) { - struct net_device *dev = dst->dev; - __be16 prot = dst->ops->protocol; + struct net_device *dev = n->dev; + __be16 prot = n->tbl->protocol; struct hh_cache *hh = &n->hh; write_lock_bh(&n->lock); @@ -1261,43 +1569,19 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst) write_unlock_bh(&n->lock); } -/* This function can be used in contexts, where only old dev_queue_xmit - * worked, f.e. if you want to override normal output path (eql, shaper), - * but resolution is not made yet. - */ - -int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - - __skb_pull(skb, skb_network_offset(skb)); - - if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, - skb->len) < 0 && - dev->header_ops->rebuild(skb)) - return 0; - - return dev_queue_xmit(skb); -} -EXPORT_SYMBOL(neigh_compat_output); - /* Slow and careful. */ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); int rc = 0; - if (!dst) - goto discard; - if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; unsigned int seq; - if (dev->header_ops->cache && !neigh->hh.hh_len) - neigh_hh_init(neigh, dst); + if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) + neigh_hh_init(neigh); do { __skb_pull(skb, skb_network_offset(skb)); @@ -1313,11 +1597,9 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) } out: return rc; -discard: - neigh_dbg(1, "%s: dst=%p neigh=%p\n", __func__, dst, neigh); out_kfree_skb: rc = -EINVAL; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); goto out; } EXPORT_SYMBOL(neigh_resolve_output); @@ -1341,7 +1623,7 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) err = dev_queue_xmit(skb); else { err = -EINVAL; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); } return err; } @@ -1353,9 +1635,23 @@ int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) } EXPORT_SYMBOL(neigh_direct_output); -static void neigh_proxy_process(unsigned long arg) +static void neigh_managed_work(struct work_struct *work) { - struct neigh_table *tbl = (struct neigh_table *)arg; + struct neigh_table *tbl = container_of(work, struct neigh_table, + managed_work.work); + struct neighbour *neigh; + + spin_lock_bh(&tbl->lock); + list_for_each_entry(neigh, &tbl->managed_list, managed_list) + neigh_event_send_probe(neigh, NULL, false); + queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, + NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); + spin_unlock_bh(&tbl->lock); +} + +static void neigh_proxy_process(struct timer_list *t) +{ + struct neigh_table *tbl = timer_container_of(tbl, t, proxy_timer); long sched_next = 0; unsigned long now = jiffies; struct sk_buff *skb, *n; @@ -1368,7 +1664,9 @@ static void neigh_proxy_process(unsigned long arg) if (tdif <= 0) { struct net_device *dev = skb->dev; + neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue); + if (tbl->proxy_redo && netif_running(dev)) { rcu_read_lock(); tbl->proxy_redo(skb); @@ -1381,19 +1679,29 @@ static void neigh_proxy_process(unsigned long arg) } else if (!sched_next || tdif < sched_next) sched_next = tdif; } - del_timer(&tbl->proxy_timer); + timer_delete(&tbl->proxy_timer); if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); spin_unlock(&tbl->proxy_queue.lock); } +static unsigned long neigh_proxy_delay(struct neigh_parms *p) +{ + /* If proxy_delay is zero, do not call get_random_u32_below() + * as it is undefined behavior. + */ + unsigned long proxy_delay = NEIGH_VAR(p, PROXY_DELAY); + + return proxy_delay ? + jiffies + get_random_u32_below(proxy_delay) : jiffies; +} + void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { - unsigned long now = jiffies; - unsigned long sched_next = now + (net_random() % p->proxy_delay); + unsigned long sched_next = neigh_proxy_delay(p); - if (tbl->proxy_queue.qlen > p->proxy_qlen) { + if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); return; } @@ -1402,13 +1710,14 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); - if (del_timer(&tbl->proxy_timer)) { + if (timer_delete(&tbl->proxy_timer)) { if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); + p->qlen++; mod_timer(&tbl->proxy_timer, sched_next); spin_unlock(&tbl->proxy_queue.lock); } @@ -1419,7 +1728,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, { struct neigh_parms *p; - for (p = &tbl->parms; p; p = p->next) { + list_for_each_entry(p, &tbl->parms_list, list) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || (!p->dev && !ifindex && net_eq(net, &init_net))) return p; @@ -1438,23 +1747,25 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); if (p) { p->tbl = tbl; - atomic_set(&p->refcnt, 1); - p->reachable_time = - neigh_rand_reach_time(p->base_reachable_time); + refcount_set(&p->refcnt, 1); + neigh_set_reach_time(p); + p->qlen = 0; + netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); + p->dev = dev; + write_pnet(&p->net, net); + p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { + netdev_put(dev, &p->dev_tracker); kfree(p); return NULL; } - dev_hold(dev); - p->dev = dev; - write_pnet(&p->net, hold_net(net)); - p->sysctl_table = NULL; - write_lock_bh(&tbl->lock); - p->next = tbl->parms.next; - tbl->parms.next = p; - write_unlock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); + list_add_rcu(&p->list, &tbl->parms.list); + spin_unlock_bh(&tbl->lock); + + neigh_parms_data_state_cleanall(p); } return p; } @@ -1470,52 +1781,45 @@ static void neigh_rcu_free_parms(struct rcu_head *head) void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { - struct neigh_parms **p; - if (!parms || parms == &tbl->parms) return; - write_lock_bh(&tbl->lock); - for (p = &tbl->parms.next; *p; p = &(*p)->next) { - if (*p == parms) { - *p = parms->next; - parms->dead = 1; - write_unlock_bh(&tbl->lock); - if (parms->dev) - dev_put(parms->dev); - call_rcu(&parms->rcu_head, neigh_rcu_free_parms); - return; - } - } - write_unlock_bh(&tbl->lock); - neigh_dbg(1, "%s: not found\n", __func__); -} -EXPORT_SYMBOL(neigh_parms_release); -static void neigh_parms_destroy(struct neigh_parms *parms) -{ - release_net(neigh_parms_net(parms)); - kfree(parms); + spin_lock_bh(&tbl->lock); + list_del_rcu(&parms->list); + parms->dead = 1; + spin_unlock_bh(&tbl->lock); + + netdev_put(parms->dev, &parms->dev_tracker); + call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } +EXPORT_SYMBOL(neigh_parms_release); static struct lock_class_key neigh_table_proxy_queue_class; -static void neigh_table_init_no_netlink(struct neigh_table *tbl) +static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; + +void neigh_table_init(int index, struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; + INIT_LIST_HEAD(&tbl->parms_list); + INIT_LIST_HEAD(&tbl->gc_list); + INIT_LIST_HEAD(&tbl->managed_list); + + list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); - atomic_set(&tbl->parms.refcnt, 1); - tbl->parms.reachable_time = - neigh_rand_reach_time(tbl->parms.base_reachable_time); + refcount_set(&tbl->parms.refcnt, 1); + neigh_set_reach_time(&tbl->parms); + tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) panic("cannot create neighbour cache statistics"); #ifdef CONFIG_PROC_FS - if (!proc_create_data(tbl->id, 0, init_net.proc_net_stat, - &neigh_stat_seq_fops, tbl)) + if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat, + &neigh_stat_seq_ops, tbl)) panic("cannot create neighbour proc dir entry"); #endif @@ -1533,58 +1837,43 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl) else WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); - rwlock_init(&tbl->lock); + spin_lock_init(&tbl->lock); + mutex_init(&tbl->phash_lock); + INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); - schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); - setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl); + queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, + tbl->parms.reachable_time); + INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work); + queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0); + + timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; -} - -void neigh_table_init(struct neigh_table *tbl) -{ - struct neigh_table *tmp; - neigh_table_init_no_netlink(tbl); - write_lock(&neigh_tbl_lock); - for (tmp = neigh_tables; tmp; tmp = tmp->next) { - if (tmp->family == tbl->family) - break; - } - tbl->next = neigh_tables; - neigh_tables = tbl; - write_unlock(&neigh_tbl_lock); - - if (unlikely(tmp)) { - pr_err("Registering multiple tables for family %d\n", - tbl->family); - dump_stack(); - } + rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); -int neigh_table_clear(struct neigh_table *tbl) +/* + * Only called from ndisc_cleanup(), which means this is dead code + * because we no longer can unload IPv6 module. + */ +int neigh_table_clear(int index, struct neigh_table *tbl) { - struct neigh_table **tp; + RCU_INIT_POINTER(neigh_tables[index], NULL); + synchronize_rcu(); /* It is not clean... Fix it to unload IPv6 module safely */ + cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); - del_timer_sync(&tbl->proxy_timer); - pneigh_queue_purge(&tbl->proxy_queue); + timer_delete_sync(&tbl->proxy_timer); + pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); - write_lock(&neigh_tbl_lock); - for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { - if (*tp == tbl) { - *tp = tbl->next; - break; - } - } - write_unlock(&neigh_tbl_lock); call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, neigh_hash_free_rcu); @@ -1602,12 +1891,47 @@ int neigh_table_clear(struct neigh_table *tbl) } EXPORT_SYMBOL(neigh_table_clear); -static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) +static struct neigh_table *neigh_find_table(int family) +{ + struct neigh_table *tbl = NULL; + + switch (family) { + case AF_INET: + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); + break; + case AF_INET6: + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); + break; + } + + return tbl; +} + +const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, + [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, + [NDA_PROBES] = { .type = NLA_U32 }, + [NDA_VLAN] = { .type = NLA_U16 }, + [NDA_PORT] = { .type = NLA_U16 }, + [NDA_VNI] = { .type = NLA_U32 }, + [NDA_IFINDEX] = { .type = NLA_U32 }, + [NDA_MASTER] = { .type = NLA_U32 }, + [NDA_PROTOCOL] = { .type = NLA_U8 }, + [NDA_NH_ID] = { .type = NLA_U32 }, + [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK), + [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, +}; + +static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; + struct neighbour *neigh; struct net_device *dev = NULL; int err = -EINVAL; @@ -1616,8 +1940,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); - if (dst_attr == NULL) + if (!dst_attr) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1628,63 +1954,79 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) } } - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { - struct neighbour *neigh; - - if (tbl->family != ndm->ndm_family) - continue; - read_unlock(&neigh_tbl_lock); + tbl = neigh_find_table(ndm->ndm_family); + if (tbl == NULL) + return -EAFNOSUPPORT; - if (nla_len(dst_attr) < tbl->key_len) - goto out; - - if (ndm->ndm_flags & NTF_PROXY) { - err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); - goto out; - } + if (nla_len(dst_attr) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); + goto out; + } - if (dev == NULL) - goto out; + if (ndm->ndm_flags & NTF_PROXY) { + err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); + goto out; + } - neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); - if (neigh == NULL) { - err = -ENOENT; - goto out; - } + if (dev == NULL) + goto out; - err = neigh_update(neigh, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN); - neigh_release(neigh); + neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); + if (neigh == NULL) { + err = -ENOENT; goto out; } - read_unlock(&neigh_tbl_lock); - err = -EAFNOSUPPORT; + + err = __neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, + NETLINK_CB(skb).portid, extack); + spin_lock_bh(&tbl->lock); + neigh_release(neigh); + neigh_remove_one(neigh); + spin_unlock_bh(&tbl->lock); out: return err; } -static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) +static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { + int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_OVERRIDE_ISROUTER; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; + struct neighbour *neigh; + void *dst, *lladdr; + u8 protocol = 0; + u32 ndm_flags; int err; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, + nda_policy, extack); if (err < 0) goto out; err = -EINVAL; - if (tb[NDA_DST] == NULL) + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); + ndm_flags = ndm->ndm_flags; + if (tb[NDA_FLAGS_EXT]) { + u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]); + + BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE < + (sizeof(ndm->ndm_flags) * BITS_PER_BYTE + + hweight32(NTF_EXT_MASK))); + ndm_flags |= (ext << NTF_EXT_SHIFT); + } if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { @@ -1692,74 +2034,137 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) + if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { + NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; + } } - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { - int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; - struct neighbour *neigh; - void *dst, *lladdr; + tbl = neigh_find_table(ndm->ndm_family); + if (tbl == NULL) + return -EAFNOSUPPORT; - if (tbl->family != ndm->ndm_family) - continue; - read_unlock(&neigh_tbl_lock); + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); + goto out; + } - if (nla_len(tb[NDA_DST]) < tbl->key_len) + dst = nla_data(tb[NDA_DST]); + lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + + if (tb[NDA_PROTOCOL]) + protocol = nla_get_u8(tb[NDA_PROTOCOL]); + if (ndm_flags & NTF_PROXY) { + if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { + NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; - dst = nla_data(tb[NDA_DST]); - lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + } - if (ndm->ndm_flags & NTF_PROXY) { - struct pneigh_entry *pn; + err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol, + !!(ndm->ndm_state & NUD_PERMANENT)); + goto out; + } - err = -ENOBUFS; - pn = pneigh_lookup(tbl, net, dst, dev, 1); - if (pn) { - pn->flags = ndm->ndm_flags; - err = 0; - } + if (!dev) { + NL_SET_ERR_MSG(extack, "Device not specified"); + goto out; + } + + if (tbl->allow_add && !tbl->allow_add(dev, extack)) { + err = -EINVAL; + goto out; + } + + neigh = neigh_lookup(tbl, dst, dev); + if (neigh == NULL) { + bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; + bool exempt_from_gc = ndm_permanent || + ndm_flags & (NTF_EXT_LEARNED | + NTF_EXT_VALIDATED); + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; goto out; } - - if (dev == NULL) + if (ndm_permanent && (ndm_flags & NTF_MANAGED)) { + NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry"); + err = -EINVAL; goto out; + } + if (ndm_flags & NTF_EXT_VALIDATED) { + u8 state = ndm->ndm_state; - neigh = neigh_lookup(tbl, dst, dev); - if (neigh == NULL) { - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - err = -ENOENT; - goto out; - } + /* NTF_USE and NTF_MANAGED will result in the neighbor + * being created with an invalid state (NUD_NONE). + */ + if (ndm_flags & (NTF_USE | NTF_MANAGED)) + state = NUD_NONE; - neigh = __neigh_lookup_errno(tbl, dst, dev); - if (IS_ERR(neigh)) { - err = PTR_ERR(neigh); + if (!(state & NUD_VALID)) { + NL_SET_ERR_MSG(extack, + "Cannot create externally validated neighbor with an invalid state"); + err = -EINVAL; goto out; } - } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { - err = -EEXIST; + } + + neigh = ___neigh_create(tbl, dst, dev, + ndm_flags & + (NTF_EXT_LEARNED | NTF_MANAGED | + NTF_EXT_VALIDATED), + exempt_from_gc, true); + if (IS_ERR(neigh)) { + err = PTR_ERR(neigh); + goto out; + } + } else { + if (nlh->nlmsg_flags & NLM_F_EXCL) { + err = -EEXIST; + neigh_release(neigh); + goto out; + } + if (ndm_flags & NTF_EXT_VALIDATED) { + u8 state = ndm->ndm_state; + + /* NTF_USE and NTF_MANAGED do not update the existing + * state other than clearing it if it was + * NUD_PERMANENT. + */ + if (ndm_flags & (NTF_USE | NTF_MANAGED)) + state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT; + + if (!(state & NUD_VALID)) { + NL_SET_ERR_MSG(extack, + "Cannot mark neighbor as externally validated with an invalid state"); + err = -EINVAL; neigh_release(neigh); goto out; } - - if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) - flags &= ~NEIGH_UPDATE_F_OVERRIDE; } - if (ndm->ndm_flags & NTF_USE) { - neigh_event_send(neigh, NULL); - err = 0; - } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); - neigh_release(neigh); - goto out; - } - - read_unlock(&neigh_tbl_lock); - err = -EAFNOSUPPORT; + if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) + flags &= ~(NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_OVERRIDE_ISROUTER); + } + + if (protocol) + neigh->protocol = protocol; + if (ndm_flags & NTF_EXT_LEARNED) + flags |= NEIGH_UPDATE_F_EXT_LEARNED; + if (ndm_flags & NTF_ROUTER) + flags |= NEIGH_UPDATE_F_ISROUTER; + if (ndm_flags & NTF_MANAGED) + flags |= NEIGH_UPDATE_F_MANAGED; + if (ndm_flags & NTF_USE) + flags |= NEIGH_UPDATE_F_USE; + if (ndm_flags & NTF_EXT_VALIDATED) + flags |= NEIGH_UPDATE_F_EXT_VALIDATED; + + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid, extack); + if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) + neigh_event_send(neigh, NULL); + neigh_release(neigh); out: return err; } @@ -1768,31 +2173,44 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) { struct nlattr *nest; - nest = nla_nest_start(skb, NDTA_PARMS); + nest = nla_nest_start_noflag(skb, NDTA_PARMS); if (nest == NULL) return -ENOBUFS; if ((parms->dev && - nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || - nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) || - nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes) || + nla_put_u32(skb, NDTPA_IFINDEX, READ_ONCE(parms->dev->ifindex))) || + nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || + nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, + NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || /* approximative value for deprecated QUEUE_LEN (in packets) */ nla_put_u32(skb, NDTPA_QUEUE_LEN, - parms->queue_len_bytes / SKB_TRUESIZE(ETH_FRAME_LEN)) || - nla_put_u32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen) || - nla_put_u32(skb, NDTPA_APP_PROBES, parms->app_probes) || - nla_put_u32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes) || - nla_put_u32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes) || - nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) || + NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || + nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || + nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || + nla_put_u32(skb, NDTPA_UCAST_PROBES, + NEIGH_VAR(parms, UCAST_PROBES)) || + nla_put_u32(skb, NDTPA_MCAST_PROBES, + NEIGH_VAR(parms, MCAST_PROBES)) || + nla_put_u32(skb, NDTPA_MCAST_REPROBES, + NEIGH_VAR(parms, MCAST_REPROBES)) || + nla_put_msecs(skb, NDTPA_REACHABLE_TIME, READ_ONCE(parms->reachable_time), + NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, - parms->base_reachable_time) || - nla_put_msecs(skb, NDTPA_GC_STALETIME, parms->gc_staletime) || + NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_GC_STALETIME, + NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, - parms->delay_probe_time) || - nla_put_msecs(skb, NDTPA_RETRANS_TIME, parms->retrans_time) || - nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay) || - nla_put_msecs(skb, NDTPA_PROXY_DELAY, parms->proxy_delay) || - nla_put_msecs(skb, NDTPA_LOCKTIME, parms->locktime)) + NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_RETRANS_TIME, + NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, + NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_PROXY_DELAY, + NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_LOCKTIME, + NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS, + NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -1812,22 +2230,21 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); - - read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || - nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval) || - nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) || - nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) || - nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3)) + nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval), + NDTA_PAD) || + nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) || + nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) || + nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3))) goto nla_put_failure; { unsigned long now = jiffies; - unsigned int flush_delta = now - tbl->last_flush; - unsigned int rand_delta = now - tbl->last_rand; + long flush_delta = now - READ_ONCE(tbl->last_flush); + long rand_delta = now - READ_ONCE(tbl->last_rand); struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, @@ -1835,14 +2252,12 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), - .ndtc_proxy_qlen = tbl->proxy_queue.qlen, + .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; - rcu_read_lock_bh(); - nht = rcu_dereference_bh(tbl->nht); + nht = rcu_dereference(tbl->nht); ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); - rcu_read_unlock_bh(); if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) goto nla_put_failure; @@ -1858,19 +2273,21 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, struct neigh_statistics *st; st = per_cpu_ptr(tbl->stats, cpu); - ndst.ndts_allocs += st->allocs; - ndst.ndts_destroys += st->destroys; - ndst.ndts_hash_grows += st->hash_grows; - ndst.ndts_res_failed += st->res_failed; - ndst.ndts_lookups += st->lookups; - ndst.ndts_hits += st->hits; - ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast; - ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast; - ndst.ndts_periodic_gc_runs += st->periodic_gc_runs; - ndst.ndts_forced_gc_runs += st->forced_gc_runs; + ndst.ndts_allocs += READ_ONCE(st->allocs); + ndst.ndts_destroys += READ_ONCE(st->destroys); + ndst.ndts_hash_grows += READ_ONCE(st->hash_grows); + ndst.ndts_res_failed += READ_ONCE(st->res_failed); + ndst.ndts_lookups += READ_ONCE(st->lookups); + ndst.ndts_hits += READ_ONCE(st->hits); + ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast); + ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast); + ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs); + ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs); + ndst.ndts_table_fulls += READ_ONCE(st->table_fulls); } - if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst)) + if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, + NDTA_PAD)) goto nla_put_failure; } @@ -1878,11 +2295,10 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, if (neightbl_fill_parms(skb, &tbl->parms) < 0) goto nla_put_failure; - read_unlock_bh(&tbl->lock); - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: - read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } @@ -1901,8 +2317,6 @@ static int neightbl_fill_param_info(struct sk_buff *skb, return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); - - read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; @@ -1911,10 +2325,9 @@ static int neightbl_fill_param_info(struct sk_buff *skb, neightbl_fill_parms(skb, parms) < 0) goto errout; - read_unlock_bh(&tbl->lock); - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; errout: - read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } @@ -1931,10 +2344,12 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, + [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, + [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 }, [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, @@ -1942,18 +2357,21 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, [NDTPA_LOCKTIME] = { .type = NLA_U64 }, + [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 }, }; -static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) +static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct nlattr *tb[NDTA_MAX + 1]; struct neigh_table *tbl; struct ndtmsg *ndtmsg; - struct nlattr *tb[NDTA_MAX+1]; - int err; + bool found = false; + int err, tidx; - err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, - nl_neightbl_policy); + err = nlmsg_parse_deprecated(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, + nl_neightbl_policy, extack); if (err < 0) goto errout; @@ -1963,33 +2381,43 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) } ndtmsg = nlmsg_data(nlh); - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { + + rcu_read_lock(); + + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { + tbl = rcu_dereference(neigh_tables[tidx]); + if (!tbl) + continue; + if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; - if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { + found = true; break; + } } - if (tbl == NULL) { + if (!found) { + rcu_read_unlock(); err = -ENOENT; - goto errout_locked; + goto errout; } /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. */ - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); if (tb[NDTA_PARMS]) { struct nlattr *tbp[NDTPA_MAX+1]; struct neigh_parms *p; int i, ifindex = 0; - err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS], - nl_ntbl_parm_policy); + err = nla_parse_nested_deprecated(tbp, NDTPA_MAX, + tb[NDTA_PARMS], + nl_ntbl_parm_policy, extack); if (err < 0) goto errout_tbl_lock; @@ -2008,44 +2436,71 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) switch (i) { case NDTPA_QUEUE_LEN: - p->queue_len_bytes = nla_get_u32(tbp[i]) * - SKB_TRUESIZE(ETH_FRAME_LEN); + NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, + nla_get_u32(tbp[i]) * + SKB_TRUESIZE(ETH_FRAME_LEN)); break; case NDTPA_QUEUE_LENBYTES: - p->queue_len_bytes = nla_get_u32(tbp[i]); + NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, + nla_get_u32(tbp[i])); break; case NDTPA_PROXY_QLEN: - p->proxy_qlen = nla_get_u32(tbp[i]); + NEIGH_VAR_SET(p, PROXY_QLEN, + nla_get_u32(tbp[i])); break; case NDTPA_APP_PROBES: - p->app_probes = nla_get_u32(tbp[i]); + NEIGH_VAR_SET(p, APP_PROBES, + nla_get_u32(tbp[i])); break; case NDTPA_UCAST_PROBES: - p->ucast_probes = nla_get_u32(tbp[i]); + NEIGH_VAR_SET(p, UCAST_PROBES, + nla_get_u32(tbp[i])); break; case NDTPA_MCAST_PROBES: - p->mcast_probes = nla_get_u32(tbp[i]); + NEIGH_VAR_SET(p, MCAST_PROBES, + nla_get_u32(tbp[i])); + break; + case NDTPA_MCAST_REPROBES: + NEIGH_VAR_SET(p, MCAST_REPROBES, + nla_get_u32(tbp[i])); break; case NDTPA_BASE_REACHABLE_TIME: - p->base_reachable_time = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, + nla_get_msecs(tbp[i])); + /* update reachable_time as well, otherwise, the change will + * only be effective after the next time neigh_periodic_work + * decides to recompute it (can be multiple minutes) + */ + neigh_set_reach_time(p); break; case NDTPA_GC_STALETIME: - p->gc_staletime = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, GC_STALETIME, + nla_get_msecs(tbp[i])); break; case NDTPA_DELAY_PROBE_TIME: - p->delay_probe_time = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, DELAY_PROBE_TIME, + nla_get_msecs(tbp[i])); + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); + break; + case NDTPA_INTERVAL_PROBE_TIME_MS: + NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS, + nla_get_msecs(tbp[i])); break; case NDTPA_RETRANS_TIME: - p->retrans_time = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, RETRANS_TIME, + nla_get_msecs(tbp[i])); break; case NDTPA_ANYCAST_DELAY: - p->anycast_delay = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, ANYCAST_DELAY, + nla_get_msecs(tbp[i])); break; case NDTPA_PROXY_DELAY: - p->proxy_delay = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, PROXY_DELAY, + nla_get_msecs(tbp[i])); break; case NDTPA_LOCKTIME: - p->locktime = nla_get_msecs(tbp[i]); + NEIGH_VAR_SET(p, LOCKTIME, + nla_get_msecs(tbp[i])); break; } } @@ -2058,50 +2513,88 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) goto errout_tbl_lock; if (tb[NDTA_THRESH1]) - tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]); + WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1])); if (tb[NDTA_THRESH2]) - tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]); + WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2])); if (tb[NDTA_THRESH3]) - tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]); + WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3])); if (tb[NDTA_GC_INTERVAL]) - tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]); + WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL])); err = 0; errout_tbl_lock: - write_unlock_bh(&tbl->lock); -errout_locked: - read_unlock(&neigh_tbl_lock); + spin_unlock_bh(&tbl->lock); + rcu_read_unlock(); errout: return err; } +static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ndtmsg *ndtm; + + ndtm = nlmsg_payload(nlh, sizeof(*ndtm)); + if (!ndtm) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); + return -EINVAL; + } + + if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ndtm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request"); + return -EINVAL; + } + + return 0; +} + static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; int neigh_skip = cb->args[1]; struct neigh_table *tbl; - family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + if (cb->strict_check) { + int err = neightbl_valid_dump_info(nlh, cb->extack); + + if (err < 0) + return err; + } + + family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; + + rcu_read_lock(); - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) { + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; + tbl = rcu_dereference(neigh_tables[tidx]); + if (!tbl) + continue; + if (tidx < tbl_skip || (family && tbl->family != family)) continue; if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL, - NLM_F_MULTI) <= 0) + nlh->nlmsg_seq, RTM_NEWNEIGHTBL, + NLM_F_MULTI) < 0) break; - for (nidx = 0, p = tbl->parms.next; p; p = p->next) { + nidx = 0; + p = list_next_entry(&tbl->parms, list); + list_for_each_entry_from_rcu(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; @@ -2110,9 +2603,9 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (neightbl_fill_param_info(skb, tbl, p, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNEIGHTBL, - NLM_F_MULTI) <= 0) + NLM_F_MULTI) < 0) goto out; next: nidx++; @@ -2121,7 +2614,8 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) neigh_skip = 0; } out: - read_unlock(&neigh_tbl_lock); + rcu_read_unlock(); + cb->args[0] = tidx; cb->args[1] = nidx; @@ -2131,6 +2625,7 @@ out: static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) { + u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; @@ -2140,11 +2635,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, if (nlh == NULL) return -EMSGSIZE; + neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT; + neigh_flags = neigh->flags & NTF_OLD_MASK; + ndm = nlmsg_data(nlh); ndm->ndm_family = neigh->ops->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; - ndm->ndm_flags = neigh->flags; + ndm->ndm_flags = neigh_flags; ndm->ndm_type = neigh->type; ndm->ndm_ifindex = neigh->dev->ifindex; @@ -2166,14 +2664,20 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, ci.ndm_used = jiffies_to_clock_t(now - neigh->used); ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); - ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1; + ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; read_unlock_bh(&neigh->lock); if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) + goto nla_put_failure; + if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -2184,156 +2688,435 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) { + u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; + u8 protocol; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; + neigh_flags = READ_ONCE(pn->flags); + neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT; + neigh_flags &= NTF_OLD_MASK; + ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; - ndm->ndm_flags = pn->flags | NTF_PROXY; - ndm->ndm_type = NDA_DST; - ndm->ndm_ifindex = pn->dev->ifindex; + ndm->ndm_flags = neigh_flags | NTF_PROXY; + ndm->ndm_type = RTN_UNICAST; + ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + protocol = READ_ONCE(pn->protocol); + if (protocol && nla_put_u8(skb, NDA_PROTOCOL, protocol)) + goto nla_put_failure; + if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } -static void neigh_update_notify(struct neighbour *neigh) +static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid) { call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); - __neigh_notify(neigh, RTM_NEWNEIGH, 0); + __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); } +static bool neigh_master_filtered(struct net_device *dev, int master_idx) +{ + struct net_device *master; + + if (!master_idx) + return false; + + master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; + + /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another + * invalid value for ifindex to denote "no master". + */ + if (master_idx == -1) + return !!master; + + if (!master || master->ifindex != master_idx) + return true; + + return false; +} + +static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) +{ + if (filter_idx && (!dev || dev->ifindex != filter_idx)) + return true; + + return false; +} + +struct neigh_dump_filter { + int master_idx; + int dev_idx; +}; + static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, - struct netlink_callback *cb) + struct netlink_callback *cb, + struct neigh_dump_filter *filter) { struct net *net = sock_net(skb->sk); struct neighbour *n; - int rc, h, s_h = cb->args[1]; + int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; + unsigned int flags = NLM_F_MULTI; - rcu_read_lock_bh(); - nht = rcu_dereference_bh(tbl->nht); + if (filter->dev_idx || filter->master_idx) + flags |= NLM_F_DUMP_FILTERED; + + nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; - for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; - n != NULL; - n = rcu_dereference_bh(n->next)) { - if (!net_eq(dev_net(n->dev), net)) - continue; - if (idx < s_idx) + idx = 0; + neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) { + if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; - if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI) <= 0) { - rc = -1; + if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || + neigh_master_filtered(n->dev, filter->master_idx)) + goto next; + err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags); + if (err < 0) goto out; - } next: idx++; } } - rc = skb->len; out: - rcu_read_unlock_bh(); cb->args[1] = h; cb->args[2] = idx; - return rc; + return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, - struct netlink_callback *cb) + struct netlink_callback *cb, + struct neigh_dump_filter *filter) { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); - int rc, h, s_h = cb->args[3]; + int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; + unsigned int flags = NLM_F_MULTI; - read_lock_bh(&tbl->lock); + if (filter->dev_idx || filter->master_idx) + flags |= NLM_F_DUMP_FILTERED; for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; - for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { - if (dev_net(n->dev) != net) - continue; - if (idx < s_idx) + for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0; + n; + n = rcu_dereference(n->next)) { + if (idx < s_idx || pneigh_net(n) != net) goto next; - if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI, tbl) <= 0) { - read_unlock_bh(&tbl->lock); - rc = -1; + if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || + neigh_master_filtered(n->dev, filter->master_idx)) + goto next; + err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags, tbl); + if (err < 0) goto out; - } next: idx++; } } - read_unlock_bh(&tbl->lock); - rc = skb->len; out: cb->args[3] = h; cb->args[4] = idx; - return rc; + return err; +} + +static int neigh_valid_dump_req(const struct nlmsghdr *nlh, + bool strict_check, + struct neigh_dump_filter *filter, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + int err, i; + + if (strict_check) { + struct ndmsg *ndm; + + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); + return -EINVAL; + } + + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || + ndm->ndm_state || ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~NTF_PROXY) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request"); + return -EINVAL; + } + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), + tb, NDA_MAX, nda_policy, + extack); + } else { + err = nlmsg_parse_deprecated(nlh, sizeof(struct ndmsg), tb, + NDA_MAX, nda_policy, extack); + } + if (err < 0) + return err; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + /* all new attributes should require strict_check */ + switch (i) { + case NDA_IFINDEX: + filter->dev_idx = nla_get_u32(tb[i]); + break; + case NDA_MASTER: + filter->master_idx = nla_get_u32(tb[i]); + break; + default: + if (strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request"); + return -EINVAL; + } + } + } + + return 0; } static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; + struct neigh_dump_filter filter = {}; struct neigh_table *tbl; int t, family, s_t; int proxy = 0; int err; - read_lock(&neigh_tbl_lock); - family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is * the same for both structures */ - if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) && - ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY) + if (nlmsg_len(nlh) >= sizeof(struct ndmsg) && + ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY) proxy = 1; + err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); + if (err < 0 && cb->strict_check) + return err; + err = 0; + s_t = cb->args[0]; - for (tbl = neigh_tables, t = 0; tbl; - tbl = tbl->next, t++) { + rcu_read_lock(); + for (t = 0; t < NEIGH_NR_TABLES; t++) { + tbl = rcu_dereference(neigh_tables[t]); + + if (!tbl) + continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (proxy) - err = pneigh_dump_table(tbl, skb, cb); + err = pneigh_dump_table(tbl, skb, cb, &filter); else - err = neigh_dump_table(tbl, skb, cb); + err = neigh_dump_table(tbl, skb, cb, &filter); if (err < 0) break; } - read_unlock(&neigh_tbl_lock); + rcu_read_unlock(); cb->args[0] = t; - return skb->len; + return err; +} + +static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct ndmsg *ndm; + int err, i; + + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); + return ERR_PTR(-EINVAL); + } + + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); + return ERR_PTR(-EINVAL); + } + + if (ndm->ndm_flags & ~NTF_PROXY) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); + return ERR_PTR(-EINVAL); + } + + if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) { + NL_SET_ERR_MSG(extack, "No device specified"); + return ERR_PTR(-EINVAL); + } + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, + NDA_MAX, nda_policy, extack); + if (err < 0) + return ERR_PTR(err); + + for (i = 0; i <= NDA_MAX; ++i) { + switch (i) { + case NDA_DST: + if (!tb[i]) { + NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST); + return ERR_PTR(-EINVAL); + } + break; + default: + if (!tb[i]) + continue; + + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); + return ERR_PTR(-EINVAL); + } + } + + return ndm; +} + +static inline size_t neigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(4) /* NDA_PROBES */ + + nla_total_size(4) /* NDA_FLAGS_EXT */ + + nla_total_size(1); /* NDA_PROTOCOL */ +} + +static inline size_t pneigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(4) /* NDA_FLAGS_EXT */ + + nla_total_size(1); /* NDA_PROTOCOL */ +} + +static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + u32 pid = NETLINK_CB(in_skb).portid; + struct nlattr *tb[NDA_MAX + 1]; + struct net_device *dev = NULL; + u32 seq = nlh->nlmsg_seq; + struct neigh_table *tbl; + struct neighbour *neigh; + struct sk_buff *skb; + struct ndmsg *ndm; + void *dst; + int err; + + ndm = neigh_valid_get_req(nlh, tb, extack); + if (IS_ERR(ndm)) + return PTR_ERR(ndm); + + if (ndm->ndm_flags & NTF_PROXY) + skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); + else + skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + rcu_read_lock(); + + tbl = neigh_find_table(ndm->ndm_family); + if (!tbl) { + NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); + err = -EAFNOSUPPORT; + goto err_unlock; + } + + if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); + err = -EINVAL; + goto err_unlock; + } + + dst = nla_data(tb[NDA_DST]); + + if (ndm->ndm_ifindex) { + dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + err = -ENODEV; + goto err_unlock; + } + } + + if (ndm->ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; + + pn = pneigh_lookup(tbl, net, dst, dev); + if (!pn) { + NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); + err = -ENOENT; + goto err_unlock; + } + + err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); + if (err) + goto err_unlock; + } else { + neigh = neigh_lookup(tbl, dst, dev); + if (!neigh) { + NL_SET_ERR_MSG(extack, "Neighbour entry not found"); + err = -ENOENT; + goto err_unlock; + } + + err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); + neigh_release(neigh); + if (err) + goto err_unlock; + } + + rcu_read_unlock(); + + return rtnl_unicast(skb, net, pid); +err_unlock: + rcu_read_unlock(); + kfree_skb(skb); + return err; } void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) @@ -2341,20 +3124,18 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void int chain; struct neigh_hash_table *nht; - rcu_read_lock_bh(); - nht = rcu_dereference_bh(tbl->nht); + rcu_read_lock(); + nht = rcu_dereference(tbl->nht); - read_lock(&tbl->lock); /* avoid resizes */ + spin_lock_bh(&tbl->lock); /* avoid resizes */ for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; - for (n = rcu_dereference_bh(nht->hash_buckets[chain]); - n != NULL; - n = rcu_dereference_bh(n->next)) + neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } - read_unlock(&tbl->lock); - rcu_read_unlock_bh(); + spin_unlock_bh(&tbl->lock); + rcu_read_unlock(); } EXPORT_SYMBOL(neigh_for_each); @@ -2362,29 +3143,25 @@ EXPORT_SYMBOL(neigh_for_each); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { - int chain; struct neigh_hash_table *nht; + int chain; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { + struct hlist_node *tmp; struct neighbour *n; - struct neighbour __rcu **np; - np = &nht->hash_buckets[chain]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); release = cb(n); if (release) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); - n->dead = 1; - } else - np = &n->next; + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); + neigh_mark_dead(n); + } write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); @@ -2393,45 +3170,101 @@ void __neigh_for_each_release(struct neigh_table *tbl, } EXPORT_SYMBOL(__neigh_for_each_release); +int neigh_xmit(int index, struct net_device *dev, + const void *addr, struct sk_buff *skb) +{ + int err = -EAFNOSUPPORT; + + if (likely(index < NEIGH_NR_TABLES)) { + struct neigh_table *tbl; + struct neighbour *neigh; + + rcu_read_lock(); + tbl = rcu_dereference(neigh_tables[index]); + if (!tbl) + goto out_unlock; + if (index == NEIGH_ARP_TABLE) { + u32 key = *((u32 *)addr); + + neigh = __ipv4_neigh_lookup_noref(dev, key); + } else { + neigh = __neigh_lookup_noref(tbl, addr, dev); + } + if (!neigh) + neigh = __neigh_create(tbl, addr, dev, false); + err = PTR_ERR(neigh); + if (IS_ERR(neigh)) { + rcu_read_unlock(); + goto out_kfree_skb; + } + err = READ_ONCE(neigh->output)(neigh, skb); +out_unlock: + rcu_read_unlock(); + } + else if (index == NEIGH_LINK_TABLE) { + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + addr, NULL, skb->len); + if (err < 0) + goto out_kfree_skb; + err = dev_queue_xmit(skb); + } +out: + return err; +out_kfree_skb: + kfree_skb(skb); + goto out; +} +EXPORT_SYMBOL(neigh_xmit); + #ifdef CONFIG_PROC_FS -static struct neighbour *neigh_get_first(struct seq_file *seq) +static struct neighbour *neigh_get_valid(struct seq_file *seq, + struct neighbour *n, + loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); + + if (!net_eq(dev_net(n->dev), net)) + return NULL; + + if (state->neigh_sub_iter) { + loff_t fakep = 0; + void *v; + + v = state->neigh_sub_iter(state, n, pos ? pos : &fakep); + if (!v) + return NULL; + if (pos) + return v; + } + + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + return n; + + if (READ_ONCE(n->nud_state) & ~NUD_NOARP) + return n; + + return NULL; +} + +static struct neighbour *neigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; struct neigh_hash_table *nht = state->nht; - struct neighbour *n = NULL; - int bucket = state->bucket; + struct neighbour *n, *tmp; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; - for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) { - n = rcu_dereference_bh(nht->hash_buckets[bucket]); - while (n) { - if (!net_eq(dev_net(n->dev), net)) - goto next; - if (state->neigh_sub_iter) { - loff_t fakep = 0; - void *v; - - v = state->neigh_sub_iter(state, n, &fakep); - if (!v) - goto next; - } - if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) - break; - if (n->nud_state & ~NUD_NOARP) - break; -next: - n = rcu_dereference_bh(n->next); + while (++state->bucket < (1 << nht->hash_shift)) { + neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) { + tmp = neigh_get_valid(seq, n, NULL); + if (tmp) + return tmp; } - - if (n) - break; } - state->bucket = bucket; - return n; + return NULL; } static struct neighbour *neigh_get_next(struct seq_file *seq, @@ -2439,46 +3272,28 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; - struct net *net = seq_file_net(seq); - struct neigh_hash_table *nht = state->nht; + struct neighbour *tmp; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); + if (v) return n; } - n = rcu_dereference_bh(n->next); - - while (1) { - while (n) { - if (!net_eq(dev_net(n->dev), net)) - goto next; - if (state->neigh_sub_iter) { - void *v = state->neigh_sub_iter(state, n, pos); - if (v) - return n; - goto next; - } - if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) - break; - if (n->nud_state & ~NUD_NOARP) - break; -next: - n = rcu_dereference_bh(n->next); + hlist_for_each_entry_continue(n, hash) { + tmp = neigh_get_valid(seq, n, pos); + if (tmp) { + n = tmp; + goto out; } - - if (n) - break; - - if (++state->bucket >= (1 << nht->hash_shift)) - break; - - n = rcu_dereference_bh(nht->hash_buckets[state->bucket]); } + n = neigh_get_first(seq); +out: if (n && pos) --(*pos); + return n; } @@ -2503,13 +3318,14 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; - int bucket = state->bucket; + int bucket; state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { - pn = tbl->phash_buckets[bucket]; + pn = rcu_dereference(tbl->phash_buckets[bucket]); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference(pn->next); if (pn) break; } @@ -2527,15 +3343,17 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct neigh_table *tbl = state->tbl; do { - pn = pn->next; + pn = rcu_dereference(pn->next); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; - pn = tbl->phash_buckets[state->bucket]; + + pn = rcu_dereference(tbl->phash_buckets[state->bucket]); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference(pn->next); if (pn) break; } @@ -2575,16 +3393,18 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) - __acquires(rcu_bh) + __acquires(tbl->lock) + __acquires(rcu) { struct neigh_seq_state *state = seq->private; state->tbl = tbl; - state->bucket = 0; + state->bucket = -1; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); - rcu_read_lock_bh(); - state->nht = rcu_dereference_bh(tbl->nht); + rcu_read_lock(); + state->nht = rcu_dereference(tbl->nht); + spin_lock_bh(&tbl->lock); return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } @@ -2618,9 +3438,14 @@ out: EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) - __releases(rcu_bh) + __releases(tbl->lock) + __releases(rcu) { - rcu_read_unlock_bh(); + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + + spin_unlock_bh(&tbl->lock); + rcu_read_unlock(); } EXPORT_SYMBOL(neigh_seq_stop); @@ -2628,7 +3453,7 @@ EXPORT_SYMBOL(neigh_seq_stop); static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { - struct neigh_table *tbl = seq->private; + struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; if (*pos == 0) @@ -2645,7 +3470,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct neigh_table *tbl = seq->private; + struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { @@ -2654,6 +3479,7 @@ static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } + (*pos)++; return NULL; } @@ -2664,16 +3490,17 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v) static int neigh_stat_seq_show(struct seq_file *seq, void *v) { - struct neigh_table *tbl = seq->private; + struct neigh_table *tbl = pde_data(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n"); + seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } - seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " - "%08lx %08lx %08lx %08lx %08lx\n", + seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " + "%08lx %08lx %08lx " + "%08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, @@ -2690,7 +3517,8 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) st->periodic_gc_runs, st->forced_gc_runs, - st->unres_discards + st->unres_discards, + st->table_fulls ); return 0; @@ -2702,48 +3530,22 @@ static const struct seq_operations neigh_stat_seq_ops = { .stop = neigh_stat_seq_stop, .show = neigh_stat_seq_show, }; - -static int neigh_stat_seq_open(struct inode *inode, struct file *file) -{ - int ret = seq_open(file, &neigh_stat_seq_ops); - - if (!ret) { - struct seq_file *sf = file->private_data; - sf->private = PDE_DATA(inode); - } - return ret; -}; - -static const struct file_operations neigh_stat_seq_fops = { - .owner = THIS_MODULE, - .open = neigh_stat_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - #endif /* CONFIG_PROC_FS */ -static inline size_t neigh_nlmsg_size(void) +static void __neigh_notify(struct neighbour *n, int type, int flags, + u32 pid) { - return NLMSG_ALIGN(sizeof(struct ndmsg)) - + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ - + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ - + nla_total_size(sizeof(struct nda_cacheinfo)) - + nla_total_size(4); /* NDA_PROBES */ -} - -static void __neigh_notify(struct neighbour *n, int type, int flags) -{ - struct net *net = dev_net(n->dev); struct sk_buff *skb; int err = -ENOBUFS; + struct net *net; + rcu_read_lock(); + net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; - err = neigh_fill_info(skb, n, 0, 0, type, flags); + err = neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2751,31 +3553,29 @@ static void __neigh_notify(struct neighbour *n, int type, int flags) goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); - return; + goto out; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +out: + rcu_read_unlock(); } -#ifdef CONFIG_ARPD void neigh_app_ns(struct neighbour *n) { - __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST); + __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); -#endif /* CONFIG_ARPD */ #ifdef CONFIG_SYSCTL -static int zero; static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); -static int proc_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_unres_qlen(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; - tmp.extra1 = &zero; + tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = &unres_qlen_max; tmp.data = &size; @@ -2787,125 +3587,198 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write, return ret; } -enum { - NEIGH_VAR_MCAST_PROBE, - NEIGH_VAR_UCAST_PROBE, - NEIGH_VAR_APP_PROBE, - NEIGH_VAR_RETRANS_TIME, - NEIGH_VAR_BASE_REACHABLE_TIME, - NEIGH_VAR_DELAY_PROBE_TIME, - NEIGH_VAR_GC_STALETIME, - NEIGH_VAR_QUEUE_LEN, - NEIGH_VAR_QUEUE_LEN_BYTES, - NEIGH_VAR_PROXY_QLEN, - NEIGH_VAR_ANYCAST_DELAY, - NEIGH_VAR_PROXY_DELAY, - NEIGH_VAR_LOCKTIME, - NEIGH_VAR_RETRANS_TIME_MS, - NEIGH_VAR_BASE_REACHABLE_TIME_MS, - NEIGH_VAR_GC_INTERVAL, - NEIGH_VAR_GC_THRESH1, - NEIGH_VAR_GC_THRESH2, - NEIGH_VAR_GC_THRESH3, - NEIGH_VAR_MAX -}; +static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, + int index) +{ + struct net_device *dev; + int family = neigh_parms_family(p); + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + struct neigh_parms *dst_p = + neigh_get_dev_parms_rcu(dev, family); + + if (dst_p && !test_bit(index, dst_p->data_state)) + dst_p->data[index] = p->data[index]; + } + rcu_read_unlock(); +} + +static void neigh_proc_update(const struct ctl_table *ctl, int write) +{ + struct net_device *dev = ctl->extra1; + struct neigh_parms *p = ctl->extra2; + struct net *net = neigh_parms_net(p); + int index = (int *) ctl->data - p->data; + + if (!write) + return; + + set_bit(index, p->data_state); + if (index == NEIGH_VAR_DELAY_PROBE_TIME) + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); + if (!dev) /* NULL dev means this is default value */ + neigh_copy_dflt_parms(net, p, index); +} + +static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + struct ctl_table tmp = *ctl; + int ret; + + tmp.extra1 = SYSCTL_ZERO; + tmp.extra2 = SYSCTL_INT_MAX; + + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + neigh_proc_update(ctl, write); + return ret; +} + +static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *ctl; + int ret; + + int min = msecs_to_jiffies(1); + + tmp.extra1 = &min; + tmp.extra2 = NULL; + + ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos); + neigh_proc_update(ctl, write); + return ret; +} + +int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec); + +int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); + +static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} + +int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); + +static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} + +static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + struct neigh_parms *p = ctl->extra2; + int ret; + + if (strcmp(ctl->procname, "base_reachable_time") == 0) + ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); + else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0) + ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); + else + ret = -1; + + if (write && ret == 0) { + /* update reachable_time as well, otherwise, the change will + * only be effective after the next time neigh_periodic_work + * decides to recompute it + */ + neigh_set_reach_time(p); + } + return ret; +} + +#define NEIGH_PARMS_DATA_OFFSET(index) \ + (&((struct neigh_parms *) 0)->data[index]) + +#define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ + [NEIGH_VAR_ ## attr] = { \ + .procname = name, \ + .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ + .maxlen = sizeof(int), \ + .mode = mval, \ + .proc_handler = proc, \ + } + +#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) + +#define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) + +#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) + +#define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive) + +#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) + +#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen) static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; + struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { - [NEIGH_VAR_MCAST_PROBE] = { - .procname = "mcast_solicit", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - [NEIGH_VAR_UCAST_PROBE] = { - .procname = "ucast_solicit", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - [NEIGH_VAR_APP_PROBE] = { - .procname = "app_solicit", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - [NEIGH_VAR_RETRANS_TIME] = { - .procname = "retrans_time", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_userhz_jiffies, - }, - [NEIGH_VAR_BASE_REACHABLE_TIME] = { - .procname = "base_reachable_time", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NEIGH_VAR_DELAY_PROBE_TIME] = { - .procname = "delay_first_probe_time", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NEIGH_VAR_GC_STALETIME] = { - .procname = "gc_stale_time", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NEIGH_VAR_QUEUE_LEN] = { - .procname = "unres_qlen", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_unres_qlen, - }, - [NEIGH_VAR_QUEUE_LEN_BYTES] = { - .procname = "unres_qlen_bytes", - .maxlen = sizeof(int), - .mode = 0644, - .extra1 = &zero, - .proc_handler = proc_dointvec_minmax, - }, - [NEIGH_VAR_PROXY_QLEN] = { - .procname = "proxy_qlen", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - [NEIGH_VAR_ANYCAST_DELAY] = { - .procname = "anycast_delay", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_userhz_jiffies, - }, - [NEIGH_VAR_PROXY_DELAY] = { - .procname = "proxy_delay", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_userhz_jiffies, - }, - [NEIGH_VAR_LOCKTIME] = { - .procname = "locktime", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_userhz_jiffies, - }, - [NEIGH_VAR_RETRANS_TIME_MS] = { - .procname = "retrans_time_ms", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, - [NEIGH_VAR_BASE_REACHABLE_TIME_MS] = { - .procname = "base_reachable_time_ms", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), + NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), + NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), + NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS, + "interval_probe_time_ms"), + NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), + NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), + NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), + NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"), [NEIGH_VAR_GC_INTERVAL] = { .procname = "gc_interval", .maxlen = sizeof(int), @@ -2916,88 +3789,103 @@ static struct neigh_sysctl_table { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + .proc_handler = proc_dointvec_minmax, }, - {}, }, }; int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, - char *p_name, proc_handler *handler) + proc_handler *handler) { + int i; struct neigh_sysctl_table *t; - const char *dev_name_source = NULL; + const char *dev_name_source; char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; + char *p_name; + size_t neigh_vars_size; - t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL); + t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto err; - t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data = &p->mcast_probes; - t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data = &p->ucast_probes; - t->neigh_vars[NEIGH_VAR_APP_PROBE].data = &p->app_probes; - t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data = &p->retrans_time; - t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data = &p->base_reachable_time; - t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data = &p->delay_probe_time; - t->neigh_vars[NEIGH_VAR_GC_STALETIME].data = &p->gc_staletime; - t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data = &p->queue_len_bytes; - t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data = &p->queue_len_bytes; - t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data = &p->proxy_qlen; - t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data = &p->anycast_delay; - t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay; - t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime; - t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data = &p->retrans_time; - t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data = &p->base_reachable_time; + for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { + t->neigh_vars[i].data += (long) p; + t->neigh_vars[i].extra1 = dev; + t->neigh_vars[i].extra2 = p; + } + neigh_vars_size = ARRAY_SIZE(t->neigh_vars); if (dev) { dev_name_source = dev->name; /* Terminate the table early */ - memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, - sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); + neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { + struct neigh_table *tbl = p->tbl; dev_name_source = "default"; - t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1); - t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1; - t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2; - t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3; + t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; + t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; + t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; + t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } - if (handler) { /* RetransTime */ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; - t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev; /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; - t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev; /* RetransTime (in milliseconds)*/ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; - t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; - t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev; + } else { + /* Those handlers will update p->reachable_time after + * base_reachable_time(_ms) is set to ensure the new timer starts being + * applied after the next neighbour update instead of waiting for + * neigh_periodic_work to update its value (can be multiple minutes) + * So any handler that replaces them should do this as well + */ + /* ReachableTime */ + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = + neigh_proc_base_reachable_time; + /* ReachableTime (in milliseconds) */ + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = + neigh_proc_base_reachable_time; } - /* Don't export sysctls to unprivileged users */ - if (neigh_parms_net(p)->user_ns != &init_user_ns) - t->neigh_vars[0].procname = NULL; + switch (neigh_parms_family(p)) { + case AF_INET: + p_name = "ipv4"; + break; + case AF_INET6: + p_name = "ipv6"; + break; + default: + BUG(); + } snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", p_name, dev_name_source); - t->sysctl_header = - register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars); + t->sysctl_header = register_net_sysctl_sz(neigh_parms_net(p), + neigh_path, t->neigh_vars, + neigh_vars_size); if (!t->sysctl_header) goto free; @@ -3024,18 +3912,21 @@ EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ +static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, + {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, + {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, + {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, +}; + static int __init neigh_init(void) { - rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL); - - rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, - NULL); - rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, NULL); - + rtnl_register_many(neigh_rtnl_msg_handlers); return 0; } subsys_initcall(neigh_init); - diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 2bf83299600a..70e0e9a3b650 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -1,54 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/wext.h> +#include <net/hotdata.h> -#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) +#include "dev.h" -#define get_bucket(x) ((x) >> BUCKET_SPACE) -#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) -#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) - -extern struct list_head ptype_all __read_mostly; -extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; - -static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) +static void *dev_seq_from_index(struct seq_file *seq, loff_t *pos) { - struct net *net = seq_file_net(seq); + unsigned long ifindex = *pos; struct net_device *dev; - struct hlist_head *h; - unsigned int count = 0, offset = get_offset(*pos); - h = &net->dev_name_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, h, name_hlist) { - if (++count == offset) - return dev; + for_each_netdev_dump(seq_file_net(seq), dev, ifindex) { + *pos = dev->ifindex; + return dev; } - return NULL; } -static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) -{ - struct net_device *dev; - unsigned int bucket; - - do { - dev = dev_from_same_bucket(seq, pos); - if (dev) - return dev; - - bucket = get_bucket(*pos) + 1; - *pos = set_bucket_offset(bucket, 1); - } while (bucket < NETDEV_HASHENTRIES); - - return NULL; -} - -/* - * This is invoked by the /proc filesystem handler to display a device - * in detail. - */ static void *dev_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { @@ -56,16 +26,13 @@ static void *dev_seq_start(struct seq_file *seq, loff_t *pos) if (!*pos) return SEQ_START_TOKEN; - if (get_bucket(*pos) >= NETDEV_HASHENTRIES) - return NULL; - - return dev_from_bucket(seq, pos); + return dev_seq_from_index(seq, pos); } static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return dev_from_bucket(seq, pos); + return dev_seq_from_index(seq, pos); } static void dev_seq_stop(struct seq_file *seq, void *v) @@ -115,6 +82,16 @@ static int dev_seq_show(struct seq_file *seq, void *v) return 0; } +static u32 softnet_input_pkt_queue_len(struct softnet_data *sd) +{ + return skb_queue_len_lockless(&sd->input_pkt_queue); +} + +static u32 softnet_process_queue_len(struct softnet_data *sd) +{ + return skb_queue_len_lockless(&sd->process_queue); +} + static struct softnet_data *softnet_get_online(loff_t *pos) { struct softnet_data *sd = NULL; @@ -146,6 +123,8 @@ static void softnet_seq_stop(struct seq_file *seq, void *v) static int softnet_seq_show(struct seq_file *seq, void *v) { struct softnet_data *sd = v; + u32 input_qlen = softnet_input_pkt_queue_len(sd); + u32 process_qlen = softnet_process_queue_len(sd); unsigned int flow_limit_count = 0; #ifdef CONFIG_NET_FLOW_LIMIT @@ -153,16 +132,27 @@ static int softnet_seq_show(struct seq_file *seq, void *v) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); + /* Pairs with WRITE_ONCE() in skb_flow_limit() */ if (fl) - flow_limit_count = fl->count; + flow_limit_count = READ_ONCE(fl->count); rcu_read_unlock(); #endif + /* the index is the CPU id owing this sd. Since offline CPUs are not + * displayed, it would be othrwise not trivial for the user-space + * mapping the data a specific CPU + */ seq_printf(seq, - "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - sd->processed, sd->dropped, sd->time_squeeze, 0, + "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " + "%08x %08x\n", + READ_ONCE(sd->processed), + numa_drop_read(&sd->drop_counters), + READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ - sd->cpu_collision, sd->received_rps, flow_limit_count); + 0, /* was cpu_collision */ + READ_ONCE(sd->received_rps), flow_limit_count, + input_qlen + process_qlen, (int)seq->index, + input_qlen, process_qlen); return 0; } @@ -173,20 +163,6 @@ static const struct seq_operations dev_seq_ops = { .show = dev_seq_show, }; -static int dev_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &dev_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations dev_seq_fops = { - .owner = THIS_MODULE, - .open = dev_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - static const struct seq_operations softnet_seq_ops = { .start = softnet_seq_start, .next = softnet_seq_next, @@ -194,26 +170,30 @@ static const struct seq_operations softnet_seq_ops = { .show = softnet_seq_show, }; -static int softnet_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &softnet_seq_ops); -} - -static const struct file_operations softnet_seq_fops = { - .owner = THIS_MODULE, - .open = softnet_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static void *ptype_get_idx(loff_t pos) +static void *ptype_get_idx(struct seq_file *seq, loff_t pos) { + struct list_head *ptype_list = NULL; struct packet_type *pt = NULL; + struct net_device *dev; loff_t i = 0; int t; - list_for_each_entry_rcu(pt, &ptype_all, list) { + for_each_netdev_rcu(seq_file_net(seq), dev) { + ptype_list = &dev->ptype_all; + list_for_each_entry_rcu(pt, ptype_list, list) { + if (i == pos) + return pt; + ++i; + } + } + + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) { + if (i == pos) + return pt; + ++i; + } + + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) { if (i == pos) return pt; ++i; @@ -233,24 +213,50 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); - return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; + return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net *net = seq_file_net(seq); + struct net_device *dev; struct packet_type *pt; struct list_head *nxt; int hash; ++*pos; if (v == SEQ_START_TOKEN) - return ptype_get_idx(0); + return ptype_get_idx(seq, 0); pt = v; nxt = pt->list.next; - if (pt->type == htons(ETH_P_ALL)) { - if (nxt != &ptype_all) + if (pt->dev) { + if (nxt != &pt->dev->ptype_all) + goto found; + + dev = pt->dev; + for_each_netdev_continue_rcu(seq_file_net(seq), dev) { + if (!list_empty(&dev->ptype_all)) { + nxt = dev->ptype_all.next; + goto found; + } + } + nxt = net->ptype_all.next; + goto net_ptype_all; + } + + if (pt->af_packet_net) { +net_ptype_all: + if (nxt != &net->ptype_all && nxt != &net->ptype_specific) goto found; + + if (nxt == &net->ptype_all) { + /* continue with ->ptype_specific if it's not empty */ + nxt = net->ptype_specific.next; + if (nxt != &net->ptype_specific) + goto found; + } + hash = 0; nxt = ptype_base[0].next; } else @@ -277,13 +283,14 @@ static int ptype_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, "Type Device Function\n"); - else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { + else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) && + (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else seq_printf(seq, "%04x", ntohs(pt->type)); - seq_printf(seq, " %-8s %pf\n", + seq_printf(seq, " %-8s %ps\n", pt->dev ? pt->dev->name : "", pt->func); } @@ -297,31 +304,18 @@ static const struct seq_operations ptype_seq_ops = { .show = ptype_seq_show, }; -static int ptype_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &ptype_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations ptype_seq_fops = { - .owner = THIS_MODULE, - .open = ptype_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - - static int __net_init dev_proc_net_init(struct net *net) { int rc = -ENOMEM; - if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops)) + if (!proc_create_net("dev", 0444, net->proc_net, &dev_seq_ops, + sizeof(struct seq_net_private))) goto out; - if (!proc_create("softnet_stat", S_IRUGO, net->proc_net, - &softnet_seq_fops)) + if (!proc_create_seq("softnet_stat", 0444, net->proc_net, + &softnet_seq_ops)) goto out_dev; - if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops)) + if (!proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops, + sizeof(struct seq_net_private))) goto out_softnet; if (wext_proc_init(net)) @@ -362,15 +356,10 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) netif_addr_lock_bh(dev); netdev_for_each_mc_addr(ha, dev) { - int i; - - seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, - dev->name, ha->refcount, ha->global_use); - - for (i = 0; i < dev->addr_len; i++) - seq_printf(seq, "%02x", ha->addr[i]); - - seq_putc(seq, '\n'); + seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n", + dev->ifindex, dev->name, + ha->refcount, ha->global_use, + (int)dev->addr_len, ha->addr); } netif_addr_unlock_bh(dev); return 0; @@ -383,23 +372,10 @@ static const struct seq_operations dev_mc_seq_ops = { .show = dev_mc_seq_show, }; -static int dev_mc_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &dev_mc_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations dev_mc_seq_fops = { - .owner = THIS_MODULE, - .open = dev_mc_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - static int __net_init dev_mc_net_init(struct net *net) { - if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops)) + if (!proc_create_net("dev_mcast", 0, net->proc_net, &dev_mc_seq_ops, + sizeof(struct seq_net_private))) return -ENOMEM; return 0; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 981fed397d1d..ca878525ad7c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * net-sysfs.c - network device class and attributes * * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/capability.h> @@ -14,6 +10,8 @@ #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> +#include <linux/sched/signal.h> +#include <linux/sched/isolation.h> #include <linux/nsproxy.h> #include <net/sock.h> #include <net/net_namespace.h> @@ -22,20 +20,108 @@ #include <linux/export.h> #include <linux/jiffies.h> #include <linux/pm_runtime.h> - +#include <linux/of.h> +#include <linux/of_net.h> +#include <linux/cpu.h> +#include <net/netdev_lock.h> +#include <net/netdev_rx_queue.h> +#include <net/rps.h> + +#include "dev.h" #include "net-sysfs.h" #ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; -static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; -static const char fmt_udec[] = "%u\n"; +static const char fmt_uint[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; +/* Caller holds RTNL, netdev->lock or RCU */ static inline int dev_isalive(const struct net_device *dev) { - return dev->reg_state <= NETREG_REGISTERED; + return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; +} + +/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active, + * when unregistering a net device and accessing associated sysfs files. The + * potential deadlock is as follow: + * + * CPU 0 CPU 1 + * + * rtnl_lock vfs_read + * unregister_netdevice_many kernfs_seq_start + * device_del / kobject_put kernfs_get_active (kn->active++) + * kernfs_drain sysfs_kf_seq_show + * wait_event( rtnl_lock + * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release + * -> waits on CPU 1 to decrease kn->active the rtnl lock. + * + * The historical fix was to use rtnl_trylock with restart_syscall to bail out + * of sysfs operations when the lock couldn't be taken. This fixed the above + * issue as it allowed CPU 1 to bail out of the ABBA situation. + * + * But it came with performances issues, as syscalls are being restarted in + * loops when there was contention on the rtnl lock, with huge slow downs in + * specific scenarios (e.g. lots of virtual interfaces created and userspace + * daemons querying their attributes). + * + * The idea below is to bail out of the active kernfs_node protection + * (kn->active) while trying to take the rtnl lock. + * + * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The + * net device is guaranteed to be alive if this returns successfully. + */ +static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr, + struct net_device *ndev) +{ + struct kernfs_node *kn; + int ret = 0; + + /* First, we hold a reference to the net device as the unregistration + * path might run in parallel. This will ensure the net device and the + * associated sysfs objects won't be freed while we try to take the rtnl + * lock. + */ + dev_hold(ndev); + /* sysfs_break_active_protection was introduced to allow self-removal of + * devices and their associated sysfs files by bailing out of the + * sysfs/kernfs protection. We do this here to allow the unregistration + * path to complete in parallel. The following takes a reference on the + * kobject and the kernfs_node being accessed. + * + * This works because we hold a reference onto the net device and the + * unregistration path will wait for us eventually in netdev_run_todo + * (outside an rtnl lock section). + */ + kn = sysfs_break_active_protection(kobj, attr); + /* We can now try to take the rtnl lock. This can't deadlock us as the + * unregistration path is able to drain sysfs files (kernfs_node) thanks + * to the above dance. + */ + if (rtnl_lock_interruptible()) { + ret = -ERESTARTSYS; + goto unbreak; + } + /* Check dismantle on the device hasn't started, otherwise deny the + * operation. + */ + if (!dev_isalive(ndev)) { + rtnl_unlock(); + ret = -ENODEV; + goto unbreak; + } + /* We are now sure the device dismantle hasn't started nor that it can + * start before we exit the locking section as we hold the rtnl lock. + * There's no need to keep unbreaking the sysfs protection nor to hold + * a net device reference from that point; that was only needed to take + * the rtnl lock. + */ +unbreak: + sysfs_unbreak_active_protection(kn); + dev_put(ndev); + + return ret; } /* use same locking rules as GIF* ioctl's */ @@ -43,29 +129,36 @@ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, ssize_t (*format)(const struct net_device *, char *)) { - struct net_device *net = to_net_dev(dev); + struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); - if (dev_isalive(net)) - ret = (*format)(net, buf); - read_unlock(&dev_base_lock); + rcu_read_lock(); + if (dev_isalive(ndev)) + ret = (*format)(ndev, buf); + rcu_read_unlock(); return ret; } /* generate a show function for simple field */ #define NETDEVICE_SHOW(field, format_string) \ -static ssize_t format_##field(const struct net_device *net, char *buf) \ +static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sprintf(buf, format_string, net->field); \ + return sysfs_emit(buf, format_string, READ_ONCE(dev->field)); \ } \ -static ssize_t show_##field(struct device *dev, \ +static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ return netdev_show(dev, attr, buf, format_##field); \ -} +} \ + +#define NETDEVICE_SHOW_RO(field, format_string) \ +NETDEVICE_SHOW(field, format_string); \ +static DEVICE_ATTR_RO(field) +#define NETDEVICE_SHOW_RW(field, format_string) \ +NETDEVICE_SHOW(field, format_string); \ +static DEVICE_ATTR_RW(field) /* use same locking and permission rules as SIF* ioctl's */ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, @@ -75,7 +168,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); unsigned long new; - int ret = -EINVAL; + int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -84,104 +177,216 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, if (ret) goto err; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + goto err; + + ret = (*set)(netdev, new); + if (ret == 0) + ret = len; + + rtnl_unlock(); + err: + return ret; +} + +/* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */ +static ssize_t +netdev_lock_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len, + int (*set)(struct net_device *, unsigned long)) +{ + struct net_device *netdev = to_net_dev(dev); + struct net *net = dev_net(netdev); + unsigned long new; + int ret; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + ret = kstrtoul(buf, 0, &new); + if (ret) + return ret; + + netdev_lock(netdev); if (dev_isalive(netdev)) { - if ((ret = (*set)(netdev, new)) == 0) + ret = (*set)(netdev, new); + if (ret == 0) ret = len; } - rtnl_unlock(); - err: + netdev_unlock(netdev); + return ret; } -NETDEVICE_SHOW(dev_id, fmt_hex); -NETDEVICE_SHOW(addr_assign_type, fmt_dec); -NETDEVICE_SHOW(addr_len, fmt_dec); -NETDEVICE_SHOW(iflink, fmt_dec); -NETDEVICE_SHOW(ifindex, fmt_dec); -NETDEVICE_SHOW(type, fmt_dec); -NETDEVICE_SHOW(link_mode, fmt_dec); +NETDEVICE_SHOW_RO(dev_id, fmt_hex); +NETDEVICE_SHOW_RO(dev_port, fmt_dec); +NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); +NETDEVICE_SHOW_RO(addr_len, fmt_dec); +NETDEVICE_SHOW_RO(ifindex, fmt_dec); +NETDEVICE_SHOW_RO(type, fmt_dec); +NETDEVICE_SHOW_RO(link_mode, fmt_dec); -/* use same locking rules as GIFHWADDR ioctl's */ -static ssize_t show_address(struct device *dev, struct device_attribute *attr, +static ssize_t iflink_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + + return sysfs_emit(buf, fmt_dec, dev_get_iflink(ndev)); +} +static DEVICE_ATTR_RO(iflink); + +static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) +{ + return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type)); +} + +static ssize_t name_assign_type_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN) + ret = netdev_show(dev, attr, buf, format_name_assign_type); + + return ret; +} +static DEVICE_ATTR_RO(name_assign_type); + +/* use same locking rules as GIFHWADDR ioctl's (netif_get_mac_address()) */ +static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct net_device *net = to_net_dev(dev); + struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); - if (dev_isalive(net)) - ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len); - read_unlock(&dev_base_lock); + down_read(&dev_addr_sem); + + rcu_read_lock(); + if (dev_isalive(ndev)) + ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); + rcu_read_unlock(); + + up_read(&dev_addr_sem); return ret; } +static DEVICE_ATTR_RO(address); -static ssize_t show_broadcast(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t broadcast_show(struct device *dev, + struct device_attribute *attr, char *buf) { - struct net_device *net = to_net_dev(dev); - if (dev_isalive(net)) - return sysfs_format_mac(buf, net->broadcast, net->addr_len); - return -EINVAL; + struct net_device *ndev = to_net_dev(dev); + int ret = -EINVAL; + + rcu_read_lock(); + if (dev_isalive(ndev)) + ret = sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); + rcu_read_unlock(); + return ret; } +static DEVICE_ATTR_RO(broadcast); -static int change_carrier(struct net_device *net, unsigned long new_carrier) +static int change_carrier(struct net_device *dev, unsigned long new_carrier) { - if (!netif_running(net)) + if (!netif_running(dev)) return -EINVAL; - return dev_change_carrier(net, (bool) new_carrier); + return dev_change_carrier(dev, (bool)new_carrier); } -static ssize_t store_carrier(struct device *dev, struct device_attribute *attr, - const char *buf, size_t len) +static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) { + struct net_device *netdev = to_net_dev(dev); + + /* The check is also done in change_carrier; this helps returning early + * without hitting the locking section in netdev_store. + */ + if (!netdev->netdev_ops->ndo_change_carrier) + return -EOPNOTSUPP; + return netdev_store(dev, attr, buf, len, change_carrier); } -static ssize_t show_carrier(struct device *dev, +static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); + int ret; + + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + + ret = -EINVAL; if (netif_running(netdev)) { - return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev)); + /* Synchronize carrier state with link watch, + * see also rtnl_getlink(). + */ + linkwatch_sync_dev(netdev); + + ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); } - return -EINVAL; + + rtnl_unlock(); + return ret; } +static DEVICE_ATTR_RW(carrier); -static ssize_t show_speed(struct device *dev, +static ssize_t speed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; - if (!rtnl_trylock()) - return restart_syscall(); + /* The check is also done in __ethtool_get_link_ksettings; this helps + * returning early without hitting the locking section below. + */ + if (!netdev->ethtool_ops->get_link_ksettings) + return ret; + + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + ret = -EINVAL; if (netif_running(netdev)) { - struct ethtool_cmd cmd; - if (!__ethtool_get_settings(netdev, &cmd)) - ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd)); + struct ethtool_link_ksettings cmd; + + if (!__ethtool_get_link_ksettings(netdev, &cmd)) + ret = sysfs_emit(buf, fmt_dec, cmd.base.speed); } rtnl_unlock(); return ret; } +static DEVICE_ATTR_RO(speed); -static ssize_t show_duplex(struct device *dev, +static ssize_t duplex_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; - if (!rtnl_trylock()) - return restart_syscall(); + /* The check is also done in __ethtool_get_link_ksettings; this helps + * returning early without hitting the locking section below. + */ + if (!netdev->ethtool_ops->get_link_ksettings) + return ret; + + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + ret = -EINVAL; if (netif_running(netdev)) { - struct ethtool_cmd cmd; - if (!__ethtool_get_settings(netdev, &cmd)) { + struct ethtool_link_ksettings cmd; + + if (!__ethtool_get_link_ksettings(netdev, &cmd)) { const char *duplex; - switch (cmd.duplex) { + + switch (cmd.base.duplex) { case DUPLEX_HALF: duplex = "half"; break; @@ -192,98 +397,173 @@ static ssize_t show_duplex(struct device *dev, duplex = "unknown"; break; } - ret = sprintf(buf, "%s\n", duplex); + ret = sysfs_emit(buf, "%s\n", duplex); } } rtnl_unlock(); return ret; } +static DEVICE_ATTR_RO(duplex); -static ssize_t show_dormant(struct device *dev, +static ssize_t testing_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) - return sprintf(buf, fmt_dec, !!netif_dormant(netdev)); + return sysfs_emit(buf, fmt_dec, !!netif_testing(netdev)); return -EINVAL; } +static DEVICE_ATTR_RO(testing); + +static ssize_t dormant_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + if (netif_running(netdev)) + return sysfs_emit(buf, fmt_dec, !!netif_dormant(netdev)); + + return -EINVAL; +} +static DEVICE_ATTR_RO(dormant); static const char *const operstates[] = { "unknown", "notpresent", /* currently unused */ "down", "lowerlayerdown", - "testing", /* currently unused */ + "testing", "dormant", "up" }; -static ssize_t show_operstate(struct device *dev, +static ssize_t operstate_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); unsigned char operstate; - read_lock(&dev_base_lock); - operstate = netdev->operstate; + operstate = READ_ONCE(netdev->operstate); if (!netif_running(netdev)) operstate = IF_OPER_DOWN; - read_unlock(&dev_base_lock); if (operstate >= ARRAY_SIZE(operstates)) return -EINVAL; /* should not happen */ - return sprintf(buf, "%s\n", operstates[operstate]); + return sysfs_emit(buf, "%s\n", operstates[operstate]); +} +static DEVICE_ATTR_RO(operstate); + +static ssize_t carrier_changes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sysfs_emit(buf, fmt_dec, + atomic_read(&netdev->carrier_up_count) + + atomic_read(&netdev->carrier_down_count)); +} +static DEVICE_ATTR_RO(carrier_changes); + +static ssize_t carrier_up_count_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); +} +static DEVICE_ATTR_RO(carrier_up_count); + +static ssize_t carrier_down_count_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); } +static DEVICE_ATTR_RO(carrier_down_count); /* read-write attributes */ -NETDEVICE_SHOW(mtu, fmt_dec); -static int change_mtu(struct net_device *net, unsigned long new_mtu) +static int change_mtu(struct net_device *dev, unsigned long new_mtu) { - return dev_set_mtu(net, (int) new_mtu); + return dev_set_mtu(dev, (int)new_mtu); } -static ssize_t store_mtu(struct device *dev, struct device_attribute *attr, +static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_mtu); } +NETDEVICE_SHOW_RW(mtu, fmt_dec); -NETDEVICE_SHOW(flags, fmt_hex); - -static int change_flags(struct net_device *net, unsigned long new_flags) +static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(net, (unsigned int) new_flags); + return dev_change_flags(dev, (unsigned int)new_flags, NULL); } -static ssize_t store_flags(struct device *dev, struct device_attribute *attr, +static ssize_t flags_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_flags); } +NETDEVICE_SHOW_RW(flags, fmt_hex); + +static ssize_t tx_queue_len_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; -NETDEVICE_SHOW(tx_queue_len, fmt_ulong); + return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len); +} +NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); -static int change_tx_queue_len(struct net_device *net, unsigned long new_len) +static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { - net->tx_queue_len = new_len; + netdev_set_gro_flush_timeout(dev, val); return 0; } -static ssize_t store_tx_queue_len(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) +static ssize_t gro_flush_timeout_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) { if (!capable(CAP_NET_ADMIN)) return -EPERM; - return netdev_store(dev, attr, buf, len, change_tx_queue_len); + return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout); } +NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); -static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr, +static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) +{ + if (val > S32_MAX) + return -ERANGE; + + netdev_set_defer_hard_irqs(dev, (u32)val); + return 0; +} + +static ssize_t napi_defer_hard_irqs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_lock_store(dev, attr, buf, len, + change_napi_defer_hard_irqs); +} +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint); + +static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct net_device *netdev = to_net_dev(dev); @@ -298,65 +578,231 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + ret = dev_set_alias(netdev, buf, count); + if (ret < 0) + goto err; + ret = len; + netdev_state_change(netdev); +err: rtnl_unlock(); - return ret < 0 ? ret : len; + return ret; } -static ssize_t show_ifalias(struct device *dev, +static ssize_t ifalias_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); - ssize_t ret = 0; + char tmp[IFALIASZ]; + ssize_t ret; - if (!rtnl_trylock()) - return restart_syscall(); - if (netdev->ifalias) - ret = sprintf(buf, "%s\n", netdev->ifalias); - rtnl_unlock(); + ret = dev_get_alias(netdev, tmp, sizeof(tmp)); + if (ret > 0) + ret = sysfs_emit(buf, "%s\n", tmp); return ret; } +static DEVICE_ATTR_RW(ifalias); -NETDEVICE_SHOW(group, fmt_dec); - -static int change_group(struct net_device *net, unsigned long new_group) +static int change_group(struct net_device *dev, unsigned long new_group) { - dev_set_group(net, (int) new_group); + dev_set_group(dev, (int)new_group); return 0; } -static ssize_t store_group(struct device *dev, struct device_attribute *attr, - const char *buf, size_t len) +static ssize_t group_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_group); } +NETDEVICE_SHOW(group, fmt_dec); +static DEVICE_ATTR(netdev_group, 0644, group_show, group_store); + +static int change_proto_down(struct net_device *dev, unsigned long proto_down) +{ + return dev_change_proto_down(dev, (bool)proto_down); +} + +static ssize_t proto_down_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_proto_down); +} +NETDEVICE_SHOW_RW(proto_down, fmt_dec); + +static ssize_t phys_port_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + struct netdev_phys_item_id ppid; + ssize_t ret; + + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + + ret = dev_get_phys_port_id(netdev, &ppid); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); + + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_port_id); + +static ssize_t phys_port_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + char name[IFNAMSIZ]; + ssize_t ret; + + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sysfs_emit(buf, "%s\n", name); + + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_port_name); + +static ssize_t phys_switch_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + struct netdev_phys_item_id ppid = { }; + ssize_t ret; + + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + + ret = netif_get_port_parent_id(netdev, &ppid, false); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); + + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_switch_id); + +static struct attribute *netdev_phys_attrs[] __ro_after_init = { + &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, + &dev_attr_phys_switch_id.attr, + NULL, +}; + +static umode_t netdev_phys_is_visible(struct kobject *kobj, + struct attribute *attr, int index) +{ + struct device *dev = kobj_to_dev(kobj); + struct net_device *netdev = to_net_dev(dev); + + if (attr == &dev_attr_phys_port_id.attr) { + if (!netdev->netdev_ops->ndo_get_phys_port_id) + return 0; + } else if (attr == &dev_attr_phys_port_name.attr) { + if (!netdev->netdev_ops->ndo_get_phys_port_name && + !netdev->devlink_port) + return 0; + } else if (attr == &dev_attr_phys_switch_id.attr) { + if (!netdev->netdev_ops->ndo_get_port_parent_id && + !netdev->devlink_port) + return 0; + } + + return attr->mode; +} + +static const struct attribute_group netdev_phys_group = { + .attrs = netdev_phys_attrs, + .is_visible = netdev_phys_is_visible, +}; + +static ssize_t threaded_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + rcu_read_lock(); + + if (dev_isalive(netdev)) + ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded)); + + rcu_read_unlock(); + + return ret; +} + +static int modify_napi_threaded(struct net_device *dev, unsigned long val) +{ + int ret; -static struct device_attribute net_class_attributes[] = { - __ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL), - __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), - __ATTR(dev_id, S_IRUGO, show_dev_id, NULL), - __ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias), - __ATTR(iflink, S_IRUGO, show_iflink, NULL), - __ATTR(ifindex, S_IRUGO, show_ifindex, NULL), - __ATTR(type, S_IRUGO, show_type, NULL), - __ATTR(link_mode, S_IRUGO, show_link_mode, NULL), - __ATTR(address, S_IRUGO, show_address, NULL), - __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), - __ATTR(carrier, S_IRUGO | S_IWUSR, show_carrier, store_carrier), - __ATTR(speed, S_IRUGO, show_speed, NULL), - __ATTR(duplex, S_IRUGO, show_duplex, NULL), - __ATTR(dormant, S_IRUGO, show_dormant, NULL), - __ATTR(operstate, S_IRUGO, show_operstate, NULL), - __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu), - __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags), - __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, - store_tx_queue_len), - __ATTR(netdev_group, S_IRUGO | S_IWUSR, show_group, store_group), - {} + if (list_empty(&dev->napi_list)) + return -EOPNOTSUPP; + + if (val != 0 && val != 1) + return -EOPNOTSUPP; + + ret = netif_set_threaded(dev, val); + + return ret; +} + +static ssize_t threaded_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded); +} +static DEVICE_ATTR_RW(threaded); + +static struct attribute *net_class_attrs[] __ro_after_init = { + &dev_attr_netdev_group.attr, + &dev_attr_type.attr, + &dev_attr_dev_id.attr, + &dev_attr_dev_port.attr, + &dev_attr_iflink.attr, + &dev_attr_ifindex.attr, + &dev_attr_name_assign_type.attr, + &dev_attr_addr_assign_type.attr, + &dev_attr_addr_len.attr, + &dev_attr_link_mode.attr, + &dev_attr_address.attr, + &dev_attr_broadcast.attr, + &dev_attr_speed.attr, + &dev_attr_duplex.attr, + &dev_attr_dormant.attr, + &dev_attr_testing.attr, + &dev_attr_operstate.attr, + &dev_attr_carrier_changes.attr, + &dev_attr_ifalias.attr, + &dev_attr_carrier.attr, + &dev_attr_mtu.attr, + &dev_attr_flags.attr, + &dev_attr_tx_queue_len.attr, + &dev_attr_gro_flush_timeout.attr, + &dev_attr_napi_defer_hard_irqs.attr, + &dev_attr_proto_down.attr, + &dev_attr_carrier_up_count.attr, + &dev_attr_carrier_down_count.attr, + &dev_attr_threaded.attr, + NULL, }; +ATTRIBUTE_GROUPS(net_class); /* Show a given an attribute in the statistics group */ static ssize_t netstat_show(const struct device *d, @@ -367,28 +813,28 @@ static ssize_t netstat_show(const struct device *d, ssize_t ret = -EINVAL; WARN_ON(offset > sizeof(struct rtnl_link_stats64) || - offset % sizeof(u64) != 0); + offset % sizeof(u64) != 0); - read_lock(&dev_base_lock); + rcu_read_lock(); if (dev_isalive(dev)) { struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); - ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset)); + ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } /* generate a read-only statistics attribute */ #define NETSTAT_ENTRY(name) \ -static ssize_t show_##name(struct device *d, \ - struct device_attribute *attr, char *buf) \ +static ssize_t name##_show(struct device *d, \ + struct device_attribute *attr, char *buf) \ { \ return netstat_show(d, attr, buf, \ offsetof(struct rtnl_link_stats64, name)); \ } \ -static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) +static DEVICE_ATTR_RO(name) NETSTAT_ENTRY(rx_packets); NETSTAT_ENTRY(tx_packets); @@ -413,8 +859,9 @@ NETSTAT_ENTRY(tx_heartbeat_errors); NETSTAT_ENTRY(tx_window_errors); NETSTAT_ENTRY(rx_compressed); NETSTAT_ENTRY(tx_compressed); +NETSTAT_ENTRY(rx_nohandler); -static struct attribute *netstat_attrs[] = { +static struct attribute *netstat_attrs[] __ro_after_init = { &dev_attr_rx_packets.attr, &dev_attr_tx_packets.attr, &dev_attr_rx_bytes.attr, @@ -438,65 +885,69 @@ static struct attribute *netstat_attrs[] = { &dev_attr_tx_window_errors.attr, &dev_attr_rx_compressed.attr, &dev_attr_tx_compressed.attr, + &dev_attr_rx_nohandler.attr, NULL }; - -static struct attribute_group netstat_group = { +static const struct attribute_group netstat_group = { .name = "statistics", .attrs = netstat_attrs, }; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) static struct attribute *wireless_attrs[] = { NULL }; -static struct attribute_group wireless_group = { +static const struct attribute_group wireless_group = { .name = "wireless", .attrs = wireless_attrs, }; + +static bool wireless_group_needed(struct net_device *ndev) +{ +#if IS_ENABLED(CONFIG_CFG80211) + if (ndev->ieee80211_ptr) + return true; +#endif +#if IS_ENABLED(CONFIG_WIRELESS_EXT) + if (ndev->wireless_handlers) + return true; #endif + return false; +} + +#else /* CONFIG_SYSFS */ +#define net_class_groups NULL #endif /* CONFIG_SYSFS */ -#ifdef CONFIG_RPS -/* - * RX queue sysfs structures and functions. - */ -struct rx_queue_attribute { - struct attribute attr; - ssize_t (*show)(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, char *buf); - ssize_t (*store)(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, const char *buf, size_t len); -}; -#define to_rx_queue_attr(_attr) container_of(_attr, \ - struct rx_queue_attribute, attr) +#ifdef CONFIG_SYSFS +#define to_rx_queue_attr(_attr) \ + container_of(_attr, struct rx_queue_attribute, attr) #define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj) static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); struct netdev_rx_queue *queue = to_rx_queue(kobj); if (!attribute->show) return -EIO; - return attribute->show(queue, attribute, buf); + return attribute->show(queue, buf); } static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { - struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); struct netdev_rx_queue *queue = to_rx_queue(kobj); if (!attribute->store) return -EIO; - return attribute->store(queue, attribute, buf, count); + return attribute->store(queue, buf, count); } static const struct sysfs_ops rx_queue_sysfs_ops = { @@ -504,13 +955,12 @@ static const struct sysfs_ops rx_queue_sysfs_ops = { .store = rx_queue_attr_store, }; -static ssize_t show_rps_map(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attribute, char *buf) +#ifdef CONFIG_RPS +static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) { struct rps_map *map; cpumask_var_t mask; - size_t len = 0; - int i; + int i, len; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; @@ -521,77 +971,93 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, for (i = 0; i < map->len; i++) cpumask_set_cpu(map->cpus[i], mask); - len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); - if (PAGE_SIZE - len < 3) { - rcu_read_unlock(); - free_cpumask_var(mask); - return -EINVAL; - } + len = sysfs_emit(buf, "%*pb\n", cpumask_pr_args(mask)); rcu_read_unlock(); - free_cpumask_var(mask); - len += sprintf(buf + len, "\n"); - return len; + + return len < PAGE_SIZE ? len : -EINVAL; } -static ssize_t store_rps_map(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attribute, - const char *buf, size_t len) +static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue, + cpumask_var_t mask) { + static DEFINE_MUTEX(rps_map_mutex); struct rps_map *old_map, *map; - cpumask_var_t mask; - int err, cpu, i; - static DEFINE_SPINLOCK(rps_map_lock); - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); - if (err) { - free_cpumask_var(mask); - return err; - } + int cpu, i; map = kzalloc(max_t(unsigned int, - RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), - GFP_KERNEL); - if (!map) { - free_cpumask_var(mask); + RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), + GFP_KERNEL); + if (!map) return -ENOMEM; - } i = 0; for_each_cpu_and(cpu, mask, cpu_online_mask) map->cpus[i++] = cpu; - if (i) + if (i) { map->len = i; - else { + } else { kfree(map); map = NULL; } - spin_lock(&rps_map_lock); + mutex_lock(&rps_map_mutex); old_map = rcu_dereference_protected(queue->rps_map, - lockdep_is_held(&rps_map_lock)); + mutex_is_locked(&rps_map_mutex)); rcu_assign_pointer(queue->rps_map, map); - spin_unlock(&rps_map_lock); if (map) - static_key_slow_inc(&rps_needed); - if (old_map) { + static_branch_inc(&rps_needed); + if (old_map) + static_branch_dec(&rps_needed); + + mutex_unlock(&rps_map_mutex); + + if (old_map) kfree_rcu(old_map, rcu); - static_key_slow_dec(&rps_needed); + return 0; +} + +int rps_cpumask_housekeeping(struct cpumask *mask) +{ + if (!cpumask_empty(mask)) { + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); + if (cpumask_empty(mask)) + return -EINVAL; } + return 0; +} + +static ssize_t store_rps_map(struct netdev_rx_queue *queue, + const char *buf, size_t len) +{ + cpumask_var_t mask; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) + goto out; + + err = rps_cpumask_housekeeping(mask); + if (err) + goto out; + + err = netdev_rx_queue_set_rps_mask(queue, mask); + +out: free_cpumask_var(mask); - return len; + return err ? : len; } static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, char *buf) { struct rps_dev_flow_table *flow_table; @@ -600,10 +1066,10 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, rcu_read_lock(); flow_table = rcu_dereference(queue->rps_flow_table); if (flow_table) - val = (unsigned long)flow_table->mask + 1; + val = 1UL << flow_table->log; rcu_read_unlock(); - return sprintf(buf, "%lu\n", val); + return sysfs_emit(buf, "%lu\n", val); } static void rps_dev_flow_table_release(struct rcu_head *rcu) @@ -614,8 +1080,7 @@ static void rps_dev_flow_table_release(struct rcu_head *rcu) } static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, - const char *buf, size_t len) + const char *buf, size_t len) { unsigned long mask, count; struct rps_dev_flow_table *table, *old_table; @@ -637,8 +1102,8 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, while ((mask | (mask >> 1)) != mask) mask |= (mask >> 1); /* On 64 bit arches, must check mask fits in table->mask (u32), - * and on 32bit arches, must check RPS_DEV_FLOW_TABLE_SIZE(mask + 1) - * doesnt overflow. + * and on 32bit arches, must check + * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow. */ #if BITS_PER_LONG > 32 if (mask > (unsigned long)(u32)mask) @@ -654,11 +1119,14 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, if (!table) return -ENOMEM; - table->mask = mask; - for (count = 0; count <= mask; count++) + table->log = ilog2(mask) + 1; + for (count = 0; count <= mask; count++) { table->flows[count].cpu = RPS_NO_CPU; - } else + table->flows[count].filter = RPS_NO_FILTER; + } + } else { table = NULL; + } spin_lock(&rps_dev_flow_lock); old_table = rcu_dereference_protected(queue->rps_flow_table, @@ -672,27 +1140,30 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return len; } -static struct rx_queue_attribute rps_cpus_attribute = - __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); +static struct rx_queue_attribute rps_cpus_attribute __ro_after_init + = __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map); +static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init + = __ATTR(rps_flow_cnt, 0644, + show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); +#endif /* CONFIG_RPS */ -static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute = - __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, - show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); - -static struct attribute *rx_queue_default_attrs[] = { +static struct attribute *rx_queue_default_attrs[] __ro_after_init = { +#ifdef CONFIG_RPS &rps_cpus_attribute.attr, &rps_dev_flow_table_cnt_attribute.attr, +#endif NULL }; +ATTRIBUTE_GROUPS(rx_queue_default); static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); +#ifdef CONFIG_RPS struct rps_map *map; struct rps_dev_flow_table *flow_table; - map = rcu_dereference_protected(queue->rps_map, 1); if (map) { RCU_INIT_POINTER(queue->rps_map, NULL); @@ -704,55 +1175,191 @@ static void rx_queue_release(struct kobject *kobj) RCU_INIT_POINTER(queue->rps_flow_table, NULL); call_rcu(&flow_table->rcu, rps_dev_flow_table_release); } +#endif memset(kobj, 0, sizeof(*kobj)); - dev_put(queue->dev); + netdev_put(queue->dev, &queue->dev_tracker); } -static struct kobj_type rx_queue_ktype = { +static const void *rx_queue_namespace(const struct kobject *kobj) +{ + struct netdev_rx_queue *queue = to_rx_queue(kobj); + struct device *dev = &queue->dev->dev; + const void *ns = NULL; + + if (dev->class && dev->class->namespace) + ns = dev->class->namespace(dev); + + return ns; +} + +static void rx_queue_get_ownership(const struct kobject *kobj, + kuid_t *uid, kgid_t *gid) +{ + const struct net *net = rx_queue_namespace(kobj); + + net_ns_get_ownership(net, uid, gid); +} + +static const struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, - .default_attrs = rx_queue_default_attrs, + .namespace = rx_queue_namespace, + .get_ownership = rx_queue_get_ownership, }; -static int rx_queue_add_kobject(struct net_device *net, int index) +static int rx_queue_default_mask(struct net_device *dev, + struct netdev_rx_queue *queue) { - struct netdev_rx_queue *queue = net->_rx + index; +#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL) + struct cpumask *rps_default_mask; + int res = 0; + + mutex_lock(&rps_default_mask_mutex); + + rps_default_mask = dev_net(dev)->core.rps_default_mask; + if (rps_default_mask && !cpumask_empty(rps_default_mask)) + res = netdev_rx_queue_set_rps_mask(queue, rps_default_mask); + + mutex_unlock(&rps_default_mask_mutex); + + return res; +#else + return 0; +#endif +} + +static int rx_queue_add_kobject(struct net_device *dev, int index) +{ + struct netdev_rx_queue *queue = dev->_rx + index; struct kobject *kobj = &queue->kobj; int error = 0; - kobj->kset = net->queues_kset; + /* Rx queues are cleared in rx_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. + * + * If a queue is removed while both a read (or write) operation and a + * the re-addition of the same queue are pending (waiting on rntl_lock) + * it might happen that the re-addition will execute before the read, + * making the initial removal to never happen (queue's kobj refcount + * won't drop enough because of the pending read). In such rare case, + * return to allow the removal operation to complete. + */ + if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed"); + return -EAGAIN; + } + + /* Kobject_put later will trigger rx_queue_release call which + * decreases dev refcount: Take that reference here + */ + netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL); + + kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, - "rx-%u", index); - if (error) { - kobject_put(kobj); - return error; + "rx-%u", index); + if (error) + goto err; + + queue->groups = rx_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + + if (dev->sysfs_rx_queue_group) { + error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); + if (error) + goto err_default_groups; } + error = rx_queue_default_mask(dev, queue); + if (error) + goto err_default_groups; + kobject_uevent(kobj, KOBJ_ADD); - dev_hold(queue->dev); return error; + +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); +err: + kobject_put(kobj); + return error; } -#endif /* CONFIG_RPS */ + +static int rx_queue_change_owner(struct net_device *dev, int index, kuid_t kuid, + kgid_t kgid) +{ + struct netdev_rx_queue *queue = dev->_rx + index; + struct kobject *kobj = &queue->kobj; + int error; + + error = sysfs_change_owner(kobj, kuid, kgid); + if (error) + return error; + + if (dev->sysfs_rx_queue_group) + error = sysfs_group_change_owner( + kobj, dev->sysfs_rx_queue_group, kuid, kgid); + + return error; +} +#endif /* CONFIG_SYSFS */ int -net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS int i; int error = 0; +#ifndef CONFIG_RPS + if (!dev->sysfs_rx_queue_group) + return 0; +#endif for (i = old_num; i < new_num; i++) { - error = rx_queue_add_kobject(net, i); + error = rx_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; } } - while (--i >= new_num) - kobject_put(&net->_rx[i].kobj); + while (--i >= new_num) { + struct netdev_rx_queue *queue = &dev->_rx[i]; + struct kobject *kobj = &queue->kobj; + + if (!check_net(dev_net(dev))) + kobj->uevent_suppress = 1; + if (dev->sysfs_rx_queue_group) + sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); + sysfs_remove_groups(kobj, queue->groups); + kobject_put(kobj); + } + + return error; +#else + return 0; +#endif +} + +static int net_rx_queue_change_owner(struct net_device *dev, int num, + kuid_t kuid, kgid_t kgid) +{ +#ifdef CONFIG_SYSFS + int error = 0; + int i; + +#ifndef CONFIG_RPS + if (!dev->sysfs_rx_queue_group) + return 0; +#endif + for (i = 0; i < num; i++) { + error = rx_queue_change_owner(dev, i, kuid, kgid); + if (error) + break; + } return error; #else @@ -766,39 +1373,42 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) */ struct netdev_queue_attribute { struct attribute attr; - ssize_t (*show)(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, char *buf); - ssize_t (*store)(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, const char *buf, size_t len); + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len); }; -#define to_netdev_queue_attr(_attr) container_of(_attr, \ - struct netdev_queue_attribute, attr) +#define to_netdev_queue_attr(_attr) \ + container_of(_attr, struct netdev_queue_attribute, attr) #define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj) static ssize_t netdev_queue_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + const struct netdev_queue_attribute *attribute + = to_netdev_queue_attr(attr); struct netdev_queue *queue = to_netdev_queue(kobj); if (!attribute->show) return -EIO; - return attribute->show(queue, attribute, buf); + return attribute->show(kobj, attr, queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { - struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + const struct netdev_queue_attribute *attribute + = to_netdev_queue_attr(attr); struct netdev_queue *queue = to_netdev_queue(kobj); if (!attribute->store) return -EIO; - return attribute->store(queue, attribute, buf, count); + return attribute->store(kobj, attr, queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { @@ -806,21 +1416,119 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { .store = netdev_queue_attr_store, }; -static ssize_t show_trans_timeout(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, - char *buf) +static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) +{ + unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); + + return sysfs_emit(buf, fmt_ulong, trans_timeout); +} + +static unsigned int get_netdev_queue_index(struct netdev_queue *queue) { - unsigned long trans_timeout; + struct net_device *dev = queue->dev; + unsigned int i; + + i = queue - dev->_tx; + BUG_ON(i >= dev->num_tx_queues); + + return i; +} + +static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + int num_tc, tc, index, ret; + + if (!netif_is_multiqueue(dev)) + return -ENOENT; + + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; + + index = get_netdev_queue_index(queue); + + /* If queue belongs to subordinate dev use its TC mapping */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + + rtnl_unlock(); + + if (tc < 0) + return -EINVAL; + + /* We can report the traffic class one of two ways: + * Subordinate device traffic classes are reported with the traffic + * class first, and then the subordinate class so for example TC0 on + * subordinate device 2 will be reported as "0-2". If the queue + * belongs to the root device it will be reported with just the + * traffic class, so just "0" for TC 0 for example. + */ + return num_tc < 0 ? sysfs_emit(buf, "%d%d\n", tc, num_tc) : + sysfs_emit(buf, "%d\n", tc); +} + +#ifdef CONFIG_XPS +static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) +{ + return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); +} + +static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) +{ + int err, index = get_netdev_queue_index(queue); + struct net_device *dev = queue->dev; + u32 rate = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; - spin_lock_irq(&queue->_xmit_lock); - trans_timeout = queue->trans_timeout; - spin_unlock_irq(&queue->_xmit_lock); + /* The check is also done later; this helps returning early without + * hitting the locking section below. + */ + if (!dev->netdev_ops->ndo_set_tx_maxrate) + return -EOPNOTSUPP; - return sprintf(buf, "%lu", trans_timeout); + err = kstrtou32(buf, 10, &rate); + if (err < 0) + return err; + + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) + return err; + + err = -EOPNOTSUPP; + netdev_lock_ops(dev); + if (dev->netdev_ops->ndo_set_tx_maxrate) + err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); + netdev_unlock_ops(dev); + + if (!err) { + queue->tx_maxrate = rate; + rtnl_unlock(); + return len; + } + + rtnl_unlock(); + return err; } -static struct netdev_queue_attribute queue_trans_timeout = - __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); +static struct netdev_queue_attribute queue_tx_maxrate __ro_after_init + = __ATTR_RW(tx_maxrate); +#endif + +static struct netdev_queue_attribute queue_trans_timeout __ro_after_init + = __ATTR_RO(tx_timeout); + +static struct netdev_queue_attribute queue_traffic_class __ro_after_init + = __ATTR_RO(traffic_class); #ifdef CONFIG_BQL /* @@ -828,7 +1536,7 @@ static struct netdev_queue_attribute queue_trans_timeout = */ static ssize_t bql_show(char *buf, unsigned int value) { - return sprintf(buf, "%u\n", value); + return sysfs_emit(buf, "%u\n", value); } static ssize_t bql_set(const char *buf, const size_t count, @@ -837,9 +1545,9 @@ static ssize_t bql_set(const char *buf, const size_t count, unsigned int value; int err; - if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) + if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) { value = DQL_MAX_LIMIT; - else { + } else { err = kstrtouint(buf, 10, &value); if (err < 0) return err; @@ -852,18 +1560,17 @@ static ssize_t bql_set(const char *buf, const size_t count, return count; } -static ssize_t bql_show_hold_time(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, - char *buf) +static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; - return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } -static ssize_t bql_set_hold_time(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, - const char *buf, size_t len) +static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct dql *dql = &queue->dql; unsigned int value; @@ -878,130 +1585,230 @@ static ssize_t bql_set_hold_time(struct netdev_queue *queue, return len; } -static struct netdev_queue_attribute bql_hold_time_attribute = - __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time, - bql_set_hold_time); +static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init + = __ATTR(hold_time, 0644, + bql_show_hold_time, bql_set_hold_time); + +static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) +{ + struct dql *dql = &queue->dql; + + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); +} + +static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) +{ + struct dql *dql = &queue->dql; + unsigned int value; + int err; + + err = kstrtouint(buf, 10, &value); + if (err < 0) + return err; + + value = msecs_to_jiffies(value); + if (value && (value < 4 || value > 4 / 2 * BITS_PER_LONG)) + return -ERANGE; + + if (!dql->stall_thrs && value) + dql->last_reap = jiffies; + /* Force last_reap to be live */ + smp_wmb(); + dql->stall_thrs = value; + + return len; +} + +static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = + __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs); + +static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) +{ + return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); +} -static ssize_t bql_show_inflight(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, - char *buf) +static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) +{ + WRITE_ONCE(queue->dql.stall_max, 0); + return len; +} + +static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init = + __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max); + +static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; - return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed); + return sysfs_emit(buf, "%lu\n", dql->stall_cnt); } -static struct netdev_queue_attribute bql_inflight_attribute = - __ATTR(inflight, S_IRUGO, bql_show_inflight, NULL); +static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = + __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL); + +static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) +{ + struct dql *dql = &queue->dql; + + return sysfs_emit(buf, "%u\n", dql->num_queued - dql->num_completed); +} + +static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = + __ATTR(inflight, 0444, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ -static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ - struct netdev_queue_attribute *attr, \ - char *buf) \ +static ssize_t bql_show_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, char *buf) \ { \ return bql_show(buf, queue->dql.FIELD); \ } \ \ -static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ - struct netdev_queue_attribute *attr, \ +static ssize_t bql_set_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ } \ \ -static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \ - __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \ - bql_set_ ## NAME); +static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \ + = __ATTR(NAME, 0644, \ + bql_show_ ## NAME, bql_set_ ## NAME) -BQL_ATTR(limit, limit) -BQL_ATTR(limit_max, max_limit) -BQL_ATTR(limit_min, min_limit) +BQL_ATTR(limit, limit); +BQL_ATTR(limit_max, max_limit); +BQL_ATTR(limit_min, min_limit); -static struct attribute *dql_attrs[] = { +static struct attribute *dql_attrs[] __ro_after_init = { &bql_limit_attribute.attr, &bql_limit_max_attribute.attr, &bql_limit_min_attribute.attr, &bql_hold_time_attribute.attr, &bql_inflight_attribute.attr, + &bql_stall_thrs_attribute.attr, + &bql_stall_cnt_attribute.attr, + &bql_stall_max_attribute.attr, NULL }; -static struct attribute_group dql_group = { +static const struct attribute_group dql_group = { .name = "byte_queue_limits", .attrs = dql_attrs, }; +#else +/* Fake declaration, all the code using it should be dead */ +static const struct attribute_group dql_group = {}; #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS -static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +static ssize_t xps_queue_show(struct net_device *dev, unsigned int index, + int tc, char *buf, enum xps_map_type type) { - struct net_device *dev = queue->dev; - int i; - - for (i = 0; i < dev->num_tx_queues; i++) - if (queue == &dev->_tx[i]) - break; + struct xps_dev_maps *dev_maps; + unsigned long *mask; + unsigned int nr_ids; + int j, len; - BUG_ON(i >= dev->num_tx_queues); + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps[type]); - return i; -} + /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0 + * when dev_maps hasn't been allocated yet, to be backward compatible. + */ + nr_ids = dev_maps ? dev_maps->nr_ids : + (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues); + mask = bitmap_zalloc(nr_ids, GFP_NOWAIT); + if (!mask) { + rcu_read_unlock(); + return -ENOMEM; + } -static ssize_t show_xps_map(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, char *buf) -{ - struct net_device *dev = queue->dev; - struct xps_dev_maps *dev_maps; - cpumask_var_t mask; - unsigned long index; - size_t len = 0; - int i; + if (!dev_maps || tc >= dev_maps->num_tc) + goto out_no_maps; - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; + for (j = 0; j < nr_ids; j++) { + int i, tci = j * dev_maps->num_tc + tc; + struct xps_map *map; - index = get_netdev_queue_index(queue); + map = rcu_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); - if (dev_maps) { - for_each_possible_cpu(i) { - struct xps_map *map = - rcu_dereference(dev_maps->cpu_map[i]); - if (map) { - int j; - for (j = 0; j < map->len; j++) { - if (map->queues[j] == index) { - cpumask_set_cpu(i, mask); - break; - } - } + for (i = map->len; i--;) { + if (map->queues[i] == index) { + __set_bit(j, mask); + break; } } } +out_no_maps: rcu_read_unlock(); - len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); - if (PAGE_SIZE - len < 3) { - free_cpumask_var(mask); + len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); + bitmap_free(mask); + + return len < PAGE_SIZE ? len : -EINVAL; +} + +static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + unsigned int index; + int len, tc, ret; + + if (!netif_is_multiqueue(dev)) + return -ENOENT; + + index = get_netdev_queue_index(queue); + + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) { + rtnl_unlock(); return -EINVAL; } - free_cpumask_var(mask); - len += sprintf(buf + len, "\n"); + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. + */ + dev_hold(dev); + rtnl_unlock(); + + len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); + + dev_put(dev); return len; } -static ssize_t store_xps_map(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, - const char *buf, size_t len) +static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct net_device *dev = queue->dev; - unsigned long index; + unsigned int index; cpumask_var_t mask; int err; + if (!netif_is_multiqueue(dev)) + return -ENOENT; + if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1016,76 +1823,237 @@ static ssize_t store_xps_map(struct netdev_queue *queue, return err; } + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { + free_cpumask_var(mask); + return err; + } + err = netif_set_xps_queue(dev, mask, index); + rtnl_unlock(); free_cpumask_var(mask); return err ? : len; } -static struct netdev_queue_attribute xps_cpus_attribute = - __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); +static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init + = __ATTR_RW(xps_cpus); + +static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + unsigned int index; + int tc, ret; + + index = get_netdev_queue_index(queue); + + ret = sysfs_rtnl_lock(kobj, attr, dev); + if (ret) + return ret; + + tc = netdev_txq_to_tc(dev, index); + + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. + */ + dev_hold(dev); + rtnl_unlock(); + + ret = tc >= 0 ? xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL; + dev_put(dev); + return ret; +} + +static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) +{ + struct net_device *dev = queue->dev; + struct net *net = dev_net(dev); + unsigned long *mask; + unsigned int index; + int err; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); + if (!mask) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, mask, dev->num_rx_queues); + if (err) { + bitmap_free(mask); + return err; + } + + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { + bitmap_free(mask); + return err; + } + + cpus_read_lock(); + err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS); + cpus_read_unlock(); + + rtnl_unlock(); + + bitmap_free(mask); + return err ? : len; +} + +static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init + = __ATTR_RW(xps_rxqs); #endif /* CONFIG_XPS */ -static struct attribute *netdev_queue_default_attrs[] = { +static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { &queue_trans_timeout.attr, + &queue_traffic_class.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, + &xps_rxqs_attribute.attr, + &queue_tx_maxrate.attr, #endif NULL }; +ATTRIBUTE_GROUPS(netdev_queue_default); static void netdev_queue_release(struct kobject *kobj) { struct netdev_queue *queue = to_netdev_queue(kobj); memset(kobj, 0, sizeof(*kobj)); - dev_put(queue->dev); + netdev_put(queue->dev, &queue->dev_tracker); +} + +static const void *netdev_queue_namespace(const struct kobject *kobj) +{ + struct netdev_queue *queue = to_netdev_queue(kobj); + struct device *dev = &queue->dev->dev; + const void *ns = NULL; + + if (dev->class && dev->class->namespace) + ns = dev->class->namespace(dev); + + return ns; } -static struct kobj_type netdev_queue_ktype = { +static void netdev_queue_get_ownership(const struct kobject *kobj, + kuid_t *uid, kgid_t *gid) +{ + const struct net *net = netdev_queue_namespace(kobj); + + net_ns_get_ownership(net, uid, gid); +} + +static const struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, - .default_attrs = netdev_queue_default_attrs, + .namespace = netdev_queue_namespace, + .get_ownership = netdev_queue_get_ownership, }; -static int netdev_queue_add_kobject(struct net_device *net, int index) +static bool netdev_uses_bql(const struct net_device *dev) { - struct netdev_queue *queue = net->_tx + index; + if (dev->lltx || (dev->priv_flags & IFF_NO_QUEUE)) + return false; + + return IS_ENABLED(CONFIG_BQL); +} + +static int netdev_queue_add_kobject(struct net_device *dev, int index) +{ + struct netdev_queue *queue = dev->_tx + index; struct kobject *kobj = &queue->kobj; int error = 0; - kobj->kset = net->queues_kset; + /* Tx queues are cleared in netdev_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. + * + * If a queue is removed while both a read (or write) operation and a + * the re-addition of the same queue are pending (waiting on rntl_lock) + * it might happen that the re-addition will execute before the read, + * making the initial removal to never happen (queue's kobj refcount + * won't drop enough because of the pending read). In such rare case, + * return to allow the removal operation to complete. + */ + if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed"); + return -EAGAIN; + } + + /* Kobject_put later will trigger netdev_queue_release call + * which decreases dev refcount: Take that reference here + */ + netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL); + + kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, - "tx-%u", index); + "tx-%u", index); if (error) - goto exit; + goto err; -#ifdef CONFIG_BQL - error = sysfs_create_group(kobj, &dql_group); + queue->groups = netdev_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); if (error) - goto exit; -#endif + goto err; - kobject_uevent(kobj, KOBJ_ADD); - dev_hold(queue->dev); + if (netdev_uses_bql(dev)) { + error = sysfs_create_group(kobj, &dql_group); + if (error) + goto err_default_groups; + } + kobject_uevent(kobj, KOBJ_ADD); return 0; -exit: + +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); +err: kobject_put(kobj); return error; } + +static int tx_queue_change_owner(struct net_device *ndev, int index, + kuid_t kuid, kgid_t kgid) +{ + struct netdev_queue *queue = ndev->_tx + index; + struct kobject *kobj = &queue->kobj; + int error; + + error = sysfs_change_owner(kobj, kuid, kgid); + if (error) + return error; + + if (netdev_uses_bql(ndev)) + error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid); + + return error; +} #endif /* CONFIG_SYSFS */ int -netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; + /* Tx queue kobjects are allowed to be updated when a device is being + * unregistered, but solely to remove queues from qdiscs. Any path + * adding queues should be fixed. + */ + WARN(dev->reg_state == NETREG_UNREGISTERING && new_num > old_num, + "New queues can't be registered after device unregistration."); + for (i = old_num; i < new_num; i++) { - error = netdev_queue_add_kobject(net, i); + error = netdev_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; @@ -1093,11 +2061,15 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } while (--i >= new_num) { - struct netdev_queue *queue = net->_tx + i; + struct netdev_queue *queue = dev->_tx + i; -#ifdef CONFIG_BQL - sysfs_remove_group(&queue->kobj, &dql_group); -#endif + if (!check_net(dev_net(dev))) + queue->kobj.uevent_suppress = 1; + + if (netdev_uses_bql(dev)) + sysfs_remove_group(&queue->kobj, &dql_group); + + sysfs_remove_groups(&queue->kobj, queue->groups); kobject_put(&queue->kobj); } @@ -1107,28 +2079,44 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) #endif /* CONFIG_SYSFS */ } -static int register_queue_kobjects(struct net_device *net) +static int net_tx_queue_change_owner(struct net_device *dev, int num, + kuid_t kuid, kgid_t kgid) +{ +#ifdef CONFIG_SYSFS + int error = 0; + int i; + + for (i = 0; i < num; i++) { + error = tx_queue_change_owner(dev, i, kuid, kgid); + if (error) + break; + } + + return error; +#else + return 0; +#endif /* CONFIG_SYSFS */ +} + +static int register_queue_kobjects(struct net_device *dev) { int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS - net->queues_kset = kset_create_and_add("queues", - NULL, &net->dev.kobj); - if (!net->queues_kset) + dev->queues_kset = kset_create_and_add("queues", + NULL, &dev->dev.kobj); + if (!dev->queues_kset) return -ENOMEM; + real_rx = dev->real_num_rx_queues; #endif + real_tx = dev->real_num_tx_queues; -#ifdef CONFIG_RPS - real_rx = net->real_num_rx_queues; -#endif - real_tx = net->real_num_tx_queues; - - error = net_rx_queue_update_kobjects(net, 0, real_rx); + error = net_rx_queue_update_kobjects(dev, 0, real_rx); if (error) goto error; rxq = real_rx; - error = netdev_queue_update_kobjects(net, 0, real_tx); + error = netdev_queue_update_kobjects(dev, 0, real_tx); if (error) goto error; txq = real_tx; @@ -1136,33 +2124,73 @@ static int register_queue_kobjects(struct net_device *net) return 0; error: - netdev_queue_update_kobjects(net, txq, 0); - net_rx_queue_update_kobjects(net, rxq, 0); + netdev_queue_update_kobjects(dev, txq, 0); + net_rx_queue_update_kobjects(dev, rxq, 0); +#ifdef CONFIG_SYSFS + kset_unregister(dev->queues_kset); +#endif return error; } -static void remove_queue_kobjects(struct net_device *net) +static int queue_change_owner(struct net_device *ndev, kuid_t kuid, kgid_t kgid) +{ + int error = 0, real_rx = 0, real_tx = 0; + +#ifdef CONFIG_SYSFS + if (ndev->queues_kset) { + error = sysfs_change_owner(&ndev->queues_kset->kobj, kuid, kgid); + if (error) + return error; + } + real_rx = ndev->real_num_rx_queues; +#endif + real_tx = ndev->real_num_tx_queues; + + error = net_rx_queue_change_owner(ndev, real_rx, kuid, kgid); + if (error) + return error; + + error = net_tx_queue_change_owner(ndev, real_tx, kuid, kgid); + if (error) + return error; + + return 0; +} + +static void remove_queue_kobjects(struct net_device *dev) { int real_rx = 0, real_tx = 0; -#ifdef CONFIG_RPS - real_rx = net->real_num_rx_queues; +#ifdef CONFIG_SYSFS + real_rx = dev->real_num_rx_queues; #endif - real_tx = net->real_num_tx_queues; + real_tx = dev->real_num_tx_queues; + + net_rx_queue_update_kobjects(dev, real_rx, 0); + netdev_queue_update_kobjects(dev, real_tx, 0); - net_rx_queue_update_kobjects(net, real_rx, 0); - netdev_queue_update_kobjects(net, real_tx, 0); + netdev_lock_ops(dev); + dev->real_num_rx_queues = 0; + dev->real_num_tx_queues = 0; + netdev_unlock_ops(dev); #ifdef CONFIG_SYSFS - kset_unregister(net->queues_kset); + kset_unregister(dev->queues_kset); #endif } +static bool net_current_may_mount(void) +{ + struct net *net = current->nsproxy->net_ns; + + return ns_capable(net->user_ns, CAP_SYS_ADMIN); +} + static void *net_grab_current_ns(void) { struct net *ns = current->nsproxy->net_ns; #ifdef CONFIG_NET_NS if (ns) - atomic_inc(&ns->passive); + refcount_inc(&ns->passive); #endif return ns; } @@ -1177,8 +2205,9 @@ static const void *net_netlink_ns(struct sock *sk) return sock_net(sk); } -struct kobj_ns_type_operations net_ns_type_operations = { +const struct kobj_ns_type_operations net_ns_type_operations = { .type = KOBJ_NS_TYPE_NET, + .current_may_mount = net_current_may_mount, .grab_current_ns = net_grab_current_ns, .netlink_ns = net_netlink_ns, .initial_ns = net_initial_ns, @@ -1186,9 +2215,9 @@ struct kobj_ns_type_operations net_ns_type_operations = { }; EXPORT_SYMBOL_GPL(net_ns_type_operations); -static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) +static int netdev_uevent(const struct device *d, struct kobj_uevent_env *env) { - struct net_device *dev = to_net_dev(d); + const struct net_device *dev = to_net_dev(d); int retval; /* pass interface to uevent. */ @@ -1198,7 +2227,8 @@ static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) /* pass ifindex to uevent. * ifindex is useful as it won't change (interface name may change) - * and is what RtNetlink uses natively. */ + * and is what RtNetlink uses natively. + */ retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex); exit: @@ -1215,38 +2245,84 @@ static void netdev_release(struct device *d) BUG_ON(dev->reg_state != NETREG_RELEASED); - kfree(dev->ifalias); - kfree((char *)dev - dev->padded); + /* no need to wait for rcu grace period: + * device is dead and about to be freed. + */ + kfree(rcu_access_pointer(dev->ifalias)); + kvfree(dev); } -static const void *net_namespace(struct device *d) +static const void *net_namespace(const struct device *d) { - struct net_device *dev; - dev = container_of(d, struct net_device, dev); + const struct net_device *dev = to_net_dev(d); + return dev_net(dev); } -static struct class net_class = { +static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid) +{ + const struct net_device *dev = to_net_dev(d); + const struct net *net = dev_net(dev); + + net_ns_get_ownership(net, uid, gid); +} + +static const struct class net_class = { .name = "net", .dev_release = netdev_release, -#ifdef CONFIG_SYSFS - .dev_attrs = net_class_attributes, -#endif /* CONFIG_SYSFS */ + .dev_groups = net_class_groups, .dev_uevent = netdev_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, + .get_ownership = net_get_ownership, }; +#ifdef CONFIG_OF +static int of_dev_node_match(struct device *dev, const void *data) +{ + for (; dev; dev = dev->parent) { + if (dev->of_node == data) + return 1; + } + + return 0; +} + +/* + * of_find_net_device_by_node - lookup the net device for the device node + * @np: OF device node + * + * Looks up the net_device structure corresponding with the device node. + * If successful, returns a pointer to the net_device with the embedded + * struct device refcount incremented by one, or NULL on failure. The + * refcount must be dropped when done with the net_device. + */ +struct net_device *of_find_net_device_by_node(struct device_node *np) +{ + struct device *dev; + + dev = class_find_device(&net_class, NULL, np, of_dev_node_match); + if (!dev) + return NULL; + + return to_net_dev(dev); +} +EXPORT_SYMBOL(of_find_net_device_by_node); +#endif + /* Delete sysfs entries but hold kobject reference until after all * netdev references are gone. */ -void netdev_unregister_kobject(struct net_device * net) +void netdev_unregister_kobject(struct net_device *ndev) { - struct device *dev = &(net->dev); + struct device *dev = &ndev->dev; + + if (!check_net(dev_net(ndev))) + dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); - remove_queue_kobjects(net); + remove_queue_kobjects(ndev); pm_runtime_set_memalloc_noio(dev, false); @@ -1254,18 +2330,18 @@ void netdev_unregister_kobject(struct net_device * net) } /* Create sysfs entries for network device. */ -int netdev_register_kobject(struct net_device *net) +int netdev_register_kobject(struct net_device *ndev) { - struct device *dev = &(net->dev); - const struct attribute_group **groups = net->sysfs_groups; + struct device *dev = &ndev->dev; + const struct attribute_group **groups = ndev->sysfs_groups; int error = 0; device_initialize(dev); dev->class = &net_class; - dev->platform_data = net; + dev->platform_data = ndev; dev->groups = groups; - dev_set_name(dev, "%s", net->name); + dev_set_name(dev, "%s", ndev->name); #ifdef CONFIG_SYSFS /* Allow for a device specific group */ @@ -1273,22 +2349,17 @@ int netdev_register_kobject(struct net_device *net) groups++; *groups++ = &netstat_group; + *groups++ = &netdev_phys_group; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) - if (net->ieee80211_ptr) + if (wireless_group_needed(ndev)) *groups++ = &wireless_group; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) - else if (net->wireless_handlers) - *groups++ = &wireless_group; -#endif -#endif #endif /* CONFIG_SYSFS */ error = device_add(dev); if (error) return error; - error = register_queue_kobjects(net); + error = register_queue_kobjects(ndev); if (error) { device_del(dev); return error; @@ -1299,19 +2370,52 @@ int netdev_register_kobject(struct net_device *net) return error; } -int netdev_class_create_file(struct class_attribute *class_attr) +/* Change owner for sysfs entries when moving network devices across network + * namespaces owned by different user namespaces. + */ +int netdev_change_owner(struct net_device *ndev, const struct net *net_old, + const struct net *net_new) +{ + kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID; + kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID; + struct device *dev = &ndev->dev; + int error; + + net_ns_get_ownership(net_old, &old_uid, &old_gid); + net_ns_get_ownership(net_new, &new_uid, &new_gid); + + /* The network namespace was changed but the owning user namespace is + * identical so there's no need to change the owner of sysfs entries. + */ + if (uid_eq(old_uid, new_uid) && gid_eq(old_gid, new_gid)) + return 0; + + error = device_change_owner(dev, new_uid, new_gid); + if (error) + return error; + + error = queue_change_owner(ndev, new_uid, new_gid); + if (error) + return error; + + return 0; +} + +int netdev_class_create_file_ns(const struct class_attribute *class_attr, + const void *ns) { - return class_create_file(&net_class, class_attr); + return class_create_file_ns(&net_class, class_attr, ns); } -EXPORT_SYMBOL(netdev_class_create_file); +EXPORT_SYMBOL(netdev_class_create_file_ns); -void netdev_class_remove_file(struct class_attribute *class_attr) +void netdev_class_remove_file_ns(const struct class_attribute *class_attr, + const void *ns) { - class_remove_file(&net_class, class_attr); + class_remove_file_ns(&net_class, class_attr, ns); } -EXPORT_SYMBOL(netdev_class_remove_file); +EXPORT_SYMBOL(netdev_class_remove_file_ns); -int netdev_kobject_init(void) +int __init netdev_kobject_init(void) { kobj_ns_type_register(&net_ns_type_operations); return class_register(&net_class); diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index bd7751ec1c4d..e938f25e8e86 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -1,11 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_SYSFS_H__ #define __NET_SYSFS_H__ -int netdev_kobject_init(void); +int __init netdev_kobject_init(void); int netdev_register_kobject(struct net_device *); void netdev_unregister_kobject(struct net_device *); int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); int netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num); +int netdev_change_owner(struct net_device *, const struct net *net_old, + const struct net *net_new); + +extern struct mutex rps_default_mask_mutex; #endif diff --git a/net/core/net-traces.c b/net/core/net-traces.c index ba3c0120786c..f2fa34b1d78d 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * consolidates trace point definitions * @@ -22,7 +23,7 @@ #include <linux/net_dropmon.h> #include <linux/slab.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <asm/bitops.h> #define CREATE_TRACE_POINTS @@ -31,7 +32,37 @@ #include <trace/events/napi.h> #include <trace/events/sock.h> #include <trace/events/udp.h> +#include <trace/events/tcp.h> +#include <trace/events/fib.h> +#include <trace/events/qdisc.h> +#if IS_ENABLED(CONFIG_BRIDGE) +#include <trace/events/bridge.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); +EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add); +EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); +EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(br_mdb_full); +#endif + +#if IS_ENABLED(CONFIG_PAGE_POOL) +#include <trace/events/page_pool.h> +#endif + +#include <trace/events/neigh.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_timer_handler); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_dead); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_cleanup_and_release); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); + +EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); +EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum); + +EXPORT_TRACEPOINT_SYMBOL_GPL(udp_fail_queue_rcv_skb); + +EXPORT_TRACEPOINT_SYMBOL_GPL(sk_data_ready); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index f97652036754..a6e6a964a287 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/workqueue.h> @@ -15,6 +16,15 @@ #include <linux/file.h> #include <linux/export.h> #include <linux/user_namespace.h> +#include <linux/net_namespace.h> +#include <linux/sched/task.h> +#include <linux/uidgid.h> +#include <linux/proc_fs.h> +#include <linux/nstree.h> + +#include <net/aligned_data.h> +#include <net/sock.h> +#include <net/netlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -24,47 +34,67 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; -static DEFINE_MUTEX(net_mutex); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); -struct net init_net = { - .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), -}; +/* Protects net_namespace_list. Nests iside rtnl_lock() */ +DECLARE_RWSEM(net_rwsem); +EXPORT_SYMBOL_GPL(net_rwsem); + +#ifdef CONFIG_KEYS +static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) }; +#endif + +struct net init_net; EXPORT_SYMBOL(init_net); +static bool init_net_initialized; +/* + * pernet_ops_rwsem: protects: pernet_list, net_generic_ids, + * init_net_initialized and first_device pointer. + * This is internal net namespace object. Please, don't use it + * outside. + */ +DECLARE_RWSEM(pernet_ops_rwsem); + +#define MIN_PERNET_OPS_ID \ + ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) + #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; static struct net_generic *net_alloc_generic(void) { + unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs); + unsigned int generic_size; struct net_generic *ng; - size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + + generic_size = offsetof(struct net_generic, ptr[gen_ptrs]); ng = kzalloc(generic_size, GFP_KERNEL); if (ng) - ng->len = max_gen_ptrs; + ng->s.len = gen_ptrs; return ng; } -static int net_assign_generic(struct net *net, int id, void *data) +static int net_assign_generic(struct net *net, unsigned int id, void *data) { struct net_generic *ng, *old_ng; - BUG_ON(!mutex_is_locked(&net_mutex)); - BUG_ON(id == 0); + BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, - lockdep_is_held(&net_mutex)); - ng = old_ng; - if (old_ng->len >= id) - goto assign; + lockdep_is_held(&pernet_ops_rwsem)); + if (old_ng->s.len > id) { + old_ng->ptr[id] = data; + return 0; + } ng = net_alloc_generic(); - if (ng == NULL) + if (!ng) return -ENOMEM; /* @@ -78,21 +108,22 @@ static int net_assign_generic(struct net *net, int id, void *data) * the old copy for kfree after a grace period. */ - memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); + memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID], + (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *)); + ng->ptr[id] = data; rcu_assign_pointer(net->gen, ng); - kfree_rcu(old_ng, rcu); -assign: - ng->ptr[id - 1] = data; + kfree_rcu(old_ng, s.rcu); return 0; } static int ops_init(const struct pernet_operations *ops, struct net *net) { + struct net_generic *ng; int err = -ENOMEM; void *data = NULL; - if (ops->id && ops->size) { + if (ops->id) { data = kzalloc(ops->size, GFP_KERNEL); if (!data) goto out; @@ -107,6 +138,12 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) if (!err) return 0; + if (ops->id) { + ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&pernet_ops_rwsem)); + ng->ptr[*ops->id] = NULL; + } + cleanup: kfree(data); @@ -114,22 +151,56 @@ out: return err; } -static void ops_free(const struct pernet_operations *ops, struct net *net) +static void ops_pre_exit_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) { - if (ops->id && ops->size) { - int id = *ops->id; - kfree(net_generic(net, id)); + struct net *net; + + if (ops->pre_exit) { + list_for_each_entry(net, net_exit_list, exit_list) + ops->pre_exit(net); } } +static void ops_exit_rtnl_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + const struct pernet_operations *saved_ops = ops; + LIST_HEAD(dev_kill_list); + struct net *net; + + rtnl_lock(); + + list_for_each_entry(net, net_exit_list, exit_list) { + __rtnl_net_lock(net); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) { + if (ops->exit_rtnl) + ops->exit_rtnl(net, &dev_kill_list); + } + + __rtnl_net_unlock(net); + } + + unregister_netdevice_many(&dev_kill_list); + + rtnl_unlock(); +} + static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { - struct net *net; if (ops->exit) { - list_for_each_entry(net, net_exit_list, exit_list) + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); + cond_resched(); + } } + if (ops->exit_batch) ops->exit_batch(net_exit_list); } @@ -138,36 +209,248 @@ static void ops_free_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; - if (ops->size && ops->id) { + + if (ops->id) { list_for_each_entry(net, net_exit_list, exit_list) - ops_free(ops, net); + kfree(net_generic(net, *ops->id)); } } -/* - * setup_net runs the initializers for the network namespace object. +static void ops_undo_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list, + bool expedite_rcu) +{ + const struct pernet_operations *saved_ops; + bool hold_rtnl = false; + + if (!ops) + ops = list_entry(ops_list, typeof(*ops), list); + + saved_ops = ops; + + list_for_each_entry_continue_reverse(ops, ops_list, list) { + hold_rtnl |= !!ops->exit_rtnl; + ops_pre_exit_list(ops, net_exit_list); + } + + /* Another CPU might be rcu-iterating the list, wait for it. + * This needs to be before calling the exit() notifiers, so the + * rcu_barrier() after ops_undo_list() isn't sufficient alone. + * Also the pre_exit() and exit() methods need this barrier. + */ + if (expedite_rcu) + synchronize_rcu_expedited(); + else + synchronize_rcu(); + + if (hold_rtnl) + ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_exit_list(ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_free_list(ops, net_exit_list); +} + +static void ops_undo_single(struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + LIST_HEAD(ops_list); + + list_add(&ops->list, &ops_list); + ops_undo_list(&ops_list, NULL, net_exit_list, false); + list_del(&ops->list); +} + +/* should be called with nsid_lock held */ +static int alloc_netid(struct net *net, struct net *peer, int reqid) +{ + int min = 0, max = 0; + + if (reqid >= 0) { + min = reqid; + max = reqid + 1; + } + + return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC); +} + +/* This function is used by idr_for_each(). If net is equal to peer, the + * function returns the id so that idr_for_each() stops. Because we cannot + * returns the id 0 (idr_for_each() will not stop), we return the magic value + * NET_ID_ZERO (-1) for it. */ -static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) +#define NET_ID_ZERO -1 +static int net_eq_idr(int id, void *net, void *peer) { - /* Must be called with net_mutex held */ - const struct pernet_operations *ops, *saved_ops; - int error = 0; - LIST_HEAD(net_exit_list); + if (net_eq(net, peer)) + return id ? : NET_ID_ZERO; + return 0; +} + +/* Must be called from RCU-critical section or with nsid_lock held */ +static int __peernet2id(const struct net *net, struct net *peer) +{ + int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); + + /* Magic value for id 0. */ + if (id == NET_ID_ZERO) + return 0; + if (id > 0) + return id; + + return NETNSA_NSID_NOT_ASSIGNED; +} + +static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, + struct nlmsghdr *nlh, gfp_t gfp); +/* This function returns the id of a peer netns. If no id is assigned, one will + * be allocated and returned. + */ +int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) +{ + int id; + + if (!check_net(net)) + return NETNSA_NSID_NOT_ASSIGNED; + + spin_lock(&net->nsid_lock); + id = __peernet2id(net, peer); + if (id >= 0) { + spin_unlock(&net->nsid_lock); + return id; + } + + /* When peer is obtained from RCU lists, we may race with + * its cleanup. Check whether it's alive, and this guarantees + * we never hash a peer back to net->netns_ids, after it has + * just been idr_remove()'d from there in cleanup_net(). + */ + if (!maybe_get_net(peer)) { + spin_unlock(&net->nsid_lock); + return NETNSA_NSID_NOT_ASSIGNED; + } + + id = alloc_netid(net, peer, -1); + spin_unlock(&net->nsid_lock); - atomic_set(&net->count, 1); - atomic_set(&net->passive, 1); + put_net(peer); + if (id < 0) + return NETNSA_NSID_NOT_ASSIGNED; + + rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp); + + return id; +} +EXPORT_SYMBOL_GPL(peernet2id_alloc); + +/* This function returns, if assigned, the id of a peer netns. */ +int peernet2id(const struct net *net, struct net *peer) +{ + int id; + + rcu_read_lock(); + id = __peernet2id(net, peer); + rcu_read_unlock(); + + return id; +} +EXPORT_SYMBOL(peernet2id); + +/* This function returns true is the peer netns has an id assigned into the + * current netns. + */ +bool peernet_has_id(const struct net *net, struct net *peer) +{ + return peernet2id(net, peer) >= 0; +} + +struct net *get_net_ns_by_id(const struct net *net, int id) +{ + struct net *peer; + + if (id < 0) + return NULL; + + rcu_read_lock(); + peer = idr_find(&net->netns_ids, id); + if (peer) + peer = maybe_get_net(peer); + rcu_read_unlock(); + + return peer; +} +EXPORT_SYMBOL_GPL(get_net_ns_by_id); + +static __net_init void preinit_net_sysctl(struct net *net) +{ + net->core.sysctl_somaxconn = SOMAXCONN; + /* Limits per socket sk_omem_alloc usage. + * TCP zerocopy regular usage needs 128 KB. + */ + net->core.sysctl_optmem_max = 128 * 1024; + net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; + net->core.sysctl_tstamp_allow_data = 1; + net->core.sysctl_txq_reselection = msecs_to_jiffies(1000); +} + +/* init code that must occur even if setup_net() is not called. */ +static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) +{ + int ret; + + ret = ns_common_init(net); + if (ret) + return ret; + + refcount_set(&net->passive, 1); + ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt"); + ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt"); + + get_random_bytes(&net->hash_mix, sizeof(u32)); net->dev_base_seq = 1; net->user_ns = user_ns; -#ifdef NETNS_REFCNT_DEBUG - atomic_set(&net->use_count, 0); + idr_init(&net->netns_ids); + spin_lock_init(&net->nsid_lock); + mutex_init(&net->ipv4.ra_mutex); + +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + mutex_init(&net->rtnl_mutex); + lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); #endif + INIT_LIST_HEAD(&net->ptype_all); + INIT_LIST_HEAD(&net->ptype_specific); + preinit_net_sysctl(net); + return 0; +} + +/* + * setup_net runs the initializers for the network namespace object. + */ +static __net_init int setup_net(struct net *net) +{ + /* Must be called with pernet_ops_rwsem held */ + const struct pernet_operations *ops; + LIST_HEAD(net_exit_list); + int error = 0; + + net->net_cookie = ns_tree_gen_id(net); + list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); if (error < 0) goto out_undo; } + down_write(&net_rwsem); + list_add_tail_rcu(&net->list, &net_namespace_list); + up_write(&net_rwsem); + ns_tree_add_raw(net); out: return error; @@ -176,21 +459,23 @@ out_undo: * for the pernet modules whose init functions did not fail. */ list_add(&net->exit_list, &net_exit_list); - saved_ops = ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); - + ops_undo_list(&pernet_list, ops, &net_exit_list, false); rcu_barrier(); goto out; } - #ifdef CONFIG_NET_NS -static struct kmem_cache *net_cachep; +static struct ucounts *inc_net_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES); +} + +static void dec_net_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_NET_NAMESPACES); +} + +static struct kmem_cache *net_cachep __ro_after_init; static struct workqueue_struct *netns_wq; static struct net *net_alloc(void) @@ -206,160 +491,293 @@ static struct net *net_alloc(void) if (!net) goto out_free; +#ifdef CONFIG_KEYS + net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL); + if (!net->key_domain) + goto out_free_2; + refcount_set(&net->key_domain->usage, 1); +#endif + rcu_assign_pointer(net->gen, ng); out: return net; +#ifdef CONFIG_KEYS +out_free_2: + kmem_cache_free(net_cachep, net); + net = NULL; +#endif out_free: kfree(ng); goto out; } -static void net_free(struct net *net) +static LLIST_HEAD(defer_free_list); + +static void net_complete_free(void) { -#ifdef NETNS_REFCNT_DEBUG - if (unlikely(atomic_read(&net->use_count) != 0)) { - pr_emerg("network namespace not free! Usage: %d\n", - atomic_read(&net->use_count)); - return; + struct llist_node *kill_list; + struct net *net, *next; + + /* Get the list of namespaces to free from last round. */ + kill_list = llist_del_all(&defer_free_list); + + llist_for_each_entry_safe(net, next, kill_list, defer_free_list) + kmem_cache_free(net_cachep, net); + +} + +void net_passive_dec(struct net *net) +{ + if (refcount_dec_and_test(&net->passive)) { + kfree(rcu_access_pointer(net->gen)); + + /* There should not be any trackers left there. */ + ref_tracker_dir_exit(&net->notrefcnt_tracker); + + /* Wait for an extra rcu_barrier() before final free. */ + llist_add(&net->defer_free_list, &defer_free_list); } -#endif - kfree(net->gen); - kmem_cache_free(net_cachep, net); } void net_drop_ns(void *p) { - struct net *ns = p; - if (ns && atomic_dec_and_test(&ns->passive)) - net_free(ns); + struct net *net = (struct net *)p; + + if (net) + net_passive_dec(net); } -struct net *copy_net_ns(unsigned long flags, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { + struct ucounts *ucounts; struct net *net; int rv; if (!(flags & CLONE_NEWNET)) return get_net(old_net); + ucounts = inc_net_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + net = net_alloc(); - if (!net) - return ERR_PTR(-ENOMEM); + if (!net) { + rv = -ENOMEM; + goto dec_ucounts; + } + rv = preinit_net(net, user_ns); + if (rv < 0) + goto dec_ucounts; + net->ucounts = ucounts; get_user_ns(user_ns); - mutex_lock(&net_mutex); - rv = setup_net(net, user_ns); - if (rv == 0) { - rtnl_lock(); - list_add_tail_rcu(&net->list, &net_namespace_list); - rtnl_unlock(); - } - mutex_unlock(&net_mutex); + rv = down_read_killable(&pernet_ops_rwsem); + if (rv < 0) + goto put_userns; + + rv = setup_net(net); + + up_read(&pernet_ops_rwsem); + if (rv < 0) { +put_userns: + ns_common_free(net); +#ifdef CONFIG_KEYS + key_remove_domain(net->key_domain); +#endif put_user_ns(user_ns); - net_drop_ns(net); + net_passive_dec(net); +dec_ucounts: + dec_net_namespaces(ucounts); return ERR_PTR(rv); } return net; } -static DEFINE_SPINLOCK(cleanup_list_lock); -static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ +/** + * net_ns_get_ownership - get sysfs ownership data for @net + * @net: network namespace in question (can be NULL) + * @uid: kernel user ID for sysfs objects + * @gid: kernel group ID for sysfs objects + * + * Returns the uid/gid pair of root in the user namespace associated with the + * given network namespace. + */ +void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) +{ + if (net) { + kuid_t ns_root_uid = make_kuid(net->user_ns, 0); + kgid_t ns_root_gid = make_kgid(net->user_ns, 0); + + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; + } else { + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; + } +} +EXPORT_SYMBOL_GPL(net_ns_get_ownership); + +static void unhash_nsid(struct net *net, struct net *last) +{ + struct net *tmp; + /* This function is only called from cleanup_net() work, + * and this work is the only process, that may delete + * a net from net_namespace_list. So, when the below + * is executing, the list may only grow. Thus, we do not + * use for_each_net_rcu() or net_rwsem. + */ + for_each_net(tmp) { + int id; + + spin_lock(&tmp->nsid_lock); + id = __peernet2id(tmp, net); + if (id >= 0) + idr_remove(&tmp->netns_ids, id); + spin_unlock(&tmp->nsid_lock); + if (id >= 0) + rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, + GFP_KERNEL); + if (tmp == last) + break; + } + spin_lock(&net->nsid_lock); + idr_destroy(&net->netns_ids); + spin_unlock(&net->nsid_lock); +} + +static LLIST_HEAD(cleanup_list); + +struct task_struct *cleanup_net_task; static void cleanup_net(struct work_struct *work) { - const struct pernet_operations *ops; - struct net *net, *tmp; - LIST_HEAD(net_kill_list); + struct llist_node *net_kill_list; + struct net *net, *tmp, *last; LIST_HEAD(net_exit_list); + WRITE_ONCE(cleanup_net_task, current); + /* Atomically snapshot the list of namespaces to cleanup */ - spin_lock_irq(&cleanup_list_lock); - list_replace_init(&cleanup_list, &net_kill_list); - spin_unlock_irq(&cleanup_list_lock); + net_kill_list = llist_del_all(&cleanup_list); - mutex_lock(&net_mutex); + down_read(&pernet_ops_rwsem); /* Don't let anyone else find us. */ - rtnl_lock(); - list_for_each_entry(net, &net_kill_list, cleanup_list) { + down_write(&net_rwsem); + llist_for_each_entry(net, net_kill_list, cleanup_list) { + ns_tree_remove(net); list_del_rcu(&net->list); - list_add_tail(&net->exit_list, &net_exit_list); } - rtnl_unlock(); - - /* - * Another CPU might be rcu-iterating the list, wait for it. - * This needs to be before calling the exit() notifiers, so - * the rcu_barrier() below isn't sufficient alone. + /* Cache last net. After we unlock rtnl, no one new net + * added to net_namespace_list can assign nsid pointer + * to a net from net_kill_list (see peernet2id_alloc()). + * So, we skip them in unhash_nsid(). + * + * Note, that unhash_nsid() does not delete nsid links + * between net_kill_list's nets, as they've already + * deleted from net_namespace_list. But, this would be + * useless anyway, as netns_ids are destroyed there. */ - synchronize_rcu(); + last = list_last_entry(&net_namespace_list, struct net, list); + up_write(&net_rwsem); - /* Run all of the network namespace exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); + llist_for_each_entry(net, net_kill_list, cleanup_list) { + unhash_nsid(net, last); + list_add_tail(&net->exit_list, &net_exit_list); + } - /* Free the net generic variables */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); + ops_undo_list(&pernet_list, NULL, &net_exit_list, true); - mutex_unlock(&net_mutex); + up_read(&pernet_ops_rwsem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. */ rcu_barrier(); + net_complete_free(); + /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); + ns_common_free(net); + dec_net_namespaces(net->ucounts); +#ifdef CONFIG_KEYS + key_remove_domain(net->key_domain); +#endif put_user_ns(net->user_ns); - net_drop_ns(net); + net_passive_dec(net); } + WRITE_ONCE(cleanup_net_task, NULL); } + +/** + * net_ns_barrier - wait until concurrent net_cleanup_work is done + * + * cleanup_net runs from work queue and will first remove namespaces + * from the global list, then run net exit functions. + * + * Call this in module exit path to make sure that all netns + * ->exit ops have been invoked before the function is removed. + */ +void net_ns_barrier(void) +{ + down_write(&pernet_ops_rwsem); + up_write(&pernet_ops_rwsem); +} +EXPORT_SYMBOL(net_ns_barrier); + static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { + ref_tracker_dir_exit(&net->refcnt_tracker); /* Cleanup the network namespace in process context */ - unsigned long flags; - - spin_lock_irqsave(&cleanup_list_lock, flags); - list_add(&net->cleanup_list, &cleanup_list); - spin_unlock_irqrestore(&cleanup_list_lock, flags); - - queue_work(netns_wq, &net_cleanup_work); + if (llist_add(&net->cleanup_list, &cleanup_list)) + queue_work(netns_wq, &net_cleanup_work); } EXPORT_SYMBOL_GPL(__put_net); -struct net *get_net_ns_by_fd(int fd) +/** + * get_net_ns - increment the refcount of the network namespace + * @ns: common namespace (net) + * + * Returns the net's common namespace or ERR_PTR() if ref is zero. + */ +struct ns_common *get_net_ns(struct ns_common *ns) { - struct proc_ns *ei; - struct file *file; struct net *net; - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return ERR_CAST(file); - - ei = get_proc_ns(file_inode(file)); - if (ei->ns_ops == &netns_operations) - net = get_net(ei->ns); - else - net = ERR_PTR(-EINVAL); - - fput(file); - return net; + net = maybe_get_net(container_of(ns, struct net, ns)); + if (net) + return &net->ns; + return ERR_PTR(-EINVAL); } +EXPORT_SYMBOL_GPL(get_net_ns); -#else struct net *get_net_ns_by_fd(int fd) { + CLASS(fd, f)(fd); + + if (fd_empty(f)) + return ERR_PTR(-EBADF); + + if (proc_ns_file(fd_file(f))) { + struct ns_common *ns = get_proc_ns(file_inode(fd_file(f))); + if (ns->ops == &netns_operations) + return get_net(container_of(ns, struct net, ns)); + } + return ERR_PTR(-EINVAL); } +EXPORT_SYMBOL_GPL(get_net_ns_by_fd); #endif struct net *get_net_ns_by_pid(pid_t pid) @@ -373,38 +791,474 @@ struct net *get_net_ns_by_pid(pid_t pid) tsk = find_task_by_vpid(pid); if (tsk) { struct nsproxy *nsproxy; - nsproxy = task_nsproxy(tsk); + task_lock(tsk); + nsproxy = tsk->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); + task_unlock(tsk); } rcu_read_unlock(); return net; } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); -static __net_init int net_ns_net_init(struct net *net) +#ifdef CONFIG_NET_NS_REFCNT_TRACKER +static void net_ns_net_debugfs(struct net *net) +{ + ref_tracker_dir_symlink(&net->refcnt_tracker, "netns-%llx-%u-refcnt", + net->net_cookie, net->ns.inum); + ref_tracker_dir_symlink(&net->notrefcnt_tracker, "netns-%llx-%u-notrefcnt", + net->net_cookie, net->ns.inum); +} + +static int __init init_net_debugfs(void) +{ + ref_tracker_dir_debugfs(&init_net.refcnt_tracker); + ref_tracker_dir_debugfs(&init_net.notrefcnt_tracker); + net_ns_net_debugfs(&init_net); + return 0; +} +late_initcall(init_net_debugfs); +#else +static void net_ns_net_debugfs(struct net *net) { - return proc_alloc_inum(&net->proc_inum); } +#endif -static __net_exit void net_ns_net_exit(struct net *net) +static __net_init int net_ns_net_init(struct net *net) { - proc_free_inum(net->proc_inum); + net_ns_net_debugfs(net); + return 0; } static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, - .exit = net_ns_net_exit, }; -static int __init net_ns_init(void) +static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { + [NETNSA_NONE] = { .type = NLA_UNSPEC }, + [NETNSA_NSID] = { .type = NLA_S32 }, + [NETNSA_PID] = { .type = NLA_U32 }, + [NETNSA_FD] = { .type = NLA_U32 }, + [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, +}; + +static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[NETNSA_MAX + 1]; + struct nlattr *nla; + struct net *peer; + int nsid, err; + + err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb, + NETNSA_MAX, rtnl_net_policy, extack); + if (err < 0) + return err; + if (!tb[NETNSA_NSID]) { + NL_SET_ERR_MSG(extack, "nsid is missing"); + return -EINVAL; + } + nsid = nla_get_s32(tb[NETNSA_NSID]); + + if (tb[NETNSA_PID]) { + peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); + nla = tb[NETNSA_PID]; + } else if (tb[NETNSA_FD]) { + peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); + nla = tb[NETNSA_FD]; + } else { + NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); + return -EINVAL; + } + if (IS_ERR(peer)) { + NL_SET_BAD_ATTR(extack, nla); + NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); + return PTR_ERR(peer); + } + + spin_lock(&net->nsid_lock); + if (__peernet2id(net, peer) >= 0) { + spin_unlock(&net->nsid_lock); + err = -EEXIST; + NL_SET_BAD_ATTR(extack, nla); + NL_SET_ERR_MSG(extack, + "Peer netns already has a nsid assigned"); + goto out; + } + + err = alloc_netid(net, peer, nsid); + spin_unlock(&net->nsid_lock); + if (err >= 0) { + rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, + nlh, GFP_KERNEL); + err = 0; + } else if (err == -ENOSPC && nsid >= 0) { + err = -EEXIST; + NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]); + NL_SET_ERR_MSG(extack, "The specified nsid is already used"); + } +out: + put_net(peer); + return err; +} + +static int rtnl_net_get_size(void) +{ + return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ + ; +} + +struct net_fill_args { + u32 portid; + u32 seq; + int flags; + int cmd; + int nsid; + bool add_ref; + int ref_nsid; +}; + +static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) +{ + struct nlmsghdr *nlh; + struct rtgenmsg *rth; + + nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), + args->flags); + if (!nlh) + return -EMSGSIZE; + + rth = nlmsg_data(nlh); + rth->rtgen_family = AF_UNSPEC; + + if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) + goto nla_put_failure; + + if (args->add_ref && + nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int rtnl_net_valid_getid_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int i, err; + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), + tb, NETNSA_MAX, rtnl_net_policy, + extack); + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, + NETNSA_MAX, rtnl_net_policy, + extack); + if (err) + return err; + + for (i = 0; i <= NETNSA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case NETNSA_PID: + case NETNSA_FD: + case NETNSA_NSID: + case NETNSA_TARGET_NSID: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request"); + return -EINVAL; + } + } + + return 0; +} + +static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[NETNSA_MAX + 1]; + struct net_fill_args fillargs = { + .portid = NETLINK_CB(skb).portid, + .seq = nlh->nlmsg_seq, + .cmd = RTM_NEWNSID, + }; + struct net *peer, *target = net; + struct nlattr *nla; + struct sk_buff *msg; + int err; + + err = rtnl_net_valid_getid_req(skb, nlh, tb, extack); + if (err < 0) + return err; + if (tb[NETNSA_PID]) { + peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); + nla = tb[NETNSA_PID]; + } else if (tb[NETNSA_FD]) { + peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); + nla = tb[NETNSA_FD]; + } else if (tb[NETNSA_NSID]) { + peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID])); + if (!peer) + peer = ERR_PTR(-ENOENT); + nla = tb[NETNSA_NSID]; + } else { + NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); + return -EINVAL; + } + + if (IS_ERR(peer)) { + NL_SET_BAD_ATTR(extack, nla); + NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); + return PTR_ERR(peer); + } + + if (tb[NETNSA_TARGET_NSID]) { + int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); + + target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); + if (IS_ERR(target)) { + NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); + NL_SET_ERR_MSG(extack, + "Target netns reference is invalid"); + err = PTR_ERR(target); + goto out; + } + fillargs.add_ref = true; + fillargs.ref_nsid = peernet2id(net, peer); + } + + msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto out; + } + + fillargs.nsid = peernet2id(target, peer); + err = rtnl_net_fill(msg, &fillargs); + if (err < 0) + goto err_out; + + err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid); + goto out; + +err_out: + nlmsg_free(msg); +out: + if (fillargs.add_ref) + put_net(target); + put_net(peer); + return err; +} + +struct rtnl_net_dump_cb { + struct net *tgt_net; + struct net *ref_net; + struct sk_buff *skb; + struct net_fill_args fillargs; + int idx; + int s_idx; +}; + +/* Runs in RCU-critical section. */ +static int rtnl_net_dumpid_one(int id, void *peer, void *data) +{ + struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data; + int ret; + + if (net_cb->idx < net_cb->s_idx) + goto cont; + + net_cb->fillargs.nsid = id; + if (net_cb->fillargs.add_ref) + net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); + ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); + if (ret < 0) + return ret; + +cont: + net_cb->idx++; + return 0; +} + +static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, + struct rtnl_net_dump_cb *net_cb, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[NETNSA_MAX + 1]; + int err, i; + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, + NETNSA_MAX, rtnl_net_policy, + extack); + if (err < 0) + return err; + + for (i = 0; i <= NETNSA_MAX; i++) { + if (!tb[i]) + continue; + + if (i == NETNSA_TARGET_NSID) { + struct net *net; + + net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); + if (IS_ERR(net)) { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Invalid target network namespace id"); + return PTR_ERR(net); + } + net_cb->fillargs.add_ref = true; + net_cb->ref_net = net_cb->tgt_net; + net_cb->tgt_net = net; + } else { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + +static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rtnl_net_dump_cb net_cb = { + .tgt_net = sock_net(skb->sk), + .skb = skb, + .fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .flags = NLM_F_MULTI, + .cmd = RTM_NEWNSID, + }, + .idx = 0, + .s_idx = cb->args[0], + }; + int err = 0; + + if (cb->strict_check) { + err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); + if (err < 0) + goto end; + } + + rcu_read_lock(); + idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); + rcu_read_unlock(); + + cb->args[0] = net_cb.idx; +end: + if (net_cb.fillargs.add_ref) + put_net(net_cb.tgt_net); + return err; +} + +static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, + struct nlmsghdr *nlh, gfp_t gfp) +{ + struct net_fill_args fillargs = { + .portid = portid, + .seq = nlh ? nlh->nlmsg_seq : 0, + .cmd = cmd, + .nsid = id, + }; + struct sk_buff *msg; + int err = -ENOMEM; + + msg = nlmsg_new(rtnl_net_get_size(), gfp); + if (!msg) + goto out; + + err = rtnl_net_fill(msg, &fillargs); + if (err < 0) + goto err_out; + + rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp); + return; + +err_out: + nlmsg_free(msg); +out: + rtnl_set_sk_err(net, RTNLGRP_NSID, err); +} + +#ifdef CONFIG_NET_NS +static void __init netns_ipv4_struct_check(void) +{ + /* TX readonly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_early_retrans); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_tso_win_divisor); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_tso_rtt_log); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_autocorking); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_min_snd_mss); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_notsent_lowat); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_limit_output_bytes); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_min_rtt_wlen); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_wmem); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_ip_fwd_use_pmtu); + + /* RX readonly hotpath cache line */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_moderate_rcvbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_rcvbuf_low_rtt); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_ip_early_demux); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_early_demux); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_l3mdev_accept); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_reordering); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_rmem); +} +#endif + +static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid, + .dumpit = rtnl_net_dumpid, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, +}; + +void __init net_ns_init(void) { struct net_generic *ng; #ifdef CONFIG_NET_NS + netns_ipv4_struct_check(); net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, - SLAB_PANIC, NULL); + SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Create workqueue for cleanup */ netns_wq = create_singlethread_workqueue("netns"); @@ -418,33 +1272,42 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); - mutex_lock(&net_mutex); - if (setup_net(&init_net, &init_user_ns)) - panic("Could not setup the initial network namespace"); +#ifdef CONFIG_KEYS + init_net.key_domain = &init_net_key_domain; +#endif + /* + * This currently cannot fail as the initial network namespace + * has a static inode number. + */ + if (preinit_net(&init_net, &init_user_ns)) + panic("Could not preinitialize the initial network namespace"); - rtnl_lock(); - list_add_tail_rcu(&init_net.list, &net_namespace_list); - rtnl_unlock(); + down_write(&pernet_ops_rwsem); + if (setup_net(&init_net)) + panic("Could not setup the initial network namespace"); - mutex_unlock(&net_mutex); + init_net_initialized = true; + up_write(&pernet_ops_rwsem); - register_pernet_subsys(&net_ns_ops); + if (register_pernet_subsys(&net_ns_ops)) + panic("Could not register network namespace subsystems"); - return 0; + rtnl_register_many(net_ns_rtnl_msg_handlers); } -pure_initcall(net_ns_init); - #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { + LIST_HEAD(net_exit_list); struct net *net; int error; - LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); - if (ops->init || (ops->id && ops->size)) { + if (ops->init || ops->id) { + /* We held write locked pernet_ops_rwsem, and parallel + * setup_net() and cleanup_net() are not possible. + */ for_each_net(net) { error = ops_init(ops, net); if (error) @@ -457,21 +1320,21 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { - struct net *net; LIST_HEAD(net_exit_list); + struct net *net; - list_del(&ops->list); + /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + + list_del(&ops->list); + ops_undo_single(ops, &net_exit_list); } #else @@ -479,15 +1342,24 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { + if (!init_net_initialized) { + list_add_tail(&ops->list, list); + return 0; + } + return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) { - LIST_HEAD(net_exit_list); - list_add(&init_net.exit_list, &net_exit_list); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + if (!init_net_initialized) { + list_del(&ops->list); + } else { + LIST_HEAD(net_exit_list); + + list_add(&init_net.exit_list, &net_exit_list); + ops_undo_single(ops, &net_exit_list); + } } #endif /* CONFIG_NET_NS */ @@ -499,23 +1371,26 @@ static int register_pernet_operations(struct list_head *list, { int error; + if (WARN_ON(!!ops->id ^ !!ops->size)) + return -EINVAL; + if (ops->id) { -again: - error = ida_get_new_above(&net_generic_ids, 1, ops->id); - if (error < 0) { - if (error == -EAGAIN) { - ida_pre_get(&net_generic_ids, GFP_KERNEL); - goto again; - } + error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID, + GFP_KERNEL); + if (error < 0) return error; - } - max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id); + *ops->id = error; + /* This does not require READ_ONCE as writers already hold + * pernet_ops_rwsem. But WRITE_ONCE is needed to protect + * net_alloc_generic. + */ + WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1)); } error = __register_pernet_operations(list, ops); if (error) { rcu_barrier(); if (ops->id) - ida_remove(&net_generic_ids, *ops->id); + ida_free(&net_generic_ids, *ops->id); } return error; @@ -523,11 +1398,10 @@ again: static void unregister_pernet_operations(struct pernet_operations *ops) { - __unregister_pernet_operations(ops); rcu_barrier(); if (ops->id) - ida_remove(&net_generic_ids, *ops->id); + ida_free(&net_generic_ids, *ops->id); } /** @@ -552,9 +1426,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops) int register_pernet_subsys(struct pernet_operations *ops) { int error; - mutex_lock(&net_mutex); + down_write(&pernet_ops_rwsem); error = register_pernet_operations(first_device, ops); - mutex_unlock(&net_mutex); + up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_subsys); @@ -570,9 +1444,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys); */ void unregister_pernet_subsys(struct pernet_operations *ops) { - mutex_lock(&net_mutex); + down_write(&pernet_ops_rwsem); unregister_pernet_operations(ops); - mutex_unlock(&net_mutex); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); @@ -598,11 +1472,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys); int register_pernet_device(struct pernet_operations *ops) { int error; - mutex_lock(&net_mutex); + down_write(&pernet_ops_rwsem); error = register_pernet_operations(&pernet_list, ops); if (!error && (first_device == &pernet_list)) first_device = &ops->list; - mutex_unlock(&net_mutex); + up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_device); @@ -618,40 +1492,41 @@ EXPORT_SYMBOL_GPL(register_pernet_device); */ void unregister_pernet_device(struct pernet_operations *ops) { - mutex_lock(&net_mutex); + down_write(&pernet_ops_rwsem); if (&ops->list == first_device) first_device = first_device->next; unregister_pernet_operations(ops); - mutex_unlock(&net_mutex); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_device); #ifdef CONFIG_NET_NS -static void *netns_get(struct task_struct *task) +static struct ns_common *netns_get(struct task_struct *task) { struct net *net = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); - rcu_read_unlock(); + task_unlock(task); - return net; + return net ? &net->ns : NULL; } -static void netns_put(void *ns) +static void netns_put(struct ns_common *ns) { - put_net(ns); + put_net(to_net_ns(ns)); } -static int netns_install(struct nsproxy *nsproxy, void *ns) +static int netns_install(struct nsset *nsset, struct ns_common *ns) { - struct net *net = ns; + struct nsproxy *nsproxy = nsset->nsproxy; + struct net *net = to_net_ns(ns); if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_net(nsproxy->net_ns); @@ -659,18 +1534,16 @@ static int netns_install(struct nsproxy *nsproxy, void *ns) return 0; } -static unsigned int netns_inum(void *ns) +static struct user_namespace *netns_owner(struct ns_common *ns) { - struct net *net = ns; - return net->proc_inum; + return to_net_ns(ns)->user_ns; } const struct proc_ns_operations netns_operations = { .name = "net", - .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, - .inum = netns_inum, + .owner = netns_owner, }; #endif diff --git a/net/core/net_test.c b/net/core/net_test.c new file mode 100644 index 000000000000..9c3a590865d2 --- /dev/null +++ b/net/core/net_test.c @@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <kunit/test.h> + +/* GSO */ + +#include <linux/skbuff.h> + +static const char hdr[] = "abcdefgh"; +#define GSO_TEST_SIZE 1000 + +static void __init_skb(struct sk_buff *skb) +{ + skb_reset_mac_header(skb); + memcpy(skb_mac_header(skb), hdr, sizeof(hdr)); + + /* skb_segment expects skb->data at start of payload */ + skb_pull(skb, sizeof(hdr)); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + + /* proto is arbitrary, as long as not ETH_P_TEB or vlan */ + skb->protocol = htons(ETH_P_ATALK); + skb_shinfo(skb)->gso_size = GSO_TEST_SIZE; +} + +enum gso_test_nr { + GSO_TEST_LINEAR, + GSO_TEST_NO_GSO, + GSO_TEST_FRAGS, + GSO_TEST_FRAGS_PURE, + GSO_TEST_GSO_PARTIAL, + GSO_TEST_FRAG_LIST, + GSO_TEST_FRAG_LIST_PURE, + GSO_TEST_FRAG_LIST_NON_UNIFORM, + GSO_TEST_GSO_BY_FRAGS, +}; + +struct gso_test_case { + enum gso_test_nr id; + const char *name; + + /* input */ + unsigned int linear_len; + unsigned int nr_frags; + const unsigned int *frags; + unsigned int nr_frag_skbs; + const unsigned int *frag_skbs; + + /* output as expected */ + unsigned int nr_segs; + const unsigned int *segs; +}; + +static struct gso_test_case cases[] = { + { + .id = GSO_TEST_NO_GSO, + .name = "no_gso", + .linear_len = GSO_TEST_SIZE, + .nr_segs = 1, + .segs = (const unsigned int[]) { GSO_TEST_SIZE }, + }, + { + .id = GSO_TEST_LINEAR, + .name = "linear", + .linear_len = GSO_TEST_SIZE + GSO_TEST_SIZE + 1, + .nr_segs = 3, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 }, + }, + { + .id = GSO_TEST_FRAGS, + .name = "frags", + .linear_len = GSO_TEST_SIZE, + .nr_frags = 2, + .frags = (const unsigned int[]) { GSO_TEST_SIZE, 1 }, + .nr_segs = 3, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 }, + }, + { + .id = GSO_TEST_FRAGS_PURE, + .name = "frags_pure", + .nr_frags = 3, + .frags = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 }, + .nr_segs = 3, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 }, + }, + { + .id = GSO_TEST_GSO_PARTIAL, + .name = "gso_partial", + .linear_len = GSO_TEST_SIZE, + .nr_frags = 2, + .frags = (const unsigned int[]) { GSO_TEST_SIZE, 3 }, + .nr_segs = 2, + .segs = (const unsigned int[]) { 2 * GSO_TEST_SIZE, 3 }, + }, + { + /* commit 89319d3801d1: frag_list on mss boundaries */ + .id = GSO_TEST_FRAG_LIST, + .name = "frag_list", + .linear_len = GSO_TEST_SIZE, + .nr_frag_skbs = 2, + .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE }, + .nr_segs = 3, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE }, + }, + { + .id = GSO_TEST_FRAG_LIST_PURE, + .name = "frag_list_pure", + .nr_frag_skbs = 2, + .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE }, + .nr_segs = 2, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE }, + }, + { + /* commit 43170c4e0ba7: GRO of frag_list trains */ + .id = GSO_TEST_FRAG_LIST_NON_UNIFORM, + .name = "frag_list_non_uniform", + .linear_len = GSO_TEST_SIZE, + .nr_frag_skbs = 4, + .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, 1, GSO_TEST_SIZE, 2 }, + .nr_segs = 4, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE, 3 }, + }, + { + /* commit 3953c46c3ac7 ("sk_buff: allow segmenting based on frag sizes") and + * commit 90017accff61 ("sctp: Add GSO support") + * + * "there will be a cover skb with protocol headers and + * children ones containing the actual segments" + */ + .id = GSO_TEST_GSO_BY_FRAGS, + .name = "gso_by_frags", + .nr_frag_skbs = 4, + .frag_skbs = (const unsigned int[]) { 100, 200, 300, 400 }, + .nr_segs = 4, + .segs = (const unsigned int[]) { 100, 200, 300, 400 }, + }, +}; + +static void gso_test_case_to_desc(struct gso_test_case *t, char *desc) +{ + sprintf(desc, "%s", t->name); +} + +KUNIT_ARRAY_PARAM(gso_test, cases, gso_test_case_to_desc); + +static void gso_test_func(struct kunit *test) +{ + const int shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct sk_buff *skb, *segs, *cur, *next, *last; + const struct gso_test_case *tcase; + netdev_features_t features; + struct page *page; + int i; + + tcase = test->param_value; + + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + skb = build_skb(page_address(page), sizeof(hdr) + tcase->linear_len + shinfo_size); + KUNIT_ASSERT_NOT_NULL(test, skb); + __skb_put(skb, sizeof(hdr) + tcase->linear_len); + + __init_skb(skb); + + if (tcase->nr_frags) { + unsigned int pg_off = 0; + + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + page_ref_add(page, tcase->nr_frags - 1); + + for (i = 0; i < tcase->nr_frags; i++) { + skb_fill_page_desc(skb, i, page, pg_off, tcase->frags[i]); + pg_off += tcase->frags[i]; + } + + KUNIT_ASSERT_LE(test, pg_off, PAGE_SIZE); + + skb->data_len = pg_off; + skb->len += skb->data_len; + skb->truesize += skb->data_len; + } + + if (tcase->frag_skbs) { + unsigned int total_size = 0, total_true_size = 0; + struct sk_buff *frag_skb, *prev = NULL; + + for (i = 0; i < tcase->nr_frag_skbs; i++) { + unsigned int frag_size; + + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + + frag_size = tcase->frag_skbs[i]; + frag_skb = build_skb(page_address(page), + frag_size + shinfo_size); + KUNIT_ASSERT_NOT_NULL(test, frag_skb); + __skb_put(frag_skb, frag_size); + + if (prev) + prev->next = frag_skb; + else + skb_shinfo(skb)->frag_list = frag_skb; + prev = frag_skb; + + total_size += frag_size; + total_true_size += frag_skb->truesize; + } + + skb->len += total_size; + skb->data_len += total_size; + skb->truesize += total_true_size; + + if (tcase->id == GSO_TEST_GSO_BY_FRAGS) + skb_shinfo(skb)->gso_size = GSO_BY_FRAGS; + } + + features = NETIF_F_SG | NETIF_F_HW_CSUM; + if (tcase->id == GSO_TEST_GSO_PARTIAL) + features |= NETIF_F_GSO_PARTIAL; + + /* TODO: this should also work with SG, + * rather than hit BUG_ON(i >= nfrags) + */ + if (tcase->id == GSO_TEST_FRAG_LIST_NON_UNIFORM) + features &= ~NETIF_F_SG; + + segs = skb_segment(skb, features); + if (IS_ERR(segs)) { + KUNIT_FAIL(test, "segs error %pe", segs); + goto free_gso_skb; + } else if (!segs) { + KUNIT_FAIL(test, "no segments"); + goto free_gso_skb; + } + + last = segs->prev; + for (cur = segs, i = 0; cur; cur = next, i++) { + next = cur->next; + + KUNIT_ASSERT_EQ(test, cur->len, sizeof(hdr) + tcase->segs[i]); + + /* segs have skb->data pointing to the mac header */ + KUNIT_ASSERT_PTR_EQ(test, skb_mac_header(cur), cur->data); + KUNIT_ASSERT_PTR_EQ(test, skb_network_header(cur), cur->data + sizeof(hdr)); + + /* header was copied to all segs */ + KUNIT_ASSERT_EQ(test, memcmp(skb_mac_header(cur), hdr, sizeof(hdr)), 0); + + /* last seg can be found through segs->prev pointer */ + if (!next) + KUNIT_ASSERT_PTR_EQ(test, cur, last); + + consume_skb(cur); + } + + KUNIT_ASSERT_EQ(test, i, tcase->nr_segs); + +free_gso_skb: + consume_skb(skb); +} + +/* IP tunnel flags */ + +#include <net/ip_tunnels.h> + +struct ip_tunnel_flags_test { + const char *name; + + const u16 *src_bits; + const u16 *exp_bits; + u8 src_num; + u8 exp_num; + + __be16 exp_val; + bool exp_comp; +}; + +#define IP_TUNNEL_FLAGS_TEST(n, src, comp, eval, exp) { \ + .name = (n), \ + .src_bits = (src), \ + .src_num = ARRAY_SIZE(src), \ + .exp_comp = (comp), \ + .exp_val = (eval), \ + .exp_bits = (exp), \ + .exp_num = ARRAY_SIZE(exp), \ +} + +/* These are __be16-compatible and can be compared as is */ +static const u16 ip_tunnel_flags_1[] = { + IP_TUNNEL_KEY_BIT, + IP_TUNNEL_STRICT_BIT, + IP_TUNNEL_ERSPAN_OPT_BIT, +}; + +/* Due to the previous flags design limitation, setting either + * ``IP_TUNNEL_CSUM_BIT`` (on Big Endian) or ``IP_TUNNEL_DONT_FRAGMENT_BIT`` + * (on Little) also sets VTI/ISATAP bit. In the bitmap implementation, they + * correspond to ``BIT(16)``, which is bigger than ``U16_MAX``, but still is + * backward-compatible. + */ +#ifdef __LITTLE_ENDIAN +#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_DONT_FRAGMENT_BIT +#else +#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_CSUM_BIT +#endif + +static const u16 ip_tunnel_flags_2_src[] = { + IP_TUNNEL_CONFLICT_BIT, +}; + +static const u16 ip_tunnel_flags_2_exp[] = { + IP_TUNNEL_CONFLICT_BIT, + IP_TUNNEL_SIT_ISATAP_BIT, +}; + +/* Bits 17 and higher are not compatible with __be16 flags */ +static const u16 ip_tunnel_flags_3_src[] = { + IP_TUNNEL_VXLAN_OPT_BIT, + 17, + 18, + 20, +}; + +static const u16 ip_tunnel_flags_3_exp[] = { + IP_TUNNEL_VXLAN_OPT_BIT, +}; + +static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = { + IP_TUNNEL_FLAGS_TEST("compat", ip_tunnel_flags_1, true, + cpu_to_be16(BIT(IP_TUNNEL_KEY_BIT) | + BIT(IP_TUNNEL_STRICT_BIT) | + BIT(IP_TUNNEL_ERSPAN_OPT_BIT)), + ip_tunnel_flags_1), + IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true, + VTI_ISVTI, ip_tunnel_flags_2_exp), + IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false, + cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)), + ip_tunnel_flags_3_exp), +}; + +static void +ip_tunnel_flags_test_case_to_desc(const struct ip_tunnel_flags_test *t, + char *desc) +{ + strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE); +} +KUNIT_ARRAY_PARAM(ip_tunnel_flags_test, ip_tunnel_flags_test, + ip_tunnel_flags_test_case_to_desc); + +static void ip_tunnel_flags_test_run(struct kunit *test) +{ + const struct ip_tunnel_flags_test *t = test->param_value; + IP_TUNNEL_DECLARE_FLAGS(src) = { }; + IP_TUNNEL_DECLARE_FLAGS(exp) = { }; + IP_TUNNEL_DECLARE_FLAGS(out); + + for (u32 j = 0; j < t->src_num; j++) + __set_bit(t->src_bits[j], src); + for (u32 j = 0; j < t->exp_num; j++) + __set_bit(t->exp_bits[j], exp); + + KUNIT_ASSERT_EQ(test, t->exp_comp, + ip_tunnel_flags_is_be16_compat(src)); + KUNIT_ASSERT_EQ(test, (__force u16)t->exp_val, + (__force u16)ip_tunnel_flags_to_be16(src)); + + ip_tunnel_flags_from_be16(out, t->exp_val); + KUNIT_ASSERT_TRUE(test, __ipt_flag_op(bitmap_equal, exp, out)); +} + +static struct kunit_case net_test_cases[] = { + KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), + KUNIT_CASE_PARAM(ip_tunnel_flags_test_run, + ip_tunnel_flags_test_gen_params), + { }, +}; + +static struct kunit_suite net_test_suite = { + .name = "net_core", + .test_cases = net_test_cases, +}; +kunit_test_suite(net_test_suite); + +MODULE_DESCRIPTION("KUnit tests for networking core"); +MODULE_LICENSE("GPL"); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c new file mode 100644 index 000000000000..dff66d8fb325 --- /dev/null +++ b/net/core/netclassid_cgroup.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * net/core/netclassid_cgroup.c Classid Cgroupfs Handling + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fdtable.h> +#include <linux/sched/task.h> + +#include <net/cls_cgroup.h> +#include <net/sock.h> + +static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cgroup_cls_state, css) : NULL; +} + +struct cgroup_cls_state *task_cls_state(struct task_struct *p) +{ + return css_cls_state(task_css_check(p, net_cls_cgrp_id, + rcu_read_lock_held() || + rcu_read_lock_bh_held() || + rcu_read_lock_trace_held())); +} +EXPORT_SYMBOL_GPL(task_cls_state); + +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_cls_state *cs; + + cs = kzalloc(sizeof(*cs), GFP_KERNEL); + if (!cs) + return ERR_PTR(-ENOMEM); + + return &cs->css; +} + +static int cgrp_css_online(struct cgroup_subsys_state *css) +{ + struct cgroup_cls_state *cs = css_cls_state(css); + struct cgroup_cls_state *parent = css_cls_state(css->parent); + + if (parent) + cs->classid = parent->classid; + + return 0; +} + +static void cgrp_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_cls_state(css)); +} + +/* + * To avoid freezing of sockets creation for tasks with big number of threads + * and opened sockets lets release file_lock every 1000 iterated descriptors. + * New sockets will already have been created with new classid. + */ + +struct update_classid_context { + u32 classid; + unsigned int batch; +}; + +#define UPDATE_CLASSID_BATCH 1000 + +static int update_classid_sock(const void *v, struct file *file, unsigned int n) +{ + struct update_classid_context *ctx = (void *)v; + struct socket *sock = sock_from_file(file); + + if (sock) + sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); + if (--ctx->batch == 0) { + ctx->batch = UPDATE_CLASSID_BATCH; + return n + 1; + } + return 0; +} + +static void update_classid_task(struct task_struct *p, u32 classid) +{ + struct update_classid_context ctx = { + .classid = classid, + .batch = UPDATE_CLASSID_BATCH + }; + unsigned int fd = 0; + + /* Only update the leader task, when many threads in this task, + * so it can avoid the useless traversal. + */ + if (p != p->group_leader) + return; + + do { + task_lock(p); + fd = iterate_fd(p->files, fd, update_classid_sock, &ctx); + task_unlock(p); + cond_resched(); + } while (fd); +} + +static void cgrp_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + + cgroup_taskset_for_each(p, css, tset) { + update_classid_task(p, css_cls_state(css)->classid); + } +} + +static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return css_cls_state(css)->classid; +} + +static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, + u64 value) +{ + struct cgroup_cls_state *cs = css_cls_state(css); + struct css_task_iter it; + struct task_struct *p; + + cs->classid = (u32)value; + + css_task_iter_start(css, 0, &it); + while ((p = css_task_iter_next(&it))) + update_classid_task(p, cs->classid); + css_task_iter_end(&it); + + return 0; +} + +static struct cftype ss_files[] = { + { + .name = "classid", + .read_u64 = read_classid, + .write_u64 = write_classid, + }, + { } /* terminate */ +}; + +struct cgroup_subsys net_cls_cgrp_subsys = { + .css_alloc = cgrp_css_alloc, + .css_online = cgrp_css_online, + .css_free = cgrp_css_free, + .attach = cgrp_attach, + .legacy_cftypes = ss_files, +}; diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c new file mode 100644 index 000000000000..ba673e81716f --- /dev/null +++ b/net/core/netdev-genl-gen.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/netdev.yaml */ +/* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "netdev-genl-gen.h" + +#include <uapi/linux/netdev.h> +#include <net/netdev_netlink.h> + +/* Integer value ranges */ +static const struct netlink_range_validation netdev_a_page_pool_id_range = { + .min = 1ULL, + .max = U32_MAX, +}; + +static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = { + .min = 1ULL, + .max = S32_MAX, +}; + +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = { + .max = S32_MAX, +}; + +/* Common nested types */ +const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { + [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), + [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), +}; + +const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = { + [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, }, + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), +}; + +/* NETDEV_CMD_DEV_GET - do */ +static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = { + [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* NETDEV_CMD_PAGE_POOL_GET - do */ +#ifdef CONFIG_PAGE_POOL +static const struct nla_policy netdev_page_pool_get_nl_policy[NETDEV_A_PAGE_POOL_ID + 1] = { + [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), +}; +#endif /* CONFIG_PAGE_POOL */ + +/* NETDEV_CMD_PAGE_POOL_STATS_GET - do */ +#ifdef CONFIG_PAGE_POOL_STATS +static const struct nla_policy netdev_page_pool_stats_get_nl_policy[NETDEV_A_PAGE_POOL_STATS_INFO + 1] = { + [NETDEV_A_PAGE_POOL_STATS_INFO] = NLA_POLICY_NESTED(netdev_page_pool_info_nl_policy), +}; +#endif /* CONFIG_PAGE_POOL_STATS */ + +/* NETDEV_CMD_QUEUE_GET - do */ +static const struct nla_policy netdev_queue_get_do_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = { + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, }, +}; + +/* NETDEV_CMD_QUEUE_GET - dump */ +static const struct nla_policy netdev_queue_get_dump_nl_policy[NETDEV_A_QUEUE_IFINDEX + 1] = { + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* NETDEV_CMD_NAPI_GET - do */ +static const struct nla_policy netdev_napi_get_do_nl_policy[NETDEV_A_NAPI_ID + 1] = { + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, +}; + +/* NETDEV_CMD_NAPI_GET - dump */ +static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFINDEX + 1] = { + [NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* NETDEV_CMD_QSTATS_GET - dump */ +static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = { + [NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1), +}; + +/* NETDEV_CMD_BIND_RX - do */ +static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, + [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), +}; + +/* NETDEV_CMD_NAPI_SET - do */ +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED + 1] = { + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, + [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), + [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2), +}; + +/* NETDEV_CMD_BIND_TX - do */ +static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, +}; + +/* Ops table for netdev */ +static const struct genl_split_ops netdev_nl_ops[] = { + { + .cmd = NETDEV_CMD_DEV_GET, + .doit = netdev_nl_dev_get_doit, + .policy = netdev_dev_get_nl_policy, + .maxattr = NETDEV_A_DEV_IFINDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_DEV_GET, + .dumpit = netdev_nl_dev_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +#ifdef CONFIG_PAGE_POOL + { + .cmd = NETDEV_CMD_PAGE_POOL_GET, + .doit = netdev_nl_page_pool_get_doit, + .policy = netdev_page_pool_get_nl_policy, + .maxattr = NETDEV_A_PAGE_POOL_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_PAGE_POOL_GET, + .dumpit = netdev_nl_page_pool_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +#endif /* CONFIG_PAGE_POOL */ +#ifdef CONFIG_PAGE_POOL_STATS + { + .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET, + .doit = netdev_nl_page_pool_stats_get_doit, + .policy = netdev_page_pool_stats_get_nl_policy, + .maxattr = NETDEV_A_PAGE_POOL_STATS_INFO, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET, + .dumpit = netdev_nl_page_pool_stats_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +#endif /* CONFIG_PAGE_POOL_STATS */ + { + .cmd = NETDEV_CMD_QUEUE_GET, + .doit = netdev_nl_queue_get_doit, + .policy = netdev_queue_get_do_nl_policy, + .maxattr = NETDEV_A_QUEUE_TYPE, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_QUEUE_GET, + .dumpit = netdev_nl_queue_get_dumpit, + .policy = netdev_queue_get_dump_nl_policy, + .maxattr = NETDEV_A_QUEUE_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = NETDEV_CMD_NAPI_GET, + .doit = netdev_nl_napi_get_doit, + .policy = netdev_napi_get_do_nl_policy, + .maxattr = NETDEV_A_NAPI_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_NAPI_GET, + .dumpit = netdev_nl_napi_get_dumpit, + .policy = netdev_napi_get_dump_nl_policy, + .maxattr = NETDEV_A_NAPI_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = NETDEV_CMD_QSTATS_GET, + .dumpit = netdev_nl_qstats_get_dumpit, + .policy = netdev_qstats_get_nl_policy, + .maxattr = NETDEV_A_QSTATS_SCOPE, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = NETDEV_CMD_BIND_RX, + .doit = netdev_nl_bind_rx_doit, + .policy = netdev_bind_rx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_NAPI_SET, + .doit = netdev_nl_napi_set_doit, + .policy = netdev_napi_set_nl_policy, + .maxattr = NETDEV_A_NAPI_THREADED, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_BIND_TX, + .doit = netdev_nl_bind_tx_doit, + .policy = netdev_bind_tx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group netdev_nl_mcgrps[] = { + [NETDEV_NLGRP_MGMT] = { "mgmt", }, + [NETDEV_NLGRP_PAGE_POOL] = { "page-pool", }, +}; + +static void __netdev_nl_sock_priv_init(void *priv) +{ + netdev_nl_sock_priv_init(priv); +} + +static void __netdev_nl_sock_priv_destroy(void *priv) +{ + netdev_nl_sock_priv_destroy(priv); +} + +struct genl_family netdev_nl_family __ro_after_init = { + .name = NETDEV_FAMILY_NAME, + .version = NETDEV_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = netdev_nl_ops, + .n_split_ops = ARRAY_SIZE(netdev_nl_ops), + .mcgrps = netdev_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps), + .sock_priv_size = sizeof(struct netdev_nl_sock), + .sock_priv_init = __netdev_nl_sock_priv_init, + .sock_priv_destroy = __netdev_nl_sock_priv_destroy, +}; diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h new file mode 100644 index 000000000000..cffc08517a41 --- /dev/null +++ b/net/core/netdev-genl-gen.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/netdev.yaml */ +/* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#ifndef _LINUX_NETDEV_GEN_H +#define _LINUX_NETDEV_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/netdev.h> +#include <net/netdev_netlink.h> + +/* Common nested types */ +extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; +extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; + +int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, + struct genl_info *info); +int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_queue_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + NETDEV_NLGRP_MGMT, + NETDEV_NLGRP_PAGE_POOL, +}; + +extern struct genl_family netdev_nl_family; + +void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv); +void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv); + +#endif /* _LINUX_NETDEV_GEN_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c new file mode 100644 index 000000000000..470fabbeacd9 --- /dev/null +++ b/net/core/netdev-genl.c @@ -0,0 +1,1203 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/netdevice.h> +#include <linux/notifier.h> +#include <linux/rtnetlink.h> +#include <net/busy_poll.h> +#include <net/net_namespace.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> +#include <net/sock.h> +#include <net/xdp.h> +#include <net/xdp_sock.h> +#include <net/page_pool/memory_provider.h> + +#include "dev.h" +#include "devmem.h" +#include "netdev-genl-gen.h" + +struct netdev_nl_dump_ctx { + unsigned long ifindex; + unsigned int rxq_idx; + unsigned int txq_idx; + unsigned int napi_id; +}; + +static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) +{ + NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx); + + return (struct netdev_nl_dump_ctx *)cb->ctx; +} + +static int +netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info) +{ + u64 xsk_features = 0; + u64 xdp_rx_meta = 0; + void *hdr; + + netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! */ + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + +#define XDP_METADATA_KFUNC(_, flag, __, xmo) \ + if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \ + xdp_rx_meta |= flag; +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + + if (netdev->xsk_tx_metadata_ops) { + if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) + xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; + if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) + xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; + if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time) + xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO; + } + + if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, + netdev->xdp_features, NETDEV_A_DEV_PAD) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, + xdp_rx_meta, NETDEV_A_DEV_PAD) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, + xsk_features, NETDEV_A_DEV_PAD)) + goto err_cancel_msg; + + if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { + if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + netdev->xdp_zc_max_segs)) + goto err_cancel_msg; + } + + genlmsg_end(rsp, hdr); + + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static void +netdev_genl_dev_notify(struct net_device *netdev, int cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + + if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), + NETDEV_NLGRP_MGMT)) + return; + + genl_info_init_ntf(&info, &netdev_nl_family, cmd); + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + if (netdev_nl_dev_fill(netdev, ntf, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf, + 0, NETDEV_NLGRP_MGMT, GFP_KERNEL); +} + +int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device *netdev; + struct sk_buff *rsp; + u32 ifindex; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_free_msg; + } + + err = netdev_nl_dev_fill(netdev, rsp, info); + netdev_unlock(netdev); + + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + struct net *net = sock_net(skb->sk); + int err; + + for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { + err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); + if (err < 0) + return err; + } + + return 0; +} + +static int +netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, + const struct genl_info *info) +{ + unsigned long irq_suspend_timeout; + unsigned long gro_flush_timeout; + u32 napi_defer_hard_irqs; + void *hdr; + pid_t pid; + + if (!napi->dev->up) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) + goto nla_put_failure; + + if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) + goto nla_put_failure; + + if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) + goto nla_put_failure; + + if (nla_put_uint(rsp, NETDEV_A_NAPI_THREADED, + napi_get_threaded(napi))) + goto nla_put_failure; + + if (napi->thread) { + pid = task_pid_nr(napi->thread); + if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) + goto nla_put_failure; + } + + napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi); + if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS, + napi_defer_hard_irqs)) + goto nla_put_failure; + + irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + irq_suspend_timeout)) + goto nla_put_failure; + + gro_flush_timeout = napi_get_gro_flush_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + gro_flush_timeout)) + goto nla_put_failure; + + genlmsg_end(rsp, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct napi_struct *napi; + struct sk_buff *rsp; + u32 napi_id; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) + return -EINVAL; + + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); + if (napi) { + err = netdev_nl_napi_fill_one(rsp, napi, info); + netdev_unlock(napi->dev); + } else { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); + err = -ENOENT; + } + + if (err) { + goto err_free_msg; + } else if (!rsp->len) { + err = -ENOENT; + goto err_free_msg; + } + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + struct napi_struct *napi; + unsigned int prev_id; + int err = 0; + + if (!netdev->up) + return err; + + prev_id = UINT_MAX; + list_for_each_entry(napi, &netdev->napi_list, dev_list) { + if (!napi_id_valid(napi->napi_id)) + continue; + + /* Dump continuation below depends on the list being sorted */ + WARN_ON_ONCE(napi->napi_id >= prev_id); + prev_id = napi->napi_id; + + if (ctx->napi_id && napi->napi_id >= ctx->napi_id) + continue; + + err = netdev_nl_napi_fill_one(rsp, napi, info); + if (err) + return err; + ctx->napi_id = napi->napi_id; + } + return err; +} + +int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + u32 ifindex = 0; + int err = 0; + + if (info->attrs[NETDEV_A_NAPI_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); + + if (ifindex) { + netdev = netdev_get_by_index_lock(net, ifindex); + if (netdev) { + err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); + netdev_unlock(netdev); + } else { + err = -ENODEV; + } + } else { + for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { + err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); + if (err < 0) + break; + ctx->napi_id = 0; + } + } + + return err; +} + +static int +netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) +{ + u64 irq_suspend_timeout = 0; + u64 gro_flush_timeout = 0; + u8 threaded = 0; + u32 defer = 0; + + if (info->attrs[NETDEV_A_NAPI_THREADED]) { + int ret; + + threaded = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]); + ret = napi_set_threaded(napi, threaded); + if (ret) + return ret; + } + + if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { + defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); + napi_set_defer_hard_irqs(napi, defer); + } + + if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { + irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); + napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); + } + + if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { + gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); + napi_set_gro_flush_timeout(napi, gro_flush_timeout); + } + + return 0; +} + +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct napi_struct *napi; + unsigned int napi_id; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) + return -EINVAL; + + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); + + napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); + if (napi) { + err = netdev_nl_napi_set_config(napi, info); + netdev_unlock(napi->dev); + } else { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); + err = -ENOENT; + } + + return err; +} + +static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi) +{ + if (napi && napi_id_valid(napi->napi_id)) + return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id); + return 0; +} + +static int +netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, + u32 q_idx, u32 q_type, const struct genl_info *info) +{ + struct pp_memory_provider_params *params; + struct netdev_rx_queue *rxq; + struct netdev_queue *txq; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) || + nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) || + nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex)) + goto nla_put_failure; + + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + rxq = __netif_get_rx_queue(netdev, q_idx); + if (nla_put_napi_id(rsp, rxq->napi)) + goto nla_put_failure; + + params = &rxq->mp_params; + if (params->mp_ops && + params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) + goto nla_put_failure; +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) + if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) + goto nla_put_failure; +#endif + + break; + case NETDEV_QUEUE_TYPE_TX: + txq = netdev_get_tx_queue(netdev, q_idx); + if (nla_put_napi_id(rsp, txq->napi)) + goto nla_put_failure; +#ifdef CONFIG_XDP_SOCKETS + if (txq->pool) + if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) + goto nla_put_failure; +#endif + break; + } + + genlmsg_end(rsp, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id, + u32 q_type) +{ + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + if (q_id >= netdev->real_num_rx_queues) + return -EINVAL; + return 0; + case NETDEV_QUEUE_TYPE_TX: + if (q_id >= netdev->real_num_tx_queues) + return -EINVAL; + } + return 0; +} + +static int +netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, + u32 q_type, const struct genl_info *info) +{ + int err; + + if (!netdev->up) + return -ENOENT; + + err = netdev_nl_queue_validate(netdev, q_idx, q_type); + if (err) + return err; + + return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info); +} + +int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + u32 q_id, q_type, ifindex; + struct net_device *netdev; + struct sk_buff *rsp; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX)) + return -EINVAL; + + q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]); + q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]); + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info), + ifindex); + if (netdev) { + err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); + netdev_unlock_ops_compat(netdev); + } else { + err = -ENODEV; + } + + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + int err = 0; + + if (!netdev->up) + return err; + + for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx, + NETDEV_QUEUE_TYPE_RX, info); + if (err) + return err; + } + for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx, + NETDEV_QUEUE_TYPE_TX, info); + if (err) + return err; + } + + return err; +} + +int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + u32 ifindex = 0; + int err = 0; + + if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); + + if (ifindex) { + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); + if (netdev) { + err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); + netdev_unlock_ops_compat(netdev); + } else { + err = -ENODEV; + } + } else { + for_each_netdev_lock_ops_compat_scoped(net, netdev, + ctx->ifindex) { + err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); + if (err < 0) + break; + ctx->rxq_idx = 0; + ctx->txq_idx = 0; + } + } + + return err; +} + +#define NETDEV_STAT_NOT_SET (~0ULL) + +static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size) +{ + const u64 *add = _add; + u64 *sum = _sum; + + while (size) { + if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET) + *sum += *add; + sum++; + add++; + size -= 8; + } +} + +static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value) +{ + if (value == NETDEV_STAT_NOT_SET) + return 0; + return nla_put_uint(rsp, attr_id, value); +} + +static int +netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) +{ + if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits)) + return -EMSGSIZE; + return 0; +} + +static int +netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) +{ + if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake)) + return -EMSGSIZE; + return 0; +} + +static int +netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp, + u32 q_type, int i, const struct genl_info *info) +{ + const struct netdev_stat_ops *ops = netdev->stat_ops; + struct netdev_queue_stats_rx rx; + struct netdev_queue_stats_tx tx; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) || + nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) || + nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i)) + goto nla_put_failure; + + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + memset(&rx, 0xff, sizeof(rx)); + ops->get_queue_stats_rx(netdev, i, &rx); + if (!memchr_inv(&rx, 0xff, sizeof(rx))) + goto nla_cancel; + if (netdev_nl_stats_write_rx(rsp, &rx)) + goto nla_put_failure; + break; + case NETDEV_QUEUE_TYPE_TX: + memset(&tx, 0xff, sizeof(tx)); + ops->get_queue_stats_tx(netdev, i, &tx); + if (!memchr_inv(&tx, 0xff, sizeof(tx))) + goto nla_cancel; + if (netdev_nl_stats_write_tx(rsp, &tx)) + goto nla_put_failure; + break; + } + + genlmsg_end(rsp, hdr); + return 0; + +nla_cancel: + genlmsg_cancel(rsp, hdr); + return 0; +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int +netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + const struct netdev_stat_ops *ops = netdev->stat_ops; + int i, err; + + if (!(netdev->flags & IFF_UP)) + return 0; + + i = ctx->rxq_idx; + while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) { + err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX, + i, info); + if (err) + return err; + ctx->rxq_idx = ++i; + } + i = ctx->txq_idx; + while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { + err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX, + i, info); + if (err) + return err; + ctx->txq_idx = ++i; + } + + ctx->rxq_idx = 0; + ctx->txq_idx = 0; + return 0; +} + +/** + * netdev_stat_queue_sum() - add up queue stats from range of queues + * @netdev: net_device + * @rx_start: index of the first Rx queue to query + * @rx_end: index after the last Rx queue (first *not* to query) + * @rx_sum: output Rx stats, should be already initialized + * @tx_start: index of the first Tx queue to query + * @tx_end: index after the last Tx queue (first *not* to query) + * @tx_sum: output Tx stats, should be already initialized + * + * Add stats from [start, end) range of queue IDs to *x_sum structs. + * The sum structs must be already initialized. Usually this + * helper is invoked from the .get_base_stats callbacks of drivers + * to account for stats of disabled queues. In that case the ranges + * are usually [netdev->real_num_*x_queues, netdev->num_*x_queues). + */ +void netdev_stat_queue_sum(struct net_device *netdev, + int rx_start, int rx_end, + struct netdev_queue_stats_rx *rx_sum, + int tx_start, int tx_end, + struct netdev_queue_stats_tx *tx_sum) +{ + const struct netdev_stat_ops *ops; + struct netdev_queue_stats_rx rx; + struct netdev_queue_stats_tx tx; + int i; + + ops = netdev->stat_ops; + + for (i = rx_start; i < rx_end; i++) { + memset(&rx, 0xff, sizeof(rx)); + if (ops->get_queue_stats_rx) + ops->get_queue_stats_rx(netdev, i, &rx); + netdev_nl_stats_add(rx_sum, &rx, sizeof(rx)); + } + for (i = tx_start; i < tx_end; i++) { + memset(&tx, 0xff, sizeof(tx)); + if (ops->get_queue_stats_tx) + ops->get_queue_stats_tx(netdev, i, &tx); + netdev_nl_stats_add(tx_sum, &tx, sizeof(tx)); + } +} +EXPORT_SYMBOL(netdev_stat_queue_sum); + +static int +netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info) +{ + struct netdev_queue_stats_rx rx_sum; + struct netdev_queue_stats_tx tx_sum; + void *hdr; + + /* Netdev can't guarantee any complete counters */ + if (!netdev->stat_ops->get_base_stats) + return 0; + + memset(&rx_sum, 0xff, sizeof(rx_sum)); + memset(&tx_sum, 0xff, sizeof(tx_sum)); + + netdev->stat_ops->get_base_stats(netdev, &rx_sum, &tx_sum); + + /* The op was there, but nothing reported, don't bother */ + if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && + !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum))) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) + goto nla_put_failure; + + netdev_stat_queue_sum(netdev, 0, netdev->real_num_rx_queues, &rx_sum, + 0, netdev->real_num_tx_queues, &tx_sum); + + if (netdev_nl_stats_write_rx(rsp, &rx_sum) || + netdev_nl_stats_write_tx(rsp, &tx_sum)) + goto nla_put_failure; + + genlmsg_end(rsp, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int +netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, + struct sk_buff *skb, const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + if (!netdev->stat_ops) + return 0; + + switch (scope) { + case 0: + return netdev_nl_stats_by_netdev(netdev, skb, info); + case NETDEV_QSTATS_SCOPE_QUEUE: + return netdev_nl_stats_by_queue(netdev, skb, info, ctx); + } + + return -EINVAL; /* Should not happen, per netlink policy */ +} + +int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + unsigned int ifindex; + unsigned int scope; + int err = 0; + + scope = 0; + if (info->attrs[NETDEV_A_QSTATS_SCOPE]) + scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); + + ifindex = 0; + if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); + + if (ifindex) { + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); + if (!netdev) { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + return -ENODEV; + } + if (netdev->stat_ops) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + } else { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + err = -EOPNOTSUPP; + } + netdev_unlock_ops_compat(netdev); + return err; + } + + for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + if (err < 0) + break; + } + + return err; +} + +static int netdev_nl_read_rxq_bitmap(struct genl_info *info, + u32 rxq_bitmap_len, + unsigned long *rxq_bitmap) +{ + const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; + struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; + struct nlattr *attr; + int rem, err = 0; + u32 rxq_idx; + + nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, + genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + err = nla_parse_nested(tb, maxtype, attr, + netdev_queue_id_nl_policy, info->extack); + if (err < 0) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || + NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) + return -EINVAL; + + if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); + return -EINVAL; + } + + rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); + if (rxq_idx >= rxq_bitmap_len) { + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]); + return -EINVAL; + } + + bitmap_set(rxq_bitmap, rxq_idx, 1); + } + + return 0; +} + +static struct device * +netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap, + struct netlink_ext_ack *extack) +{ + struct device *dma_dev = NULL; + u32 rxq_idx, prev_rxq_idx; + + for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { + struct device *rxq_dma_dev; + + rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx); + if (dma_dev && rxq_dma_dev != dma_dev) { + NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)", + rxq_idx, prev_rxq_idx); + return ERR_PTR(-EOPNOTSUPP); + } + + dma_dev = rxq_dma_dev; + prev_rxq_idx = rxq_idx; + } + + return dma_dev; +} + +int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net_devmem_dmabuf_binding *binding; + u32 ifindex, dmabuf_fd, rxq_idx; + struct netdev_nl_sock *priv; + struct net_device *netdev; + unsigned long *rxq_bitmap; + struct device *dma_dev; + struct sk_buff *rsp; + int err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + mutex_lock(&priv->lock); + + err = 0; + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_unlock_sock; + } + if (!netif_device_present(netdev)) + err = -ENODEV; + else if (!netdev_need_ops_lock(netdev)) + err = -EOPNOTSUPP; + if (err) { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_DEV_IFINDEX]); + goto err_unlock; + } + + rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL); + if (!rxq_bitmap) { + err = -ENOMEM; + goto err_unlock; + } + + err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues, + rxq_bitmap); + if (err) + goto err_rxq_bitmap; + + dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack); + if (IS_ERR(dma_dev)) { + err = PTR_ERR(dma_dev); + goto err_rxq_bitmap; + } + + binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE, + dmabuf_fd, priv, info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_rxq_bitmap; + } + + for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { + err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, + info->extack); + if (err) + goto err_unbind; + } + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + err = genlmsg_reply(rsp, info); + if (err) + goto err_unbind; + + bitmap_free(rxq_bitmap); + + netdev_unlock(netdev); + + mutex_unlock(&priv->lock); + + return 0; + +err_unbind: + net_devmem_unbind_dmabuf(binding); +err_rxq_bitmap: + bitmap_free(rxq_bitmap); +err_unlock: + netdev_unlock(netdev); +err_unlock_sock: + mutex_unlock(&priv->lock); +err_genlmsg_free: + nlmsg_free(rsp); + return err; +} + +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net_devmem_dmabuf_binding *binding; + struct netdev_nl_sock *priv; + struct net_device *netdev; + struct device *dma_dev; + u32 ifindex, dmabuf_fd; + struct sk_buff *rsp; + int err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + mutex_lock(&priv->lock); + + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_unlock_sock; + } + + if (!netif_device_present(netdev)) { + err = -ENODEV; + goto err_unlock_netdev; + } + + if (!netdev->netmem_tx) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(info->extack, + "Driver does not support netmem TX"); + goto err_unlock_netdev; + } + + dma_dev = netdev_queue_get_dma_dev(netdev, 0); + binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE, + dmabuf_fd, priv, info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_unlock_netdev; + } + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + netdev_unlock(netdev); + mutex_unlock(&priv->lock); + + return genlmsg_reply(rsp, info); + +err_unlock_netdev: + netdev_unlock(netdev); +err_unlock_sock: + mutex_unlock(&priv->lock); +err_genlmsg_free: + nlmsg_free(rsp); + return err; +} + +void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) +{ + INIT_LIST_HEAD(&priv->bindings); + mutex_init(&priv->lock); +} + +void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv) +{ + struct net_devmem_dmabuf_binding *binding; + struct net_devmem_dmabuf_binding *temp; + netdevice_tracker dev_tracker; + struct net_device *dev; + + mutex_lock(&priv->lock); + list_for_each_entry_safe(binding, temp, &priv->bindings, list) { + mutex_lock(&binding->lock); + dev = binding->dev; + if (!dev) { + mutex_unlock(&binding->lock); + net_devmem_unbind_dmabuf(binding); + continue; + } + netdev_hold(dev, &dev_tracker, GFP_KERNEL); + mutex_unlock(&binding->lock); + + netdev_lock(dev); + net_devmem_unbind_dmabuf(binding); + netdev_unlock(dev); + netdev_put(dev, &dev_tracker); + } + mutex_unlock(&priv->lock); +} + +static int netdev_genl_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_REGISTER: + netdev_lock_ops_to_full(netdev); + netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); + netdev_unlock_full_to_ops(netdev); + break; + case NETDEV_UNREGISTER: + netdev_lock(netdev); + netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); + netdev_unlock(netdev); + break; + case NETDEV_XDP_FEAT_CHANGE: + netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block netdev_genl_nb = { + .notifier_call = netdev_genl_netdevice_event, +}; + +static int __init netdev_genl_init(void) +{ + int err; + + err = register_netdevice_notifier(&netdev_genl_nb); + if (err) + return err; + + err = genl_register_family(&netdev_nl_family); + if (err) + goto err_unreg_ntf; + + return 0; + +err_unreg_ntf: + unregister_netdevice_notifier(&netdev_genl_nb); + return err; +} + +subsys_initcall(netdev_genl_init); diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c new file mode 100644 index 000000000000..251f27a8307f --- /dev/null +++ b/net/core/netdev_queues.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <net/netdev_queues.h> + +/** + * netdev_queue_get_dma_dev() - get dma device for zero-copy operations + * @dev: net_device + * @idx: queue index + * + * Get dma device for zero-copy operations to be used for this queue. + * When such device is not available or valid, the function will return NULL. + * + * Return: Device or NULL on error + */ +struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) +{ + const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; + struct device *dma_dev; + + if (queue_ops && queue_ops->ndo_queue_get_dma_dev) + dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx); + else + dma_dev = dev->dev.parent; + + return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; +} + diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c new file mode 100644 index 000000000000..c7d9341b7630 --- /dev/null +++ b/net/core/netdev_rx_queue.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/ethtool_netlink.h> +#include <linux/netdevice.h> +#include <net/netdev_lock.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/memory_provider.h> + +#include "page_pool_priv.h" + +/* See also page_pool_is_unreadable() */ +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) +{ + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); + + return !!rxq->mp_params.mp_ops; +} +EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); + +int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) +{ + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); + const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; + void *new_mem, *old_mem; + int err; + + if (!qops || !qops->ndo_queue_stop || !qops->ndo_queue_mem_free || + !qops->ndo_queue_mem_alloc || !qops->ndo_queue_start) + return -EOPNOTSUPP; + + netdev_assert_locked(dev); + + new_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); + if (!new_mem) + return -ENOMEM; + + old_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); + if (!old_mem) { + err = -ENOMEM; + goto err_free_new_mem; + } + + err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); + if (err) + goto err_free_old_mem; + + err = page_pool_check_memory_provider(dev, rxq); + if (err) + goto err_free_new_queue_mem; + + if (netif_running(dev)) { + err = qops->ndo_queue_stop(dev, old_mem, rxq_idx); + if (err) + goto err_free_new_queue_mem; + + err = qops->ndo_queue_start(dev, new_mem, rxq_idx); + if (err) + goto err_start_queue; + } else { + swap(new_mem, old_mem); + } + + qops->ndo_queue_mem_free(dev, old_mem); + + kvfree(old_mem); + kvfree(new_mem); + + return 0; + +err_start_queue: + /* Restarting the queue with old_mem should be successful as we haven't + * changed any of the queue configuration, and there is not much we can + * do to recover from a failure here. + * + * WARN if we fail to recover the old rx queue, and at least free + * old_mem so we don't also leak that. + */ + if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) { + WARN(1, + "Failed to restart old queue in error path. RX queue %d may be unhealthy.", + rxq_idx); + qops->ndo_queue_mem_free(dev, old_mem); + } + +err_free_new_queue_mem: + qops->ndo_queue_mem_free(dev, new_mem); + +err_free_old_mem: + kvfree(old_mem); + +err_free_new_mem: + kvfree(new_mem); + + return err; +} +EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); + +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack) +{ + struct netdev_rx_queue *rxq; + int ret; + + if (!netdev_need_ops_lock(dev)) + return -EOPNOTSUPP; + + if (rxq_idx >= dev->real_num_rx_queues) { + NL_SET_ERR_MSG(extack, "rx queue index out of range"); + return -ERANGE; + } + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); + + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { + NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); + return -EINVAL; + } + if (dev->cfg->hds_thresh) { + NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); + return -EINVAL; + } + if (dev_xdp_prog_count(dev)) { + NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); + return -EEXIST; + } + + rxq = __netif_get_rx_queue(dev, rxq_idx); + if (rxq->mp_params.mp_ops) { + NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); + return -EEXIST; + } +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) { + NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); + return -EBUSY; + } +#endif + + rxq->mp_params = *p; + ret = netdev_rx_queue_restart(dev, rxq_idx); + if (ret) { + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + } + return ret; +} + +int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + struct pp_memory_provider_params *p) +{ + int ret; + + netdev_lock(dev); + ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL); + netdev_unlock(dev); + return ret; +} + +void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, + const struct pp_memory_provider_params *old_p) +{ + struct netdev_rx_queue *rxq; + int err; + + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) + return; + + rxq = __netif_get_rx_queue(dev, ifq_idx); + + /* Callers holding a netdev ref may get here after we already + * went thru shutdown via dev_memory_provider_uninstall(). + */ + if (dev->reg_state > NETREG_REGISTERED && + !rxq->mp_params.mp_ops) + return; + + if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || + rxq->mp_params.mp_priv != old_p->mp_priv)) + return; + + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + err = netdev_rx_queue_restart(dev, ifq_idx); + WARN_ON(err && err != -ENETDOWN); +} + +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + netdev_lock(dev); + __net_mp_close_rxq(dev, ifq_idx, old_p); + netdev_unlock(dev); +} diff --git a/net/core/netevent.c b/net/core/netevent.c index f17ccd291d39..5bb615e963cc 100644 --- a/net/core/netevent.c +++ b/net/core/netevent.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Network event notifiers * @@ -5,11 +6,6 @@ * Tom Tucker <tom@opengridcomputing.com> * Steve Wise <swise@opengridcomputing.com> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Fixes: */ @@ -31,15 +27,12 @@ static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain); */ int register_netevent_notifier(struct notifier_block *nb) { - int err; - - err = atomic_notifier_chain_register(&netevent_notif_chain, nb); - return err; + return atomic_notifier_chain_register(&netevent_notif_chain, nb); } EXPORT_SYMBOL_GPL(register_netevent_notifier); /** - * netevent_unregister_notifier - unregister a netevent notifier block + * unregister_netevent_notifier - unregister a netevent notifier block * @nb: notifier * * Unregister a notifier previously registered by diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h new file mode 100644 index 000000000000..23175cb2bd86 --- /dev/null +++ b/net/core/netmem_priv.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NETMEM_PRIV_H +#define __NETMEM_PRIV_H + +static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) +{ + return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; +} + +static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) +{ + netmem_to_nmdesc(netmem)->pp_magic |= pp_magic; +} + +static inline void netmem_clear_pp_magic(netmem_ref netmem) +{ + WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK); + + netmem_to_nmdesc(netmem)->pp_magic = 0; +} + +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; +} + +static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) +{ + netmem_to_nmdesc(netmem)->pp = pool; +} + +static inline void netmem_set_dma_addr(netmem_ref netmem, + unsigned long dma_addr) +{ + netmem_to_nmdesc(netmem)->dma_addr = dma_addr; +} + +static inline unsigned long netmem_get_dma_index(netmem_ref netmem) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return 0; + + magic = netmem_to_nmdesc(netmem)->pp_magic; + + return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; +} + +static inline void netmem_set_dma_index(netmem_ref netmem, + unsigned long id) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return; + + magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); + netmem_to_nmdesc(netmem)->pp_magic = magic; +} +#endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2c637e9a0b27..09f72f10813c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Common framework for low-level network console, dump, and debugger code * @@ -33,8 +34,9 @@ #include <net/addrconf.h> #include <net/ndisc.h> #include <net/ip6_checksum.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <trace/events/napi.h> +#include <linux/kconfig.h> /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -43,16 +45,7 @@ #define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 - -static struct sk_buff_head skb_pool; - -static atomic_t trapped; - -DEFINE_STATIC_SRCU(netpoll_srcu); - #define USEC_PER_POLL 50 -#define NETPOLL_RX_ENABLED 1 -#define NETPOLL_RX_DROP 2 #define MAX_SKB_SIZE \ (sizeof(struct ethhdr) + \ @@ -61,18 +54,36 @@ DEFINE_STATIC_SRCU(netpoll_srcu); MAX_UDP_CHUNK) static void zap_completion_queue(void); -static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo); -static void netpoll_async_cleanup(struct work_struct *work); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); -#define np_info(np, fmt, ...) \ - pr_info("%s: " fmt, np->name, ##__VA_ARGS__) -#define np_err(np, fmt, ...) \ - pr_err("%s: " fmt, np->name, ##__VA_ARGS__) -#define np_notice(np, fmt, ...) \ - pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) +static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, + struct net_device *dev, + struct netdev_queue *txq) +{ + netdev_tx_t status = NETDEV_TX_OK; + netdev_features_t features; + + features = netif_skb_features(skb); + + if (skb_vlan_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) { + skb = __vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) { + /* This is actually a packet drop, but we + * don't want the code that calls this + * function to try and operate on a NULL skb. + */ + goto out; + } + } + + status = netdev_start_xmit(skb, dev, txq, false); + +out: + return status; +} static void queue_process(struct work_struct *work) { @@ -83,209 +94,150 @@ static void queue_process(struct work_struct *work) while ((skb = skb_dequeue(&npinfo->txq))) { struct net_device *dev = skb->dev; - const struct net_device_ops *ops = dev->netdev_ops; struct netdev_queue *txq; + unsigned int q_index; if (!netif_device_present(dev) || !netif_running(dev)) { - __kfree_skb(skb); + kfree_skb(skb); continue; } - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - local_irq_save(flags); - __netif_tx_lock(txq, smp_processor_id()); + /* check if skb->queue_mapping is still valid */ + q_index = skb_get_queue_mapping(skb); + if (unlikely(q_index >= dev->real_num_tx_queues)) { + q_index = q_index % dev->real_num_tx_queues; + skb_set_queue_mapping(skb, q_index); + } + txq = netdev_get_tx_queue(dev, q_index); + HARD_TX_LOCK(dev, txq, smp_processor_id()); if (netif_xmit_frozen_or_stopped(txq) || - ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { + !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) { skb_queue_head(&npinfo->txq, skb); - __netif_tx_unlock(txq); + HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); schedule_delayed_work(&npinfo->tx_work, HZ/10); return; } - __netif_tx_unlock(txq); + HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); } } -static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh, - unsigned short ulen, __be32 saddr, __be32 daddr) +static int netif_local_xmit_active(struct net_device *dev) { - __wsum psum; - - if (uh->check == 0 || skb_csum_unnecessary(skb)) - return 0; - - psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + int i; - if (skb->ip_summed == CHECKSUM_COMPLETE && - !csum_fold(csum_add(psum, skb->csum))) - return 0; + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - skb->csum = psum; + if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id()) + return 1; + } - return __skb_checksum_complete(skb); + return 0; } -/* - * Check whether delayed processing was scheduled for our NIC. If so, - * we attempt to grab the poll lock and use ->poll() to pump the card. - * If this fails, either we've recursed in ->poll() or it's already - * running on another CPU. - * - * Note: we don't mask interrupts with this lock because we're using - * trylock here and interrupts are already disabled in the softirq - * case. Further, we test the poll_owner to avoid recursion on UP - * systems where the lock doesn't exist. - * - * In cases where there is bi-directional communications, reading only - * one message at a time can lead to packets being dropped by the - * network adapter, forcing superfluous retries and possibly timeouts. - * Thus, we set our budget to greater than 1. - */ -static int poll_one_napi(struct netpoll_info *npinfo, - struct napi_struct *napi, int budget) +static void poll_one_napi(struct napi_struct *napi) { int work; - /* net_rx_action's ->poll() invocations and our's are - * synchronized by this test which is only made while - * holding the napi->poll_lock. + /* If we set this bit but see that it has already been set, + * that indicates that napi has been disabled and we need + * to abort this operation */ - if (!test_bit(NAPI_STATE_SCHED, &napi->state)) - return budget; - - npinfo->rx_flags |= NETPOLL_RX_DROP; - atomic_inc(&trapped); - set_bit(NAPI_STATE_NPSVC, &napi->state); + if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) + return; - work = napi->poll(napi, budget); - trace_napi_poll(napi); + /* We explicitly pass the polling call a budget of 0 to + * indicate that we are clearing the Tx path only. + */ + work = napi->poll(napi, 0); + WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll); + trace_napi_poll(napi, work, 0); clear_bit(NAPI_STATE_NPSVC, &napi->state); - atomic_dec(&trapped); - npinfo->rx_flags &= ~NETPOLL_RX_DROP; - - return budget - work; } static void poll_napi(struct net_device *dev) { struct napi_struct *napi; - int budget = 16; + int cpu = smp_processor_id(); - list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner != smp_processor_id() && - spin_trylock(&napi->poll_lock)) { - budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), - napi, budget); - spin_unlock(&napi->poll_lock); - - if (!budget) - break; + list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { + if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) { + poll_one_napi(napi); + smp_store_release(&napi->poll_owner, -1); } } } -static void service_neigh_queue(struct netpoll_info *npi) +void netpoll_poll_dev(struct net_device *dev) { - if (npi) { - struct sk_buff *skb; - - while ((skb = skb_dequeue(&npi->neigh_tx))) - netpoll_neigh_reply(skb, npi); - } -} - -static void netpoll_poll_dev(struct net_device *dev) -{ - const struct net_device_ops *ops; struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); + const struct net_device_ops *ops; /* Don't do any rx activity if the dev_lock mutex is held * the dev_open/close paths use this to block netpoll activity * while changing device state */ - if (down_trylock(&ni->dev_lock)) + if (!ni || down_trylock(&ni->dev_lock)) return; - if (!netif_running(dev)) { + /* Some drivers will take the same locks in poll and xmit, + * we can't poll if local CPU is already in xmit. + */ + if (!netif_running(dev) || netif_local_xmit_active(dev)) { up(&ni->dev_lock); return; } ops = dev->netdev_ops; - if (!ops->ndo_poll_controller) { - up(&ni->dev_lock); - return; - } - - /* Process pending work on NIC */ - ops->ndo_poll_controller(dev); + if (ops->ndo_poll_controller) + ops->ndo_poll_controller(dev); poll_napi(dev); up(&ni->dev_lock); - if (dev->flags & IFF_SLAVE) { - if (ni) { - struct net_device *bond_dev; - struct sk_buff *skb; - struct netpoll_info *bond_ni; - - bond_dev = netdev_master_upper_dev_get_rcu(dev); - bond_ni = rcu_dereference_bh(bond_dev->npinfo); - while ((skb = skb_dequeue(&ni->neigh_tx))) { - skb->dev = bond_dev; - skb_queue_tail(&bond_ni->neigh_tx, skb); - } - } - } - - service_neigh_queue(ni); - zap_completion_queue(); } +EXPORT_SYMBOL(netpoll_poll_dev); -void netpoll_rx_disable(struct net_device *dev) +void netpoll_poll_disable(struct net_device *dev) { struct netpoll_info *ni; - int idx; + might_sleep(); - idx = srcu_read_lock(&netpoll_srcu); - ni = srcu_dereference(dev->npinfo, &netpoll_srcu); + ni = rtnl_dereference(dev->npinfo); if (ni) down(&ni->dev_lock); - srcu_read_unlock(&netpoll_srcu, idx); } -EXPORT_SYMBOL(netpoll_rx_disable); -void netpoll_rx_enable(struct net_device *dev) +void netpoll_poll_enable(struct net_device *dev) { struct netpoll_info *ni; - rcu_read_lock(); - ni = rcu_dereference(dev->npinfo); + + ni = rtnl_dereference(dev->npinfo); if (ni) up(&ni->dev_lock); - rcu_read_unlock(); } -EXPORT_SYMBOL(netpoll_rx_enable); -static void refill_skbs(void) +static void refill_skbs(struct netpoll *np) { + struct sk_buff_head *skb_pool; struct sk_buff *skb; - unsigned long flags; - spin_lock_irqsave(&skb_pool.lock, flags); - while (skb_pool.qlen < MAX_SKBS) { + skb_pool = &np->skb_pool; + + while (READ_ONCE(skb_pool->qlen) < MAX_SKBS) { skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); if (!skb) break; - __skb_queue_tail(&skb_pool, skb); + skb_queue_tail(skb_pool, skb); } - spin_unlock_irqrestore(&skb_pool.lock, flags); } static void zap_completion_queue(void) @@ -304,8 +256,8 @@ static void zap_completion_queue(void) while (clist != NULL) { struct sk_buff *skb = clist; clist = clist->next; - if (skb->destructor) { - atomic_inc(&skb->users); + if (!skb_irq_freeable(skb)) { + refcount_set(&skb->users, 1); dev_kfree_skb_any(skb); /* put this one back */ } else { __kfree_skb(skb); @@ -322,12 +274,13 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) struct sk_buff *skb; zap_completion_queue(); - refill_skbs(); repeat: skb = alloc_skb(len, GFP_ATOMIC); - if (!skb) - skb = skb_dequeue(&skb_pool); + if (!skb) { + skb = skb_dequeue(&np->skb_pool); + schedule_work(&np->refill_wq); + } if (!skb) { if (++count < 10) { @@ -337,7 +290,7 @@ repeat: return NULL; } - atomic_set(&skb->users, 1); + refcount_set(&skb->users, 1); skb_reserve(skb, reserve); return skb; } @@ -346,58 +299,50 @@ static int netpoll_owner_active(struct net_device *dev) { struct napi_struct *napi; - list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner == smp_processor_id()) + list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { + if (READ_ONCE(napi->poll_owner) == smp_processor_id()) return 1; } return 0; } /* call with IRQ disabled */ -void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, - struct net_device *dev) +static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { - int status = NETDEV_TX_BUSY; + netdev_tx_t status = NETDEV_TX_BUSY; + netdev_tx_t ret = NET_XMIT_DROP; + struct net_device *dev; unsigned long tries; - const struct net_device_ops *ops = dev->netdev_ops; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); + + dev = np->dev; + rcu_read_lock(); + npinfo = rcu_dereference_bh(dev->npinfo); - npinfo = rcu_dereference_bh(np->dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { - __kfree_skb(skb); - return; + dev_kfree_skb_irq(skb); + goto out; } /* don't get messages out of order, and no recursion */ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; - txq = netdev_pick_tx(dev, skb); + txq = netdev_core_pick_tx(dev, skb, NULL); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { - if (__netif_tx_trylock(txq)) { - if (!netif_xmit_stopped(txq)) { - if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(netif_skb_features(skb), - skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb)); - if (unlikely(!skb)) - break; - skb->vlan_tci = 0; - } - - status = ops->ndo_start_xmit(skb, dev); - if (status == NETDEV_TX_OK) - txq_trans_update(txq); - } - __netif_tx_unlock(txq); - - if (status == NETDEV_TX_OK) + if (HARD_TX_TRYLOCK(dev, txq)) { + if (!netif_xmit_stopped(txq)) + status = netpoll_start_xmit(skb, dev, txq); + + HARD_TX_UNLOCK(dev, txq); + + if (dev_xmit_complete(status)) break; } @@ -409,665 +354,244 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, } WARN_ONCE(!irqs_disabled(), - "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", - dev->name, ops->ndo_start_xmit); + "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n", + dev->name, dev->netdev_ops->ndo_start_xmit); } - if (status != NETDEV_TX_OK) { + if (!dev_xmit_complete(status)) { skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } + ret = NETDEV_TX_OK; +out: + rcu_read_unlock(); + return ret; } -EXPORT_SYMBOL(netpoll_send_skb_on_dev); -void netpoll_send_udp(struct netpoll *np, const char *msg, int len) +static void netpoll_udp_checksum(struct netpoll *np, struct sk_buff *skb, + int len) { - int total_len, ip_len, udp_len; - struct sk_buff *skb; struct udphdr *udph; - struct iphdr *iph; - struct ethhdr *eth; - static atomic_t ip_ident; - struct ipv6hdr *ip6h; + int udp_len; - udp_len = len + sizeof(*udph); - if (np->ipv6) - ip_len = udp_len + sizeof(*ip6h); - else - ip_len = udp_len + sizeof(*iph); - - total_len = ip_len + LL_RESERVED_SPACE(np->dev); - - skb = find_skb(np, total_len + np->dev->needed_tailroom, - total_len - len); - if (!skb) - return; - - skb_copy_to_linear_data(skb, msg, len); - skb_put(skb, len); - - skb_push(skb, sizeof(*udph)); - skb_reset_transport_header(skb); + udp_len = len + sizeof(struct udphdr); udph = udp_hdr(skb); - udph->source = htons(np->local_port); - udph->dest = htons(np->remote_port); - udph->len = htons(udp_len); - if (np->ipv6) { - udph->check = 0; + /* check needs to be set, since it will be consumed in csum_partial */ + udph->check = 0; + if (np->ipv6) udph->check = csum_ipv6_magic(&np->local_ip.in6, &np->remote_ip.in6, udp_len, IPPROTO_UDP, csum_partial(udph, udp_len, 0)); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - - skb_push(skb, sizeof(*ip6h)); - skb_reset_network_header(skb); - ip6h = ipv6_hdr(skb); - - /* ip6h->version = 6; ip6h->priority = 0; */ - put_unaligned(0x60, (unsigned char *)ip6h); - ip6h->flow_lbl[0] = 0; - ip6h->flow_lbl[1] = 0; - ip6h->flow_lbl[2] = 0; - - ip6h->payload_len = htons(sizeof(struct udphdr) + len); - ip6h->nexthdr = IPPROTO_UDP; - ip6h->hop_limit = 32; - ip6h->saddr = np->local_ip.in6; - ip6h->daddr = np->remote_ip.in6; - - eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb->protocol = eth->h_proto = htons(ETH_P_IPV6); - } else { - udph->check = 0; + else udph->check = csum_tcpudp_magic(np->local_ip.ip, np->remote_ip.ip, udp_len, IPPROTO_UDP, csum_partial(udph, udp_len, 0)); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - - skb_push(skb, sizeof(*iph)); - skb_reset_network_header(skb); - iph = ip_hdr(skb); - - /* iph->version = 4; iph->ihl = 5; */ - put_unaligned(0x45, (unsigned char *)iph); - iph->tos = 0; - put_unaligned(htons(ip_len), &(iph->tot_len)); - iph->id = htons(atomic_inc_return(&ip_ident)); - iph->frag_off = 0; - iph->ttl = 64; - iph->protocol = IPPROTO_UDP; - iph->check = 0; - put_unaligned(np->local_ip.ip, &(iph->saddr)); - put_unaligned(np->remote_ip.ip, &(iph->daddr)); - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - - eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb->protocol = eth->h_proto = htons(ETH_P_IP); - } - - memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN); - memcpy(eth->h_dest, np->remote_mac, ETH_ALEN); - - skb->dev = np->dev; - - netpoll_send_skb(np, skb); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; } -EXPORT_SYMBOL(netpoll_send_udp); -static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo) +netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { - int size, type = ARPOP_REPLY; - __be32 sip, tip; - unsigned char *sha; - struct sk_buff *send_skb; - struct netpoll *np, *tmp; unsigned long flags; - int hlen, tlen; - int hits = 0, proto; - - if (list_empty(&npinfo->rx_np)) - return; + netdev_tx_t ret; - /* Before checking the packet, we do some early - inspection whether this is interesting at all */ - spin_lock_irqsave(&npinfo->rx_lock, flags); - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (np->dev == skb->dev) - hits++; + if (unlikely(!np)) { + dev_kfree_skb_irq(skb); + ret = NET_XMIT_DROP; + } else { + local_irq_save(flags); + ret = __netpoll_send_skb(np, skb); + local_irq_restore(flags); } - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - - /* No netpoll struct is using this dev */ - if (!hits) - return; - - proto = ntohs(eth_hdr(skb)->h_proto); - if (proto == ETH_P_IP) { - struct arphdr *arp; - unsigned char *arp_ptr; - /* No arp on this interface */ - if (skb->dev->flags & IFF_NOARP) - return; - - if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) - return; - - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - arp = arp_hdr(skb); - - if ((arp->ar_hrd != htons(ARPHRD_ETHER) && - arp->ar_hrd != htons(ARPHRD_IEEE802)) || - arp->ar_pro != htons(ETH_P_IP) || - arp->ar_op != htons(ARPOP_REQUEST)) - return; - - arp_ptr = (unsigned char *)(arp+1); - /* save the location of the src hw addr */ - sha = arp_ptr; - arp_ptr += skb->dev->addr_len; - memcpy(&sip, arp_ptr, 4); - arp_ptr += 4; - /* If we actually cared about dst hw addr, - it would get copied here */ - arp_ptr += skb->dev->addr_len; - memcpy(&tip, arp_ptr, 4); - - /* Should we ignore arp? */ - if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) - return; - - size = arp_hdr_len(skb->dev); - - spin_lock_irqsave(&npinfo->rx_lock, flags); - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (tip != np->local_ip.ip) - continue; - - hlen = LL_RESERVED_SPACE(np->dev); - tlen = np->dev->needed_tailroom; - send_skb = find_skb(np, size + hlen + tlen, hlen); - if (!send_skb) - continue; - - skb_reset_network_header(send_skb); - arp = (struct arphdr *) skb_put(send_skb, size); - send_skb->dev = skb->dev; - send_skb->protocol = htons(ETH_P_ARP); - - /* Fill the device header for the ARP frame */ - if (dev_hard_header(send_skb, skb->dev, ETH_P_ARP, - sha, np->dev->dev_addr, - send_skb->len) < 0) { - kfree_skb(send_skb); - continue; - } - - /* - * Fill out the arp protocol part. - * - * we only support ethernet device type, - * which (according to RFC 1390) should - * always equal 1 (Ethernet). - */ - - arp->ar_hrd = htons(np->dev->type); - arp->ar_pro = htons(ETH_P_IP); - arp->ar_hln = np->dev->addr_len; - arp->ar_pln = 4; - arp->ar_op = htons(type); - - arp_ptr = (unsigned char *)(arp + 1); - memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &tip, 4); - arp_ptr += 4; - memcpy(arp_ptr, sha, np->dev->addr_len); - arp_ptr += np->dev->addr_len; - memcpy(arp_ptr, &sip, 4); - - netpoll_send_skb(np, send_skb); - - /* If there are several rx_hooks for the same address, - we're fine by sending a single reply */ - break; - } - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - } else if( proto == ETH_P_IPV6) { -#if IS_ENABLED(CONFIG_IPV6) - struct nd_msg *msg; - u8 *lladdr = NULL; - struct ipv6hdr *hdr; - struct icmp6hdr *icmp6h; - const struct in6_addr *saddr; - const struct in6_addr *daddr; - struct inet6_dev *in6_dev = NULL; - struct in6_addr *target; - - in6_dev = in6_dev_get(skb->dev); - if (!in6_dev || !in6_dev->cnf.accept_ra) - return; - - if (!pskb_may_pull(skb, skb->len)) - return; - - msg = (struct nd_msg *)skb_transport_header(skb); - - __skb_push(skb, skb->data - skb_transport_header(skb)); - - if (ipv6_hdr(skb)->hop_limit != 255) - return; - if (msg->icmph.icmp6_code != 0) - return; - if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) - return; + return ret; +} +EXPORT_SYMBOL(netpoll_send_skb); - saddr = &ipv6_hdr(skb)->saddr; - daddr = &ipv6_hdr(skb)->daddr; +static void push_ipv6(struct netpoll *np, struct sk_buff *skb, int len) +{ + struct ipv6hdr *ip6h; - size = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); - spin_lock_irqsave(&npinfo->rx_lock, flags); - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (!ipv6_addr_equal(daddr, &np->local_ip.in6)) - continue; + /* ip6h->version = 6; ip6h->priority = 0; */ + *(unsigned char *)ip6h = 0x60; + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; - hlen = LL_RESERVED_SPACE(np->dev); - tlen = np->dev->needed_tailroom; - send_skb = find_skb(np, size + hlen + tlen, hlen); - if (!send_skb) - continue; + ip6h->payload_len = htons(sizeof(struct udphdr) + len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = 32; + ip6h->saddr = np->local_ip.in6; + ip6h->daddr = np->remote_ip.in6; - send_skb->protocol = htons(ETH_P_IPV6); - send_skb->dev = skb->dev; - - skb_reset_network_header(send_skb); - hdr = (struct ipv6hdr *) skb_put(send_skb, sizeof(struct ipv6hdr)); - *(__be32*)hdr = htonl(0x60000000); - hdr->payload_len = htons(size); - hdr->nexthdr = IPPROTO_ICMPV6; - hdr->hop_limit = 255; - hdr->saddr = *saddr; - hdr->daddr = *daddr; - - icmp6h = (struct icmp6hdr *) skb_put(send_skb, sizeof(struct icmp6hdr)); - icmp6h->icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; - icmp6h->icmp6_router = 0; - icmp6h->icmp6_solicited = 1; - - target = (struct in6_addr *) skb_put(send_skb, sizeof(struct in6_addr)); - *target = msg->target; - icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, size, - IPPROTO_ICMPV6, - csum_partial(icmp6h, - size, 0)); - - if (dev_hard_header(send_skb, skb->dev, ETH_P_IPV6, - lladdr, np->dev->dev_addr, - send_skb->len) < 0) { - kfree_skb(send_skb); - continue; - } - - netpoll_send_skb(np, send_skb); - - /* If there are several rx_hooks for the same address, - we're fine by sending a single reply */ - break; - } - spin_unlock_irqrestore(&npinfo->rx_lock, flags); -#endif - } + skb->protocol = htons(ETH_P_IPV6); } -static bool pkt_is_ns(struct sk_buff *skb) +static void push_ipv4(struct netpoll *np, struct sk_buff *skb, int len) { - struct nd_msg *msg; - struct ipv6hdr *hdr; - - if (skb->protocol != htons(ETH_P_ARP)) - return false; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg))) - return false; - - msg = (struct nd_msg *)skb_transport_header(skb); - __skb_push(skb, skb->data - skb_transport_header(skb)); - hdr = ipv6_hdr(skb); - - if (hdr->nexthdr != IPPROTO_ICMPV6) - return false; - if (hdr->hop_limit != 255) - return false; - if (msg->icmph.icmp6_code != 0) - return false; - if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) - return false; - - return true; + static atomic_t ip_ident; + struct iphdr *iph; + int ip_len; + + ip_len = len + sizeof(struct udphdr) + sizeof(struct iphdr); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + + /* iph->version = 4; iph->ihl = 5; */ + *(unsigned char *)iph = 0x45; + iph->tos = 0; + put_unaligned(htons(ip_len), &iph->tot_len); + iph->id = htons(atomic_inc_return(&ip_ident)); + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + put_unaligned(np->local_ip.ip, &iph->saddr); + put_unaligned(np->remote_ip.ip, &iph->daddr); + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + skb->protocol = htons(ETH_P_IP); } -int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) +static void push_udp(struct netpoll *np, struct sk_buff *skb, int len) { - int proto, len, ulen; - int hits = 0; - const struct iphdr *iph; - struct udphdr *uh; - struct netpoll *np, *tmp; - - if (list_empty(&npinfo->rx_np)) - goto out; - - if (skb->dev->type != ARPHRD_ETHER) - goto out; - - /* check if netpoll clients need ARP */ - if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) { - skb_queue_tail(&npinfo->neigh_tx, skb); - return 1; - } else if (pkt_is_ns(skb) && atomic_read(&trapped)) { - skb_queue_tail(&npinfo->neigh_tx, skb); - return 1; - } - - if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { - skb = vlan_untag(skb); - if (unlikely(!skb)) - goto out; - } - - proto = ntohs(eth_hdr(skb)->h_proto); - if (proto != ETH_P_IP && proto != ETH_P_IPV6) - goto out; - if (skb->pkt_type == PACKET_OTHERHOST) - goto out; - if (skb_shared(skb)) - goto out; - - if (proto == ETH_P_IP) { - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto out; - iph = (struct iphdr *)skb->data; - if (iph->ihl < 5 || iph->version != 4) - goto out; - if (!pskb_may_pull(skb, iph->ihl*4)) - goto out; - iph = (struct iphdr *)skb->data; - if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) - goto out; + struct udphdr *udph; + int udp_len; - len = ntohs(iph->tot_len); - if (skb->len < len || len < iph->ihl*4) - goto out; + udp_len = len + sizeof(struct udphdr); - /* - * Our transport medium may have padded the buffer out. - * Now We trim to the true length of the frame. - */ - if (pskb_trim_rcsum(skb, len)) - goto out; + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); - iph = (struct iphdr *)skb->data; - if (iph->protocol != IPPROTO_UDP) - goto out; + udph = udp_hdr(skb); + udph->source = htons(np->local_port); + udph->dest = htons(np->remote_port); + udph->len = htons(udp_len); - len -= iph->ihl*4; - uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); - ulen = ntohs(uh->len); + netpoll_udp_checksum(np, skb, len); +} - if (ulen != len) - goto out; - if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) - goto out; - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (np->local_ip.ip && np->local_ip.ip != iph->daddr) - continue; - if (np->remote_ip.ip && np->remote_ip.ip != iph->saddr) - continue; - if (np->local_port && np->local_port != ntohs(uh->dest)) - continue; +static void push_eth(struct netpoll *np, struct sk_buff *skb) +{ + struct ethhdr *eth; - np->rx_hook(np, ntohs(uh->source), - (char *)(uh+1), - ulen - sizeof(struct udphdr)); - hits++; - } - } else { -#if IS_ENABLED(CONFIG_IPV6) - const struct ipv6hdr *ip6h; + eth = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + ether_addr_copy(eth->h_source, np->dev->dev_addr); + ether_addr_copy(eth->h_dest, np->remote_mac); + if (np->ipv6) + eth->h_proto = htons(ETH_P_IPV6); + else + eth->h_proto = htons(ETH_P_IP); +} - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - goto out; - ip6h = (struct ipv6hdr *)skb->data; - if (ip6h->version != 6) - goto out; - len = ntohs(ip6h->payload_len); - if (!len) - goto out; - if (len + sizeof(struct ipv6hdr) > skb->len) - goto out; - if (pskb_trim_rcsum(skb, len + sizeof(struct ipv6hdr))) - goto out; - ip6h = ipv6_hdr(skb); - if (!pskb_may_pull(skb, sizeof(struct udphdr))) - goto out; - uh = udp_hdr(skb); - ulen = ntohs(uh->len); - if (ulen != skb->len) - goto out; - if (udp6_csum_init(skb, uh, IPPROTO_UDP)) - goto out; - list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - if (!ipv6_addr_equal(&np->local_ip.in6, &ip6h->daddr)) - continue; - if (!ipv6_addr_equal(&np->remote_ip.in6, &ip6h->saddr)) - continue; - if (np->local_port && np->local_port != ntohs(uh->dest)) - continue; +int netpoll_send_udp(struct netpoll *np, const char *msg, int len) +{ + int total_len, ip_len, udp_len; + struct sk_buff *skb; - np->rx_hook(np, ntohs(uh->source), - (char *)(uh+1), - ulen - sizeof(struct udphdr)); - hits++; - } -#endif - } + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!irqs_disabled()); - if (!hits) - goto out; + udp_len = len + sizeof(struct udphdr); + if (np->ipv6) + ip_len = udp_len + sizeof(struct ipv6hdr); + else + ip_len = udp_len + sizeof(struct iphdr); - kfree_skb(skb); - return 1; + total_len = ip_len + LL_RESERVED_SPACE(np->dev); -out: - if (atomic_read(&trapped)) { - kfree_skb(skb); - return 1; - } + skb = find_skb(np, total_len + np->dev->needed_tailroom, + total_len - len); + if (!skb) + return -ENOMEM; - return 0; -} + skb_copy_to_linear_data(skb, msg, len); + skb_put(skb, len); -void netpoll_print_options(struct netpoll *np) -{ - np_info(np, "local port %d\n", np->local_port); - if (np->ipv6) - np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); - else - np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); - np_info(np, "interface '%s'\n", np->dev_name); - np_info(np, "remote port %d\n", np->remote_port); + push_udp(np, skb, len); if (np->ipv6) - np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); + push_ipv6(np, skb, len); else - np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); - np_info(np, "remote ethernet address %pM\n", np->remote_mac); + push_ipv4(np, skb, len); + push_eth(np, skb); + skb->dev = np->dev; + + return (int)netpoll_send_skb(np, skb); } -EXPORT_SYMBOL(netpoll_print_options); +EXPORT_SYMBOL(netpoll_send_udp); -static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) + +static void skb_pool_flush(struct netpoll *np) { - const char *end; + struct sk_buff_head *skb_pool; - if (!strchr(str, ':') && - in4_pton(str, -1, (void *)addr, -1, &end) > 0) { - if (!*end) - return 0; - } - if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { -#if IS_ENABLED(CONFIG_IPV6) - if (!*end) - return 1; -#else - return -1; -#endif - } - return -1; + cancel_work_sync(&np->refill_wq); + skb_pool = &np->skb_pool; + skb_queue_purge_reason(skb_pool, SKB_CONSUMED); } -int netpoll_parse_options(struct netpoll *np, char *opt) +static void refill_skbs_work_handler(struct work_struct *work) { - char *cur=opt, *delim; - int ipv6; - - if (*cur != '@') { - if ((delim = strchr(cur, '@')) == NULL) - goto parse_failed; - *delim = 0; - if (kstrtou16(cur, 10, &np->local_port)) - goto parse_failed; - cur = delim; - } - cur++; - - if (*cur != '/') { - if ((delim = strchr(cur, '/')) == NULL) - goto parse_failed; - *delim = 0; - ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); - if (ipv6 < 0) - goto parse_failed; - else - np->ipv6 = (bool)ipv6; - cur = delim; - } - cur++; - - if (*cur != ',') { - /* parse out dev name */ - if ((delim = strchr(cur, ',')) == NULL) - goto parse_failed; - *delim = 0; - strlcpy(np->dev_name, cur, sizeof(np->dev_name)); - cur = delim; - } - cur++; - - if (*cur != '@') { - /* dst port */ - if ((delim = strchr(cur, '@')) == NULL) - goto parse_failed; - *delim = 0; - if (*cur == ' ' || *cur == '\t') - np_info(np, "warning: whitespace is not allowed\n"); - if (kstrtou16(cur, 10, &np->remote_port)) - goto parse_failed; - cur = delim; - } - cur++; - - /* dst ip */ - if ((delim = strchr(cur, '/')) == NULL) - goto parse_failed; - *delim = 0; - ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); - if (ipv6 < 0) - goto parse_failed; - else if (np->ipv6 != (bool)ipv6) - goto parse_failed; - else - np->ipv6 = (bool)ipv6; - cur = delim + 1; - - if (*cur != 0) { - /* MAC address */ - if (!mac_pton(cur, np->remote_mac)) - goto parse_failed; - } + struct netpoll *np = + container_of(work, struct netpoll, refill_wq); - netpoll_print_options(np); - - return 0; - - parse_failed: - np_info(np, "couldn't parse config at '%s'!\n", cur); - return -1; + refill_skbs(np); } -EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { struct netpoll_info *npinfo; const struct net_device_ops *ops; - unsigned long flags; int err; - np->dev = ndev; - strlcpy(np->dev_name, ndev->name, IFNAMSIZ); - INIT_WORK(&np->cleanup_work, netpoll_async_cleanup); + skb_queue_head_init(&np->skb_pool); + INIT_WORK(&np->refill_wq, refill_skbs_work_handler); - if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || - !ndev->netdev_ops->ndo_poll_controller) { + if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", - np->dev_name); + ndev->name); err = -ENOTSUPP; goto out; } - if (!ndev->npinfo) { - npinfo = kmalloc(sizeof(*npinfo), gfp); + npinfo = rtnl_dereference(ndev->npinfo); + if (!npinfo) { + npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; goto out; } - npinfo->rx_flags = 0; - INIT_LIST_HEAD(&npinfo->rx_np); - - spin_lock_init(&npinfo->rx_lock); sema_init(&npinfo->dev_lock, 1); - skb_queue_head_init(&npinfo->neigh_tx); skb_queue_head_init(&npinfo->txq); INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); - atomic_set(&npinfo->refcnt, 1); + refcount_set(&npinfo->refcnt, 1); - ops = np->dev->netdev_ops; + ops = ndev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo, gfp); + err = ops->ndo_netpoll_setup(ndev); if (err) goto free_npinfo; } } else { - npinfo = rtnl_dereference(ndev->npinfo); - atomic_inc(&npinfo->refcnt); + refcount_inc(&npinfo->refcnt); } - npinfo->netpoll = np; + np->dev = ndev; + strscpy(np->dev_name, ndev->name, IFNAMSIZ); - if (np->rx_hook) { - spin_lock_irqsave(&npinfo->rx_lock, flags); - npinfo->rx_flags |= NETPOLL_RX_ENABLED; - list_add_tail(&np->rx, &npinfo->rx_np); - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - } + /* fill up the skb queue */ + refill_skbs(np); /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); @@ -1081,143 +605,192 @@ out: } EXPORT_SYMBOL_GPL(__netpoll_setup); +/* + * Returns a pointer to a string representation of the identifier used + * to select the egress interface for the given netpoll instance. buf + * must be a buffer of length at least MAC_ADDR_STR_LEN + 1. + */ +static char *egress_dev(struct netpoll *np, char *buf) +{ + if (np->dev_name[0]) + return np->dev_name; + + snprintf(buf, MAC_ADDR_STR_LEN, "%pM", np->dev_mac); + return buf; +} + +static void netpoll_wait_carrier(struct netpoll *np, struct net_device *ndev, + unsigned int timeout) +{ + unsigned long atmost; + + atmost = jiffies + timeout * HZ; + while (!netif_carrier_ok(ndev)) { + if (time_after(jiffies, atmost)) { + np_notice(np, "timeout waiting for carrier\n"); + break; + } + msleep(1); + } +} + +/* + * Take the IPv6 from ndev and populate local_ip structure in netpoll + */ +static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) +{ + char buf[MAC_ADDR_STR_LEN + 1]; + int err = -EDESTADDRREQ; + struct inet6_dev *idev; + + if (!IS_ENABLED(CONFIG_IPV6)) { + np_err(np, "IPv6 is not supported %s, aborting\n", + egress_dev(np, buf)); + return -EINVAL; + } + + idev = __in6_dev_get(ndev); + if (idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != + !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) + continue; + /* Got the IP, let's return */ + np->local_ip.in6 = ifp->addr; + err = 0; + break; + } + read_unlock_bh(&idev->lock); + } + if (err) { + np_err(np, "no IPv6 address for %s, aborting\n", + egress_dev(np, buf)); + return err; + } + + np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); + return 0; +} + +/* + * Take the IPv4 from ndev and populate local_ip structure in netpoll + */ +static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev) +{ + char buf[MAC_ADDR_STR_LEN + 1]; + const struct in_ifaddr *ifa; + struct in_device *in_dev; + + in_dev = __in_dev_get_rtnl(ndev); + if (!in_dev) { + np_err(np, "no IP address for %s, aborting\n", + egress_dev(np, buf)); + return -EDESTADDRREQ; + } + + ifa = rtnl_dereference(in_dev->ifa_list); + if (!ifa) { + np_err(np, "no IP address for %s, aborting\n", + egress_dev(np, buf)); + return -EDESTADDRREQ; + } + + np->local_ip.ip = ifa->ifa_local; + np_info(np, "local IP %pI4\n", &np->local_ip.ip); + + return 0; +} + int netpoll_setup(struct netpoll *np) { + struct net *net = current->nsproxy->net_ns; + char buf[MAC_ADDR_STR_LEN + 1]; struct net_device *ndev = NULL; - struct in_device *in_dev; + bool ip_overwritten = false; int err; rtnl_lock(); - if (np->dev_name) { - struct net *net = current->nsproxy->net_ns; + if (np->dev_name[0]) ndev = __dev_get_by_name(net, np->dev_name); - } + else if (is_valid_ether_addr(np->dev_mac)) + ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac); + if (!ndev) { - np_err(np, "%s doesn't exist, aborting\n", np->dev_name); + np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf)); err = -ENODEV; goto unlock; } - dev_hold(ndev); + netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL); if (netdev_master_upper_dev_get(ndev)) { - np_err(np, "%s is a slave device, aborting\n", np->dev_name); + np_err(np, "%s is a slave device, aborting\n", + egress_dev(np, buf)); err = -EBUSY; goto put; } if (!netif_running(ndev)) { - unsigned long atmost, atleast; - - np_info(np, "device %s not up yet, forcing it\n", np->dev_name); - - err = dev_open(ndev); + np_info(np, "device %s not up yet, forcing it\n", + egress_dev(np, buf)); + err = dev_open(ndev, NULL); if (err) { np_err(np, "failed to open %s\n", ndev->name); goto put; } rtnl_unlock(); - atleast = jiffies + HZ/10; - atmost = jiffies + carrier_timeout * HZ; - while (!netif_carrier_ok(ndev)) { - if (time_after(jiffies, atmost)) { - np_notice(np, "timeout waiting for carrier\n"); - break; - } - msleep(1); - } - - /* If carrier appears to come up instantly, we don't - * trust it and pause so that we don't pump all our - * queued console messages into the bitbucket. - */ - - if (time_before(jiffies, atleast)) { - np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n"); - msleep(4000); - } + netpoll_wait_carrier(np, ndev, carrier_timeout); rtnl_lock(); } if (!np->local_ip.ip) { if (!np->ipv6) { - in_dev = __in_dev_get_rtnl(ndev); - - if (!in_dev || !in_dev->ifa_list) { - np_err(np, "no IP address for %s, aborting\n", - np->dev_name); - err = -EDESTADDRREQ; + err = netpoll_take_ipv4(np, ndev); + if (err) goto put; - } - - np->local_ip.ip = in_dev->ifa_list->ifa_local; - np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_dev *idev; - - err = -EDESTADDRREQ; - idev = __in6_dev_get(ndev); - if (idev) { - struct inet6_ifaddr *ifp; - - read_lock_bh(&idev->lock); - list_for_each_entry(ifp, &idev->addr_list, if_list) { - if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) - continue; - np->local_ip.in6 = ifp->addr; - err = 0; - break; - } - read_unlock_bh(&idev->lock); - } - if (err) { - np_err(np, "no IPv6 address for %s, aborting\n", - np->dev_name); + err = netpoll_take_ipv6(np, ndev); + if (err) goto put; - } else - np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); -#else - np_err(np, "IPv6 is not supported %s, aborting\n", - np->dev_name); - err = -EINVAL; - goto put; -#endif } + ip_overwritten = true; } - /* fill up the skb queue */ - refill_skbs(); - - err = __netpoll_setup(np, ndev, GFP_KERNEL); + err = __netpoll_setup(np, ndev); if (err) - goto put; - + goto flush; rtnl_unlock(); + + /* Make sure all NAPI polls which started before dev->npinfo + * was visible have exited before we start calling NAPI poll. + * NAPI skips locking if dev->npinfo is NULL. + */ + synchronize_rcu(); + return 0; +flush: + skb_pool_flush(np); put: - dev_put(ndev); + DEBUG_NET_WARN_ON_ONCE(np->dev); + if (ip_overwritten) + memset(&np->local_ip, 0, sizeof(np->local_ip)); + netdev_put(ndev, &np->dev_tracker); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(netpoll_setup); -static int __init netpoll_init(void) -{ - skb_queue_head_init(&skb_pool); - return 0; -} -core_initcall(netpoll_init); - static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) { struct netpoll_info *npinfo = container_of(rcu_head, struct netpoll_info, rcu); - skb_queue_purge(&npinfo->neigh_tx); skb_queue_purge(&npinfo->txq); /* we can't call cancel_delayed_work_sync here, as we are in softirq */ @@ -1230,83 +803,58 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) kfree(npinfo); } -void __netpoll_cleanup(struct netpoll *np) +static void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; - unsigned long flags; - /* rtnl_dereference would be preferable here but - * rcu_cleanup_netpoll path can put us in here safely without - * holding the rtnl, so plain rcu_dereference it is - */ npinfo = rtnl_dereference(np->dev->npinfo); if (!npinfo) return; - if (!list_empty(&npinfo->rx_np)) { - spin_lock_irqsave(&npinfo->rx_lock, flags); - list_del(&np->rx); - if (list_empty(&npinfo->rx_np)) - npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; - spin_unlock_irqrestore(&npinfo->rx_lock, flags); - } - - synchronize_srcu(&netpoll_srcu); - - if (atomic_dec_and_test(&npinfo->refcnt)) { + /* At this point, there is a single npinfo instance per netdevice, and + * its refcnt tracks how many netpoll structures are linked to it. We + * only perform npinfo cleanup when the refcnt decrements to zero. + */ + if (refcount_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; ops = np->dev->netdev_ops; if (ops->ndo_netpoll_cleanup) ops->ndo_netpoll_cleanup(np->dev); - rcu_assign_pointer(np->dev->npinfo, NULL); - call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); + RCU_INIT_POINTER(np->dev->npinfo, NULL); + call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); } + + skb_pool_flush(np); } -EXPORT_SYMBOL_GPL(__netpoll_cleanup); -static void netpoll_async_cleanup(struct work_struct *work) +void __netpoll_free(struct netpoll *np) { - struct netpoll *np = container_of(work, struct netpoll, cleanup_work); + ASSERT_RTNL(); - rtnl_lock(); + /* Wait for transmitting packets to finish before freeing. */ + synchronize_net(); __netpoll_cleanup(np); - rtnl_unlock(); kfree(np); } +EXPORT_SYMBOL_GPL(__netpoll_free); -void __netpoll_free_async(struct netpoll *np) +void do_netpoll_cleanup(struct netpoll *np) { - schedule_work(&np->cleanup_work); + __netpoll_cleanup(np); + netdev_put(np->dev, &np->dev_tracker); + np->dev = NULL; } -EXPORT_SYMBOL_GPL(__netpoll_free_async); +EXPORT_SYMBOL(do_netpoll_cleanup); void netpoll_cleanup(struct netpoll *np) { - if (!np->dev) - return; - rtnl_lock(); - __netpoll_cleanup(np); + if (!np->dev) + goto out; + do_netpoll_cleanup(np); +out: rtnl_unlock(); - - dev_put(np->dev); - np->dev = NULL; } EXPORT_SYMBOL(netpoll_cleanup); - -int netpoll_trap(void) -{ - return atomic_read(&trapped); -} -EXPORT_SYMBOL(netpoll_trap); - -void netpoll_set_trap(int trap) -{ - if (trap) - atomic_inc(&trapped); - else - atomic_dec(&trapped); -} -EXPORT_SYMBOL(netpoll_set_trap); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index e533259dce3c..8456dfbe2eb4 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/netprio_cgroup.c Priority Control Group * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Authors: Neil Horman <nhorman@tuxdriver.com> */ @@ -20,6 +16,8 @@ #include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/atomic.h> +#include <linux/sched/task.h> + #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> @@ -27,16 +25,16 @@ #include <linux/fdtable.h> -#define PRIOMAP_MIN_SZ 128 +/* + * netprio allocates per-net_device priomap array which is indexed by + * css->id. Limiting css ID to 16bits doesn't lose anything. + */ +#define NETPRIO_ID_MAX USHRT_MAX -static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), - struct cgroup_netprio_state, css); -} +#define PRIOMAP_MIN_SZ 128 /* - * Extend @dev->priomap so that it's large enough to accomodate + * Extend @dev->priomap so that it's large enough to accommodate * @target_idx. @dev->priomap.priomap_len > @target_idx after successful * return. Must be called under rtnl lock. */ @@ -87,67 +85,73 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx) /** * netprio_prio - return the effective netprio of a cgroup-net_device pair - * @cgrp: cgroup part of the target pair + * @css: css part of the target pair * @dev: net_device part of the target pair * * Should be called under RCU read or rtnl lock. */ -static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev) +static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); + int id = css->id; - if (map && cgrp->id < map->priomap_len) - return map->priomap[cgrp->id]; + if (map && id < map->priomap_len) + return map->priomap[id]; return 0; } /** * netprio_set_prio - set netprio on a cgroup-net_device pair - * @cgrp: cgroup part of the target pair + * @css: css part of the target pair * @dev: net_device part of the target pair * @prio: prio to set * - * Set netprio to @prio on @cgrp-@dev pair. Should be called under rtnl + * Set netprio to @prio on @css-@dev pair. Should be called under rtnl * lock and may fail under memory pressure for non-zero @prio. */ -static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev, - u32 prio) +static int netprio_set_prio(struct cgroup_subsys_state *css, + struct net_device *dev, u32 prio) { struct netprio_map *map; + int id = css->id; int ret; /* avoid extending priomap for zero writes */ map = rtnl_dereference(dev->priomap); - if (!prio && (!map || map->priomap_len <= cgrp->id)) + if (!prio && (!map || map->priomap_len <= id)) return 0; - ret = extend_netdev_table(dev, cgrp->id); + ret = extend_netdev_table(dev, id); if (ret) return ret; map = rtnl_dereference(dev->priomap); - map->priomap[cgrp->id] = prio; + map->priomap[id] = prio; return 0; } -static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { - struct cgroup_netprio_state *cs; + struct cgroup_subsys_state *css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); - if (!cs) + css = kzalloc(sizeof(*css), GFP_KERNEL); + if (!css) return ERR_PTR(-ENOMEM); - return &cs->css; + return css; } -static int cgrp_css_online(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup_subsys_state *css) { - struct cgroup *parent = cgrp->parent; + struct cgroup_subsys_state *parent_css = css->parent; struct net_device *dev; int ret = 0; - if (!parent) + if (css->id > NETPRIO_ID_MAX) + return -ENOSPC; + + if (!parent_css) return 0; rtnl_lock(); @@ -156,9 +160,9 @@ static int cgrp_css_online(struct cgroup *cgrp) * onlining, there is no need to clear them on offline. */ for_each_netdev(&init_net, dev) { - u32 prio = netprio_prio(parent, dev); + u32 prio = netprio_prio(parent_css, dev); - ret = netprio_set_prio(cgrp, dev, prio); + ret = netprio_set_prio(css, dev, prio); if (ret) break; } @@ -166,37 +170,37 @@ static int cgrp_css_online(struct cgroup *cgrp) return ret; } -static void cgrp_css_free(struct cgroup *cgrp) +static void cgrp_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp_netprio_state(cgrp)); + kfree(css); } -static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) +static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { - return cgrp->id; + return css->id; } -static int read_priomap(struct cgroup *cont, struct cftype *cft, - struct cgroup_map_cb *cb) +static int read_priomap(struct seq_file *sf, void *v) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) - cb->fill(cb, dev->name, netprio_prio(cont, dev)); + seq_printf(sf, "%s %u\n", dev->name, + netprio_prio(seq_css(sf), dev)); rcu_read_unlock(); return 0; } -static int write_priomap(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static ssize_t write_priomap(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; int ret; - if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) + if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) return -EINVAL; dev = dev_get_by_name(&init_net, devname); @@ -205,30 +209,32 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft, rtnl_lock(); - ret = netprio_set_prio(cgrp, dev, prio); + ret = netprio_set_prio(of_css(of), dev, prio); rtnl_unlock(); dev_put(dev); - return ret; + return ret ?: nbytes; } static int update_netprio(const void *v, struct file *file, unsigned n) { - int err; - struct socket *sock = sock_from_file(file, &err); + struct socket *sock = sock_from_file(file); + if (sock) - sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v; + sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, + (unsigned long)v); return 0; } -static void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; - void *v; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + void *v = (void *)(unsigned long)css->id; - cgroup_taskset_for_each(p, cgrp, tset) { task_lock(p); - v = (void *)(unsigned long)task_netprioidx(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); } @@ -241,21 +247,18 @@ static struct cftype ss_files[] = { }, { .name = "ifpriomap", - .read_map = read_priomap, - .write_string = write_priomap, + .seq_show = read_priomap, + .write = write_priomap, }, { } /* terminate */ }; -struct cgroup_subsys net_prio_subsys = { - .name = "net_prio", +struct cgroup_subsys net_prio_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = net_prio_attach, - .subsys_id = net_prio_subsys_id, - .base_cftypes = ss_files, - .module = THIS_MODULE, + .legacy_cftypes = ss_files, }; static int netprio_device_event(struct notifier_block *unused, @@ -286,37 +289,7 @@ static struct notifier_block netprio_device_notifier = { static int __init init_cgroup_netprio(void) { - int ret; - - ret = cgroup_load_subsys(&net_prio_subsys); - if (ret) - goto out; - register_netdevice_notifier(&netprio_device_notifier); - -out: - return ret; -} - -static void __exit exit_cgroup_netprio(void) -{ - struct netprio_map *old; - struct net_device *dev; - - unregister_netdevice_notifier(&netprio_device_notifier); - - cgroup_unload_subsys(&net_prio_subsys); - - rtnl_lock(); - for_each_netdev(&init_net, dev) { - old = rtnl_dereference(dev->priomap); - RCU_INIT_POINTER(dev->priomap, NULL); - if (old) - kfree_rcu(old, rcu); - } - rtnl_unlock(); + return 0; } - -module_init(init_cgroup_netprio); -module_exit(exit_cgroup_netprio); -MODULE_LICENSE("GPL v2"); +subsys_initcall(init_cgroup_netprio); diff --git a/net/core/of_net.c b/net/core/of_net.c new file mode 100644 index 000000000000..93ea425b9248 --- /dev/null +++ b/net/core/of_net.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * OF helpers for network devices. + * + * Initially copied out of arch/powerpc/kernel/prom_parse.c + */ +#include <linux/etherdevice.h> +#include <linux/kernel.h> +#include <linux/of_net.h> +#include <linux/of_platform.h> +#include <linux/platform_device.h> +#include <linux/phy.h> +#include <linux/export.h> +#include <linux/device.h> +#include <linux/nvmem-consumer.h> + +/** + * of_get_phy_mode - Get phy mode for given device_node + * @np: Pointer to the given device_node + * @interface: Pointer to the result + * + * The function gets phy interface string from property 'phy-mode' or + * 'phy-connection-type'. The index in phy_modes table is set in + * interface and 0 returned. In case of error interface is set to + * PHY_INTERFACE_MODE_NA and an errno is returned, e.g. -ENODEV. + */ +int of_get_phy_mode(struct device_node *np, phy_interface_t *interface) +{ + const char *pm; + int err, i; + + *interface = PHY_INTERFACE_MODE_NA; + + err = of_property_read_string(np, "phy-mode", &pm); + if (err < 0) + err = of_property_read_string(np, "phy-connection-type", &pm); + if (err < 0) + return err; + + for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++) + if (!strcasecmp(pm, phy_modes(i))) { + *interface = i; + return 0; + } + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(of_get_phy_mode); + +static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr) +{ + struct property *pp = of_find_property(np, name, NULL); + + if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value)) { + memcpy(addr, pp->value, ETH_ALEN); + return 0; + } + return -ENODEV; +} + +int of_get_mac_address_nvmem(struct device_node *np, u8 *addr) +{ + struct platform_device *pdev = of_find_device_by_node(np); + struct nvmem_cell *cell; + const void *mac; + size_t len; + int ret; + + /* Try lookup by device first, there might be a nvmem_cell_lookup + * associated with a given device. + */ + if (pdev) { + ret = nvmem_get_mac_address(&pdev->dev, addr); + put_device(&pdev->dev); + return ret; + } + + cell = of_nvmem_cell_get(np, "mac-address"); + if (IS_ERR(cell)) + return PTR_ERR(cell); + + mac = nvmem_cell_read(cell, &len); + nvmem_cell_put(cell); + + if (IS_ERR(mac)) + return PTR_ERR(mac); + + if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { + kfree(mac); + return -EINVAL; + } + + memcpy(addr, mac, ETH_ALEN); + kfree(mac); + + return 0; +} +EXPORT_SYMBOL(of_get_mac_address_nvmem); + +/** + * of_get_mac_address() + * @np: Caller's Device Node + * @addr: Pointer to a six-byte array for the result + * + * Search the device tree for the best MAC address to use. 'mac-address' is + * checked first, because that is supposed to contain to "most recent" MAC + * address. If that isn't set, then 'local-mac-address' is checked next, + * because that is the default address. If that isn't set, then the obsolete + * 'address' is checked, just in case we're using an old device tree. If any + * of the above isn't set, then try to get MAC address from nvmem cell named + * 'mac-address'. + * + * Note that the 'address' property is supposed to contain a virtual address of + * the register set, but some DTS files have redefined that property to be the + * MAC address. + * + * All-zero MAC addresses are rejected, because those could be properties that + * exist in the device tree, but were not set by U-Boot. For example, the + * DTS could define 'mac-address' and 'local-mac-address', with zero MAC + * addresses. Some older U-Boots only initialized 'local-mac-address'. In + * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists + * but is all zeros. + * + * Return: 0 on success and errno in case of error. +*/ +int of_get_mac_address(struct device_node *np, u8 *addr) +{ + int ret; + + if (!np) + return -ENODEV; + + ret = of_get_mac_addr(np, "mac-address", addr); + if (!ret) + return 0; + + ret = of_get_mac_addr(np, "local-mac-address", addr); + if (!ret) + return 0; + + ret = of_get_mac_addr(np, "address", addr); + if (!ret) + return 0; + + return of_get_mac_address_nvmem(np, addr); +} +EXPORT_SYMBOL(of_get_mac_address); + +/** + * of_get_ethdev_address() + * @np: Caller's Device Node + * @dev: Pointer to netdevice which address will be updated + * + * Search the device tree for the best MAC address to use. + * If found set @dev->dev_addr to that address. + * + * See documentation of of_get_mac_address() for more information on how + * the best address is determined. + * + * Return: 0 on success and errno in case of error. + */ +int of_get_ethdev_address(struct device_node *np, struct net_device *dev) +{ + u8 addr[ETH_ALEN]; + int ret; + + ret = of_get_mac_address(np, addr); + if (!ret) + eth_hw_addr_set(dev, addr); + return ret; +} +EXPORT_SYMBOL(of_get_ethdev_address); diff --git a/net/core/page_pool.c b/net/core/page_pool.c new file mode 100644 index 000000000000..265a729431bb --- /dev/null +++ b/net/core/page_pool.c @@ -0,0 +1,1354 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.c + * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> + * Copyright (C) 2016 Red Hat, Inc. + */ + +#include <linux/error-injection.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/device.h> + +#include <net/netdev_lock.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> +#include <net/xdp.h> + +#include <linux/dma-direction.h> +#include <linux/dma-mapping.h> +#include <linux/page-flags.h> +#include <linux/mm.h> /* for put_page() */ +#include <linux/poison.h> +#include <linux/ethtool.h> +#include <linux/netdevice.h> + +#include <trace/events/page_pool.h> + +#include "dev.h" +#include "mp_dmabuf_devmem.h" +#include "netmem_priv.h" +#include "page_pool_priv.h" + +DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); + +#define DEFER_TIME (msecs_to_jiffies(1000)) +#define DEFER_WARN_INTERVAL (60 * HZ) + +#define BIAS_MAX (LONG_MAX >> 1) + +#ifdef CONFIG_PAGE_POOL_STATS +static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats); + +/* alloc_stat_inc is intended to be used in softirq context */ +#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) +/* recycle_stat_inc is safe to use when preemption is possible. */ +#define recycle_stat_inc(pool, __stat) \ + do { \ + struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ + this_cpu_inc(s->__stat); \ + } while (0) + +#define recycle_stat_add(pool, __stat, val) \ + do { \ + struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ + this_cpu_add(s->__stat, val); \ + } while (0) + +static const char pp_stats[][ETH_GSTRING_LEN] = { + "rx_pp_alloc_fast", + "rx_pp_alloc_slow", + "rx_pp_alloc_slow_ho", + "rx_pp_alloc_empty", + "rx_pp_alloc_refill", + "rx_pp_alloc_waive", + "rx_pp_recycle_cached", + "rx_pp_recycle_cache_full", + "rx_pp_recycle_ring", + "rx_pp_recycle_ring_full", + "rx_pp_recycle_released_ref", +}; + +/** + * page_pool_get_stats() - fetch page pool stats + * @pool: pool from which page was allocated + * @stats: struct page_pool_stats to fill in + * + * Retrieve statistics about the page_pool. This API is only available + * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. + * A pointer to a caller allocated struct page_pool_stats structure + * is passed to this API which is filled in. The caller can then report + * those stats to the user (perhaps via ethtool, debugfs, etc.). + */ +bool page_pool_get_stats(const struct page_pool *pool, + struct page_pool_stats *stats) +{ + int cpu = 0; + + if (!stats) + return false; + + /* The caller is responsible to initialize stats. */ + stats->alloc_stats.fast += pool->alloc_stats.fast; + stats->alloc_stats.slow += pool->alloc_stats.slow; + stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order; + stats->alloc_stats.empty += pool->alloc_stats.empty; + stats->alloc_stats.refill += pool->alloc_stats.refill; + stats->alloc_stats.waive += pool->alloc_stats.waive; + + for_each_possible_cpu(cpu) { + const struct page_pool_recycle_stats *pcpu = + per_cpu_ptr(pool->recycle_stats, cpu); + + stats->recycle_stats.cached += pcpu->cached; + stats->recycle_stats.cache_full += pcpu->cache_full; + stats->recycle_stats.ring += pcpu->ring; + stats->recycle_stats.ring_full += pcpu->ring_full; + stats->recycle_stats.released_refcnt += pcpu->released_refcnt; + } + + return true; +} +EXPORT_SYMBOL(page_pool_get_stats); + +u8 *page_pool_ethtool_stats_get_strings(u8 *data) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pp_stats); i++) { + memcpy(data, pp_stats[i], ETH_GSTRING_LEN); + data += ETH_GSTRING_LEN; + } + + return data; +} +EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings); + +int page_pool_ethtool_stats_get_count(void) +{ + return ARRAY_SIZE(pp_stats); +} +EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); + +u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) +{ + const struct page_pool_stats *pool_stats = stats; + + *data++ = pool_stats->alloc_stats.fast; + *data++ = pool_stats->alloc_stats.slow; + *data++ = pool_stats->alloc_stats.slow_high_order; + *data++ = pool_stats->alloc_stats.empty; + *data++ = pool_stats->alloc_stats.refill; + *data++ = pool_stats->alloc_stats.waive; + *data++ = pool_stats->recycle_stats.cached; + *data++ = pool_stats->recycle_stats.cache_full; + *data++ = pool_stats->recycle_stats.ring; + *data++ = pool_stats->recycle_stats.ring_full; + *data++ = pool_stats->recycle_stats.released_refcnt; + + return data; +} +EXPORT_SYMBOL(page_pool_ethtool_stats_get); + +#else +#define alloc_stat_inc(...) do { } while (0) +#define recycle_stat_inc(...) do { } while (0) +#define recycle_stat_add(...) do { } while (0) +#endif + +static bool page_pool_producer_lock(struct page_pool *pool) + __acquires(&pool->ring.producer_lock) +{ + bool in_softirq = in_softirq(); + + if (in_softirq) + spin_lock(&pool->ring.producer_lock); + else + spin_lock_bh(&pool->ring.producer_lock); + + return in_softirq; +} + +static void page_pool_producer_unlock(struct page_pool *pool, + bool in_softirq) + __releases(&pool->ring.producer_lock) +{ + if (in_softirq) + spin_unlock(&pool->ring.producer_lock); + else + spin_unlock_bh(&pool->ring.producer_lock); +} + +static void page_pool_struct_check(void) +{ + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); + CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, + PAGE_POOL_FRAG_GROUP_ALIGN); +} + +static int page_pool_init(struct page_pool *pool, + const struct page_pool_params *params, + int cpuid) +{ + unsigned int ring_qsize = 1024; /* Default */ + struct netdev_rx_queue *rxq; + int err; + + page_pool_struct_check(); + + memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); + memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); + + pool->cpuid = cpuid; + pool->dma_sync_for_cpu = true; + + /* Validate only known flags were used */ + if (pool->slow.flags & ~PP_FLAG_ALL) + return -EINVAL; + + if (pool->p.pool_size) + ring_qsize = min(pool->p.pool_size, 16384); + + /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. + * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, + * which is the XDP_TX use-case. + */ + if (pool->slow.flags & PP_FLAG_DMA_MAP) { + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && + (pool->p.dma_dir != DMA_BIDIRECTIONAL)) + return -EINVAL; + + pool->dma_map = true; + } + + if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { + /* In order to request DMA-sync-for-device the page + * needs to be mapped + */ + if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) + return -EINVAL; + + if (!pool->p.max_len) + return -EINVAL; + + pool->dma_sync = true; + + /* pool->p.offset has to be set according to the address + * offset used by the DMA engine to start copying rx data + */ + } + + pool->has_init_callback = !!pool->slow.init_callback; + +#ifdef CONFIG_PAGE_POOL_STATS + if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) { + pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); + if (!pool->recycle_stats) + return -ENOMEM; + } else { + /* For system page pool instance we use a singular stats object + * instead of allocating a separate percpu variable for each + * (also percpu) page pool instance. + */ + pool->recycle_stats = &pp_system_recycle_stats; + pool->system = true; + } +#endif + + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { +#ifdef CONFIG_PAGE_POOL_STATS + if (!pool->system) + free_percpu(pool->recycle_stats); +#endif + return -ENOMEM; + } + + atomic_set(&pool->pages_state_release_cnt, 0); + + /* Driver calling page_pool_create() also call page_pool_destroy() */ + refcount_set(&pool->user_cnt, 1); + + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); + + if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { + netdev_assert_locked(pool->slow.netdev); + rxq = __netif_get_rx_queue(pool->slow.netdev, + pool->slow.queue_idx); + pool->mp_priv = rxq->mp_params.mp_priv; + pool->mp_ops = rxq->mp_params.mp_ops; + } + + if (pool->mp_ops) { + if (!pool->dma_map || !pool->dma_sync) { + err = -EOPNOTSUPP; + goto free_ptr_ring; + } + + if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { + err = -EFAULT; + goto free_ptr_ring; + } + + err = pool->mp_ops->init(pool); + if (err) { + pr_warn("%s() mem-provider init failed %d\n", __func__, + err); + goto free_ptr_ring; + } + + static_branch_inc(&page_pool_mem_providers); + } else if (pool->p.order > MAX_PAGE_ORDER) { + err = -EINVAL; + goto free_ptr_ring; + } + + return 0; + +free_ptr_ring: + ptr_ring_cleanup(&pool->ring, NULL); + xa_destroy(&pool->dma_mapped); +#ifdef CONFIG_PAGE_POOL_STATS + if (!pool->system) + free_percpu(pool->recycle_stats); +#endif + return err; +} + +static void page_pool_uninit(struct page_pool *pool) +{ + ptr_ring_cleanup(&pool->ring, NULL); + xa_destroy(&pool->dma_mapped); + +#ifdef CONFIG_PAGE_POOL_STATS + if (!pool->system) + free_percpu(pool->recycle_stats); +#endif +} + +/** + * page_pool_create_percpu() - create a page pool for a given cpu. + * @params: parameters, see struct page_pool_params + * @cpuid: cpu identifier + */ +struct page_pool * +page_pool_create_percpu(const struct page_pool_params *params, int cpuid) +{ + struct page_pool *pool; + int err; + + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); + if (!pool) + return ERR_PTR(-ENOMEM); + + err = page_pool_init(pool, params, cpuid); + if (err < 0) + goto err_free; + + err = page_pool_list(pool); + if (err) + goto err_uninit; + + return pool; + +err_uninit: + page_pool_uninit(pool); +err_free: + pr_warn("%s() gave up with errno %d\n", __func__, err); + kfree(pool); + return ERR_PTR(err); +} +EXPORT_SYMBOL(page_pool_create_percpu); + +/** + * page_pool_create() - create a page pool + * @params: parameters, see struct page_pool_params + */ +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + return page_pool_create_percpu(params, -1); +} +EXPORT_SYMBOL(page_pool_create); + +static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem); + +static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) +{ + struct ptr_ring *r = &pool->ring; + netmem_ref netmem; + int pref_nid; /* preferred NUMA node */ + + /* Quicker fallback, avoid locks when ring is empty */ + if (__ptr_ring_empty(r)) { + alloc_stat_inc(pool, empty); + return 0; + } + + /* Softirq guarantee CPU and thus NUMA node is stable. This, + * assumes CPU refilling driver RX-ring will also run RX-NAPI. + */ +#ifdef CONFIG_NUMA + pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; +#else + /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ + pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ +#endif + + /* Refill alloc array, but only if NUMA match */ + do { + netmem = (__force netmem_ref)__ptr_ring_consume(r); + if (unlikely(!netmem)) + break; + + if (likely(netmem_is_pref_nid(netmem, pref_nid))) { + pool->alloc.cache[pool->alloc.count++] = netmem; + } else { + /* NUMA mismatch; + * (1) release 1 page to page-allocator and + * (2) break out to fallthrough to alloc_pages_node. + * This limit stress on page buddy alloactor. + */ + page_pool_return_netmem(pool, netmem); + alloc_stat_inc(pool, waive); + netmem = 0; + break; + } + } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); + + /* Return last page */ + if (likely(pool->alloc.count > 0)) { + netmem = pool->alloc.cache[--pool->alloc.count]; + alloc_stat_inc(pool, refill); + } + + return netmem; +} + +/* fast path */ +static netmem_ref __page_pool_get_cached(struct page_pool *pool) +{ + netmem_ref netmem; + + /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ + if (likely(pool->alloc.count)) { + /* Fast-path */ + netmem = pool->alloc.cache[--pool->alloc.count]; + alloc_stat_inc(pool, fast); + } else { + netmem = page_pool_refill_alloc_cache(pool); + } + + return netmem; +} + +static void __page_pool_dma_sync_for_device(const struct page_pool *pool, + netmem_ref netmem, + u32 dma_sync_size) +{ +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) + dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem); + + dma_sync_size = min(dma_sync_size, pool->p.max_len); + __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, + dma_sync_size, pool->p.dma_dir); +#endif +} + +static __always_inline void +page_pool_dma_sync_for_device(const struct page_pool *pool, + netmem_ref netmem, + u32 dma_sync_size) +{ + if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { + rcu_read_lock(); + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ + if (pool->dma_sync) + __page_pool_dma_sync_for_device(pool, netmem, + dma_sync_size); + rcu_read_unlock(); + } +} + +static int page_pool_register_dma_index(struct page_pool *pool, + netmem_ref netmem, gfp_t gfp) +{ + int err = 0; + u32 id; + + if (unlikely(!PP_DMA_INDEX_BITS)) + goto out; + + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto out; + } + + netmem_set_dma_index(netmem, id); +out: + return err; +} + +static int page_pool_release_dma_index(struct page_pool *pool, + netmem_ref netmem) +{ + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; + + if (unlikely(!PP_DMA_INDEX_BITS)) + return 0; + + id = netmem_get_dma_index(netmem); + if (!id) + return -1; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return -1; + + netmem_set_dma_index(netmem, 0); + + return 0; +} + +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) +{ + dma_addr_t dma; + int err; + + /* Setup DMA mapping: use 'struct page' area for storing DMA-addr + * since dma_addr_t can be either 32 or 64 bits and does not always fit + * into page private data (i.e 32bit cpu with 64bit DMA caps) + * This mapping is kept for lifetime of page, until leaving pool. + */ + dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0, + (PAGE_SIZE << pool->p.order), pool->p.dma_dir, + DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_WEAK_ORDERING); + if (dma_mapping_error(pool->p.dev, dma)) + return false; + + if (page_pool_set_dma_addr_netmem(netmem, dma)) { + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); + goto unmap_failed; + } + + err = page_pool_register_dma_index(pool, netmem, gfp); + if (err) + goto unset_failed; + + page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); + + return true; + +unset_failed: + page_pool_set_dma_addr_netmem(netmem, 0); +unmap_failed: + dma_unmap_page_attrs(pool->p.dev, dma, + PAGE_SIZE << pool->p.order, pool->p.dma_dir, + DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); + return false; +} + +static struct page *__page_pool_alloc_page_order(struct page_pool *pool, + gfp_t gfp) +{ + struct page *page; + + gfp |= __GFP_COMP; + page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); + if (unlikely(!page)) + return NULL; + + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { + put_page(page); + return NULL; + } + + alloc_stat_inc(pool, slow_high_order); + page_pool_set_pp_info(pool, page_to_netmem(page)); + + /* Track how many pages are held 'in-flight' */ + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, page_to_netmem(page), + pool->pages_state_hold_cnt); + return page; +} + +/* slow path */ +static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool, + gfp_t gfp) +{ + const int bulk = PP_ALLOC_CACHE_REFILL; + unsigned int pp_order = pool->p.order; + bool dma_map = pool->dma_map; + netmem_ref netmem; + int i, nr_pages; + + /* Unconditionally set NOWARN if allocating from NAPI. + * Drivers forget to set it, and OOM reports on packet Rx are useless. + */ + if ((gfp & GFP_ATOMIC) == GFP_ATOMIC) + gfp |= __GFP_NOWARN; + + /* Don't support bulk alloc for high-order pages */ + if (unlikely(pp_order)) + return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); + + /* Unnecessary as alloc cache is empty, but guarantees zero count */ + if (unlikely(pool->alloc.count > 0)) + return pool->alloc.cache[--pool->alloc.count]; + + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ + memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); + + nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, + (struct page **)pool->alloc.cache); + if (unlikely(!nr_pages)) + return 0; + + /* Pages have been filled into alloc.cache array, but count is zero and + * page element have not been (possibly) DMA mapped. + */ + for (i = 0; i < nr_pages; i++) { + netmem = pool->alloc.cache[i]; + if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { + put_page(netmem_to_page(netmem)); + continue; + } + + page_pool_set_pp_info(pool, netmem); + pool->alloc.cache[pool->alloc.count++] = netmem; + /* Track how many pages are held 'in-flight' */ + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, + pool->pages_state_hold_cnt); + } + + /* Return last page */ + if (likely(pool->alloc.count > 0)) { + netmem = pool->alloc.cache[--pool->alloc.count]; + alloc_stat_inc(pool, slow); + } else { + netmem = 0; + } + + /* When page just alloc'ed is should/must have refcnt 1. */ + return netmem; +} + +/* For using page_pool replace: alloc_pages() API calls, but provide + * synchronization guarantee for allocation side. + */ +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) +{ + netmem_ref netmem; + + /* Fast-path: Get a page from cache */ + netmem = __page_pool_get_cached(pool); + if (netmem) + return netmem; + + /* Slow-path: cache empty, do real allocation */ + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + netmem = pool->mp_ops->alloc_netmems(pool, gfp); + else + netmem = __page_pool_alloc_netmems_slow(pool, gfp); + return netmem; +} +EXPORT_SYMBOL(page_pool_alloc_netmems); +ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL); + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); +} +EXPORT_SYMBOL(page_pool_alloc_pages); + +/* Calculate distance between two u32 values, valid if distance is below 2^(31) + * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution + */ +#define _distance(a, b) (s32)((a) - (b)) + +s32 page_pool_inflight(const struct page_pool *pool, bool strict) +{ + u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); + u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); + s32 inflight; + + inflight = _distance(hold_cnt, release_cnt); + + if (strict) { + trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); + WARN(inflight < 0, "Negative(%d) inflight packet-pages", + inflight); + } else { + inflight = max(0, inflight); + } + + return inflight; +} + +void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) +{ + netmem_set_pp(netmem, pool); + netmem_or_pp_magic(netmem, PP_SIGNATURE); + + /* Ensuring all pages have been split into one fragment initially: + * page_pool_set_pp_info() is only called once for every page when it + * is allocated from the page allocator and page_pool_fragment_page() + * is dirtying the same cache line as the page->pp_magic above, so + * the overhead is negligible. + */ + page_pool_fragment_netmem(netmem, 1); + if (pool->has_init_callback) + pool->slow.init_callback(netmem, pool->slow.init_arg); +} + +void page_pool_clear_pp_info(netmem_ref netmem) +{ + netmem_clear_pp_magic(netmem); + netmem_set_pp(netmem, NULL); +} + +static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool, + netmem_ref netmem) +{ + dma_addr_t dma; + + if (!pool->dma_map) + /* Always account for inflight pages, even if we didn't + * map them + */ + return; + + if (page_pool_release_dma_index(pool, netmem)) + return; + + dma = page_pool_get_dma_addr_netmem(netmem); + + /* When page is unmapped, it cannot be returned to our pool */ + dma_unmap_page_attrs(pool->p.dev, dma, + PAGE_SIZE << pool->p.order, pool->p.dma_dir, + DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); + page_pool_set_dma_addr_netmem(netmem, 0); +} + +/* Disconnects a page (from a page_pool). API users can have a need + * to disconnect a page (from a page_pool), to allow it to be used as + * a regular page (that will eventually be returned to the normal + * page-allocator via put_page). + */ +static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem) +{ + int count; + bool put; + + put = true; + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + put = pool->mp_ops->release_netmem(pool, netmem); + else + __page_pool_release_netmem_dma(pool, netmem); + + /* This may be the last page returned, releasing the pool, so + * it is not safe to reference pool afterwards. + */ + count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); + trace_page_pool_state_release(pool, netmem, count); + + if (put) { + page_pool_clear_pp_info(netmem); + put_page(netmem_to_page(netmem)); + } + /* An optimization would be to call __free_pages(page, pool->p.order) + * knowing page is not part of page-cache (thus avoiding a + * __page_cache_release() call). + */ +} + +static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) +{ + bool in_softirq, ret; + + /* BH protection not needed if current is softirq */ + in_softirq = page_pool_producer_lock(pool); + ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem); + if (ret) + recycle_stat_inc(pool, ring); + page_pool_producer_unlock(pool, in_softirq); + + return ret; +} + +/* Only allow direct recycling in special circumstances, into the + * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. + * + * Caller must provide appropriate safe context. + */ +static bool page_pool_recycle_in_cache(netmem_ref netmem, + struct page_pool *pool) +{ + if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { + recycle_stat_inc(pool, cache_full); + return false; + } + + /* Caller MUST have verified/know (page_ref_count(page) == 1) */ + pool->alloc.cache[pool->alloc.count++] = netmem; + recycle_stat_inc(pool, cached); + return true; +} + +static bool __page_pool_page_can_be_recycled(netmem_ref netmem) +{ + return netmem_is_net_iov(netmem) || + (page_ref_count(netmem_to_page(netmem)) == 1 && + !page_is_pfmemalloc(netmem_to_page(netmem))); +} + +/* If the page refcnt == 1, this will try to recycle the page. + * If pool->dma_sync is set, we'll try to sync the DMA area for + * the configured size min(dma_sync_size, pool->max_len). + * If the page refcnt != 1, then the page will be returned to memory + * subsystem. + */ +static __always_inline netmem_ref +__page_pool_put_page(struct page_pool *pool, netmem_ref netmem, + unsigned int dma_sync_size, bool allow_direct) +{ + lockdep_assert_no_hardirq(); + + /* This allocator is optimized for the XDP mode that uses + * one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * refcnt == 1 means page_pool owns page, and can recycle it. + * + * page is NOT reusable when allocated when system is under + * some pressure. (page_is_pfmemalloc) + */ + if (likely(__page_pool_page_can_be_recycled(netmem))) { + /* Read barrier done in page_ref_count / READ_ONCE */ + + page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); + + if (allow_direct && page_pool_recycle_in_cache(netmem, pool)) + return 0; + + /* Page found as candidate for recycling */ + return netmem; + } + + /* Fallback/non-XDP mode: API user have elevated refcnt. + * + * Many drivers split up the page into fragments, and some + * want to keep doing this to save memory and do refcnt based + * recycling. Support this use case too, to ease drivers + * switching between XDP/non-XDP. + * + * In-case page_pool maintains the DMA mapping, API user must + * call page_pool_put_page once. In this elevated refcnt + * case, the DMA is unmapped/released, as driver is likely + * doing refcnt based recycle tricks, meaning another process + * will be invoking put_page. + */ + recycle_stat_inc(pool, released_refcnt); + page_pool_return_netmem(pool, netmem); + + return 0; +} + +static bool page_pool_napi_local(const struct page_pool *pool) +{ + const struct napi_struct *napi; + u32 cpuid; + + /* On PREEMPT_RT the softirq can be preempted by the consumer */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; + + if (unlikely(!in_softirq())) + return false; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. + */ + cpuid = smp_processor_id(); + if (READ_ONCE(pool->cpuid) == cpuid) + return true; + + napi = READ_ONCE(pool->p.napi); + + return napi && READ_ONCE(napi->list_owner) == cpuid; +} + +void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, + unsigned int dma_sync_size, bool allow_direct) +{ + if (!allow_direct) + allow_direct = page_pool_napi_local(pool); + + netmem = __page_pool_put_page(pool, netmem, dma_sync_size, + allow_direct); + if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { + /* Cache full, fallback to free pages */ + recycle_stat_inc(pool, ring_full); + page_pool_return_netmem(pool, netmem); + } +} +EXPORT_SYMBOL(page_pool_put_unrefed_netmem); + +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size, + allow_direct); +} +EXPORT_SYMBOL(page_pool_put_unrefed_page); + +static void page_pool_recycle_ring_bulk(struct page_pool *pool, + netmem_ref *bulk, + u32 bulk_len) +{ + bool in_softirq; + u32 i; + + /* Bulk produce into ptr_ring page_pool cache */ + in_softirq = page_pool_producer_lock(pool); + + for (i = 0; i < bulk_len; i++) { + if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { + /* ring full */ + recycle_stat_inc(pool, ring_full); + break; + } + } + + page_pool_producer_unlock(pool, in_softirq); + recycle_stat_add(pool, ring, i); + + /* Hopefully all pages were returned into ptr_ring */ + if (likely(i == bulk_len)) + return; + + /* + * ptr_ring cache is full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation. + */ + for (; i < bulk_len; i++) + page_pool_return_netmem(pool, bulk[i]); +} + +/** + * page_pool_put_netmem_bulk() - release references on multiple netmems + * @data: array holding netmem references + * @count: number of entries in @data + * + * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring + * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk() + * will release leftover netmems to the memory provider. + * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx + * completion loop for the XDP_REDIRECT use case. + * + * Please note the caller must not use data area after running + * page_pool_put_netmem_bulk(), as this function overwrites it. + */ +void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) +{ + u32 bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + netmem_ref netmem = netmem_compound_head(data[i]); + + if (page_pool_unref_and_test(netmem)) + data[bulk_len++] = netmem; + } + + count = bulk_len; + while (count) { + netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; + struct page_pool *pool = NULL; + bool allow_direct; + u32 foreign = 0; + + bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + struct page_pool *netmem_pp; + netmem_ref netmem = data[i]; + + netmem_pp = netmem_get_pp(netmem); + if (unlikely(!pool)) { + pool = netmem_pp; + allow_direct = page_pool_napi_local(pool); + } else if (netmem_pp != pool) { + /* + * If the netmem belongs to a different + * page_pool, save it for another round. + */ + data[foreign++] = netmem; + continue; + } + + netmem = __page_pool_put_page(pool, netmem, -1, + allow_direct); + /* Approved for bulk recycling in ptr_ring cache */ + if (netmem) + bulk[bulk_len++] = netmem; + } + + if (bulk_len) + page_pool_recycle_ring_bulk(pool, bulk, bulk_len); + + count = foreign; + } +} +EXPORT_SYMBOL(page_pool_put_netmem_bulk); + +static netmem_ref page_pool_drain_frag(struct page_pool *pool, + netmem_ref netmem) +{ + long drain_count = BIAS_MAX - pool->frag_users; + + /* Some user is still using the page frag */ + if (likely(page_pool_unref_netmem(netmem, drain_count))) + return 0; + + if (__page_pool_page_can_be_recycled(netmem)) { + page_pool_dma_sync_for_device(pool, netmem, -1); + return netmem; + } + + page_pool_return_netmem(pool, netmem); + return 0; +} + +static void page_pool_free_frag(struct page_pool *pool) +{ + long drain_count = BIAS_MAX - pool->frag_users; + netmem_ref netmem = pool->frag_page; + + pool->frag_page = 0; + + if (!netmem || page_pool_unref_netmem(netmem, drain_count)) + return; + + page_pool_return_netmem(pool, netmem); +} + +netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, + unsigned int *offset, unsigned int size, + gfp_t gfp) +{ + unsigned int max_size = PAGE_SIZE << pool->p.order; + netmem_ref netmem = pool->frag_page; + + if (WARN_ON(size > max_size)) + return 0; + + size = ALIGN(size, dma_get_cache_alignment()); + *offset = pool->frag_offset; + + if (netmem && *offset + size > max_size) { + netmem = page_pool_drain_frag(pool, netmem); + if (netmem) { + recycle_stat_inc(pool, cached); + alloc_stat_inc(pool, fast); + goto frag_reset; + } + } + + if (!netmem) { + netmem = page_pool_alloc_netmems(pool, gfp); + if (unlikely(!netmem)) { + pool->frag_page = 0; + return 0; + } + + pool->frag_page = netmem; + +frag_reset: + pool->frag_users = 1; + *offset = 0; + pool->frag_offset = size; + page_pool_fragment_netmem(netmem, BIAS_MAX); + return netmem; + } + + pool->frag_users++; + pool->frag_offset = *offset + size; + return netmem; +} +EXPORT_SYMBOL(page_pool_alloc_frag_netmem); + +struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, + unsigned int size, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size, + gfp)); +} +EXPORT_SYMBOL(page_pool_alloc_frag); + +static void page_pool_empty_ring(struct page_pool *pool) +{ + netmem_ref netmem; + + /* Empty recycle ring */ + while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { + /* Verify the refcnt invariant of cached pages */ + if (!(netmem_ref_count(netmem) == 1)) + pr_crit("%s() page_pool refcnt %d violation\n", + __func__, netmem_ref_count(netmem)); + + page_pool_return_netmem(pool, netmem); + } +} + +static void __page_pool_destroy(struct page_pool *pool) +{ + if (pool->disconnect) + pool->disconnect(pool); + + page_pool_unlist(pool); + page_pool_uninit(pool); + + if (pool->mp_ops) { + pool->mp_ops->destroy(pool); + static_branch_dec(&page_pool_mem_providers); + } + + kfree(pool); +} + +static void page_pool_empty_alloc_cache_once(struct page_pool *pool) +{ + netmem_ref netmem; + + if (pool->destroy_cnt) + return; + + /* Empty alloc cache, assume caller made sure this is + * no-longer in use, and page_pool_alloc_pages() cannot be + * call concurrently. + */ + while (pool->alloc.count) { + netmem = pool->alloc.cache[--pool->alloc.count]; + page_pool_return_netmem(pool, netmem); + } +} + +static void page_pool_scrub(struct page_pool *pool) +{ + unsigned long id; + void *ptr; + + page_pool_empty_alloc_cache_once(pool); + if (!pool->destroy_cnt++ && pool->dma_map) { + if (pool->dma_sync) { + /* Disable page_pool_dma_sync_for_device() */ + pool->dma_sync = false; + + /* Make sure all concurrent returns that may see the old + * value of dma_sync (and thus perform a sync) have + * finished before doing the unmapping below. Skip the + * wait if the device doesn't actually need syncing, or + * if there are no outstanding mapped pages. + */ + if (dma_dev_need_sync(pool->p.dev) && + !xa_empty(&pool->dma_mapped)) + synchronize_net(); + } + + xa_for_each(&pool->dma_mapped, id, ptr) + __page_pool_release_netmem_dma(pool, page_to_netmem((struct page *)ptr)); + } + + /* No more consumers should exist, but producers could still + * be in-flight. + */ + page_pool_empty_ring(pool); +} + +static int page_pool_release(struct page_pool *pool) +{ + bool in_softirq; + int inflight; + + page_pool_scrub(pool); + inflight = page_pool_inflight(pool, true); + /* Acquire producer lock to make sure producers have exited. */ + in_softirq = page_pool_producer_lock(pool); + page_pool_producer_unlock(pool, in_softirq); + if (!inflight) + __page_pool_destroy(pool); + + return inflight; +} + +static void page_pool_release_retry(struct work_struct *wq) +{ + struct delayed_work *dwq = to_delayed_work(wq); + struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); + void *netdev; + int inflight; + + inflight = page_pool_release(pool); + /* In rare cases, a driver bug may cause inflight to go negative. + * Don't reschedule release if inflight is 0 or negative. + * - If 0, the page_pool has been destroyed + * - if negative, we will never recover + * in both cases no reschedule is necessary. + */ + if (inflight <= 0) + return; + + /* Periodic warning for page pools the user can't see */ + netdev = READ_ONCE(pool->slow.netdev); + if (time_after_eq(jiffies, pool->defer_warn) && + (!netdev || netdev == NET_PTR_POISON)) { + int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; + + pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", + __func__, pool->user.id, inflight, sec); + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; + } + + /* Still not ready to be disconnected, retry later */ + schedule_delayed_work(&pool->release_dw, DEFER_TIME); +} + +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), + const struct xdp_mem_info *mem) +{ + refcount_inc(&pool->user_cnt); + pool->disconnect = disconnect; + pool->xdp_mem_id = mem->id; +} + +/** + * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI + * @pool: page pool to modify + * @napi: NAPI instance to associate the page pool with + * + * Associate a page pool with a NAPI instance for lockless page recycling. + * This is useful when a new page pool has to be added to a NAPI instance + * without disabling that NAPI instance, to mark the point at which control + * path "hands over" the page pool to the NAPI instance. In most cases driver + * can simply set the @napi field in struct page_pool_params, and does not + * have to call this helper. + * + * The function is idempotent, but does not implement any refcounting. + * Single page_pool_disable_direct_recycling() will disable recycling, + * no matter how many times enable was called. + */ +void page_pool_enable_direct_recycling(struct page_pool *pool, + struct napi_struct *napi) +{ + if (READ_ONCE(pool->p.napi) == napi) + return; + WARN_ON(!napi || pool->p.napi); + + mutex_lock(&page_pools_lock); + WRITE_ONCE(pool->p.napi, napi); + mutex_unlock(&page_pools_lock); +} +EXPORT_SYMBOL(page_pool_enable_direct_recycling); + +void page_pool_disable_direct_recycling(struct page_pool *pool) +{ + /* Disable direct recycling based on pool->cpuid. + * Paired with READ_ONCE() in page_pool_napi_local(). + */ + WRITE_ONCE(pool->cpuid, -1); + + if (!pool->p.napi) + return; + + napi_assert_will_not_race(pool->p.napi); + + mutex_lock(&page_pools_lock); + WRITE_ONCE(pool->p.napi, NULL); + mutex_unlock(&page_pools_lock); +} +EXPORT_SYMBOL(page_pool_disable_direct_recycling); + +void page_pool_destroy(struct page_pool *pool) +{ + if (!pool) + return; + + if (!page_pool_put(pool)) + return; + + page_pool_disable_direct_recycling(pool); + page_pool_free_frag(pool); + + if (!page_pool_release(pool)) + return; + + page_pool_detached(pool); + pool->defer_start = jiffies; + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; + + INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); + schedule_delayed_work(&pool->release_dw, DEFER_TIME); +} +EXPORT_SYMBOL(page_pool_destroy); + +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid) +{ + netmem_ref netmem; + + trace_page_pool_update_nid(pool, new_nid); + pool->p.nid = new_nid; + + /* Flush pool alloc cache, as refill will check NUMA node */ + while (pool->alloc.count) { + netmem = pool->alloc.cache[--pool->alloc.count]; + page_pool_return_netmem(pool, netmem); + } +} +EXPORT_SYMBOL(page_pool_update_nid); + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); +} + +/* Associate a niov with a page pool. Should follow with a matching + * net_mp_niov_clear_page_pool() + */ +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); +} + +/* Disassociate a niov from a page pool. Should only be used in the + * ->release_netmem() path. + */ +void net_mp_niov_clear_page_pool(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_clear_pp_info(netmem); +} diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h new file mode 100644 index 000000000000..2fb06d5f6d55 --- /dev/null +++ b/net/core/page_pool_priv.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __PAGE_POOL_PRIV_H +#define __PAGE_POOL_PRIV_H + +#include <net/page_pool/helpers.h> + +#include "netmem_priv.h" + +extern struct mutex page_pools_lock; + +s32 page_pool_inflight(const struct page_pool *pool, bool strict); + +int page_pool_list(struct page_pool *pool); +void page_pool_detached(struct page_pool *pool); +void page_pool_unlist(struct page_pool *pool); + +static inline bool +page_pool_set_dma_addr_netmem(netmem_ref netmem, dma_addr_t addr) +{ + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) { + netmem_set_dma_addr(netmem, addr >> PAGE_SHIFT); + + /* We assume page alignment to shave off bottom bits, + * if this "compression" doesn't work we need to drop. + */ + return addr != (dma_addr_t)netmem_get_dma_addr(netmem) + << PAGE_SHIFT; + } + + netmem_set_dma_addr(netmem, addr); + return false; +} + +static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr); +} + +#if defined(CONFIG_PAGE_POOL) +void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem); +void page_pool_clear_pp_info(netmem_ref netmem); +int page_pool_check_memory_provider(struct net_device *dev, + struct netdev_rx_queue *rxq); +#else +static inline void page_pool_set_pp_info(struct page_pool *pool, + netmem_ref netmem) +{ +} +static inline void page_pool_clear_pp_info(netmem_ref netmem) +{ +} +static inline int page_pool_check_memory_provider(struct net_device *dev, + struct netdev_rx_queue *rxq) +{ + return 0; +} +#endif + +#endif diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c new file mode 100644 index 000000000000..c82a95beceff --- /dev/null +++ b/net/core/page_pool_user.c @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/xarray.h> +#include <net/busy_poll.h> +#include <net/net_debug.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/helpers.h> +#include <net/page_pool/types.h> +#include <net/page_pool/memory_provider.h> +#include <net/sock.h> + +#include "page_pool_priv.h" +#include "netdev-genl-gen.h" + +static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); +/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev, + * pool->user. + * Ordering: inside rtnl_lock + */ +DEFINE_MUTEX(page_pools_lock); + +/* Page pools are only reachable from user space (via netlink) if they are + * linked to a netdev at creation time. Following page pool "visibility" + * states are possible: + * - normal + * - user.list: linked to real netdev, netdev: real netdev + * - orphaned - real netdev has disappeared + * - user.list: linked to lo, netdev: lo + * - invisible - either (a) created without netdev linking, (b) unlisted due + * to error, or (c) the entire namespace which owned this pool disappeared + * - user.list: unhashed, netdev: unknown + */ + +typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info); + +static int +netdev_nl_page_pool_get_do(struct genl_info *info, u32 id, pp_nl_fill_cb fill) +{ + struct page_pool *pool; + struct sk_buff *rsp; + int err; + + mutex_lock(&page_pools_lock); + pool = xa_load(&page_pools, id); + if (!pool || hlist_unhashed(&pool->user.list) || + !net_eq(dev_net(pool->slow.netdev), genl_info_net(info))) { + err = -ENOENT; + goto err_unlock; + } + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) { + err = -ENOMEM; + goto err_unlock; + } + + err = fill(rsp, pool, info); + if (err) + goto err_free_msg; + + mutex_unlock(&page_pools_lock); + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); +err_unlock: + mutex_unlock(&page_pools_lock); + return err; +} + +struct page_pool_dump_cb { + unsigned long ifindex; + u32 pp_id; +}; + +static int +netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb, + pp_nl_fill_cb fill) +{ + struct page_pool_dump_cb *state = (void *)cb->ctx; + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + struct page_pool *pool; + int err = 0; + + rtnl_lock(); + mutex_lock(&page_pools_lock); + for_each_netdev_dump(net, netdev, state->ifindex) { + hlist_for_each_entry(pool, &netdev->page_pools, user.list) { + if (state->pp_id && state->pp_id < pool->user.id) + continue; + + state->pp_id = pool->user.id; + err = fill(skb, pool, info); + if (err) + goto out; + } + + state->pp_id = 0; + } +out: + mutex_unlock(&page_pools_lock); + rtnl_unlock(); + + return err; +} + +static int +page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info) +{ +#ifdef CONFIG_PAGE_POOL_STATS + struct page_pool_stats stats = {}; + struct nlattr *nest; + void *hdr; + + if (!page_pool_get_stats(pool, &stats)) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + nest = nla_nest_start(rsp, NETDEV_A_PAGE_POOL_STATS_INFO); + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id) || + (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && + nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, + pool->slow.netdev->ifindex))) + goto err_cancel_nest; + + nla_nest_end(rsp, nest); + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST, + stats.alloc_stats.fast) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW, + stats.alloc_stats.slow) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER, + stats.alloc_stats.slow_high_order) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY, + stats.alloc_stats.empty) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL, + stats.alloc_stats.refill) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE, + stats.alloc_stats.waive) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED, + stats.recycle_stats.cached) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL, + stats.recycle_stats.cache_full) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING, + stats.recycle_stats.ring) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL, + stats.recycle_stats.ring_full) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT, + stats.recycle_stats.released_refcnt)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + + return 0; +err_cancel_nest: + nla_nest_cancel(rsp, nest); +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +#else + GENL_SET_ERR_MSG(info, "kernel built without CONFIG_PAGE_POOL_STATS"); + return -EOPNOTSUPP; +#endif +} + +int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *tb[ARRAY_SIZE(netdev_page_pool_info_nl_policy)]; + struct nlattr *nest; + int err; + u32 id; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_STATS_INFO)) + return -EINVAL; + + nest = info->attrs[NETDEV_A_PAGE_POOL_STATS_INFO]; + err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, + netdev_page_pool_info_nl_policy, + info->extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, NETDEV_A_PAGE_POOL_ID)) + return -EINVAL; + if (tb[NETDEV_A_PAGE_POOL_IFINDEX]) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NETDEV_A_PAGE_POOL_IFINDEX], + "selecting by ifindex not supported"); + return -EINVAL; + } + + id = nla_get_uint(tb[NETDEV_A_PAGE_POOL_ID]); + + return netdev_nl_page_pool_get_do(info, id, page_pool_nl_stats_fill); +} + +int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill); +} + +static int +page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info) +{ + size_t inflight, refsz; + unsigned int napi_id; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id)) + goto err_cancel; + + if (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && + nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, + pool->slow.netdev->ifindex)) + goto err_cancel; + + napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0; + if (napi_id_valid(napi_id) && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id)) + goto err_cancel; + + inflight = page_pool_inflight(pool, false); + refsz = PAGE_SIZE << pool->p.order; + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT, inflight) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, + inflight * refsz)) + goto err_cancel; + if (pool->user.detach_time && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME, + pool->user.detach_time)) + goto err_cancel; + + if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) + goto err_cancel; + + genlmsg_end(rsp, hdr); + + return 0; +err_cancel: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + struct net *net; + + lockdep_assert_held(&page_pools_lock); + + /* 'invisible' page pools don't matter */ + if (hlist_unhashed(&pool->user.list)) + return; + net = dev_net(pool->slow.netdev); + + if (!genl_has_listeners(&netdev_nl_family, net, NETDEV_NLGRP_PAGE_POOL)) + return; + + genl_info_init_ntf(&info, &netdev_nl_family, cmd); + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + if (page_pool_nl_fill(ntf, pool, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&netdev_nl_family, net, ntf, + 0, NETDEV_NLGRP_PAGE_POOL, GFP_KERNEL); +} + +int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + u32 id; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_ID)) + return -EINVAL; + + id = nla_get_uint(info->attrs[NETDEV_A_PAGE_POOL_ID]); + + return netdev_nl_page_pool_get_do(info, id, page_pool_nl_fill); +} + +int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_fill); +} + +int page_pool_list(struct page_pool *pool) +{ + static u32 id_alloc_next; + int err; + + mutex_lock(&page_pools_lock); + err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b, + &id_alloc_next, GFP_KERNEL); + if (err < 0) + goto err_unlock; + + INIT_HLIST_NODE(&pool->user.list); + if (pool->slow.netdev) { + hlist_add_head(&pool->user.list, + &pool->slow.netdev->page_pools); + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); + } + + mutex_unlock(&page_pools_lock); + return 0; + +err_unlock: + mutex_unlock(&page_pools_lock); + return err; +} + +void page_pool_detached(struct page_pool *pool) +{ + mutex_lock(&page_pools_lock); + pool->user.detach_time = ktime_get_boottime_seconds(); + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); + mutex_unlock(&page_pools_lock); +} + +void page_pool_unlist(struct page_pool *pool) +{ + mutex_lock(&page_pools_lock); + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF); + xa_erase(&page_pools, pool->user.id); + if (!hlist_unhashed(&pool->user.list)) + hlist_del(&pool->user.list); + mutex_unlock(&page_pools_lock); +} + +int page_pool_check_memory_provider(struct net_device *dev, + struct netdev_rx_queue *rxq) +{ + void *binding = rxq->mp_params.mp_priv; + struct page_pool *pool; + struct hlist_node *n; + + if (!binding) + return 0; + + mutex_lock(&page_pools_lock); + hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) { + if (pool->mp_priv != binding) + continue; + + if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) { + mutex_unlock(&page_pools_lock); + return 0; + } + } + mutex_unlock(&page_pools_lock); + return -ENODATA; +} + +static void page_pool_unreg_netdev_wipe(struct net_device *netdev) +{ + struct page_pool *pool; + struct hlist_node *n; + + mutex_lock(&page_pools_lock); + hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) { + hlist_del_init(&pool->user.list); + pool->slow.netdev = NET_PTR_POISON; + } + mutex_unlock(&page_pools_lock); +} + +static void page_pool_unreg_netdev(struct net_device *netdev) +{ + struct page_pool *pool, *last; + struct net_device *lo; + + lo = dev_net(netdev)->loopback_dev; + + mutex_lock(&page_pools_lock); + last = NULL; + hlist_for_each_entry(pool, &netdev->page_pools, user.list) { + pool->slow.netdev = lo; + netdev_nl_page_pool_event(pool, + NETDEV_CMD_PAGE_POOL_CHANGE_NTF); + last = pool; + } + if (last) + hlist_splice_init(&netdev->page_pools, &last->user.list, + &lo->page_pools); + mutex_unlock(&page_pools_lock); +} + +static int +page_pool_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + + if (hlist_empty(&netdev->page_pools)) + return NOTIFY_OK; + + if (netdev->ifindex != LOOPBACK_IFINDEX) + page_pool_unreg_netdev(netdev); + else + page_pool_unreg_netdev_wipe(netdev); + return NOTIFY_OK; +} + +static struct notifier_block page_pool_netdevice_nb = { + .notifier_call = page_pool_netdevice_event, +}; + +static int __init page_pool_user_init(void) +{ + return register_netdevice_notifier(&page_pool_netdevice_nb); +} + +subsys_initcall(page_pool_user_init); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 9640972ec50e..d41b03fd1f63 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Authors: * Copyright 2001, 2002 by Robert Olsson <robert.olsson@its.uu.se> @@ -8,12 +9,6 @@ * Ben Greear <greearb@candelatech.com> * Jens Låås <jens.laas@data.slu.se> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * * A tool for loading the network with preconfigurated packets. * The tool is implemented as a linux module. Parameters are output * device, delay (to hard_xmit), number of packets, and whether @@ -60,21 +55,21 @@ * * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br) * - * * 021124 Finished major redesign and rewrite for new functionality. - * See Documentation/networking/pktgen.txt for how to use this. + * See Documentation/networking/pktgen.rst for how to use this. * * The new operation: * For each CPU one thread/process is created at start. This process checks * for running devices in the if_list and sends packets until count is 0 it * also the thread checks the thread->control which is used for inter-process * communication. controlling process "posts" operations to the threads this - * way. The if_lock should be possible to remove when add/rem_device is merged - * into this too. + * way. + * The if_list is RCU protected, and the if_lock remains to protect updating + * of if_list, from "add_device" as it invoked from userspace (via proc write). * * By design there should only be *one* "controlling" process. In practice * multiple write accesses gives unpredictable result. Understood by "write" - * to /proc gives result code thats should be read be the "writer". + * to /proc gives result code that should be read be the "writer". * For practical use this should be no problem. * * Note when adding devices to a specific CPU there good idea to also assign @@ -96,7 +91,7 @@ * New xmit() return, do_div and misc clean up by Stephen Hemminger * <shemminger@osdl.org> 040923 * - * Randy Dunlap fixed u64 printk compiler waring + * Randy Dunlap fixed u64 printk compiler warning * * Remove FCS from BW calculation. Lennert Buytenhek <buytenh@wantstofly.org> * New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213 @@ -113,13 +108,13 @@ * * Fixed src_mac command to set source mac of packet to value specified in * command by Adit Ranadive <adit.262@gmail.com> - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sys.h> #include <linux/types.h> +#include <linux/minmax.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> @@ -157,13 +152,14 @@ #include <linux/etherdevice.h> #include <linux/kthread.h> #include <linux/prefetch.h> +#include <linux/mmzone.h> #include <net/net_namespace.h> #include <net/checksum.h> #include <net/ipv6.h> +#include <net/udp.h> +#include <net/ip6_checksum.h> #include <net/addrconf.h> -#ifdef CONFIG_XFRM #include <net/xfrm.h> -#endif #include <net/netns/generic.h> #include <asm/byteorder.h> #include <linux/rcupdate.h> @@ -174,30 +170,55 @@ #include <asm/dma.h> #include <asm/div64.h> /* do_div */ -#define VERSION "2.74" +#define VERSION "2.75" #define IP_NAME_SZ 32 #define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ #define MPLS_STACK_BOTTOM htonl(0x00000100) - -#define func_enter() pr_debug("entering %s\n", __func__); +/* Max number of internet mix entries that can be specified in imix_weights. */ +#define MAX_IMIX_ENTRIES 20 +#define IMIX_PRECISION 100 /* Precision of IMIX distribution */ + +#define func_enter() pr_debug("entering %s\n", __func__) + +#define PKT_FLAGS \ + pf(IPV6) /* Interface in IPV6 Mode */ \ + pf(IPSRC_RND) /* IP-Src Random */ \ + pf(IPDST_RND) /* IP-Dst Random */ \ + pf(TXSIZE_RND) /* Transmit size is random */ \ + pf(UDPSRC_RND) /* UDP-Src Random */ \ + pf(UDPDST_RND) /* UDP-Dst Random */ \ + pf(UDPCSUM) /* Include UDP checksum */ \ + pf(NO_TIMESTAMP) /* Don't timestamp packets (default TS) */ \ + pf(MPLS_RND) /* Random MPLS labels */ \ + pf(QUEUE_MAP_RND) /* queue map Random */ \ + pf(QUEUE_MAP_CPU) /* queue map mirrors smp_processor_id() */ \ + pf(FLOW_SEQ) /* Sequential flows */ \ + pf(IPSEC) /* ipsec on for flows */ \ + pf(MACSRC_RND) /* MAC-Src Random */ \ + pf(MACDST_RND) /* MAC-Dst Random */ \ + pf(VID_RND) /* Random VLAN ID */ \ + pf(SVID_RND) /* Random SVLAN ID */ \ + pf(NODE) /* Node memory alloc*/ \ + pf(SHARED) /* Shared SKB */ \ + +#define pf(flag) flag##_SHIFT, +enum pkt_flags { + PKT_FLAGS +}; +#undef pf /* Device flag bits */ -#define F_IPSRC_RND (1<<0) /* IP-Src Random */ -#define F_IPDST_RND (1<<1) /* IP-Dst Random */ -#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */ -#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */ -#define F_MACSRC_RND (1<<4) /* MAC-Src Random */ -#define F_MACDST_RND (1<<5) /* MAC-Dst Random */ -#define F_TXSIZE_RND (1<<6) /* Transmit size is random */ -#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */ -#define F_MPLS_RND (1<<8) /* Random MPLS labels */ -#define F_VID_RND (1<<9) /* Random VLAN ID */ -#define F_SVID_RND (1<<10) /* Random SVLAN ID */ -#define F_FLOW_SEQ (1<<11) /* Sequential flows */ -#define F_IPSEC_ON (1<<12) /* ipsec on for flows */ -#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ -#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ -#define F_NODE (1<<15) /* Node memory alloc*/ +#define pf(flag) static const __u32 F_##flag = (1<<flag##_SHIFT); +PKT_FLAGS +#undef pf + +#define pf(flag) __stringify(flag), +static char *pkt_flag_names[] = { + PKT_FLAGS +}; +#undef pf + +#define NR_PKT_FLAGS ARRAY_SIZE(pkt_flag_names) /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ @@ -205,9 +226,14 @@ #define T_REMDEVALL (1<<2) /* Remove all devs */ #define T_REMDEV (1<<3) /* Remove one dev */ -/* If lock -- can be removed after some work */ -#define if_lock(t) spin_lock(&(t->if_lock)); -#define if_unlock(t) spin_unlock(&(t->if_lock)); +/* Xmit modes */ +#define M_START_XMIT 0 /* Default normal TX */ +#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ +#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ + +/* If lock -- protects updating of if_list */ +#define if_lock(t) mutex_lock(&(t->if_lock)) +#define if_unlock(t) mutex_unlock(&(t->if_lock)) /* Used to help with determining the pkts on receive */ #define PKTGEN_MAGIC 0xbe9be955 @@ -219,6 +245,12 @@ #define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4) #define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4) +struct imix_pkt { + u64 size; + u64 weight; + u64 count_so_far; +}; + struct flow_state { __be32 cur_daddr; int count; @@ -238,6 +270,7 @@ struct pktgen_dev { struct proc_dir_entry *entry; /* proc file */ struct pktgen_thread *pg_thread;/* the owner */ struct list_head list; /* chaining in the thread's run-queue */ + struct rcu_head rcu; /* freed by RCU */ int running; /* if false, the test will stop */ @@ -245,13 +278,15 @@ struct pktgen_dev { * we will do a random selection from within the range. */ __u32 flags; - int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ - + int xmit_mode; int min_pkt_size; int max_pkt_size; int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; + int removal_mark; /* non-zero => the device is marked for + * removal by worker thread + */ + struct page *page; u64 delay; /* nano-seconds */ @@ -262,7 +297,6 @@ struct pktgen_dev { /* runtime counters relating to clone_skb */ - __u64 allocated_skbs; __u32 clone_count; int last_ok; /* Was last skb sent? * Or a failed transmit of some sort? @@ -314,10 +348,18 @@ struct pktgen_dev { __u16 udp_dst_max; /* exclusive, dest UDP port */ /* DSCP + ECN */ - __u8 tos; /* six MSB of (former) IPv4 TOS - are for dscp codepoint */ - __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 - (see RFC 3260, sec. 4) */ + __u8 tos; /* six MSB of (former) IPv4 TOS + * are for dscp codepoint + */ + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 + * (see RFC 3260, sec. 4) + */ + + /* IMIX */ + unsigned int n_imix_entries; + struct imix_pkt imix_entries[MAX_IMIX_ENTRIES]; + /* Maps 0-IMIX_PRECISION range to imix_entry based on probability*/ + __u8 imix_distribution[IMIX_PRECISION]; /* MPLS */ unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */ @@ -351,12 +393,12 @@ struct pktgen_dev { __u8 hh[14]; /* = { - 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, - - We fill in SRC address later - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x08, 0x00 - }; + * 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + * + * We fill in SRC address later + * 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + * 0x08, 0x00 + * }; */ __u16 pad; /* pad out the hh struct to an even 16 bytes */ @@ -371,6 +413,7 @@ struct pktgen_dev { * device name (not when the inject is * started as it used to do.) */ + netdevice_tracker dev_tracker; char odevname[32]; struct flow_state *flows; unsigned int cflows; /* Concurrent flows (config) */ @@ -381,11 +424,15 @@ struct pktgen_dev { u16 queue_map_min; u16 queue_map_max; __u32 skb_priority; /* skb priority field */ + unsigned int burst; /* number of duplicated packets to burst */ int node; /* Memory node */ #ifdef CONFIG_XFRM __u8 ipsmode; /* IPSEC mode (config) */ __u8 ipsproto; /* IPSEC type (config) */ + __u32 spi; + struct xfrm_dst xdst; + struct dst_ops dstops; #endif char result[512]; }; @@ -398,7 +445,7 @@ struct pktgen_hdr { }; -static int pg_net_id __read_mostly; +static unsigned int pg_net_id __read_mostly; struct pktgen_net { struct net *net; @@ -408,14 +455,15 @@ struct pktgen_net { }; struct pktgen_thread { - spinlock_t if_lock; /* for list of devices */ + struct mutex if_lock; /* for list of devices */ struct list_head if_list; /* All device here */ struct list_head th_list; struct task_struct *tsk; char result[512]; /* Field for thread to receive "posted" events terminate, - stop ifs etc. */ + * stop ifs etc. + */ u32 control; int cpu; @@ -429,8 +477,7 @@ struct pktgen_thread { #define FIND 0 static const char version[] = - "Packet Generator for packet performance testing. " - "Version: " VERSION "\n"; + "Packet Generator for packet performance testing. Version: " VERSION "\n"; static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); @@ -439,10 +486,11 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, static int pktgen_device_event(struct notifier_block *, unsigned long, void *); static void pktgen_run_all_threads(struct pktgen_net *pn); static void pktgen_reset_all_threads(struct pktgen_net *pn); -static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn); +static void pktgen_stop_all_threads(struct pktgen_net *pn); static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); +static void fill_imix_distribution(struct pktgen_dev *pkt_dev); /* Module parameters, defaults. */ static int pg_count_d __read_mostly = 1000; @@ -470,60 +518,55 @@ static int pgctrl_show(struct seq_file *seq, void *v) static ssize_t pgctrl_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - int err = 0; char data[128]; + size_t max; struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id); - if (!capable(CAP_NET_ADMIN)) { - err = -EPERM; - goto out; - } + if (!capable(CAP_NET_ADMIN)) + return -EPERM; - if (count > sizeof(data)) - count = sizeof(data); + if (count < 1) + return -EINVAL; - if (copy_from_user(data, buf, count)) { - err = -EFAULT; - goto out; - } - data[count - 1] = 0; /* Make string */ + max = min(count, sizeof(data) - 1); + if (copy_from_user(data, buf, max)) + return -EFAULT; - if (!strcmp(data, "stop")) - pktgen_stop_all_threads_ifs(pn); + if (data[max - 1] == '\n') + data[max - 1] = 0; /* strip trailing '\n', terminate string */ + else + data[max] = 0; /* terminate string */ + if (!strcmp(data, "stop")) + pktgen_stop_all_threads(pn); else if (!strcmp(data, "start")) pktgen_run_all_threads(pn); - else if (!strcmp(data, "reset")) pktgen_reset_all_threads(pn); - else - pr_warning("Unknown command: %s\n", data); - - err = count; + return -EINVAL; -out: - return err; + return count; } static int pgctrl_open(struct inode *inode, struct file *file) { - return single_open(file, pgctrl_show, PDE_DATA(inode)); + return single_open(file, pgctrl_show, pde_data(inode)); } -static const struct file_operations pktgen_fops = { - .owner = THIS_MODULE, - .open = pgctrl_open, - .read = seq_read, - .llseek = seq_lseek, - .write = pgctrl_write, - .release = single_release, +static const struct proc_ops pktgen_proc_ops = { + .proc_open = pgctrl_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = pgctrl_write, + .proc_release = single_release, }; static int pktgen_if_show(struct seq_file *seq, void *v) { const struct pktgen_dev *pkt_dev = seq->private; ktime_t stopped; + unsigned int i; u64 idle; seq_printf(seq, @@ -531,6 +574,16 @@ static int pktgen_if_show(struct seq_file *seq, void *v) (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size, pkt_dev->max_pkt_size); + if (pkt_dev->n_imix_entries > 0) { + seq_puts(seq, " imix_weights: "); + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + seq_printf(seq, "%llu,%llu ", + pkt_dev->imix_entries[i].size, + pkt_dev->imix_entries[i].weight); + } + seq_puts(seq, "\n"); + } + seq_printf(seq, " frags: %d delay: %llu clone_skb: %d ifname: %s\n", pkt_dev->nfrags, (unsigned long long) pkt_dev->delay, @@ -561,7 +614,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) " dst_min: %s dst_max: %s\n", pkt_dev->dst_min, pkt_dev->dst_max); seq_printf(seq, - " src_min: %s src_max: %s\n", + " src_min: %s src_max: %s\n", pkt_dev->src_min, pkt_dev->src_max); } @@ -571,12 +624,11 @@ static int pktgen_if_show(struct seq_file *seq, void *v) is_zero_ether_addr(pkt_dev->src_mac) ? pkt_dev->odev->dev_addr : pkt_dev->src_mac); - seq_printf(seq, "dst_mac: "); + seq_puts(seq, "dst_mac: "); seq_printf(seq, "%pM\n", pkt_dev->dst_mac); seq_printf(seq, - " udp_src_min: %d udp_src_max: %d" - " udp_dst_min: %d udp_dst_max: %d\n", + " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, pkt_dev->udp_dst_max); @@ -585,8 +637,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) pkt_dev->src_mac_count, pkt_dev->dst_mac_count); if (pkt_dev->nr_labels) { - unsigned int i; - seq_printf(seq, " mpls: "); + seq_puts(seq, " mpls: "); for (i = 0; i < pkt_dev->nr_labels; i++) seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]), i == pkt_dev->nr_labels-1 ? "\n" : ", "); @@ -608,64 +659,34 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->traffic_class) seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class); + if (pkt_dev->burst > 1) + seq_printf(seq, " burst: %d\n", pkt_dev->burst); + if (pkt_dev->node >= 0) seq_printf(seq, " node: %d\n", pkt_dev->node); - seq_printf(seq, " Flags: "); + if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) + seq_puts(seq, " xmit_mode: netif_receive\n"); + else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) + seq_puts(seq, " xmit_mode: xmit_queue\n"); - if (pkt_dev->flags & F_IPV6) - seq_printf(seq, "IPV6 "); - - if (pkt_dev->flags & F_IPSRC_RND) - seq_printf(seq, "IPSRC_RND "); - - if (pkt_dev->flags & F_IPDST_RND) - seq_printf(seq, "IPDST_RND "); - - if (pkt_dev->flags & F_TXSIZE_RND) - seq_printf(seq, "TXSIZE_RND "); - - if (pkt_dev->flags & F_UDPSRC_RND) - seq_printf(seq, "UDPSRC_RND "); - - if (pkt_dev->flags & F_UDPDST_RND) - seq_printf(seq, "UDPDST_RND "); - - if (pkt_dev->flags & F_MPLS_RND) - seq_printf(seq, "MPLS_RND "); - - if (pkt_dev->flags & F_QUEUE_MAP_RND) - seq_printf(seq, "QUEUE_MAP_RND "); + seq_puts(seq, " Flags: "); - if (pkt_dev->flags & F_QUEUE_MAP_CPU) - seq_printf(seq, "QUEUE_MAP_CPU "); - - if (pkt_dev->cflows) { - if (pkt_dev->flags & F_FLOW_SEQ) - seq_printf(seq, "FLOW_SEQ "); /*in sequence flows*/ - else - seq_printf(seq, "FLOW_RND "); - } + for (i = 0; i < NR_PKT_FLAGS; i++) { + if (i == FLOW_SEQ_SHIFT) + if (!pkt_dev->cflows) + continue; + if (pkt_dev->flags & (1 << i)) { + seq_printf(seq, "%s ", pkt_flag_names[i]); #ifdef CONFIG_XFRM - if (pkt_dev->flags & F_IPSEC_ON) - seq_printf(seq, "IPSEC "); + if (i == IPSEC_SHIFT && pkt_dev->spi) + seq_printf(seq, "spi:%u ", pkt_dev->spi); #endif - - if (pkt_dev->flags & F_MACSRC_RND) - seq_printf(seq, "MACSRC_RND "); - - if (pkt_dev->flags & F_MACDST_RND) - seq_printf(seq, "MACDST_RND "); - - if (pkt_dev->flags & F_VID_RND) - seq_printf(seq, "VID_RND "); - - if (pkt_dev->flags & F_SVID_RND) - seq_printf(seq, "SVID_RND "); - - if (pkt_dev->flags & F_NODE) - seq_printf(seq, "NODE_ALLOC "); + } else if (i == FLOW_SEQ_SHIFT) { + seq_puts(seq, "FLOW_RND "); + } + } seq_puts(seq, "\n"); @@ -679,6 +700,18 @@ static int pktgen_if_show(struct seq_file *seq, void *v) (unsigned long long)pkt_dev->sofar, (unsigned long long)pkt_dev->errors); + if (pkt_dev->n_imix_entries > 0) { + int i; + + seq_puts(seq, " imix_size_counts: "); + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + seq_printf(seq, "%llu,%llu ", + pkt_dev->imix_entries[i].size, + pkt_dev->imix_entries[i].count_so_far); + } + seq_puts(seq, "\n"); + } + seq_printf(seq, " started: %lluus stopped: %lluus idle: %lluus\n", (unsigned long long) ktime_to_us(pkt_dev->started_at), @@ -708,40 +741,43 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->result[0]) seq_printf(seq, "Result: %s\n", pkt_dev->result); else - seq_printf(seq, "Result: Idle\n"); + seq_puts(seq, "Result: Idle\n"); return 0; } -static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, - __u32 *num) +static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen, + __u32 *num) { - int i = 0; + size_t i = 0; + *num = 0; for (; i < maxlen; i++) { int value; char c; - *num <<= 4; + if (get_user(c, &user_buffer[i])) return -EFAULT; value = hex_to_bin(c); - if (value >= 0) + if (value >= 0) { + *num <<= 4; *num |= value; - else + } else { break; + } } return i; } -static int count_trail_chars(const char __user * user_buffer, - unsigned int maxlen) +static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen) { - int i; + size_t i; for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -760,14 +796,15 @@ done: return i; } -static long num_arg(const char __user *user_buffer, unsigned long maxlen, - unsigned long *num) +static ssize_t num_arg(const char __user *user_buffer, size_t maxlen, + unsigned long *num) { - int i; + size_t i; *num = 0; for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; if ((c >= '0') && (c <= '9')) { @@ -779,12 +816,13 @@ static long num_arg(const char __user *user_buffer, unsigned long maxlen, return i; } -static int strn_len(const char __user * user_buffer, unsigned int maxlen) +static ssize_t strn_len(const char __user *user_buffer, size_t maxlen) { - int i; + size_t i; for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -793,8 +831,8 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) case '\r': case '\t': case ' ': + case '=': goto done_str; - break; default: break; } @@ -803,66 +841,176 @@ done_str: return i; } -static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) +/* Parses imix entries from user buffer. + * The user buffer should consist of imix entries separated by spaces + * where each entry consists of size and weight delimited by commas. + * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example. + */ +static ssize_t get_imix_entries(const char __user *buffer, + size_t maxlen, + struct pktgen_dev *pkt_dev) +{ + size_t i = 0, max; + ssize_t len; + char c; + + pkt_dev->n_imix_entries = 0; + + do { + unsigned long weight; + unsigned long size; + + if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES) + return -E2BIG; + + if (i >= maxlen) + return -EINVAL; + + max = min(10, maxlen - i); + len = num_arg(&buffer[i], max, &size); + if (len < 0) + return len; + i += len; + if (i >= maxlen) + return -EINVAL; + if (get_user(c, &buffer[i])) + return -EFAULT; + /* Check for comma between size_i and weight_i */ + if (c != ',') + return -EINVAL; + i++; + if (i >= maxlen) + return -EINVAL; + + if (size < 14 + 20 + 8) + size = 14 + 20 + 8; + + max = min(10, maxlen - i); + len = num_arg(&buffer[i], max, &weight); + if (len < 0) + return len; + if (weight <= 0) + return -EINVAL; + + pkt_dev->imix_entries[pkt_dev->n_imix_entries].size = size; + pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight; + + i += len; + pkt_dev->n_imix_entries++; + + if (i >= maxlen) + break; + if (get_user(c, &buffer[i])) + return -EFAULT; + i++; + } while (c == ' '); + + return i; +} + +static ssize_t get_labels(const char __user *buffer, + size_t maxlen, struct pktgen_dev *pkt_dev) { unsigned int n = 0; + size_t i = 0, max; + ssize_t len; char c; - ssize_t i = 0; - int len; pkt_dev->nr_labels = 0; do { __u32 tmp; - len = hex32_arg(&buffer[i], 8, &tmp); - if (len <= 0) + + if (n >= MAX_MPLS_LABELS) + return -E2BIG; + + if (i >= maxlen) + return -EINVAL; + + max = min(8, maxlen - i); + len = hex32_arg(&buffer[i], max, &tmp); + if (len < 0) return len; + + /* return empty list in case of invalid input or zero value */ + if (len == 0 || tmp == 0) + return maxlen; + pkt_dev->labels[n] = htonl(tmp); if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM) pkt_dev->flags |= F_MPLS_RND; i += len; + n++; + if (i >= maxlen) + break; if (get_user(c, &buffer[i])) return -EFAULT; i++; - n++; - if (n >= MAX_MPLS_LABELS) - return -E2BIG; } while (c == ','); pkt_dev->nr_labels = n; return i; } +static __u32 pktgen_read_flag(const char *f, bool *disable) +{ + __u32 i; + + if (f[0] == '!') { + *disable = true; + f++; + } + + for (i = 0; i < NR_PKT_FLAGS; i++) { + if (!IS_ENABLED(CONFIG_XFRM) && i == IPSEC_SHIFT) + continue; + + /* allow only disabling ipv6 flag */ + if (!*disable && i == IPV6_SHIFT) + continue; + + if (strcmp(f, pkt_flag_names[i]) == 0) + return 1 << i; + } + + if (strcmp(f, "FLOW_RND") == 0) { + *disable = !*disable; + return F_FLOW_SEQ; + } + + return 0; +} + static ssize_t pktgen_if_write(struct file *file, - const char __user * user_buffer, size_t count, - loff_t * offset) + const char __user *user_buffer, size_t count, + loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_dev *pkt_dev = seq->private; - int i, max, len; + size_t i, max; + ssize_t len; char name[16], valstr[32]; unsigned long value = 0; char *pg_result = NULL; - int tmp = 0; char buf[128]; pg_result = &(pkt_dev->result[0]); if (count < 1) { - pr_warning("wrong command format\n"); + pr_warn("wrong command format\n"); return -EINVAL; } max = count; - tmp = count_trail_chars(user_buffer, max); - if (tmp < 0) { - pr_warning("illegal format\n"); - return tmp; + len = count_trail_chars(user_buffer, max); + if (len < 0) { + pr_warn("illegal format\n"); + return len; } - i = tmp; + i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -879,45 +1027,46 @@ static ssize_t pktgen_if_write(struct file *file, i += len; if (debug) { - size_t copy = min_t(size_t, count, 1023); - char tb[copy + 1]; - if (copy_from_user(tb, user_buffer, copy)) - return -EFAULT; - tb[copy] = 0; - pr_debug("%s,%lu buffer -:%s:-\n", - name, (unsigned long)count, tb); + size_t copy = min_t(size_t, count + 1, 1024); + char *tp = strndup_user(user_buffer, copy); + + if (IS_ERR(tp)) + return PTR_ERR(tp); + + pr_debug("%s,%zu buffer -:%s:-\n", name, count, tp); + kfree(tp); } if (!strcmp(name, "min_pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->min_pkt_size) { pkt_dev->min_pkt_size = value; pkt_dev->cur_pkt_size = value; } - sprintf(pg_result, "OK: min_pkt_size=%u", + sprintf(pg_result, "OK: min_pkt_size=%d", pkt_dev->min_pkt_size); return count; } if (!strcmp(name, "max_pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->max_pkt_size) { pkt_dev->max_pkt_size = value; pkt_dev->cur_pkt_size = value; } - sprintf(pg_result, "OK: max_pkt_size=%u", + sprintf(pg_result, "OK: max_pkt_size=%d", pkt_dev->max_pkt_size); return count; } @@ -925,11 +1074,11 @@ static ssize_t pktgen_if_write(struct file *file, /* Shortcut for min = max */ if (!strcmp(name, "pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->min_pkt_size) { @@ -937,37 +1086,51 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->max_pkt_size = value; pkt_dev->cur_pkt_size = value; } - sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size); + sprintf(pg_result, "OK: pkt_size=%d", pkt_dev->min_pkt_size); + return count; + } + + if (!strcmp(name, "imix_weights")) { + if (pkt_dev->clone_skb > 0) + return -EINVAL; + + max = count - i; + len = get_imix_entries(&user_buffer[i], max, pkt_dev); + if (len < 0) + return len; + + fill_imix_distribution(pkt_dev); + return count; } if (!strcmp(name, "debug")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; debug = value; sprintf(pg_result, "OK: debug=%u", debug); return count; } if (!strcmp(name, "frags")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->nfrags = value; - sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags); + sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags); return count; } if (!strcmp(name, "delay")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value == 0x7FFFFFFF) pkt_dev->delay = ULLONG_MAX; else @@ -978,13 +1141,13 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "rate")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (!value) - return len; + return -EINVAL; pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value; if (debug) pr_info("Delay set at: %llu ns\n", pkt_dev->delay); @@ -993,13 +1156,13 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "ratep")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (!value) - return len; + return -EINVAL; pkt_dev->delay = NSEC_PER_SEC/value; if (debug) pr_info("Delay set at: %llu ns\n", pkt_dev->delay); @@ -1008,11 +1171,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_src_min")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_src_min) { pkt_dev->udp_src_min = value; pkt_dev->cur_udp_src = value; @@ -1021,11 +1184,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_dst_min")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_dst_min) { pkt_dev->udp_dst_min = value; pkt_dev->cur_udp_dst = value; @@ -1034,11 +1197,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_src_max")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_src_max) { pkt_dev->udp_src_max = value; pkt_dev->cur_udp_src = value; @@ -1047,11 +1210,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_dst_max")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_dst_max) { pkt_dev->udp_dst_max = value; pkt_dev->cur_udp_dst = value; @@ -1060,35 +1223,43 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "clone_skb")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; + /* clone_skb is not supported for netif_receive xmit_mode and + * IMIX mode. + */ if ((value > 0) && - (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) - return -ENOTSUPP; - i += len; + ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || + !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) + return -EOPNOTSUPP; + if (value > 0 && (pkt_dev->n_imix_entries > 0 || + !(pkt_dev->flags & F_SHARED))) + return -EINVAL; + pkt_dev->clone_skb = value; sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb); return count; } if (!strcmp(name, "count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->count = value; sprintf(pg_result, "OK: count=%llu", (unsigned long long)pkt_dev->count); return count; } if (!strcmp(name, "src_mac_count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (pkt_dev->src_mac_count != value) { pkt_dev->src_mac_count = value; pkt_dev->cur_src_mac_offset = 0; @@ -1098,11 +1269,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "dst_mac_count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (pkt_dev->dst_mac_count != value) { pkt_dev->dst_mac_count = value; pkt_dev->cur_dst_mac_offset = 0; @@ -1111,12 +1282,30 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->dst_mac_count); return count; } - if (!strcmp(name, "node")) { - len = num_arg(&user_buffer[i], 10, &value); + if (!strcmp(name, "burst")) { + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; + if ((value > 1) && + ((pkt_dev->xmit_mode == M_QUEUE_XMIT) || + ((pkt_dev->xmit_mode == M_START_XMIT) && + (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))))) + return -EOPNOTSUPP; + + if (value > 1 && !(pkt_dev->flags & F_SHARED)) + return -EINVAL; + + pkt_dev->burst = value < 1 ? 1 : value; + sprintf(pg_result, "OK: burst=%u", pkt_dev->burst); + return count; + } + if (!strcmp(name, "node")) { + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); + if (len < 0) + return len; if (node_possible(value)) { pkt_dev->node = value; @@ -1125,122 +1314,105 @@ static ssize_t pktgen_if_write(struct file *file, put_page(pkt_dev->page); pkt_dev->page = NULL; } - } - else + } else { sprintf(pg_result, "ERROR: node not possible"); + } return count; } - if (!strcmp(name, "flag")) { + if (!strcmp(name, "xmit_mode")) { char f[32]; - memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; + memset(f, 0, sizeof(f)); if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; - if (strcmp(f, "IPSRC_RND") == 0) - pkt_dev->flags |= F_IPSRC_RND; - - else if (strcmp(f, "!IPSRC_RND") == 0) - pkt_dev->flags &= ~F_IPSRC_RND; - - else if (strcmp(f, "TXSIZE_RND") == 0) - pkt_dev->flags |= F_TXSIZE_RND; - - else if (strcmp(f, "!TXSIZE_RND") == 0) - pkt_dev->flags &= ~F_TXSIZE_RND; - else if (strcmp(f, "IPDST_RND") == 0) - pkt_dev->flags |= F_IPDST_RND; + if (strcmp(f, "start_xmit") == 0) { + pkt_dev->xmit_mode = M_START_XMIT; + } else if (strcmp(f, "netif_receive") == 0) { + /* clone_skb set earlier, not supported in this mode */ + if (pkt_dev->clone_skb > 0) + return -EOPNOTSUPP; - else if (strcmp(f, "!IPDST_RND") == 0) - pkt_dev->flags &= ~F_IPDST_RND; + pkt_dev->xmit_mode = M_NETIF_RECEIVE; - else if (strcmp(f, "UDPSRC_RND") == 0) - pkt_dev->flags |= F_UDPSRC_RND; - - else if (strcmp(f, "!UDPSRC_RND") == 0) - pkt_dev->flags &= ~F_UDPSRC_RND; - - else if (strcmp(f, "UDPDST_RND") == 0) - pkt_dev->flags |= F_UDPDST_RND; - - else if (strcmp(f, "!UDPDST_RND") == 0) - pkt_dev->flags &= ~F_UDPDST_RND; - - else if (strcmp(f, "MACSRC_RND") == 0) - pkt_dev->flags |= F_MACSRC_RND; - - else if (strcmp(f, "!MACSRC_RND") == 0) - pkt_dev->flags &= ~F_MACSRC_RND; - - else if (strcmp(f, "MACDST_RND") == 0) - pkt_dev->flags |= F_MACDST_RND; - - else if (strcmp(f, "!MACDST_RND") == 0) - pkt_dev->flags &= ~F_MACDST_RND; - - else if (strcmp(f, "MPLS_RND") == 0) - pkt_dev->flags |= F_MPLS_RND; - - else if (strcmp(f, "!MPLS_RND") == 0) - pkt_dev->flags &= ~F_MPLS_RND; - - else if (strcmp(f, "VID_RND") == 0) - pkt_dev->flags |= F_VID_RND; - - else if (strcmp(f, "!VID_RND") == 0) - pkt_dev->flags &= ~F_VID_RND; - - else if (strcmp(f, "SVID_RND") == 0) - pkt_dev->flags |= F_SVID_RND; - - else if (strcmp(f, "!SVID_RND") == 0) - pkt_dev->flags &= ~F_SVID_RND; - - else if (strcmp(f, "FLOW_SEQ") == 0) - pkt_dev->flags |= F_FLOW_SEQ; - - else if (strcmp(f, "QUEUE_MAP_RND") == 0) - pkt_dev->flags |= F_QUEUE_MAP_RND; - - else if (strcmp(f, "!QUEUE_MAP_RND") == 0) - pkt_dev->flags &= ~F_QUEUE_MAP_RND; + /* make sure new packet is allocated every time + * pktgen_xmit() is called + */ + pkt_dev->last_ok = 1; + } else if (strcmp(f, "queue_xmit") == 0) { + pkt_dev->xmit_mode = M_QUEUE_XMIT; + pkt_dev->last_ok = 1; + } else { + sprintf(pg_result, + "xmit_mode -:%s:- unknown\nAvailable modes: %s", + f, "start_xmit, netif_receive\n"); + return count; + } + sprintf(pg_result, "OK: xmit_mode=%s", f); + return count; + } + if (!strcmp(name, "flag")) { + bool disable = false; + __u32 flag; + char f[32]; + char *end; - else if (strcmp(f, "QUEUE_MAP_CPU") == 0) - pkt_dev->flags |= F_QUEUE_MAP_CPU; + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); + if (len < 0) + return len; - else if (strcmp(f, "!QUEUE_MAP_CPU") == 0) - pkt_dev->flags &= ~F_QUEUE_MAP_CPU; -#ifdef CONFIG_XFRM - else if (strcmp(f, "IPSEC") == 0) - pkt_dev->flags |= F_IPSEC_ON; -#endif + memset(f, 0, 32); + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; - else if (strcmp(f, "!IPV6") == 0) - pkt_dev->flags &= ~F_IPV6; + flag = pktgen_read_flag(f, &disable); + if (flag) { + if (disable) { + /* If "clone_skb", or "burst" parameters are + * configured, it means that the skb still + * needs to be referenced by the pktgen, so + * the skb must be shared. + */ + if (flag == F_SHARED && (pkt_dev->clone_skb || + pkt_dev->burst > 1)) + return -EINVAL; + pkt_dev->flags &= ~flag; + } else { + pkt_dev->flags |= flag; + } - else if (strcmp(f, "NODE_ALLOC") == 0) - pkt_dev->flags |= F_NODE; + sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); + return count; + } - else if (strcmp(f, "!NODE_ALLOC") == 0) - pkt_dev->flags &= ~F_NODE; + /* Unknown flag */ + end = pkt_dev->result + sizeof(pkt_dev->result); + pg_result += sprintf(pg_result, + "Flag -:%s:- unknown\n" + "Available flags, (prepend ! to un-set flag):\n", f); - else { - sprintf(pg_result, - "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", - f, - "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " - "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n"); - return count; + for (int n = 0; n < NR_PKT_FLAGS && pg_result < end; n++) { + if (!IS_ENABLED(CONFIG_XFRM) && n == IPSEC_SHIFT) + continue; + pg_result += snprintf(pg_result, end - pg_result, + "%s, ", pkt_flag_names[n]); + } + if (!WARN_ON_ONCE(pg_result >= end)) { + /* Remove the comma and whitespace at the end */ + *(pg_result - 2) = '\0'; } - sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); + return count; } if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1); + max = min(sizeof(pkt_dev->dst_min) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1248,41 +1420,39 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_min) != 0) { - memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); - strncpy(pkt_dev->dst_min, buf, len); + strscpy_pad(pkt_dev->dst_min, buf); pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); pkt_dev->cur_daddr = pkt_dev->daddr_min; } if (debug) pr_debug("dst_min set to: %s\n", pkt_dev->dst_min); - i += len; + sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min); return count; } if (!strcmp(name, "dst_max")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1); + max = min(sizeof(pkt_dev->dst_max) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; - if (copy_from_user(buf, &user_buffer[i], len)) return -EFAULT; - buf[len] = 0; if (strcmp(buf, pkt_dev->dst_max) != 0) { - memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); - strncpy(pkt_dev->dst_max, buf, len); + strscpy_pad(pkt_dev->dst_max, buf); pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); pkt_dev->cur_daddr = pkt_dev->daddr_max; } if (debug) pr_debug("dst_max set to: %s\n", pkt_dev->dst_max); - i += len; + sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max); return count; } if (!strcmp(name, "dst6")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1300,12 +1470,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6 set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6=%s", buf); return count; } if (!strcmp(name, "dst6_min")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1322,12 +1492,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6_min set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6_min=%s", buf); return count; } if (!strcmp(name, "dst6_max")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1343,12 +1513,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6_max set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6_max=%s", buf); return count; } if (!strcmp(name, "src6")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1366,12 +1536,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("src6 set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: src6=%s", buf); return count; } if (!strcmp(name, "src_min")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1); + max = min(sizeof(pkt_dev->src_min) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1379,19 +1549,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_min) != 0) { - memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); - strncpy(pkt_dev->src_min, buf, len); + strscpy_pad(pkt_dev->src_min, buf); pkt_dev->saddr_min = in_aton(pkt_dev->src_min); pkt_dev->cur_saddr = pkt_dev->saddr_min; } if (debug) pr_debug("src_min set to: %s\n", pkt_dev->src_min); - i += len; + sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min); return count; } if (!strcmp(name, "src_max")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1); + max = min(sizeof(pkt_dev->src_max) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1399,19 +1569,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_max) != 0) { - memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); - strncpy(pkt_dev->src_max, buf, len); + strscpy_pad(pkt_dev->src_max, buf); pkt_dev->saddr_max = in_aton(pkt_dev->src_max); pkt_dev->cur_saddr = pkt_dev->saddr_max; } if (debug) pr_debug("src_max set to: %s\n", pkt_dev->src_max); - i += len; + sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max); return count; } if (!strcmp(name, "dst_mac")) { - len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + max = min(sizeof(valstr) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1422,13 +1592,14 @@ static ssize_t pktgen_if_write(struct file *file, if (!mac_pton(valstr, pkt_dev->dst_mac)) return -EINVAL; /* Set up Dest MAC */ - memcpy(&pkt_dev->hh[0], pkt_dev->dst_mac, ETH_ALEN); + ether_addr_copy(&pkt_dev->hh[0], pkt_dev->dst_mac); sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac); return count; } if (!strcmp(name, "src_mac")) { - len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + max = min(sizeof(valstr) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1439,7 +1610,7 @@ static ssize_t pktgen_if_write(struct file *file, if (!mac_pton(valstr, pkt_dev->src_mac)) return -EINVAL; /* Set up Src MAC */ - memcpy(&pkt_dev->hh[6], pkt_dev->src_mac, ETH_ALEN); + ether_addr_copy(&pkt_dev->hh[6], pkt_dev->src_mac); sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac); return count; @@ -1452,11 +1623,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "flows")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value > MAX_CFLOWS) value = MAX_CFLOWS; @@ -1464,35 +1635,46 @@ static ssize_t pktgen_if_write(struct file *file, sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows); return count; } +#ifdef CONFIG_XFRM + if (!strcmp(name, "spi")) { + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); + if (len < 0) + return len; + pkt_dev->spi = value; + sprintf(pg_result, "OK: spi=%u", pkt_dev->spi); + return count; + } +#endif if (!strcmp(name, "flowlen")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->lflow = value; sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow); return count; } if (!strcmp(name, "queue_map_min")) { - len = num_arg(&user_buffer[i], 5, &value); + max = min(5, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->queue_map_min = value; sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min); return count; } if (!strcmp(name, "queue_map_max")) { - len = num_arg(&user_buffer[i], 5, &value); + max = min(5, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->queue_map_max = value; sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max); return count; @@ -1501,10 +1683,11 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "mpls")) { unsigned int n, cnt; - len = get_labels(&user_buffer[i], pkt_dev); + max = count - i; + len = get_labels(&user_buffer[i], max, pkt_dev); if (len < 0) return len; - i += len; + cnt = sprintf(pg_result, "OK: mpls="); for (n = 0; n < pkt_dev->nr_labels; n++) cnt += sprintf(pg_result + cnt, @@ -1522,11 +1705,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_id")) { - len = num_arg(&user_buffer[i], 4, &value); + max = min(4, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value <= 4095) { pkt_dev->vlan_id = value; /* turn on VLAN */ @@ -1549,11 +1732,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_p")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) { pkt_dev->vlan_p = value; sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p); @@ -1564,11 +1747,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_cfi")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) { pkt_dev->vlan_cfi = value; sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi); @@ -1579,11 +1762,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_id")) { - len = num_arg(&user_buffer[i], 4, &value); + max = min(4, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) { pkt_dev->svlan_id = value; /* turn on SVLAN */ @@ -1606,11 +1789,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_p")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) { pkt_dev->svlan_p = value; sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p); @@ -1621,11 +1804,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_cfi")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) { pkt_dev->svlan_cfi = value; sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi); @@ -1636,12 +1819,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "tos")) { - __u32 tmp_value = 0; - len = hex32_arg(&user_buffer[i], 2, &tmp_value); + __u32 tmp_value; + + max = min(2, count - i); + len = hex32_arg(&user_buffer[i], max, &tmp_value); if (len < 0) return len; - i += len; if (len == 2) { pkt_dev->tos = tmp_value; sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos); @@ -1652,12 +1836,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "traffic_class")) { - __u32 tmp_value = 0; - len = hex32_arg(&user_buffer[i], 2, &tmp_value); + __u32 tmp_value; + + max = min(2, count - i); + len = hex32_arg(&user_buffer[i], max, &tmp_value); if (len < 0) return len; - i += len; if (len == 2) { pkt_dev->traffic_class = tmp_value; sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class); @@ -1668,11 +1853,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "skb_priority")) { - len = num_arg(&user_buffer[i], 9, &value); + max = min(9, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->skb_priority = value; sprintf(pg_result, "OK: skb_priority=%i", pkt_dev->skb_priority); @@ -1685,16 +1870,15 @@ static ssize_t pktgen_if_write(struct file *file, static int pktgen_if_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_if_show, PDE_DATA(inode)); + return single_open(file, pktgen_if_show, pde_data(inode)); } -static const struct file_operations pktgen_if_fops = { - .owner = THIS_MODULE, - .open = pktgen_if_open, - .read = seq_read, - .llseek = seq_lseek, - .write = pktgen_if_write, - .release = single_release, +static const struct proc_ops pktgen_if_proc_ops = { + .proc_open = pktgen_if_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = pktgen_if_write, + .proc_release = single_release, }; static int pktgen_thread_show(struct seq_file *seq, void *v) @@ -1704,36 +1888,37 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) BUG_ON(!t); - seq_printf(seq, "Running: "); + seq_puts(seq, "Running: "); - if_lock(t); - list_for_each_entry(pkt_dev, &t->if_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) if (pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odevname); - seq_printf(seq, "\nStopped: "); + seq_puts(seq, "\nStopped: "); - list_for_each_entry(pkt_dev, &t->if_list, list) + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) if (!pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odevname); if (t->result[0]) seq_printf(seq, "\nResult: %s\n", t->result); else - seq_printf(seq, "\nResult: NA\n"); + seq_puts(seq, "\nResult: NA\n"); - if_unlock(t); + rcu_read_unlock(); return 0; } static ssize_t pktgen_thread_write(struct file *file, - const char __user * user_buffer, - size_t count, loff_t * offset) + const char __user *user_buffer, + size_t count, loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_thread *t = seq->private; - int i, max, len, ret; + size_t i, max; + ssize_t len, ret; char name[40]; char *pg_result; @@ -1750,8 +1935,8 @@ static ssize_t pktgen_thread_write(struct file *file, i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1780,15 +1965,17 @@ static ssize_t pktgen_thread_write(struct file *file, if (!strcmp(name, "add_device")) { char f[32]; + memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) { ret = len; goto out; } if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; + mutex_lock(&pktgen_thread_lock); ret = pktgen_add_device(t, f); mutex_unlock(&pktgen_thread_lock); @@ -1823,16 +2010,15 @@ out: static int pktgen_thread_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_thread_show, PDE_DATA(inode)); + return single_open(file, pktgen_thread_show, pde_data(inode)); } -static const struct file_operations pktgen_thread_fops = { - .owner = THIS_MODULE, - .open = pktgen_thread_open, - .read = seq_read, - .llseek = seq_lseek, - .write = pktgen_thread_write, - .release = single_release, +static const struct proc_ops pktgen_thread_proc_ops = { + .proc_open = pktgen_thread_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = pktgen_thread_write, + .proc_release = single_release, }; /* Think find or remove for NN */ @@ -1847,10 +2033,8 @@ static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn, pkt_dev = pktgen_find_dev(t, ifname, exact); if (pkt_dev) { if (remove) { - if_lock(t); pkt_dev->removal_mark = 1; t->control |= T_REMDEV; - if_unlock(t); } break; } @@ -1897,9 +2081,12 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d { struct pktgen_thread *t; + mutex_lock(&pktgen_thread_lock); + list_for_each_entry(t, &pn->pktgen_threads, th_list) { struct pktgen_dev *pkt_dev; + if_lock(t); list_for_each_entry(pkt_dev, &t->if_list, list) { if (pkt_dev->odev != dev) continue; @@ -1908,14 +2095,16 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d pkt_dev->entry = proc_create_data(dev->name, 0600, pn->proc_dir, - &pktgen_if_fops, + &pktgen_if_proc_ops, pkt_dev); if (!pkt_dev->entry) pr_err("can't move proc entry for '%s'\n", dev->name); break; } + if_unlock(t); } + mutex_unlock(&pktgen_thread_lock); } static int pktgen_device_event(struct notifier_block *unused, @@ -1973,7 +2162,7 @@ static int pktgen_setup_dev(const struct pktgen_net *pn, /* Clean old setups */ if (pkt_dev->odev) { - dev_put(pkt_dev->odev); + netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker); pkt_dev->odev = NULL; } @@ -1983,14 +2172,15 @@ static int pktgen_setup_dev(const struct pktgen_net *pn, return -ENODEV; } - if (odev->type != ARPHRD_ETHER) { - pr_err("not an ethernet device: \"%s\"\n", ifname); + if (odev->type != ARPHRD_ETHER && odev->type != ARPHRD_LOOPBACK) { + pr_err("not an ethernet or loopback device: \"%s\"\n", ifname); err = -EINVAL; } else if (!netif_running(odev)) { pr_err("device is down: \"%s\"\n", ifname); err = -ENETDOWN; } else { pkt_dev->odev = odev; + netdev_tracker_alloc(odev, &pkt_dev->dev_tracker, GFP_KERNEL); return 0; } @@ -2016,25 +2206,25 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) ntxq = pkt_dev->odev->real_num_tx_queues; if (ntxq <= pkt_dev->queue_map_min) { - pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", - pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, - pkt_dev->odevname); + pr_warn("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); pkt_dev->queue_map_min = (ntxq ?: 1) - 1; } if (pkt_dev->queue_map_max >= ntxq) { - pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", - pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, - pkt_dev->odevname); + pr_warn("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); pkt_dev->queue_map_max = (ntxq ?: 1) - 1; } /* Default to the interface's mac if not explicitly set. */ if (is_zero_ether_addr(pkt_dev->src_mac)) - memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN); + ether_addr_copy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr); /* Set up Dest MAC */ - memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); + ether_addr_copy(&(pkt_dev->hh[0]), pkt_dev->dst_mac); if (pkt_dev->flags & F_IPV6) { int i, set = 0, err = 1; @@ -2047,7 +2237,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) + pkt_dev->pkt_overhead; } - for (i = 0; i < IN6_ADDR_HSIZE; i++) + for (i = 0; i < sizeof(struct in6_addr); i++) if (pkt_dev->cur_in6_saddr.s6_addr[i]) { set = 1; break; @@ -2098,9 +2288,11 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) rcu_read_lock(); in_dev = __in_dev_get_rcu(pkt_dev->odev); if (in_dev) { - if (in_dev->ifa_list) { - pkt_dev->saddr_min = - in_dev->ifa_list->ifa_address; + const struct in_ifaddr *ifa; + + ifa = rcu_dereference(in_dev->ifa_list); + if (ifa) { + pkt_dev->saddr_min = ifa->ifa_address; pkt_dev->saddr_max = pkt_dev->saddr_min; } } @@ -2134,14 +2326,12 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) s64 remaining; struct hrtimer_sleeper t; - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_set_expires(&t.timer, spin_until); remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); - if (remaining <= 0) { - pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); - return; - } + if (remaining <= 0) + goto out; start_time = ktime_get(); if (remaining < 100000) { @@ -2150,13 +2340,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) end_time = ktime_get(); } while (ktime_compare(end_time, spin_until) < 0); } else { - /* see do_nanosleep */ - hrtimer_init_sleeper(&t, current); do { set_current_state(TASK_INTERRUPTIBLE); - hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&t.timer)) - t.task = NULL; + hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_ABS); if (likely(t.task)) schedule(); @@ -2168,7 +2354,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) } pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); +out: pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); + destroy_hrtimer_on_stack(&t.timer); } static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) @@ -2198,7 +2386,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) pkt_dev->curfl = 0; /*reset */ } } else { - flow = prandom_u32() % pkt_dev->cflows; + flow = get_random_u32_below(pkt_dev->cflows); pkt_dev->curfl = flow; if (pkt_dev->flows[flow].count > pkt_dev->lflow) { @@ -2211,23 +2399,32 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) } -#ifdef CONFIG_XFRM /* If there was already an IPSEC SA, we keep it as is, else * we go look for it ... -*/ + */ #define DUMMY_MARK 0 static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) { +#ifdef CONFIG_XFRM struct xfrm_state *x = pkt_dev->flows[flow].x; struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); + if (!x) { - /*slow path: we dont already have xfrm_state*/ - x = xfrm_stateonly_find(pn->net, DUMMY_MARK, - (xfrm_address_t *)&pkt_dev->cur_daddr, - (xfrm_address_t *)&pkt_dev->cur_saddr, - AF_INET, - pkt_dev->ipsmode, - pkt_dev->ipsproto, 0); + + if (pkt_dev->spi) { + /* We need as quick as possible to find the right SA + * Searching with minimum criteria to achieve, this. + */ + x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); + } else { + /* slow path: we don't already have xfrm_state */ + x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0, + (xfrm_address_t *)&pkt_dev->cur_daddr, + (xfrm_address_t *)&pkt_dev->cur_saddr, + AF_INET, + pkt_dev->ipsmode, + pkt_dev->ipsproto, 0); + } if (x) { pkt_dev->flows[flow].x = x; set_pkt_overhead(pkt_dev); @@ -2235,21 +2432,19 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) } } -} #endif +} static void set_cur_queue_map(struct pktgen_dev *pkt_dev) { - if (pkt_dev->flags & F_QUEUE_MAP_CPU) pkt_dev->cur_queue_map = smp_processor_id(); else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { - t = prandom_u32() % - (pkt_dev->queue_map_max - - pkt_dev->queue_map_min + 1) - + pkt_dev->queue_map_min; + t = get_random_u32_inclusive(pkt_dev->queue_map_min, + pkt_dev->queue_map_max); } else { t = pkt_dev->cur_queue_map + 1; if (t > pkt_dev->queue_map_max) @@ -2278,7 +2473,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACSRC_RND) - mc = prandom_u32() % pkt_dev->src_mac_count; + mc = get_random_u32_below(pkt_dev->src_mac_count); else { mc = pkt_dev->cur_src_mac_offset++; if (pkt_dev->cur_src_mac_offset >= @@ -2304,7 +2499,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACDST_RND) - mc = prandom_u32() % pkt_dev->dst_mac_count; + mc = get_random_u32_below(pkt_dev->dst_mac_count); else { mc = pkt_dev->cur_dst_mac_offset++; @@ -2328,26 +2523,26 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_MPLS_RND) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | - ((__force __be32)prandom_u32() & + ((__force __be32)get_random_u32() & htonl(0x000fffff)); } if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { - pkt_dev->vlan_id = prandom_u32() & (4096 - 1); + pkt_dev->vlan_id = get_random_u32_below(4096); } if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { - pkt_dev->svlan_id = prandom_u32() & (4096 - 1); + pkt_dev->svlan_id = get_random_u32_below(4096); } if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { if (pkt_dev->flags & F_UDPSRC_RND) - pkt_dev->cur_udp_src = prandom_u32() % - (pkt_dev->udp_src_max - pkt_dev->udp_src_min) - + pkt_dev->udp_src_min; + pkt_dev->cur_udp_src = get_random_u32_inclusive(pkt_dev->udp_src_min, + pkt_dev->udp_src_max - 1); else { pkt_dev->cur_udp_src++; @@ -2358,9 +2553,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { if (pkt_dev->flags & F_UDPDST_RND) { - pkt_dev->cur_udp_dst = prandom_u32() % - (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) - + pkt_dev->udp_dst_min; + pkt_dev->cur_udp_dst = get_random_u32_inclusive(pkt_dev->udp_dst_min, + pkt_dev->udp_dst_max - 1); } else { pkt_dev->cur_udp_dst++; if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max) @@ -2374,8 +2568,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) imx = ntohl(pkt_dev->saddr_max); if (imn < imx) { __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) - t = prandom_u32() % (imx - imn) + imn; + t = get_random_u32_inclusive(imn, imx - 1); else { t = ntohl(pkt_dev->cur_saddr); t++; @@ -2394,11 +2589,11 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; __be32 s; + if (pkt_dev->flags & F_IPDST_RND) { do { - t = prandom_u32() % - (imx - imn) + imn; + t = get_random_u32_inclusive(imn, imx - 1); s = htonl(t); } while (ipv4_is_loopback(s) || ipv4_is_multicast(s) || @@ -2419,10 +2614,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->flows[flow].flags |= F_INIT; pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr; -#ifdef CONFIG_XFRM - if (pkt_dev->flags & F_IPSEC_ON) + if (pkt_dev->flags & F_IPSEC) get_ipsec_sa(pkt_dev, flow); -#endif pkt_dev->nflows++; } } @@ -2435,7 +2628,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < 4; i++) { pkt_dev->cur_in6_daddr.s6_addr32[i] = - (((__force __be32)prandom_u32() | + (((__force __be32)get_random_u32() | pkt_dev->min_in6_daddr.s6_addr32[i]) & pkt_dev->max_in6_daddr.s6_addr32[i]); } @@ -2444,16 +2637,24 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { - t = prandom_u32() % - (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size) - + pkt_dev->min_pkt_size; + t = get_random_u32_inclusive(pkt_dev->min_pkt_size, + pkt_dev->max_pkt_size - 1); } else { t = pkt_dev->cur_pkt_size + 1; if (t > pkt_dev->max_pkt_size) t = pkt_dev->min_pkt_size; } pkt_dev->cur_pkt_size = t; + } else if (pkt_dev->n_imix_entries > 0) { + struct imix_pkt *entry; + __u32 t = get_random_u32_below(IMIX_PRECISION); + __u8 entry_index = pkt_dev->imix_distribution[t]; + + entry = &pkt_dev->imix_entries[entry_index]; + entry->count_so_far++; + pkt_dev->cur_pkt_size = entry->size; } set_cur_queue_map(pkt_dev); @@ -2461,33 +2662,76 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->flows[flow].count++; } +static void fill_imix_distribution(struct pktgen_dev *pkt_dev) +{ + int cumulative_probabilites[MAX_IMIX_ENTRIES]; + int j = 0; + __u64 cumulative_prob = 0; + __u64 total_weight = 0; + int i = 0; + + for (i = 0; i < pkt_dev->n_imix_entries; i++) + total_weight += pkt_dev->imix_entries[i].weight; + + /* Fill cumulative_probabilites with sum of normalized probabilities */ + for (i = 0; i < pkt_dev->n_imix_entries - 1; i++) { + cumulative_prob += div64_u64(pkt_dev->imix_entries[i].weight * + IMIX_PRECISION, + total_weight); + cumulative_probabilites[i] = cumulative_prob; + } + cumulative_probabilites[pkt_dev->n_imix_entries - 1] = 100; + + for (i = 0; i < IMIX_PRECISION; i++) { + if (i == cumulative_probabilites[j]) + j++; + pkt_dev->imix_distribution[i] = j; + } +} #ifdef CONFIG_XFRM +static u32 pktgen_dst_metrics[RTAX_MAX + 1] = { + + [RTAX_HOPLIMIT] = 0x5, /* Set a static hoplimit */ +}; + static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int err = 0; + struct net *net = dev_net(pkt_dev->odev); if (!x) return 0; /* XXX: we dont support tunnel mode for now until - * we resolve the dst issue */ - if (x->props.mode != XFRM_MODE_TRANSPORT) + * we resolve the dst issue + */ + if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0)) return 0; - spin_lock(&x->lock); - - err = x->outer_mode->output(x, skb); - if (err) + /* But when user specify an valid SPI, transformation + * supports both transport/tunnel mode + ESP/AH type. + */ + if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) + skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF; + + rcu_read_lock_bh(); + err = pktgen_xfrm_outer_mode_output(x, skb); + rcu_read_unlock_bh(); + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR); goto error; + } err = x->type->output(x, skb); - if (err) + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR); goto error; - + } + spin_lock_bh(&x->lock); x->curlft.bytes += skb->len; x->curlft.packets++; + spin_unlock_bh(&x->lock); error: - spin_unlock(&x->lock); return err; } @@ -2496,8 +2740,10 @@ static void free_SAs(struct pktgen_dev *pkt_dev) if (pkt_dev->cflows) { /* let go of the SAs if we have them */ int i; + for (i = 0; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; + if (x) { xfrm_state_put(x); pkt_dev->flows[i].x = NULL; @@ -2509,12 +2755,15 @@ static void free_SAs(struct pktgen_dev *pkt_dev) static int process_ipsec(struct pktgen_dev *pkt_dev, struct sk_buff *skb, __be16 protocol) { - if (pkt_dev->flags & F_IPSEC_ON) { + if (pkt_dev->flags & F_IPSEC) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; + if (x) { + struct ethhdr *eth; + struct iphdr *iph; int ret; - __u8 *eth; + nhead = x->props.header_len - skb_headroom(skb); if (nhead > 0) { ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC); @@ -2533,9 +2782,14 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, goto err; } /* restore ll */ - eth = (__u8 *) skb_push(skb, ETH_HLEN); - memcpy(eth, pkt_dev->hh, 12); - *(u16 *) ð[12] = protocol; + eth = skb_push(skb, ETH_HLEN); + memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN); + eth->h_proto = protocol; + + /* Update IPv4 header len as well as checksum value */ + iph = ip_hdr(skb); + iph->tot_len = htons(skb->len - ETH_HLEN); + ip_send_check(iph); } } return 1; @@ -2548,6 +2802,7 @@ err: static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; @@ -2564,14 +2819,14 @@ static inline __be16 build_tci(unsigned int id, unsigned int cfi, static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, int datalen) { - struct timeval timestamp; + struct timespec64 timestamp; struct pktgen_hdr *pgh; - pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh)); + pgh = skb_put(skb, sizeof(*pgh)); datalen -= sizeof(*pgh); if (pkt_dev->nfrags <= 0) { - memset(skb_put(skb, datalen), 0, datalen); + skb_put_zero(skb, datalen); } else { int frags = pkt_dev->nfrags; int i, len; @@ -2582,13 +2837,12 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, frags = MAX_SKB_FRAGS; len = datalen - frags * PAGE_SIZE; if (len > 0) { - memset(skb_put(skb, len), 0, len); + skb_put_zero(skb, len); datalen = frags * PAGE_SIZE; } i = 0; - frag_len = (datalen/frags) < PAGE_SIZE ? - (datalen/frags) : PAGE_SIZE; + frag_len = min_t(int, datalen / frags, PAGE_SIZE); while (datalen > 0) { if (unlikely(!pkt_dev->page)) { int node = numa_node_id(); @@ -2600,14 +2854,16 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, break; } get_page(pkt_dev->page); - skb_frag_set_page(skb, i, pkt_dev->page); - skb_shinfo(skb)->frags[i].page_offset = 0; + /*last fragment, fill rest of data*/ if (i == (frags - 1)) - skb_frag_size_set(&skb_shinfo(skb)->frags[i], - (datalen < PAGE_SIZE ? datalen : PAGE_SIZE)); + skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], + pkt_dev->page, 0, + min(datalen, PAGE_SIZE)); else - skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len); + skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], + pkt_dev->page, 0, frag_len); + datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]); skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]); skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]); @@ -2622,19 +2878,32 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, pgh->pgh_magic = htonl(PKTGEN_MAGIC); pgh->seq_num = htonl(pkt_dev->seq_num); - do_gettimeofday(×tamp); - pgh->tv_sec = htonl(timestamp.tv_sec); - pgh->tv_usec = htonl(timestamp.tv_usec); + if (pkt_dev->flags & F_NO_TIMESTAMP) { + pgh->tv_sec = 0; + pgh->tv_usec = 0; + } else { + /* + * pgh->tv_sec wraps in y2106 when interpreted as unsigned + * as done by wireshark, or y2038 when interpreted as signed. + * This is probably harmless, but if anyone wants to improve + * it, we could introduce a variant that puts 64-bit nanoseconds + * into the respective header bytes. + * This would also be slightly faster to read. + */ + ktime_get_real_ts64(×tamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_nsec / NSEC_PER_USEC); + } } static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, - struct pktgen_dev *pkt_dev, - unsigned int extralen) + struct pktgen_dev *pkt_dev) { + unsigned int extralen = LL_RESERVED_SPACE(dev); struct sk_buff *skb = NULL; - unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen + - pkt_dev->pkt_overhead; + unsigned int size; + size = pkt_dev->cur_pkt_size + 64 + extralen + pkt_dev->pkt_overhead; if (pkt_dev->flags & F_NODE) { int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id(); @@ -2644,9 +2913,13 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, skb->dev = dev; } } else { - skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); + skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } + /* the caller pre-fetches from skb->data and reserves for the mac hdr */ + if (likely(skb)) + skb_reserve(skb, extralen - 16); + return skb; } @@ -2678,51 +2951,50 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, mod_cur_headers(pkt_dev); queue_map = pkt_dev->cur_queue_map; - datalen = (odev->hard_header_len + 16) & ~0xf; - - skb = pktgen_alloc_skb(odev, pkt_dev, datalen); + skb = pktgen_alloc_skb(odev, pkt_dev); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; } prefetchw(skb->data); - skb_reserve(skb, datalen); + skb_reserve(skb, 16); /* Reserve for ethernet and IP header */ - eth = (__u8 *) skb_push(skb, 14); - mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32)); + eth = skb_push(skb, 14); + mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32)); if (pkt_dev->nr_labels) mpls_push(mpls, pkt_dev); if (pkt_dev->vlan_id != 0xffff) { if (pkt_dev->svlan_id != 0xffff) { - svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + svlan_tci = skb_put(skb, sizeof(__be16)); *svlan_tci = build_tci(pkt_dev->svlan_id, pkt_dev->svlan_cfi, pkt_dev->svlan_p); - svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + svlan_encapsulated_proto = skb_put(skb, + sizeof(__be16)); *svlan_encapsulated_proto = htons(ETH_P_8021Q); } - vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + vlan_tci = skb_put(skb, sizeof(__be16)); *vlan_tci = build_tci(pkt_dev->vlan_id, pkt_dev->vlan_cfi, pkt_dev->vlan_p); - vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + vlan_encapsulated_proto = skb_put(skb, sizeof(__be16)); *vlan_encapsulated_proto = htons(ETH_P_IP); } - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb_set_network_header(skb, skb->len); - iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr)); + iph = skb_put(skb, sizeof(struct iphdr)); skb_set_transport_header(skb, skb->len); - udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr)); + udph = skb_put(skb, sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); skb->priority = pkt_dev->skb_priority; memcpy(eth, pkt_dev->hh, 12); - *(__be16 *) & eth[12] = protocol; + *(__be16 *)ð[12] = protocol; /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - @@ -2733,7 +3005,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, udph->source = htons(pkt_dev->cur_udp_src); udph->dest = htons(pkt_dev->cur_udp_dst); udph->len = htons(datalen + 8); /* DATA + udphdr */ - udph->check = 0; /* No checksum */ + udph->check = 0; iph->ihl = 5; iph->version = 4; @@ -2747,13 +3019,30 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, iph->frag_off = 0; iplen = 20 + 8 + datalen; iph->tot_len = htons(iplen); - iph->check = 0; - iph->check = ip_fast_csum((void *)iph, iph->ihl); + ip_send_check(iph); skb->protocol = protocol; skb->dev = odev; skb->pkt_type = PACKET_HOST; + pktgen_finalize_skb(pkt_dev, skb, datalen); + if (!(pkt_dev->flags & F_UDPCSUM)) { + skb->ip_summed = CHECKSUM_NONE; + } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + udp4_hwcsum(skb, iph->saddr, iph->daddr); + } else { + __wsum csum = skb_checksum(skb, skb_transport_offset(skb), datalen + 8, 0); + + /* add protocol-dependent pseudo-header */ + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen + 8, IPPROTO_UDP, csum); + + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + } + #ifdef CONFIG_XFRM if (!process_ipsec(pkt_dev, skb, protocol)) return NULL; @@ -2768,7 +3057,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, struct sk_buff *skb = NULL; __u8 *eth; struct udphdr *udph; - int datalen; + int datalen, udplen; struct ipv6hdr *iph; __be16 protocol = htons(ETH_P_IPV6); __be32 *mpls; @@ -2790,7 +3079,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, mod_cur_headers(pkt_dev); queue_map = pkt_dev->cur_queue_map; - skb = pktgen_alloc_skb(odev, pkt_dev, 16); + skb = pktgen_alloc_skb(odev, pkt_dev); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; @@ -2800,34 +3089,35 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb_reserve(skb, 16); /* Reserve for ethernet and IP header */ - eth = (__u8 *) skb_push(skb, 14); - mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32)); + eth = skb_push(skb, 14); + mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32)); if (pkt_dev->nr_labels) mpls_push(mpls, pkt_dev); if (pkt_dev->vlan_id != 0xffff) { if (pkt_dev->svlan_id != 0xffff) { - svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + svlan_tci = skb_put(skb, sizeof(__be16)); *svlan_tci = build_tci(pkt_dev->svlan_id, pkt_dev->svlan_cfi, pkt_dev->svlan_p); - svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + svlan_encapsulated_proto = skb_put(skb, + sizeof(__be16)); *svlan_encapsulated_proto = htons(ETH_P_8021Q); } - vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + vlan_tci = skb_put(skb, sizeof(__be16)); *vlan_tci = build_tci(pkt_dev->vlan_id, pkt_dev->vlan_cfi, pkt_dev->vlan_p); - vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + vlan_encapsulated_proto = skb_put(skb, sizeof(__be16)); *vlan_encapsulated_proto = htons(ETH_P_IPV6); } - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb_set_network_header(skb, skb->len); - iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); + iph = skb_put(skb, sizeof(struct ipv6hdr)); skb_set_transport_header(skb, skb->len); - udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr)); + udph = skb_put(skb, sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); skb->priority = pkt_dev->skb_priority; @@ -2844,10 +3134,11 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, net_info_ratelimited("increased datalen to %d\n", datalen); } + udplen = datalen + sizeof(struct udphdr); udph->source = htons(pkt_dev->cur_udp_src); udph->dest = htons(pkt_dev->cur_udp_dst); - udph->len = htons(datalen + sizeof(struct udphdr)); - udph->check = 0; /* No checksum */ + udph->len = htons(udplen); + udph->check = 0; *(__be32 *) iph = htonl(0x60000000); /* Version + flow */ @@ -2858,7 +3149,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, iph->hop_limit = 32; - iph->payload_len = htons(sizeof(struct udphdr) + datalen); + iph->payload_len = htons(udplen); iph->nexthdr = IPPROTO_UDP; iph->daddr = pkt_dev->cur_in6_daddr; @@ -2870,6 +3161,23 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, pktgen_finalize_skb(pkt_dev, skb, datalen); + if (!(pkt_dev->flags & F_UDPCSUM)) { + skb->ip_summed = CHECKSUM_NONE; + } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM)) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0); + } else { + __wsum csum = skb_checksum(skb, skb_transport_offset(skb), udplen, 0); + + /* add protocol-dependent pseudo-header */ + udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum); + + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + } + return skb; } @@ -2900,8 +3208,8 @@ static void pktgen_run(struct pktgen_thread *t) func_enter(); - if_lock(t); - list_for_each_entry(pkt_dev, &t->if_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { /* * setup odev and create initial packet. @@ -2910,61 +3218,70 @@ static void pktgen_run(struct pktgen_thread *t) if (pkt_dev->odev) { pktgen_clear_counters(pkt_dev); - pkt_dev->running = 1; /* Cranke yeself! */ pkt_dev->skb = NULL; pkt_dev->started_at = pkt_dev->next_tx = ktime_get(); set_pkt_overhead(pkt_dev); - strcpy(pkt_dev->result, "Starting"); + strscpy(pkt_dev->result, "Starting"); + pkt_dev->running = 1; /* Cranke yeself! */ started++; } else - strcpy(pkt_dev->result, "Error starting"); + strscpy(pkt_dev->result, "Error starting"); } - if_unlock(t); + rcu_read_unlock(); if (started) t->control &= ~(T_STOP); } -static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn) +static void pktgen_handle_all_threads(struct pktgen_net *pn, u32 flags) { struct pktgen_thread *t; - func_enter(); - mutex_lock(&pktgen_thread_lock); list_for_each_entry(t, &pn->pktgen_threads, th_list) - t->control |= T_STOP; + t->control |= (flags); mutex_unlock(&pktgen_thread_lock); } +static void pktgen_stop_all_threads(struct pktgen_net *pn) +{ + func_enter(); + + pktgen_handle_all_threads(pn, T_STOP); +} + static int thread_is_running(const struct pktgen_thread *t) { const struct pktgen_dev *pkt_dev; - list_for_each_entry(pkt_dev, &t->if_list, list) - if (pkt_dev->running) + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) + if (pkt_dev->running) { + rcu_read_unlock(); return 1; + } + rcu_read_unlock(); return 0; } static int pktgen_wait_thread_run(struct pktgen_thread *t) { - if_lock(t); - while (thread_is_running(t)) { - if_unlock(t); - + /* note: 't' will still be around even after the unlock/lock + * cycle because pktgen_thread threads are only cleared at + * net exit + */ + mutex_unlock(&pktgen_thread_lock); msleep_interruptible(100); + mutex_lock(&pktgen_thread_lock); if (signal_pending(current)) goto signal; - if_lock(t); } - if_unlock(t); return 1; signal: return 0; @@ -2975,6 +3292,10 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn) struct pktgen_thread *t; int sig = 1; + /* prevent from racing with rmmod */ + if (!try_module_get(THIS_MODULE)) + return sig; + mutex_lock(&pktgen_thread_lock); list_for_each_entry(t, &pn->pktgen_threads, th_list) { @@ -2988,21 +3309,15 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn) t->control |= (T_STOP); mutex_unlock(&pktgen_thread_lock); + module_put(THIS_MODULE); return sig; } static void pktgen_run_all_threads(struct pktgen_net *pn) { - struct pktgen_thread *t; - func_enter(); - mutex_lock(&pktgen_thread_lock); - - list_for_each_entry(t, &pn->pktgen_threads, th_list) - t->control |= (T_RUN); - - mutex_unlock(&pktgen_thread_lock); + pktgen_handle_all_threads(pn, T_RUN); /* Propagate thread->control */ schedule_timeout_interruptible(msecs_to_jiffies(125)); @@ -3012,16 +3327,9 @@ static void pktgen_run_all_threads(struct pktgen_net *pn) static void pktgen_reset_all_threads(struct pktgen_net *pn) { - struct pktgen_thread *t; - func_enter(); - mutex_lock(&pktgen_thread_lock); - - list_for_each_entry(t, &pn->pktgen_threads, th_list) - t->control |= (T_REMDEVALL); - - mutex_unlock(&pktgen_thread_lock); + pktgen_handle_all_threads(pn, T_REMDEVALL); /* Propagate thread->control */ schedule_timeout_interruptible(msecs_to_jiffies(125)); @@ -3047,7 +3355,19 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC, ktime_to_ns(elapsed)); - bps = pps * 8 * pkt_dev->cur_pkt_size; + if (pkt_dev->n_imix_entries > 0) { + int i; + struct imix_pkt *entry; + + bps = 0; + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + entry = &pkt_dev->imix_entries[i]; + bps += entry->size * entry->count_so_far; + } + bps = div64_u64(bps * 8 * NSEC_PER_SEC, ktime_to_ns(elapsed)); + } else { + bps = pps * 8 * pkt_dev->cur_pkt_size; + } mbps = bps; do_div(mbps, 1000000); @@ -3064,15 +3384,15 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev) int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1; if (!pkt_dev->running) { - pr_warning("interface: %s is already stopped\n", - pkt_dev->odevname); + pr_warn("interface: %s is already stopped\n", + pkt_dev->odevname); return -EINVAL; } + pkt_dev->running = 0; kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; pkt_dev->stopped_at = ktime_get(); - pkt_dev->running = 0; show_results(pkt_dev, nr_frags); @@ -3083,9 +3403,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t) { struct pktgen_dev *pkt_dev, *best = NULL; - if_lock(t); - - list_for_each_entry(pkt_dev, &t->if_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { if (!pkt_dev->running) continue; if (best == NULL) @@ -3093,7 +3412,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t) else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0) best = pkt_dev; } - if_unlock(t); + rcu_read_unlock(); + return best; } @@ -3103,13 +3423,13 @@ static void pktgen_stop(struct pktgen_thread *t) func_enter(); - if_lock(t); + rcu_read_lock(); - list_for_each_entry(pkt_dev, &t->if_list, list) { + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { pktgen_stop_device(pkt_dev); } - if_unlock(t); + rcu_read_unlock(); } /* @@ -3123,8 +3443,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) func_enter(); - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3138,8 +3456,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) break; } - - if_unlock(t); } static void pktgen_rem_all_ifs(struct pktgen_thread *t) @@ -3151,8 +3467,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) /* Remove all devices, free mem */ - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3161,8 +3475,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) pktgen_remove_device(t, cur); } - - if_unlock(t); } static void pktgen_rem_thread(struct pktgen_thread *t) @@ -3174,6 +3486,7 @@ static void pktgen_rem_thread(struct pktgen_thread *t) static void pktgen_resched(struct pktgen_dev *pkt_dev) { ktime_t idle_start = ktime_get(); + schedule(); pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start)); } @@ -3182,7 +3495,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) { ktime_t idle_start = ktime_get(); - while (atomic_read(&(pkt_dev->skb->users)) != 1) { + while (refcount_read(&(pkt_dev->skb->users)) != 1) { if (signal_pending(current)) break; @@ -3196,13 +3509,24 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev *pkt_dev) { + bool skb_shared = !!(READ_ONCE(pkt_dev->flags) & F_SHARED); struct net_device *odev = pkt_dev->odev; - netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *) - = odev->netdev_ops->ndo_start_xmit; struct netdev_queue *txq; - u16 queue_map; + unsigned int burst = 1; + struct sk_buff *skb; + int clone_skb = 0; int ret; + /* If 'skb_shared' is false, the read of possible + * new values (if any) for 'burst' and 'clone_skb' will be skipped to + * prevent some concurrent changes from slipping in. And the stabilized + * config will be read in during the next run of pktgen_xmit. + */ + if (skb_shared) { + burst = READ_ONCE(pkt_dev->burst); + clone_skb = READ_ONCE(pkt_dev->clone_skb); + } + /* If device is offline, then don't send */ if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) { pktgen_stop_device(pkt_dev); @@ -3219,7 +3543,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* If no skb or clone count exhausted then get new one */ if (!pkt_dev->skb || (pkt_dev->last_ok && - ++pkt_dev->clone_count >= pkt_dev->clone_skb)) { + ++pkt_dev->clone_count >= clone_skb)) { /* build a new pkt */ kfree_skb(pkt_dev->skb); @@ -3231,37 +3555,111 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) return; } pkt_dev->last_pkt_size = pkt_dev->skb->len; - pkt_dev->allocated_skbs++; pkt_dev->clone_count = 0; /* reset counter */ } if (pkt_dev->delay && pkt_dev->last_ok) spin(pkt_dev, pkt_dev->next_tx); - queue_map = skb_get_queue_mapping(pkt_dev->skb); - txq = netdev_get_tx_queue(odev, queue_map); + if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) { + skb = pkt_dev->skb; + skb->protocol = eth_type_trans(skb, skb->dev); + if (skb_shared) + refcount_add(burst, &skb->users); + local_bh_disable(); + do { + ret = netif_receive_skb(skb); + if (ret == NET_RX_DROP) + pkt_dev->errors++; + pkt_dev->sofar++; + pkt_dev->seq_num++; + if (unlikely(!skb_shared)) { + pkt_dev->skb = NULL; + break; + } + if (refcount_read(&skb->users) != burst) { + /* skb was queued by rps/rfs or taps, + * so cannot reuse this skb + */ + WARN_ON(refcount_sub_and_test(burst - 1, &skb->users)); + /* get out of the loop and wait + * until skb is consumed + */ + break; + } + /* skb was 'freed' by stack, so clean few + * bits and reuse it + */ + skb_reset_redirect(skb); + } while (--burst > 0); + goto out; /* Skips xmit_mode M_START_XMIT */ + } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { + local_bh_disable(); + if (skb_shared) + refcount_inc(&pkt_dev->skb->users); + + ret = dev_queue_xmit(pkt_dev->skb); + + if (!skb_shared && dev_xmit_complete(ret)) + pkt_dev->skb = NULL; + + switch (ret) { + case NET_XMIT_SUCCESS: + pkt_dev->sofar++; + pkt_dev->seq_num++; + pkt_dev->tx_bytes += pkt_dev->last_pkt_size; + break; + case NET_XMIT_DROP: + case NET_XMIT_CN: + /* These are all valid return codes for a qdisc but + * indicate packets are being dropped or will likely + * be dropped soon. + */ + case NETDEV_TX_BUSY: + /* qdisc may call dev_hard_start_xmit directly in cases + * where no queues exist e.g. loopback device, virtual + * devices, etc. In this case we need to handle + * NETDEV_TX_ codes. + */ + default: + pkt_dev->errors++; + net_info_ratelimited("%s xmit error: %d\n", + pkt_dev->odevname, ret); + break; + } + goto out; + } + + txq = skb_get_tx_queue(odev, pkt_dev->skb); + + local_bh_disable(); - __netif_tx_lock_bh(txq); + HARD_TX_LOCK(odev, txq, smp_processor_id()); - if (unlikely(netif_xmit_frozen_or_stopped(txq))) { - ret = NETDEV_TX_BUSY; + if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) { pkt_dev->last_ok = 0; goto unlock; } - atomic_inc(&(pkt_dev->skb->users)); - ret = (*xmit)(pkt_dev->skb, odev); + if (skb_shared) + refcount_add(burst, &pkt_dev->skb->users); + +xmit_more: + ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0); + + if (!skb_shared && dev_xmit_complete(ret)) + pkt_dev->skb = NULL; switch (ret) { case NETDEV_TX_OK: - txq_trans_update(txq); pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; pkt_dev->tx_bytes += pkt_dev->last_pkt_size; + if (burst > 0 && !netif_xmit_frozen_or_drv_stopped(txq)) + goto xmit_more; break; case NET_XMIT_DROP: case NET_XMIT_CN: - case NET_XMIT_POLICED: /* skb has been consumed */ pkt_dev->errors++; break; @@ -3269,19 +3667,25 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) net_info_ratelimited("%s xmit error: %d\n", pkt_dev->odevname, ret); pkt_dev->errors++; - /* fallthru */ - case NETDEV_TX_LOCKED: + fallthrough; case NETDEV_TX_BUSY: /* Retry it next time */ - atomic_dec(&(pkt_dev->skb->users)); + if (skb_shared) + refcount_dec(&pkt_dev->skb->users); pkt_dev->last_ok = 0; } + if (unlikely(burst)) + WARN_ON(refcount_sub_and_test(burst, &pkt_dev->skb->users)); unlock: - __netif_tx_unlock_bh(txq); + HARD_TX_UNLOCK(odev, txq); + +out: + local_bh_enable(); /* If pkt_dev->count is zero, then run forever */ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { - pktgen_wait_for_skb(pkt_dev); + if (pkt_dev->skb) + pktgen_wait_for_skb(pkt_dev); /* Done with this */ pktgen_stop_device(pkt_dev); @@ -3294,20 +3698,17 @@ unlock: static int pktgen_thread_worker(void *arg) { - DEFINE_WAIT(wait); struct pktgen_thread *t = arg; struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; - BUG_ON(smp_processor_id() != cpu); + WARN_ON_ONCE(smp_processor_id() != cpu); init_waitqueue_head(&t->queue); complete(&t->start_done); pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current)); - set_current_state(TASK_INTERRUPTIBLE); - set_freezable(); while (!kthread_should_stop()) { @@ -3316,15 +3717,11 @@ static int pktgen_thread_worker(void *arg) if (unlikely(!pkt_dev && t->control == 0)) { if (t->net->pktgen_exiting) break; - wait_event_interruptible_timeout(t->queue, - t->control != 0, - HZ/10); - try_to_freeze(); + wait_event_freezable_timeout(t->queue, + t->control != 0, HZ / 10); continue; } - __set_current_state(TASK_RUNNING); - if (likely(pkt_dev)) { pktgen_xmit(pkt_dev); @@ -3355,8 +3752,6 @@ static int pktgen_thread_worker(void *arg) } try_to_freeze(); - - set_current_state(TASK_INTERRUPTIBLE); } pr_debug("%s stopping all device\n", t->tsk->comm); @@ -3368,13 +3763,6 @@ static int pktgen_thread_worker(void *arg) pr_debug("%s removing thread\n", t->tsk->comm); pktgen_rem_thread(t); - /* Wait for kthread_stop */ - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - } - __set_current_state(TASK_RUNNING); - return 0; } @@ -3384,8 +3772,8 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, struct pktgen_dev *p, *pkt_dev = NULL; size_t len = strlen(ifname); - if_lock(t); - list_for_each_entry(p, &t->if_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(p, &t->if_list, list) if (strncmp(p->odevname, ifname, len) == 0) { if (p->odevname[len]) { if (exact || p->odevname[len] != '@') @@ -3395,7 +3783,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, break; } - if_unlock(t); + rcu_read_unlock(); pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev); return pkt_dev; } @@ -3409,6 +3797,13 @@ static int add_dev_to_thread(struct pktgen_thread *t, { int rv = 0; + /* This function cannot be called concurrently, as its called + * under pktgen_thread_lock mutex, but it can run from + * userspace on another CPU than the kthread. The if_lock() + * is used here to sync with concurrent instances of + * _rem_dev_from_if_list() invoked via kthread, which is also + * updating the if_list + */ if_lock(t); if (pkt_dev->pg_thread) { @@ -3417,9 +3812,9 @@ static int add_dev_to_thread(struct pktgen_thread *t, goto out; } - list_add(&pkt_dev->list, &t->if_list); - pkt_dev->pg_thread = t; pkt_dev->running = 0; + pkt_dev->pg_thread = t; + list_add_rcu(&pkt_dev->list, &t->if_list); out: if_unlock(t); @@ -3446,8 +3841,9 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) if (!pkt_dev) return -ENOMEM; - strcpy(pkt_dev->odevname, ifname); - pkt_dev->flows = vzalloc_node(MAX_CFLOWS * sizeof(struct flow_state), + strscpy(pkt_dev->odevname, ifname); + pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS, + sizeof(struct flow_state)), node); if (pkt_dev->flows == NULL) { kfree(pkt_dev); @@ -3469,7 +3865,9 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_p = 0; pkt_dev->svlan_cfi = 0; pkt_dev->svlan_id = 0xffff; - pkt_dev->node = -1; + pkt_dev->burst = 1; + pkt_dev->node = NUMA_NO_NODE; + pkt_dev->flags = F_SHARED; /* SKB shared by default */ err = pktgen_setup_dev(t->net, pkt_dev, ifname); if (err) @@ -3478,7 +3876,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->clone_skb = pg_clone_skb_d; pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir, - &pktgen_if_fops, pkt_dev); + &pktgen_if_proc_ops, pkt_dev); if (!pkt_dev->entry) { pr_err("cannot create %s/%s procfs entry\n", PG_PROC_DIR, ifname); @@ -3488,11 +3886,22 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) #ifdef CONFIG_XFRM pkt_dev->ipsmode = XFRM_MODE_TRANSPORT; pkt_dev->ipsproto = IPPROTO_ESP; + + /* xfrm tunnel mode needs additional dst to extract outer + * ip header protocol/ttl/id field, here create a phony one. + * instead of looking for a valid rt, which definitely hurting + * performance under such circumstance. + */ + pkt_dev->dstops.family = AF_INET; + pkt_dev->xdst.u.dst.dev = pkt_dev->odev; + dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false); + pkt_dev->xdst.child = &pkt_dev->xdst.u.dst; + pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops; #endif return add_dev_to_thread(t, pkt_dev); out2: - dev_put(pkt_dev->odev); + netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker); out1: #ifdef CONFIG_XFRM free_SAs(pkt_dev); @@ -3515,7 +3924,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) return -ENOMEM; } - spin_lock_init(&t->if_lock); + mutex_init(&t->if_lock); t->cpu = cpu; INIT_LIST_HEAD(&t->if_list); @@ -3523,21 +3932,18 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) list_add_tail(&t->th_list, &pn->pktgen_threads); init_completion(&t->start_done); - p = kthread_create_on_node(pktgen_thread_worker, - t, - cpu_to_node(cpu), - "kpktgend_%d", cpu); + p = kthread_create_on_cpu(pktgen_thread_worker, t, cpu, "kpktgend_%d"); if (IS_ERR(p)) { - pr_err("kernel_thread() failed for cpu %d\n", t->cpu); + pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu); list_del(&t->th_list); kfree(t); return PTR_ERR(p); } - kthread_bind(p, cpu); + t->tsk = p; pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir, - &pktgen_thread_fops, t); + &pktgen_thread_proc_ops, t); if (!pe) { pr_err("cannot create %s/%s procfs entry\n", PG_PROC_DIR, t->tsk->comm); @@ -3548,6 +3954,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) } t->net = pn; + get_task_struct(p); wake_up_process(p); wait_for_completion(&t->start_done); @@ -3563,11 +3970,13 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t, struct list_head *q, *n; struct pktgen_dev *p; + if_lock(t); list_for_each_safe(q, n, &t->if_list) { p = list_entry(q, struct pktgen_dev, list); if (p == pkt_dev) - list_del(&p->list); + list_del_rcu(&p->list); } + if_unlock(t); } static int pktgen_remove_device(struct pktgen_thread *t, @@ -3576,31 +3985,33 @@ static int pktgen_remove_device(struct pktgen_thread *t, pr_debug("remove_device pkt_dev=%p\n", pkt_dev); if (pkt_dev->running) { - pr_warning("WARNING: trying to remove a running interface, stopping it now\n"); + pr_warn("WARNING: trying to remove a running interface, stopping it now\n"); pktgen_stop_device(pkt_dev); } /* Dis-associate from the interface */ if (pkt_dev->odev) { - dev_put(pkt_dev->odev); + netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker); pkt_dev->odev = NULL; } - /* And update the thread if_list */ + /* Remove proc before if_list entry, because add_device uses + * list to determine if interface already exist, avoid race + * with proc_create_data() + */ + proc_remove(pkt_dev->entry); + /* And update the thread if_list */ _rem_dev_from_if_list(t, pkt_dev); - if (pkt_dev->entry) - proc_remove(pkt_dev->entry); - #ifdef CONFIG_XFRM free_SAs(pkt_dev); #endif vfree(pkt_dev->flows); if (pkt_dev->page) put_page(pkt_dev->page); - kfree(pkt_dev); + kfree_rcu(pkt_dev, rcu); return 0; } @@ -3618,13 +4029,14 @@ static int __net_init pg_net_init(struct net *net) pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR); return -ENODEV; } - pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops); + pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_proc_ops); if (pe == NULL) { pr_err("cannot create %s procfs entry\n", PGCTRL); ret = -EINVAL; goto remove; } + cpus_read_lock(); for_each_online_cpu(cpu) { int err; @@ -3633,6 +4045,7 @@ static int __net_init pg_net_init(struct net *net) pr_warn("Cannot create thread for cpu %d (%d)\n", cpu, err); } + cpus_read_unlock(); if (list_empty(&pn->pktgen_threads)) { pr_err("Initialization failed for all threads\n"); @@ -3666,7 +4079,7 @@ static void __net_exit pg_net_exit(struct net *net) list_for_each_safe(q, n, &list) { t = list_entry(q, struct pktgen_thread, th_list); list_del(&t->th_list); - kthread_stop(t->tsk); + kthread_stop_put(t->tsk); kfree(t); } @@ -3700,6 +4113,7 @@ static void __exit pg_cleanup(void) { unregister_netdevice_notifier(&pktgen_notifier_block); unregister_pernet_subsys(&pg_net_ops); + /* Don't need rcu_barrier() due to use of kfree_rcu() */ } module_init(pg_init); diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c new file mode 100644 index 000000000000..598041b0499e --- /dev/null +++ b/net/core/ptp_classifier.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* PTP classifier + */ + +/* The below program is the bpf_asm (tools/net/) representation of + * the opcode array in the ptp_filter structure. + * + * For convenience, this can easily be altered and reviewed with + * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a + * simple file containing the below program: + * + * ldh [12] ; load ethertype + * + * ; PTP over UDP over IPv4 over Ethernet + * test_ipv4: + * jneq #0x800, test_ipv6 ; ETH_P_IP ? + * ldb [23] ; load proto + * jneq #17, drop_ipv4 ; IPPROTO_UDP ? + * ldh [20] ; load frag offset field + * jset #0x1fff, drop_ipv4 ; don't allow fragments + * ldxb 4*([14]&0xf) ; load IP header len + * ldh [x + 16] ; load UDP dst port + * jneq #319, drop_ipv4 ; is port PTP_EV_PORT ? + * ldh [x + 22] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x10 ; PTP_CLASS_IPV4 + * ret a ; return PTP class + * drop_ipv4: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over UDP over IPv6 over Ethernet + * test_ipv6: + * jneq #0x86dd, test_8021q ; ETH_P_IPV6 ? + * ldb [20] ; load proto + * jneq #17, drop_ipv6 ; IPPROTO_UDP ? + * ldh [56] ; load UDP dst port + * jneq #319, drop_ipv6 ; is port PTP_EV_PORT ? + * ldh [62] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x20 ; PTP_CLASS_IPV6 + * ret a ; return PTP class + * drop_ipv6: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over 802.1Q over Ethernet + * test_8021q: + * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ? + * ldh [16] ; load inner type + * jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ? + * ldb [18] ; load payload + * and #0x8 ; as we don't have ports here, test + * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these + * ldh [18] ; reload payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0xc0 ; PTP_CLASS_VLAN|PTP_CLASS_L2 + * ret a ; return PTP class + * + * ; PTP over UDP over IPv4 over 802.1Q over Ethernet + * test_8021q_ipv4: + * jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ? + * ldb [27] ; load proto + * jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ? + * ldh [24] ; load frag offset field + * jset #0x1fff, drop_8021q_ipv4; don't allow fragments + * ldxb 4*([18]&0xf) ; load IP header len + * ldh [x + 20] ; load UDP dst port + * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ? + * ldh [x + 26] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x90 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4 + * ret a ; return PTP class + * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over UDP over IPv6 over 802.1Q over Ethernet + * test_8021q_ipv6: + * jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ? + * ldb [24] ; load proto + * jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ? + * ldh [60] ; load UDP dst port + * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ? + * ldh [66] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0xa0 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6 + * ret a ; return PTP class + * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over Ethernet + * test_ieee1588: + * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ? + * ldb [14] ; load payload + * and #0x8 ; as we don't have ports here, test + * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these + * ldh [14] ; reload payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x40 ; PTP_CLASS_L2 + * ret a ; return PTP class + * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE + */ + +#include <linux/skbuff.h> +#include <linux/filter.h> +#include <linux/ptp_classify.h> + +static struct bpf_prog *ptp_insns __read_mostly; + +unsigned int ptp_classify_raw(const struct sk_buff *skb) +{ + return bpf_prog_run(ptp_insns, skb); +} +EXPORT_SYMBOL_GPL(ptp_classify_raw); + +struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type) +{ + u8 *ptr = skb_mac_header(skb); + + if (type & PTP_CLASS_VLAN) + ptr += VLAN_HLEN; + + switch (type & PTP_CLASS_PMASK) { + case PTP_CLASS_IPV4: + ptr += IPV4_HLEN(ptr) + UDP_HLEN; + break; + case PTP_CLASS_IPV6: + ptr += IP6_HLEN + UDP_HLEN; + break; + case PTP_CLASS_L2: + break; + default: + return NULL; + } + + ptr += ETH_HLEN; + + /* Ensure that the entire header is present in this packet. */ + if (ptr + sizeof(struct ptp_header) > skb->data + skb->len) + return NULL; + + return (struct ptp_header *)ptr; +} +EXPORT_SYMBOL_GPL(ptp_parse_header); + +bool ptp_msg_is_sync(struct sk_buff *skb, unsigned int type) +{ + struct ptp_header *hdr; + + hdr = ptp_parse_header(skb, type); + if (!hdr) + return false; + + return ptp_get_msgtype(hdr, type) == PTP_MSGTYPE_SYNC; +} +EXPORT_SYMBOL_GPL(ptp_msg_is_sync); + +void __init ptp_classifier_init(void) +{ + static struct sock_filter ptp_filter[] __initdata = { + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x00000017 }, + { 0x15, 0, 9, 0x00000011 }, + { 0x28, 0, 0, 0x00000014 }, + { 0x45, 7, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x0000000e }, + { 0x48, 0, 0, 0x00000010 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x48, 0, 0, 0x00000016 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000010 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 9, 0x000086dd }, + { 0x30, 0, 0, 0x00000014 }, + { 0x15, 0, 6, 0x00000011 }, + { 0x28, 0, 0, 0x00000038 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x28, 0, 0, 0x0000003e }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000020 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 32, 0x00008100 }, + { 0x28, 0, 0, 0x00000010 }, + { 0x15, 0, 7, 0x000088f7 }, + { 0x30, 0, 0, 0x00000012 }, + { 0x54, 0, 0, 0x00000008 }, + { 0x15, 0, 35, 0x00000000 }, + { 0x28, 0, 0, 0x00000012 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x000000c0 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x0000001b }, + { 0x15, 0, 9, 0x00000011 }, + { 0x28, 0, 0, 0x00000018 }, + { 0x45, 7, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x00000012 }, + { 0x48, 0, 0, 0x00000014 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x48, 0, 0, 0x0000001a }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000090 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 8, 0x000086dd }, + { 0x30, 0, 0, 0x00000018 }, + { 0x15, 0, 6, 0x00000011 }, + { 0x28, 0, 0, 0x0000003c }, + { 0x15, 0, 4, 0x0000013f }, + { 0x28, 0, 0, 0x00000042 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x000000a0 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 7, 0x000088f7 }, + { 0x30, 0, 0, 0x0000000e }, + { 0x54, 0, 0, 0x00000008 }, + { 0x15, 0, 4, 0x00000000 }, + { 0x28, 0, 0, 0x0000000e }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000040 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + }; + struct sock_fprog_kern ptp_prog; + + ptp_prog.len = ARRAY_SIZE(ptp_filter); + ptp_prog.filter = ptp_filter; + + BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog)); +} diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 4425148d2b51..897a8f01a67b 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * NET Generic infrastructure for Network protocols. * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * From code originally in include/net/tcp.h - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> @@ -34,101 +30,14 @@ * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. */ -int sysctl_max_syn_backlog = 256; -EXPORT_SYMBOL(sysctl_max_syn_backlog); -int reqsk_queue_alloc(struct request_sock_queue *queue, - unsigned int nr_table_entries) +void reqsk_queue_alloc(struct request_sock_queue *queue) { - size_t lopt_size = sizeof(struct listen_sock); - struct listen_sock *lopt; - - nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); - nr_table_entries = max_t(u32, nr_table_entries, 8); - nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); - lopt_size += nr_table_entries * sizeof(struct request_sock *); - if (lopt_size > PAGE_SIZE) - lopt = vzalloc(lopt_size); - else - lopt = kzalloc(lopt_size, GFP_KERNEL); - if (lopt == NULL) - return -ENOMEM; - - for (lopt->max_qlen_log = 3; - (1 << lopt->max_qlen_log) < nr_table_entries; - lopt->max_qlen_log++); + queue->fastopenq.rskq_rst_head = NULL; + queue->fastopenq.rskq_rst_tail = NULL; + queue->fastopenq.qlen = 0; - get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); - rwlock_init(&queue->syn_wait_lock); queue->rskq_accept_head = NULL; - lopt->nr_table_entries = nr_table_entries; - - write_lock_bh(&queue->syn_wait_lock); - queue->listen_opt = lopt; - write_unlock_bh(&queue->syn_wait_lock); - - return 0; -} - -void __reqsk_queue_destroy(struct request_sock_queue *queue) -{ - struct listen_sock *lopt; - size_t lopt_size; - - /* - * this is an error recovery path only - * no locking needed and the lopt is not NULL - */ - - lopt = queue->listen_opt; - lopt_size = sizeof(struct listen_sock) + - lopt->nr_table_entries * sizeof(struct request_sock *); - - if (lopt_size > PAGE_SIZE) - vfree(lopt); - else - kfree(lopt); -} - -static inline struct listen_sock *reqsk_queue_yank_listen_sk( - struct request_sock_queue *queue) -{ - struct listen_sock *lopt; - - write_lock_bh(&queue->syn_wait_lock); - lopt = queue->listen_opt; - queue->listen_opt = NULL; - write_unlock_bh(&queue->syn_wait_lock); - - return lopt; -} - -void reqsk_queue_destroy(struct request_sock_queue *queue) -{ - /* make all the listen_opt local to us */ - struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - size_t lopt_size = sizeof(struct listen_sock) + - lopt->nr_table_entries * sizeof(struct request_sock *); - - if (lopt->qlen != 0) { - unsigned int i; - - for (i = 0; i < lopt->nr_table_entries; i++) { - struct request_sock *req; - - while ((req = lopt->syn_table[i]) != NULL) { - lopt->syn_table[i] = req->dl_next; - lopt->qlen--; - reqsk_free(req); - } - } - } - - WARN_ON(lopt->qlen != 0); - if (lopt_size > PAGE_SIZE) - vfree(lopt); - else - kfree(lopt); } /* @@ -168,28 +77,24 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) * a simple spin lock - one must consider sock_owned_by_user() and arrange * to use sk_add_backlog() stuff. But what really makes it infeasible is the * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to - * acquire a child's lock while holding listener's socket lock. A corner - * case might also exist in tcp_v4_hnd_req() that will trigger this locking - * order. - * - * When a TFO req is created, it needs to sock_hold its listener to prevent - * the latter data structure from going away. + * acquire a child's lock while holding listener's socket lock. * - * This function also sets "treq->listener" to NULL and unreference listener - * socket. treq->listener is used by the listener so it is protected by the + * This function also sets "treq->tfo_listener" to false. + * treq->tfo_listener is used by the listener so it is protected by the * fastopenq->lock in this function. */ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset) { - struct sock *lsk = tcp_rsk(req)->listener; - struct fastopen_queue *fastopenq = - inet_csk(lsk)->icsk_accept_queue.fastopenq; + struct sock *lsk = req->rsk_listener; + struct fastopen_queue *fastopenq; + + fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; - tcp_sk(sk)->fastopen_rsk = NULL; + RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; - tcp_rsk(req)->listener = NULL; + tcp_rsk(req)->tfo_listener = false; if (req->sk) /* the child socket hasn't been accepted yet */ goto out; @@ -198,8 +103,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, * special RST handling below. */ spin_unlock_bh(&fastopenq->lock); - sock_put(lsk); - reqsk_free(req); + reqsk_put(req); return; } /* Wait for 60secs before removing a req that has triggered RST. @@ -209,7 +113,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, * * For more details see CoNext'11 "TCP Fast Open" paper. */ - req->expires = jiffies + 60*HZ; + req->rsk_timer.expires = jiffies + 60*HZ; if (fastopenq->rskq_rst_head == NULL) fastopenq->rskq_rst_head = req; else @@ -220,6 +124,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, fastopenq->qlen++; out: spin_unlock_bh(&fastopenq->lock); - sock_put(lsk); - return; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3de740834d1f..b1ed55141d8a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -7,15 +8,11 @@ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Fixes: - * Vitaly E. Lavrov RTA_OK arithmetics was wrong. + * Vitaly E. Lavrov RTA_OK arithmetic was wrong. */ +#include <linux/bitops.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/types.h> @@ -36,10 +33,12 @@ #include <linux/mutex.h> #include <linux/if_addr.h> #include <linux/if_bridge.h> +#include <linux/if_vlan.h> #include <linux/pci.h> #include <linux/etherdevice.h> +#include <linux/bpf.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/inet.h> #include <linux/netdevice.h> @@ -48,16 +47,30 @@ #include <net/arp.h> #include <net/route.h> #include <net/udp.h> +#include <net/tcp.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <net/fib_rules.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> +#include <net/netdev_lock.h> +#include <net/devlink.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/addrconf.h> +#endif +#include <linux/dpll.h> + +#include "dev.h" + +#define RTNL_MAX_TYPE 50 +#define RTNL_SLAVE_MAX_TYPE 44 struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; - rtnl_calcit_func calcit; + struct module *owner; + unsigned int flags; + struct rcu_head rcu; }; static DEFINE_MUTEX(rtnl_mutex); @@ -68,9 +81,74 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +int rtnl_lock_interruptible(void) +{ + return mutex_lock_interruptible(&rtnl_mutex); +} + +int rtnl_lock_killable(void) +{ + return mutex_lock_killable(&rtnl_mutex); +} + +static struct sk_buff *defer_kfree_skb_list; +void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) +{ + if (head && tail) { + tail->next = defer_kfree_skb_list; + defer_kfree_skb_list = head; + } +} +EXPORT_SYMBOL(rtnl_kfree_skbs); + void __rtnl_unlock(void) { + struct sk_buff *head = defer_kfree_skb_list; + + defer_kfree_skb_list = NULL; + + /* Ensure that we didn't actually add any TODO item when __rtnl_unlock() + * is used. In some places, e.g. in cfg80211, we have code that will do + * something like + * rtnl_lock() + * wiphy_lock() + * ... + * rtnl_unlock() + * + * and because netdev_run_todo() acquires the RTNL for items on the list + * we could cause a situation such as this: + * Thread 1 Thread 2 + * rtnl_lock() + * unregister_netdevice() + * __rtnl_unlock() + * rtnl_lock() + * wiphy_lock() + * rtnl_unlock() + * netdev_run_todo() + * __rtnl_unlock() + * + * // list not empty now + * // because of thread 2 + * rtnl_lock() + * while (!list_empty(...)) + * rtnl_lock() + * wiphy_lock() + * **** DEADLOCK **** + * + * However, usage of __rtnl_unlock() is rare, and so we can ensure that + * it's not used in cases where something is added to do the list. + */ + WARN_ON(!list_empty(&net_todo_list)); + mutex_unlock(&rtnl_mutex); + + while (head) { + struct sk_buff *next = head->next; + + kfree_skb(head); + cond_resched(); + head = next; + } } void rtnl_unlock(void) @@ -92,144 +170,278 @@ int rtnl_is_locked(void) } EXPORT_SYMBOL(rtnl_is_locked); +bool refcount_dec_and_rtnl_lock(refcount_t *r) +{ + return refcount_dec_and_mutex_lock(r, &rtnl_mutex); +} +EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); + #ifdef CONFIG_PROVE_LOCKING -int lockdep_rtnl_is_held(void) +bool lockdep_rtnl_is_held(void) { return lockdep_is_held(&rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ -static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL +void __rtnl_net_lock(struct net *net) +{ + ASSERT_RTNL(); -static inline int rtm_msgindex(int msgtype) + mutex_lock(&net->rtnl_mutex); +} +EXPORT_SYMBOL(__rtnl_net_lock); + +void __rtnl_net_unlock(struct net *net) { - int msgindex = msgtype - RTM_BASE; + ASSERT_RTNL(); - /* - * msgindex < 0 implies someone tried to register a netlink - * control code. msgindex >= RTM_NR_MSGTYPES may indicate that - * the message type has not been added to linux/rtnetlink.h - */ - BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); + mutex_unlock(&net->rtnl_mutex); +} +EXPORT_SYMBOL(__rtnl_net_unlock); - return msgindex; +void rtnl_net_lock(struct net *net) +{ + rtnl_lock(); + __rtnl_net_lock(net); } +EXPORT_SYMBOL(rtnl_net_lock); -static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex) +void rtnl_net_unlock(struct net *net) { - struct rtnl_link *tab; + __rtnl_net_unlock(net); + rtnl_unlock(); +} +EXPORT_SYMBOL(rtnl_net_unlock); - if (protocol <= RTNL_FAMILY_MAX) - tab = rtnl_msg_handlers[protocol]; - else - tab = NULL; +int rtnl_net_trylock(struct net *net) +{ + int ret = rtnl_trylock(); - if (tab == NULL || tab[msgindex].doit == NULL) - tab = rtnl_msg_handlers[PF_UNSPEC]; + if (ret) + __rtnl_net_lock(net); - return tab[msgindex].doit; + return ret; } +EXPORT_SYMBOL(rtnl_net_trylock); -static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) +int rtnl_net_lock_killable(struct net *net) { - struct rtnl_link *tab; + int ret = rtnl_lock_killable(); - if (protocol <= RTNL_FAMILY_MAX) - tab = rtnl_msg_handlers[protocol]; - else - tab = NULL; + if (!ret) + __rtnl_net_lock(net); + + return ret; +} + +static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) +{ + if (net_eq(net_a, net_b)) + return 0; - if (tab == NULL || tab[msgindex].dumpit == NULL) - tab = rtnl_msg_handlers[PF_UNSPEC]; + /* always init_net first */ + if (net_eq(net_a, &init_net)) + return -1; - return tab[msgindex].dumpit; + if (net_eq(net_b, &init_net)) + return 1; + + /* otherwise lock in ascending order */ + return net_a < net_b ? -1 : 1; } -static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex) +int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { - struct rtnl_link *tab; + const struct net *net_a, *net_b; - if (protocol <= RTNL_FAMILY_MAX) - tab = rtnl_msg_handlers[protocol]; - else - tab = NULL; + net_a = container_of(a, struct net, rtnl_mutex.dep_map); + net_b = container_of(b, struct net, rtnl_mutex.dep_map); - if (tab == NULL || tab[msgindex].calcit == NULL) - tab = rtnl_msg_handlers[PF_UNSPEC]; + return rtnl_net_cmp_locks(net_a, net_b); +} + +bool rtnl_net_is_locked(struct net *net) +{ + return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_net_is_locked); - return tab[msgindex].calcit; +bool lockdep_rtnl_net_is_held(struct net *net) +{ + return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex); +} +EXPORT_SYMBOL(lockdep_rtnl_net_is_held); +#else +static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) +{ + /* No need to swap */ + return -1; +} +#endif + +struct rtnl_nets { + /* ->newlink() needs to freeze 3 netns at most; + * 2 for the new device, 1 for its peer. + */ + struct net *net[3]; + unsigned char len; +}; + +static void rtnl_nets_init(struct rtnl_nets *rtnl_nets) +{ + memset(rtnl_nets, 0, sizeof(*rtnl_nets)); +} + +static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets) +{ + int i; + + for (i = 0; i < rtnl_nets->len; i++) { + put_net(rtnl_nets->net[i]); + rtnl_nets->net[i] = NULL; + } + + rtnl_nets->len = 0; } /** - * __rtnl_register - Register a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @calcit: Function pointer to calc size of dump message + * rtnl_nets_add - Add netns to be locked before ->newlink(). * - * Registers the specified function pointers (at least one of them has - * to be non-NULL) to be called whenever a request message for the - * specified protocol family and message type is received. + * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net(). + * @net: netns pointer with an extra refcnt held. * - * The special protocol family PF_UNSPEC may be used to define fallback - * function pointers for the case when no entry for the specific protocol - * family exists. - * - * Returns 0 on success or a negative error code. + * The extra refcnt is released in rtnl_nets_destroy(). */ -int __rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - rtnl_calcit_func calcit) +static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net) +{ + int i; + + DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net)); + + for (i = 0; i < rtnl_nets->len; i++) { + switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) { + case 0: + put_net(net); + return; + case 1: + swap(rtnl_nets->net[i], net); + } + } + + rtnl_nets->net[i] = net; + rtnl_nets->len++; +} + +static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets) +{ + int i; + + rtnl_lock(); + + for (i = 0; i < rtnl_nets->len; i++) + __rtnl_net_lock(rtnl_nets->net[i]); +} + +static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets) { - struct rtnl_link *tab; + int i; + + for (i = 0; i < rtnl_nets->len; i++) + __rtnl_net_unlock(rtnl_nets->net[i]); + + rtnl_unlock(); +} + +static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; + +static inline int rtm_msgindex(int msgtype) +{ + int msgindex = msgtype - RTM_BASE; + + /* + * msgindex < 0 implies someone tried to register a netlink + * control code. msgindex >= RTM_NR_MSGTYPES may indicate that + * the message type has not been added to linux/rtnetlink.h + */ + BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); + + return msgindex; +} + +static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) +{ + struct rtnl_link __rcu **tab; + + if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) + protocol = PF_UNSPEC; + + tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); + if (!tab) + tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); + + return rcu_dereference_rtnl(tab[msgtype]); +} + +static int rtnl_register_internal(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) +{ + struct rtnl_link *link, *old; + struct rtnl_link __rcu **tab; int msgindex; + int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); - tab = rtnl_msg_handlers[protocol]; + rtnl_lock(); + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (tab == NULL) { - tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL); - if (tab == NULL) - return -ENOBUFS; + tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); + if (!tab) + goto unlock; - rtnl_msg_handlers[protocol] = tab; + /* ensures we see the 0 stores */ + rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } - if (doit) - tab[msgindex].doit = doit; - - if (dumpit) - tab[msgindex].dumpit = dumpit; - - if (calcit) - tab[msgindex].calcit = calcit; + old = rtnl_dereference(tab[msgindex]); + if (old) { + link = kmemdup(old, sizeof(*old), GFP_KERNEL); + if (!link) + goto unlock; + } else { + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) + goto unlock; + } - return 0; -} -EXPORT_SYMBOL_GPL(__rtnl_register); + WARN_ON(link->owner && link->owner != owner); + link->owner = owner; -/** - * rtnl_register - Register a rtnetlink message type - * - * Identical to __rtnl_register() but panics on failure. This is useful - * as failure of this function is very unlikely, it can only happen due - * to lack of memory when allocating the chain to store all message - * handlers for a protocol. Meant for use in init functions where lack - * of memory implies no sense in continuing. - */ -void rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - rtnl_calcit_func calcit) -{ - if (__rtnl_register(protocol, msgtype, doit, dumpit, calcit) < 0) - panic("Unable to register rtnetlink message handler, " - "protocol = %d, message type = %d\n", - protocol, msgtype); + WARN_ON(doit && link->doit && link->doit != doit); + if (doit) + link->doit = doit; + WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); + if (dumpit) + link->dumpit = dumpit; + + WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL && + (flags & RTNL_FLAG_BULK_DEL_SUPPORTED)); + link->flags |= flags; + + /* publish protocol:msgtype */ + rcu_assign_pointer(tab[msgindex], link); + ret = 0; + if (old) + kfree_rcu(old, rcu); +unlock: + rtnl_unlock(); + return ret; } -EXPORT_SYMBOL_GPL(rtnl_register); /** * rtnl_unregister - Unregister a rtnetlink message type @@ -238,74 +450,143 @@ EXPORT_SYMBOL_GPL(rtnl_register); * * Returns 0 on success or a negative error code. */ -int rtnl_unregister(int protocol, int msgtype) +static int rtnl_unregister(int protocol, int msgtype) { + struct rtnl_link __rcu **tab; + struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); - if (rtnl_msg_handlers[protocol] == NULL) + rtnl_lock(); + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + if (!tab) { + rtnl_unlock(); return -ENOENT; + } + + link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); + rtnl_unlock(); - rtnl_msg_handlers[protocol][msgindex].doit = NULL; - rtnl_msg_handlers[protocol][msgindex].dumpit = NULL; + kfree_rcu(link, rcu); return 0; } -EXPORT_SYMBOL_GPL(rtnl_unregister); /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol * @protocol : Protocol family or PF_UNSPEC * - * Identical to calling rtnl_unregster() for all registered message types + * Identical to calling rtnl_unregister() for all registered message types * of a certain protocol family. */ void rtnl_unregister_all(int protocol) { - BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); + struct rtnl_link __rcu **tab; + struct rtnl_link *link; + int msgindex; - kfree(rtnl_msg_handlers[protocol]); - rtnl_msg_handlers[protocol] = NULL; -} -EXPORT_SYMBOL_GPL(rtnl_unregister_all); + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); -static LIST_HEAD(link_ops); + rtnl_lock(); + tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL); + if (!tab) { + rtnl_unlock(); + return; + } + for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { + link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); + kfree_rcu(link, rcu); + } + rtnl_unlock(); -static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) -{ - const struct rtnl_link_ops *ops; + synchronize_net(); - list_for_each_entry(ops, &link_ops, list) { - if (!strcmp(ops->kind, kind)) - return ops; - } - return NULL; + kfree(tab); } +EXPORT_SYMBOL_GPL(rtnl_unregister_all); /** - * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. - * @ops: struct rtnl_link_ops * to register + * __rtnl_register_many - Register rtnetlink message types + * @handlers: Array of struct rtnl_msg_handlers + * @n: The length of @handlers * - * The caller must hold the rtnl_mutex. This function should be used - * by drivers that create devices during module initialization. It - * must be called before registering the devices. + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. * - * Returns 0 on success or a negative error code. + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. + * + * When one element of @handlers fails to register, + * 1) built-in: panics. + * 2) modules : the previous successful registrations are unwinded + * and an error is returned. + * + * Use rtnl_register_many(). */ -int __rtnl_link_register(struct rtnl_link_ops *ops) +int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) { - if (rtnl_link_ops_get(ops->kind)) - return -EEXIST; + const struct rtnl_msg_handler *handler; + int i, err; - if (!ops->dellink) - ops->dellink = unregister_netdevice_queue; + for (i = 0, handler = handlers; i < n; i++, handler++) { + err = rtnl_register_internal(handler->owner, handler->protocol, + handler->msgtype, handler->doit, + handler->dumpit, handler->flags); + if (err) { + if (!handler->owner) + panic("Unable to register rtnetlink message " + "handlers, %pS\n", handlers); - list_add_tail(&ops->list, &link_ops); - return 0; + __rtnl_unregister_many(handlers, i); + break; + } + } + + return err; +} +EXPORT_SYMBOL_GPL(__rtnl_register_many); + +void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) +{ + const struct rtnl_msg_handler *handler; + int i; + + for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--) + rtnl_unregister(handler->protocol, handler->msgtype); +} +EXPORT_SYMBOL_GPL(__rtnl_unregister_many); + +static DEFINE_MUTEX(link_ops_mutex); +static LIST_HEAD(link_ops); + +static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) +{ + struct rtnl_link_ops *ops; + + rcu_read_lock(); + + list_for_each_entry_rcu(ops, &link_ops, list) { + if (!strcmp(ops->kind, kind)) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } + } + + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; +} + +static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } -EXPORT_SYMBOL_GPL(__rtnl_link_register); /** * rtnl_link_register - Register rtnl_link_ops with rtnetlink. @@ -315,11 +596,39 @@ EXPORT_SYMBOL_GPL(__rtnl_link_register); */ int rtnl_link_register(struct rtnl_link_ops *ops) { + struct rtnl_link_ops *tmp; int err; - rtnl_lock(); - err = __rtnl_link_register(ops); - rtnl_unlock(); + /* Sanity-check max sizes to avoid stack buffer overflow. */ + if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || + ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) + return -EINVAL; + + /* The check for alloc/setup is here because if ops + * does not have that filled up, it is not possible + * to use the ops for creating device. So do not + * fill up dellink as well. That disables rtnl_dellink. + */ + if ((ops->alloc || ops->setup) && !ops->dellink) + ops->dellink = unregister_netdevice_queue; + + err = init_srcu_struct(&ops->srcu); + if (err) + return err; + + mutex_lock(&link_ops_mutex); + + list_for_each_entry(tmp, &link_ops, list) { + if (!strcmp(ops->kind, tmp->kind)) { + err = -EEXIST; + goto unlock; + } + } + + list_add_tail_rcu(&ops->list, &link_ops); +unlock: + mutex_unlock(&link_ops_mutex); + return err; } EXPORT_SYMBOL_GPL(rtnl_link_register); @@ -336,22 +645,27 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) unregister_netdevice_many(&list_kill); } -/** - * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. - * @ops: struct rtnl_link_ops * to unregister - * - * The caller must hold the rtnl_mutex. +/* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace. */ -void __rtnl_link_unregister(struct rtnl_link_ops *ops) +static void rtnl_lock_unregistering_all(void) { - struct net *net; + DEFINE_WAIT_FUNC(wait, woken_wake_function); - for_each_net(net) { - __rtnl_kill_links(net, ops); + add_wait_queue(&netdev_unregistering_wq, &wait); + for (;;) { + rtnl_lock(); + /* We held write locked pernet_ops_rwsem, and parallel + * setup_net() and cleanup_net() are not possible. + */ + if (!atomic_read(&dev_unreg_count)) + break; + __rtnl_unlock(); + + wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } - list_del(&ops->list); + remove_wait_queue(&netdev_unregistering_wq, &wait); } -EXPORT_SYMBOL_GPL(__rtnl_link_unregister); /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. @@ -359,12 +673,51 @@ EXPORT_SYMBOL_GPL(__rtnl_link_unregister); */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { - rtnl_lock(); - __rtnl_link_unregister(ops); + struct net *net; + + mutex_lock(&link_ops_mutex); + list_del_rcu(&ops->list); + mutex_unlock(&link_ops_mutex); + + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); + + /* Close the race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); + rtnl_lock_unregistering_all(); + + for_each_net(net) + __rtnl_kill_links(net, ops); + rtnl_unlock(); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); +static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) +{ + struct net_device *master_dev; + const struct rtnl_link_ops *ops; + size_t size = 0; + + rcu_read_lock(); + + master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); + if (!master_dev) + goto out; + + ops = master_dev->rtnl_link_ops; + if (!ops || !ops->get_slave_size) + goto out; + /* IFLA_INFO_SLAVE_DATA + nested data */ + size = nla_total_size(sizeof(struct nlattr)) + + ops->get_slave_size(master_dev, dev); + +out: + rcu_read_unlock(); + return size; +} + static size_t rtnl_link_get_size(const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; @@ -385,66 +738,60 @@ static size_t rtnl_link_get_size(const struct net_device *dev) /* IFLA_INFO_XSTATS */ size += nla_total_size(ops->get_xstats_size(dev)); + size += rtnl_link_get_slave_info_data_size(dev); + return size; } static LIST_HEAD(rtnl_af_ops); -static const struct rtnl_af_ops *rtnl_af_lookup(const int family) +static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index) { - const struct rtnl_af_ops *ops; + struct rtnl_af_ops *ops; + + ASSERT_RTNL(); + + rcu_read_lock(); - list_for_each_entry(ops, &rtnl_af_ops, list) { - if (ops->family == family) - return ops; + list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { + if (ops->family == family) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } } - return NULL; + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; } -/** - * __rtnl_af_register - Register rtnl_af_ops with rtnetlink. - * @ops: struct rtnl_af_ops * to register - * - * The caller must hold the rtnl_mutex. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_af_register(struct rtnl_af_ops *ops) +static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index) { - list_add_tail(&ops->list, &rtnl_af_ops); - return 0; + srcu_read_unlock(&ops->srcu, srcu_index); } -EXPORT_SYMBOL_GPL(__rtnl_af_register); /** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * - * Returns 0 on success or a negative error code. + * Return: 0 on success or a negative error code. */ int rtnl_af_register(struct rtnl_af_ops *ops) { - int err; + int err = init_srcu_struct(&ops->srcu); + + if (err) + return err; rtnl_lock(); - err = __rtnl_af_register(ops); + list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); - return err; -} -EXPORT_SYMBOL_GPL(rtnl_af_register); -/** - * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. - * @ops: struct rtnl_af_ops * to unregister - * - * The caller must hold the rtnl_mutex. - */ -void __rtnl_af_unregister(struct rtnl_af_ops *ops) -{ - list_del(&ops->list); + return 0; } -EXPORT_SYMBOL_GPL(__rtnl_af_unregister); +EXPORT_SYMBOL_GPL(rtnl_af_register); /** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. @@ -453,12 +800,17 @@ EXPORT_SYMBOL_GPL(__rtnl_af_unregister); void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); - __rtnl_af_unregister(ops); + list_del_rcu(&ops->list); rtnl_unlock(); + + synchronize_rcu(); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); -static size_t rtnl_link_get_af_size(const struct net_device *dev) +static size_t rtnl_link_get_af_size(const struct net_device *dev, + u32 ext_filter_mask) { struct rtnl_af_ops *af_ops; size_t size; @@ -466,51 +818,117 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev) /* IFLA_AF_SPEC */ size = nla_total_size(sizeof(struct nlattr)); - list_for_each_entry(af_ops, &rtnl_af_ops, list) { + rcu_read_lock(); + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_link_af_size) { /* AF_* + nested data */ size += nla_total_size(sizeof(struct nlattr)) + - af_ops->get_link_af_size(dev); + af_ops->get_link_af_size(dev, ext_filter_mask); } } + rcu_read_unlock(); return size; } -static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) +static bool rtnl_have_link_slave_info(const struct net_device *dev) { - const struct rtnl_link_ops *ops = dev->rtnl_link_ops; - struct nlattr *linkinfo, *data; - int err = -EMSGSIZE; + struct net_device *master_dev; + bool ret = false; - linkinfo = nla_nest_start(skb, IFLA_LINKINFO); - if (linkinfo == NULL) - goto out; + rcu_read_lock(); + + master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); + if (master_dev && master_dev->rtnl_link_ops) + ret = true; + rcu_read_unlock(); + return ret; +} + +static int rtnl_link_slave_info_fill(struct sk_buff *skb, + const struct net_device *dev) +{ + struct net_device *master_dev; + const struct rtnl_link_ops *ops; + struct nlattr *slave_data; + int err; + + master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + if (!master_dev) + return 0; + ops = master_dev->rtnl_link_ops; + if (!ops) + return 0; + if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) + return -EMSGSIZE; + if (ops->fill_slave_info) { + slave_data = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA); + if (!slave_data) + return -EMSGSIZE; + err = ops->fill_slave_info(skb, master_dev, dev); + if (err < 0) + goto err_cancel_slave_data; + nla_nest_end(skb, slave_data); + } + return 0; + +err_cancel_slave_data: + nla_nest_cancel(skb, slave_data); + return err; +} +static int rtnl_link_info_fill(struct sk_buff *skb, + const struct net_device *dev) +{ + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + struct nlattr *data; + int err; + + if (!ops) + return 0; if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) - goto err_cancel_link; + return -EMSGSIZE; if (ops->fill_xstats) { err = ops->fill_xstats(skb, dev); if (err < 0) - goto err_cancel_link; + return err; } if (ops->fill_info) { - data = nla_nest_start(skb, IFLA_INFO_DATA); - if (data == NULL) { - err = -EMSGSIZE; - goto err_cancel_link; - } + data = nla_nest_start_noflag(skb, IFLA_INFO_DATA); + if (data == NULL) + return -EMSGSIZE; err = ops->fill_info(skb, dev); if (err < 0) goto err_cancel_data; nla_nest_end(skb, data); } - - nla_nest_end(skb, linkinfo); return 0; err_cancel_data: nla_nest_cancel(skb, data); + return err; +} + +static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) +{ + struct nlattr *linkinfo; + int err = -EMSGSIZE; + + linkinfo = nla_nest_start_noflag(skb, IFLA_LINKINFO); + if (linkinfo == NULL) + goto out; + + err = rtnl_link_info_fill(skb, dev); + if (err < 0) + goto err_cancel_link; + + err = rtnl_link_slave_info_fill(skb, dev); + if (err < 0) + goto err_cancel_link; + + nla_nest_end(skb, linkinfo); + return 0; + err_cancel_link: nla_nest_cancel(skb, linkinfo); out: @@ -520,15 +938,8 @@ out: int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; - int err = 0; - NETLINK_CB(skb).dst_group = group; - if (echo) - atomic_inc(&skb->users); - netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); - if (echo) - err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); - return err; + return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); } int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) @@ -540,15 +951,11 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, - struct nlmsghdr *nlh, gfp_t flags) + const struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; - int report = 0; - if (nlh) - report = nlmsg_report(nlh); - - nlmsg_notify(rtnl, skb, pid, group, report, flags); + nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags); } EXPORT_SYMBOL(rtnl_notify); @@ -565,15 +972,37 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) struct nlattr *mx; int i, valid = 0; - mx = nla_nest_start(skb, RTA_METRICS); + /* nothing is dumped for dst_default_metrics, so just skip the loop */ + if (metrics == dst_default_metrics.metrics) + return 0; + + mx = nla_nest_start_noflag(skb, RTA_METRICS); if (mx == NULL) return -ENOBUFS; for (i = 0; i < RTAX_MAX; i++) { if (metrics[i]) { + if (i == RTAX_CC_ALGO - 1) { + char tmp[TCP_CA_NAME_MAX], *name; + + name = tcp_ca_get_name_by_key(metrics[i], tmp); + if (!name) + continue; + if (nla_put_string(skb, i + 1, name)) + goto nla_put_failure; + } else if (i == RTAX_FEATURES - 1) { + u32 user_features = metrics[i] & RTAX_FEATURE_MASK; + + if (!user_features) + continue; + BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); + if (nla_put_u32(skb, i + 1, user_features)) + goto nla_put_failure; + } else { + if (nla_put_u32(skb, i + 1, metrics[i])) + goto nla_put_failure; + } valid++; - if (nla_put_u32(skb, i+1, metrics[i])) - goto nla_put_failure; } } @@ -594,13 +1023,17 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error) { struct rta_cacheinfo ci = { - .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse), - .rta_used = dst->__use, - .rta_clntref = atomic_read(&(dst->__refcnt)), .rta_error = error, .rta_id = id, }; + unsigned long delta; + if (dst) { + delta = jiffies - READ_ONCE(dst->lastuse); + ci.rta_lastuse = jiffies_delta_to_clock_t(delta); + ci.rta_used = dst->__use; + ci.rta_clntref = rcuref_read(&dst->__rcuref); + } if (expires) { unsigned long clock; @@ -612,31 +1045,44 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); +void netif_set_operstate(struct net_device *dev, int newstate) +{ + unsigned int old = READ_ONCE(dev->operstate); + + do { + if (old == newstate) + return; + } while (!try_cmpxchg(&dev->operstate, &old, newstate)); + + netif_state_change(dev); +} +EXPORT_SYMBOL(netif_set_operstate); + static void set_operstate(struct net_device *dev, unsigned char transition) { - unsigned char operstate = dev->operstate; + unsigned char operstate = READ_ONCE(dev->operstate); switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && - !netif_dormant(dev)) + !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; + case IF_OPER_TESTING: + if (netif_oper_up(dev)) + operstate = IF_OPER_TESTING; + break; + case IF_OPER_DORMANT: - if (operstate == IF_OPER_UP || - operstate == IF_OPER_UNKNOWN) + if (netif_oper_up(dev)) operstate = IF_OPER_DORMANT; break; } - if (dev->operstate != operstate) { - write_lock_bh(&dev_base_lock); - dev->operstate = operstate; - write_unlock_bh(&dev_base_lock); - netdev_state_change(dev); - } + netif_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) @@ -688,38 +1134,64 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; -} -static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) -{ - memcpy(v, b, sizeof(*b)); + a->rx_nohandler = b->rx_nohandler; } /* All VF info */ static inline int rtnl_vfinfo_size(const struct net_device *dev, u32 ext_filter_mask) { - if (dev->dev.parent && dev_is_pci(dev->dev.parent) && - (ext_filter_mask & RTEXT_FILTER_VF)) { + if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); - size_t size = nla_total_size(sizeof(struct nlattr)); - size += nla_total_size(num_vfs * sizeof(struct nlattr)); + size_t size = nla_total_size(0); size += num_vfs * - (nla_total_size(sizeof(struct ifla_vf_mac)) + + (nla_total_size(0) + + nla_total_size(sizeof(struct ifla_vf_mac)) + + nla_total_size(sizeof(struct ifla_vf_broadcast)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct ifla_vf_vlan_info)) + + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + - nla_total_size(sizeof(struct ifla_vf_spoofchk))); + nla_total_size(sizeof(struct ifla_vf_rate)) + + nla_total_size(sizeof(struct ifla_vf_link_state)) + + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + + nla_total_size(sizeof(struct ifla_vf_trust))); + if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { + size += num_vfs * + (nla_total_size(0) + /* nest IFLA_VF_STATS */ + /* IFLA_VF_STATS_RX_PACKETS */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_PACKETS */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_BYTES */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_BYTES */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_BROADCAST */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_MULTICAST */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_DROPPED */ + nla_total_size_64bit(sizeof(__u64))); + } + if (dev->netdev_ops->ndo_get_vf_guid) + size += num_vfs * 2 * + nla_total_size(sizeof(struct ifla_vf_guid)); return size; } else return 0; } -static size_t rtnl_port_size(const struct net_device *dev) +static size_t rtnl_port_size(const struct net_device *dev, + u32 ext_filter_mask) { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ - + nla_total_size(sizeof(struct ifla_port_vsi)) - /* PORT_VSI_TYPE */ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + nla_total_size(1) /* PROT_VDP_REQUEST */ @@ -730,7 +1202,8 @@ static size_t rtnl_port_size(const struct net_device *dev) size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + port_size; - if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || + !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; if (dev_num_vf(dev->dev.parent)) return port_self_size + vf_ports_size + @@ -739,16 +1212,71 @@ static size_t rtnl_port_size(const struct net_device *dev) return port_self_size; } +static size_t rtnl_xdp_size(void) +{ + size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ + nla_total_size(1) + /* XDP_ATTACHED */ + nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ + nla_total_size(4); /* XDP_<mode>_PROG_ID */ + + return xdp_size; +} + +static size_t rtnl_prop_list_size(const struct net_device *dev) +{ + struct netdev_name_node *name_node; + unsigned int cnt = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(name_node, &dev->name_node->list, list) + cnt++; + rcu_read_unlock(); + + if (!cnt) + return 0; + + return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ); +} + +static size_t rtnl_proto_down_size(const struct net_device *dev) +{ + size_t size = nla_total_size(1); + + /* Assume dev->proto_down_reason is not zero. */ + size += nla_total_size(0) + nla_total_size(4); + + return size; +} + +static size_t rtnl_devlink_port_size(const struct net_device *dev) +{ + size_t size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */ + + if (dev->devlink_port) + size += devlink_nl_port_handle_size(dev->devlink_port); + + return size; +} + +static size_t rtnl_dpll_pin_size(const struct net_device *dev) +{ + size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */ + + size += dpll_netdev_pin_handle_size(dev); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { - return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + size_t size; + + size = NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ - + nla_total_size(sizeof(struct rtnl_link_ifmap)) - + nla_total_size(sizeof(struct rtnl_link_stats)) - + nla_total_size(sizeof(struct rtnl_link_stats64)) + + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ @@ -758,16 +1286,55 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + + nla_total_size(4) /* IFLA_ALLMULTI */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */ + + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + + nla_total_size(1) /* IFLA_NETNS_IMMUTABLE */ + + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + + nla_total_size(4) /* IFLA_LINK_NETNSID */ + + nla_total_size(4) /* IFLA_GROUP */ + nla_total_size(ext_filter_mask & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ - + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ - + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ + + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + + rtnl_xdp_size() /* IFLA_XDP */ + + nla_total_size(4) /* IFLA_EVENT */ + + nla_total_size(4) /* IFLA_NEW_NETNSID */ + + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + + rtnl_proto_down_size(dev) /* proto down */ + + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + + nla_total_size(4) /* IFLA_MIN_MTU */ + + nla_total_size(4) /* IFLA_MAX_MTU */ + + rtnl_prop_list_size(dev) + + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + + rtnl_devlink_port_size(dev) + + rtnl_dpll_pin_size(dev) + + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + + nla_total_size(2) /* IFLA_HEADROOM */ + + nla_total_size(2) /* IFLA_TAILROOM */ + + 0; + + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) + size += nla_total_size(sizeof(struct rtnl_link_stats)) + + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); + + return size; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -777,12 +1344,12 @@ static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) int vf; int err; - vf_ports = nla_nest_start(skb, IFLA_VF_PORTS); + vf_ports = nla_nest_start_noflag(skb, IFLA_VF_PORTS); if (!vf_ports) return -EMSGSIZE; for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { - vf_port = nla_nest_start(skb, IFLA_VF_PORT); + vf_port = nla_nest_start_noflag(skb, IFLA_VF_PORT); if (!vf_port) goto nla_put_failure; if (nla_put_u32(skb, IFLA_PORT_VF, vf)) @@ -811,7 +1378,7 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) struct nlattr *port_self; int err; - port_self = nla_nest_start(skb, IFLA_PORT_SELF); + port_self = nla_nest_start_noflag(skb, IFLA_PORT_SELF); if (!port_self) return -EMSGSIZE; @@ -826,11 +1393,13 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) return 0; } -static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) +static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, + u32 ext_filter_mask) { int err; - if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || + !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; err = rtnl_port_self_fill(skb, dev); @@ -846,17 +1415,626 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) return 0; } -static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, +static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + struct netdev_phys_item_id ppid; + + err = dev_get_phys_port_id(dev, &ppid); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) + return -EMSGSIZE; + + return 0; +} + +static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) +{ + char name[IFNAMSIZ]; + int err; + + err = dev_get_phys_port_name(dev, name, sizeof(name)); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) + return -EMSGSIZE; + + return 0; +} + +static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct netdev_phys_item_id ppid = { }; + int err; + + err = netif_get_port_parent_id(dev, &ppid, false); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id)) + return -EMSGSIZE; + + return 0; +} + +static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, + struct net_device *dev) +{ + struct rtnl_link_stats64 *sp; + struct nlattr *attr; + + attr = nla_reserve_64bit(skb, IFLA_STATS64, + sizeof(struct rtnl_link_stats64), IFLA_PAD); + if (!attr) + return -EMSGSIZE; + + sp = nla_data(attr); + dev_get_stats(dev, sp); + + attr = nla_reserve(skb, IFLA_STATS, + sizeof(struct rtnl_link_stats)); + if (!attr) + return -EMSGSIZE; + + copy_rtnl_link_stats(nla_data(attr), sp); + + return 0; +} + +static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, + struct net_device *dev, + int vfs_num, + u32 ext_filter_mask) +{ + struct ifla_vf_rss_query_en vf_rss_query_en; + struct nlattr *vf, *vfstats, *vfvlanlist; + struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_vlan_info vf_vlan_info; + struct ifla_vf_spoofchk vf_spoofchk; + struct ifla_vf_tx_rate vf_tx_rate; + struct ifla_vf_stats vf_stats; + struct ifla_vf_trust vf_trust; + struct ifla_vf_vlan vf_vlan; + struct ifla_vf_rate vf_rate; + struct ifla_vf_mac vf_mac; + struct ifla_vf_broadcast vf_broadcast; + struct ifla_vf_info ivi; + struct ifla_vf_guid node_guid; + struct ifla_vf_guid port_guid; + + memset(&ivi, 0, sizeof(ivi)); + + /* Not all SR-IOV capable drivers support the + * spoofcheck and "RSS query enable" query. Preset to + * -1 so the user space tool can detect that the driver + * didn't report anything. + */ + ivi.spoofchk = -1; + ivi.rss_query_en = -1; + ivi.trusted = -1; + /* The default value for VF link state is "auto" + * IFLA_VF_LINK_STATE_AUTO which equals zero + */ + ivi.linkstate = 0; + /* VLAN Protocol by default is 802.1Q */ + ivi.vlan_proto = htons(ETH_P_8021Q); + if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) + return 0; + + memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); + memset(&node_guid, 0, sizeof(node_guid)); + memset(&port_guid, 0, sizeof(port_guid)); + + vf_mac.vf = + vf_vlan.vf = + vf_vlan_info.vf = + vf_rate.vf = + vf_tx_rate.vf = + vf_spoofchk.vf = + vf_linkstate.vf = + vf_rss_query_en.vf = + vf_trust.vf = + node_guid.vf = + port_guid.vf = ivi.vf; + + memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); + vf_vlan.vlan = ivi.vlan; + vf_vlan.qos = ivi.qos; + vf_vlan_info.vlan = ivi.vlan; + vf_vlan_info.qos = ivi.qos; + vf_vlan_info.vlan_proto = ivi.vlan_proto; + vf_tx_rate.rate = ivi.max_tx_rate; + vf_rate.min_tx_rate = ivi.min_tx_rate; + vf_rate.max_tx_rate = ivi.max_tx_rate; + vf_spoofchk.setting = ivi.spoofchk; + vf_linkstate.link_state = ivi.linkstate; + vf_rss_query_en.setting = ivi.rss_query_en; + vf_trust.setting = ivi.trusted; + vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); + if (!vf) + return -EMSGSIZE; + if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || + nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || + nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || + nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), + &vf_rate) || + nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), + &vf_tx_rate) || + nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), + &vf_spoofchk) || + nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), + &vf_linkstate) || + nla_put(skb, IFLA_VF_RSS_QUERY_EN, + sizeof(vf_rss_query_en), + &vf_rss_query_en) || + nla_put(skb, IFLA_VF_TRUST, + sizeof(vf_trust), &vf_trust)) + goto nla_put_vf_failure; + + if (dev->netdev_ops->ndo_get_vf_guid && + !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid, + &port_guid)) { + if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid), + &node_guid) || + nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid), + &port_guid)) + goto nla_put_vf_failure; + } + vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST); + if (!vfvlanlist) + goto nla_put_vf_failure; + if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), + &vf_vlan_info)) { + nla_nest_cancel(skb, vfvlanlist); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfvlanlist); + if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { + memset(&vf_stats, 0, sizeof(vf_stats)); + if (dev->netdev_ops->ndo_get_vf_stats) + dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, + &vf_stats); + vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); + if (!vfstats) + goto nla_put_vf_failure; + if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, + vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, + vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { + nla_nest_cancel(skb, vfstats); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfstats); + } + nla_nest_end(skb, vf); + return 0; + +nla_put_vf_failure: + nla_nest_cancel(skb, vf); + return -EMSGSIZE; +} + +static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, + struct net_device *dev, + u32 ext_filter_mask) +{ + struct nlattr *vfinfo; + int i, num_vfs; + + if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) + return 0; + + num_vfs = dev_num_vf(dev->dev.parent); + if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) + return -EMSGSIZE; + + if (!dev->netdev_ops->ndo_get_vf_config) + return 0; + + vfinfo = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST); + if (!vfinfo) + return -EMSGSIZE; + + for (i = 0; i < num_vfs; i++) { + if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) { + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; + } + } + + nla_nest_end(skb, vfinfo); + return 0; +} + +static int rtnl_fill_link_ifmap(struct sk_buff *skb, + const struct net_device *dev) +{ + struct rtnl_link_ifmap map; + + memset(&map, 0, sizeof(map)); + map.mem_start = READ_ONCE(dev->mem_start); + map.mem_end = READ_ONCE(dev->mem_end); + map.base_addr = READ_ONCE(dev->base_addr); + map.irq = READ_ONCE(dev->irq); + map.dma = READ_ONCE(dev->dma); + map.port = READ_ONCE(dev->if_port); + + if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) + return -EMSGSIZE; + + return 0; +} + +static u32 rtnl_xdp_prog_skb(struct net_device *dev) +{ + const struct bpf_prog *generic_xdp_prog; + u32 res = 0; + + rcu_read_lock(); + generic_xdp_prog = rcu_dereference(dev->xdp_prog); + if (generic_xdp_prog) + res = generic_xdp_prog->aux->id; + rcu_read_unlock(); + + return res; +} + +static u32 rtnl_xdp_prog_drv(struct net_device *dev) +{ + return dev_xdp_prog_id(dev, XDP_MODE_DRV); +} + +static u32 rtnl_xdp_prog_hw(struct net_device *dev) +{ + return dev_xdp_prog_id(dev, XDP_MODE_HW); +} + +static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, + u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, + u32 (*get_prog_id)(struct net_device *dev)) +{ + u32 curr_id; + int err; + + curr_id = get_prog_id(dev); + if (!curr_id) + return 0; + + *prog_id = curr_id; + err = nla_put_u32(skb, attr, curr_id); + if (err) + return err; + + if (*mode != XDP_ATTACHED_NONE) + *mode = XDP_ATTACHED_MULTI; + else + *mode = tgt_mode; + + return 0; +} + +static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct nlattr *xdp; + u32 prog_id; + int err; + u8 mode; + + xdp = nla_nest_start_noflag(skb, IFLA_XDP); + if (!xdp) + return -EMSGSIZE; + + prog_id = 0; + mode = XDP_ATTACHED_NONE; + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, + IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb); + if (err) + goto err_cancel; + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, + IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv); + if (err) + goto err_cancel; + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, + IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw); + if (err) + goto err_cancel; + + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); + if (err) + goto err_cancel; + + if (prog_id && mode != XDP_ATTACHED_MULTI) { + err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); + if (err) + goto err_cancel; + } + + nla_nest_end(skb, xdp); + return 0; + +err_cancel: + nla_nest_cancel(skb, xdp); + return err; +} + +static u32 rtnl_get_event(unsigned long event) +{ + u32 rtnl_event_type = IFLA_EVENT_NONE; + + switch (event) { + case NETDEV_REBOOT: + rtnl_event_type = IFLA_EVENT_REBOOT; + break; + case NETDEV_FEAT_CHANGE: + rtnl_event_type = IFLA_EVENT_FEATURES; + break; + case NETDEV_BONDING_FAILOVER: + rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; + break; + case NETDEV_NOTIFY_PEERS: + rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; + break; + case NETDEV_RESEND_IGMP: + rtnl_event_type = IFLA_EVENT_IGMP_RESEND; + break; + case NETDEV_CHANGEINFODATA: + rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; + break; + default: + break; + } + + return rtnl_event_type; +} + +static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) +{ + const struct net_device *upper_dev; + int ret = 0; + + rcu_read_lock(); + + upper_dev = netdev_master_upper_dev_get_rcu(dev); + if (upper_dev) + ret = nla_put_u32(skb, IFLA_MASTER, + READ_ONCE(upper_dev->ifindex)); + + rcu_read_unlock(); + return ret; +} + +static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, + bool force) +{ + int iflink = dev_get_iflink(dev); + + if (force || READ_ONCE(dev->ifindex) != iflink) + return nla_put_u32(skb, IFLA_LINK, iflink); + + return 0; +} + +static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, + struct net_device *dev) +{ + char buf[IFALIASZ]; + int ret; + + ret = dev_get_alias(dev, buf, sizeof(buf)); + return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; +} + +static int rtnl_fill_link_netnsid(struct sk_buff *skb, + const struct net_device *dev, + struct net *src_net, gfp_t gfp) +{ + bool put_iflink = false; + + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { + struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); + + if (!net_eq(dev_net(dev), link_net)) { + int id = peernet2id_alloc(src_net, link_net, gfp); + + if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) + return -EMSGSIZE; + + put_iflink = true; + } + } + + return nla_put_iflink(skb, dev, put_iflink); +} + +static int rtnl_fill_link_af(struct sk_buff *skb, + const struct net_device *dev, + u32 ext_filter_mask) +{ + const struct rtnl_af_ops *af_ops; + struct nlattr *af_spec; + + af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); + if (!af_spec) + return -EMSGSIZE; + + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { + struct nlattr *af; + int err; + + if (!af_ops->fill_link_af) + continue; + + af = nla_nest_start_noflag(skb, af_ops->family); + if (!af) + return -EMSGSIZE; + + err = af_ops->fill_link_af(skb, dev, ext_filter_mask); + /* + * Caller may return ENODATA to indicate that there + * was no data to be dumped. This is not an error, it + * means we should trim the attribute header and + * continue. + */ + if (err == -ENODATA) + nla_nest_cancel(skb, af); + else if (err < 0) + return -EMSGSIZE; + + nla_nest_end(skb, af); + } + + nla_nest_end(skb, af_spec); + return 0; +} + +static int rtnl_fill_alt_ifnames(struct sk_buff *skb, + const struct net_device *dev) +{ + struct netdev_name_node *name_node; + int count = 0; + + list_for_each_entry_rcu(name_node, &dev->name_node->list, list) { + if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) + return -EMSGSIZE; + count++; + } + return count; +} + +/* RCU protected. */ +static int rtnl_fill_prop_list(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *prop_list; + int ret; + + prop_list = nla_nest_start(skb, IFLA_PROP_LIST); + if (!prop_list) + return -EMSGSIZE; + + ret = rtnl_fill_alt_ifnames(skb, dev); + if (ret <= 0) + goto nest_cancel; + + nla_nest_end(skb, prop_list); + return 0; + +nest_cancel: + nla_nest_cancel(skb, prop_list); + return ret; +} + +static int rtnl_fill_proto_down(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *pr; + u32 preason; + + if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down))) + goto nla_put_failure; + + preason = READ_ONCE(dev->proto_down_reason); + if (!preason) + return 0; + + pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); + if (!pr) + return -EMSGSIZE; + + if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { + nla_nest_cancel(skb, pr); + goto nla_put_failure; + } + + nla_nest_end(skb, pr); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int rtnl_fill_devlink_port(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *devlink_port_nest; + int ret; + + devlink_port_nest = nla_nest_start(skb, IFLA_DEVLINK_PORT); + if (!devlink_port_nest) + return -EMSGSIZE; + + if (dev->devlink_port) { + ret = devlink_nl_port_handle_fill(skb, dev->devlink_port); + if (ret < 0) + goto nest_cancel; + } + + nla_nest_end(skb, devlink_port_nest); + return 0; + +nest_cancel: + nla_nest_cancel(skb, devlink_port_nest); + return ret; +} + +static int rtnl_fill_dpll_pin(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *dpll_pin_nest; + int ret; + + dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN); + if (!dpll_pin_nest) + return -EMSGSIZE; + + ret = dpll_netdev_add_pin_handle(skb, dev); + if (ret < 0) + goto nest_cancel; + + nla_nest_end(skb, dpll_pin_nest); + return 0; + +nest_cancel: + nla_nest_cancel(skb, dpll_pin_nest); + return ret; +} + +static int rtnl_fill_ifinfo(struct sk_buff *skb, + struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, u32 ext_filter_mask) + unsigned int flags, u32 ext_filter_mask, + u32 event, int *new_nsid, int new_ifindex, + int tgt_netnsid, gfp_t gfp) { + char devname[IFNAMSIZ]; struct ifinfomsg *ifm; struct nlmsghdr *nlh; - struct rtnl_link_stats64 temp; - const struct rtnl_link_stats64 *stats; - struct nlattr *attr, *af_spec; - struct rtnl_af_ops *af_ops; - struct net_device *upper_dev = netdev_master_upper_dev_get(dev); + struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -866,44 +2044,73 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; - ifm->ifi_type = dev->type; - ifm->ifi_index = dev->ifindex; - ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_type = READ_ONCE(dev->type); + ifm->ifi_index = READ_ONCE(dev->ifindex); + ifm->ifi_flags = netif_get_flags(dev); ifm->ifi_change = change; - if (nla_put_string(skb, IFLA_IFNAME, dev->name) || - nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || + if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) + goto nla_put_failure; + + netdev_copy_name(dev, devname); + if (nla_put_string(skb, IFLA_IFNAME, devname)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) || nla_put_u8(skb, IFLA_OPERSTATE, - netif_running(dev) ? dev->operstate : IF_OPER_DOWN) || - nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || - nla_put_u32(skb, IFLA_MTU, dev->mtu) || - nla_put_u32(skb, IFLA_GROUP, dev->group) || - nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || - nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || + netif_running(dev) ? READ_ONCE(dev->operstate) : + IF_OPER_DOWN) || + nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || + nla_put_u8(skb, IFLA_NETNS_IMMUTABLE, dev->netns_immutable) || + nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || + nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || + nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || + nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) || + nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) || + nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) || + nla_put_u32(skb, IFLA_NUM_TX_QUEUES, + READ_ONCE(dev->num_tx_queues)) || + nla_put_u32(skb, IFLA_GSO_MAX_SEGS, + READ_ONCE(dev->gso_max_segs)) || + nla_put_u32(skb, IFLA_GSO_MAX_SIZE, + READ_ONCE(dev->gso_max_size)) || + nla_put_u32(skb, IFLA_GRO_MAX_SIZE, + READ_ONCE(dev->gro_max_size)) || + nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, + READ_ONCE(dev->gso_ipv4_max_size)) || + nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, + READ_ONCE(dev->gro_ipv4_max_size)) || + nla_put_u32(skb, IFLA_TSO_MAX_SIZE, + READ_ONCE(dev->tso_max_size)) || + nla_put_u32(skb, IFLA_TSO_MAX_SEGS, + READ_ONCE(dev->tso_max_segs)) || + nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON, + READ_ONCE(dev->max_pacing_offload_horizon)) || #ifdef CONFIG_RPS - nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || + nla_put_u32(skb, IFLA_NUM_RX_QUEUES, + READ_ONCE(dev->num_rx_queues)) || #endif - (dev->ifindex != dev->iflink && - nla_put_u32(skb, IFLA_LINK, dev->iflink)) || - (upper_dev && - nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) || + put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || - (dev->qdisc && - nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || - (dev->ifalias && - nla_put_string(skb, IFLA_IFALIAS, dev->ifalias))) + nla_put_ifalias(skb, dev) || + nla_put_u32(skb, IFLA_CARRIER_CHANGES, + atomic_read(&dev->carrier_up_count) + + atomic_read(&dev->carrier_down_count)) || + nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, + atomic_read(&dev->carrier_up_count)) || + nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, + atomic_read(&dev->carrier_down_count)) || + nla_put_u16(skb, IFLA_HEADROOM, + READ_ONCE(dev->needed_headroom)) || + nla_put_u16(skb, IFLA_TAILROOM, + READ_ONCE(dev->needed_tailroom))) + goto nla_put_failure; + + if (rtnl_fill_proto_down(skb, dev)) goto nla_put_failure; - if (1) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; - if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) + if (event != IFLA_EVENT_NONE) { + if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; } @@ -913,182 +2120,86 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } - attr = nla_reserve(skb, IFLA_STATS, - sizeof(struct rtnl_link_stats)); - if (attr == NULL) + if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; - stats = dev_get_stats(dev, &temp); - copy_rtnl_link_stats(nla_data(attr), stats); - - attr = nla_reserve(skb, IFLA_STATS64, - sizeof(struct rtnl_link_stats64)); - if (attr == NULL) + if (rtnl_phys_port_name_fill(skb, dev)) goto nla_put_failure; - copy_rtnl_link_stats64(nla_data(attr), stats); - if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) && - nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent))) + if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent - && (ext_filter_mask & RTEXT_FILTER_VF)) { - int i; + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS) && + rtnl_fill_stats(skb, dev)) + goto nla_put_failure; - struct nlattr *vfinfo, *vf; - int num_vfs = dev_num_vf(dev->dev.parent); + if (rtnl_fill_vf(skb, dev, ext_filter_mask)) + goto nla_put_failure; - vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); - if (!vfinfo) - goto nla_put_failure; - for (i = 0; i < num_vfs; i++) { - struct ifla_vf_info ivi; - struct ifla_vf_mac vf_mac; - struct ifla_vf_vlan vf_vlan; - struct ifla_vf_tx_rate vf_tx_rate; - struct ifla_vf_spoofchk vf_spoofchk; - struct ifla_vf_link_state vf_linkstate; - - /* - * Not all SR-IOV capable drivers support the - * spoofcheck query. Preset to -1 so the user - * space tool can detect that the driver didn't - * report anything. - */ - ivi.spoofchk = -1; - memset(ivi.mac, 0, sizeof(ivi.mac)); - /* The default value for VF link state is "auto" - * IFLA_VF_LINK_STATE_AUTO which equals zero - */ - ivi.linkstate = 0; - if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) - break; - vf_mac.vf = - vf_vlan.vf = - vf_tx_rate.vf = - vf_spoofchk.vf = - vf_linkstate.vf = ivi.vf; - - memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); - vf_vlan.vlan = ivi.vlan; - vf_vlan.qos = ivi.qos; - vf_tx_rate.rate = ivi.tx_rate; - vf_spoofchk.setting = ivi.spoofchk; - vf_linkstate.link_state = ivi.linkstate; - vf = nla_nest_start(skb, IFLA_VF_INFO); - if (!vf) { - nla_nest_cancel(skb, vfinfo); - goto nla_put_failure; - } - if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || - nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || - nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), - &vf_tx_rate) || - nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), - &vf_spoofchk) || - nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), - &vf_linkstate)) - goto nla_put_failure; - nla_nest_end(skb, vf); - } - nla_nest_end(skb, vfinfo); - } + if (rtnl_port_fill(skb, dev, ext_filter_mask)) + goto nla_put_failure; - if (rtnl_port_fill(skb, dev)) + if (rtnl_xdp_fill(skb, dev)) goto nla_put_failure; - if (dev->rtnl_link_ops) { + if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; } - if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC))) + if (new_nsid && + nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) + goto nla_put_failure; + if (new_ifindex && + nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) goto nla_put_failure; - list_for_each_entry(af_ops, &rtnl_af_ops, list) { - if (af_ops->fill_link_af) { - struct nlattr *af; - int err; + if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) && + nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr)) + goto nla_put_failure; - if (!(af = nla_nest_start(skb, af_ops->family))) - goto nla_put_failure; + rcu_read_lock(); + if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC)) + goto nla_put_failure_rcu; + qdisc = rcu_dereference(dev->qdisc); + if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) + goto nla_put_failure_rcu; + if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) + goto nla_put_failure_rcu; + if (rtnl_fill_link_ifmap(skb, dev)) + goto nla_put_failure_rcu; + if (rtnl_fill_prop_list(skb, dev)) + goto nla_put_failure_rcu; + rcu_read_unlock(); - err = af_ops->fill_link_af(skb, dev); + if (dev->dev.parent && + nla_put_string(skb, IFLA_PARENT_DEV_NAME, + dev_name(dev->dev.parent))) + goto nla_put_failure; - /* - * Caller may return ENODATA to indicate that there - * was no data to be dumped. This is not an error, it - * means we should trim the attribute header and - * continue. - */ - if (err == -ENODATA) - nla_nest_cancel(skb, af); - else if (err < 0) - goto nla_put_failure; + if (dev->dev.parent && dev->dev.parent->bus && + nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME, + dev->dev.parent->bus->name)) + goto nla_put_failure; - nla_nest_end(skb, af); - } - } + if (rtnl_fill_devlink_port(skb, dev)) + goto nla_put_failure; - nla_nest_end(skb, af_spec); + if (rtnl_fill_dpll_pin(skb, dev)) + goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; +nla_put_failure_rcu: + rcu_read_unlock(); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } -static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - int h, s_h; - int idx = 0, s_idx; - struct net_device *dev; - struct hlist_head *head; - struct nlattr *tb[IFLA_MAX+1]; - u32 ext_filter_mask = 0; - - s_h = cb->args[0]; - s_idx = cb->args[1]; - - rcu_read_lock(); - cb->seq = net->dev_base_seq; - - if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, - ifla_policy) >= 0) { - - if (tb[IFLA_EXT_MASK]) - ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); - } - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - NLM_F_MULTI, - ext_filter_mask) <= 0) - goto out; - - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } - } -out: - rcu_read_unlock(); - cb->args[1] = idx; - cb->args[0] = h; - - return skb->len; -} - -const struct nla_policy ifla_policy[IFLA_MAX+1] = { +static const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN }, [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, @@ -1104,7 +2215,10 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_NET_NS_FD] = { .type = NLA_U32 }, - [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, + /* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to + * allow 0-length string (needed to remove an alias). + */ + [IFLA_IFALIAS] = { .type = NLA_BINARY, .len = IFALIASZ - 1 }, [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, [IFLA_VF_PORTS] = { .type = NLA_NESTED }, [IFLA_PORT_SELF] = { .type = NLA_NESTED }, @@ -1113,46 +2227,339 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), + [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, + [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ + [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, + [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, + [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, + [IFLA_XDP] = { .type = NLA_NESTED }, + [IFLA_EVENT] = { .type = NLA_U32 }, + [IFLA_GROUP] = { .type = NLA_U32 }, + [IFLA_TARGET_NETNSID] = { .type = NLA_S32 }, + [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, + [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, + [IFLA_MIN_MTU] = { .type = NLA_U32 }, + [IFLA_MAX_MTU] = { .type = NLA_U32 }, + [IFLA_PROP_LIST] = { .type = NLA_NESTED }, + [IFLA_ALT_IFNAME] = { .type = NLA_STRING, + .len = ALTIFNAMSIZ - 1 }, + [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, + [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, + [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), + [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, + [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, + [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, + [IFLA_ALLMULTI] = { .type = NLA_REJECT }, + [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), + [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, + [IFLA_HEADROOM] = { .type = NLA_REJECT }, + [IFLA_TAILROOM] = { .type = NLA_REJECT }, }; -EXPORT_SYMBOL(ifla_policy); static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_KIND] = { .type = NLA_STRING }, [IFLA_INFO_DATA] = { .type = NLA_NESTED }, -}; - -static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = { - [IFLA_VF_INFO] = { .type = NLA_NESTED }, + [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING }, + [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { - [IFLA_VF_MAC] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_vf_mac) }, - [IFLA_VF_VLAN] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_vf_vlan) }, - [IFLA_VF_TX_RATE] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_vf_tx_rate) }, - [IFLA_VF_SPOOFCHK] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_vf_spoofchk) }, + [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, + [IFLA_VF_BROADCAST] = { .type = NLA_REJECT }, + [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, + [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, + [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, + [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, + [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, + [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, + [IFLA_VF_STATS] = { .type = NLA_NESTED }, + [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, + [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) }, + [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_VF] = { .type = NLA_U32 }, [IFLA_PORT_PROFILE] = { .type = NLA_STRING, .len = PORT_PROFILE_MAX }, - [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_port_vsi)}, [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, .len = PORT_UUID_MAX }, [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, .len = PORT_UUID_MAX }, [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, + + /* Unused, but we need to keep it here since user space could + * fill it. It's also broken with regard to NLA_BINARY use in + * combination with structs. + */ + [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_port_vsi) }, }; -struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { + [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD }, + [IFLA_XDP_FD] = { .type = NLA_S32 }, + [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 }, + [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, + [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, + [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, +}; + +static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla, + int *ops_srcu_index) +{ + struct nlattr *linfo[IFLA_INFO_MAX + 1]; + struct rtnl_link_ops *ops = NULL; + + if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) + return NULL; + + if (linfo[IFLA_INFO_KIND]) { + char kind[MODULE_NAME_LEN]; + + nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); + ops = rtnl_link_ops_get(kind, ops_srcu_index); + } + + return ops; +} + +static bool link_master_filtered(struct net_device *dev, int master_idx) +{ + struct net_device *master; + + if (!master_idx) + return false; + + master = netdev_master_upper_dev_get(dev); + + /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need + * another invalid value for ifindex to denote "no master". + */ + if (master_idx == -1) + return !!master; + + if (!master || master->ifindex != master_idx) + return true; + + return false; +} + +static bool link_kind_filtered(const struct net_device *dev, + const struct rtnl_link_ops *kind_ops) +{ + if (kind_ops && dev->rtnl_link_ops != kind_ops) + return true; + + return false; +} + +static bool link_dump_filtered(struct net_device *dev, + int master_idx, + const struct rtnl_link_ops *kind_ops) +{ + if (link_master_filtered(dev, master_idx) || + link_kind_filtered(dev, kind_ops)) + return true; + + return false; +} + +/** + * rtnl_get_net_ns_capable - Get netns if sufficiently privileged. + * @sk: netlink socket + * @netnsid: network namespace identifier + * + * Returns the network namespace identified by netnsid on success or an error + * pointer on failure. + */ +struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) { struct net *net; + + net = get_net_ns_by_id(sock_net(sk), netnsid); + if (!net) + return ERR_PTR(-EINVAL); + + /* For now, the caller is required to have CAP_NET_ADMIN in + * the user namespace owning the target net ns. + */ + if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return ERR_PTR(-EACCES); + } + return net; +} +EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); + +static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, + bool strict_check, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int hdrlen; + + if (strict_check) { + struct ifinfomsg *ifm; + + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { + NL_SET_ERR_MSG(extack, "Invalid header for link dump"); + return -EINVAL; + } + + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change) { + NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); + return -EINVAL; + } + if (ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps"); + return -EINVAL; + } + + return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, + IFLA_MAX, ifla_policy, + extack); + } + + /* A hack to preserve kernel<->userspace interface. + * The correct header is ifinfomsg. It is consistent with rtnl_getlink. + * However, before Linux v3.9 the code here assumed rtgenmsg and that's + * what iproute2 < v3.9.0 used. + * We can detect the old iproute2. Even including the IFLA_EXT_MASK + * attribute, its netlink message is shorter than struct ifinfomsg. + */ + hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? + sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + + return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, + extack); +} + +static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct rtnl_link_ops *kind_ops = NULL; + const struct nlmsghdr *nlh = cb->nlh; + struct net *net = sock_net(skb->sk); + unsigned int flags = NLM_F_MULTI; + struct nlattr *tb[IFLA_MAX+1]; + struct { + unsigned long ifindex; + } *ctx = (void *)cb->ctx; + struct net *tgt_net = net; + u32 ext_filter_mask = 0; + struct net_device *dev; + int ops_srcu_index; + int master_idx = 0; + int netnsid = -1; + int err, i; + + err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); + if (err < 0) { + if (cb->strict_check) + return err; + + goto walk_entries; + } + + for (i = 0; i <= IFLA_MAX; ++i) { + if (!tb[i]) + continue; + + /* new attributes should only be added with strict checking */ + switch (i) { + case IFLA_TARGET_NETNSID: + netnsid = nla_get_s32(tb[i]); + tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); + if (IS_ERR(tgt_net)) { + NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); + err = PTR_ERR(tgt_net); + netnsid = -1; + goto out; + } + break; + case IFLA_EXT_MASK: + ext_filter_mask = nla_get_u32(tb[i]); + break; + case IFLA_MASTER: + master_idx = nla_get_u32(tb[i]); + break; + case IFLA_LINKINFO: + kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index); + break; + default: + if (cb->strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); + err = -EINVAL; + goto out; + } + } + } + + if (master_idx || kind_ops) + flags |= NLM_F_DUMP_FILTERED; + +walk_entries: + err = 0; + for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { + if (link_dump_filtered(dev, master_idx, kind_ops)) + continue; + err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, 0, flags, + ext_filter_mask, 0, NULL, 0, + netnsid, GFP_KERNEL); + if (err < 0) + break; + } + + + cb->seq = tgt_net->dev_base_seq; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + +out: + + if (kind_ops) + rtnl_link_ops_put(kind_ops, ops_srcu_index); + if (netnsid >= 0) + put_net(tgt_net); + + return err; +} + +int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, + struct netlink_ext_ack *exterr) +{ + const struct ifinfomsg *ifmp; + const struct nlattr *attrs; + size_t len; + + ifmp = nla_data(nla_peer); + attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); + len = nla_len(nla_peer) - sizeof(struct ifinfomsg); + + if (ifmp->ifi_index < 0) { + NL_SET_ERR_MSG_ATTR(exterr, nla_peer, + "ifindex can't be negative"); + return -EINVAL; + } + + return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, + exterr); +} +EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); + +static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[]) +{ + struct net *net = NULL; + /* Examine the link attributes and figure out which * network namespace we are talking about. */ @@ -1160,22 +2567,149 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); - else + + return net; +} + +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net = rtnl_link_get_net_ifla(tb); + + if (!net) net = get_net(src_net); + return net; } EXPORT_SYMBOL(rtnl_link_get_net); -static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) +/* Figure out which network namespace we are talking about by + * examining the link attributes in the following order: + * + * 1. IFLA_NET_NS_PID + * 2. IFLA_NET_NS_FD + * 3. IFLA_TARGET_NETNSID + */ +static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, + struct nlattr *tb[]) { - if (dev) { - if (tb[IFLA_ADDRESS] && - nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) - return -EINVAL; + struct net *net; - if (tb[IFLA_BROADCAST] && - nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) - return -EINVAL; + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) + return rtnl_link_get_net(src_net, tb); + + if (!tb[IFLA_TARGET_NETNSID]) + return get_net(src_net); + + net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID])); + if (!net) + return ERR_PTR(-EINVAL); + + return net; +} + +static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, + struct net *src_net, + struct nlattr *tb[], int cap) +{ + struct net *net; + + net = rtnl_link_get_net_by_nlattr(src_net, tb); + if (IS_ERR(net)) + return net; + + if (!netlink_ns_capable(skb, net->user_ns, cap)) { + put_net(net); + return ERR_PTR(-EPERM); + } + + return net; +} + +/* Verify that rtnetlink requests do not pass additional properties + * potentially referring to different network namespaces. + */ +static int rtnl_ensure_unique_netns(struct nlattr *tb[], + struct netlink_ext_ack *extack, + bool netns_id_only) +{ + + if (netns_id_only) { + if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD]) + return 0; + + NL_SET_ERR_MSG(extack, "specified netns attribute not supported"); + return -EOPNOTSUPP; + } + + if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) + goto invalid_attr; + + if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD])) + goto invalid_attr; + + if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID])) + goto invalid_attr; + + return 0; + +invalid_attr: + NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified"); + return -EINVAL; +} + +static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, + int max_tx_rate) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_set_vf_rate) + return -EOPNOTSUPP; + if (max_tx_rate && max_tx_rate < min_tx_rate) + return -EINVAL; + + return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate); +} + +static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + if (tb[IFLA_ADDRESS] && + nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) + return -EINVAL; + + if (tb[IFLA_BROADCAST] && + nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) + return -EINVAL; + + if (tb[IFLA_GSO_MAX_SIZE] && + nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { + NL_SET_ERR_MSG(extack, "too big gso_max_size"); + return -EINVAL; + } + + if (tb[IFLA_GSO_MAX_SEGS] && + (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || + nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { + NL_SET_ERR_MSG(extack, "too big gso_max_segs"); + return -EINVAL; + } + + if (tb[IFLA_GRO_MAX_SIZE] && + nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "too big gro_max_size"); + return -EINVAL; + } + + if (tb[IFLA_GSO_IPV4_MAX_SIZE] && + nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { + NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); + return -EINVAL; + } + + if (tb[IFLA_GRO_IPV4_MAX_SIZE] && + nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); + return -EINVAL; } if (tb[IFLA_AF_SPEC]) { @@ -1183,101 +2717,235 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - if (!(af_ops = rtnl_af_lookup(nla_type(af)))) + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); + if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + else if (af_ops->validate_link_af) + err = af_ops->validate_link_af(dev, af, extack); + else + err = 0; - if (af_ops->validate_link_af) { - err = af_ops->validate_link_af(dev, af); - if (err < 0) - return err; - } + rtnl_af_put(af_ops, af_ops_srcu_index); + + if (err < 0) + return err; } } return 0; } -static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) +static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt, + int guid_type) { - int rem, err = -EINVAL; - struct nlattr *vf; const struct net_device_ops *ops = dev->netdev_ops; - nla_for_each_nested(vf, attr, rem) { - switch (nla_type(vf)) { - case IFLA_VF_MAC: { - struct ifla_vf_mac *ivm; - ivm = nla_data(vf); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_mac) - err = ops->ndo_set_vf_mac(dev, ivm->vf, - ivm->mac); - break; - } - case IFLA_VF_VLAN: { - struct ifla_vf_vlan *ivv; - ivv = nla_data(vf); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_vlan) - err = ops->ndo_set_vf_vlan(dev, ivv->vf, - ivv->vlan, - ivv->qos); - break; - } - case IFLA_VF_TX_RATE: { - struct ifla_vf_tx_rate *ivt; - ivt = nla_data(vf); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_tx_rate) - err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, - ivt->rate); - break; - } - case IFLA_VF_SPOOFCHK: { - struct ifla_vf_spoofchk *ivs; - ivs = nla_data(vf); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_spoofchk) - err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, - ivs->setting); - break; - } - case IFLA_VF_LINK_STATE: { - struct ifla_vf_link_state *ivl; - ivl = nla_data(vf); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_link_state) - err = ops->ndo_set_vf_link_state(dev, ivl->vf, - ivl->link_state); - break; - } - default: - err = -EINVAL; - break; + return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type); +} + +static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) +{ + if (dev->type != ARPHRD_INFINIBAND) + return -EOPNOTSUPP; + + return handle_infiniband_guid(dev, ivt, guid_type); +} + +static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err = -EINVAL; + + if (tb[IFLA_VF_MAC]) { + struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); + + if (ivm->vf >= INT_MAX) + return -EINVAL; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_mac) + err = ops->ndo_set_vf_mac(dev, ivm->vf, + ivm->mac); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_VLAN]) { + struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); + + if (ivv->vf >= INT_MAX) + return -EINVAL; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_vlan) + err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, + ivv->qos, + htons(ETH_P_8021Q)); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_VLAN_LIST]) { + struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; + struct nlattr *attr; + int rem, len = 0; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_vlan) + return err; + + nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { + if (nla_type(attr) != IFLA_VF_VLAN_INFO || + nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) { + return -EINVAL; + } + if (len >= MAX_VLAN_LIST_LEN) + return -EOPNOTSUPP; + ivvl[len] = nla_data(attr); + + len++; } - if (err) - break; + if (len == 0) + return -EINVAL; + + if (ivvl[0]->vf >= INT_MAX) + return -EINVAL; + err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, + ivvl[0]->qos, ivvl[0]->vlan_proto); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_TX_RATE]) { + struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); + struct ifla_vf_info ivf; + + if (ivt->vf >= INT_MAX) + return -EINVAL; + err = -EOPNOTSUPP; + if (ops->ndo_get_vf_config) + err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); + if (err < 0) + return err; + + err = rtnl_set_vf_rate(dev, ivt->vf, + ivf.min_tx_rate, ivt->rate); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_RATE]) { + struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); + + if (ivt->vf >= INT_MAX) + return -EINVAL; + + err = rtnl_set_vf_rate(dev, ivt->vf, + ivt->min_tx_rate, ivt->max_tx_rate); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_SPOOFCHK]) { + struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); + + if (ivs->vf >= INT_MAX) + return -EINVAL; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_spoofchk) + err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, + ivs->setting); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_LINK_STATE]) { + struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]); + + if (ivl->vf >= INT_MAX) + return -EINVAL; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_link_state) + err = ops->ndo_set_vf_link_state(dev, ivl->vf, + ivl->link_state); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_RSS_QUERY_EN]) { + struct ifla_vf_rss_query_en *ivrssq_en; + + err = -EOPNOTSUPP; + ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]); + if (ivrssq_en->vf >= INT_MAX) + return -EINVAL; + if (ops->ndo_set_vf_rss_query_en) + err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf, + ivrssq_en->setting); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_TRUST]) { + struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); + + if (ivt->vf >= INT_MAX) + return -EINVAL; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_trust) + err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_IB_NODE_GUID]) { + struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); + + if (ivt->vf >= INT_MAX) + return -EINVAL; + if (!ops->ndo_set_vf_guid) + return -EOPNOTSUPP; + return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); + } + + if (tb[IFLA_VF_IB_PORT_GUID]) { + struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); + + if (ivt->vf >= INT_MAX) + return -EINVAL; + if (!ops->ndo_set_vf_guid) + return -EOPNOTSUPP; + + return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID); } + return err; } -static int do_set_master(struct net_device *dev, int ifindex) +static int do_set_master(struct net_device *dev, int ifindex, + struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; int err; + /* Release the lower lock, the upper is responsible for locking + * the lower if needed. None of the existing upper devices + * use netdev instance lock, so don't grab it. + */ + if (upper_dev) { if (upper_dev->ifindex == ifindex) return 0; ops = upper_dev->netdev_ops; if (ops->ndo_del_slave) { + netdev_unlock_ops(dev); err = ops->ndo_del_slave(upper_dev, dev); + netdev_lock_ops(dev); if (err) return err; } else { @@ -1291,7 +2959,9 @@ static int do_set_master(struct net_device *dev, int ifindex) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { - err = ops->ndo_add_slave(upper_dev, dev); + netdev_unlock_ops(dev); + err = ops->ndo_add_slave(upper_dev, dev, extack); + netdev_lock_ops(dev); if (err) return err; } else { @@ -1301,29 +2971,102 @@ static int do_set_master(struct net_device *dev, int ifindex) return 0; } -static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, - struct nlattr **tb, char *ifname, int modified) +static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { + [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, + [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, +}; + +static int do_set_proto_down(struct net_device *dev, + struct nlattr *nl_proto_down, + struct nlattr *nl_proto_down_reason, + struct netlink_ext_ack *extack) { - const struct net_device_ops *ops = dev->netdev_ops; + struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; + unsigned long mask = 0; + u32 value; + bool proto_down; int err; - if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) { - struct net *net = rtnl_link_get_net(dev_net(dev), tb); - if (IS_ERR(net)) { - err = PTR_ERR(net); - goto errout; + if (!dev->change_proto_down) { + NL_SET_ERR_MSG(extack, "Protodown not supported by device"); + return -EOPNOTSUPP; + } + + if (nl_proto_down_reason) { + err = nla_parse_nested_deprecated(pdreason, + IFLA_PROTO_DOWN_REASON_MAX, + nl_proto_down_reason, + ifla_proto_down_reason_policy, + NULL); + if (err < 0) + return err; + + if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { + NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); + return -EINVAL; } - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { - err = -EPERM; - goto errout; + + value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); + + if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) + mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); + + netdev_change_proto_down_reason_locked(dev, mask, value); + } + + if (nl_proto_down) { + proto_down = nla_get_u8(nl_proto_down); + + /* Don't turn off protodown if there are active reasons */ + if (!proto_down && dev->proto_down_reason) { + NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); + return -EBUSY; } - err = dev_change_net_namespace(dev, net, ifname); - put_net(net); + err = netif_change_proto_down(dev, proto_down); if (err) - goto errout; - modified = 1; + return err; } + return 0; +} + +#define DO_SETLINK_MODIFIED 0x01 +/* notify flag means notify + modified. */ +#define DO_SETLINK_NOTIFY 0x03 +static int do_setlink(const struct sk_buff *skb, struct net_device *dev, + struct net *tgt_net, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, + struct nlattr **tb, int status) +{ + const struct net_device_ops *ops = dev->netdev_ops; + char ifname[IFNAMSIZ]; + int err; + + err = validate_linkmsg(dev, tb, extack); + if (err < 0) + return err; + + if (tb[IFLA_IFNAME]) + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + else + ifname[0] = '\0'; + + if (!net_eq(tgt_net, dev_net(dev))) { + const char *pat = ifname[0] ? ifname : NULL; + int new_ifindex; + + new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0); + + err = __dev_change_net_namespace(dev, tgt_net, pat, + new_ifindex, extack); + if (err) + return err; + + status |= DO_SETLINK_MODIFIED; + } + + netdev_lock_ops(dev); + if (tb[IFLA_MAP]) { struct rtnl_link_ifmap *u_map; struct ifmap k_map; @@ -1350,39 +3093,40 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_ADDRESS]) { - struct sockaddr *sa; - int len; + struct sockaddr_storage ss = { }; + + netdev_unlock_ops(dev); - len = sizeof(sa_family_t) + dev->addr_len; - sa = kmalloc(len, GFP_KERNEL); - if (!sa) { - err = -ENOMEM; + /* dev_addr_sem is an outer lock, enforce proper ordering */ + down_write(&dev_addr_sem); + netdev_lock_ops(dev); + + ss.ss_family = dev->type; + memcpy(ss.__data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); + err = netif_set_mac_address(dev, &ss, extack); + if (err) { + up_write(&dev_addr_sem); goto errout; } - sa->sa_family = dev->type; - memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), - dev->addr_len); - err = dev_set_mac_address(dev, sa); - kfree(sa); - if (err) - goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; + + up_write(&dev_addr_sem); } if (tb[IFLA_MTU]) { - err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + err = netif_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { - dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); - modified = 1; + netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + status |= DO_SETLINK_NOTIFY; } /* @@ -1391,18 +3135,18 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, * requested. */ if (ifm->ifi_index > 0 && ifname[0]) { - err = dev_change_name(dev, ifname); + err = netif_change_name(dev, ifname); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { - err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), - nla_len(tb[IFLA_IFALIAS])); + err = netif_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), + nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_BROADCAST]) { @@ -1411,49 +3155,112 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, } if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = netif_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); if (err < 0) goto errout; } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_CARRIER]) { - err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); + err = netif_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } - if (tb[IFLA_TXQLEN]) - dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + if (tb[IFLA_TXQLEN]) { + unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); + + err = netif_change_tx_queue_len(dev, value); + if (err) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + + if (tb[IFLA_GSO_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); + + if (dev->gso_max_size ^ max_size) { + netif_set_gso_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GSO_MAX_SEGS]) { + u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); + + if (dev->gso_max_segs ^ max_segs) { + netif_set_gso_max_segs(dev, max_segs); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GRO_MAX_SIZE]) { + u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); + + if (dev->gro_max_size ^ gro_max_size) { + netif_set_gro_max_size(dev, gro_max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GSO_IPV4_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]); + + if (dev->gso_ipv4_max_size ^ max_size) { + netif_set_gso_ipv4_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GRO_IPV4_MAX_SIZE]) { + u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]); + + if (dev->gro_ipv4_max_size ^ gro_max_size) { + netif_set_gro_ipv4_max_size(dev, gro_max_size); + status |= DO_SETLINK_MODIFIED; + } + } if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) { - write_lock_bh(&dev_base_lock); - dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); - write_unlock_bh(&dev_base_lock); + unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); + + if (dev->link_mode ^ value) + status |= DO_SETLINK_NOTIFY; + WRITE_ONCE(dev->link_mode, value); } if (tb[IFLA_VFINFO_LIST]) { + struct nlattr *vfinfo[IFLA_VF_MAX + 1]; struct nlattr *attr; int rem; + nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { - if (nla_type(attr) != IFLA_VF_INFO) { + if (nla_type(attr) != IFLA_VF_INFO || + nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } - err = do_setvfinfo(dev, attr); + err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX, + attr, + ifla_vf_policy, + NULL); + if (err < 0) + goto errout; + err = do_setvfinfo(dev, vfinfo); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } } err = 0; @@ -1469,10 +3276,15 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, goto errout; nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { - if (nla_type(attr) != IFLA_VF_PORT) - continue; - err = nla_parse_nested(port, IFLA_PORT_MAX, - attr, ifla_port_policy); + if (nla_type(attr) != IFLA_VF_PORT || + nla_len(attr) < NLA_HDRLEN) { + err = -EINVAL; + goto errout; + } + err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, + attr, + ifla_port_policy, + NULL); if (err < 0) goto errout; if (!port[IFLA_PORT_VF]) { @@ -1483,7 +3295,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, err = ops->ndo_set_vf_port(dev, vf, port); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } } err = 0; @@ -1491,8 +3303,9 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, if (tb[IFLA_PORT_SELF]) { struct nlattr *port[IFLA_PORT_MAX+1]; - err = nla_parse_nested(port, IFLA_PORT_MAX, - tb[IFLA_PORT_SELF], ifla_port_policy); + err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, + tb[IFLA_PORT_SELF], + ifla_port_policy, NULL); if (err < 0) goto errout; @@ -1501,7 +3314,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_AF_SPEC]) { @@ -1509,135 +3322,302 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; + + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); + if (!af_ops) { + err = -EAFNOSUPPORT; + goto errout; + } - if (!(af_ops = rtnl_af_lookup(nla_type(af)))) - BUG(); + err = af_ops->set_link_af(dev, af, extack); + rtnl_af_put(af_ops, af_ops_srcu_index); - err = af_ops->set_link_af(dev, af); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } } err = 0; + if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { + err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], + tb[IFLA_PROTO_DOWN_REASON], extack); + if (err) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + + if (tb[IFLA_XDP]) { + struct nlattr *xdp[IFLA_XDP_MAX + 1]; + u32 xdp_flags = 0; + + err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX, + tb[IFLA_XDP], + ifla_xdp_policy, NULL); + if (err < 0) + goto errout; + + if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) { + err = -EINVAL; + goto errout; + } + + if (xdp[IFLA_XDP_FLAGS]) { + xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); + if (xdp_flags & ~XDP_FLAGS_MASK) { + err = -EINVAL; + goto errout; + } + if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) { + err = -EINVAL; + goto errout; + } + } + + if (xdp[IFLA_XDP_FD]) { + int expected_fd = -1; + + if (xdp_flags & XDP_FLAGS_REPLACE) { + if (!xdp[IFLA_XDP_EXPECTED_FD]) { + err = -EINVAL; + goto errout; + } + expected_fd = + nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]); + } + + err = dev_change_xdp_fd(dev, extack, + nla_get_s32(xdp[IFLA_XDP_FD]), + expected_fd, + xdp_flags); + if (err) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + } + errout: - if (err < 0 && modified) - net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", - dev->name); + if (status & DO_SETLINK_MODIFIED) { + if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY) + netif_state_change(dev); + + if (err < 0) + net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", + dev->name); + } + + netdev_unlock_ops(dev); return err; } -static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) +static struct net_device *rtnl_dev_get(struct net *net, + struct nlattr *tb[]) { + char ifname[ALTIFNAMSIZ]; + + if (tb[IFLA_IFNAME]) + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + else if (tb[IFLA_ALT_IFNAME]) + nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ); + else + return NULL; + + return __dev_get_by_name(net, ifname); +} + +static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); - struct ifinfomsg *ifm; - struct net_device *dev; - int err; struct nlattr *tb[IFLA_MAX+1]; - char ifname[IFNAMSIZ]; + struct net_device *dev = NULL; + struct rtnl_nets rtnl_nets; + struct net *tgt_net; + int err; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); if (err < 0) goto errout; - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - else - ifname[0] = '\0'; + err = rtnl_ensure_unique_netns(tb, extack, false); + if (err < 0) + goto errout; + + tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(tgt_net)) { + err = PTR_ERR(tgt_net); + goto errout; + } + + rtnl_nets_init(&rtnl_nets); + rtnl_nets_add(&rtnl_nets, get_net(net)); + rtnl_nets_add(&rtnl_nets, tgt_net); + + rtnl_nets_lock(&rtnl_nets); - err = -EINVAL; - ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb); else - goto errout; + err = -EINVAL; - if (dev == NULL) { + if (dev) + err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); + else if (!err) err = -ENODEV; - goto errout; - } - err = validate_linkmsg(dev, tb); - if (err < 0) - goto errout; - - err = do_setlink(dev, ifm, tb, ifname, 0); + rtnl_nets_unlock(&rtnl_nets); + rtnl_nets_destroy(&rtnl_nets); errout: return err; } -static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_group_dellink(const struct net *net, int group) +{ + struct net_device *dev, *aux; + LIST_HEAD(list_kill); + bool found = false; + + if (!group) + return -EPERM; + + for_each_netdev(net, dev) { + if (dev->group == group) { + const struct rtnl_link_ops *ops; + + found = true; + ops = dev->rtnl_link_ops; + if (!ops || !ops->dellink) + return -EOPNOTSUPP; + } + } + + if (!found) + return -ENODEV; + + for_each_netdev_safe(net, dev, aux) { + if (dev->group == group) { + const struct rtnl_link_ops *ops; + + ops = dev->rtnl_link_ops; + ops->dellink(dev, &list_kill); + } + } + unregister_netdevice_many(&list_kill); + + return 0; +} + +int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh) { - struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - struct net_device *dev; - struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; + LIST_HEAD(list_kill); + + ops = dev->rtnl_link_ops; + if (!ops || !ops->dellink) + return -EOPNOTSUPP; + + ops->dellink(dev, &list_kill); + unregister_netdevice_many_notify(&list_kill, portid, nlh); + + return 0; +} +EXPORT_SYMBOL_GPL(rtnl_delete_link); + +static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ifinfomsg *ifm = nlmsg_data(nlh); + struct net *net = sock_net(skb->sk); + u32 portid = NETLINK_CB(skb).portid; struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev = NULL; + struct net *tgt_net = net; + int netnsid = -1; int err; - LIST_HEAD(list_kill); - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); if (err < 0) return err; - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err < 0) + return err; + + if (tb[IFLA_TARGET_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); + tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } + + rtnl_net_lock(tgt_net); - ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = __dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + dev = __dev_get_by_index(tgt_net, ifm->ifi_index); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(tgt_net, tb); + + if (dev) + err = rtnl_delete_link(dev, portid, nlh); + else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + err = -ENODEV; + else if (tb[IFLA_GROUP]) + err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else - return -EINVAL; + err = -EINVAL; - if (!dev) - return -ENODEV; + rtnl_net_unlock(tgt_net); - ops = dev->rtnl_link_ops; - if (!ops) - return -EOPNOTSUPP; + if (netnsid >= 0) + put_net(tgt_net); - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - list_del(&list_kill); - return 0; + return err; } -int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, + u32 portid, const struct nlmsghdr *nlh) { - unsigned int old_flags; + unsigned int old_flags, changed; int err; old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { - err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + NULL); if (err < 0) return err; } - dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + changed = old_flags ^ dev->flags; + if (dev->rtnl_link_initializing) { + dev->rtnl_link_initializing = false; + changed = ~0U; + } - __dev_notify_flags(dev, old_flags); + __dev_notify_flags(dev, old_flags, changed, portid, nlh); return 0; } EXPORT_SYMBOL(rtnl_configure_link); -struct net_device *rtnl_create_link(struct net *net, - char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) +struct net_device *rtnl_create_link(struct net *net, const char *ifname, + unsigned char name_assign_type, + const struct rtnl_link_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) { - int err; struct net_device *dev; unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; + int err; if (tb[IFLA_NUM_TX_QUEUES]) num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); @@ -1649,21 +3629,53 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); - err = -ENOMEM; - dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup, - num_tx_queues, num_rx_queues); + if (num_tx_queues < 1 || num_tx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); + return ERR_PTR(-EINVAL); + } + + if (num_rx_queues < 1 || num_rx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); + return ERR_PTR(-EINVAL); + } + + if (ops->alloc) { + dev = ops->alloc(tb, ifname, name_assign_type, + num_tx_queues, num_rx_queues); + if (IS_ERR(dev)) + return dev; + } else { + dev = alloc_netdev_mqs(ops->priv_size, ifname, + name_assign_type, ops->setup, + num_tx_queues, num_rx_queues); + } + if (!dev) - goto err; + return ERR_PTR(-ENOMEM); + + err = validate_linkmsg(dev, tb, extack); + if (err < 0) { + free_netdev(dev); + return ERR_PTR(err); + } dev_net_set(dev, net); dev->rtnl_link_ops = ops; - dev->rtnl_link_state = RTNL_LINK_INITIALIZING; + dev->rtnl_link_initializing = true; - if (tb[IFLA_MTU]) - dev->mtu = nla_get_u32(tb[IFLA_MTU]); + if (tb[IFLA_MTU]) { + u32 mtu = nla_get_u32(tb[IFLA_MTU]); + + err = dev_validate_mtu(dev, mtu, extack); + if (err) { + free_netdev(dev); + return ERR_PTR(err); + } + dev->mtu = mtu; + } if (tb[IFLA_ADDRESS]) { - memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), - nla_len(tb[IFLA_ADDRESS])); + __dev_addr_set(dev, nla_data(tb[IFLA_ADDRESS]), + nla_len(tb[IFLA_ADDRESS])); dev->addr_assign_type = NET_ADDR_SET; } if (tb[IFLA_BROADCAST]) @@ -1676,25 +3688,106 @@ struct net_device *rtnl_create_link(struct net *net, if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) - dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + if (tb[IFLA_GSO_MAX_SIZE]) + netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); + if (tb[IFLA_GSO_MAX_SEGS]) + netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); + if (tb[IFLA_GRO_MAX_SIZE]) + netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); + if (tb[IFLA_GSO_IPV4_MAX_SIZE]) + netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE])); + if (tb[IFLA_GRO_IPV4_MAX_SIZE]) + netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE])); return dev; - -err: - return ERR_PTR(err); } EXPORT_SYMBOL(rtnl_create_link); -static int rtnl_group_changelink(struct net *net, int group, - struct ifinfomsg *ifm, - struct nlattr **tb) +struct rtnl_newlink_tbs { + struct nlattr *tb[IFLA_MAX + 1]; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + struct nlattr *attr[RTNL_MAX_TYPE + 1]; + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; +}; + +static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, + const struct rtnl_link_ops *ops, + struct net_device *dev, struct net *tgt_net, + struct rtnl_newlink_tbs *tbs, + struct nlattr **data, + struct netlink_ext_ack *extack) { - struct net_device *dev; + struct nlattr ** const linkinfo = tbs->linkinfo; + struct nlattr ** const tb = tbs->tb; + int status = 0; int err; - for_each_netdev(net, dev) { + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) + return -EOPNOTSUPP; + + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + + status |= DO_SETLINK_NOTIFY; + } + + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + const struct rtnl_link_ops *m_ops = NULL; + struct nlattr **slave_data = NULL; + struct net_device *master_dev; + + master_dev = netdev_master_upper_dev_get(dev); + if (master_dev) + m_ops = master_dev->rtnl_link_ops; + + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; + + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; + + if (m_ops->slave_maxtype) { + err = nla_parse_nested_deprecated(tbs->slave_attr, + m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, extack); + if (err < 0) + return err; + + slave_data = tbs->slave_attr; + } + + err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack); + if (err < 0) + return err; + + status |= DO_SETLINK_NOTIFY; + } + + return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status); +} + +static int rtnl_group_changelink(const struct sk_buff *skb, + struct net *net, struct net *tgt_net, + int group, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, + struct nlattr **tb) +{ + struct net_device *dev, *aux; + int err; + + for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { - err = do_setlink(dev, ifm, tb, NULL, 0); + err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); if (err < 0) return err; } @@ -1703,221 +3796,538 @@ static int rtnl_group_changelink(struct net *net, int group, return 0; } -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, + const struct rtnl_link_ops *ops, + struct net *tgt_net, struct net *link_net, + struct net *peer_net, + const struct nlmsghdr *nlh, + struct nlattr **tb, struct nlattr **data, + struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); - const struct rtnl_link_ops *ops; + unsigned char name_assign_type = NET_NAME_USER; + struct rtnl_newlink_params params = { + .src_net = sock_net(skb->sk), + .link_net = link_net, + .peer_net = peer_net, + .tb = tb, + .data = data, + }; + u32 portid = NETLINK_CB(skb).portid; struct net_device *dev; - struct ifinfomsg *ifm; - char kind[MODULE_NAME_LEN]; char ifname[IFNAMSIZ]; - struct nlattr *tb[IFLA_MAX+1]; - struct nlattr *linkinfo[IFLA_INFO_MAX+1]; int err; -#ifdef CONFIG_MODULES -replay: -#endif - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); - if (err < 0) - return err; + if (!ops->alloc && !ops->setup) + return -EOPNOTSUPP; - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + if (tb[IFLA_IFNAME]) { + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + } else { + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } + + dev = rtnl_create_link(tgt_net, ifname, name_assign_type, ops, tb, + extack); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out; + } + + dev->ifindex = ifm->ifi_index; + + if (ops->newlink) + err = ops->newlink(dev, ¶ms, extack); else - ifname[0] = '\0'; + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + goto out; + } - ifm = nlmsg_data(nlh); - if (ifm->ifi_index > 0) - dev = __dev_get_by_index(net, ifm->ifi_index); - else { - if (ifname[0]) - dev = __dev_get_by_name(net, ifname); - else - dev = NULL; + netdev_lock_ops(dev); + + err = rtnl_configure_link(dev, ifm, portid, nlh); + if (err < 0) + goto out_unregister; + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto out_unregister; } - err = validate_linkmsg(dev, tb); + netdev_unlock_ops(dev); +out: + return err; +out_unregister: + netdev_unlock_ops(dev); + if (ops->newlink) { + LIST_HEAD(list_kill); + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); + } + goto out; +} + +static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, + struct nlattr *tbp[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MAX + 1]; + int err; + + if (!data || !data[ops->peer_type]) + return rtnl_link_get_net_ifla(tbp); + + err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); if (err < 0) - return err; + return ERR_PTR(err); - if (tb[IFLA_LINKINFO]) { - err = nla_parse_nested(linkinfo, IFLA_INFO_MAX, - tb[IFLA_LINKINFO], ifla_info_policy); + if (ops->validate) { + err = ops->validate(tb, NULL, extack); if (err < 0) - return err; - } else - memset(linkinfo, 0, sizeof(linkinfo)); + return ERR_PTR(err); + } - if (linkinfo[IFLA_INFO_KIND]) { - nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); + return rtnl_link_get_net_ifla(tb); +} + +static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + const struct rtnl_link_ops *ops, + struct net *tgt_net, struct net *link_net, + struct net *peer_net, + struct rtnl_newlink_tbs *tbs, + struct nlattr **data, + struct netlink_ext_ack *extack) +{ + struct nlattr ** const tb = tbs->tb; + struct net *net = sock_net(skb->sk); + struct net *device_net; + struct net_device *dev; + struct ifinfomsg *ifm; + bool link_specified; + + /* When creating, lookup for existing device in target net namespace */ + device_net = (nlh->nlmsg_flags & NLM_F_CREATE) && + (nlh->nlmsg_flags & NLM_F_EXCL) ? + tgt_net : net; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) { + link_specified = true; + dev = __dev_get_by_index(device_net, ifm->ifi_index); + } else if (ifm->ifi_index < 0) { + NL_SET_ERR_MSG(extack, "ifindex can't be negative"); + return -EINVAL; + } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { + link_specified = true; + dev = rtnl_dev_get(device_net, tb); } else { - kind[0] = '\0'; - ops = NULL; - } - - if (1) { - struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; - struct net *dest_net; - - if (ops) { - if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested(attr, ops->maxtype, - linkinfo[IFLA_INFO_DATA], - ops->policy); - if (err < 0) - return err; - data = attr; - } - if (ops->validate) { - err = ops->validate(tb, data); - if (err < 0) - return err; - } - } + link_specified = false; + dev = NULL; + } - if (dev) { - int modified = 0; + if (dev) + return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack); - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, + * or it's for a group + */ + if (link_specified || !tb[IFLA_GROUP]) + return -ENODEV; - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; + return rtnl_group_changelink(skb, net, tgt_net, + nla_get_u32(tb[IFLA_GROUP]), + ifm, extack, tb); + } - err = ops->changelink(dev, tb, data); - if (err < 0) - return err; - modified = 1; - } + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; + + if (!ops) { + NL_SET_ERR_MSG(extack, "Unknown device type"); + return -EOPNOTSUPP; + } + + return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh, + tb, data, extack); +} - return do_setlink(dev, ifm, tb, ifname, modified); +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *tgt_net, *link_net = NULL, *peer_net = NULL; + struct nlattr **tb, **linkinfo, **data = NULL; + struct rtnl_link_ops *ops = NULL; + struct rtnl_newlink_tbs *tbs; + struct rtnl_nets rtnl_nets; + int ops_srcu_index; + int ret; + + tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); + if (!tbs) + return -ENOMEM; + + tb = tbs->tb; + ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, extack); + if (ret < 0) + goto free; + + ret = rtnl_ensure_unique_netns(tb, extack, false); + if (ret < 0) + goto free; + + linkinfo = tbs->linkinfo; + if (tb[IFLA_LINKINFO]) { + ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, + tb[IFLA_LINKINFO], + ifla_info_policy, NULL); + if (ret < 0) + goto free; + } else { + memset(linkinfo, 0, sizeof(tbs->linkinfo)); + } + + if (linkinfo[IFLA_INFO_KIND]) { + char kind[MODULE_NAME_LEN]; + + nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); + ops = rtnl_link_ops_get(kind, &ops_srcu_index); +#ifdef CONFIG_MODULES + if (!ops) { + request_module("rtnl-link-%s", kind); + ops = rtnl_link_ops_get(kind, &ops_srcu_index); } +#endif + } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) - return rtnl_group_changelink(net, - nla_get_u32(tb[IFLA_GROUP]), - ifm, tb); - return -ENODEV; + rtnl_nets_init(&rtnl_nets); + + if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) { + ret = -EINVAL; + goto put_ops; } - if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO]) - return -EOPNOTSUPP; + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy, extack); + if (ret < 0) + goto put_ops; - if (!ops) { -#ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; + data = tbs->attr; + } + + if (ops->validate) { + ret = ops->validate(tb, data, extack); + if (ret < 0) + goto put_ops; + } + + if (ops->peer_type) { + peer_net = rtnl_get_peer_net(ops, tb, data, extack); + if (IS_ERR(peer_net)) { + ret = PTR_ERR(peer_net); + goto put_ops; } -#endif - return -EOPNOTSUPP; + if (peer_net) + rtnl_nets_add(&rtnl_nets, peer_net); } + } - if (!ifname[0]) - snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN); + if (IS_ERR(tgt_net)) { + ret = PTR_ERR(tgt_net); + goto put_net; + } - dest_net = rtnl_link_get_net(net, tb); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); + rtnl_nets_add(&rtnl_nets, tgt_net); - dev = rtnl_create_link(dest_net, ifname, ops, tb); - if (IS_ERR(dev)) { - err = PTR_ERR(dev); - goto out; + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + + link_net = get_net_ns_by_id(tgt_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + ret = -EINVAL; + goto put_net; } - dev->ifindex = ifm->ifi_index; + rtnl_nets_add(&rtnl_nets, link_net); - if (ops->newlink) - err = ops->newlink(net, dev, tb, data); - else - err = register_netdevice(dev); + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + goto put_net; + } + } - if (err < 0 && !IS_ERR(dev)) - free_netdev(dev); - if (err < 0) - goto out; + rtnl_nets_lock(&rtnl_nets); + ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack); + rtnl_nets_unlock(&rtnl_nets); + +put_net: + rtnl_nets_destroy(&rtnl_nets); +put_ops: + if (ops) + rtnl_link_ops_put(ops, ops_srcu_index); +free: + kfree(tbs); + return ret; +} - err = rtnl_configure_link(dev, ifm); - if (err < 0) - unregister_netdevice(dev); -out: - put_net(dest_net); +static int rtnl_valid_getlink_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct ifinfomsg *ifm; + int i, err; + + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { + NL_SET_ERR_MSG(extack, "Invalid header for get link"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); + + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change) { + NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); + return -EINVAL; + } + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); + if (err) return err; + + for (i = 0; i <= IFLA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case IFLA_IFNAME: + case IFLA_ALT_IFNAME: + case IFLA_EXT_MASK: + case IFLA_TARGET_NETNSID: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request"); + return -EINVAL; + } } + + return 0; } -static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh) +static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct net *tgt_net = net; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; + int netnsid = -1; int err; u32 ext_filter_mask = 0; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + err = rtnl_valid_getlink_req(skb, nlh, tb, extack); if (err < 0) return err; - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err < 0) + return err; + + if (tb[IFLA_TARGET_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); + tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = __dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + dev = __dev_get_by_index(tgt_net, ifm->ifi_index); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(tgt_net, tb); else - return -EINVAL; + goto out; + err = -ENODEV; if (dev == NULL) - return -ENODEV; + goto out; - nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); + err = -ENOBUFS; + nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask)); if (nskb == NULL) - return -ENOBUFS; + goto out; - err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask); + /* Synchronize the carrier state so we don't report a state + * that we're not actually going to honour immediately; if + * the driver just did a carrier off->on transition, we can + * only TX if link watch work has run, but without this we'd + * already report carrier on, even if it doesn't work yet. + */ + linkwatch_sync_dev(dev); + + err = rtnl_fill_ifinfo(nskb, dev, net, + RTM_NEWLINK, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, 0, 0, ext_filter_mask, + 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); +out: + if (netnsid >= 0) + put_net(tgt_net); return err; } -static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, + bool *changed, struct netlink_ext_ack *extack) +{ + char *alt_ifname; + size_t size; + int err; + + err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + if (cmd == RTM_NEWLINKPROP) { + size = rtnl_prop_list_size(dev); + size += nla_total_size(ALTIFNAMSIZ); + if (size >= U16_MAX) { + NL_SET_ERR_MSG(extack, + "effective property list too long"); + return -EINVAL; + } + } + + alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT); + if (!alt_ifname) + return -ENOMEM; + + if (cmd == RTM_NEWLINKPROP) { + err = netdev_name_node_alt_create(dev, alt_ifname); + if (!err) + alt_ifname = NULL; + } else if (cmd == RTM_DELLINKPROP) { + err = netdev_name_node_alt_destroy(dev, alt_ifname); + } else { + WARN_ON_ONCE(1); + err = -EINVAL; + } + + kfree(alt_ifname); + if (!err) + *changed = true; + return err; +} + +static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; - struct nlattr *tb[IFLA_MAX+1]; + struct ifinfomsg *ifm; + bool changed = false; + struct nlattr *attr; + int err, rem; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err) + return err; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (!tb[IFLA_PROP_LIST]) + return 0; + + nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { + switch (nla_type(attr)) { + case IFLA_ALT_IFNAME: + err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); + if (err) + return err; + break; + } + } + + if (changed) + netdev_state_change(dev); + return 0; +} + +static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); +} + +static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); +} + +static noinline_for_stack u32 rtnl_calcit(struct sk_buff *skb, + struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + size_t min_ifinfo_dump_size = 0; u32 ext_filter_mask = 0; - u16 min_ifinfo_dump_size = 0; + struct net_device *dev; + struct nlattr *nla; + int hdrlen, rem; + + /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ + hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? + sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return NLMSG_GOODSIZE; - if (nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, - ifla_policy) >= 0) { - if (tb[IFLA_EXT_MASK]) - ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + nla_for_each_attr_type(nla, IFLA_EXT_MASK, + nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), rem) { + if (nla_len(nla) == sizeof(u32)) + ext_filter_mask = nla_get_u32(nla); } if (!ext_filter_mask) @@ -1926,77 +4336,146 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) * traverse the list of net devices and compute the minimum * buffer size based upon the filter mask. */ - list_for_each_entry(dev, &net->dev_base_head, dev_list) { - min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size, - if_nlmsg_size(dev, - ext_filter_mask)); + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + min_ifinfo_dump_size = max(min_ifinfo_dump_size, + if_nlmsg_size(dev, ext_filter_mask)); } + rcu_read_unlock(); - return min_ifinfo_dump_size; + return nlmsg_total_size(min_ifinfo_dump_size); } static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) { int idx; int s_idx = cb->family; + int type = cb->nlh->nlmsg_type - RTM_BASE; + int ret = 0; if (s_idx == 0) s_idx = 1; + for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { - int type = cb->nlh->nlmsg_type-RTM_BASE; + struct rtnl_link __rcu **tab; + struct rtnl_link *link; + rtnl_dumpit_func dumpit; + if (idx < s_idx || idx == PF_PACKET) continue; - if (rtnl_msg_handlers[idx] == NULL || - rtnl_msg_handlers[idx][type].dumpit == NULL) + + if (type < 0 || type >= RTM_NR_MSGTYPES) continue; + + tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); + if (!tab) + continue; + + link = rcu_dereference_rtnl(tab[type]); + if (!link) + continue; + + dumpit = link->dumpit; + if (!dumpit) + continue; + if (idx > s_idx) { memset(&cb->args[0], 0, sizeof(cb->args)); cb->prev_seq = 0; cb->seq = 0; } - if (rtnl_msg_handlers[idx][type].dumpit(skb, cb)) + ret = dumpit(skb, cb); + if (ret) break; } cb->family = idx; - return skb->len; + return skb->len ? : ret; } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change) +struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, + unsigned int change, + u32 event, gfp_t flags, int *new_nsid, + int new_ifindex, u32 portid, + const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; - size_t if_info_size; + u32 seq = 0; - skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); + skb = nlmsg_new(if_nlmsg_size(dev, 0), flags); if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); + if (nlmsg_report(nlh)) + seq = nlmsg_seq(nlh); + else + portid = 0; + + err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), + type, portid, seq, change, 0, 0, event, + new_nsid, new_ifindex, -1, flags); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); - return; + return skb; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_LINK, err); + rtnl_set_sk_err(net, RTNLGRP_LINK, err); + return NULL; +} + +void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, + u32 portid, const struct nlmsghdr *nlh) +{ + struct net *net = dev_net(dev); + + rtnl_notify(skb, net, portid, RTNLGRP_LINK, nlh, flags); +} + +static void rtmsg_ifinfo_event(int type, struct net_device *dev, + unsigned int change, u32 event, + gfp_t flags, int *new_nsid, int new_ifindex, + u32 portid, const struct nlmsghdr *nlh) +{ + struct sk_buff *skb; + + if (dev->reg_state != NETREG_REGISTERED) + return; + + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, + new_ifindex, portid, nlh); + if (skb) + rtmsg_ifinfo_send(skb, dev, flags, portid, nlh); +} + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags, u32 portid, const struct nlmsghdr *nlh) +{ + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + NULL, 0, portid, nlh); +} + +void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, + gfp_t flags, int *new_nsid, int new_ifindex) +{ + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + new_nsid, new_ifindex, 0, NULL); } -EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, - u8 *addr, u32 pid, u32 seq, - int type, unsigned int flags) + u8 *addr, u16 vid, u32 pid, u32 seq, + int type, unsigned int flags, + int nlflags, u16 ndm_state) { struct nlmsghdr *nlh; struct ndmsg *ndm; - nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), NLM_F_MULTI); + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags); if (!nlh) return -EMSGSIZE; @@ -2007,34 +4486,43 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb, ndm->ndm_flags = flags; ndm->ndm_type = 0; ndm->ndm_ifindex = dev->ifindex; - ndm->ndm_state = NUD_PERMANENT; + ndm->ndm_state = ndm_state; - if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) + if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr)) goto nla_put_failure; + if (vid) + if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) + goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } -static inline size_t rtnl_fdb_nlmsg_size(void) +static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev) { - return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN); + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(dev->addr_len) + /* NDA_LLADDR */ + nla_total_size(sizeof(u16)) + /* NDA_VLAN */ + 0; } -static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type) +static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, + u16 ndm_state) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC); + skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC); if (!skb) goto errout; - err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF); + err = nlmsg_populate_fdb_fill(skb, dev, addr, vid, + 0, 0, type, NTF_SELF, 0, ndm_state); if (err < 0) { kfree_skb(skb); goto errout; @@ -2046,13 +4534,13 @@ errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } -/** +/* * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry */ int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, + const unsigned char *addr, u16 vid, u16 flags) { int err = -EINVAL; @@ -2061,7 +4549,17 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, * implement its own handler for this. */ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { - pr_info("%s: FDB only supports static addresses\n", dev->name); + netdev_info(dev, "default FDB implementation only supports local addresses\n"); + return err; + } + + if (tb[NDA_FLAGS_EXT]) { + netdev_info(dev, "invalid flags given to default FDB implementation\n"); + return err; + } + + if (vid) { + netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); return err; } @@ -2078,47 +4576,83 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, } EXPORT_SYMBOL(ndo_dflt_fdb_add); -static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) +static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid, + struct netlink_ext_ack *extack) +{ + u16 vid = 0; + + if (vlan_attr) { + if (nla_len(vlan_attr) != sizeof(u16)) { + NL_SET_ERR_MSG(extack, "invalid vlan attribute size"); + return -EINVAL; + } + + vid = nla_get_u16(vlan_attr); + + if (!vid || vid >= VLAN_VID_MASK) { + NL_SET_ERR_MSG(extack, "invalid vlan id"); + return -EINVAL; + } + } + *p_vid = vid; + return 0; +} + +static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; u8 *addr; + u16 vid; int err; - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, + extack); if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ifindex\n"); + NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid address\n"); + NL_SET_ERR_MSG(extack, "invalid address"); + return -EINVAL; + } + + if (dev->type != ARPHRD_ETHER) { + NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); + err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); + if (err) + return err; + err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && - (dev->priv_flags & IFF_BRIDGE_PORT)) { + netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; + bool notified = false; - err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags); + err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, + nlh->nlmsg_flags, ¬ified, extack); if (err) goto out; else @@ -2127,15 +4661,20 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF)) { + bool notified = false; + if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, - nlh->nlmsg_flags); + vid, + nlh->nlmsg_flags, + ¬ified, extack); else - err = ndo_dflt_fdb_add(ndm, tb, dev, addr, + err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); - if (!err) { - rtnl_fdb_notify(dev, addr, RTM_NEWNEIGH); + if (!err && !notified) { + rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, + ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } @@ -2143,80 +4682,104 @@ out: return err; } -/** +/* * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry */ int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr) + const unsigned char *addr, u16 vid) { - int err = -EOPNOTSUPP; + int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ - if (ndm->ndm_state & NUD_PERMANENT) { - pr_info("%s: FDB only supports static addresses\n", dev->name); - return -EINVAL; + if (!(ndm->ndm_state & NUD_PERMANENT)) { + netdev_info(dev, "default FDB implementation only supports local addresses\n"); + return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); - else - err = -EINVAL; return err; } EXPORT_SYMBOL(ndo_dflt_fdb_del); -static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { + bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); struct net *net = sock_net(skb->sk); + const struct net_device_ops *ops; struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; - int err = -EINVAL; - __u8 *addr; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; + __u8 *addr = NULL; + int err; + u16 vid; - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + if (!del_bulk) { + err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, + NULL, extack); + } else { + /* For bulk delete, the drivers will parse the message with + * policy. + */ + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); + } if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ifindex\n"); + NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } - if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n"); - return -EINVAL; + if (!del_bulk) { + if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { + NL_SET_ERR_MSG(extack, "invalid address"); + return -EINVAL; + } + addr = nla_data(tb[NDA_LLADDR]); + + err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); + if (err) + return err; } - addr = nla_data(tb[NDA_LLADDR]); + if (dev->type != ARPHRD_ETHER) { + NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices"); + return -EINVAL; + } err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && - (dev->priv_flags & IFF_BRIDGE_PORT)) { + netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); - const struct net_device_ops *ops = br_dev->netdev_ops; + bool notified = false; - if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr); + ops = br_dev->netdev_ops; + if (!del_bulk) { + if (ops->ndo_fdb_del) + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, + ¬ified, extack); + } else { + if (ops->ndo_fdb_del_bulk) + err = ops->ndo_fdb_del_bulk(nlh, dev, extack); + } if (err) goto out; @@ -2226,13 +4789,26 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { - if (dev->netdev_ops->ndo_fdb_del) - err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr); - else - err = ndo_dflt_fdb_del(ndm, tb, dev, addr); + bool notified = false; + + ops = dev->netdev_ops; + if (!del_bulk) { + if (ops->ndo_fdb_del) + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, + ¬ified, extack); + else + err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); + } else { + /* in case err was cleared by NTF_MASTER call */ + err = -EOPNOTSUPP; + if (ops->ndo_fdb_del_bulk) + err = ops->ndo_fdb_del_bulk(nlh, dev, extack); + } if (!err) { - rtnl_fdb_notify(dev, addr, RTM_DELNEIGH); + if (!del_bulk && !notified) + rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, + ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } @@ -2246,20 +4822,22 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, int *idx, struct netdev_hw_addr_list *list) { + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct netdev_hw_addr *ha; - int err; u32 portid, seq; + int err; portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { - if (*idx < cb->args[0]) + if (*idx < ctx->fdb_idx) goto skip; - err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, + err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, portid, seq, - RTM_NEWNEIGH, NTF_SELF); + RTM_NEWNEIGH, NTF_SELF, + NLM_F_MULTI, NUD_PERMANENT); if (err < 0) return err; skip: @@ -2270,8 +4848,11 @@ skip: /** * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table. - * @nlh: netlink message header + * @skb: socket buffer to store message in + * @cb: netlink callback * @dev: netdevice + * @filter_dev: ignored + * @idx: the number of FDB table entries dumped is added to *@idx * * Default netdevice operation to dump the existing unicast address list. * Returns number of addresses from list put in skb. @@ -2279,60 +4860,377 @@ skip: int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, - int idx) + struct net_device *filter_dev, + int *idx) { int err; + if (dev->type != ARPHRD_ETHER) + return -EINVAL; + netif_addr_lock_bh(dev); - err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc); + err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) goto out; - nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc); + err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); out: netif_addr_unlock_bh(dev); - return idx; + return err; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); +static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, + int *br_idx, int *brport_idx, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + struct ndmsg *ndm; + int err, i; + + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { + NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); + return -EINVAL; + } + + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_flags || ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); + return -EINVAL; + } + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, + NDA_MAX, NULL, extack); + if (err < 0) + return err; + + *brport_idx = ndm->ndm_ifindex; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_IFINDEX: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request"); + return -EINVAL; + } + *brport_idx = nla_get_u32(tb[NDA_IFINDEX]); + break; + case NDA_MASTER: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request"); + return -EINVAL; + } + *br_idx = nla_get_u32(tb[NDA_MASTER]); + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request"); + return -EINVAL; + } + } + + return 0; +} + +static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, + int *br_idx, int *brport_idx, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MAX+1]; + int err; + + /* A hack to preserve kernel<->userspace interface. + * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. + * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails. + * So, check for ndmsg with an optional u32 attribute (not used here). + * Fortunately these sizes don't conflict with the size of ifinfomsg + * with an optional attribute. + */ + if (nlmsg_len(nlh) != sizeof(struct ndmsg) && + (nlmsg_len(nlh) != sizeof(struct ndmsg) + + nla_attr_size(sizeof(u32)))) { + struct ifinfomsg *ifm; + + err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), + tb, IFLA_MAX, ifla_policy, + extack); + if (err < 0) { + return -EINVAL; + } else if (err == 0) { + if (tb[IFLA_MASTER]) + *br_idx = nla_get_u32(tb[IFLA_MASTER]); + } + + ifm = nlmsg_data(nlh); + *brport_idx = ifm->ifi_index; + } + return 0; +} + static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { - int idx = 0; + const struct net_device_ops *ops = NULL, *cops = NULL; + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; + struct net_device *dev, *br_dev = NULL; struct net *net = sock_net(skb->sk); - struct net_device *dev; + int brport_idx = 0; + int br_idx = 0; + int fidx = 0; + int err; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - if (dev->priv_flags & IFF_BRIDGE_PORT) { - struct net_device *br_dev; - const struct net_device_ops *ops; + NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context); - br_dev = netdev_master_upper_dev_get(dev); - ops = br_dev->netdev_ops; - if (ops->ndo_fdb_dump) - idx = ops->ndo_fdb_dump(skb, cb, dev, idx); + if (cb->strict_check) + err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, + cb->extack); + else + err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx, + cb->extack); + if (err < 0) + return err; + + if (br_idx) { + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) + return -ENODEV; + + ops = br_dev->netdev_ops; + } + + for_each_netdev_dump(net, dev, ctx->ifindex) { + if (brport_idx && (dev->ifindex != brport_idx)) + continue; + + if (!br_idx) { /* user did not specify a specific bridge */ + if (netif_is_bridge_port(dev)) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; + } + } else { + if (dev != br_dev && + !netif_is_bridge_port(dev)) + continue; + + if (br_dev != netdev_master_upper_dev_get(dev) && + !netif_is_bridge_master(dev)) + continue; + cops = ops; + } + + if (netif_is_bridge_port(dev)) { + if (cops && cops->ndo_fdb_dump) { + err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, + &fidx); + if (err == -EMSGSIZE) + break; + } } if (dev->netdev_ops->ndo_fdb_dump) - idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx); + err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, + &fidx); else - idx = ndo_dflt_fdb_dump(skb, cb, dev, idx); + err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx); + if (err == -EMSGSIZE) + break; + + cops = NULL; + + /* reset fdb offset to 0 for rest of the interfaces */ + ctx->fdb_idx = 0; + fidx = 0; } - rcu_read_unlock(); - cb->args[0] = idx; + ctx->fdb_idx = fidx; + return skb->len; } +static int valid_fdb_get_strict(const struct nlmsghdr *nlh, + struct nlattr **tb, u8 *ndm_flags, + int *br_idx, int *brport_idx, u8 **addr, + u16 *vid, struct netlink_ext_ack *extack) +{ + struct ndmsg *ndm; + int err, i; + + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { + NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); + return -EINVAL; + } + + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); + return -EINVAL; + } + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, + NDA_MAX, nda_policy, extack); + if (err < 0) + return err; + + *ndm_flags = ndm->ndm_flags; + *brport_idx = ndm->ndm_ifindex; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_MASTER: + *br_idx = nla_get_u32(tb[i]); + break; + case NDA_LLADDR: + if (nla_len(tb[i]) != ETH_ALEN) { + NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); + return -EINVAL; + } + *addr = nla_data(tb[i]); + break; + case NDA_VLAN: + err = fdb_vid_parse(tb[i], vid, extack); + if (err) + return err; + break; + case NDA_VNI: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request"); + return -EINVAL; + } + } + + return 0; +} + +static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = NULL, *br_dev = NULL; + const struct net_device_ops *ops = NULL; + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[NDA_MAX + 1]; + struct sk_buff *skb; + int brport_idx = 0; + u8 ndm_flags = 0; + int br_idx = 0; + u8 *addr = NULL; + u16 vid = 0; + int err; + + err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, + &brport_idx, &addr, &vid, extack); + if (err < 0) + return err; + + if (!addr) { + NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request"); + return -EINVAL; + } + + if (brport_idx) { + dev = __dev_get_by_index(net, brport_idx); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + return -ENODEV; + } + } + + if (br_idx) { + if (dev) { + NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); + return -EINVAL; + } + + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Invalid master ifindex"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } + + if (dev) { + if (!ndm_flags || (ndm_flags & NTF_MASTER)) { + if (!netif_is_bridge_port(dev)) { + NL_SET_ERR_MSG(extack, "Device is not a bridge port"); + return -EINVAL; + } + br_dev = netdev_master_upper_dev_get(dev); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Master of device not found"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } else { + if (!(ndm_flags & NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); + return -EINVAL; + } + ops = dev->netdev_ops; + } + } + + if (!br_dev && !dev) { + NL_SET_ERR_MSG(extack, "No device specified"); + return -ENODEV; + } + + if (!ops || !ops->ndo_fdb_get) { + NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); + return -EOPNOTSUPP; + } + + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (br_dev) + dev = br_dev; + err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, extack); + if (err) + goto out; + + return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +out: + kfree_skb(skb); + return err; +} + +static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, + unsigned int attrnum, unsigned int flag) +{ + if (mask & flag) + return nla_put_u8(skb, attrnum, !!(flags & flag)); + return 0; +} + int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, - struct net_device *dev, u16 mode) + struct net_device *dev, u16 mode, + u32 flags, u32 mask, int nlflags, + u32 filter_mask, + int (*vlan_fill)(struct sk_buff *skb, + struct net_device *dev, + u32 filter_mask)) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; struct nlattr *br_afspec; + struct nlattr *protinfo; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); + int err = 0; - nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI); + nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); if (nlh == NULL) return -EMSGSIZE; @@ -2341,7 +5239,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; - ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_flags = netif_get_flags(dev); ifm->ifi_change = 0; @@ -2352,42 +5250,142 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || - (dev->ifindex != dev->iflink && - nla_put_u32(skb, IFLA_LINK, dev->iflink))) + (dev->ifindex != dev_get_iflink(dev) && + nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; - br_afspec = nla_nest_start(skb, IFLA_AF_SPEC); + br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!br_afspec) goto nla_put_failure; - if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) || - nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { + if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } + + if (mode != BRIDGE_MODE_UNDEF) { + if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + } + if (vlan_fill) { + err = vlan_fill(skb, dev, filter_mask); + if (err) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + } nla_nest_end(skb, br_afspec); - return nlmsg_end(skb, nlh); + protinfo = nla_nest_start(skb, IFLA_PROTINFO); + if (!protinfo) + goto nla_put_failure; + + if (brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_GUARD, BR_BPDU_GUARD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_FAST_LEAVE, + BR_MULTICAST_FAST_LEAVE) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_LEARNING, BR_LEARNING) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_PROXYARP, BR_PROXYARP) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) { + nla_nest_cancel(skb, protinfo); + goto nla_put_failure; + } + + nla_nest_end(skb, protinfo); + + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); - return -EMSGSIZE; + return err ? err : -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); + +static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, + bool strict_check, u32 *filter_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MAX+1]; + int err, i; + + if (strict_check) { + struct ifinfomsg *ifm; + + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { + NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); + return -EINVAL; + } + + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change || ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); + return -EINVAL; + } + + err = nlmsg_parse_deprecated_strict(nlh, + sizeof(struct ifinfomsg), + tb, IFLA_MAX, ifla_policy, + extack); + } else { + err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), + tb, IFLA_MAX, ifla_policy, + extack); + } + if (err < 0) + return err; + + /* new attributes should only be added with strict checking */ + for (i = 0; i <= IFLA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case IFLA_EXT_MASK: + *filter_mask = nla_get_u32(tb[i]); + break; + default: + if (strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request"); + return -EINVAL; + } + } + } + + return 0; } -EXPORT_SYMBOL(ndo_dflt_bridge_getlink); static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; - u32 seq = cb->nlh->nlmsg_seq; - struct nlattr *extfilt; + u32 seq = nlh->nlmsg_seq; u32 filter_mask = 0; + int err; - extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct rtgenmsg), - IFLA_EXT_MASK); - if (extfilt) - filter_mask = nla_get_u32(extfilt); + err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask, + cb->extack); + if (err < 0 && cb->strict_check) + return err; rcu_read_lock(); for_each_netdev_rcu(net, dev) { @@ -2395,25 +5393,42 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - br_dev->netdev_ops->ndo_bridge_getlink( - skb, portid, seq, dev, filter_mask) < 0) - break; + if (idx >= cb->args[0]) { + err = br_dev->netdev_ops->ndo_bridge_getlink( + skb, portid, seq, dev, + filter_mask, NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } + } idx++; } if (ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - ops->ndo_bridge_getlink(skb, portid, seq, dev, - filter_mask) < 0) - break; + if (idx >= cb->args[0]) { + err = ops->ndo_bridge_getlink(skb, portid, + seq, dev, + filter_mask, + NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } + } idx++; } } + err = skb->len; +out_err: rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static inline size_t bridge_nlmsg_size(void) @@ -2431,51 +5446,51 @@ static inline size_t bridge_nlmsg_size(void) + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */ } -static int rtnl_bridge_notify(struct net_device *dev, u16 flags) +static int rtnl_bridge_notify(struct net_device *dev) { struct net *net = dev_net(dev); - struct net_device *br_dev = netdev_master_upper_dev_get(dev); struct sk_buff *skb; int err = -EOPNOTSUPP; + if (!dev->netdev_ops->ndo_bridge_getlink) + return 0; + skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC); if (!skb) { err = -ENOMEM; goto errout; } - if ((!flags || (flags & BRIDGE_FLAGS_MASTER)) && - br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { - err = br_dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); - if (err < 0) - goto errout; - } + err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0); + if (err < 0) + goto errout; - if ((flags & BRIDGE_FLAGS_SELF) && - dev->netdev_ops->ndo_bridge_getlink) { - err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); - if (err < 0) - goto errout; - } + /* Notification info is only filled for bridge ports, not the bridge + * device itself. Therefore, a zero notification length is valid and + * should not result in an error. + */ + if (!skb->len) + goto errout; rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0; errout: WARN_ON(err == -EMSGSIZE); kfree_skb(skb); - rtnl_set_sk_err(net, RTNLGRP_LINK, err); + if (err) + rtnl_set_sk_err(net, RTNLGRP_LINK, err); return err; } -static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; - struct nlattr *br_spec, *attr = NULL; + struct nlattr *br_spec, *attr, *br_flags_attr = NULL; int rem, err = -EOPNOTSUPP; - u16 oflags, flags = 0; - bool have_flags = false; + u16 flags = 0; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; @@ -2486,23 +5501,28 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { - pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { - have_flags = true; + if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; + + br_flags_attr = attr; flags = nla_get_u16(attr); - break; + } + + if (nla_type(attr) == IFLA_BRIDGE_MODE) { + if (nla_len(attr) < sizeof(u16)) + return -EINVAL; } } } - oflags = flags; - if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); @@ -2511,7 +5531,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh); + err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, + extack); if (err) goto out; @@ -2522,29 +5543,34 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) if (!dev->netdev_ops->ndo_bridge_setlink) err = -EOPNOTSUPP; else - err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh); - - if (!err) + err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, + flags, + extack); + if (!err) { flags &= ~BRIDGE_FLAGS_SELF; + + /* Generate event to notify upper layer of bridge + * change + */ + err = rtnl_bridge_notify(dev); + } } - if (have_flags) - memcpy(nla_data(attr), &flags, sizeof(flags)); - /* Generate event to notify upper layer of bridge change */ - if (!err) - err = rtnl_bridge_notify(dev, oflags); + if (br_flags_attr) + memcpy(nla_data(br_flags_attr), &flags, sizeof(flags)); out: return err; } -static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr = NULL; int rem, err = -EOPNOTSUPP; - u16 oflags, flags = 0; + u16 flags = 0; bool have_flags = false; if (nlmsg_len(nlh) < sizeof(*ifm)) @@ -2556,23 +5582,23 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { - pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { - nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { - have_flags = true; - flags = nla_get_u16(attr); - break; - } + nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec, + rem) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; + + have_flags = true; + flags = nla_get_u16(attr); + break; } } - oflags = flags; - if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); @@ -2581,7 +5607,7 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh); + err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (err) goto out; @@ -2592,31 +5618,1256 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) if (!dev->netdev_ops->ndo_bridge_dellink) err = -EOPNOTSUPP; else - err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh); + err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh, + flags); - if (!err) + if (!err) { flags &= ~BRIDGE_FLAGS_SELF; + + /* Generate event to notify upper layer of bridge + * change + */ + err = rtnl_bridge_notify(dev); + } } if (have_flags) memcpy(nla_data(attr), &flags, sizeof(flags)); - /* Generate event to notify upper layer of bridge change */ - if (!err) - err = rtnl_bridge_notify(dev, oflags); out: return err; } +static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) +{ + return (mask & IFLA_STATS_FILTER_BIT(attrid)) && + (!idxattr || idxattr == attrid); +} + +static bool +rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id) +{ + return dev->netdev_ops && + dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats && + dev->netdev_ops->ndo_has_offload_stats(dev, attr_id); +} + +static unsigned int +rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id) +{ + return rtnl_offload_xstats_have_ndo(dev, attr_id) ? + sizeof(struct rtnl_link_stats64) : 0; +} + +static int +rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id, + struct sk_buff *skb) +{ + unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id); + struct nlattr *attr = NULL; + void *attr_data; + int err; + + if (!size) + return -ENODATA; + + attr = nla_reserve_64bit(skb, attr_id, size, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + return -EMSGSIZE; + + attr_data = nla_data(attr); + memset(attr_data, 0, size); + + err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data); + if (err) + return err; + + return 0; +} + +static unsigned int +rtnl_offload_xstats_get_size_stats(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + bool enabled = netdev_offload_xstats_enabled(dev, type); + + return enabled ? sizeof(struct rtnl_hw_stats64) : 0; +} + +struct rtnl_offload_xstats_request_used { + bool request; + bool used; +}; + +static int +rtnl_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_offload_xstats_request_used *ru, + struct rtnl_hw_stats64 *stats, + struct netlink_ext_ack *extack) +{ + bool request; + bool used; + int err; + + request = netdev_offload_xstats_enabled(dev, type); + if (!request) { + used = false; + goto out; + } + + err = netdev_offload_xstats_get(dev, type, stats, &used, extack); + if (err) + return err; + +out: + if (ru) { + ru->request = request; + ru->used = used; + } + return 0; +} + +static int +rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id, + struct rtnl_offload_xstats_request_used *ru) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, attr_id); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request)) + goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int +rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + struct rtnl_offload_xstats_request_used ru_l3; + struct nlattr *nest; + int err; + + err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack); + if (err) + return err; + + nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO); + if (!nest) + return -EMSGSIZE; + + if (rtnl_offload_xstats_fill_hw_s_info_one(skb, + IFLA_OFFLOAD_XSTATS_L3_STATS, + &ru_l3)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, + int *prividx, u32 off_filter_mask, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO; + int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS; + int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; + bool have_data = false; + int err; + + if (*prividx <= attr_id_cpu_hit && + (off_filter_mask & + IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) { + err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb); + if (!err) { + have_data = true; + } else if (err != -ENODATA) { + *prividx = attr_id_cpu_hit; + return err; + } + } + + if (*prividx <= attr_id_hw_s_info && + (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) { + *prividx = attr_id_hw_s_info; + + err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack); + if (err) + return err; + + have_data = true; + *prividx = 0; + } + + if (*prividx <= attr_id_l3_stats && + (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) { + unsigned int size_l3; + struct nlattr *attr; + + *prividx = attr_id_l3_stats; + + size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3); + if (!size_l3) + goto skip_l3_stats; + attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + return -EMSGSIZE; + + err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL, + nla_data(attr), extack); + if (err) + return err; + + have_data = true; +skip_l3_stats: + *prividx = 0; + } + + if (!have_data) + return -ENODATA; + + *prividx = 0; + return 0; +} + +static unsigned int +rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + return nla_total_size(0) + + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */ + nla_total_size(sizeof(u8)) + + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */ + nla_total_size(sizeof(u8)) + + 0; +} + +static unsigned int +rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + + return nla_total_size(0) + + /* IFLA_OFFLOAD_XSTATS_L3_STATS */ + rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) + + 0; +} + +static int rtnl_offload_xstats_get_size(const struct net_device *dev, + u32 off_filter_mask) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; + int nla_size = 0; + int size; + + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) { + size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit); + nla_size += nla_total_size_64bit(size); + } + + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO)) + nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev); + + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) { + size = rtnl_offload_xstats_get_size_stats(dev, t_l3); + nla_size += nla_total_size_64bit(size); + } + + if (nla_size != 0) + nla_size += nla_total_size(0); + + return nla_size; +} + +struct rtnl_stats_dump_filters { + /* mask[0] filters outer attributes. Then individual nests have their + * filtering mask at the index of the nested attribute. + */ + u32 mask[IFLA_STATS_MAX + 1]; +}; + +static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, + int type, u32 pid, u32 seq, u32 change, + unsigned int flags, + const struct rtnl_stats_dump_filters *filters, + int *idxattr, int *prividx, + struct netlink_ext_ack *extack) +{ + unsigned int filter_mask = filters->mask[0]; + struct if_stats_msg *ifsm; + struct nlmsghdr *nlh; + struct nlattr *attr; + int s_prividx = *prividx; + int err; + + ASSERT_RTNL(); + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags); + if (!nlh) + return -EMSGSIZE; + + ifsm = nlmsg_data(nlh); + ifsm->family = PF_UNSPEC; + ifsm->pad1 = 0; + ifsm->pad2 = 0; + ifsm->ifindex = dev->ifindex; + ifsm->filter_mask = filter_mask; + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) { + struct rtnl_link_stats64 *sp; + + attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, + sizeof(struct rtnl_link_stats64), + IFLA_STATS_UNSPEC); + if (!attr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + sp = nla_data(attr); + dev_get_stats(dev, sp); + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) { + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + + if (ops && ops->fill_linkxstats) { + *idxattr = IFLA_STATS_LINK_XSTATS; + attr = nla_nest_start_noflag(skb, + IFLA_STATS_LINK_XSTATS); + if (!attr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); + nla_nest_end(skb, attr); + if (err) + goto nla_put_failure; + *idxattr = 0; + } + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, + *idxattr)) { + const struct rtnl_link_ops *ops = NULL; + const struct net_device *master; + + master = netdev_master_upper_dev_get(dev); + if (master) + ops = master->rtnl_link_ops; + if (ops && ops->fill_linkxstats) { + *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; + attr = nla_nest_start_noflag(skb, + IFLA_STATS_LINK_XSTATS_SLAVE); + if (!attr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); + nla_nest_end(skb, attr); + if (err) + goto nla_put_failure; + *idxattr = 0; + } + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, + *idxattr)) { + u32 off_filter_mask; + + off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; + *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; + attr = nla_nest_start_noflag(skb, + IFLA_STATS_LINK_OFFLOAD_XSTATS); + if (!attr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = rtnl_offload_xstats_fill(skb, dev, prividx, + off_filter_mask, extack); + if (err == -ENODATA) + nla_nest_cancel(skb, attr); + else + nla_nest_end(skb, attr); + + if (err && err != -ENODATA) + goto nla_put_failure; + *idxattr = 0; + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) { + struct rtnl_af_ops *af_ops; + + *idxattr = IFLA_STATS_AF_SPEC; + attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC); + if (!attr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + rcu_read_lock(); + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { + if (af_ops->fill_stats_af) { + struct nlattr *af; + + af = nla_nest_start_noflag(skb, + af_ops->family); + if (!af) { + rcu_read_unlock(); + err = -EMSGSIZE; + goto nla_put_failure; + } + err = af_ops->fill_stats_af(skb, dev); + + if (err == -ENODATA) { + nla_nest_cancel(skb, af); + } else if (err < 0) { + rcu_read_unlock(); + goto nla_put_failure; + } + + nla_nest_end(skb, af); + } + } + rcu_read_unlock(); + + nla_nest_end(skb, attr); + + *idxattr = 0; + } + + nlmsg_end(skb, nlh); + + return 0; + +nla_put_failure: + /* not a multi message or no progress mean a real error */ + if (!(flags & NLM_F_MULTI) || s_prividx == *prividx) + nlmsg_cancel(skb, nlh); + else + nlmsg_end(skb, nlh); + + return err; +} + +static size_t if_nlmsg_stats_size(const struct net_device *dev, + const struct rtnl_stats_dump_filters *filters) +{ + size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); + unsigned int filter_mask = filters->mask[0]; + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) + size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + int attr = IFLA_STATS_LINK_XSTATS; + + if (ops && ops->get_linkxstats_size) { + size += nla_total_size(ops->get_linkxstats_size(dev, + attr)); + /* for IFLA_STATS_LINK_XSTATS */ + size += nla_total_size(0); + } + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) { + struct net_device *_dev = (struct net_device *)dev; + const struct rtnl_link_ops *ops = NULL; + const struct net_device *master; + + /* netdev_master_upper_dev_get can't take const */ + master = netdev_master_upper_dev_get(_dev); + if (master) + ops = master->rtnl_link_ops; + if (ops && ops->get_linkxstats_size) { + int attr = IFLA_STATS_LINK_XSTATS_SLAVE; + + size += nla_total_size(ops->get_linkxstats_size(dev, + attr)); + /* for IFLA_STATS_LINK_XSTATS_SLAVE */ + size += nla_total_size(0); + } + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) { + u32 off_filter_mask; + + off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; + size += rtnl_offload_xstats_get_size(dev, off_filter_mask); + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { + struct rtnl_af_ops *af_ops; + + /* for IFLA_STATS_AF_SPEC */ + size += nla_total_size(0); + + rcu_read_lock(); + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { + if (af_ops->get_stats_af_size) { + size += nla_total_size( + af_ops->get_stats_af_size(dev)); + + /* for AF_* */ + size += nla_total_size(0); + } + } + rcu_read_unlock(); + } + + return size; +} + +#define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1) + +static const struct nla_policy +rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = { + [IFLA_STATS_LINK_OFFLOAD_XSTATS] = + NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID), +}; + +static const struct nla_policy +rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = { + [IFLA_STATS_GET_FILTERS] = + NLA_POLICY_NESTED(rtnl_stats_get_policy_filters), +}; + +static const struct nla_policy +ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = { + [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1), +}; + +static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters, + struct rtnl_stats_dump_filters *filters, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_STATS_MAX + 1]; + int err; + int at; + + err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters, + rtnl_stats_get_policy_filters, extack); + if (err < 0) + return err; + + for (at = 1; at <= IFLA_STATS_MAX; at++) { + if (tb[at]) { + if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) { + NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask"); + return -EINVAL; + } + filters->mask[at] = nla_get_u32(tb[at]); + } + } + + return 0; +} + +static int rtnl_stats_get_parse(const struct nlmsghdr *nlh, + u32 filter_mask, + struct rtnl_stats_dump_filters *filters, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; + int err; + int i; + + filters->mask[0] = filter_mask; + for (i = 1; i < ARRAY_SIZE(filters->mask); i++) + filters->mask[i] = -1U; + + err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb, + IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack); + if (err < 0) + return err; + + if (tb[IFLA_STATS_GET_FILTERS]) { + err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS], + filters, extack); + if (err) + return err; + } + + return 0; +} + +static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, + bool is_dump, struct netlink_ext_ack *extack) +{ + struct if_stats_msg *ifsm; + + ifsm = nlmsg_payload(nlh, sizeof(*ifsm)); + if (!ifsm) { + NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); + return -EINVAL; + } + + if (!strict_check) + return 0; + + /* only requests using strict checks can pass data to influence + * the dump. The legacy exception is filter_mask. + */ + if (ifsm->pad1 || ifsm->pad2 || (is_dump && ifsm->ifindex)) { + NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); + return -EINVAL; + } + if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { + NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); + return -EINVAL; + } + + return 0; +} + +static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct rtnl_stats_dump_filters filters; + struct net *net = sock_net(skb->sk); + struct net_device *dev = NULL; + int idxattr = 0, prividx = 0; + struct if_stats_msg *ifsm; + struct sk_buff *nskb; + int err; + + err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), + false, extack); + if (err) + return err; + + ifsm = nlmsg_data(nlh); + if (ifsm->ifindex > 0) + dev = __dev_get_by_index(net, ifsm->ifindex); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (!ifsm->filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get"); + return -EINVAL; + } + + err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack); + if (err) + return err; + + nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL); + if (!nskb) + return -ENOBUFS; + + err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, + NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + 0, &filters, &idxattr, &prividx, extack); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(nskb); + } else { + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); + } + + return err; +} + +static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct rtnl_stats_dump_filters filters; + struct net *net = sock_net(skb->sk); + unsigned int flags = NLM_F_MULTI; + struct if_stats_msg *ifsm; + struct { + unsigned long ifindex; + int idxattr; + int prividx; + } *ctx = (void *)cb->ctx; + struct net_device *dev; + int err; + + cb->seq = net->dev_base_seq; + + err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack); + if (err) + return err; + + ifsm = nlmsg_data(cb->nlh); + if (!ifsm->filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); + return -EINVAL; + } + + err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters, + extack); + if (err) + return err; + + for_each_netdev_dump(net, dev, ctx->ifindex) { + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + flags, &filters, + &ctx->idxattr, &ctx->prividx, + extack); + /* If we ran out of room on the first message, + * we're in trouble. + */ + WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + + if (err < 0) + break; + ctx->prividx = 0; + ctx->idxattr = 0; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + } + + return err; +} + +void rtnl_offload_xstats_notify(struct net_device *dev) +{ + struct rtnl_stats_dump_filters response_filters = {}; + struct net *net = dev_net(dev); + int idxattr = 0, prividx = 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + ASSERT_RTNL(); + + response_filters.mask[0] |= + IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); + response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); + + skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), + GFP_KERNEL); + if (!skb) + goto errout; + + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0, + &response_filters, &idxattr, &prividx, NULL); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL); + return; + +errout: + rtnl_set_sk_err(net, RTNLGRP_STATS, err); +} +EXPORT_SYMBOL(rtnl_offload_xstats_notify); + +static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + struct rtnl_stats_dump_filters response_filters = {}; + struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct net_device *dev = NULL; + struct if_stats_msg *ifsm; + bool notify = false; + int err; + + err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), + false, extack); + if (err) + return err; + + ifsm = nlmsg_data(nlh); + if (ifsm->family != AF_UNSPEC) { + NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC"); + return -EINVAL; + } + + if (ifsm->ifindex > 0) + dev = __dev_get_by_index(net, ifsm->ifindex); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (ifsm->filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set"); + return -EINVAL; + } + + err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX, + ifla_stats_set_policy, extack); + if (err < 0) + return err; + + if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) { + u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]); + + if (req) + err = netdev_offload_xstats_enable(dev, t_l3, extack); + else + err = netdev_offload_xstats_disable(dev, t_l3); + + if (!err) + notify = true; + else if (err != -EALREADY) + return err; + + response_filters.mask[0] |= + IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); + response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); + } + + if (notify) + rtnl_offload_xstats_notify(dev); + + return 0; +} + +static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct br_port_msg *bpm; + + bpm = nlmsg_payload(nlh, sizeof(*bpm)); + if (!bpm) { + NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request"); + return -EINVAL; + } + + if (bpm->ifindex) { + NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request"); + return -EINVAL; + } + if (nlmsg_attrlen(nlh, sizeof(*bpm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request"); + return -EINVAL; + } + + return 0; +} + +struct rtnl_mdb_dump_ctx { + long idx; +}; + +static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rtnl_mdb_dump_ctx *ctx = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int idx, s_idx; + int err; + + NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx); + + if (cb->strict_check) { + err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack); + if (err) + return err; + } + + s_idx = ctx->idx; + idx = 0; + + for_each_netdev(net, dev) { + if (idx < s_idx) + goto skip; + if (!dev->netdev_ops->ndo_mdb_dump) + goto skip; + + err = dev->netdev_ops->ndo_mdb_dump(dev, skb, cb); + if (err == -EMSGSIZE) + goto out; + /* Moving on to next device, reset markers and sequence + * counters since they are all maintained per-device. + */ + memset(cb->ctx, 0, sizeof(cb->ctx)); + cb->prev_seq = 0; + cb->seq = 0; +skip: + idx++; + } + +out: + ctx->idx = idx; + return skb->len; +} + +static int rtnl_validate_mdb_entry_get(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct br_mdb_entry *entry = nla_data(attr); + + if (nla_len(attr) != sizeof(struct br_mdb_entry)) { + NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); + return -EINVAL; + } + + if (entry->ifindex) { + NL_SET_ERR_MSG(extack, "Entry ifindex cannot be specified"); + return -EINVAL; + } + + if (entry->state) { + NL_SET_ERR_MSG(extack, "Entry state cannot be specified"); + return -EINVAL; + } + + if (entry->flags) { + NL_SET_ERR_MSG(extack, "Entry flags cannot be specified"); + return -EINVAL; + } + + if (entry->vid >= VLAN_VID_MASK) { + NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); + return -EINVAL; + } + + if (entry->addr.proto != htons(ETH_P_IP) && + entry->addr.proto != htons(ETH_P_IPV6) && + entry->addr.proto != 0) { + NL_SET_ERR_MSG(extack, "Unknown entry protocol"); + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy mdba_get_policy[MDBA_GET_ENTRY_MAX + 1] = { + [MDBA_GET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + rtnl_validate_mdb_entry_get, + sizeof(struct br_mdb_entry)), + [MDBA_GET_ENTRY_ATTRS] = { .type = NLA_NESTED }, +}; + +static int rtnl_mdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[MDBA_GET_ENTRY_MAX + 1]; + struct net *net = sock_net(in_skb->sk); + struct br_port_msg *bpm; + struct net_device *dev; + int err; + + err = nlmsg_parse(nlh, sizeof(struct br_port_msg), tb, + MDBA_GET_ENTRY_MAX, mdba_get_policy, extack); + if (err) + return err; + + bpm = nlmsg_data(nlh); + if (!bpm->ifindex) { + NL_SET_ERR_MSG(extack, "Invalid ifindex"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, bpm->ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "Device doesn't exist"); + return -ENODEV; + } + + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_GET_ENTRY)) { + NL_SET_ERR_MSG(extack, "Missing MDBA_GET_ENTRY attribute"); + return -EINVAL; + } + + if (!dev->netdev_ops->ndo_mdb_get) { + NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); + return -EOPNOTSUPP; + } + + return dev->netdev_ops->ndo_mdb_get(dev, tb, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, extack); +} + +static int rtnl_validate_mdb_entry(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct br_mdb_entry *entry = nla_data(attr); + + if (nla_len(attr) != sizeof(struct br_mdb_entry)) { + NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); + return -EINVAL; + } + + if (entry->ifindex == 0) { + NL_SET_ERR_MSG(extack, "Zero entry ifindex is not allowed"); + return -EINVAL; + } + + if (entry->addr.proto == htons(ETH_P_IP)) { + if (!ipv4_is_multicast(entry->addr.u.ip4) && + !ipv4_is_zeronet(entry->addr.u.ip4)) { + NL_SET_ERR_MSG(extack, "IPv4 entry group address is not multicast or 0.0.0.0"); + return -EINVAL; + } + if (ipv4_is_local_multicast(entry->addr.u.ip4)) { + NL_SET_ERR_MSG(extack, "IPv4 entry group address is local multicast"); + return -EINVAL; + } +#if IS_ENABLED(CONFIG_IPV6) + } else if (entry->addr.proto == htons(ETH_P_IPV6)) { + if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) { + NL_SET_ERR_MSG(extack, "IPv6 entry group address is link-local all nodes"); + return -EINVAL; + } +#endif + } else if (entry->addr.proto == 0) { + /* L2 mdb */ + if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) { + NL_SET_ERR_MSG(extack, "L2 entry group is not multicast"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(extack, "Unknown entry protocol"); + return -EINVAL; + } + + if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { + NL_SET_ERR_MSG(extack, "Unknown entry state"); + return -EINVAL; + } + if (entry->vid >= VLAN_VID_MASK) { + NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy mdba_policy[MDBA_SET_ENTRY_MAX + 1] = { + [MDBA_SET_ENTRY_UNSPEC] = { .strict_start_type = MDBA_SET_ENTRY_ATTRS + 1 }, + [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + rtnl_validate_mdb_entry, + sizeof(struct br_mdb_entry)), + [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, +}; + +static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct br_port_msg *bpm; + struct net_device *dev; + int err; + + err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, + MDBA_SET_ENTRY_MAX, mdba_policy, extack); + if (err) + return err; + + bpm = nlmsg_data(nlh); + if (!bpm->ifindex) { + NL_SET_ERR_MSG(extack, "Invalid ifindex"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, bpm->ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "Device doesn't exist"); + return -ENODEV; + } + + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) { + NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute"); + return -EINVAL; + } + + if (!dev->netdev_ops->ndo_mdb_add) { + NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); + return -EOPNOTSUPP; + } + + return dev->netdev_ops->ndo_mdb_add(dev, tb, nlh->nlmsg_flags, extack); +} + +static int rtnl_validate_mdb_entry_del_bulk(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct br_mdb_entry *entry = nla_data(attr); + struct br_mdb_entry zero_entry = {}; + + if (nla_len(attr) != sizeof(struct br_mdb_entry)) { + NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); + return -EINVAL; + } + + if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { + NL_SET_ERR_MSG(extack, "Unknown entry state"); + return -EINVAL; + } + + if (entry->flags) { + NL_SET_ERR_MSG(extack, "Entry flags cannot be set"); + return -EINVAL; + } + + if (entry->vid >= VLAN_N_VID - 1) { + NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); + return -EINVAL; + } + + if (memcmp(&entry->addr, &zero_entry.addr, sizeof(entry->addr))) { + NL_SET_ERR_MSG(extack, "Entry address cannot be set"); + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy mdba_del_bulk_policy[MDBA_SET_ENTRY_MAX + 1] = { + [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + rtnl_validate_mdb_entry_del_bulk, + sizeof(struct br_mdb_entry)), + [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, +}; + +static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); + struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct br_port_msg *bpm; + struct net_device *dev; + int err; + + if (!del_bulk) + err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, + MDBA_SET_ENTRY_MAX, mdba_policy, + extack); + else + err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, + mdba_del_bulk_policy, extack); + if (err) + return err; + + bpm = nlmsg_data(nlh); + if (!bpm->ifindex) { + NL_SET_ERR_MSG(extack, "Invalid ifindex"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, bpm->ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "Device doesn't exist"); + return -ENODEV; + } + + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) { + NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute"); + return -EINVAL; + } + + if (del_bulk) { + if (!dev->netdev_ops->ndo_mdb_del_bulk) { + NL_SET_ERR_MSG(extack, "Device does not support MDB bulk deletion"); + return -EOPNOTSUPP; + } + return dev->netdev_ops->ndo_mdb_del_bulk(dev, tb, extack); + } + + if (!dev->netdev_ops->ndo_mdb_del) { + NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); + return -EOPNOTSUPP; + } + + return dev->netdev_ops->ndo_mdb_del(dev, tb, extack); +} + /* Process one rtnetlink message. */ -static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED); + rtnl_dumpit_func dumpit = cb->data; + int err; + + /* Previous iteration have already finished, avoid calling->dumpit() + * again, it may not expect to be called after it reached the end. + */ + if (!dumpit) + return 0; + + if (needs_lock) + rtnl_lock(); + err = dumpit(skb, cb); + if (needs_lock) + rtnl_unlock(); + + /* Old dump handlers used to send NLM_DONE as in a separate recvmsg(). + * Some applications which parse netlink manually depend on this. + */ + if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) { + if (err < 0 && err != -EMSGSIZE) + return err; + if (!err) + cb->data = NULL; + + return skb->len; + } + return err; +} + +static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *control) +{ + if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE || + !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) { + WARN_ON(control->data); + control->data = control->dump; + control->dump = rtnl_dumpit; + } + + return netlink_dump_start(ssk, skb, nlh, control); +} + +static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct rtnl_link *link; + enum rtnl_kinds kind; + struct module *owner; + int err = -EOPNOTSUPP; rtnl_doit_func doit; - int sz_idx, kind; + unsigned int flags; int family; int type; - int err; type = nlh->nlmsg_type; if (type > RTM_MAX) @@ -2629,50 +6880,121 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return 0; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; - sz_idx = type>>2; - kind = type&3; + kind = rtnl_msgtype_kind(type); - if (kind != 2 && !ns_capable(net->user_ns, CAP_NET_ADMIN)) + if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; - if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + rcu_read_lock(); + if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) { struct sock *rtnl; rtnl_dumpit_func dumpit; - rtnl_calcit_func calcit; - u16 min_dump_alloc = 0; + u32 min_dump_alloc = 0; + + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) { + family = PF_UNSPEC; + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) + goto err_unlock; + } + owner = link->owner; + dumpit = link->dumpit; + flags = link->flags; - dumpit = rtnl_get_dumpit(family, type); - if (dumpit == NULL) - return -EOPNOTSUPP; - calcit = rtnl_get_calcit(family, type); - if (calcit) - min_dump_alloc = calcit(skb, nlh); + if (type == RTM_GETLINK - RTM_BASE) + min_dump_alloc = rtnl_calcit(skb, nlh); + + err = 0; + /* need to do this before rcu_read_unlock() */ + if (!try_module_get(owner)) + err = -EPROTONOSUPPORT; + + rcu_read_unlock(); - __rtnl_unlock(); rtnl = net->rtnl; - { + if (err == 0) { struct netlink_dump_control c = { .dump = dumpit, .min_dump_alloc = min_dump_alloc, + .module = owner, + .flags = flags, }; - err = netlink_dump_start(rtnl, skb, nlh, &c); + err = rtnetlink_dump_start(rtnl, skb, nlh, &c); + /* netlink_dump_start() will keep a reference on + * module if dump is still in progress. + */ + module_put(owner); } - rtnl_lock(); return err; } - doit = rtnl_get_doit(family, type); - if (doit == NULL) - return -EOPNOTSUPP; + link = rtnl_get_link(family, type); + if (!link || !link->doit) { + family = PF_UNSPEC; + link = rtnl_get_link(PF_UNSPEC, type); + if (!link || !link->doit) + goto out_unlock; + } + + owner = link->owner; + if (!try_module_get(owner)) { + err = -EPROTONOSUPPORT; + goto out_unlock; + } + + flags = link->flags; + if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) && + !(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) { + NL_SET_ERR_MSG(extack, "Bulk delete is not supported"); + module_put(owner); + goto err_unlock; + } + + if (flags & RTNL_FLAG_DOIT_UNLOCKED) { + doit = link->doit; + rcu_read_unlock(); + if (doit) + err = doit(skb, nlh, extack); + module_put(owner); + return err; + } + rcu_read_unlock(); + + rtnl_lock(); + link = rtnl_get_link(family, type); + if (link && link->doit) + err = link->doit(skb, nlh, extack); + rtnl_unlock(); + + module_put(owner); + + return err; - return doit(skb, nlh); +out_unlock: + rcu_read_unlock(); + return err; + +err_unlock: + rcu_read_unlock(); + return -EOPNOTSUPP; } static void rtnetlink_rcv(struct sk_buff *skb) { - rtnl_lock(); netlink_rcv_skb(skb, &rtnetlink_rcv_msg); - rtnl_unlock(); +} + +static int rtnetlink_bind(struct net *net, int group) +{ + switch (group) { + case RTNLGRP_IPV4_MROUTE_R: + case RTNLGRP_IPV6_MROUTE_R: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + break; + } + return 0; } static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -2680,21 +7002,23 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { - case NETDEV_UP: - case NETDEV_DOWN: - case NETDEV_PRE_UP: - case NETDEV_POST_INIT: - case NETDEV_REGISTER: - case NETDEV_CHANGE: - case NETDEV_PRE_TYPE_CHANGE: - case NETDEV_GOING_DOWN: - case NETDEV_UNREGISTER: - case NETDEV_UNREGISTER_FINAL: - case NETDEV_RELEASE: - case NETDEV_JOIN: + case NETDEV_REBOOT: + case NETDEV_CHANGEMTU: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGENAME: + case NETDEV_FEAT_CHANGE: + case NETDEV_BONDING_FAILOVER: + case NETDEV_POST_TYPE_CHANGE: + case NETDEV_NOTIFY_PEERS: + case NETDEV_CHANGEUPPER: + case NETDEV_RESEND_IGMP: + case NETDEV_CHANGEINFODATA: + case NETDEV_CHANGELOWERSTATE: + case NETDEV_CHANGE_TX_QUEUE_LEN: + rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), + GFP_KERNEL, NULL, 0, 0, NULL); break; default: - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); break; } return NOTIFY_DONE; @@ -2711,8 +7035,8 @@ static int __net_init rtnetlink_net_init(struct net *net) struct netlink_kernel_cfg cfg = { .groups = RTNLGRP_MAX, .input = rtnetlink_rcv, - .cb_mutex = &rtnl_mutex, .flags = NL_CFG_F_NONROOT_RECV, + .bind = rtnetlink_bind, }; sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); @@ -2733,6 +7057,41 @@ static struct pernet_operations rtnetlink_net_ops = { .exit = rtnetlink_net_exit, }; +static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_DELLINK, .doit = rtnl_dellink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, + {.msgtype = RTM_GETLINK, .doit = rtnl_getlink, + .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, + {.msgtype = RTM_SETLINK, .doit = rtnl_setlink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, + {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETSTATS, .doit = rtnl_stats_get, + .dumpit = rtnl_stats_dump}, + {.msgtype = RTM_SETSTATS, .doit = rtnl_stats_set}, + {.msgtype = RTM_NEWLINKPROP, .doit = rtnl_newlinkprop}, + {.msgtype = RTM_DELLINKPROP, .doit = rtnl_dellinkprop}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETLINK, + .dumpit = rtnl_bridge_getlink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELLINK, + .doit = rtnl_bridge_dellink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_SETLINK, + .doit = rtnl_bridge_setlink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_NEWNEIGH, .doit = rtnl_fdb_add}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELNEIGH, .doit = rtnl_fdb_del, + .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETNEIGH, .doit = rtnl_fdb_get, + .dumpit = rtnl_fdb_dump}, + {.protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB, .doit = rtnl_mdb_add}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELMDB, .doit = rtnl_mdb_del, + .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETMDB, .doit = rtnl_mdb_get, + .dumpit = rtnl_mdb_dump}, +}; + void __init rtnetlink_init(void) { if (register_pernet_subsys(&rtnetlink_net_ops)) @@ -2740,21 +7099,5 @@ void __init rtnetlink_init(void) register_netdevice_notifier(&rtnetlink_dev_notifier); - rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, - rtnl_dump_ifinfo, rtnl_calcit); - rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL); - - rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL); - rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL); - - rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL); - rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL); - rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL); - - rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL); - rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL); - rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL); + rtnl_register_many(rtnetlink_rtnl_msg_handlers); } - diff --git a/net/core/scm.c b/net/core/scm.c index 03795d0147f2..cd87f66671aa 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* scm.c - Socket level control messages processing. * * Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Alignment and value checking mods by Craig Metz - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> @@ -14,6 +10,7 @@ #include <linux/capability.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/sched/user.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/stat.h> @@ -26,10 +23,14 @@ #include <linux/security.h> #include <linux/pid_namespace.h> #include <linux/pid.h> +#include <uapi/linux/pidfd.h> +#include <linux/pidfs.h> #include <linux/nsproxy.h> #include <linux/slab.h> +#include <linux/errqueue.h> +#include <linux/io_uring.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -37,6 +38,7 @@ #include <net/compat.h> #include <net/scm.h> #include <net/cls_cgroup.h> +#include <net/af_unix.h> /* @@ -54,11 +56,11 @@ static __inline__ int scm_check_creds(struct ucred *creds) return -EINVAL; if ((creds->pid == task_tgid_vnr(current) || - ns_capable(current->nsproxy->pid_ns->user_ns, CAP_SYS_ADMIN)) && + ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) && ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || - uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) && + uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) && ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || - gid_eq(gid, cred->sgid)) || nsown_capable(CAP_SETGID))) { + gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) { return 0; } return -EPERM; @@ -71,7 +73,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) struct file **fpp; int i, num; - num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); + num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int); if (num <= 0) return 0; @@ -81,12 +83,20 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (!fpl) { - fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT); if (!fpl) return -ENOMEM; *fplp = fpl; fpl->count = 0; + fpl->count_unix = 0; fpl->max = SCM_MAX_FD; + fpl->user = NULL; +#if IS_ENABLED(CONFIG_UNIX) + fpl->inflight = false; + fpl->dead = false; + fpl->edges = NULL; + INIT_LIST_HEAD(&fpl->vertices); +#endif } fpp = &fpl->fp[fpl->count]; @@ -104,9 +114,21 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (fd < 0 || !(file = fget_raw(fd))) return -EBADF; + /* don't allow io_uring files */ + if (io_is_uring_fops(file)) { + fput(file); + return -EINVAL; + } + if (unix_get_socket(file)) + fpl->count_unix++; + *fpp++ = file; fpl->count++; } + + if (!fpl->user) + fpl->user = get_uid(current_user()); + return num; } @@ -119,18 +141,35 @@ void __scm_destroy(struct scm_cookie *scm) scm->fp = NULL; for (i=fpl->count-1; i>=0; i--) fput(fpl->fp[i]); + free_uid(fpl->user); kfree(fpl); } } EXPORT_SYMBOL(__scm_destroy); +static inline int scm_replace_pid(struct scm_cookie *scm, struct pid *pid) +{ + int err; + + /* drop all previous references */ + scm_destroy_cred(scm); + + err = pidfs_register_pid(pid); + if (unlikely(err)) + return err; + + scm->pid = pid; + scm->creds.pid = pid_vnr(pid); + return 0; +} + int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { + const struct proto_ops *ops = READ_ONCE(sock->ops); struct cmsghdr *cmsg; int err; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) - { + for_each_cmsghdr(cmsg, msg) { err = -EINVAL; /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ @@ -150,7 +189,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) switch (cmsg->cmsg_type) { case SCM_RIGHTS: - if (!sock->ops || sock->ops->family != PF_UNIX) + if (!ops || ops->family != PF_UNIX) goto error; err=scm_fp_copy(cmsg, &p->fp); if (err<0) @@ -168,15 +207,21 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) if (err) goto error; - p->creds.pid = creds.pid; if (!p->pid || pid_vnr(p->pid) != creds.pid) { struct pid *pid; err = -ESRCH; pid = find_get_pid(creds.pid); if (!pid) goto error; - put_pid(p->pid); - p->pid = pid; + + /* pass a struct pid reference from + * find_get_pid() to scm_replace_pid(). + */ + err = scm_replace_pid(p, pid); + if (err) { + put_pid(pid); + goto error; + } } err = -EINVAL; @@ -209,16 +254,12 @@ EXPORT_SYMBOL(__scm_send); int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { - struct cmsghdr __user *cm - = (__force struct cmsghdr __user *)msg->msg_control; - struct cmsghdr cmhdr; int cmlen = CMSG_LEN(len); - int err; - if (MSG_CMSG_COMPAT & msg->msg_flags) + if (msg->msg_flags & MSG_CMSG_COMPAT) return put_cmsg_compat(msg, level, type, len, data); - if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) { msg->msg_flags |= MSG_CTRUNC; return 0; /* XXX: return error? check spec. */ } @@ -226,96 +267,133 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_flags |= MSG_CTRUNC; cmlen = msg->msg_controllen; } - cmhdr.cmsg_level = level; - cmhdr.cmsg_type = type; - cmhdr.cmsg_len = cmlen; - - err = -EFAULT; - if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) - goto out; - if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) - goto out; - cmlen = CMSG_SPACE(len); - if (msg->msg_controllen < cmlen) - cmlen = msg->msg_controllen; - msg->msg_control += cmlen; + + if (msg->msg_control_is_user) { + struct cmsghdr __user *cm = msg->msg_control_user; + + check_object_size(data, cmlen - sizeof(*cm), true); + + scoped_user_write_access_size(cm, cmlen, efault) { + unsafe_put_user(cmlen, &cm->cmsg_len, efault); + unsafe_put_user(level, &cm->cmsg_level, efault); + unsafe_put_user(type, &cm->cmsg_type, efault); + unsafe_copy_to_user(CMSG_USER_DATA(cm), data, + cmlen - sizeof(*cm), efault); + } + } else { + struct cmsghdr *cm = msg->msg_control; + + cm->cmsg_level = level; + cm->cmsg_type = type; + cm->cmsg_len = cmlen; + memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm)); + } + + cmlen = min(CMSG_SPACE(len), msg->msg_controllen); + if (msg->msg_control_is_user) + msg->msg_control_user += cmlen; + else + msg->msg_control += cmlen; msg->msg_controllen -= cmlen; - err = 0; -out: - return err; + return 0; + +efault: + return -EFAULT; } EXPORT_SYMBOL(put_cmsg); -void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) +int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data) { - struct cmsghdr __user *cm - = (__force struct cmsghdr __user*)msg->msg_control; + /* Don't produce truncated CMSGs */ + if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len)) + return -ETOOSMALL; + + return put_cmsg(msg, level, type, len, data); +} - int fdmax = 0; - int fdnum = scm->fp->count; - struct file **fp = scm->fp->fp; - int __user *cmfptr; +void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) +{ + struct scm_timestamping64 tss; + int i; + + for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { + tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; + tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; + } + + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_NEW, sizeof(tss), &tss); +} +EXPORT_SYMBOL(put_cmsg_scm_timestamping64); + +void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) +{ + struct scm_timestamping tss; + int i; + + for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { + tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; + tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; + } + + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss); +} +EXPORT_SYMBOL(put_cmsg_scm_timestamping); + +static int scm_max_fds(struct msghdr *msg) +{ + if (msg->msg_controllen <= sizeof(struct cmsghdr)) + return 0; + return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int); +} + +void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) +{ + struct cmsghdr __user *cm = + (__force struct cmsghdr __user *)msg->msg_control_user; + unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; + int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count); + int __user *cmsg_data = CMSG_USER_DATA(cm); int err = 0, i; - if (MSG_CMSG_COMPAT & msg->msg_flags) { + /* no use for FD passing from kernel space callers */ + if (WARN_ON_ONCE(!msg->msg_control_is_user)) + return; + + if (msg->msg_flags & MSG_CMSG_COMPAT) { scm_detach_fds_compat(msg, scm); return; } - if (msg->msg_controllen > sizeof(struct cmsghdr)) - fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr)) - / sizeof(int)); - - if (fdnum < fdmax) - fdmax = fdnum; - - for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax; - i++, cmfptr++) - { - struct socket *sock; - int new_fd; - err = security_file_receive(fp[i]); - if (err) - break; - err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags - ? O_CLOEXEC : 0); + for (i = 0; i < fdmax; i++) { + err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err < 0) break; - new_fd = err; - err = put_user(new_fd, cmfptr); - if (err) { - put_unused_fd(new_fd); - break; - } - /* Bump the usage count and install the file. */ - sock = sock_from_file(fp[i], &err); - if (sock) { - sock_update_netprioidx(sock->sk); - sock_update_classid(sock->sk); - } - fd_install(new_fd, get_file(fp[i])); } - if (i > 0) - { - int cmlen = CMSG_LEN(i*sizeof(int)); + if (i > 0) { + int cmlen = CMSG_LEN(i * sizeof(int)); + err = put_user(SOL_SOCKET, &cm->cmsg_level); if (!err) err = put_user(SCM_RIGHTS, &cm->cmsg_type); if (!err) err = put_user(cmlen, &cm->cmsg_len); if (!err) { - cmlen = CMSG_SPACE(i*sizeof(int)); - msg->msg_control += cmlen; + cmlen = CMSG_SPACE(i * sizeof(int)); + if (msg->msg_controllen < cmlen) + cmlen = msg->msg_controllen; + msg->msg_control_user += cmlen; msg->msg_controllen -= cmlen; } } - if (i < fdnum || (fdnum && fdmax <= 0)) + + if (i < scm->fp->count || (scm->fp->count && fdmax <= 0)) msg->msg_flags |= MSG_CTRUNC; /* - * All of the files that fit in the message have had their - * usage counts incremented, so we just free the list. + * All of the files that fit in the message have had their usage counts + * incremented, so we just free the list. */ __scm_destroy(scm); } @@ -330,12 +408,141 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) return NULL; new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); + new_fpl->max = new_fpl->count; + new_fpl->user = get_uid(fpl->user); +#if IS_ENABLED(CONFIG_UNIX) + new_fpl->inflight = false; + new_fpl->edges = NULL; + INIT_LIST_HEAD(&new_fpl->vertices); +#endif } return new_fpl; } EXPORT_SYMBOL(scm_fp_dup); + +#ifdef CONFIG_SECURITY_NETWORK +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) +{ + struct lsm_context ctx; + int err; + + if (sk->sk_scm_security) { + err = security_secid_to_secctx(scm->secid, &ctx); + + if (err >= 0) { + put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, + ctx.context); + + security_release_secctx(&ctx); + } + } +} + +static bool scm_has_secdata(struct sock *sk) +{ + return sk->sk_scm_security; +} +#else +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) +{ +} + +static bool scm_has_secdata(struct sock *sk) +{ + return false; +} +#endif + +static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) +{ + struct file *pidfd_file = NULL; + int len, pidfd; + + /* put_cmsg() doesn't return an error if CMSG is truncated, + * that's why we need to opencode these checks here. + */ + if (msg->msg_flags & MSG_CMSG_COMPAT) + len = sizeof(struct compat_cmsghdr) + sizeof(int); + else + len = sizeof(struct cmsghdr) + sizeof(int); + + if (msg->msg_controllen < len) { + msg->msg_flags |= MSG_CTRUNC; + return; + } + + if (!scm->pid) + return; + + pidfd = pidfd_prepare(scm->pid, PIDFD_STALE, &pidfd_file); + + if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { + if (pidfd_file) { + put_unused_fd(pidfd); + fput(pidfd_file); + } + + return; + } + + if (pidfd_file) + fd_install(pidfd, pidfd_file); +} + +static bool __scm_recv_common(struct sock *sk, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!msg->msg_control) { + if (sk->sk_scm_credentials || sk->sk_scm_pidfd || + scm->fp || scm_has_secdata(sk)) + msg->msg_flags |= MSG_CTRUNC; + + scm_destroy(scm); + return false; + } + + if (sk->sk_scm_credentials) { + struct user_namespace *current_ns = current_user_ns(); + struct ucred ucreds = { + .pid = scm->creds.pid, + .uid = from_kuid_munged(current_ns, scm->creds.uid), + .gid = from_kgid_munged(current_ns, scm->creds.gid), + }; + + put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); + } + + scm_passec(sk, msg, scm); + + if (scm->fp) + scm_detach_fds(msg, scm); + + return true; +} + +void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock->sk, msg, scm, flags)) + return; + + scm_destroy_cred(scm); +} +EXPORT_SYMBOL(scm_recv); + +void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock->sk, msg, scm, flags)) + return; + + if (sock->sk->sk_scm_pidfd) + scm_pidfd_recv(msg, scm); + + scm_destroy_cred(scm); +} diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 6a2f13cee86a..9a3965680451 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -1,21 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + #include <linux/kernel.h> #include <linux/init.h> -#include <linux/cryptohash.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/random.h> #include <linux/hrtimer.h> #include <linux/ktime.h> #include <linux/string.h> - +#include <linux/net.h> +#include <linux/siphash.h> #include <net/secure_seq.h> -static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; +#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) +#include <linux/in6.h> +#include <net/tcp.h> + +static siphash_aligned_key_t net_secret; +static siphash_aligned_key_t ts_secret; + +#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ) + +static __always_inline void net_secret_init(void) +{ + net_get_random_once(&net_secret, sizeof(net_secret)); +} -void net_secret_init(void) +static __always_inline void ts_secret_init(void) { - get_random_bytes(net_secret, sizeof(net_secret)); + net_get_random_once(&ts_secret, sizeof(ts_secret)); } +#endif #ifdef CONFIG_INET static u32 seq_scale(u32 seq) @@ -30,156 +48,111 @@ static u32 seq_scale(u32 seq) * overlaps less than one time per MSL (2 minutes). * Choosing a clock of 64 ns period is OK. (period of 274 s) */ - return seq + (ktime_to_ns(ktime_get_real()) >> 6); + return seq + (ktime_get_real_ns() >> 6); } #endif #if IS_ENABLED(CONFIG_IPV6) -__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport) +u32 secure_tcpv6_ts_off(const struct net *net, + const __be32 *saddr, const __be32 *daddr) { - u32 secret[MD5_MESSAGE_BYTES / 4]; - u32 hash[MD5_DIGEST_WORDS]; - u32 i; - - memcpy(hash, saddr, 16); - for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + (__force u32)daddr[i]; - secret[4] = net_secret[4] + - (((__force u16)sport << 16) + (__force u16)dport); - for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) - secret[i] = net_secret[i]; - - md5_transform(hash, secret); + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + }; + + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) + return 0; + + ts_secret_init(); + return siphash(&combined, offsetofend(typeof(combined), daddr), + &ts_secret); +} +EXPORT_IPV6_MOD(secure_tcpv6_ts_off); - return seq_scale(hash[0]); +u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport) +{ + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .sport = sport, + .dport = dport + }; + u32 hash; + + net_secret_init(); + hash = siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); + return seq_scale(hash); } -EXPORT_SYMBOL(secure_tcpv6_sequence_number); +EXPORT_SYMBOL(secure_tcpv6_seq); -u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, +u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport) { - u32 secret[MD5_MESSAGE_BYTES / 4]; - u32 hash[MD5_DIGEST_WORDS]; - u32 i; - - memcpy(hash, saddr, 16); - for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + (__force u32) daddr[i]; - secret[4] = net_secret[4] + (__force u32)dport; - for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) - secret[i] = net_secret[i]; - - md5_transform(hash, secret); - - return hash[0]; + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + unsigned int timeseed; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD, + .dport = dport, + }; + net_secret_init(); + return siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); } EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET -__u32 secure_ip_id(__be32 daddr) -{ - u32 hash[MD5_DIGEST_WORDS]; - - hash[0] = (__force __u32) daddr; - hash[1] = net_secret[13]; - hash[2] = net_secret[14]; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - return hash[0]; -} - -__u32 secure_ipv6_id(const __be32 daddr[4]) +u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr) { - __u32 hash[4]; - - memcpy(hash, daddr, 16); - md5_transform(hash, net_secret); + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) + return 0; - return hash[0]; + ts_secret_init(); + return siphash_2u32((__force u32)saddr, (__force u32)daddr, + &ts_secret); } -__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) +/* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d), + * but fortunately, `sport' cannot be 0 in any circumstances. If this changes, + * it would be easy enough to have the former function use siphash_4u32, passing + * the arguments as separate u32. + */ +u32 secure_tcp_seq(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) { - u32 hash[MD5_DIGEST_WORDS]; - - hash[0] = (__force u32)saddr; - hash[1] = (__force u32)daddr; - hash[2] = ((__force u16)sport << 16) + (__force u16)dport; - hash[3] = net_secret[15]; + u32 hash; - md5_transform(hash, net_secret); - - return seq_scale(hash[0]); + net_secret_init(); + hash = siphash_3u32((__force u32)saddr, (__force u32)daddr, + (__force u32)sport << 16 | (__force u32)dport, + &net_secret); + return seq_scale(hash); } +EXPORT_SYMBOL_GPL(secure_tcp_seq); -u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) +u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { - u32 hash[MD5_DIGEST_WORDS]; - - hash[0] = (__force u32)saddr; - hash[1] = (__force u32)daddr; - hash[2] = (__force u32)dport ^ net_secret[14]; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - return hash[0]; + net_secret_init(); + return siphash_4u32((__force u32)saddr, (__force u32)daddr, + (__force u16)dport, + jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD, + &net_secret); } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif - -#if IS_ENABLED(CONFIG_IP_DCCP) -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) -{ - u32 hash[MD5_DIGEST_WORDS]; - u64 seq; - - hash[0] = (__force u32)saddr; - hash[1] = (__force u32)daddr; - hash[2] = ((__force u16)sport << 16) + (__force u16)dport; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - seq = hash[0] | (((u64)hash[1]) << 32); - seq += ktime_to_ns(ktime_get_real()); - seq &= (1ull << 48) - 1; - - return seq; -} -EXPORT_SYMBOL(secure_dccp_sequence_number); - -#if IS_ENABLED(CONFIG_IPV6) -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport) -{ - u32 secret[MD5_MESSAGE_BYTES / 4]; - u32 hash[MD5_DIGEST_WORDS]; - u64 seq; - u32 i; - - memcpy(hash, saddr, 16); - for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + daddr[i]; - secret[4] = net_secret[4] + - (((__force u16)sport << 16) + (__force u16)dport); - for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) - secret[i] = net_secret[i]; - - md5_transform(hash, secret); - - seq = hash[0] | (((u64)hash[1]) << 32); - seq += ktime_to_ns(ktime_get_real()); - seq &= (1ull << 48) - 1; - - return seq; -} -EXPORT_SYMBOL(secure_dccpv6_sequence_number); -#endif -#endif diff --git a/net/core/selftests.c b/net/core/selftests.c new file mode 100644 index 000000000000..8b81feb82c4a --- /dev/null +++ b/net/core/selftests.c @@ -0,0 +1,448 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Synopsys, Inc. and/or its affiliates. + * stmmac Selftests Support + * + * Author: Jose Abreu <joabreu@synopsys.com> + * + * Ported from stmmac by: + * Copyright (C) 2021 Oleksij Rempel <o.rempel@pengutronix.de> + */ + +#include <linux/phy.h> +#include <net/selftests.h> +#include <net/tcp.h> +#include <net/udp.h> + +static u8 net_test_next_id; + +struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id, + struct net_packet_attrs *attr) +{ + struct sk_buff *skb = NULL; + struct udphdr *uhdr = NULL; + struct tcphdr *thdr = NULL; + struct netsfhdr *shdr; + struct ethhdr *ehdr; + struct iphdr *ihdr; + int iplen, size; + + size = attr->size + NET_TEST_PKT_SIZE; + + if (attr->tcp) + size += sizeof(struct tcphdr); + else + size += sizeof(struct udphdr); + + if (attr->max_size && attr->max_size > size) + size = attr->max_size; + + skb = netdev_alloc_skb(ndev, size); + if (!skb) + return NULL; + + prefetchw(skb->data); + + ehdr = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + + skb_set_network_header(skb, skb->len); + ihdr = skb_put(skb, sizeof(*ihdr)); + + skb_set_transport_header(skb, skb->len); + if (attr->tcp) + thdr = skb_put(skb, sizeof(*thdr)); + else + uhdr = skb_put(skb, sizeof(*uhdr)); + + eth_zero_addr(ehdr->h_dest); + + if (attr->src) + ether_addr_copy(ehdr->h_source, attr->src); + if (attr->dst) + ether_addr_copy(ehdr->h_dest, attr->dst); + + ehdr->h_proto = htons(ETH_P_IP); + + if (attr->tcp) { + memset(thdr, 0, sizeof(*thdr)); + thdr->source = htons(attr->sport); + thdr->dest = htons(attr->dport); + thdr->doff = sizeof(struct tcphdr) / 4; + } else { + uhdr->source = htons(attr->sport); + uhdr->dest = htons(attr->dport); + uhdr->len = htons(sizeof(*shdr) + sizeof(*uhdr) + attr->size); + if (attr->max_size) + uhdr->len = htons(attr->max_size - + (sizeof(*ihdr) + sizeof(*ehdr))); + uhdr->check = 0; + } + + ihdr->ihl = 5; + ihdr->ttl = 32; + ihdr->version = 4; + if (attr->tcp) + ihdr->protocol = IPPROTO_TCP; + else + ihdr->protocol = IPPROTO_UDP; + iplen = sizeof(*ihdr) + sizeof(*shdr) + attr->size; + if (attr->tcp) + iplen += sizeof(*thdr); + else + iplen += sizeof(*uhdr); + + if (attr->max_size) + iplen = attr->max_size - sizeof(*ehdr); + + ihdr->tot_len = htons(iplen); + ihdr->frag_off = 0; + ihdr->saddr = htonl(attr->ip_src); + ihdr->daddr = htonl(attr->ip_dst); + ihdr->tos = 0; + ihdr->id = 0; + ip_send_check(ihdr); + + shdr = skb_put(skb, sizeof(*shdr)); + shdr->version = 0; + shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC); + attr->id = id; + shdr->id = id; + + if (attr->size) { + void *payload = skb_put(skb, attr->size); + + memset(payload, 0, attr->size); + } + + if (attr->max_size && attr->max_size > skb->len) { + size_t pad_len = attr->max_size - skb->len; + void *pad = skb_put(skb, pad_len); + + memset(pad, 0, pad_len); + } + + skb->csum = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + if (attr->tcp) { + int l4len = skb->len - skb_transport_offset(skb); + + thdr->check = ~tcp_v4_check(l4len, ihdr->saddr, ihdr->daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + + if (attr->bad_csum) { + /* Force mangled checksum */ + if (skb_checksum_help(skb)) { + kfree_skb(skb); + return NULL; + } + + if (thdr->check != CSUM_MANGLED_0) + thdr->check = CSUM_MANGLED_0; + else + thdr->check = csum16_sub(thdr->check, + cpu_to_be16(1)); + } + } else { + udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr); + } + + skb->protocol = htons(ETH_P_IP); + skb->pkt_type = PACKET_HOST; + skb->dev = ndev; + + return skb; +} +EXPORT_SYMBOL_GPL(net_test_get_skb); + +static int net_test_loopback_validate(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt, + struct net_device *orig_ndev) +{ + struct net_test_priv *tpriv = pt->af_packet_priv; + const unsigned char *src = tpriv->packet->src; + const unsigned char *dst = tpriv->packet->dst; + struct netsfhdr *shdr; + struct ethhdr *ehdr; + struct udphdr *uhdr; + struct tcphdr *thdr; + struct iphdr *ihdr; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; + + if (skb_linearize(skb)) + goto out; + if (skb_headlen(skb) < (NET_TEST_PKT_SIZE - ETH_HLEN)) + goto out; + + ehdr = (struct ethhdr *)skb_mac_header(skb); + if (dst) { + if (!ether_addr_equal_unaligned(ehdr->h_dest, dst)) + goto out; + } + + if (src) { + if (!ether_addr_equal_unaligned(ehdr->h_source, src)) + goto out; + } + + ihdr = ip_hdr(skb); + if (tpriv->double_vlan) + ihdr = (struct iphdr *)(skb_network_header(skb) + 4); + + if (tpriv->packet->tcp) { + if (ihdr->protocol != IPPROTO_TCP) + goto out; + + thdr = (struct tcphdr *)((u8 *)ihdr + 4 * ihdr->ihl); + if (thdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct netsfhdr *)((u8 *)thdr + sizeof(*thdr)); + } else { + if (ihdr->protocol != IPPROTO_UDP) + goto out; + + uhdr = (struct udphdr *)((u8 *)ihdr + 4 * ihdr->ihl); + if (uhdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct netsfhdr *)((u8 *)uhdr + sizeof(*uhdr)); + } + + if (shdr->magic != cpu_to_be64(NET_TEST_PKT_MAGIC)) + goto out; + if (tpriv->packet->id != shdr->id) + goto out; + + if (tpriv->packet->bad_csum && skb->ip_summed == CHECKSUM_UNNECESSARY) + tpriv->ok = -EIO; + else + tpriv->ok = true; + + complete(&tpriv->comp); +out: + kfree_skb(skb); + return 0; +} + +static int __net_test_loopback(struct net_device *ndev, + struct net_packet_attrs *attr) +{ + struct net_test_priv *tpriv; + struct sk_buff *skb = NULL; + int ret = 0; + + tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL); + if (!tpriv) + return -ENOMEM; + + tpriv->ok = false; + init_completion(&tpriv->comp); + + tpriv->pt.type = htons(ETH_P_IP); + tpriv->pt.func = net_test_loopback_validate; + tpriv->pt.dev = ndev; + tpriv->pt.af_packet_priv = tpriv; + tpriv->packet = attr; + dev_add_pack(&tpriv->pt); + + skb = net_test_get_skb(ndev, net_test_next_id, attr); + if (!skb) { + ret = -ENOMEM; + goto cleanup; + } + + net_test_next_id++; + ret = dev_direct_xmit(skb, attr->queue_mapping); + if (ret < 0) { + goto cleanup; + } else if (ret > 0) { + ret = -ENETUNREACH; + goto cleanup; + } + + if (!attr->timeout) + attr->timeout = NET_LB_TIMEOUT; + + wait_for_completion_timeout(&tpriv->comp, attr->timeout); + if (tpriv->ok < 0) + ret = tpriv->ok; + else if (!tpriv->ok) + ret = -ETIMEDOUT; + else + ret = 0; + +cleanup: + dev_remove_pack(&tpriv->pt); + kfree(tpriv); + return ret; +} + +static int net_test_netif_carrier(struct net_device *ndev) +{ + return netif_carrier_ok(ndev) ? 0 : -ENOLINK; +} + +static int net_test_phy_phydev(struct net_device *ndev) +{ + return ndev->phydev ? 0 : -EOPNOTSUPP; +} + +static int net_test_phy_loopback_enable(struct net_device *ndev) +{ + if (!ndev->phydev) + return -EOPNOTSUPP; + + return phy_loopback(ndev->phydev, true, 0); +} + +static int net_test_phy_loopback_disable(struct net_device *ndev) +{ + if (!ndev->phydev) + return -EOPNOTSUPP; + + return phy_loopback(ndev->phydev, false, 0); +} + +static int net_test_phy_loopback_udp(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + return __net_test_loopback(ndev, &attr); +} + +static int net_test_phy_loopback_udp_mtu(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.max_size = ndev->mtu; + return __net_test_loopback(ndev, &attr); +} + +static int net_test_phy_loopback_tcp(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.tcp = true; + return __net_test_loopback(ndev, &attr); +} + +/** + * net_test_phy_loopback_tcp_bad_csum - PHY loopback test with a deliberately + * corrupted TCP checksum + * @ndev: the network device to test + * + * Builds the same minimal Ethernet/IPv4/TCP frame as + * net_test_phy_loopback_tcp(), then flips the least-significant bit of the TCP + * checksum so the resulting value is provably invalid (neither 0 nor 0xFFFF). + * The frame is transmitted through the device’s internal PHY loopback path: + * + * test code -> MAC driver -> MAC HW -> xMII -> PHY -> + * internal PHY loopback -> xMII -> MAC HW -> MAC driver -> test code + * + * Result interpretation + * --------------------- + * 0 The frame is delivered to the stack and the driver reports + * ip_summed as CHECKSUM_NONE or CHECKSUM_COMPLETE - both are + * valid ways to indicate “bad checksum, let the stack verify.” + * -ETIMEDOUT The MAC/PHY silently dropped the frame; hardware checksum + * verification filtered it out before the driver saw it. + * -EIO The driver returned the frame with ip_summed == + * CHECKSUM_UNNECESSARY, falsely claiming a valid checksum and + * indicating a serious RX-path defect. + * + * Return: 0 on success or a negative error code on failure. + */ +static int net_test_phy_loopback_tcp_bad_csum(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.tcp = true; + attr.bad_csum = true; + return __net_test_loopback(ndev, &attr); +} + +static const struct net_test { + char name[ETH_GSTRING_LEN]; + int (*fn)(struct net_device *ndev); +} net_selftests[] = { + { + .name = "Carrier ", + .fn = net_test_netif_carrier, + }, { + .name = "PHY dev is present ", + .fn = net_test_phy_phydev, + }, { + /* This test should be done before all PHY loopback test */ + .name = "PHY internal loopback, enable ", + .fn = net_test_phy_loopback_enable, + }, { + .name = "PHY internal loopback, UDP ", + .fn = net_test_phy_loopback_udp, + }, { + .name = "PHY internal loopback, MTU ", + .fn = net_test_phy_loopback_udp_mtu, + }, { + .name = "PHY internal loopback, TCP ", + .fn = net_test_phy_loopback_tcp, + }, { + .name = "PHY loopback, bad TCP csum ", + .fn = net_test_phy_loopback_tcp_bad_csum, + }, { + /* This test should be done after all PHY loopback test */ + .name = "PHY internal loopback, disable", + .fn = net_test_phy_loopback_disable, + }, +}; + +void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) +{ + int count = net_selftest_get_count(); + int i; + + memset(buf, 0, sizeof(*buf) * count); + net_test_next_id = 0; + + if (etest->flags != ETH_TEST_FL_OFFLINE) { + netdev_err(ndev, "Only offline tests are supported\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + + + for (i = 0; i < count; i++) { + buf[i] = net_selftests[i].fn(ndev); + if (buf[i] && (buf[i] != -EOPNOTSUPP)) + etest->flags |= ETH_TEST_FL_FAILED; + } +} +EXPORT_SYMBOL_GPL(net_selftest); + +int net_selftest_get_count(void) +{ + return ARRAY_SIZE(net_selftests); +} +EXPORT_SYMBOL_GPL(net_selftest_get_count); + +void net_selftest_get_strings(u8 *data) +{ + int i; + + for (i = 0; i < net_selftest_get_count(); i++) + ethtool_sprintf(&data, "%2d. %s", i + 1, + net_selftests[i].name); +} +EXPORT_SYMBOL_GPL(net_selftest_get_strings); + +MODULE_DESCRIPTION("Common library for generic PHY ethtool selftests"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Oleksij Rempel <o.rempel@pengutronix.de>"); diff --git a/net/core/skb_fault_injection.c b/net/core/skb_fault_injection.c new file mode 100644 index 000000000000..4235db6bdfad --- /dev/null +++ b/net/core/skb_fault_injection.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/debugfs.h> +#include <linux/fault-inject.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> + +static struct { + struct fault_attr attr; + char devname[IFNAMSIZ]; + bool filtered; +} skb_realloc = { + .attr = FAULT_ATTR_INITIALIZER, + .filtered = false, +}; + +static bool should_fail_net_realloc_skb(struct sk_buff *skb) +{ + struct net_device *net = skb->dev; + + if (skb_realloc.filtered && + strncmp(net->name, skb_realloc.devname, IFNAMSIZ)) + /* device name filter set, but names do not match */ + return false; + + if (!should_fail(&skb_realloc.attr, 1)) + return false; + + return true; +} +ALLOW_ERROR_INJECTION(should_fail_net_realloc_skb, TRUE); + +void skb_might_realloc(struct sk_buff *skb) +{ + if (!should_fail_net_realloc_skb(skb)) + return; + + pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_might_realloc); + +static int __init fail_skb_realloc_setup(char *str) +{ + return setup_fault_attr(&skb_realloc.attr, str); +} +__setup("fail_skb_realloc=", fail_skb_realloc_setup); + +static void reset_settings(void) +{ + skb_realloc.filtered = false; + memset(&skb_realloc.devname, 0, IFNAMSIZ); +} + +static ssize_t devname_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + ssize_t ret; + + reset_settings(); + ret = simple_write_to_buffer(&skb_realloc.devname, IFNAMSIZ, + ppos, buffer, count); + if (ret < 0) + return ret; + + skb_realloc.devname[IFNAMSIZ - 1] = '\0'; + /* Remove a possible \n at the end of devname */ + strim(skb_realloc.devname); + + if (strnlen(skb_realloc.devname, IFNAMSIZ)) + skb_realloc.filtered = true; + + return count; +} + +static ssize_t devname_read(struct file *file, + char __user *buffer, + size_t size, loff_t *ppos) +{ + if (!skb_realloc.filtered) + return 0; + + return simple_read_from_buffer(buffer, size, ppos, &skb_realloc.devname, + strlen(skb_realloc.devname)); +} + +static const struct file_operations devname_ops = { + .write = devname_write, + .read = devname_read, +}; + +static int __init fail_skb_realloc_debugfs(void) +{ + umode_t mode = S_IFREG | 0600; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_skb_realloc", NULL, + &skb_realloc.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + debugfs_create_file("devname", mode, dir, NULL, &devname_ops); + + return 0; +} + +late_initcall(fail_skb_realloc_debugfs); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 20e02d2605ec..a00808f7be6a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Routines having to do with the 'struct sk_buff' memory handlers. * @@ -25,11 +26,6 @@ * disabled, or you better be *real* sure that the operation is atomic * with respect to whatever list is being frobbed (e.g. via lock_sock() * or via disabling bottom half handlers, etc). - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ /* @@ -41,18 +37,21 @@ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/slab.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> #include <linux/netdevice.h> #ifdef CONFIG_NET_CLS_ACT #include <net/pkt_sched.h> #endif #include <linux/string.h> #include <linux/skbuff.h> +#include <linux/skbuff_ref.h> #include <linux/splice.h> #include <linux/cache.h> #include <linux/rtnetlink.h> @@ -60,49 +59,137 @@ #include <linux/scatterlist.h> #include <linux/errqueue.h> #include <linux/prefetch.h> +#include <linux/bitfield.h> +#include <linux/if_vlan.h> +#include <linux/mpls.h> +#include <linux/kcov.h> +#include <linux/iov_iter.h> +#include <linux/crc32.h> #include <net/protocol.h> #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> +#include <net/gro.h> +#include <net/gso.h> +#include <net/hotdata.h> +#include <net/ip6_checksum.h> #include <net/xfrm.h> - -#include <asm/uaccess.h> +#include <net/mpls.h> +#include <net/mptcp.h> +#include <net/mctp.h> +#include <net/page_pool/helpers.h> +#include <net/psp/types.h> +#include <net/dropreason.h> +#include <net/xdp_sock.h> + +#include <linux/uaccess.h> #include <trace/events/skb.h> #include <linux/highmem.h> +#include <linux/capability.h> +#include <linux/user_namespace.h> +#include <linux/indirect_call_wrapper.h> +#include <linux/textsearch.h> + +#include "dev.h" +#include "devmem.h" +#include "netmem_priv.h" +#include "sock_destructor.h" + +#ifdef CONFIG_SKB_EXTENSIONS +static struct kmem_cache *skbuff_ext_cache __ro_after_init; +#endif -struct kmem_cache *skbuff_head_cache __read_mostly; -static struct kmem_cache *skbuff_fclone_cache __read_mostly; +#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN) +#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \ + GRO_MAX_HEAD_PAD)) -static void sock_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - put_page(buf->page); -} +/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. + * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique + * size, and we can differentiate heads from skb_small_head_cache + * vs system slabs by looking at their size (skb_end_offset()). + */ +#define SKB_SMALL_HEAD_CACHE_SIZE \ + (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \ + (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \ + SKB_SMALL_HEAD_SIZE) + +#define SKB_SMALL_HEAD_HEADROOM \ + SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) -static void sock_pipe_buf_get(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +/* kcm_write_msgs() relies on casting paged frags to bio_vec to use + * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the + * netmem is a page. + */ +static_assert(offsetof(struct bio_vec, bv_page) == + offsetof(skb_frag_t, netmem)); +static_assert(sizeof_field(struct bio_vec, bv_page) == + sizeof_field(skb_frag_t, netmem)); + +static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len)); +static_assert(sizeof_field(struct bio_vec, bv_len) == + sizeof_field(skb_frag_t, len)); + +static_assert(offsetof(struct bio_vec, bv_offset) == + offsetof(skb_frag_t, offset)); +static_assert(sizeof_field(struct bio_vec, bv_offset) == + sizeof_field(skb_frag_t, offset)); + +#undef FN +#define FN(reason) [SKB_DROP_REASON_##reason] = #reason, +static const char * const drop_reasons[] = { + [SKB_CONSUMED] = "CONSUMED", + DEFINE_DROP_REASON(FN, FN) +}; + +static const struct drop_reason_list drop_reasons_core = { + .reasons = drop_reasons, + .n_reasons = ARRAY_SIZE(drop_reasons), +}; + +const struct drop_reason_list __rcu * +drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = { + [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core), +}; +EXPORT_SYMBOL(drop_reasons_by_subsys); + +/** + * drop_reasons_register_subsys - register another drop reason subsystem + * @subsys: the subsystem to register, must not be the core + * @list: the list of drop reasons within the subsystem, must point to + * a statically initialized list + */ +void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys, + const struct drop_reason_list *list) { - get_page(buf->page); + if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || + subsys >= ARRAY_SIZE(drop_reasons_by_subsys), + "invalid subsystem %d\n", subsys)) + return; + + /* must point to statically allocated memory, so INIT is OK */ + RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list); } +EXPORT_SYMBOL_GPL(drop_reasons_register_subsys); -static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +/** + * drop_reasons_unregister_subsys - unregister a drop reason subsystem + * @subsys: the subsystem to remove, must not be the core + * + * Note: This will synchronize_rcu() to ensure no users when it returns. + */ +void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys) { - return 1; -} + if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || + subsys >= ARRAY_SIZE(drop_reasons_by_subsys), + "invalid subsystem %d\n", subsys)) + return; + RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL); -/* Pipe buffer operations for a socket. */ -static const struct pipe_buf_operations sock_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = sock_pipe_buf_release, - .steal = sock_pipe_buf_steal, - .get = sock_pipe_buf_get, -}; + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys); /** * skb_panic - private function for out-of-line support @@ -119,7 +206,7 @@ static const struct pipe_buf_operations sock_pipe_buf_ops = { static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, const char msg[]) { - pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", + pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", msg, addr, skb->len, sz, skb->head, skb->data, (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? skb->dev->name : "<NULL>"); @@ -136,6 +223,349 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) skb_panic(skb, sz, addr, __func__); } +#define NAPI_SKB_CACHE_SIZE 128 +#define NAPI_SKB_CACHE_BULK 32 +#define NAPI_SKB_CACHE_FREE 32 + +struct napi_alloc_cache { + local_lock_t bh_lock; + struct page_frag_cache page; + unsigned int skb_count; + void *skb_cache[NAPI_SKB_CACHE_SIZE]; +}; + +static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; + +void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + void *data; + + fragsz = SKB_DATA_ALIGN(fragsz); + + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + data = __page_frag_alloc_align(&nc->page, fragsz, + GFP_ATOMIC | __GFP_NOWARN, align_mask); + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + return data; + +} +EXPORT_SYMBOL(__napi_alloc_frag_align); + +void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) +{ + void *data; + + if (in_hardirq() || irqs_disabled()) { + struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); + + fragsz = SKB_DATA_ALIGN(fragsz); + data = __page_frag_alloc_align(nc, fragsz, + GFP_ATOMIC | __GFP_NOWARN, + align_mask); + } else { + local_bh_disable(); + data = __napi_alloc_frag_align(fragsz, align_mask); + local_bh_enable(); + } + return data; +} +EXPORT_SYMBOL(__netdev_alloc_frag_align); + +/* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler + * remove dead code (and skbuff_cache_size) when CONFIG_KASAN is unset. + */ +static u32 skbuff_cache_size __read_mostly; + +static struct sk_buff *napi_skb_cache_get(bool alloc) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct sk_buff *skb; + + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + if (unlikely(!nc->skb_count)) { + if (alloc) + nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN, + NAPI_SKB_CACHE_BULK, + nc->skb_cache); + if (unlikely(!nc->skb_count)) { + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + return NULL; + } + } + + skb = nc->skb_cache[--nc->skb_count]; + if (nc->skb_count) + prefetch(nc->skb_cache[nc->skb_count - 1]); + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + kasan_mempool_unpoison_object(skb, skbuff_cache_size); + + return skb; +} + +/** + * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache + * @skbs: pointer to an at least @n-sized array to fill with skb pointers + * @n: number of entries to provide + * + * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes + * the pointers into the provided array @skbs. If there are less entries + * available, tries to replenish the cache and bulk-allocates the diff from + * the MM layer if needed. + * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are + * ready for {,__}build_skb_around() and don't have any data buffers attached. + * Must be called *only* from the BH context. + * + * Return: number of successfully allocated skbs (@n if no actual allocation + * needed or kmem_cache_alloc_bulk() didn't fail). + */ +u32 napi_skb_cache_get_bulk(void **skbs, u32 n) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + u32 bulk, total = n; + + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + + if (nc->skb_count >= n) + goto get; + + /* No enough cached skbs. Try refilling the cache first */ + bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK); + nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN, bulk, + &nc->skb_cache[nc->skb_count]); + if (likely(nc->skb_count >= n)) + goto get; + + /* Still not enough. Bulk-allocate the missing part directly, zeroed */ + n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN, + n - nc->skb_count, &skbs[nc->skb_count]); + if (likely(nc->skb_count >= n)) + goto get; + + /* kmem_cache didn't allocate the number we need, limit the output */ + total -= n - nc->skb_count; + n = nc->skb_count; + +get: + for (u32 base = nc->skb_count - n, i = 0; i < n; i++) { + skbs[i] = nc->skb_cache[base + i]; + + kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size); + memset(skbs[i], 0, offsetof(struct sk_buff, tail)); + } + + nc->skb_count -= n; + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + + return total; +} +EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk); + +static inline void __finalize_skb_around(struct sk_buff *skb, void *data, + unsigned int size) +{ + struct skb_shared_info *shinfo; + + size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + /* Assumes caller memset cleared SKB */ + skb->truesize = SKB_TRUESIZE(size); + refcount_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb_set_end_offset(skb, size); + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; + skb->alloc_cpu = raw_smp_processor_id(); + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + + skb_set_kcov_handle(skb, kcov_common_handle()); +} + +static inline void *__slab_build_skb(void *data, unsigned int *size) +{ + void *resized; + + /* Must find the allocation size (and grow it to match). */ + *size = ksize(data); + /* krealloc() will immediately return "data" when + * "ksize(data)" is requested: it is the existing upper + * bounds. As a result, GFP_ATOMIC will be ignored. Note + * that this "new" pointer needs to be passed back to the + * caller for use so the __alloc_size hinting will be + * tracked correctly. + */ + resized = krealloc(data, *size, GFP_ATOMIC); + WARN_ON_ONCE(resized != data); + return resized; +} + +/* build_skb() variant which can operate on slab buffers. + * Note that this should be used sparingly as slab buffers + * cannot be combined efficiently by GRO! + */ +struct sk_buff *slab_build_skb(void *data) +{ + struct sk_buff *skb; + unsigned int size; + + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + data = __slab_build_skb(data, &size); + __finalize_skb_around(skb, data, size); + + return skb; +} +EXPORT_SYMBOL(slab_build_skb); + +/* Caller must provide SKB that is memset cleared */ +static void __build_skb_around(struct sk_buff *skb, void *data, + unsigned int frag_size) +{ + unsigned int size = frag_size; + + /* frag_size == 0 is considered deprecated now. Callers + * using slab buffer should use slab_build_skb() instead. + */ + if (WARN_ONCE(size == 0, "Use slab_build_skb() instead")) + data = __slab_build_skb(data, &size); + + __finalize_skb_around(skb, data, size); +} + +/** + * __build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of data (must not be 0) + * + * Allocate a new &sk_buff. Caller provides space holding head and + * skb_shared_info. @data must have been allocated from the page + * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc() + * allocation is deprecated, and callers should use slab_build_skb() + * instead.) + * The return is the new skb buffer. + * On a failure the return is %NULL, and @data is not freed. + * Notes : + * Before IO, driver allocates only data buffer where NIC put incoming frame + * Driver should add room at head (NET_SKB_PAD) and + * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) + * After IO, driver calls build_skb(), to allocate sk_buff and populate it + * before giving packet to stack. + * RX rings only contains data buffers, not full skbs. + */ +struct sk_buff *__build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb; + + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, frag_size); + + return skb; +} + +/* build_skb() is wrapper over __build_skb(), that specifically + * takes care of skb->head and skb->pfmemalloc + */ +struct sk_buff *build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb = __build_skb(data, frag_size); + + if (likely(skb && frag_size)) { + skb->head_frag = 1; + skb_propagate_pfmemalloc(virt_to_head_page(data), skb); + } + return skb; +} +EXPORT_SYMBOL(build_skb); + +/** + * build_skb_around - build a network buffer around provided skb + * @skb: sk_buff provide by caller, must be memset cleared + * @data: data buffer provided by caller + * @frag_size: size of data + */ +struct sk_buff *build_skb_around(struct sk_buff *skb, + void *data, unsigned int frag_size) +{ + if (unlikely(!skb)) + return NULL; + + __build_skb_around(skb, data, frag_size); + + if (frag_size) { + skb->head_frag = 1; + skb_propagate_pfmemalloc(virt_to_head_page(data), skb); + } + return skb; +} +EXPORT_SYMBOL(build_skb_around); + +/** + * __napi_build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of data + * + * Version of __build_skb() that uses NAPI percpu caches to obtain + * skbuff_head instead of inplace allocation. + * + * Returns a new &sk_buff on success, %NULL on allocation failure. + */ +static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb; + + skb = napi_skb_cache_get(true); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, frag_size); + + return skb; +} + +/** + * napi_build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of data + * + * Version of __napi_build_skb() that takes care of skb->head_frag + * and skb->pfmemalloc when the data is a page or page fragment. + * + * Returns a new &sk_buff on success, %NULL on allocation failure. + */ +struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb = __napi_build_skb(data, frag_size); + + if (likely(skb) && frag_size) { + skb->head_frag = 1; + skb_propagate_pfmemalloc(virt_to_head_page(data), skb); + } + + return skb; +} +EXPORT_SYMBOL(napi_build_skb); + /* * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells * the caller if emergency pfmemalloc reserves are being used. If it is and @@ -143,20 +573,39 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) * may be used. Otherwise, the packet data may be discarded until enough * memory is free */ -#define kmalloc_reserve(size, gfp, node, pfmemalloc) \ - __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc) - -static void *__kmalloc_reserve(size_t size, gfp_t flags, int node, - unsigned long ip, bool *pfmemalloc) +static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, + bool *pfmemalloc) { - void *obj; bool ret_pfmemalloc = false; + size_t obj_size; + void *obj; + + obj_size = SKB_HEAD_ALIGN(*size); + if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && + !(flags & KMALLOC_NOT_NORMAL_BITS)) { + obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, + node); + *size = SKB_SMALL_HEAD_CACHE_SIZE; + if (obj || !(gfp_pfmemalloc_allowed(flags))) + goto out; + /* Try again but now we are using pfmemalloc reserves */ + ret_pfmemalloc = true; + obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node); + goto out; + } + + obj_size = kmalloc_size_roundup(obj_size); + /* The following cast might truncate high-order bits of obj_size, this + * is harmless because kmalloc(obj_size >= 2^32) will fail anyway. + */ + *size = (unsigned int)obj_size; /* * Try a regular allocation, when that fails and we're not entitled * to the reserves, fail. */ - obj = kmalloc_node_track_caller(size, + obj = kmalloc_node_track_caller(obj_size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); if (obj || !(gfp_pfmemalloc_allowed(flags))) @@ -164,7 +613,7 @@ static void *__kmalloc_reserve(size_t size, gfp_t flags, int node, /* Try again but now we are using pfmemalloc reserves */ ret_pfmemalloc = true; - obj = kmalloc_node_track_caller(size, flags, node); + obj = kmalloc_node_track_caller(obj_size, flags, node); out: if (pfmemalloc) @@ -179,31 +628,6 @@ out: * */ -struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) -{ - struct sk_buff *skb; - - /* Get the HEAD */ - skb = kmem_cache_alloc_node(skbuff_head_cache, - gfp_mask & ~__GFP_DMA, node); - if (!skb) - goto out; - - /* - * Only clear those fields we need to clear, not those that we will - * actually initialise below. Hence, don't put any more fields after - * the tail pointer in struct sk_buff! - */ - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->head = NULL; - skb->truesize = sizeof(struct sk_buff); - atomic_set(&skb->users, 1); - - skb->mac_header = (typeof(skb->mac_header))~0U; -out: - return skb; -} - /** * __alloc_skb - allocate a network buffer * @size: size to allocate @@ -224,22 +648,38 @@ out: struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int flags, int node) { + struct sk_buff *skb = NULL; struct kmem_cache *cache; - struct skb_shared_info *shinfo; - struct sk_buff *skb; - u8 *data; bool pfmemalloc; - - cache = (flags & SKB_ALLOC_FCLONE) - ? skbuff_fclone_cache : skbuff_head_cache; + u8 *data; if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) gfp_mask |= __GFP_MEMALLOC; - /* Get the HEAD */ - skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); - if (!skb) - goto out; + if (flags & SKB_ALLOC_FCLONE) { + cache = net_hotdata.skbuff_fclone_cache; + goto fallback; + } + cache = net_hotdata.skbuff_cache; + if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id())) + goto fallback; + + if (flags & SKB_ALLOC_NAPI) { + skb = napi_skb_cache_get(true); + if (unlikely(!skb)) + return NULL; + } else if (!in_hardirq() && !irqs_disabled()) { + local_bh_disable(); + skb = napi_skb_cache_get(false); + local_bh_enable(); + } + + if (!skb) { +fallback: + skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); + if (unlikely(!skb)) + return NULL; + } prefetchw(skb); /* We do our best to align skb_shared_info on a separate cache @@ -247,17 +687,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, * aligned memory blocks, unless SLUB/SLAB debug is enabled. * Both skb->head and skb_shared_info are cache line aligned. */ - size = SKB_DATA_ALIGN(size); - size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); - if (!data) + data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); + if (unlikely(!data)) goto nodata; - /* kmalloc(size) might give us more room than requested. + /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ - size = SKB_WITH_OVERHEAD(ksize(data)); - prefetchw(data + size); + prefetchw(data + SKB_WITH_OVERHEAD(size)); /* * Only clear those fields we need to clear, not those that we will @@ -265,215 +702,199 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, * the tail pointer in struct sk_buff! */ memset(skb, 0, offsetof(struct sk_buff, tail)); - /* Account for allocated memory : skb + skb->head */ - skb->truesize = SKB_TRUESIZE(size); + __build_skb_around(skb, data, size); skb->pfmemalloc = pfmemalloc; - atomic_set(&skb->users, 1); - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->mac_header = (typeof(skb->mac_header))~0U; - skb->transport_header = (typeof(skb->transport_header))~0U; - - /* make sure we initialize shinfo sequentially */ - shinfo = skb_shinfo(skb); - memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); - atomic_set(&shinfo->dataref, 1); - kmemcheck_annotate_variable(shinfo->destructor_arg); if (flags & SKB_ALLOC_FCLONE) { - struct sk_buff *child = skb + 1; - atomic_t *fclone_ref = (atomic_t *) (child + 1); + struct sk_buff_fclones *fclones; - kmemcheck_annotate_bitfield(child, flags1); - kmemcheck_annotate_bitfield(child, flags2); - skb->fclone = SKB_FCLONE_ORIG; - atomic_set(fclone_ref, 1); + fclones = container_of(skb, struct sk_buff_fclones, skb1); - child->fclone = SKB_FCLONE_UNAVAILABLE; - child->pfmemalloc = pfmemalloc; + skb->fclone = SKB_FCLONE_ORIG; + refcount_set(&fclones->fclone_ref, 1); } -out: + return skb; + nodata: kmem_cache_free(cache, skb); - skb = NULL; - goto out; + return NULL; } EXPORT_SYMBOL(__alloc_skb); /** - * build_skb - build a network buffer - * @data: data buffer provided by caller - * @frag_size: size of fragment, or 0 if head was kmalloced + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on + * @len: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb * - * Allocate a new &sk_buff. Caller provides space holding head and - * skb_shared_info. @data must have been allocated by kmalloc() - * The return is the new skb buffer. - * On a failure the return is %NULL, and @data is not freed. - * Notes : - * Before IO, driver allocates only data buffer where NIC put incoming frame - * Driver should add room at head (NET_SKB_PAD) and - * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) - * After IO, driver calls build_skb(), to allocate sk_buff and populate it - * before giving packet to stack. - * RX rings only contains data buffers, not full skbs. + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has NET_SKB_PAD headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. */ -struct sk_buff *build_skb(void *data, unsigned int frag_size) +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, + gfp_t gfp_mask) { - struct skb_shared_info *shinfo; + struct page_frag_cache *nc; struct sk_buff *skb; - unsigned int size = frag_size ? : ksize(data); - - skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); - if (!skb) - return NULL; + bool pfmemalloc; + void *data; - size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len += NET_SKB_PAD; - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->truesize = SKB_TRUESIZE(size); - skb->head_frag = frag_size != 0; - atomic_set(&skb->users, 1); - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->mac_header = (typeof(skb->mac_header))~0U; - skb->transport_header = (typeof(skb->transport_header))~0U; + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; + } - /* make sure we initialize shinfo sequentially */ - shinfo = skb_shinfo(skb); - memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); - atomic_set(&shinfo->dataref, 1); - kmemcheck_annotate_variable(shinfo->destructor_arg); + len = SKB_HEAD_ALIGN(len); - return skb; -} -EXPORT_SYMBOL(build_skb); + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; -struct netdev_alloc_cache { - struct page_frag frag; - /* we maintain a pagecount bias, so that we dont dirty cache line - * containing page->_count every time we allocate a fragment. - */ - unsigned int pagecnt_bias; -}; -static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); + if (in_hardirq() || irqs_disabled()) { + nc = this_cpu_ptr(&netdev_alloc_cache); + data = page_frag_alloc(nc, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(nc); + } else { + local_bh_disable(); + local_lock_nested_bh(&napi_alloc_cache.bh_lock); -static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) -{ - struct netdev_alloc_cache *nc; - void *data = NULL; - int order; - unsigned long flags; + nc = this_cpu_ptr(&napi_alloc_cache.page); + data = page_frag_alloc(nc, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(nc); - local_irq_save(flags); - nc = &__get_cpu_var(netdev_alloc_cache); - if (unlikely(!nc->frag.page)) { -refill: - for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) { - gfp_t gfp = gfp_mask; - - if (order) - gfp |= __GFP_COMP | __GFP_NOWARN; - nc->frag.page = alloc_pages(gfp, order); - if (likely(nc->frag.page)) - break; - if (--order < 0) - goto end; - } - nc->frag.size = PAGE_SIZE << order; -recycle: - atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS); - nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; - nc->frag.offset = 0; + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + local_bh_enable(); } - if (nc->frag.offset + fragsz > nc->frag.size) { - /* avoid unnecessary locked operations if possible */ - if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) || - atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count)) - goto recycle; - goto refill; + if (unlikely(!data)) + return NULL; + + skb = __build_skb(data, len); + if (unlikely(!skb)) { + skb_free_frag(data); + return NULL; } - data = page_address(nc->frag.page) + nc->frag.offset; - nc->frag.offset += fragsz; - nc->pagecnt_bias--; -end: - local_irq_restore(flags); - return data; -} + if (pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; -/** - * netdev_alloc_frag - allocate a page fragment - * @fragsz: fragment size - * - * Allocates a frag from a page for receive buffer. - * Uses GFP_ATOMIC allocations. - */ -void *netdev_alloc_frag(unsigned int fragsz) -{ - return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); +skb_success: + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + +skb_fail: + return skb; } -EXPORT_SYMBOL(netdev_alloc_frag); +EXPORT_SYMBOL(__netdev_alloc_skb); /** - * __netdev_alloc_skb - allocate an skbuff for rx on a specific device - * @dev: network device to receive on - * @length: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb + * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * @napi: napi instance this buffer was allocated for + * @len: length to allocate * - * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has unspecified headroom built in. Users should allocate - * the headroom they think they need without accounting for the - * built in space. The built in space is used for optimisations. + * Allocate a new sk_buff for use in NAPI receive. This buffer will + * attempt to allocate the head from a special reserved region used + * only for NAPI Rx allocation. By doing this we can save several + * CPU cycles by avoiding having to disable and re-enable IRQs. * * %NULL is returned if there is no free memory. */ -struct sk_buff *__netdev_alloc_skb(struct net_device *dev, - unsigned int length, gfp_t gfp_mask) +struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) { - struct sk_buff *skb = NULL; - unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; + struct napi_alloc_cache *nc; + struct sk_buff *skb; + bool pfmemalloc; + void *data; - if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { - void *data; + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + len += NET_SKB_PAD + NET_IP_ALIGN; - if (sk_memalloc_socks()) - gfp_mask |= __GFP_MEMALLOC; + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, + NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; + } - data = __netdev_alloc_frag(fragsz, gfp_mask); + len = SKB_HEAD_ALIGN(len); - if (likely(data)) { - skb = build_skb(data, fragsz); - if (unlikely(!skb)) - put_page(virt_to_head_page(data)); - } - } else { - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, - SKB_ALLOC_RX, NUMA_NO_NODE); - } - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD); - skb->dev = dev; + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + nc = this_cpu_ptr(&napi_alloc_cache); + + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + + if (unlikely(!data)) + return NULL; + + skb = __napi_build_skb(data, len); + if (unlikely(!skb)) { + skb_free_frag(data); + return NULL; } + + if (pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; + +skb_success: + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + skb->dev = napi->dev; + +skb_fail: return skb; } -EXPORT_SYMBOL(__netdev_alloc_skb); +EXPORT_SYMBOL(napi_alloc_skb); + +void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, + int off, int size, unsigned int truesize) +{ + DEBUG_NET_WARN_ON_ONCE(size > truesize); + + skb_fill_netmem_desc(skb, i, netmem, off, size); + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; +} +EXPORT_SYMBOL(skb_add_rx_frag_netmem); -void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, - int size, unsigned int truesize) +void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, + unsigned int truesize) { - skb_fill_page_desc(skb, i, page, off, size); + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + DEBUG_NET_WARN_ON_ONCE(size > truesize); + + skb_frag_size_add(frag, size); skb->len += size; skb->data_len += size; skb->truesize += truesize; } -EXPORT_SYMBOL(skb_add_rx_frag); +EXPORT_SYMBOL(skb_coalesce_rx_frag); static void skb_drop_list(struct sk_buff **listp) { @@ -494,42 +915,207 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } -static void skb_free_head(struct sk_buff *skb) +int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, + unsigned int headroom) +{ +#if IS_ENABLED(CONFIG_PAGE_POOL) + u32 size, truesize, len, max_head_size, off; + struct sk_buff *skb = *pskb, *nskb; + int err, i, head_off; + void *data; + + /* XDP does not support fraglist so we need to linearize + * the skb. + */ + if (skb_has_frag_list(skb)) + return -EOPNOTSUPP; + + max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom); + if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE) + return -ENOMEM; + + size = min_t(u32, skb->len, max_head_size); + truesize = SKB_HEAD_ALIGN(size) + headroom; + data = page_pool_dev_alloc_va(pool, &truesize); + if (!data) + return -ENOMEM; + + nskb = napi_build_skb(data, truesize); + if (!nskb) { + page_pool_free_va(pool, data, true); + return -ENOMEM; + } + + skb_reserve(nskb, headroom); + skb_copy_header(nskb, skb); + skb_mark_for_recycle(nskb); + + err = skb_copy_bits(skb, 0, nskb->data, size); + if (err) { + consume_skb(nskb); + return err; + } + skb_put(nskb, size); + + head_off = skb_headroom(nskb) - skb_headroom(skb); + skb_headers_offset_update(nskb, head_off); + + off = size; + len = skb->len - off; + for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { + struct page *page; + u32 page_off; + + size = min_t(u32, len, PAGE_SIZE); + truesize = size; + + page = page_pool_dev_alloc(pool, &page_off, &truesize); + if (!page) { + consume_skb(nskb); + return -ENOMEM; + } + + skb_add_rx_frag(nskb, i, page, page_off, size, truesize); + err = skb_copy_bits(skb, off, page_address(page) + page_off, + size); + if (err) { + consume_skb(nskb); + return err; + } + + len -= size; + off += size; + } + + consume_skb(skb); + *pskb = nskb; + + return 0; +#else + return -EOPNOTSUPP; +#endif +} +EXPORT_SYMBOL(skb_pp_cow_data); + +int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, + const struct bpf_prog *prog) +{ + if (!prog->aux->xdp_has_frags) + return -EINVAL; + + return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM); +} +EXPORT_SYMBOL(skb_cow_data_for_xdp); + +#if IS_ENABLED(CONFIG_PAGE_POOL) +bool napi_pp_put_page(netmem_ref netmem) +{ + netmem = netmem_compound_head(netmem); + + if (unlikely(!netmem_is_pp(netmem))) + return false; + + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); + + return true; +} +EXPORT_SYMBOL(napi_pp_put_page); +#endif + +static bool skb_pp_recycle(struct sk_buff *skb, void *data) +{ + if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) + return false; + return napi_pp_put_page(page_to_netmem(virt_to_page(data))); +} + +/** + * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb + * @skb: page pool aware skb + * + * Increase the fragment reference count (pp_ref_count) of a skb. This is + * intended to gain fragment references only for page pool aware skbs, + * i.e. when skb->pp_recycle is true, and not for fragments in a + * non-pp-recycling skb. It has a fallback to increase references on normal + * pages, as page pool aware skbs may also have normal page fragments. + */ +static int skb_pp_frag_ref(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo; + netmem_ref head_netmem; + int i; + + if (!skb->pp_recycle) + return -EINVAL; + + shinfo = skb_shinfo(skb); + + for (i = 0; i < shinfo->nr_frags; i++) { + head_netmem = netmem_compound_head(shinfo->frags[i].netmem); + if (likely(netmem_is_pp(head_netmem))) + page_pool_ref_netmem(head_netmem); + else + page_ref_inc(netmem_to_page(head_netmem)); + } + return 0; +} + +static void skb_kfree_head(void *head, unsigned int end_offset) { - if (skb->head_frag) - put_page(virt_to_head_page(skb->head)); + if (end_offset == SKB_SMALL_HEAD_HEADROOM) + kmem_cache_free(net_hotdata.skb_small_head_cache, head); else - kfree(skb->head); + kfree(head); } -static void skb_release_data(struct sk_buff *skb) +static void skb_free_head(struct sk_buff *skb) { - if (!skb->cloned || - !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, - &skb_shinfo(skb)->dataref)) { - if (skb_shinfo(skb)->nr_frags) { - int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - skb_frag_unref(skb, i); - } + unsigned char *head = skb->head; - /* - * If skb buf is from userspace, we need to notify the caller - * the lower device DMA has done; - */ - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; + if (skb->head_frag) { + if (skb_pp_recycle(skb, head)) + return; + skb_free_frag(head); + } else { + skb_kfree_head(head, skb_end_offset(skb)); + } +} - uarg = skb_shinfo(skb)->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, true); - } +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + int i; - if (skb_has_frag_list(skb)) - skb_drop_fraglist(skb); + if (!skb_data_unref(skb, shinfo)) + goto exit; - skb_free_head(skb); + if (skb_zcopy(skb)) { + bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; + + skb_zcopy_clear(skb, true); + if (skip_unref) + goto free_head; } + + for (i = 0; i < shinfo->nr_frags; i++) + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); + +free_head: + if (shinfo->frag_list) + kfree_skb_list_reason(shinfo->frag_list, reason); + + skb_free_head(skb); +exit: + /* When we clone an SKB we copy the reycling bit. The pp_recycle + * bit is only set on the head though, so in order to avoid races + * while trying to recycle fragments on __skb_frag_unref() we need + * to make one SKB responsible for triggering the recycle path. + * So disable the recycling bit if an SKB is cloned and we have + * additional references to the fragmented part of the SKB. + * Eventually the last SKB will have the recycling bit set and it's + * dataref set to 0, which will trigger the recycling + */ + skb->pp_recycle = 0; } /* @@ -537,69 +1123,63 @@ static void skb_release_data(struct sk_buff *skb) */ static void kfree_skbmem(struct sk_buff *skb) { - struct sk_buff *other; - atomic_t *fclone_ref; + struct sk_buff_fclones *fclones; switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: - kmem_cache_free(skbuff_head_cache, skb); - break; + kmem_cache_free(net_hotdata.skbuff_cache, skb); + return; case SKB_FCLONE_ORIG: - fclone_ref = (atomic_t *) (skb + 2); - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, skb); - break; - - case SKB_FCLONE_CLONE: - fclone_ref = (atomic_t *) (skb + 1); - other = skb - 1; + fclones = container_of(skb, struct sk_buff_fclones, skb1); - /* The clone portion is available for - * fast-cloning again. + /* We usually free the clone (TX completion) before original skb + * This test would have no chance to be true for the clone, + * while here, branch prediction will be good. */ - skb->fclone = SKB_FCLONE_UNAVAILABLE; + if (refcount_read(&fclones->fclone_ref) == 1) + goto fastpath; + break; - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, other); + default: /* SKB_FCLONE_CLONE */ + fclones = container_of(skb, struct sk_buff_fclones, skb2); break; } + if (!refcount_dec_and_test(&fclones->fclone_ref)) + return; +fastpath: + kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones); } -static void skb_release_head_state(struct sk_buff *skb) +void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); -#ifdef CONFIG_XFRM - secpath_put(skb->sp); -#endif if (skb->destructor) { - WARN_ON(in_irq()); - skb->destructor(skb); - } -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_conntrack_put(skb->nfct); -#endif -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED - nf_conntrack_put_reasm(skb->nfct_reasm); -#endif -#ifdef CONFIG_BRIDGE_NETFILTER - nf_bridge_put(skb->nf_bridge); -#endif -/* XXX: IS this still necessary? - JHS */ -#ifdef CONFIG_NET_SCHED - skb->tc_index = 0; -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; -#endif + DEBUG_NET_WARN_ON_ONCE(in_hardirq()); +#ifdef CONFIG_INET + INDIRECT_CALL_4(skb->destructor, + tcp_wfree, __sock_wfree, sock_wfree, + xsk_destruct_skb, + skb); +#else + INDIRECT_CALL_2(skb->destructor, + sock_wfree, xsk_destruct_skb, + skb); + #endif + skb->destructor = NULL; + skb->sk = NULL; + } + nf_reset_ct(skb); + skb_ext_reset(skb); } /* Free everything but the sk_buff shell. */ -static void skb_release_all(struct sk_buff *skb) +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_head_state(skb); if (likely(skb->head)) - skb_release_data(skb); + skb_release_data(skb, reason); } /** @@ -613,41 +1193,206 @@ static void skb_release_all(struct sk_buff *skb) void __kfree_skb(struct sk_buff *skb) { - skb_release_all(skb); + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); +static __always_inline +bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason reason) +{ + if (unlikely(!skb_unref(skb))) + return false; + + DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET || + u32_get_bits(reason, + SKB_DROP_REASON_SUBSYS_MASK) >= + SKB_DROP_REASON_SUBSYS_NUM); + + if (reason == SKB_CONSUMED) + trace_consume_skb(skb, __builtin_return_address(0)); + else + trace_kfree_skb(skb, __builtin_return_address(0), reason, sk); + return true; +} + /** - * kfree_skb - free an sk_buff + * sk_skb_reason_drop - free an sk_buff with special reason + * @sk: the socket to receive @skb, or NULL if not applicable * @skb: buffer to free + * @reason: reason why this skb is dropped * - * Drop a reference to the buffer and free it if the usage count has - * hit zero. + * Drop a reference to the buffer and free it if the usage count has hit + * zero. Meanwhile, pass the receiving socket and drop reason to + * 'kfree_skb' tracepoint. */ -void kfree_skb(struct sk_buff *skb) +void __fix_address +sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - if (unlikely(!skb)) - return; - if (likely(atomic_read(&skb->users) == 1)) - smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) + if (__sk_skb_reason_drop(sk, skb, reason)) + __kfree_skb(skb); +} +EXPORT_SYMBOL(sk_skb_reason_drop); + +#define KFREE_SKB_BULK_SIZE 16 + +struct skb_free_array { + unsigned int skb_count; + void *skb_array[KFREE_SKB_BULK_SIZE]; +}; + +static void kfree_skb_add_bulk(struct sk_buff *skb, + struct skb_free_array *sa, + enum skb_drop_reason reason) +{ + /* if SKB is a clone, don't handle this case */ + if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { + __kfree_skb(skb); return; - trace_kfree_skb(skb, __builtin_return_address(0)); - __kfree_skb(skb); + } + + skb_release_all(skb, reason); + sa->skb_array[sa->skb_count++] = skb; + + if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { + kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE, + sa->skb_array); + sa->skb_count = 0; + } } -EXPORT_SYMBOL(kfree_skb); -void kfree_skb_list(struct sk_buff *segs) +void __fix_address +kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) { + struct skb_free_array sa; + + sa.skb_count = 0; + while (segs) { struct sk_buff *next = segs->next; - kfree_skb(segs); + if (__sk_skb_reason_drop(NULL, segs, reason)) { + skb_poison_list(segs); + kfree_skb_add_bulk(segs, &sa, reason); + } + segs = next; } + + if (sa.skb_count) + kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array); +} +EXPORT_SYMBOL(kfree_skb_list_reason); + +/* Dump skb information and contents. + * + * Must only be called from net_ratelimit()-ed paths. + * + * Dumps whole packets if full_pkt, only headers otherwise. + */ +void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) +{ + struct skb_shared_info *sh = skb_shinfo(skb); + struct net_device *dev = skb->dev; + struct sock *sk = skb->sk; + struct sk_buff *list_skb; + bool has_mac, has_trans; + int headroom, tailroom; + int i, len, seg_len; + + if (full_pkt) + len = skb->len; + else + len = min_t(int, skb->len, MAX_HEADER + 128); + + headroom = skb_headroom(skb); + tailroom = skb_tailroom(skb); + + has_mac = skb_mac_header_was_set(skb); + has_trans = skb_transport_header_was_set(skb); + + printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" + "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n" + "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" + "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n" + "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" + "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n" + "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n", + level, skb->len, headroom, skb_headlen(skb), tailroom, + has_mac ? skb->mac_header : -1, + has_mac ? skb_mac_header_len(skb) : -1, + skb->mac_len, + skb->network_header, + has_trans ? skb_network_header_len(skb) : -1, + has_trans ? skb->transport_header : -1, + sh->tx_flags, sh->nr_frags, + sh->gso_size, sh->gso_type, sh->gso_segs, + skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level, + skb->hash, skb->sw_hash, skb->l4_hash, + ntohs(skb->protocol), skb->pkt_type, skb->skb_iif, + skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all, + skb->encapsulation, skb->inner_protocol, skb->inner_mac_header, + skb->inner_network_header, skb->inner_transport_header); + + if (dev) + printk("%sdev name=%s feat=%pNF\n", + level, dev->name, &dev->features); + if (sk) + printk("%ssk family=%hu type=%u proto=%u\n", + level, sk->sk_family, sk->sk_type, sk->sk_protocol); + + if (full_pkt && headroom) + print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, + 16, 1, skb->head, headroom, false); + + seg_len = min_t(int, skb_headlen(skb), len); + if (seg_len) + print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, + 16, 1, skb->data, seg_len, false); + len -= seg_len; + + if (full_pkt && tailroom) + print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, + 16, 1, skb_tail_pointer(skb), tailroom, false); + + for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + if (skb_frag_is_net_iov(frag)) { + printk("%sskb frag %d: not readable\n", level, i); + len -= skb_frag_size(frag); + if (!len) + break; + continue; + } + + skb_frag_foreach_page(frag, skb_frag_off(frag), + skb_frag_size(frag), p, p_off, p_len, + copied) { + seg_len = min_t(int, p_len, len); + vaddr = kmap_atomic(p); + print_hex_dump(level, "skb frag: ", + DUMP_PREFIX_OFFSET, + 16, 1, vaddr + p_off, seg_len, false); + kunmap_atomic(vaddr); + len -= seg_len; + if (!len) + break; + } + } + + if (full_pkt && skb_has_frag_list(skb)) { + printk("skb fraglist:\n"); + skb_walk_frags(skb, list_skb) + skb_dump(level, list_skb, true); + } } -EXPORT_SYMBOL(kfree_skb_list); +EXPORT_SYMBOL(skb_dump); /** * skb_tx_error - report an sk_buff xmit error @@ -658,17 +1403,14 @@ EXPORT_SYMBOL(kfree_skb_list); */ void skb_tx_error(struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; - - uarg = skb_shinfo(skb)->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, false); - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; + if (skb) { + skb_zcopy_downgrade_managed(skb); + skb_zcopy_clear(skb, true); } } EXPORT_SYMBOL(skb_tx_error); +#ifdef CONFIG_TRACEPOINTS /** * consume_skb - free an skbuff * @skb: buffer to free @@ -679,69 +1421,154 @@ EXPORT_SYMBOL(skb_tx_error); */ void consume_skb(struct sk_buff *skb) { - if (unlikely(!skb)) + if (!skb_unref(skb)) return; - if (likely(atomic_read(&skb->users) == 1)) - smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) - return; - trace_consume_skb(skb); + + trace_consume_skb(skb, __builtin_return_address(0)); __kfree_skb(skb); } EXPORT_SYMBOL(consume_skb); +#endif + +/** + * __consume_stateless_skb - free an skbuff, assuming it is stateless + * @skb: buffer to free + * + * Alike consume_skb(), but this variant assumes that this is the last + * skb reference and all the head states have been already dropped + */ +void __consume_stateless_skb(struct sk_buff *skb) +{ + trace_consume_skb(skb, __builtin_return_address(0)); + skb_release_data(skb, SKB_CONSUMED); + kfree_skbmem(skb); +} + +static void napi_skb_cache_put(struct sk_buff *skb) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + if (!kasan_mempool_poison_object(skb)) + return; + + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + nc->skb_cache[nc->skb_count++] = skb; + + if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { + u32 i, remaining = NAPI_SKB_CACHE_SIZE - NAPI_SKB_CACHE_FREE; + + for (i = remaining; i < NAPI_SKB_CACHE_SIZE; i++) + kasan_mempool_unpoison_object(nc->skb_cache[i], + skbuff_cache_size); + + kmem_cache_free_bulk(net_hotdata.skbuff_cache, + NAPI_SKB_CACHE_FREE, + nc->skb_cache + remaining); + nc->skb_count = remaining; + } + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); +} + +void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) +{ + skb_release_all(skb, reason); + napi_skb_cache_put(skb); +} + +void napi_skb_free_stolen_head(struct sk_buff *skb) +{ + if (unlikely(skb->slow_gro)) { + nf_reset_ct(skb); + skb_dst_drop(skb); + skb_ext_put(skb); + skb_orphan(skb); + skb->slow_gro = 0; + } + napi_skb_cache_put(skb); +} + +void napi_consume_skb(struct sk_buff *skb, int budget) +{ + /* Zero budget indicate non-NAPI context called us, like netpoll */ + if (unlikely(!budget || !skb)) { + dev_consume_skb_any(skb); + return; + } + + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + + if (skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) { + skb_release_head_state(skb); + return skb_attempt_defer_free(skb); + } + + if (!skb_unref(skb)) + return; + + /* if reaching here SKB is ready to free */ + trace_consume_skb(skb, __builtin_return_address(0)); + + /* if SKB is a clone, don't handle this case */ + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { + __kfree_skb(skb); + return; + } + + skb_release_all(skb, SKB_CONSUMED); + napi_skb_cache_put(skb); +} +EXPORT_SYMBOL(napi_consume_skb); + +/* Make sure a field is contained by headers group */ +#define CHECK_SKB_FIELD(field) \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) != \ + offsetof(struct sk_buff, headers.field)); \ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { new->tstamp = old->tstamp; + /* We do not copy old->sk */ new->dev = old->dev; - new->transport_header = old->transport_header; - new->network_header = old->network_header; - new->mac_header = old->mac_header; - new->inner_protocol = old->inner_protocol; - new->inner_transport_header = old->inner_transport_header; - new->inner_network_header = old->inner_network_header; - new->inner_mac_header = old->inner_mac_header; + memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); - new->rxhash = old->rxhash; - new->ooo_okay = old->ooo_okay; - new->l4_rxhash = old->l4_rxhash; - new->no_fcs = old->no_fcs; - new->encapsulation = old->encapsulation; -#ifdef CONFIG_XFRM - new->sp = secpath_get(old->sp); + __skb_ext_copy(new, old); + __nf_copy(new, old, false); + + /* Note : this field could be in the headers group. + * It is not yet because we do not want to have a 16 bit hole + */ + new->queue_mapping = old->queue_mapping; + + memcpy(&new->headers, &old->headers, sizeof(new->headers)); + CHECK_SKB_FIELD(protocol); + CHECK_SKB_FIELD(csum); + CHECK_SKB_FIELD(hash); + CHECK_SKB_FIELD(priority); + CHECK_SKB_FIELD(skb_iif); + CHECK_SKB_FIELD(vlan_proto); + CHECK_SKB_FIELD(vlan_tci); + CHECK_SKB_FIELD(transport_header); + CHECK_SKB_FIELD(network_header); + CHECK_SKB_FIELD(mac_header); + CHECK_SKB_FIELD(inner_protocol); + CHECK_SKB_FIELD(inner_transport_header); + CHECK_SKB_FIELD(inner_network_header); + CHECK_SKB_FIELD(inner_mac_header); + CHECK_SKB_FIELD(mark); +#ifdef CONFIG_NETWORK_SECMARK + CHECK_SKB_FIELD(secmark); #endif - memcpy(new->cb, old->cb, sizeof(old->cb)); - new->csum = old->csum; - new->local_df = old->local_df; - new->pkt_type = old->pkt_type; - new->ip_summed = old->ip_summed; - skb_copy_queue_mapping(new, old); - new->priority = old->priority; -#if IS_ENABLED(CONFIG_IP_VS) - new->ipvs_property = old->ipvs_property; +#ifdef CONFIG_NET_RX_BUSY_POLL + CHECK_SKB_FIELD(napi_id); #endif - new->pfmemalloc = old->pfmemalloc; - new->protocol = old->protocol; - new->mark = old->mark; - new->skb_iif = old->skb_iif; - __nf_copy(new, old); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) - new->nf_trace = old->nf_trace; + CHECK_SKB_FIELD(alloc_cpu); +#ifdef CONFIG_XPS + CHECK_SKB_FIELD(sender_cpu); #endif #ifdef CONFIG_NET_SCHED - new->tc_index = old->tc_index; -#ifdef CONFIG_NET_CLS_ACT - new->tc_verd = old->tc_verd; -#endif + CHECK_SKB_FIELD(tc_index); #endif - new->vlan_proto = old->vlan_proto; - new->vlan_tci = old->vlan_tci; - skb_copy_secmark(new, old); - -#ifdef CONFIG_NET_LL_RX_POLL - new->napi_id = old->napi_id; -#endif } /* @@ -762,6 +1589,9 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; + n->peeked = 0; + C(pfmemalloc); + C(pp_recycle); n->destructor = NULL; C(tail); C(end); @@ -769,7 +1599,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) C(head_frag); C(data); C(truesize); - atomic_set(&n->users, 1); + refcount_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); skb->cloned = 1; @@ -779,6 +1609,31 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) } /** + * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg + * @first: first sk_buff of the msg + */ +struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) +{ + struct sk_buff *n; + + n = alloc_skb(0, GFP_ATOMIC); + if (!n) + return NULL; + + n->len = first->len; + n->data_len = first->len; + n->truesize = first->truesize; + + skb_shinfo(n)->frag_list = first; + + __copy_skb_header(n, first); + n->destructor = NULL; + + return n; +} +EXPORT_SYMBOL_GPL(alloc_skb_for_msg); + +/** * skb_morph - morph one skb into another * @dst: the skb to receive the contents * @src: the skb to supply the contents @@ -790,17 +1645,317 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) */ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { - skb_release_all(dst); + skb_release_all(dst, SKB_CONSUMED); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); +int mm_account_pinned_pages(struct mmpin *mmp, size_t size) +{ + unsigned long max_pg, num_pg, new_pg, old_pg, rlim; + struct user_struct *user; + + if (capable(CAP_IPC_LOCK) || !size) + return 0; + + rlim = rlimit(RLIMIT_MEMLOCK); + if (rlim == RLIM_INFINITY) + return 0; + + num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ + max_pg = rlim >> PAGE_SHIFT; + user = mmp->user ? : current_user(); + + old_pg = atomic_long_read(&user->locked_vm); + do { + new_pg = old_pg + num_pg; + if (new_pg > max_pg) + return -ENOBUFS; + } while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg)); + + if (!mmp->user) { + mmp->user = get_uid(user); + mmp->num_pg = num_pg; + } else { + mmp->num_pg += num_pg; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mm_account_pinned_pages); + +void mm_unaccount_pinned_pages(struct mmpin *mmp) +{ + if (mmp->user) { + atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); + free_uid(mmp->user); + } +} +EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); + +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size, + bool devmem) +{ + struct ubuf_info_msgzc *uarg; + struct sk_buff *skb; + + WARN_ON_ONCE(!in_task()); + + skb = sock_omalloc(sk, 0, GFP_KERNEL); + if (!skb) + return NULL; + + BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); + uarg = (void *)skb->cb; + uarg->mmp.user = NULL; + + if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) { + kfree_skb(skb); + return NULL; + } + + uarg->ubuf.ops = &msg_zerocopy_ubuf_ops; + uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; + uarg->len = 1; + uarg->bytelen = size; + uarg->zerocopy = 1; + uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + refcount_set(&uarg->ubuf.refcnt, 1); + sock_hold(sk); + + return &uarg->ubuf; +} + +static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) +{ + return container_of((void *)uarg, struct sk_buff, cb); +} + +struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg, bool devmem) +{ + if (uarg) { + struct ubuf_info_msgzc *uarg_zc; + const u32 byte_limit = 1 << 19; /* limit to a few TSO */ + u32 bytelen, next; + + /* there might be non MSG_ZEROCOPY users */ + if (uarg->ops != &msg_zerocopy_ubuf_ops) + return NULL; + + /* realloc only when socket is locked (TCP, UDP cork), + * so uarg->len and sk_zckey access is serialized + */ + if (!sock_owned_by_user(sk)) { + WARN_ON_ONCE(1); + return NULL; + } + + uarg_zc = uarg_to_msgzc(uarg); + bytelen = uarg_zc->bytelen + size; + if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) { + /* TCP can create new skb to attach new uarg */ + if (sk->sk_type == SOCK_STREAM) + goto new_alloc; + return NULL; + } + + next = (u32)atomic_read(&sk->sk_zckey); + if ((u32)(uarg_zc->id + uarg_zc->len) == next) { + if (likely(!devmem) && + mm_account_pinned_pages(&uarg_zc->mmp, size)) + return NULL; + uarg_zc->len++; + uarg_zc->bytelen = bytelen; + atomic_set(&sk->sk_zckey, ++next); + + /* no extra ref when appending to datagram (MSG_MORE) */ + if (sk->sk_type == SOCK_STREAM) + net_zcopy_get(uarg); + + return uarg; + } + } + +new_alloc: + return msg_zerocopy_alloc(sk, size, devmem); +} +EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); + +static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + u32 old_lo, old_hi; + u64 sum_len; + + old_lo = serr->ee.ee_info; + old_hi = serr->ee.ee_data; + sum_len = old_hi - old_lo + 1ULL + len; + + if (sum_len >= (1ULL << 32)) + return false; + + if (lo != old_hi + 1) + return false; + + serr->ee.ee_data += len; + return true; +} + +static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg) +{ + struct sk_buff *tail, *skb = skb_from_uarg(uarg); + struct sock_exterr_skb *serr; + struct sock *sk = skb->sk; + struct sk_buff_head *q; + unsigned long flags; + bool is_zerocopy; + u32 lo, hi; + u16 len; + + mm_unaccount_pinned_pages(&uarg->mmp); + + /* if !len, there was only 1 call, and it was aborted + * so do not queue a completion notification + */ + if (!uarg->len || sock_flag(sk, SOCK_DEAD)) + goto release; + + len = uarg->len; + lo = uarg->id; + hi = uarg->id + len - 1; + is_zerocopy = uarg->zerocopy; + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = 0; + serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; + serr->ee.ee_data = hi; + serr->ee.ee_info = lo; + if (!is_zerocopy) + serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; + + q = &sk->sk_error_queue; + spin_lock_irqsave(&q->lock, flags); + tail = skb_peek_tail(q); + if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || + !skb_zerocopy_notify_extend(tail, lo, len)) { + __skb_queue_tail(q, skb); + skb = NULL; + } + spin_unlock_irqrestore(&q->lock, flags); + + sk_error_report(sk); + +release: + consume_skb(skb); + sock_put(sk); +} + +static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) +{ + struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); + + uarg_zc->zerocopy = uarg_zc->zerocopy & success; + + if (refcount_dec_and_test(&uarg->refcnt)) + __msg_zerocopy_callback(uarg_zc); +} + +void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) +{ + struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk; + + atomic_dec(&sk->sk_zckey); + uarg_to_msgzc(uarg)->len--; + + if (have_uref) + msg_zerocopy_complete(NULL, uarg, true); +} +EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); + +const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { + .complete = msg_zerocopy_complete, +}; +EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); + +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, + struct msghdr *msg, int len, + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding) +{ + int err, orig_len = skb->len; + + if (uarg->ops->link_skb) { + err = uarg->ops->link_skb(skb, uarg); + if (err) + return err; + } else { + struct ubuf_info *orig_uarg = skb_zcopy(skb); + + /* An skb can only point to one uarg. This edge case happens + * when TCP appends to an skb, but zerocopy_realloc triggered + * a new alloc. + */ + if (orig_uarg && uarg != orig_uarg) + return -EEXIST; + } + + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len, + binding); + if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { + struct sock *save_sk = skb->sk; + + /* Streams do not free skb on error. Reset to prev state. */ + iov_iter_revert(&msg->msg_iter, skb->len - orig_len); + skb->sk = sk; + ___pskb_trim(skb, orig_len); + skb->sk = save_sk; + return err; + } + + skb_zcopy_set(skb, uarg, NULL); + return skb->len - orig_len; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); + +void __skb_zcopy_downgrade_managed(struct sk_buff *skb) +{ + int i; + + skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_ref(skb, i); +} +EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); + +static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, + gfp_t gfp_mask) +{ + if (skb_zcopy(orig)) { + if (skb_zcopy(nskb)) { + /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ + if (!gfp_mask) { + WARN_ON_ONCE(1); + return -ENOMEM; + } + if (skb_uarg(nskb) == skb_uarg(orig)) + return 0; + if (skb_copy_ubufs(nskb, GFP_ATOMIC)) + return -EIO; + } + skb_zcopy_set(nskb, skb_uarg(orig), NULL); + } + return 0; +} + /** * skb_copy_ubufs - copy userspace skb frags buffers to kernel * @skb: the skb to modify * @gfp_mask: allocation priority * - * This must be called on SKBTX_DEV_ZEROCOPY skb. + * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. * It will copy all frags into kernel and drop the reference * to userspace pages. * @@ -812,16 +1967,31 @@ EXPORT_SYMBOL_GPL(skb_morph); */ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { - int i; int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; - struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; + int i, order, psize, new_frags; + u32 d_off; - for (i = 0; i < num_frags; i++) { - u8 *vaddr; - skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) + return -EINVAL; + + if (!skb_frags_readable(skb)) + return -EFAULT; + + if (!num_frags) + goto release; - page = alloc_page(gfp_mask); + /* We might have to allocate high order pages, so compute what minimum + * page order is needed. + */ + order = 0; + while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb)) + order++; + psize = (PAGE_SIZE << order); + + new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order); + for (i = 0; i < new_frags; i++) { + page = alloc_pages(gfp_mask | __GFP_COMP, order); if (!page) { while (head) { struct page *next = (struct page *)page_private(head); @@ -830,28 +2000,53 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) } return -ENOMEM; } - vaddr = kmap_atomic(skb_frag_page(f)); - memcpy(page_address(page), - vaddr + f->page_offset, skb_frag_size(f)); - kunmap_atomic(vaddr); set_page_private(page, (unsigned long)head); head = page; } + page = head; + d_off = 0; + for (i = 0; i < num_frags; i++) { + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), + p, p_off, p_len, copied) { + u32 copy, done = 0; + vaddr = kmap_atomic(p); + + while (done < p_len) { + if (d_off == psize) { + d_off = 0; + page = (struct page *)page_private(page); + } + copy = min_t(u32, psize - d_off, p_len - done); + memcpy(page_address(page) + d_off, + vaddr + p_off + done, copy); + done += copy; + d_off += copy; + } + kunmap_atomic(vaddr); + } + } + /* skb frags release userspace buffers */ for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); - uarg->callback(uarg, false); - /* skb frags point to kernel buffers */ - for (i = num_frags - 1; i >= 0; i--) { - __skb_fill_page_desc(skb, i, head, 0, - skb_shinfo(skb)->frags[i].size); + for (i = 0; i < new_frags - 1; i++) { + __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize); head = (struct page *)page_private(head); } + __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0, + d_off); + skb_shinfo(skb)->nr_frags = new_frags; - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; +release: + skb_zcopy_clear(skb, false); return 0; } EXPORT_SYMBOL_GPL(skb_copy_ubufs); @@ -872,27 +2067,27 @@ EXPORT_SYMBOL_GPL(skb_copy_ubufs); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { + struct sk_buff_fclones *fclones = container_of(skb, + struct sk_buff_fclones, + skb1); struct sk_buff *n; if (skb_orphan_frags(skb, gfp_mask)) return NULL; - n = skb + 1; if (skb->fclone == SKB_FCLONE_ORIG && - n->fclone == SKB_FCLONE_UNAVAILABLE) { - atomic_t *fclone_ref = (atomic_t *) (n + 1); + refcount_read(&fclones->fclone_ref) == 1) { + n = &fclones->skb2; + refcount_set(&fclones->fclone_ref, 2); n->fclone = SKB_FCLONE_CLONE; - atomic_inc(fclone_ref); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask); if (!n) return NULL; - kmemcheck_annotate_bitfield(n, flags1); - kmemcheck_annotate_bitfield(n, flags2); n->fclone = SKB_FCLONE_UNAVAILABLE; } @@ -900,8 +2095,11 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) } EXPORT_SYMBOL(skb_clone); -static void skb_headers_offset_update(struct sk_buff *skb, int off) +void skb_headers_offset_update(struct sk_buff *skb, int off) { + /* Only adjust this if it actually is csum_start rather than csum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start += off; /* {transport,network,mac}_header and tail are relative to skb->head */ skb->transport_header += off; skb->network_header += off; @@ -911,8 +2109,9 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) skb->inner_network_header += off; skb->inner_mac_header += off; } +EXPORT_SYMBOL(skb_headers_offset_update); -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) { __copy_skb_header(new, old); @@ -920,6 +2119,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } +EXPORT_SYMBOL(skb_copy_header); static inline int skb_alloc_rx_flag(const struct sk_buff *skb) { @@ -947,11 +2147,20 @@ static inline int skb_alloc_rx_flag(const struct sk_buff *skb) struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { - int headerlen = skb_headroom(skb); - unsigned int size = skb_end_offset(skb) + skb->data_len; - struct sk_buff *n = __alloc_skb(size, gfp_mask, - skb_alloc_rx_flag(skb), NUMA_NO_NODE); + struct sk_buff *n; + unsigned int size; + int headerlen; + + if (!skb_frags_readable(skb)) + return NULL; + + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) + return NULL; + headerlen = skb_headroom(skb); + size = skb_end_offset(skb) + skb->data_len; + n = __alloc_skb(size, gfp_mask, + skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; @@ -960,19 +2169,21 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) /* Set the tail pointer and length */ skb_put(n, skb->len); - if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) - BUG(); + BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); - copy_skb_header(n, skb); + skb_copy_header(n, skb); return n; } EXPORT_SYMBOL(skb_copy); /** - * __pskb_copy - create copy of an sk_buff with private head. + * __pskb_copy_fclone - create copy of an sk_buff with private head. * @skb: buffer to copy * @headroom: headroom of new skb * @gfp_mask: allocation priority + * @fclone: if true allocate the copy of the skb from the fclone + * cache instead of the head cache; it is recommended to set this + * to true for the cases where the copy will likely be cloned * * Make a copy of both an &sk_buff and part of its data, located * in header. Fragmented data remain shared. This is used when @@ -982,11 +2193,12 @@ EXPORT_SYMBOL(skb_copy); * The returned buffer has a reference count of 1. */ -struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) +struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, + gfp_t gfp_mask, bool fclone) { unsigned int size = skb_headlen(skb) + headroom; - struct sk_buff *n = __alloc_skb(size, gfp_mask, - skb_alloc_rx_flag(skb), NUMA_NO_NODE); + int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); + struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); if (!n) goto out; @@ -1005,7 +2217,8 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) if (skb_shinfo(skb)->nr_frags) { int i; - if (skb_orphan_frags(skb, gfp_mask)) { + if (skb_orphan_frags(skb, gfp_mask) || + skb_zerocopy_clone(n, skb, gfp_mask)) { kfree_skb(n); n = NULL; goto out; @@ -1022,11 +2235,11 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) skb_clone_fraglist(n); } - copy_skb_header(n, skb); + skb_copy_header(n, skb); out: return n; } -EXPORT_SYMBOL(__pskb_copy); +EXPORT_SYMBOL(__pskb_copy_fclone); /** * pskb_expand_head - reallocate header of &sk_buff @@ -1035,37 +2248,41 @@ EXPORT_SYMBOL(__pskb_copy); * @ntail: room to add at tail * @gfp_mask: allocation priority * - * Expands (or creates identical copy, if &nhead and &ntail are zero) - * header of skb. &sk_buff itself is not changed. &sk_buff MUST have + * Expands (or creates identical copy, if @nhead and @ntail are zero) + * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have * reference count of 1. Returns zero in the case of success or error, * if expansion failed. In the last case, &sk_buff is not changed. * * All the pointers pointing into skb header may change and must be * reloaded after call to this function. + * + * Note: If you skb_push() the start of the buffer after reallocating the + * header, call skb_postpush_data_move() first to move the metadata out of + * the way before writing to &sk_buff->data. */ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask) { - int i; - u8 *data; - int size = nhead + skb_end_offset(skb) + ntail; + unsigned int osize = skb_end_offset(skb); + unsigned int size = osize + nhead + ntail; long off; + u8 *data; + int i; BUG_ON(nhead < 0); - if (skb_shared(skb)) - BUG(); + BUG_ON(skb_shared(skb)); - size = SKB_DATA_ALIGN(size); + skb_zcopy_downgrade_managed(skb); if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), - gfp_mask, NUMA_NO_NODE, NULL); + + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) goto nodata; - size = SKB_WITH_OVERHEAD(ksize(data)); + size = SKB_WITH_OVERHEAD(size); /* Copy only real data... and, alas, header. This should be * optimized for the cases when header is void. @@ -1082,16 +2299,17 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, * be since all we did is relocate the values */ if (skb_cloned(skb)) { - /* copy this zero copy skb frags */ if (skb_orphan_frags(skb, gfp_mask)) goto nofrags; + if (skb_zcopy(skb)) + refcount_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb); + skb_release_data(skb, SKB_CONSUMED); } else { skb_free_head(skb); } @@ -1100,25 +2318,29 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->head = data; skb->head_frag = 0; skb->data += off; + + skb_set_end_offset(skb, size); #ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; off = nhead; -#else - skb->end = skb->head + size; #endif skb->tail += off; skb_headers_offset_update(skb, nhead); - /* Only adjust this if it actually is csum_start rather than csum */ - if (skb->ip_summed == CHECKSUM_PARTIAL) - skb->csum_start += nhead; skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + + /* It is not generally safe to change skb->truesize. + * For the moment, we really care of rx path, or + * when skb is orphaned (not attached to a socket). + */ + if (!skb->sk || skb->destructor == sock_edemux) + skb->truesize += size - osize; + return 0; nofrags: - kfree(data); + skb_kfree_head(data, size); nodata: return -ENOMEM; } @@ -1145,6 +2367,104 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) } EXPORT_SYMBOL(skb_realloc_headroom); +/* Note: We plan to rework this in linux-6.4 */ +int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + unsigned int saved_end_offset, saved_truesize; + struct skb_shared_info *shinfo; + int res; + + saved_end_offset = skb_end_offset(skb); + saved_truesize = skb->truesize; + + res = pskb_expand_head(skb, 0, 0, pri); + if (res) + return res; + + skb->truesize = saved_truesize; + + if (likely(skb_end_offset(skb) == saved_end_offset)) + return 0; + + /* We can not change skb->end if the original or new value + * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head(). + */ + if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM || + skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) { + /* We think this path should not be taken. + * Add a temporary trace to warn us just in case. + */ + pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n", + saved_end_offset, skb_end_offset(skb)); + WARN_ON_ONCE(1); + return 0; + } + + shinfo = skb_shinfo(skb); + + /* We are about to change back skb->end, + * we need to move skb_shinfo() to its new location. + */ + memmove(skb->head + saved_end_offset, + shinfo, + offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); + + skb_set_end_offset(skb, saved_end_offset); + + return 0; +} + +/** + * skb_expand_head - reallocate header of &sk_buff + * @skb: buffer to reallocate + * @headroom: needed headroom + * + * Unlike skb_realloc_headroom, this one does not allocate a new skb + * if possible; copies skb->sk to new skb as needed + * and frees original skb in case of failures. + * + * It expect increased headroom and generates warning otherwise. + */ + +struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) +{ + int delta = headroom - skb_headroom(skb); + int osize = skb_end_offset(skb); + struct sock *sk = skb->sk; + + if (WARN_ONCE(delta <= 0, + "%s is expecting an increase in the headroom", __func__)) + return skb; + + delta = SKB_DATA_ALIGN(delta); + /* pskb_expand_head() might crash, if skb is shared. */ + if (skb_shared(skb) || !is_skb_wmem(skb)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (unlikely(!nskb)) + goto fail; + + if (sk) + skb_set_owner_w(nskb, sk); + consume_skb(skb); + skb = nskb; + } + if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) + goto fail; + + if (sk && is_skb_wmem(skb)) { + delta = skb_end_offset(skb) - osize; + refcount_add(delta, &sk->sk_wmem_alloc); + skb->truesize += delta; + } + return skb; + +fail: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(skb_expand_head); + /** * skb_copy_expand - copy and expand sk_buff * @skb: buffer to copy @@ -1170,13 +2490,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, /* * Allocate the copy buffer */ - struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, - gfp_mask, skb_alloc_rx_flag(skb), - NUMA_NO_NODE); - int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; - int off; + struct sk_buff *n; + int oldheadroom; + + if (!skb_frags_readable(skb)) + return NULL; + + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) + return NULL; + oldheadroom = skb_headroom(skb); + n = __alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); if (!n) return NULL; @@ -1193,35 +2520,32 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, head_copy_off = newheadroom - head_copy_len; /* Copy the linear header and data. */ - if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, - skb->len + head_copy_len)) - BUG(); - - copy_skb_header(n, skb); + BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, + skb->len + head_copy_len)); - off = newheadroom - oldheadroom; - if (n->ip_summed == CHECKSUM_PARTIAL) - n->csum_start += off; + skb_copy_header(n, skb); - skb_headers_offset_update(n, off); + skb_headers_offset_update(n, newheadroom - oldheadroom); return n; } EXPORT_SYMBOL(skb_copy_expand); /** - * skb_pad - zero pad the tail of an skb + * __skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad + * @free_on_error: free buffer on error * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * - * May return error in out of memory cases. The skb is freed on error. + * May return error in out of memory cases. The skb is freed on error + * if @free_on_error is true. */ -int skb_pad(struct sk_buff *skb, int pad) +int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) { int err; int ntail; @@ -1250,10 +2574,34 @@ int skb_pad(struct sk_buff *skb, int pad) return 0; free_skb: - kfree_skb(skb); + if (free_on_error) + kfree_skb(skb); return err; } -EXPORT_SYMBOL(skb_pad); +EXPORT_SYMBOL(__skb_pad); + +/** + * pskb_put - add data to the tail of a potentially fragmented buffer + * @skb: start of the buffer to use + * @tail: tail fragment of the buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the potentially + * fragmented buffer. @tail must be the last fragment of @skb -- or + * @skb itself. If this would exceed the total buffer size the kernel + * will panic. A pointer to the first byte of the extra data is + * returned. + */ + +void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +{ + if (tail != skb) { + skb->data_len += len; + skb->len += len; + } + return skb_put(tail, len); +} +EXPORT_SYMBOL_GPL(pskb_put); /** * skb_put - add data to a buffer @@ -1264,9 +2612,9 @@ EXPORT_SYMBOL(skb_pad); * exceed the total buffer size the kernel will panic. A pointer to the * first byte of the extra data is returned. */ -unsigned char *skb_put(struct sk_buff *skb, unsigned int len) +void *skb_put(struct sk_buff *skb, unsigned int len) { - unsigned char *tmp = skb_tail_pointer(skb); + void *tmp = skb_tail_pointer(skb); SKB_LINEAR_ASSERT(skb); skb->tail += len; skb->len += len; @@ -1285,11 +2633,11 @@ EXPORT_SYMBOL(skb_put); * start. If this would exceed the total buffer headroom the kernel will * panic. A pointer to the first byte of the extra data is returned. */ -unsigned char *skb_push(struct sk_buff *skb, unsigned int len) +void *skb_push(struct sk_buff *skb, unsigned int len) { skb->data -= len; skb->len += len; - if (unlikely(skb->data<skb->head)) + if (unlikely(skb->data < skb->head)) skb_under_panic(skb, len, __builtin_return_address(0)); return skb->data; } @@ -1305,13 +2653,37 @@ EXPORT_SYMBOL(skb_push); * is returned. Once the data has been pulled future pushes will overwrite * the old data. */ -unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) +void *skb_pull(struct sk_buff *skb, unsigned int len) { return skb_pull_inline(skb, len); } EXPORT_SYMBOL(skb_pull); /** + * skb_pull_data - remove data from the start of a buffer returning its + * original position. + * @skb: buffer to use + * @len: amount of data to remove + * + * This function removes data from the start of a buffer, returning + * the memory to the headroom. A pointer to the original data in the buffer + * is returned after checking if there is enough data to pull. Once the + * data has been pulled future pushes will overwrite the old data. + */ +void *skb_pull_data(struct sk_buff *skb, size_t len) +{ + void *data = skb->data; + + if (skb->len < len) + return NULL; + + skb_pull(skb, len); + + return data; +} +EXPORT_SYMBOL(skb_pull_data); + +/** * skb_trim - remove end from a buffer * @skb: buffer to alter * @len: new length @@ -1409,10 +2781,33 @@ done: skb_set_tail_pointer(skb, len); } + if (!skb->sk || skb->destructor == sock_edemux) + skb_condense(skb); return 0; } EXPORT_SYMBOL(___pskb_trim); +/* Note : use pskb_trim_rcsum() instead of calling this directly + */ +int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + int delta = skb->len - len; + + skb->csum = csum_block_sub(skb->csum, + skb_checksum(skb, len, delta, 0), + len); + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; + int offset = skb_checksum_start_offset(skb) + skb->csum_offset; + + if (offset + sizeof(__sum16) > hdlen) + return -EINVAL; + } + return __pskb_trim(skb, len); +} +EXPORT_SYMBOL(pskb_trim_rcsum_slow); + /** * __pskb_pull_tail - advance tail of skb header * @skb: buffer to reallocate @@ -1438,7 +2833,7 @@ EXPORT_SYMBOL(___pskb_trim); * * It is pretty complicated. Luckily, it is called only in exceptional cases. */ -unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) +void *__pskb_pull_tail(struct sk_buff *skb, int delta) { /* If skb has not enough free space at tail, get new one * plus 128 bytes for future expansions. If we have enough @@ -1446,14 +2841,17 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) */ int i, k, eat = (skb->tail + delta) - skb->end; + if (!skb_frags_readable(skb)) + return NULL; + if (eat > 0 || skb_cloned(skb)) { if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, GFP_ATOMIC)) return NULL; } - if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)) - BUG(); + BUG_ON(skb_copy_bits(skb, skb_headlen(skb), + skb_tail_pointer(skb), delta)); /* Optimization: no fragments, no reasons to preestimate * size of pulled pages. Superb. @@ -1472,7 +2870,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) } /* If we need update frag list, we are in troubles. - * Certainly, it possible to add an offset to skb data, + * Certainly, it is possible to add an offset to skb data, * but taking into account that pulling is expected to * be very rare operation, it is worth to fight against * further bloating skb head and crucify ourselves here instead. @@ -1484,8 +2882,6 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) struct sk_buff *insp = NULL; do { - BUG_ON(!list); - if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; @@ -1493,6 +2889,9 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) insp = list; } else { /* Eaten partially. */ + if (skb_is_gso(skb) && !list->head_frag && + skb_headlen(list)) + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; if (skb_shared(list)) { /* Sucks! We need to fork list. :-( */ @@ -1517,7 +2916,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Free pulled out fragments. */ while ((list = skb_shinfo(skb)->frag_list) != insp) { skb_shinfo(skb)->frag_list = list->next; - kfree_skb(list); + consume_skb(list); } /* And insert new clone at head. */ if (clone) { @@ -1537,10 +2936,14 @@ pull_pages: skb_frag_unref(skb, i); eat -= size; } else { - skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; + + *frag = skb_shinfo(skb)->frags[i]; if (eat) { - skb_shinfo(skb)->frags[k].page_offset += eat; - skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); + skb_frag_off_add(frag, eat); + skb_frag_size_sub(frag, eat); + if (!i) + goto end; eat = 0; } k++; @@ -1548,9 +2951,13 @@ pull_pages: } skb_shinfo(skb)->nr_frags = k; +end: skb->tail += delta; skb->data_len -= delta; + if (!skb->data_len) + skb_zcopy_clear(skb, false); + return skb_tail_pointer(skb); } EXPORT_SYMBOL(__pskb_pull_tail); @@ -1590,6 +2997,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) to += copy; } + if (!skb_frags_readable(skb)) + goto fault; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *f = &skb_shinfo(skb)->frags[i]; @@ -1598,16 +3008,20 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) end = start + skb_frag_size(f); if ((copy = end - offset) > 0) { + u32 p_off, p_len, copied; + struct page *p; u8 *vaddr; if (copy > len) copy = len; - vaddr = kmap_atomic(skb_frag_page(f)); - memcpy(to, - vaddr + f->page_offset + offset - start, - copy); - kunmap_atomic(vaddr); + skb_frag_foreach_page(f, + skb_frag_off(f) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + memcpy(to + copied, vaddr + p_off, p_len); + kunmap_atomic(vaddr); + } if ((len -= copy) == 0) return 0; @@ -1685,10 +3099,8 @@ static bool spd_can_coalesce(const struct splice_pipe_desc *spd, /* * Fill page/offset/length into spd, if it can hold more pages. */ -static bool spd_fill_page(struct splice_pipe_desc *spd, - struct pipe_inode_info *pipe, struct page *page, - unsigned int *len, unsigned int offset, - bool linear, +static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page, + unsigned int *len, unsigned int offset, bool linear, struct sock *sk) { if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) @@ -1716,8 +3128,7 @@ static bool __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct splice_pipe_desc *spd, bool linear, - struct sock *sk, - struct pipe_inode_info *pipe) + struct sock *sk) { if (!*len) return true; @@ -1736,13 +3147,14 @@ static bool __splice_segment(struct page *page, unsigned int poff, do { unsigned int flen = min(*len, plen); - if (spd_fill_page(spd, pipe, page, &flen, poff, - linear, sk)) + if (spd_fill_page(spd, page, &flen, poff, linear, sk)) return true; poff += flen; plen -= flen; *len -= flen; - } while (*len && plen); + if (!*len) + return true; + } while (plen); return false; } @@ -1755,6 +3167,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, unsigned int *offset, unsigned int *len, struct splice_pipe_desc *spd, struct sock *sk) { + struct sk_buff *iter; int seg; /* map the linear part : @@ -1767,18 +3180,37 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, skb_headlen(skb), offset, len, spd, skb_head_is_locked(skb), - sk, pipe)) + sk)) return true; /* * then map the fragments */ + if (!skb_frags_readable(skb)) + return false; + for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; + if (WARN_ON_ONCE(!skb_frag_page(f))) + return false; + if (__splice_segment(skb_frag_page(f), - f->page_offset, skb_frag_size(f), - offset, len, spd, false, sk, pipe)) + skb_frag_off(f), skb_frag_size(f), + offset, len, spd, false, sk)) + return true; + } + + skb_walk_frags(skb, iter) { + if (*offset >= iter->len) { + *offset -= iter->len; + continue; + } + /* __skb_splice_bits() only fails if the output has no room + * left, so no point in going over the frag_list for the error + * case. + */ + if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) return true; } @@ -1787,11 +3219,9 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, /* * Map data from the skb to a pipe. Should handle both the linear part, - * the fragments, and the frag list. It does NOT handle frag lists within - * the frag list, if such a thing exists. We'd probably need to recurse to - * handle that cleanly. + * the fragments, and the frag list. */ -int skb_splice_bits(struct sk_buff *skb, unsigned int offset, +int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, unsigned int flags) { @@ -1801,50 +3231,167 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, .pages = pages, .partial = partial, .nr_pages_max = MAX_SKB_FRAGS, - .flags = flags, - .ops = &sock_pipe_buf_ops, + .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; - struct sk_buff *frag_iter; - struct sock *sk = skb->sk; int ret = 0; - /* - * __skb_splice_bits() only fails if the output has no room left, - * so no point in going over the frag_list for the error case. - */ - if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) - goto done; - else if (!tlen) - goto done; + __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); - /* - * now see if we have a frag_list to map - */ - skb_walk_frags(skb, frag_iter) { - if (!tlen) - break; - if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) + if (spd.nr_pages) + ret = splice_to_pipe(pipe, &spd); + + return ret; +} +EXPORT_SYMBOL_GPL(skb_splice_bits); + +static int sendmsg_locked(struct sock *sk, struct msghdr *msg) +{ + struct socket *sock = sk->sk_socket; + size_t size = msg_data_left(msg); + + if (!sock) + return -EINVAL; + + if (!sock->ops->sendmsg_locked) + return sock_no_sendmsg_locked(sk, msg, size); + + return sock->ops->sendmsg_locked(sk, msg, size); +} + +static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) +{ + struct socket *sock = sk->sk_socket; + + if (!sock) + return -EINVAL; + return sock_sendmsg(sock, msg); +} + +typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); +static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, + int len, sendmsg_func sendmsg, int flags) +{ + int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0; + unsigned int orig_len = len; + struct sk_buff *head = skb; + unsigned short fragidx; + int slen, ret; + +do_frag_list: + + /* Deal with head data */ + while (offset < skb_headlen(skb) && len) { + struct kvec kv; + struct msghdr msg; + + slen = min_t(int, len, skb_headlen(skb) - offset); + kv.iov_base = skb->data + offset; + kv.iov_len = slen; + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | flags; + if (slen < len) + msg.msg_flags |= more_hint; + + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); + ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, + sendmsg_unlocked, sk, &msg); + if (ret <= 0) + goto error; + + offset += ret; + len -= ret; + } + + /* All the data was skb head? */ + if (!len) + goto out; + + /* Make offset relative to start of frags */ + offset -= skb_headlen(skb); + + /* Find where we are in frag list */ + for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; + + if (offset < skb_frag_size(frag)) break; + + offset -= skb_frag_size(frag); } -done: - if (spd.nr_pages) { - /* - * Drop the socket lock, otherwise we have reverse - * locking dependencies between sk_lock and i_mutex - * here as compared to sendfile(). We enter here - * with the socket lock held, and splice_to_pipe() will - * grab the pipe inode lock. For sendfile() emulation, - * we call into ->sendpage() with the i_mutex lock held - * and networking will grab the socket lock. - */ - release_sock(sk); - ret = splice_to_pipe(pipe, &spd); - lock_sock(sk); + for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; + + slen = min_t(size_t, len, skb_frag_size(frag) - offset); + + while (slen) { + struct bio_vec bvec; + struct msghdr msg = { + .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | + flags, + }; + + if (slen < len) + msg.msg_flags |= more_hint; + bvec_set_page(&bvec, skb_frag_page(frag), slen, + skb_frag_off(frag) + offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, + slen); + + ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, + sendmsg_unlocked, sk, &msg); + if (ret <= 0) + goto error; + + len -= ret; + offset += ret; + slen -= ret; + } + + offset = 0; } - return ret; + if (len) { + /* Process any frag lists */ + + if (skb == head) { + if (skb_has_frag_list(skb)) { + skb = skb_shinfo(skb)->frag_list; + goto do_frag_list; + } + } else if (skb->next) { + skb = skb->next; + goto do_frag_list; + } + } + +out: + return orig_len - len; + +error: + return orig_len == len ? ret : orig_len - len; +} + +/* Send skb data on a socket. Socket must be locked. */ +int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, + int len) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0); +} +EXPORT_SYMBOL_GPL(skb_send_sock_locked); + +int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, + int offset, int len, int flags) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags); +} +EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags); + +/* Send skb data on a socket. Socket must be unlocked. */ +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0); } /** @@ -1878,6 +3425,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) from += copy; } + if (!skb_frags_readable(skb)) + goto fault; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; int end; @@ -1886,15 +3436,20 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { + u32 p_off, p_len, copied; + struct page *p; u8 *vaddr; if (copy > len) copy = len; - vaddr = kmap_atomic(skb_frag_page(frag)); - memcpy(vaddr + frag->page_offset + offset - start, - from, copy); - kunmap_atomic(vaddr); + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + memcpy(vaddr + p_off, from + copied, p_len); + kunmap_atomic(vaddr); + } if ((len -= copy) == 0) return 0; @@ -1932,9 +3487,7 @@ fault: EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. */ - -__wsum skb_checksum(const struct sk_buff *skb, int offset, - int len, __wsum csum) +__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -1952,6 +3505,9 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, pos = copy; } + if (WARN_ON_ONCE(!skb_frags_readable(skb))) + return 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; @@ -1960,20 +3516,27 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { + u32 p_off, p_len, copied; + struct page *p; __wsum csum2; u8 *vaddr; if (copy > len) copy = len; - vaddr = kmap_atomic(skb_frag_page(frag)); - csum2 = csum_partial(vaddr + frag->page_offset + - offset - start, copy, 0); - kunmap_atomic(vaddr); - csum = csum_block_add(csum, csum2, pos); + + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + csum2 = csum_partial(vaddr + p_off, p_len, 0); + kunmap_atomic(vaddr); + csum = csum_block_add(csum, csum2, pos); + pos += p_len; + } + if (!(len -= copy)) return csum; offset += copy; - pos += copy; } start = end; } @@ -1988,8 +3551,8 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, __wsum csum2; if (copy > len) copy = len; - csum2 = skb_checksum(frag_iter, offset - start, - copy, 0); + csum2 = skb_checksum(frag_iter, offset - start, copy, + 0); csum = csum_block_add(csum, csum2, pos); if ((len -= copy) == 0) return csum; @@ -2007,19 +3570,20 @@ EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. */ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, - u8 *to, int len, __wsum csum) + u8 *to, int len) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int pos = 0; + __wsum csum = 0; /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; csum = csum_partial_copy_nocheck(skb->data + offset, to, - copy, csum); + copy); if ((len -= copy) == 0) return csum; offset += copy; @@ -2027,6 +3591,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, pos = copy; } + if (!skb_frags_readable(skb)) + return 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; @@ -2034,24 +3601,31 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; __wsum csum2; u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; if (copy > len) copy = len; - vaddr = kmap_atomic(skb_frag_page(frag)); - csum2 = csum_partial_copy_nocheck(vaddr + - frag->page_offset + - offset - start, to, - copy, 0); - kunmap_atomic(vaddr); - csum = csum_block_add(csum, csum2, pos); + + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + csum2 = csum_partial_copy_nocheck(vaddr + p_off, + to + copied, + p_len); + kunmap_atomic(vaddr); + csum = csum_block_add(csum, csum2, pos); + pos += p_len; + } + if (!(len -= copy)) return csum; offset += copy; to += copy; - pos += copy; } start = end; } @@ -2068,7 +3642,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, copy = len; csum2 = skb_copy_and_csum_bits(frag_iter, offset - start, - to, copy, 0); + to, copy); csum = csum_block_add(csum, csum2, pos); if ((len -= copy) == 0) return csum; @@ -2083,6 +3657,242 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +#ifdef CONFIG_NET_CRC32C +u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + if (copy > 0) { + copy = min(copy, len); + crc = crc32c(crc, skb->data + offset, copy); + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + + if (WARN_ON_ONCE(!skb_frags_readable(skb))) + return 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + copy = end - offset; + if (copy > 0) { + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + copy = min(copy, len); + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + crc = crc32c(crc, vaddr + p_off, p_len); + kunmap_atomic(vaddr); + } + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + copy = end - offset; + if (copy > 0) { + copy = min(copy, len); + crc = skb_crc32c(frag_iter, offset - start, copy, crc); + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + start = end; + } + BUG_ON(len); + + return crc; +} +EXPORT_SYMBOL(skb_crc32c); +#endif /* CONFIG_NET_CRC32C */ + +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) +{ + __sum16 sum; + + sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + /* See comments in __skb_checksum_complete(). */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + if (!skb_shared(skb)) + skb->csum_valid = !sum; + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete_head); + +/* This function assumes skb->csum already holds pseudo header's checksum, + * which has been changed from the hardware checksum, for example, by + * __skb_checksum_validate_complete(). And, the original skb->csum must + * have been validated unsuccessfully for CHECKSUM_COMPLETE case. + * + * It returns non-zero if the recomputed checksum is still invalid, otherwise + * zero. The new checksum is stored back into skb->csum unless the skb is + * shared. + */ +__sum16 __skb_checksum_complete(struct sk_buff *skb) +{ + __wsum csum; + __sum16 sum; + + csum = skb_checksum(skb, 0, skb->len, 0); + + sum = csum_fold(csum_add(skb->csum, csum)); + /* This check is inverted, because we already knew the hardware + * checksum is invalid before calling this function. So, if the + * re-computed checksum is valid instead, then we have a mismatch + * between the original skb->csum and skb_checksum(). This means either + * the original hardware checksum is incorrect or we screw up skb->csum + * when moving skb->data around. + */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + + if (!skb_shared(skb)) { + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + } + + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + + /** + * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() + * @from: source buffer + * + * Calculates the amount of linear headroom needed in the 'to' skb passed + * into skb_zerocopy(). + */ +unsigned int +skb_zerocopy_headlen(const struct sk_buff *from) +{ + unsigned int hlen = 0; + + if (!from->head_frag || + skb_headlen(from) < L1_CACHE_BYTES || + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { + hlen = skb_headlen(from); + if (!hlen) + hlen = from->len; + } + + if (skb_has_frag_list(from)) + hlen = from->len; + + return hlen; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); + +/** + * skb_zerocopy - Zero copy skb to skb + * @to: destination buffer + * @from: source buffer + * @len: number of bytes to copy from source buffer + * @hlen: size of linear headroom in destination buffer + * + * Copies up to `len` bytes from `from` to `to` by creating references + * to the frags in the source buffer. + * + * The `hlen` as calculated by skb_zerocopy_headlen() specifies the + * headroom in the `to` buffer. + * + * Return value: + * 0: everything is OK + * -ENOMEM: couldn't orphan frags of @from due to lack of memory + * -EFAULT: skb_copy_bits() found some problem with skb geometry + */ +int +skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) +{ + int i, j = 0; + int plen = 0; /* length of skb->head fragment */ + int ret; + struct page *page; + unsigned int offset; + + BUG_ON(!from->head_frag && !hlen); + + /* dont bother with small payloads */ + if (len <= skb_tailroom(to)) + return skb_copy_bits(from, 0, skb_put(to, len), len); + + if (hlen) { + ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); + if (unlikely(ret)) + return ret; + len -= hlen; + } else { + plen = min_t(int, skb_headlen(from), len); + if (plen) { + page = virt_to_head_page(from->head); + offset = from->data - (unsigned char *)page_address(page); + __skb_fill_netmem_desc(to, 0, page_to_netmem(page), + offset, plen); + get_page(page); + j = 1; + len -= plen; + } + } + + skb_len_add(to, len + plen); + + if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { + skb_tx_error(from); + return -ENOMEM; + } + skb_zerocopy_clone(to, from, GFP_ATOMIC); + + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { + int size; + + if (!len) + break; + skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; + size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), + len); + skb_frag_size_set(&skb_shinfo(to)->frags[j], size); + len -= size; + skb_frag_ref(to, j); + j++; + } + skb_shinfo(to)->nr_frags = j; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_zerocopy); + void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) { __wsum csum; @@ -2100,7 +3910,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) csum = 0; if (csstart != skb->len) csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, - skb->len - csstart, 0); + skb->len - csstart); if (skb->ip_summed == CHECKSUM_PARTIAL) { long csstuff = csstart + skb->csum_offset; @@ -2152,20 +3962,79 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) EXPORT_SYMBOL(skb_dequeue_tail); /** - * skb_queue_purge - empty a list + * skb_queue_purge_reason - empty a list * @list: list to empty + * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function takes the list * lock and is atomic with respect to other list locking functions. */ -void skb_queue_purge(struct sk_buff_head *list) +void skb_queue_purge_reason(struct sk_buff_head *list, + enum skb_drop_reason reason) { - struct sk_buff *skb; - while ((skb = skb_dequeue(list)) != NULL) + struct sk_buff_head tmp; + unsigned long flags; + + if (skb_queue_empty_lockless(list)) + return; + + __skb_queue_head_init(&tmp); + + spin_lock_irqsave(&list->lock, flags); + skb_queue_splice_init(list, &tmp); + spin_unlock_irqrestore(&list->lock, flags); + + __skb_queue_purge_reason(&tmp, reason); +} +EXPORT_SYMBOL(skb_queue_purge_reason); + +/** + * skb_rbtree_purge - empty a skb rbtree + * @root: root of the rbtree to empty + * Return value: the sum of truesizes of all purged skbs. + * + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from + * the list and one reference dropped. This function does not take + * any lock. Synchronization should be handled by the caller (e.g., TCP + * out-of-order queue is protected by the socket lock). + */ +unsigned int skb_rbtree_purge(struct rb_root *root) +{ + struct rb_node *p = rb_first(root); + unsigned int sum = 0; + + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); + sum += skb->truesize; kfree_skb(skb); + } + return sum; +} + +void skb_errqueue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb, *next; + struct sk_buff_head kill; + unsigned long flags; + + __skb_queue_head_init(&kill); + + spin_lock_irqsave(&list->lock, flags); + skb_queue_walk_safe(list, skb, next) { + if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || + SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) + continue; + __skb_unlink(skb, list); + __skb_queue_tail(&kill, skb); + } + spin_unlock_irqrestore(&list->lock, flags); + __skb_queue_purge(&kill); } -EXPORT_SYMBOL(skb_queue_purge); +EXPORT_SYMBOL(skb_errqueue_purge); /** * skb_queue_head - queue a buffer at the list head @@ -2249,28 +4118,6 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head } EXPORT_SYMBOL(skb_append); -/** - * skb_insert - insert a buffer - * @old: buffer to insert before - * @newsk: buffer to insert - * @list: list to use - * - * Place a packet before a given packet in a list. The list locks are - * taken and this function is atomic with respect to other list locked - * calls. - * - * A buffer cannot be placed on two lists at the same time. - */ -void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_insert(newsk, old->prev, old, list); - spin_unlock_irqrestore(&list->lock, flags); -} -EXPORT_SYMBOL(skb_insert); - static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, const int pos) @@ -2284,6 +4131,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb, skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb1->unreadable = skb->unreadable; skb_shinfo(skb)->nr_frags = 0; skb1->data_len = skb->data_len; skb1->len += skb1->data_len; @@ -2320,7 +4168,7 @@ static inline void skb_split_no_header(struct sk_buff *skb, * 2. Split is accurately. We make this. */ skb_frag_ref(skb, i); - skb_shinfo(skb1)->frags[0].page_offset += len - pos; + skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); skb_shinfo(skb)->nr_frags++; @@ -2331,6 +4179,8 @@ static inline void skb_split_no_header(struct sk_buff *skb, pos += size; } skb_shinfo(skb1)->nr_frags = k; + + skb1->unreadable = skb->unreadable; } /** @@ -2342,8 +4192,12 @@ static inline void skb_split_no_header(struct sk_buff *skb, void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); + const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; + + skb_zcopy_downgrade_managed(skb); - skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; + skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; + skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ @@ -2357,7 +4211,7 @@ EXPORT_SYMBOL(skb_split); */ static int skb_prepare_for_shift(struct sk_buff *skb) { - return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + return skb_unclone_keeptruesize(skb, GFP_ATOMIC); } /** @@ -2381,10 +4235,17 @@ static int skb_prepare_for_shift(struct sk_buff *skb) int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) { int from, to, merge, todo; - struct skb_frag_struct *fragfrom, *fragto; + skb_frag_t *fragfrom, *fragto; BUG_ON(shiftlen > skb->len); - BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ + + if (skb_headlen(skb)) + return 0; + if (skb_zcopy(tgt) || skb_zcopy(skb)) + return 0; + + DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle); + DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb)); todo = shiftlen; from = 0; @@ -2394,9 +4255,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) /* Actual merge is delayed until the point when we know we can * commit all, so that we don't have to undo partial changes */ - if (!to || - !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), - fragfrom->page_offset)) { + if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), + skb_frag_off(fragfrom))) { merge = -1; } else { merge = to - 1; @@ -2413,7 +4273,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) skb_frag_size_add(fragto, shiftlen); skb_frag_size_sub(fragfrom, shiftlen); - fragfrom->page_offset += shiftlen; + skb_frag_off_add(fragfrom, shiftlen); goto onlymerged; } @@ -2444,11 +4304,11 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) } else { __skb_frag_ref(fragfrom); - fragto->page = fragfrom->page; - fragto->page_offset = fragfrom->page_offset; + skb_frag_page_copy(fragto, fragfrom); + skb_frag_off_copy(fragto, fragfrom); skb_frag_size_set(fragto, todo); - fragfrom->page_offset += todo; + skb_frag_off_add(fragfrom, todo); skb_frag_size_sub(fragfrom, todo); todo = 0; @@ -2465,7 +4325,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, skb_frag_size(fragfrom)); - __skb_frag_unref(fragfrom); + __skb_frag_unref(fragfrom, skb->pp_recycle); } /* Reposition in the original skb */ @@ -2483,13 +4343,8 @@ onlymerged: tgt->ip_summed = CHECKSUM_PARTIAL; skb->ip_summed = CHECKSUM_PARTIAL; - /* Yak, is it really working this way? Some helper please? */ - skb->len -= shiftlen; - skb->data_len -= shiftlen; - skb->truesize -= shiftlen; - tgt->len += shiftlen; - tgt->data_len += shiftlen; - tgt->truesize += shiftlen; + skb_len_add(skb, -shiftlen); + skb_len_add(tgt, shiftlen); return shiftlen; } @@ -2512,6 +4367,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, st->root_skb = st->cur_skb = skb; st->frag_idx = st->stepped_offset = 0; st->frag_data = NULL; + st->frag_off = 0; } EXPORT_SYMBOL(skb_prepare_seq_read); @@ -2521,19 +4377,19 @@ EXPORT_SYMBOL(skb_prepare_seq_read); * @data: destination pointer for data to be returned * @st: state variable * - * Reads a block of skb data at &consumed relative to the + * Reads a block of skb data at @consumed relative to the * lower offset specified to skb_prepare_seq_read(). Assigns - * the head of the data block to &data and returns the length + * the head of the data block to @data and returns the length * of the block or 0 if the end of the skb data or the upper * offset has been reached. * * The caller is not required to consume all of the data - * returned, i.e. &consumed is typically set to the number + * returned, i.e. @consumed is typically set to the number * of bytes already consumed and the next call to * skb_seq_read() will return the remaining part of the block. * * Note 1: The size of each block of data returned can be arbitrary, - * this limitation is the cost for zerocopy seqeuental + * this limitation is the cost for zerocopy sequential * reads of potentially non linear data. * * Note 2: Fragment lists within fragments are not implemented @@ -2562,18 +4418,34 @@ next_skb: return block_limit - abs_offset; } + if (!skb_frags_readable(st->cur_skb)) + return 0; + if (st->frag_idx == 0 && !st->frag_data) st->stepped_offset += skb_headlen(st->cur_skb); while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + unsigned int pg_idx, pg_off, pg_sz; + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; - block_limit = skb_frag_size(frag) + st->stepped_offset; + pg_idx = 0; + pg_off = skb_frag_off(frag); + pg_sz = skb_frag_size(frag); + + if (skb_frag_must_loop(skb_frag_page(frag))) { + pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; + pg_off = offset_in_page(pg_off + st->frag_off); + pg_sz = min_t(unsigned int, pg_sz - st->frag_off, + PAGE_SIZE - pg_off); + } + + block_limit = pg_sz + st->stepped_offset; if (abs_offset < block_limit) { if (!st->frag_data) - st->frag_data = kmap_atomic(skb_frag_page(frag)); + st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); - *data = (u8 *) st->frag_data + frag->page_offset + + *data = (u8 *)st->frag_data + pg_off + (abs_offset - st->stepped_offset); return block_limit - abs_offset; @@ -2584,8 +4456,12 @@ next_skb: st->frag_data = NULL; } - st->frag_idx++; - st->stepped_offset += skb_frag_size(frag); + st->stepped_offset += pg_sz; + st->frag_off += pg_sz; + if (st->frag_off == skb_frag_size(frag)) { + st->frag_off = 0; + st->frag_idx++; + } } if (st->frag_data) { @@ -2621,6 +4497,41 @@ void skb_abort_seq_read(struct skb_seq_state *st) } EXPORT_SYMBOL(skb_abort_seq_read); +/** + * skb_copy_seq_read() - copy from a skb_seq_state to a buffer + * @st: source skb_seq_state + * @offset: offset in source + * @to: destination buffer + * @len: number of bytes to copy + * + * Copy @len bytes from @offset bytes into the source @st to the destination + * buffer @to. `offset` should increase (or be unchanged) with each subsequent + * call to this function. If offset needs to decrease from the previous use `st` + * should be reset first. + * + * Return: 0 on success or -EINVAL if the copy ended early + */ +int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len) +{ + const u8 *data; + u32 sqlen; + + for (;;) { + sqlen = skb_seq_read(offset, &data, st); + if (sqlen == 0) + return -EINVAL; + if (sqlen >= len) { + memcpy(to, data, len); + return 0; + } + memcpy(to, data, sqlen); + to += sqlen; + offset += sqlen; + len -= sqlen; + } +} +EXPORT_SYMBOL(skb_copy_seq_read); + #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, @@ -2641,7 +4552,6 @@ static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) * @from: search offset * @to: search limit * @config: textsearch configuration - * @state: uninitialized textsearch state variable * * Finds a pattern in the skb data according to the specified * textsearch configuration. Use textsearch_next() to retrieve @@ -2649,78 +4559,42 @@ static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) * to the first occurrence or UINT_MAX if no match was found. */ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, - unsigned int to, struct ts_config *config, - struct ts_state *state) + unsigned int to, struct ts_config *config) { + unsigned int patlen = config->ops->get_pattern_len(config); + struct ts_state state; unsigned int ret; + BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); + config->get_next_block = skb_ts_get_next_block; config->finish = skb_ts_finish; - skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); + skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); - ret = textsearch_find(config, state); - return (ret <= to - from ? ret : UINT_MAX); + ret = textsearch_find(config, &state); + return (ret + patlen <= to - from ? ret : UINT_MAX); } EXPORT_SYMBOL(skb_find_text); -/** - * skb_append_datato_frags - append the user data to a skb - * @sk: sock structure - * @skb: skb structure to be appened with user data. - * @getfrag: call back function to be used for getting the user data - * @from: pointer to user message iov - * @length: length of the iov message - * - * Description: This procedure append the user data in the fragment part - * of the skb if any page alloc fails user this procedure returns -ENOMEM - */ -int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, - int (*getfrag)(void *from, char *to, int offset, - int len, int odd, struct sk_buff *skb), - void *from, int length) +int skb_append_pagefrags(struct sk_buff *skb, struct page *page, + int offset, size_t size, size_t max_frags) { - int frg_cnt = skb_shinfo(skb)->nr_frags; - int copy; - int offset = 0; - int ret; - struct page_frag *pfrag = ¤t->task_frag; - - do { - /* Return error if we don't have space for new frag */ - if (frg_cnt >= MAX_SKB_FRAGS) - return -EMSGSIZE; - - if (!sk_page_frag_refill(sk, pfrag)) - return -ENOMEM; - - /* copy the user data to page */ - copy = min_t(int, length, pfrag->size - pfrag->offset); - - ret = getfrag(from, page_address(pfrag->page) + pfrag->offset, - offset, copy, 0, skb); - if (ret < 0) - return -EFAULT; - - /* copy was successful so update the size parameters */ - skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset, - copy); - frg_cnt++; - pfrag->offset += copy; - get_page(pfrag->page); - - skb->truesize += copy; - atomic_add(copy, &sk->sk_wmem_alloc); - skb->len += copy; - skb->data_len += copy; - offset += copy; - length -= copy; - - } while (length > 0); + int i = skb_shinfo(skb)->nr_frags; + + if (skb_can_coalesce(skb, i, page, offset)) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); + } else if (i < max_frags) { + skb_zcopy_downgrade_managed(skb); + get_page(page); + skb_fill_page_desc_noacc(skb, i, page, offset, size); + } else { + return -EMSGSIZE; + } return 0; } -EXPORT_SYMBOL(skb_append_datato_frags); +EXPORT_SYMBOL_GPL(skb_append_pagefrags); /** * skb_pull_rcsum - pull skb and update receive checksum @@ -2733,79 +4607,283 @@ EXPORT_SYMBOL(skb_append_datato_frags); * that the checksum difference is zero (e.g., a valid IP header) * or you are setting ip_summed to CHECKSUM_NONE. */ -unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) +void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) { + unsigned char *data = skb->data; + BUG_ON(len > skb->len); - skb->len -= len; - BUG_ON(skb->len < skb->data_len); - skb_postpull_rcsum(skb, skb->data, len); - return skb->data += len; + __skb_pull(skb, len); + skb_postpull_rcsum(skb, data, len); + return skb->data; } EXPORT_SYMBOL_GPL(skb_pull_rcsum); +static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) +{ + skb_frag_t head_frag; + struct page *page; + + page = virt_to_head_page(frag_skb->head); + skb_frag_fill_page_desc(&head_frag, page, frag_skb->data - + (unsigned char *)page_address(page), + skb_headlen(frag_skb)); + return head_frag; +} + +struct sk_buff *skb_segment_list(struct sk_buff *skb, + netdev_features_t features, + unsigned int offset) +{ + struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; + unsigned int tnl_hlen = skb_tnl_header_len(skb); + unsigned int delta_truesize = 0; + unsigned int delta_len = 0; + struct sk_buff *tail = NULL; + struct sk_buff *nskb, *tmp; + int len_diff, err; + + skb_push(skb, -skb_network_offset(skb) + offset); + + /* Ensure the head is writeable before touching the shared info */ + err = skb_unclone(skb, GFP_ATOMIC); + if (err) + goto err_linearize; + + skb_shinfo(skb)->frag_list = NULL; + + while (list_skb) { + nskb = list_skb; + list_skb = list_skb->next; + + err = 0; + delta_truesize += nskb->truesize; + if (skb_shared(nskb)) { + tmp = skb_clone(nskb, GFP_ATOMIC); + if (tmp) { + consume_skb(nskb); + nskb = tmp; + err = skb_unclone(nskb, GFP_ATOMIC); + } else { + err = -ENOMEM; + } + } + + if (!tail) + skb->next = nskb; + else + tail->next = nskb; + + if (unlikely(err)) { + nskb->next = list_skb; + goto err_linearize; + } + + tail = nskb; + + delta_len += nskb->len; + + skb_push(nskb, -skb_network_offset(nskb) + offset); + + skb_release_head_state(nskb); + len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); + __copy_skb_header(nskb, skb); + + skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); + nskb->transport_header += len_diff; + skb_copy_from_linear_data_offset(skb, -tnl_hlen, + nskb->data - tnl_hlen, + offset + tnl_hlen); + + if (skb_needs_linearize(nskb, features) && + __skb_linearize(nskb)) + goto err_linearize; + } + + skb->truesize = skb->truesize - delta_truesize; + skb->data_len = skb->data_len - delta_len; + skb->len = skb->len - delta_len; + + skb_gso_reset(skb); + + skb->prev = tail; + + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto err_linearize; + + skb_get(skb); + + return skb; + +err_linearize: + kfree_skb_list(skb->next); + skb->next = NULL; + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(skb_segment_list); + /** * skb_segment - Perform protocol segmentation on skb. - * @skb: buffer to segment + * @head_skb: buffer to segment * @features: features for the output path (see dev->features) * * This function performs segmentation on the given skb. It returns * a pointer to the first in a list of new skbs for the segments. * In case of error it returns ERR_PTR(err). */ -struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) +struct sk_buff *skb_segment(struct sk_buff *head_skb, + netdev_features_t features) { struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; - struct sk_buff *fskb = skb_shinfo(skb)->frag_list; - unsigned int mss = skb_shinfo(skb)->gso_size; - unsigned int doffset = skb->data - skb_mac_header(skb); + struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; + unsigned int mss = skb_shinfo(head_skb)->gso_size; + unsigned int doffset = head_skb->data - skb_mac_header(head_skb); unsigned int offset = doffset; - unsigned int tnl_hlen = skb_tnl_header_len(skb); + unsigned int tnl_hlen = skb_tnl_header_len(head_skb); + unsigned int partial_segs = 0; unsigned int headroom; - unsigned int len; + unsigned int len = head_skb->len; + struct sk_buff *frag_skb; + skb_frag_t *frag; __be16 proto; - bool csum; - int sg = !!(features & NETIF_F_SG); - int nfrags = skb_shinfo(skb)->nr_frags; + bool csum, sg; int err = -ENOMEM; int i = 0; - int pos; + int nfrags, pos; + + if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && + mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { + struct sk_buff *check_skb; + + for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { + if (skb_headlen(check_skb) && !check_skb->head_frag) { + /* gso_size is untrusted, and we have a frag_list with + * a linear non head_frag item. + * + * If head_skb's headlen does not fit requested gso_size, + * it means that the frag_list members do NOT terminate + * on exact gso_size boundaries. Hence we cannot perform + * skb_frag_t page sharing. Therefore we must fallback to + * copying the frag_list skbs; we do so by disabling SG. + */ + features &= ~NETIF_F_SG; + break; + } + } + } - proto = skb_network_protocol(skb); + __skb_push(head_skb, doffset); + proto = skb_network_protocol(head_skb, NULL); if (unlikely(!proto)) return ERR_PTR(-EINVAL); + sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); - __skb_push(skb, doffset); - headroom = skb_headroom(skb); - pos = skb_headlen(skb); + + if (sg && csum && (mss != GSO_BY_FRAGS)) { + if (!(features & NETIF_F_GSO_PARTIAL)) { + struct sk_buff *iter; + unsigned int frag_len; + + if (!list_skb || + !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) + goto normal; + + /* If we get here then all the required + * GSO features except frag_list are supported. + * Try to split the SKB to multiple GSO SKBs + * with no frag_list. + * Currently we can do that only when the buffers don't + * have a linear part and all the buffers except + * the last are of the same length. + */ + frag_len = list_skb->len; + skb_walk_frags(head_skb, iter) { + if (frag_len != iter->len && iter->next) + goto normal; + if (skb_headlen(iter) && !iter->head_frag) + goto normal; + + len -= iter->len; + } + + if (len != frag_len) + goto normal; + } + + /* GSO partial only requires that we trim off any excess that + * doesn't fit into an MSS sized block, so take care of that + * now. + * Cap len to not accidentally hit GSO_BY_FRAGS. + */ + partial_segs = min(len, GSO_BY_FRAGS - 1) / mss; + if (partial_segs > 1) + mss *= partial_segs; + else + partial_segs = 0; + } + +normal: + headroom = skb_headroom(head_skb); + pos = skb_headlen(head_skb); + + if (skb_orphan_frags(head_skb, GFP_ATOMIC)) + return ERR_PTR(-ENOMEM); + + nfrags = skb_shinfo(head_skb)->nr_frags; + frag = skb_shinfo(head_skb)->frags; + frag_skb = head_skb; do { struct sk_buff *nskb; - skb_frag_t *frag; + skb_frag_t *nskb_frag; int hsize; int size; - len = skb->len - offset; - if (len > mss) - len = mss; - - hsize = skb_headlen(skb) - offset; - if (hsize < 0) - hsize = 0; - if (hsize > len || !sg) - hsize = len; + if (unlikely(mss == GSO_BY_FRAGS)) { + len = list_skb->len; + } else { + len = head_skb->len - offset; + if (len > mss) + len = mss; + } - if (!hsize && i >= nfrags) { - BUG_ON(fskb->len != len); + hsize = skb_headlen(head_skb) - offset; - pos += len; - nskb = skb_clone(fskb, GFP_ATOMIC); - fskb = fskb->next; + if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && + (skb_headlen(list_skb) == len || sg)) { + BUG_ON(skb_headlen(list_skb) > len); + nskb = skb_clone(list_skb, GFP_ATOMIC); if (unlikely(!nskb)) goto err; + i = 0; + nfrags = skb_shinfo(list_skb)->nr_frags; + frag = skb_shinfo(list_skb)->frags; + frag_skb = list_skb; + pos += skb_headlen(list_skb); + + while (pos < offset + len) { + BUG_ON(i >= nfrags); + + size = skb_frag_size(frag); + if (pos + size > offset + len) + break; + + i++; + pos += size; + frag++; + } + + list_skb = list_skb->next; + + if (unlikely(pskb_trim(nskb, len))) { + kfree_skb(nskb); + goto err; + } + hsize = skb_end_offset(nskb); if (skb_cow_head(nskb, doffset + headroom)) { kfree_skb(nskb); @@ -2816,8 +4894,13 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) skb_release_head_state(nskb); __skb_push(nskb, doffset); } else { + if (hsize < 0) + hsize = 0; + if (hsize > len || !sg) + hsize = len; + nskb = __alloc_skb(hsize + doffset + headroom, - GFP_ATOMIC, skb_alloc_rx_flag(skb), + GFP_ATOMIC, skb_alloc_rx_flag(head_skb), NUMA_NO_NODE); if (unlikely(!nskb)) @@ -2833,80 +4916,101 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) segs = nskb; tail = nskb; - __copy_skb_header(nskb, skb); - nskb->mac_len = skb->mac_len; - - /* nskb and skb might have different headroom */ - if (nskb->ip_summed == CHECKSUM_PARTIAL) - nskb->csum_start += skb_headroom(nskb) - headroom; + __copy_skb_header(nskb, head_skb); - skb_reset_mac_header(nskb); - skb_set_network_header(nskb, skb->mac_len); - nskb->transport_header = (nskb->network_header + - skb_network_header_len(skb)); + skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); + skb_reset_mac_len(nskb); - skb_copy_from_linear_data_offset(skb, -tnl_hlen, + skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, nskb->data - tnl_hlen, doffset + tnl_hlen); - if (fskb != skb_shinfo(skb)->frag_list) + if (nskb->len == len + doffset) goto perform_csum_check; if (!sg) { - nskb->ip_summed = CHECKSUM_NONE; - nskb->csum = skb_copy_and_csum_bits(skb, offset, - skb_put(nskb, len), - len, 0); + if (!csum) { + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_copy_and_csum_bits(head_skb, offset, + skb_put(nskb, + len), + len); + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; + } else { + if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) + goto err; + } continue; } - frag = skb_shinfo(nskb)->frags; + nskb_frag = skb_shinfo(nskb)->frags; - skb_copy_from_linear_data_offset(skb, offset, + skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); - skb_shinfo(nskb)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; + skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & + SKBFL_SHARED_FRAG; - while (pos < offset + len && i < nfrags) { - *frag = skb_shinfo(skb)->frags[i]; - __skb_frag_ref(frag); - size = skb_frag_size(frag); + if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) + goto err; + + while (pos < offset + len) { + if (i >= nfrags) { + if (skb_orphan_frags(list_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, list_skb, + GFP_ATOMIC)) + goto err; + + i = 0; + nfrags = skb_shinfo(list_skb)->nr_frags; + frag = skb_shinfo(list_skb)->frags; + frag_skb = list_skb; + if (!skb_headlen(list_skb)) { + BUG_ON(!nfrags); + } else { + BUG_ON(!list_skb->head_frag); + + /* to make room for head_frag. */ + i--; + frag--; + } + + list_skb = list_skb->next; + } + + if (unlikely(skb_shinfo(nskb)->nr_frags >= + MAX_SKB_FRAGS)) { + net_warn_ratelimited( + "skb_segment: too many frags: %u %u\n", + pos, mss); + err = -EINVAL; + goto err; + } + + *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; + __skb_frag_ref(nskb_frag); + size = skb_frag_size(nskb_frag); if (pos < offset) { - frag->page_offset += offset - pos; - skb_frag_size_sub(frag, offset - pos); + skb_frag_off_add(nskb_frag, offset - pos); + skb_frag_size_sub(nskb_frag, offset - pos); } skb_shinfo(nskb)->nr_frags++; if (pos + size <= offset + len) { i++; + frag++; pos += size; } else { - skb_frag_size_sub(frag, pos + size - (offset + len)); + skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); goto skip_fraglist; } - frag++; - } - - if (pos < offset + len) { - struct sk_buff *fskb2 = fskb; - - BUG_ON(pos + fskb->len != offset + len); - - pos += fskb->len; - fskb = fskb->next; - - if (fskb2->next) { - fskb2 = skb_clone(fskb2, GFP_ATOMIC); - if (!fskb2) - goto err; - } else - skb_get(fskb2); - - SKB_FRAG_ASSERT(nskb); - skb_shinfo(nskb)->frag_list = fskb2; + nskb_frag++; } skip_fraglist: @@ -2916,199 +5020,174 @@ skip_fraglist: perform_csum_check: if (!csum) { - nskb->csum = skb_checksum(nskb, doffset, - nskb->len - doffset, 0); - nskb->ip_summed = CHECKSUM_NONE; - } - } while ((offset += len) < skb->len); - - return segs; - -err: - while ((skb = segs)) { - segs = skb->next; - kfree_skb(skb); - } - return ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(skb_segment); - -int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) -{ - struct sk_buff *p = *head; - struct sk_buff *nskb; - struct skb_shared_info *skbinfo = skb_shinfo(skb); - struct skb_shared_info *pinfo = skb_shinfo(p); - unsigned int headroom; - unsigned int len = skb_gro_len(skb); - unsigned int offset = skb_gro_offset(skb); - unsigned int headlen = skb_headlen(skb); - unsigned int delta_truesize; - - if (p->len + len >= 65536) - return -E2BIG; - - if (pinfo->frag_list) - goto merge; - else if (headlen <= offset) { - skb_frag_t *frag; - skb_frag_t *frag2; - int i = skbinfo->nr_frags; - int nr_frags = pinfo->nr_frags + i; - - offset -= headlen; - - if (nr_frags > MAX_SKB_FRAGS) - return -E2BIG; - - pinfo->nr_frags = nr_frags; - skbinfo->nr_frags = 0; - - frag = pinfo->frags + nr_frags; - frag2 = skbinfo->frags + i; - do { - *--frag = *--frag2; - } while (--i); - - frag->page_offset += offset; - skb_frag_size_sub(frag, offset); - - /* all fragments truesize : remove (head size + sk_buff) */ - delta_truesize = skb->truesize - - SKB_TRUESIZE(skb_end_offset(skb)); - - skb->truesize -= skb->data_len; - skb->len -= skb->data_len; - skb->data_len = 0; - - NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; - goto done; - } else if (skb->head_frag) { - int nr_frags = pinfo->nr_frags; - skb_frag_t *frag = pinfo->frags + nr_frags; - struct page *page = virt_to_head_page(skb->head); - unsigned int first_size = headlen - offset; - unsigned int first_offset; - - if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) - return -E2BIG; - - first_offset = skb->data - - (unsigned char *)page_address(page) + - offset; - - pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; - - frag->page.p = page; - frag->page_offset = first_offset; - skb_frag_size_set(frag, first_size); - - memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); - /* We dont need to clear skbinfo->nr_frags here */ - - delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); - NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; - goto done; - } else if (skb_gro_len(p) != pinfo->gso_size) - return -E2BIG; + if (skb_has_shared_frag(nskb) && + __skb_linearize(nskb)) + goto err; - headroom = skb_headroom(p); - nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); - if (unlikely(!nskb)) - return -ENOMEM; + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_checksum(nskb, doffset, + nskb->len - doffset, 0); + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; + } + } while ((offset += len) < head_skb->len); - __copy_skb_header(nskb, p); - nskb->mac_len = p->mac_len; + /* Some callers want to get the end of the list. + * Put it in segs->prev to avoid walking the list. + * (see validate_xmit_skb_list() for example) + */ + segs->prev = tail; - skb_reserve(nskb, headroom); - __skb_put(nskb, skb_gro_offset(p)); + if (partial_segs) { + struct sk_buff *iter; + int type = skb_shinfo(head_skb)->gso_type; + unsigned short gso_size = skb_shinfo(head_skb)->gso_size; - skb_set_mac_header(nskb, skb_mac_header(p) - p->data); - skb_set_network_header(nskb, skb_network_offset(p)); - skb_set_transport_header(nskb, skb_transport_offset(p)); + /* Update type to add partial and then remove dodgy if set */ + type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; + type &= ~SKB_GSO_DODGY; - __skb_pull(p, skb_gro_offset(p)); - memcpy(skb_mac_header(nskb), skb_mac_header(p), - p->data - skb_mac_header(p)); + /* Update GSO info and prepare to start updating headers on + * our way back down the stack of protocols. + */ + for (iter = segs; iter; iter = iter->next) { + skb_shinfo(iter)->gso_size = gso_size; + skb_shinfo(iter)->gso_segs = partial_segs; + skb_shinfo(iter)->gso_type = type; + SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; + } - skb_shinfo(nskb)->frag_list = p; - skb_shinfo(nskb)->gso_size = pinfo->gso_size; - pinfo->gso_size = 0; - skb_header_release(p); - NAPI_GRO_CB(nskb)->last = p; + if (tail->len - doffset <= gso_size) + skb_shinfo(tail)->gso_size = 0; + else if (tail != segs) + skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); + } - nskb->data_len += p->len; - nskb->truesize += p->truesize; - nskb->len += p->len; + /* Following permits correct backpressure, for protocols + * using skb_set_owner_w(). + * Idea is to tranfert ownership from head_skb to last segment. + */ + if (head_skb->destructor == sock_wfree) { + swap(tail->truesize, head_skb->truesize); + swap(tail->destructor, head_skb->destructor); + swap(tail->sk, head_skb->sk); + } + return segs; - *head = nskb; - nskb->next = p->next; - p->next = NULL; +err: + kfree_skb_list(segs); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(skb_segment); - p = nskb; +#ifdef CONFIG_SKB_EXTENSIONS +#define SKB_EXT_ALIGN_VALUE 8 +#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) -merge: - delta_truesize = skb->truesize; - if (offset > headlen) { - unsigned int eat = offset - headlen; +static const u8 skb_ext_type_len[] = { +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), +#endif +#ifdef CONFIG_XFRM + [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), +#endif +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), +#endif +#if IS_ENABLED(CONFIG_MPTCP) + [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), +#endif +#if IS_ENABLED(CONFIG_MCTP_FLOWS) + [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), +#endif +#if IS_ENABLED(CONFIG_INET_PSP) + [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), +#endif +}; - skbinfo->frags[0].page_offset += eat; - skb_frag_size_sub(&skbinfo->frags[0], eat); - skb->data_len -= eat; - skb->len -= eat; - offset = headlen; - } +static __always_inline unsigned int skb_ext_total_length(void) +{ + unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext); + int i; - __skb_pull(skb, offset); + for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++) + l += skb_ext_type_len[i]; - NAPI_GRO_CB(p)->last->next = skb; - NAPI_GRO_CB(p)->last = skb; - skb_header_release(skb); + return l; +} -done: - NAPI_GRO_CB(p)->count++; - p->data_len += len; - p->truesize += delta_truesize; - p->len += len; +static void skb_extensions_init(void) +{ + BUILD_BUG_ON(SKB_EXT_NUM >= 8); +#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL) + BUILD_BUG_ON(skb_ext_total_length() > 255); +#endif - NAPI_GRO_CB(skb)->same_flow = 1; - return 0; + skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", + SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); } -EXPORT_SYMBOL_GPL(skb_gro_receive); +#else +static void skb_extensions_init(void) {} +#endif + +/* The SKB kmem_cache slab is critical for network performance. Never + * merge/alias the slab with similar sized objects. This avoids fragmentation + * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. + */ +#ifndef CONFIG_SLUB_TINY +#define FLAG_SKB_NO_MERGE SLAB_NO_MERGE +#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ +#define FLAG_SKB_NO_MERGE 0 +#endif void __init skb_init(void) { - skbuff_head_cache = kmem_cache_create("skbuff_head_cache", + net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, + SLAB_HWCACHE_ALIGN|SLAB_PANIC| + FLAG_SKB_NO_MERGE, + offsetof(struct sk_buff, cb), + sizeof_field(struct sk_buff, cb), NULL); - skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", - (2*sizeof(struct sk_buff)) + - sizeof(atomic_t), + skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache); + + net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", + sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. + * struct skb_shared_info is located at the end of skb->head, + * and should not be copied to/from user. + */ + net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", + SKB_SMALL_HEAD_CACHE_SIZE, + 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, + 0, + SKB_SMALL_HEAD_HEADROOM, + NULL); + skb_extensions_init(); } -/** - * skb_to_sgvec - Fill a scatter-gather list from a socket buffer - * @skb: Socket buffer containing the buffers to be mapped - * @sg: The scatter-gather list to map into - * @offset: The offset into the buffer's contents to start mapping - * @len: Length of buffer space to be mapped - * - * Fill the specified scatter-gather list with mappings/pointers into a - * region of the buffer space attached to a socket buffer. - */ static int -__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, + unsigned int recursion_level) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int elt = 0; + if (unlikely(recursion_level >= 24)) + return -EMSGSIZE; + if (copy > 0) { if (copy > len) copy = len; @@ -3127,11 +5206,13 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + if (unlikely(elt && sg_is_last(&sg[elt - 1]))) + return -EMSGSIZE; if (copy > len) copy = len; sg_set_page(&sg[elt], skb_frag_page(frag), copy, - frag->page_offset+offset-start); + skb_frag_off(frag) + offset - start); elt++; if (!(len -= copy)) return elt; @@ -3141,16 +5222,22 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) } skb_walk_frags(skb, frag_iter) { - int end; + int end, ret; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { + if (unlikely(elt && sg_is_last(&sg[elt - 1]))) + return -EMSGSIZE; + if (copy > len) copy = len; - elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start, - copy); + ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, + copy, recursion_level + 1); + if (unlikely(ret < 0)) + return ret; + elt += ret; if ((len -= copy) == 0) return elt; offset += copy; @@ -3161,9 +5248,24 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) return elt; } +/** + * skb_to_sgvec - Fill a scatter-gather list from a socket buffer + * @skb: Socket buffer containing the buffers to be mapped + * @sg: The scatter-gather list to map into + * @offset: The offset into the buffer's contents to start mapping + * @len: Length of buffer space to be mapped + * + * Fill the specified scatter-gather list with mappings/pointers into a + * region of the buffer space attached to a socket buffer. Returns either + * the number of scatterlist items used, or -EMSGSIZE if the contents + * could not fit. + */ int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { - int nsg = __skb_to_sgvec(skb, sg, offset, len); + int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); + + if (nsg <= 0) + return nsg; sg_mark_end(&sg[nsg - 1]); @@ -3171,6 +5273,34 @@ int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int le } EXPORT_SYMBOL_GPL(skb_to_sgvec); +/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given + * sglist without mark the sg which contain last skb data as the end. + * So the caller can mannipulate sg list as will when padding new data after + * the first call without calling sg_unmark_end to expend sg list. + * + * Scenario to use skb_to_sgvec_nomark: + * 1. sg_init_table + * 2. skb_to_sgvec_nomark(payload1) + * 3. skb_to_sgvec_nomark(payload2) + * + * This is equivalent to: + * 1. sg_init_table + * 2. skb_to_sgvec(payload1) + * 3. sg_unmark_end + * 4. skb_to_sgvec(payload2) + * + * When mapping multiple payload conditionally, skb_to_sgvec_nomark + * is more preferable. + */ +int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, + int offset, int len) +{ + return __skb_to_sgvec(skb, sg, offset, len, 0); +} +EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); + + + /** * skb_cow_data - Check that a socket buffer's data buffers are writable * @skb: The socket buffer to check. @@ -3199,7 +5329,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) * at the moment even if they are anonymous). */ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && - __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) + !__pskb_pull_tail(skb, __skb_pagelen(skb))) return -ENOMEM; /* Easy case. Most of packets will go this way. */ @@ -3288,76 +5418,301 @@ static void sock_rmem_free(struct sk_buff *skb) atomic_sub(skb->truesize, &sk->sk_rmem_alloc); } +static void skb_set_err_queue(struct sk_buff *skb) +{ + /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. + * So, it is safe to (mis)use it to mark skbs on the error queue. + */ + skb->pkt_type = PACKET_OUTGOING; + BUILD_BUG_ON(PACKET_OUTGOING == 0); +} + /* * Note: We dont mem charge error packets (no sk_forward_alloc changes) */ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { - int len = skb->len; - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= - (unsigned int)sk->sk_rcvbuf) + (unsigned int)READ_ONCE(sk->sk_rcvbuf)) return -ENOMEM; skb_orphan(skb); skb->sk = sk; skb->destructor = sock_rmem_free; atomic_add(skb->truesize, &sk->sk_rmem_alloc); + skb_set_err_queue(skb); /* before exiting rcu section, make sure dst is refcounted */ skb_dst_force(skb); skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, len); + sk_error_report(sk); return 0; } EXPORT_SYMBOL(sock_queue_err_skb); -void skb_tstamp_tx(struct sk_buff *orig_skb, - struct skb_shared_hwtstamps *hwtstamps) +static bool is_icmp_err_skb(const struct sk_buff *skb) { - struct sock *sk = orig_skb->sk; - struct sock_exterr_skb *serr; - struct sk_buff *skb; - int err; + return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || + SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); +} - if (!sk) - return; +struct sk_buff *sock_dequeue_err_skb(struct sock *sk) +{ + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *skb_next = NULL; + bool icmp_next = false; + unsigned long flags; - if (hwtstamps) { - *skb_hwtstamps(orig_skb) = - *hwtstamps; - } else { - /* - * no hardware time stamps available, - * so keep the shared tx_flags and only - * store software time stamp - */ - orig_skb->tstamp = ktime_get_real(); + if (skb_queue_empty_lockless(q)) + return NULL; + + spin_lock_irqsave(&q->lock, flags); + skb = __skb_dequeue(q); + if (skb && (skb_next = skb_peek(q))) { + icmp_next = is_icmp_err_skb(skb_next); + if (icmp_next) + sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; } + spin_unlock_irqrestore(&q->lock, flags); - skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!skb) - return; + if (is_icmp_err_skb(skb) && !icmp_next) + sk->sk_err = 0; + + if (skb_next) + sk_error_report(sk); + + return skb; +} +EXPORT_SYMBOL(sock_dequeue_err_skb); + +/** + * skb_clone_sk - create clone of skb, and take reference to socket + * @skb: the skb to clone + * + * This function creates a clone of a buffer that holds a reference on + * sk_refcnt. Buffers created via this function are meant to be + * returned using sock_queue_err_skb, or free via kfree_skb. + * + * When passing buffers allocated with this function to sock_queue_err_skb + * it is necessary to wrap the call with sock_hold/sock_put in order to + * prevent the socket from being released prior to being enqueued on + * the sk_error_queue. + */ +struct sk_buff *skb_clone_sk(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sk_buff *clone; + + if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return NULL; + } + + clone->sk = sk; + clone->destructor = sock_efree; + + return clone; +} +EXPORT_SYMBOL(skb_clone_sk); + +static void __skb_complete_tx_timestamp(struct sk_buff *skb, + struct sock *sk, + int tstype, + bool opt_stats) +{ + struct sock_exterr_skb *serr; + int err; + + BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + serr->ee.ee_info = tstype; + serr->opt_stats = opt_stats; + serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { + serr->ee.ee_data = skb_shinfo(skb)->tskey; + if (sk_is_tcp(sk)) + serr->ee.ee_data -= atomic_read(&sk->sk_tskey); + } err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); } + +static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) +{ + bool ret; + + if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data))) + return true; + + read_lock_bh(&sk->sk_callback_lock); + ret = sk->sk_socket && sk->sk_socket->file && + file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); + read_unlock_bh(&sk->sk_callback_lock); + return ret; +} + +void skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = skb->sk; + + if (!skb_may_tx_timestamp(sk, false)) + goto err; + + /* Take a reference to prevent skb_orphan() from freeing the socket, + * but only if the socket refcount is not zero. + */ + if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); + sock_put(sk); + return; + } + +err: + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); + +static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps, + int tstype) +{ + switch (tstype) { + case SCM_TSTAMP_SCHED: + return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP; + case SCM_TSTAMP_SND: + return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF : + SKBTX_SW_TSTAMP); + case SCM_TSTAMP_ACK: + return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK; + case SCM_TSTAMP_COMPLETION: + return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP; + } + + return false; +} + +static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, + int tstype) +{ + int op; + + switch (tstype) { + case SCM_TSTAMP_SCHED: + op = BPF_SOCK_OPS_TSTAMP_SCHED_CB; + break; + case SCM_TSTAMP_SND: + if (hwtstamps) { + op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB; + *skb_hwtstamps(skb) = *hwtstamps; + } else { + op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB; + } + break; + case SCM_TSTAMP_ACK: + op = BPF_SOCK_OPS_TSTAMP_ACK_CB; + break; + default: + return; + } + + bpf_skops_tx_timestamping(sk, skb, op); +} + +void __skb_tstamp_tx(struct sk_buff *orig_skb, + const struct sk_buff *ack_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype) +{ + struct sk_buff *skb; + bool tsonly, opt_stats = false; + u32 tsflags; + + if (!sk) + return; + + if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF) + skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps, + sk, tstype); + + if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype)) + return; + + tsflags = READ_ONCE(sk->sk_tsflags); + if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) + return; + + tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + if (!skb_may_tx_timestamp(sk, tsonly)) + return; + + if (tsonly) { +#ifdef CONFIG_INET + if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && + sk_is_tcp(sk)) { + skb = tcp_get_timestamping_opt_stats(sk, orig_skb, + ack_skb); + opt_stats = true; + } else +#endif + skb = alloc_skb(0, GFP_ATOMIC); + } else { + skb = skb_clone(orig_skb, GFP_ATOMIC); + + if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { + kfree_skb(skb); + return; + } + } + if (!skb) + return; + + if (tsonly) { + skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & + SKBTX_ANY_TSTAMP; + skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; + } + + if (hwtstamps) + *skb_hwtstamps(skb) = *hwtstamps; + else + __net_timestamp(skb); + + __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); +} +EXPORT_SYMBOL_GPL(__skb_tstamp_tx); + +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, + SCM_TSTAMP_SND); +} EXPORT_SYMBOL_GPL(skb_tstamp_tx); +#ifdef CONFIG_WIRELESS void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) { struct sock *sk = skb->sk; struct sock_exterr_skb *serr; - int err; + int err = 1; skb->wifi_acked_valid = 1; skb->wifi_acked = acked; @@ -3367,12 +5722,18 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; - err = sock_queue_err_skb(sk, skb); + /* Take a reference to prevent skb_orphan() from freeing the socket, + * but only if the socket refcount is not zero. + */ + if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { + err = sock_queue_err_skb(sk, skb); + sock_put(sk); + } if (err) kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); - +#endif /* CONFIG_WIRELESS */ /** * skb_partial_csum_set - set up and verify partial csum values for packet @@ -3388,20 +5749,340 @@ EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); */ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) { - if (unlikely(start > skb_headlen(skb)) || - unlikely((int)start + off > skb_headlen(skb) - 2)) { - net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n", - start, off, skb_headlen(skb)); + u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); + u32 csum_start = skb_headroom(skb) + (u32)start; + + if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) { + net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", + start, off, skb_headroom(skb), skb_headlen(skb)); return false; } skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + start; + skb->csum_start = csum_start; skb->csum_offset = off; - skb_set_transport_header(skb, start); + skb->transport_header = csum_start; return true; } EXPORT_SYMBOL_GPL(skb_partial_csum_set); +static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, + unsigned int max) +{ + if (skb_headlen(skb) >= len) + return 0; + + /* If we need to pullup then pullup to the max, so we + * won't need to do it again. + */ + if (max > skb->len) + max = skb->len; + + if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) + return -ENOMEM; + + if (skb_headlen(skb) < len) + return -EPROTO; + + return 0; +} + +#define MAX_TCP_HDR_LEN (15 * 4) + +static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, + typeof(IPPROTO_IP) proto, + unsigned int off) +{ + int err; + + switch (proto) { + case IPPROTO_TCP: + err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), + off + MAX_TCP_HDR_LEN); + if (!err && !skb_partial_csum_set(skb, off, + offsetof(struct tcphdr, + check))) + err = -EPROTO; + return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; + + case IPPROTO_UDP: + err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), + off + sizeof(struct udphdr)); + if (!err && !skb_partial_csum_set(skb, off, + offsetof(struct udphdr, + check))) + err = -EPROTO; + return err ? ERR_PTR(err) : &udp_hdr(skb)->check; + } + + return ERR_PTR(-EPROTO); +} + +/* This value should be large enough to cover a tagged ethernet header plus + * maximally sized IP and TCP or UDP headers. + */ +#define MAX_IP_HDR_LEN 128 + +static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) +{ + unsigned int off; + bool fragment; + __sum16 *csum; + int err; + + fragment = false; + + err = skb_maybe_pull_tail(skb, + sizeof(struct iphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + + if (ip_is_fragment(ip_hdr(skb))) + fragment = true; + + off = ip_hdrlen(skb); + + err = -EPROTO; + + if (fragment) + goto out; + + csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); + if (IS_ERR(csum)) + return PTR_ERR(csum); + + if (recalculate) + *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - off, + ip_hdr(skb)->protocol, 0); + err = 0; + +out: + return err; +} + +/* This value should be large enough to cover a tagged ethernet header plus + * an IPv6 header, all options, and a maximal TCP or UDP header. + */ +#define MAX_IPV6_HDR_LEN 256 + +#define OPT_HDR(type, skb, off) \ + (type *)(skb_network_header(skb) + (off)) + +static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) +{ + int err; + u8 nexthdr; + unsigned int off; + unsigned int len; + bool fragment; + bool done; + __sum16 *csum; + + fragment = false; + done = false; + + off = sizeof(struct ipv6hdr); + + err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + nexthdr = ipv6_hdr(skb)->nexthdr; + + len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); + while (off <= len && !done) { + switch (nexthdr) { + case IPPROTO_DSTOPTS: + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: { + struct ipv6_opt_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct ipv6_opt_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); + nexthdr = hp->nexthdr; + off += ipv6_optlen(hp); + break; + } + case IPPROTO_AH: { + struct ip_auth_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct ip_auth_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct ip_auth_hdr, skb, off); + nexthdr = hp->nexthdr; + off += ipv6_authlen(hp); + break; + } + case IPPROTO_FRAGMENT: { + struct frag_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct frag_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct frag_hdr, skb, off); + + if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) + fragment = true; + + nexthdr = hp->nexthdr; + off += sizeof(struct frag_hdr); + break; + } + default: + done = true; + break; + } + } + + err = -EPROTO; + + if (!done || fragment) + goto out; + + csum = skb_checksum_setup_ip(skb, nexthdr, off); + if (IS_ERR(csum)) + return PTR_ERR(csum); + + if (recalculate) + *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - off, nexthdr, 0); + err = 0; + +out: + return err; +} + +/** + * skb_checksum_setup - set up partial checksum offset + * @skb: the skb to set up + * @recalculate: if true the pseudo-header checksum will be recalculated + */ +int skb_checksum_setup(struct sk_buff *skb, bool recalculate) +{ + int err; + + switch (skb->protocol) { + case htons(ETH_P_IP): + err = skb_checksum_setup_ipv4(skb, recalculate); + break; + + case htons(ETH_P_IPV6): + err = skb_checksum_setup_ipv6(skb, recalculate); + break; + + default: + err = -EPROTO; + break; + } + + return err; +} +EXPORT_SYMBOL(skb_checksum_setup); + +/** + * skb_checksum_maybe_trim - maybe trims the given skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * + * Checks whether the given skb has data beyond the given transport length. + * If so, returns a cloned skb trimmed to this transport length. + * Otherwise returns the provided skb. Returns NULL in error cases + * (e.g. transport_len exceeds skb length or out-of-memory). + * + * Caller needs to set the skb transport header and free any returned skb if it + * differs from the provided skb. + */ +static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, + unsigned int transport_len) +{ + struct sk_buff *skb_chk; + unsigned int len = skb_transport_offset(skb) + transport_len; + int ret; + + if (skb->len < len) + return NULL; + else if (skb->len == len) + return skb; + + skb_chk = skb_clone(skb, GFP_ATOMIC); + if (!skb_chk) + return NULL; + + ret = pskb_trim_rcsum(skb_chk, len); + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} + +/** + * skb_checksum_trimmed - validate checksum of an skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * @skb_chkf: checksum function to use + * + * Applies the given checksum function skb_chkf to the provided skb. + * Returns a checked and maybe trimmed skb. Returns NULL on error. + * + * If the skb has data beyond the given transport length, then a + * trimmed & cloned skb is checked and returned. + * + * Caller needs to set the skb transport header and free any returned skb if it + * differs from the provided skb. + */ +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)) +{ + struct sk_buff *skb_chk; + unsigned int offset = skb_transport_offset(skb); + __sum16 ret; + + skb_chk = skb_checksum_maybe_trim(skb, transport_len); + if (!skb_chk) + goto err; + + if (!pskb_may_pull(skb_chk, offset)) + goto err; + + skb_pull_rcsum(skb_chk, offset); + ret = skb_chkf(skb_chk); + skb_push_rcsum(skb_chk, offset); + + if (ret) + goto err; + + return skb_chk; + +err: + if (skb_chk && skb_chk != skb) + kfree_skb(skb_chk); + + return NULL; + +} +EXPORT_SYMBOL(skb_checksum_trimmed); + void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", @@ -3413,7 +6094,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) { if (head_stolen) { skb_release_head_state(skb); - kmem_cache_free(skbuff_head_cache, skb); + kmem_cache_free(net_hotdata.skbuff_cache, skb); } else { __kfree_skb(skb); } @@ -3430,6 +6111,7 @@ EXPORT_SYMBOL(kfree_skb_partial); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize) { + struct skb_shared_info *to_shinfo, *from_shinfo; int i, delta, len = from->len; *fragstolen = false; @@ -3437,21 +6119,38 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false; - if (len <= skb_tailroom(to)) { - BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); + /* In general, avoid mixing page_pool and non-page_pool allocated + * pages within the same SKB. In theory we could take full + * references if @from is cloned and !@to->pp_recycle but its + * tricky (due to potential race with the clone disappearing) and + * rare, so not worth dealing with. + */ + if (to->pp_recycle != from->pp_recycle) + return false; + + if (skb_frags_readable(from) != skb_frags_readable(to)) + return false; + + if (len <= skb_tailroom(to) && skb_frags_readable(from)) { + if (len) + BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); *delta_truesize = 0; return true; } - if (skb_has_frag_list(to) || skb_has_frag_list(from)) + to_shinfo = skb_shinfo(to); + from_shinfo = skb_shinfo(from); + if (to_shinfo->frag_list || from_shinfo->frag_list) + return false; + if (skb_zcopy(to) || skb_zcopy(from)) return false; if (skb_headlen(from) != 0) { struct page *page; unsigned int offset; - if (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + if (to_shinfo->nr_frags + + from_shinfo->nr_frags >= MAX_SKB_FRAGS) return false; if (skb_head_is_locked(from)) @@ -3462,12 +6161,12 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); - skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, + skb_fill_page_desc(to, to_shinfo->nr_frags, page, offset, skb_headlen(from)); *fragstolen = true; } else { - if (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) + if (to_shinfo->nr_frags + + from_shinfo->nr_frags > MAX_SKB_FRAGS) return false; delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); @@ -3475,19 +6174,21 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, WARN_ON_ONCE(delta < len); - memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, - skb_shinfo(from)->frags, - skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); - skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; + memcpy(to_shinfo->frags + to_shinfo->nr_frags, + from_shinfo->frags, + from_shinfo->nr_frags * sizeof(skb_frag_t)); + to_shinfo->nr_frags += from_shinfo->nr_frags; if (!skb_cloned(from)) - skb_shinfo(from)->nr_frags = 0; + from_shinfo->nr_frags = 0; /* if the skb is not cloned this does nothing * since we set nr_frags to 0. */ - for (i = 0; i < skb_shinfo(from)->nr_frags; i++) - skb_frag_ref(from, i); + if (skb_pp_frag_ref(from)) { + for (i = 0; i < from_shinfo->nr_frags; i++) + __skb_frag_ref(&from_shinfo->frags[i]); + } to->truesize += delta; to->len += len; @@ -3499,24 +6200,1220 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, EXPORT_SYMBOL(skb_try_coalesce); /** - * skb_scrub_packet - scrub an skb before sending it to another netns + * skb_scrub_packet - scrub an skb * * @skb: buffer to clean + * @xnet: packet is crossing netns * - * skb_scrub_packet can be used to clean an skb before injecting it in - * another namespace. We have to clear all information in the skb that - * could impact namespace isolation. + * skb_scrub_packet can be used after encapsulating or decapsulating a packet + * into/from a tunnel. Some information have to be cleared during these + * operations. + * skb_scrub_packet can also be used to clean a skb before injecting it in + * another namespace (@xnet == true). We have to clear all information in the + * skb that could impact namespace isolation. */ -void skb_scrub_packet(struct sk_buff *skb) +void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - skb_orphan(skb); - skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; + skb->ignore_df = 0; skb_dst_drop(skb); - skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); nf_reset_trace(skb); + +#ifdef CONFIG_NET_SWITCHDEV + skb->offload_fwd_mark = 0; + skb->offload_l3_fwd_mark = 0; +#endif + ipvs_reset(skb); + + if (!xnet) + return; + + skb->mark = 0; + skb_clear_tstamp(skb); } EXPORT_SYMBOL_GPL(skb_scrub_packet); + +static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) +{ + int mac_len, meta_len; + void *meta; + + if (skb_cow(skb, skb_headroom(skb)) < 0) { + kfree_skb(skb); + return NULL; + } + + mac_len = skb->data - skb_mac_header(skb); + if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { + memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), + mac_len - VLAN_HLEN - ETH_TLEN); + } + + meta_len = skb_metadata_len(skb); + if (meta_len) { + meta = skb_metadata_end(skb) - meta_len; + memmove(meta + VLAN_HLEN, meta, meta_len); + } + + skb->mac_header += VLAN_HLEN; + return skb; +} + +struct sk_buff *skb_vlan_untag(struct sk_buff *skb) +{ + struct vlan_hdr *vhdr; + u16 vlan_tci; + + if (unlikely(skb_vlan_tag_present(skb))) { + /* vlan_tci is already set-up so leave this for another time */ + return skb; + } + + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + goto err_free; + /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ + if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) + goto err_free; + + vhdr = (struct vlan_hdr *)skb->data; + vlan_tci = ntohs(vhdr->h_vlan_TCI); + __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); + + skb_pull_rcsum(skb, VLAN_HLEN); + vlan_set_encap_proto(skb, vhdr); + + skb = skb_reorder_vlan_header(skb); + if (unlikely(!skb)) + goto err_free; + + skb_reset_network_header(skb); + if (!skb_transport_header_was_set(skb)) + skb_reset_transport_header(skb); + skb_reset_mac_len(skb); + + return skb; + +err_free: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(skb_vlan_untag); + +int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) +{ + if (!pskb_may_pull(skb, write_len)) + return -ENOMEM; + + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) + return 0; + + return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_ensure_writable); + +int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev) +{ + int needed_headroom = dev->needed_headroom; + int needed_tailroom = dev->needed_tailroom; + + /* For tail taggers, we need to pad short frames ourselves, to ensure + * that the tail tag does not fail at its role of being at the end of + * the packet, once the conduit interface pads the frame. Account for + * that pad length here, and pad later. + */ + if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) + needed_tailroom += ETH_ZLEN - skb->len; + /* skb_headroom() returns unsigned int... */ + needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); + needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); + + if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) + /* No reallocation needed, yay! */ + return 0; + + return pskb_expand_head(skb, needed_headroom, needed_tailroom, + GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_ensure_writable_head_tail); + +/* remove VLAN header from packet and update csum accordingly. + * expects a non skb_vlan_tag_present skb with a vlan tag payload + */ +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +{ + int offset = skb->data - skb_mac_header(skb); + int err; + + if (WARN_ONCE(offset, + "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", + offset)) { + return -EINVAL; + } + + err = skb_ensure_writable(skb, VLAN_ETH_HLEN); + if (unlikely(err)) + return err; + + skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); + + vlan_remove_tag(skb, vlan_tci); + + skb->mac_header += VLAN_HLEN; + + if (skb_network_offset(skb) < ETH_HLEN) + skb_set_network_header(skb, ETH_HLEN); + + skb_reset_mac_len(skb); + + return err; +} +EXPORT_SYMBOL(__skb_vlan_pop); + +/* Pop a vlan tag either from hwaccel or from payload. + * Expects skb->data at mac header. + */ +int skb_vlan_pop(struct sk_buff *skb) +{ + u16 vlan_tci; + __be16 vlan_proto; + int err; + + if (likely(skb_vlan_tag_present(skb))) { + __vlan_hwaccel_clear_tag(skb); + } else { + if (unlikely(!eth_type_vlan(skb->protocol))) + return 0; + + err = __skb_vlan_pop(skb, &vlan_tci); + if (err) + return err; + } + /* move next vlan tag to hw accel tag */ + if (likely(!eth_type_vlan(skb->protocol))) + return 0; + + vlan_proto = skb->protocol; + err = __skb_vlan_pop(skb, &vlan_tci); + if (unlikely(err)) + return err; + + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + return 0; +} +EXPORT_SYMBOL(skb_vlan_pop); + +/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). + * Expects skb->data at mac header. + */ +int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) +{ + if (skb_vlan_tag_present(skb)) { + int offset = skb->data - skb_mac_header(skb); + int err; + + if (WARN_ONCE(offset, + "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", + offset)) { + return -EINVAL; + } + + err = __vlan_insert_tag(skb, skb->vlan_proto, + skb_vlan_tag_get(skb)); + if (err) + return err; + + skb->protocol = skb->vlan_proto; + skb->network_header -= VLAN_HLEN; + + skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); + } + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + return 0; +} +EXPORT_SYMBOL(skb_vlan_push); + +/** + * skb_eth_pop() - Drop the Ethernet header at the head of a packet + * + * @skb: Socket buffer to modify + * + * Drop the Ethernet header of @skb. + * + * Expects that skb->data points to the mac header and that no VLAN tags are + * present. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_eth_pop(struct sk_buff *skb) +{ + if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || + skb_network_offset(skb) < ETH_HLEN) + return -EPROTO; + + skb_pull_rcsum(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + return 0; +} +EXPORT_SYMBOL(skb_eth_pop); + +/** + * skb_eth_push() - Add a new Ethernet header at the head of a packet + * + * @skb: Socket buffer to modify + * @dst: Destination MAC address of the new header + * @src: Source MAC address of the new header + * + * Prepend @skb with a new Ethernet header. + * + * Expects that skb->data points to the mac header, which must be empty. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, + const unsigned char *src) +{ + struct ethhdr *eth; + int err; + + if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) + return -EPROTO; + + err = skb_cow_head(skb, sizeof(*eth)); + if (err < 0) + return err; + + skb_push(skb, sizeof(*eth)); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + eth = eth_hdr(skb); + ether_addr_copy(eth->h_dest, dst); + ether_addr_copy(eth->h_source, src); + eth->h_proto = skb->protocol; + + skb_postpush_rcsum(skb, eth, sizeof(*eth)); + + return 0; +} +EXPORT_SYMBOL(skb_eth_push); + +/* Update the ethertype of hdr and the skb csum value if required. */ +static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, + __be16 ethertype) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be16 diff[] = { ~hdr->h_proto, ethertype }; + + skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); + } + + hdr->h_proto = ethertype; +} + +/** + * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of + * the packet + * + * @skb: buffer + * @mpls_lse: MPLS label stack entry to push + * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) + * @mac_len: length of the MAC header + * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is + * ethernet + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, + int mac_len, bool ethernet) +{ + struct mpls_shim_hdr *lse; + int err; + + if (unlikely(!eth_p_mpls(mpls_proto))) + return -EINVAL; + + /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ + if (skb->encapsulation) + return -EINVAL; + + err = skb_cow_head(skb, MPLS_HLEN); + if (unlikely(err)) + return err; + + if (!skb->inner_protocol) { + skb_set_inner_network_header(skb, skb_network_offset(skb)); + skb_set_inner_protocol(skb, skb->protocol); + } + + skb_push(skb, MPLS_HLEN); + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), + mac_len); + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + skb_reset_mac_len(skb); + + lse = mpls_hdr(skb); + lse->label_stack_entry = mpls_lse; + skb_postpush_rcsum(skb, lse, MPLS_HLEN); + + if (ethernet && mac_len >= ETH_HLEN) + skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); + skb->protocol = mpls_proto; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_mpls_push); + +/** + * skb_mpls_pop() - pop the outermost MPLS header + * + * @skb: buffer + * @next_proto: ethertype of header after popped MPLS header + * @mac_len: length of the MAC header + * @ethernet: flag to indicate if the packet is ethernet + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, + bool ethernet) +{ + int err; + + if (unlikely(!eth_p_mpls(skb->protocol))) + return 0; + + err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), + mac_len); + + __skb_pull(skb, MPLS_HLEN); + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + + if (ethernet && mac_len >= ETH_HLEN) { + struct ethhdr *hdr; + + /* use mpls_hdr() to get ethertype to account for VLANs. */ + hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); + skb_mod_eth_type(skb, hdr, next_proto); + } + skb->protocol = next_proto; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_mpls_pop); + +/** + * skb_mpls_update_lse() - modify outermost MPLS header and update csum + * + * @skb: buffer + * @mpls_lse: new MPLS label stack entry to update to + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) +{ + int err; + + if (unlikely(!eth_p_mpls(skb->protocol))) + return -EINVAL; + + err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; + + skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); + } + + mpls_hdr(skb)->label_stack_entry = mpls_lse; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_mpls_update_lse); + +/** + * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header + * + * @skb: buffer + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_mpls_dec_ttl(struct sk_buff *skb) +{ + u32 lse; + u8 ttl; + + if (unlikely(!eth_p_mpls(skb->protocol))) + return -EINVAL; + + if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) + return -ENOMEM; + + lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); + ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + if (!--ttl) + return -EINVAL; + + lse &= ~MPLS_LS_TTL_MASK; + lse |= ttl << MPLS_LS_TTL_SHIFT; + + return skb_mpls_update_lse(skb, cpu_to_be32(lse)); +} +EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); + +/** + * alloc_skb_with_frags - allocate skb with page frags + * + * @header_len: size of linear part + * @data_len: needed length in frags + * @order: max page order desired. + * @errcode: pointer to error code if any + * @gfp_mask: allocation mask + * + * This can be used to allocate a paged skb, given a maximal order for frags. + */ +struct sk_buff *alloc_skb_with_frags(unsigned long header_len, + unsigned long data_len, + int order, + int *errcode, + gfp_t gfp_mask) +{ + unsigned long chunk; + struct sk_buff *skb; + struct page *page; + int nr_frags = 0; + + *errcode = -EMSGSIZE; + if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) + return NULL; + + *errcode = -ENOBUFS; + skb = alloc_skb(header_len, gfp_mask); + if (!skb) + return NULL; + + while (data_len) { + if (nr_frags == MAX_SKB_FRAGS) + goto failure; + while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) + order--; + + if (order) { + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | + __GFP_NOWARN, + order); + if (!page) { + order--; + continue; + } + } else { + page = alloc_page(gfp_mask); + if (!page) + goto failure; + } + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, nr_frags, page, 0, chunk); + nr_frags++; + skb->truesize += (PAGE_SIZE << order); + data_len -= chunk; + } + return skb; + +failure: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(alloc_skb_with_frags); + +/* carve out the first off bytes from skb when off < headlen */ +static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, + const int headlen, gfp_t gfp_mask) +{ + int i; + unsigned int size = skb_end_offset(skb); + int new_hlen = headlen - off; + u8 *data; + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + size = SKB_WITH_OVERHEAD(size); + + /* Copy real data, and all frags */ + skb_copy_from_linear_data_offset(skb, off, data, new_hlen); + skb->len -= off; + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), + offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + if (skb_cloned(skb)) { + /* drop the old head gracefully */ + if (skb_orphan_frags(skb, gfp_mask)) { + skb_kfree_head(data, size); + return -ENOMEM; + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_ref(skb, i); + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + skb_release_data(skb, SKB_CONSUMED); + } else { + /* we can reuse existing recount- all we did was + * relocate values + */ + skb_free_head(skb); + } + + skb->head = data; + skb->data = data; + skb->head_frag = 0; + skb_set_end_offset(skb, size); + skb_set_tail_pointer(skb, skb_headlen(skb)); + skb_headers_offset_update(skb, 0); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); + + return 0; +} + +static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); + +/* carve out the first eat bytes from skb's frag_list. May recurse into + * pskb_carve() + */ +static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat, + gfp_t gfp_mask) +{ + struct sk_buff *list = shinfo->frag_list; + struct sk_buff *clone = NULL; + struct sk_buff *insp = NULL; + + do { + if (!list) { + pr_err("Not enough bytes to eat. Want %d\n", eat); + return -EFAULT; + } + if (list->len <= eat) { + /* Eaten as whole. */ + eat -= list->len; + list = list->next; + insp = list; + } else { + /* Eaten partially. */ + if (skb_shared(list)) { + clone = skb_clone(list, gfp_mask); + if (!clone) + return -ENOMEM; + insp = list->next; + list = clone; + } else { + /* This may be pulled without problems. */ + insp = list; + } + if (pskb_carve(list, eat, gfp_mask) < 0) { + kfree_skb(clone); + return -ENOMEM; + } + break; + } + } while (eat); + + /* Free pulled out fragments. */ + while ((list = shinfo->frag_list) != insp) { + shinfo->frag_list = list->next; + consume_skb(list); + } + /* And insert new clone at head. */ + if (clone) { + clone->next = list; + shinfo->frag_list = clone; + } + return 0; +} + +/* carve off first len bytes from skb. Split line (off) is in the + * non-linear part of skb + */ +static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, + int pos, gfp_t gfp_mask) +{ + int i, k = 0; + unsigned int size = skb_end_offset(skb); + u8 *data; + const int nfrags = skb_shinfo(skb)->nr_frags; + struct skb_shared_info *shinfo; + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + size = SKB_WITH_OVERHEAD(size); + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); + if (skb_orphan_frags(skb, gfp_mask)) { + skb_kfree_head(data, size); + return -ENOMEM; + } + shinfo = (struct skb_shared_info *)(data + size); + for (i = 0; i < nfrags; i++) { + int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (pos + fsize > off) { + shinfo->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < off) { + /* Split frag. + * We have two variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + skb_frag_off_add(&shinfo->frags[0], off - pos); + skb_frag_size_sub(&shinfo->frags[0], off - pos); + } + skb_frag_ref(skb, i); + k++; + } + pos += fsize; + } + shinfo->nr_frags = k; + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + + /* split line is in frag list */ + if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) { + /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ + if (skb_has_frag_list(skb)) + kfree_skb_list(skb_shinfo(skb)->frag_list); + skb_kfree_head(data, size); + return -ENOMEM; + } + skb_release_data(skb, SKB_CONSUMED); + + skb->head = data; + skb->head_frag = 0; + skb->data = data; + skb_set_end_offset(skb, size); + skb_reset_tail_pointer(skb); + skb_headers_offset_update(skb, 0); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + skb->len -= off; + skb->data_len = skb->len; + atomic_set(&skb_shinfo(skb)->dataref, 1); + return 0; +} + +/* remove len bytes from the beginning of the skb */ +static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) +{ + int headlen = skb_headlen(skb); + + if (len < headlen) + return pskb_carve_inside_header(skb, len, headlen, gfp); + else + return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); +} + +/* Extract to_copy bytes starting at off from skb, and return this in + * a new skb + */ +struct sk_buff *pskb_extract(struct sk_buff *skb, int off, + int to_copy, gfp_t gfp) +{ + struct sk_buff *clone = skb_clone(skb, gfp); + + if (!clone) + return NULL; + + if (pskb_carve(clone, off, gfp) < 0 || + pskb_trim(clone, to_copy)) { + kfree_skb(clone); + return NULL; + } + return clone; +} +EXPORT_SYMBOL(pskb_extract); + +/** + * skb_condense - try to get rid of fragments/frag_list if possible + * @skb: buffer + * + * Can be used to save memory before skb is added to a busy queue. + * If packet has bytes in frags and enough tail room in skb->head, + * pull all of them, so that we can free the frags right now and adjust + * truesize. + * Notes: + * We do not reallocate skb->head thus can not fail. + * Caller must re-evaluate skb->truesize if needed. + */ +void skb_condense(struct sk_buff *skb) +{ + if (skb->data_len) { + if (skb->data_len > skb->end - skb->tail || + skb_cloned(skb) || !skb_frags_readable(skb)) + return; + + /* Nice, we can free page frag(s) right now */ + __pskb_pull_tail(skb, skb->data_len); + } + /* At this point, skb->truesize might be over estimated, + * because skb had a fragment, and fragments do not tell + * their truesize. + * When we pulled its content into skb->head, fragment + * was freed, but __pskb_pull_tail() could not possibly + * adjust skb->truesize, not knowing the frag truesize. + */ + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); +} +EXPORT_SYMBOL(skb_condense); + +#ifdef CONFIG_SKB_EXTENSIONS +static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) +{ + return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); +} + +/** + * __skb_ext_alloc - allocate a new skb extensions storage + * + * @flags: See kmalloc(). + * + * Returns the newly allocated pointer. The pointer can later attached to a + * skb via __skb_ext_set(). + * Note: caller must handle the skb_ext as an opaque data. + */ +struct skb_ext *__skb_ext_alloc(gfp_t flags) +{ + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); + + if (new) { + memset(new->offset, 0, sizeof(new->offset)); + refcount_set(&new->refcnt, 1); + } + + return new; +} + +static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, + unsigned int old_active) +{ + struct skb_ext *new; + + if (refcount_read(&old->refcnt) == 1) + return old; + + new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + if (!new) + return NULL; + + memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); + refcount_set(&new->refcnt, 1); + +#ifdef CONFIG_XFRM + if (old_active & (1 << SKB_EXT_SEC_PATH)) { + struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_hold(sp->xvec[i]); + } +#endif +#ifdef CONFIG_MCTP_FLOWS + if (old_active & (1 << SKB_EXT_MCTP)) { + struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP); + + if (flow->key) + refcount_inc(&flow->key->refs); + } +#endif + __skb_ext_put(old); + return new; +} + +/** + * __skb_ext_set - attach the specified extension storage to this skb + * @skb: buffer + * @id: extension id + * @ext: extension storage previously allocated via __skb_ext_alloc() + * + * Existing extensions, if any, are cleared. + * + * Returns the pointer to the extension. + */ +void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, + struct skb_ext *ext) +{ + unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); + + skb_ext_put(skb); + newlen = newoff + skb_ext_type_len[id]; + ext->chunks = newlen; + ext->offset[id] = newoff; + skb->extensions = ext; + skb->active_extensions = 1 << id; + return skb_ext_get_ptr(ext, id); +} +EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL"); + +/** + * skb_ext_add - allocate space for given extension, COW if needed + * @skb: buffer + * @id: extension to allocate space for + * + * Allocates enough space for the given extension. + * If the extension is already present, a pointer to that extension + * is returned. + * + * If the skb was cloned, COW applies and the returned memory can be + * modified without changing the extension space of clones buffers. + * + * Returns pointer to the extension or NULL on allocation failure. + */ +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *new, *old = NULL; + unsigned int newlen, newoff; + + if (skb->active_extensions) { + old = skb->extensions; + + new = skb_ext_maybe_cow(old, skb->active_extensions); + if (!new) + return NULL; + + if (__skb_ext_exist(new, id)) + goto set_active; + + newoff = new->chunks; + } else { + newoff = SKB_EXT_CHUNKSIZEOF(*new); + + new = __skb_ext_alloc(GFP_ATOMIC); + if (!new) + return NULL; + } + + newlen = newoff + skb_ext_type_len[id]; + new->chunks = newlen; + new->offset[id] = newoff; +set_active: + skb->slow_gro = 1; + skb->extensions = new; + skb->active_extensions |= 1 << id; + return skb_ext_get_ptr(new, id); +} +EXPORT_SYMBOL(skb_ext_add); + +#ifdef CONFIG_XFRM +static void skb_ext_put_sp(struct sec_path *sp) +{ + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_put(sp->xvec[i]); +} +#endif + +#ifdef CONFIG_MCTP_FLOWS +static void skb_ext_put_mctp(struct mctp_flow *flow) +{ + if (flow->key) + mctp_key_unref(flow->key); +} +#endif + +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *ext = skb->extensions; + + skb->active_extensions &= ~(1 << id); + if (skb->active_extensions == 0) { + skb->extensions = NULL; + __skb_ext_put(ext); +#ifdef CONFIG_XFRM + } else if (id == SKB_EXT_SEC_PATH && + refcount_read(&ext->refcnt) == 1) { + struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); + + skb_ext_put_sp(sp); + sp->len = 0; +#endif + } +} +EXPORT_SYMBOL(__skb_ext_del); + +void __skb_ext_put(struct skb_ext *ext) +{ + /* If this is last clone, nothing can increment + * it after check passes. Avoids one atomic op. + */ + if (refcount_read(&ext->refcnt) == 1) + goto free_now; + + if (!refcount_dec_and_test(&ext->refcnt)) + return; +free_now: +#ifdef CONFIG_XFRM + if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) + skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); +#endif +#ifdef CONFIG_MCTP_FLOWS + if (__skb_ext_exist(ext, SKB_EXT_MCTP)) + skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); +#endif + + kmem_cache_free(skbuff_ext_cache, ext); +} +EXPORT_SYMBOL(__skb_ext_put); +#endif /* CONFIG_SKB_EXTENSIONS */ + +static void kfree_skb_napi_cache(struct sk_buff *skb) +{ + /* if SKB is a clone, don't handle this case */ + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { + __kfree_skb(skb); + return; + } + + local_bh_disable(); + __napi_kfree_skb(skb, SKB_CONSUMED); + local_bh_enable(); +} + +/** + * skb_attempt_defer_free - queue skb for remote freeing + * @skb: buffer + * + * Put @skb in a per-cpu list, using the cpu which + * allocated the skb/pages to reduce false sharing + * and memory zone spinlock contention. + */ +void skb_attempt_defer_free(struct sk_buff *skb) +{ + struct skb_defer_node *sdn; + unsigned long defer_count; + int cpu = skb->alloc_cpu; + unsigned int defer_max; + bool kick; + + if (cpu == raw_smp_processor_id() || + WARN_ON_ONCE(cpu >= nr_cpu_ids) || + !cpu_online(cpu)) { +nodefer: kfree_skb_napi_cache(skb); + return; + } + + DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); + DEBUG_NET_WARN_ON_ONCE(skb->destructor); + DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb)); + + sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); + + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); + defer_count = atomic_long_inc_return(&sdn->defer_count); + + if (defer_count >= defer_max) + goto nodefer; + + llist_add(&skb->ll_node, &sdn->defer_list); + + /* Send an IPI every time queue reaches half capacity. */ + kick = (defer_count - 1) == (defer_max >> 1); + + /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU + * if we are unlucky enough (this seems very unlikely). + */ + if (unlikely(kick)) + kick_defer_list_purge(cpu); +} + +static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, + size_t offset, size_t len) +{ + const char *kaddr; + __wsum csum; + + kaddr = kmap_local_page(page); + csum = csum_partial(kaddr + offset, len, 0); + kunmap_local(kaddr); + skb->csum = csum_block_add(skb->csum, csum, skb->len); +} + +/** + * skb_splice_from_iter - Splice (or copy) pages to skbuff + * @skb: The buffer to add pages to + * @iter: Iterator representing the pages to be added + * @maxsize: Maximum amount of pages to be added + * + * This is a common helper function for supporting MSG_SPLICE_PAGES. It + * extracts pages from an iterator and adds them to the socket buffer if + * possible, copying them to fragments if not possible (such as if they're slab + * pages). + * + * Returns the amount of data spliced/copied or -EMSGSIZE if there's + * insufficient space in the buffer to transfer anything. + */ +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, + ssize_t maxsize) +{ + size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); + struct page *pages[8], **ppages = pages; + ssize_t spliced = 0, ret = 0; + unsigned int i; + + while (iter->count > 0) { + ssize_t space, nr, len; + size_t off; + + ret = -EMSGSIZE; + space = frag_limit - skb_shinfo(skb)->nr_frags; + if (space < 0) + break; + + /* We might be able to coalesce without increasing nr_frags */ + nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); + + len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); + if (len <= 0) { + ret = len ?: -EIO; + break; + } + + i = 0; + do { + struct page *page = pages[i++]; + size_t part = min_t(size_t, PAGE_SIZE - off, len); + + ret = -EIO; + if (WARN_ON_ONCE(!sendpage_ok(page))) + goto out; + + ret = skb_append_pagefrags(skb, page, off, part, + frag_limit); + if (ret < 0) { + iov_iter_revert(iter, len); + goto out; + } + + if (skb->ip_summed == CHECKSUM_NONE) + skb_splice_csum_page(skb, page, off, part); + + off = 0; + spliced += part; + maxsize -= part; + len -= part; + } while (len > 0); + + if (maxsize <= 0) + break; + } + +out: + skb_len_add(skb, spliced); + return spliced ?: ret; +} +EXPORT_SYMBOL(skb_splice_from_iter); + +static __always_inline +size_t memcpy_from_iter_csum(void *iter_from, size_t progress, + size_t len, void *to, void *priv2) +{ + __wsum *csum = priv2; + __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len); + + *csum = csum_block_add(*csum, next, progress); + return 0; +} + +static __always_inline +size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress, + size_t len, void *to, void *priv2) +{ + __wsum next, *csum = priv2; + + next = csum_and_copy_from_user(iter_from, to + progress, len); + *csum = csum_block_add(*csum, next, progress); + return next ? 0 : len; +} + +bool csum_and_copy_from_iter_full(void *addr, size_t bytes, + __wsum *csum, struct iov_iter *i) +{ + size_t copied; + + if (WARN_ON_ONCE(!i->data_source)) + return false; + copied = iterate_and_advance2(i, bytes, addr, csum, + copy_from_user_iter_csum, + memcpy_from_iter_csum); + if (likely(copied == bytes)) + return true; + iov_iter_revert(i, copied); + return false; +} +EXPORT_SYMBOL(csum_and_copy_from_iter_full); + +void get_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_get_net_iov(netmem_to_net_iov(netmem)); + return; + } + get_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(get_netmem); + +void put_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_put_net_iov(netmem_to_net_iov(netmem)); + return; + } + + put_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(put_netmem); diff --git a/net/core/skmsg.c b/net/core/skmsg.c new file mode 100644 index 000000000000..2ac7731e1e0a --- /dev/null +++ b/net/core/skmsg.c @@ -0,0 +1,1289 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include <linux/skmsg.h> +#include <linux/skbuff.h> +#include <linux/scatterlist.h> + +#include <net/sock.h> +#include <net/tcp.h> +#include <net/tls.h> +#include <trace/events/sock.h> + +static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) +{ + if (msg->sg.end > msg->sg.start && + elem_first_coalesce < msg->sg.end) + return true; + + if (msg->sg.end < msg->sg.start && + (elem_first_coalesce > msg->sg.start || + elem_first_coalesce < msg->sg.end)) + return true; + + return false; +} + +int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, + int elem_first_coalesce) +{ + struct page_frag *pfrag = sk_page_frag(sk); + u32 osize = msg->sg.size; + int ret = 0; + + len -= msg->sg.size; + while (len > 0) { + struct scatterlist *sge; + u32 orig_offset; + int use, i; + + if (!sk_page_frag_refill(sk, pfrag)) { + ret = -ENOMEM; + goto msg_trim; + } + + orig_offset = pfrag->offset; + use = min_t(int, len, pfrag->size - orig_offset); + if (!sk_wmem_schedule(sk, use)) { + ret = -ENOMEM; + goto msg_trim; + } + + i = msg->sg.end; + sk_msg_iter_var_prev(i); + sge = &msg->sg.data[i]; + + if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) && + sg_page(sge) == pfrag->page && + sge->offset + sge->length == orig_offset) { + sge->length += use; + } else { + if (sk_msg_full(msg)) { + ret = -ENOSPC; + break; + } + + sge = &msg->sg.data[msg->sg.end]; + sg_unmark_end(sge); + sg_set_page(sge, pfrag->page, use, orig_offset); + get_page(pfrag->page); + sk_msg_iter_next(msg, end); + } + + sk_mem_charge(sk, use); + msg->sg.size += use; + pfrag->offset += use; + len -= use; + } + + return ret; + +msg_trim: + sk_msg_trim(sk, msg, osize); + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_alloc); + +int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, + u32 off, u32 len) +{ + int i = src->sg.start; + struct scatterlist *sge = sk_msg_elem(src, i); + struct scatterlist *sgd = NULL; + u32 sge_len, sge_off; + + while (off) { + if (sge->length > off) + break; + off -= sge->length; + sk_msg_iter_var_next(i); + if (i == src->sg.end && off) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + while (len) { + sge_len = sge->length - off; + if (sge_len > len) + sge_len = len; + + if (dst->sg.end) + sgd = sk_msg_elem(dst, dst->sg.end - 1); + + if (sgd && + (sg_page(sge) == sg_page(sgd)) && + (sg_virt(sge) + off == sg_virt(sgd) + sgd->length)) { + sgd->length += sge_len; + dst->sg.size += sge_len; + } else if (!sk_msg_full(dst)) { + sge_off = sge->offset + off; + sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); + } else { + return -ENOSPC; + } + + off = 0; + len -= sge_len; + sk_mem_charge(sk, sge_len); + sk_msg_iter_var_next(i); + if (i == src->sg.end && len) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + return 0; +} +EXPORT_SYMBOL_GPL(sk_msg_clone); + +void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes) +{ + int i = msg->sg.start; + + do { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (bytes < sge->length) { + sge->length -= bytes; + sge->offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sge->length); + bytes -= sge->length; + sge->length = 0; + sge->offset = 0; + sk_msg_iter_var_next(i); + } while (bytes && i != msg->sg.end); + msg->sg.start = i; +} +EXPORT_SYMBOL_GPL(sk_msg_return_zero); + +void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes) +{ + int i = msg->sg.start; + + do { + struct scatterlist *sge = &msg->sg.data[i]; + int uncharge = (bytes < sge->length) ? bytes : sge->length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); +} +EXPORT_SYMBOL_GPL(sk_msg_return); + +static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, + bool charge) +{ + struct scatterlist *sge = sk_msg_elem(msg, i); + u32 len = sge->length; + + /* When the skb owns the memory we free it from consume_skb path. */ + if (!msg->skb) { + if (charge) + sk_mem_uncharge(sk, len); + put_page(sg_page(sge)); + } + memset(sge, 0, sizeof(*sge)); + return len; +} + +static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i, + bool charge) +{ + struct scatterlist *sge = sk_msg_elem(msg, i); + int freed = 0; + + while (msg->sg.size) { + msg->sg.size -= sge->length; + freed += sk_msg_free_elem(sk, msg, i, charge); + sk_msg_iter_var_next(i); + sk_msg_check_to_free(msg, i, msg->sg.size); + sge = sk_msg_elem(msg, i); + } + consume_skb(msg->skb); + sk_msg_init(msg); + return freed; +} + +int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg) +{ + return __sk_msg_free(sk, msg, msg->sg.start, false); +} +EXPORT_SYMBOL_GPL(sk_msg_free_nocharge); + +int sk_msg_free(struct sock *sk, struct sk_msg *msg) +{ + return __sk_msg_free(sk, msg, msg->sg.start, true); +} +EXPORT_SYMBOL_GPL(sk_msg_free); + +static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, + u32 bytes, bool charge) +{ + struct scatterlist *sge; + u32 i = msg->sg.start; + + while (bytes) { + sge = sk_msg_elem(msg, i); + if (!sge->length) + break; + if (bytes < sge->length) { + if (charge) + sk_mem_uncharge(sk, bytes); + sge->length -= bytes; + sge->offset += bytes; + msg->sg.size -= bytes; + break; + } + + msg->sg.size -= sge->length; + bytes -= sge->length; + sk_msg_free_elem(sk, msg, i, charge); + sk_msg_iter_var_next(i); + sk_msg_check_to_free(msg, i, bytes); + } + msg->sg.start = i; +} + +void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes) +{ + __sk_msg_free_partial(sk, msg, bytes, true); +} +EXPORT_SYMBOL_GPL(sk_msg_free_partial); + +void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, + u32 bytes) +{ + __sk_msg_free_partial(sk, msg, bytes, false); +} + +void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) +{ + int trim = msg->sg.size - len; + u32 i = msg->sg.end; + + if (trim <= 0) { + WARN_ON(trim < 0); + return; + } + + sk_msg_iter_var_prev(i); + msg->sg.size = len; + while (msg->sg.data[i].length && + trim >= msg->sg.data[i].length) { + trim -= msg->sg.data[i].length; + sk_msg_free_elem(sk, msg, i, true); + sk_msg_iter_var_prev(i); + if (!trim) + goto out; + } + + msg->sg.data[i].length -= trim; + sk_mem_uncharge(sk, trim); + /* Adjust copybreak if it falls into the trimmed part of last buf */ + if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length) + msg->sg.copybreak = msg->sg.data[i].length; +out: + sk_msg_iter_var_next(i); + msg->sg.end = i; + + /* If we trim data a full sg elem before curr pointer update + * copybreak and current so that any future copy operations + * start at new copy location. + * However trimmed data that has not yet been used in a copy op + * does not require an update. + */ + if (!msg->sg.size) { + msg->sg.curr = msg->sg.start; + msg->sg.copybreak = 0; + } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >= + sk_msg_iter_dist(msg->sg.start, msg->sg.end)) { + sk_msg_iter_var_prev(i); + msg->sg.curr = i; + msg->sg.copybreak = msg->sg.data[i].length; + } +} +EXPORT_SYMBOL_GPL(sk_msg_trim); + +int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes) +{ + int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg); + const int to_max_pages = MAX_MSG_FRAGS; + struct page *pages[MAX_MSG_FRAGS]; + ssize_t orig, copied, use, offset; + + orig = msg->sg.size; + while (bytes > 0) { + i = 0; + maxpages = to_max_pages - num_elems; + if (maxpages == 0) { + ret = -EFAULT; + goto out; + } + + copied = iov_iter_get_pages2(from, pages, bytes, maxpages, + &offset); + if (copied <= 0) { + ret = -EFAULT; + goto out; + } + + bytes -= copied; + msg->sg.size += copied; + + while (copied) { + use = min_t(int, copied, PAGE_SIZE - offset); + sg_set_page(&msg->sg.data[msg->sg.end], + pages[i], use, offset); + sg_unmark_end(&msg->sg.data[msg->sg.end]); + sk_mem_charge(sk, use); + + offset = 0; + copied -= use; + sk_msg_iter_next(msg, end); + num_elems++; + i++; + } + /* When zerocopy is mixed with sk_msg_*copy* operations we + * may have a copybreak set in this case clear and prefer + * zerocopy remainder when possible. + */ + msg->sg.copybreak = 0; + msg->sg.curr = msg->sg.end; + } +out: + /* Revert iov_iter updates, msg will need to use 'trim' later if it + * also needs to be cleared. + */ + if (ret) + iov_iter_revert(from, msg->sg.size - orig); + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter); + +int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes) +{ + int ret = -ENOSPC, i = msg->sg.curr; + u32 copy, buf_size, copied = 0; + struct scatterlist *sge; + void *to; + + do { + sge = sk_msg_elem(msg, i); + /* This is possible if a trim operation shrunk the buffer */ + if (msg->sg.copybreak >= sge->length) { + msg->sg.copybreak = 0; + sk_msg_iter_var_next(i); + if (i == msg->sg.end) + break; + sge = sk_msg_elem(msg, i); + } + + buf_size = sge->length - msg->sg.copybreak; + copy = (buf_size > bytes) ? bytes : buf_size; + to = sg_virt(sge) + msg->sg.copybreak; + msg->sg.copybreak += copy; + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + ret = copy_from_iter_nocache(to, copy, from); + else + ret = copy_from_iter(to, copy, from); + if (ret != copy) { + ret = -EFAULT; + goto out; + } + bytes -= copy; + copied += copy; + if (!bytes) + break; + msg->sg.copybreak = 0; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); +out: + msg->sg.curr = i; + return (ret < 0) ? ret : copied; +} +EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); + +/* Receive sk_msg from psock->ingress_msg to @msg. */ +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags) +{ + struct iov_iter *iter = &msg->msg_iter; + int peek = flags & MSG_PEEK; + struct sk_msg *msg_rx; + int i, copied = 0; + + msg_rx = sk_psock_peek_msg(psock); + while (copied != len) { + struct scatterlist *sge; + + if (unlikely(!msg_rx)) + break; + + i = msg_rx->sg.start; + do { + struct page *page; + int copy; + + sge = sk_msg_elem(msg_rx, i); + copy = sge->length; + page = sg_page(sge); + if (copied + copy > len) + copy = len - copied; + if (copy) + copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (!copy) { + copied = copied ? copied : -EFAULT; + goto out; + } + + copied += copy; + if (likely(!peek)) { + sge->offset += copy; + sge->length -= copy; + if (!msg_rx->skb) { + sk_mem_uncharge(sk, copy); + atomic_sub(copy, &sk->sk_rmem_alloc); + } + msg_rx->sg.size -= copy; + + if (!sge->length) { + sk_msg_iter_var_next(i); + if (!msg_rx->skb) + put_page(page); + } + } else { + /* Lets not optimize peek case if copy_page_to_iter + * didn't copy the entire length lets just break. + */ + if (copy != sge->length) + goto out; + sk_msg_iter_var_next(i); + } + + if (copied == len) + break; + } while ((i != msg_rx->sg.end) && !sg_is_last(sge)); + + if (unlikely(peek)) { + msg_rx = sk_psock_next_msg(psock, msg_rx); + if (!msg_rx) + break; + continue; + } + + msg_rx->sg.start = i; + if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) { + msg_rx = sk_psock_dequeue_msg(psock); + kfree_sk_msg(msg_rx); + } + msg_rx = sk_psock_peek_msg(psock); + } +out: + return copied; +} +EXPORT_SYMBOL_GPL(sk_msg_recvmsg); + +bool sk_msg_is_readable(struct sock *sk) +{ + struct sk_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) + empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); + return !empty; +} +EXPORT_SYMBOL_GPL(sk_msg_is_readable); + +static struct sk_msg *alloc_sk_msg(gfp_t gfp) +{ + struct sk_msg *msg; + + msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN); + if (unlikely(!msg)) + return NULL; + sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); + return msg; +} + +static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, + struct sk_buff *skb) +{ + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) + return NULL; + + if (!sk_rmem_schedule(sk, skb, skb->truesize)) + return NULL; + + return alloc_sk_msg(GFP_KERNEL); +} + +static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, + u32 off, u32 len, + struct sk_psock *psock, + struct sock *sk, + struct sk_msg *msg, + bool take_ref) +{ + int num_sge, copied; + + /* skb_to_sgvec will fail when the total number of fragments in + * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the + * caller may aggregate multiple skbs. + */ + num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); + if (num_sge < 0) { + /* skb linearize may fail with ENOMEM, but lets simply try again + * later if this happens. Under memory pressure we don't want to + * drop the skb. We need to linearize the skb so that the mapping + * in skb_to_sgvec can not error. + * Note that skb_linearize requires the skb not to be shared. + */ + if (skb_linearize(skb)) + return -EAGAIN; + + num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); + if (unlikely(num_sge < 0)) + return num_sge; + } + +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + psock->ingress_bytes += len; +#endif + copied = len; + msg->sg.start = 0; + msg->sg.size = copied; + msg->sg.end = num_sge; + msg->skb = take_ref ? skb_get(skb) : skb; + + sk_psock_queue_msg(psock, msg); + sk_psock_data_ready(sk, psock); + return copied; +} + +static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len, bool take_ref); + +static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len) +{ + struct sock *sk = psock->sk; + struct sk_msg *msg; + int err; + + /* If we are receiving on the same sock skb->sk is already assigned, + * skip memory accounting and owner transition seeing it already set + * correctly. + */ + if (unlikely(skb->sk == sk)) + return sk_psock_skb_ingress_self(psock, skb, off, len, true); + msg = sk_psock_create_ingress_msg(sk, skb); + if (!msg) + return -EAGAIN; + + /* This will transition ownership of the data from the socket where + * the BPF program was run initiating the redirect to the socket + * we will eventually receive this data on. The data will be released + * from skb_consume found in __tcp_bpf_recvmsg() after its been copied + * into user buffers. + */ + skb_set_owner_r(skb, sk); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true); + if (err < 0) + kfree(msg); + return err; +} + +/* Puts an skb on the ingress queue of the socket already assigned to the + * skb. In this case we do not need to check memory limits or skb_set_owner_r + * because the skb is already accounted for here. + */ +static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len, bool take_ref) +{ + struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); + struct sock *sk = psock->sk; + int err; + + if (unlikely(!msg)) + return -EAGAIN; + skb_set_owner_r(skb, sk); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); + if (err < 0) + kfree(msg); + return err; +} + +static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len, bool ingress) +{ + if (!ingress) { + if (!sock_writeable(psock->sk)) + return -EAGAIN; + return skb_send_sock(psock->sk, skb, off, len); + } + + return sk_psock_skb_ingress(psock, skb, off, len); +} + +static void sk_psock_skb_state(struct sk_psock *psock, + struct sk_psock_work_state *state, + int len, int off) +{ + spin_lock_bh(&psock->ingress_lock); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + state->len = len; + state->off = off; + } + spin_unlock_bh(&psock->ingress_lock); +} + +static void sk_psock_backlog(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct sk_psock *psock = container_of(dwork, struct sk_psock, work); + struct sk_psock_work_state *state = &psock->work_state; + struct sk_buff *skb = NULL; + u32 len = 0, off = 0; + bool ingress; + int ret; + + /* If sk is quickly removed from the map and then added back, the old + * psock should not be scheduled, because there are now two psocks + * pointing to the same sk. + */ + if (!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + return; + + /* Increment the psock refcnt to synchronize with close(fd) path in + * sock_map_close(), ensuring we wait for backlog thread completion + * before sk_socket freed. If refcnt increment fails, it indicates + * sock_map_close() completed with sk_socket potentially already freed. + */ + if (!sk_psock_get(psock->sk)) + return; + mutex_lock(&psock->work_mutex); + while ((skb = skb_peek(&psock->ingress_skb))) { + len = skb->len; + off = 0; + if (skb_bpf_strparser(skb)) { + struct strp_msg *stm = strp_msg(skb); + + off = stm->offset; + len = stm->full_len; + } + + /* Resume processing from previous partial state */ + if (unlikely(state->len)) { + len = state->len; + off = state->off; + } + + ingress = skb_bpf_ingress(skb); + skb_bpf_redirect_clear(skb); + do { + ret = -EIO; + if (!sock_flag(psock->sk, SOCK_DEAD)) + ret = sk_psock_handle_skb(psock, skb, off, + len, ingress); + if (ret <= 0) { + if (ret == -EAGAIN) { + sk_psock_skb_state(psock, state, len, off); + /* Restore redir info we cleared before */ + skb_bpf_set_redir(skb, psock->sk, ingress); + /* Delay slightly to prioritize any + * other work that might be here. + */ + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_delayed_work(&psock->work, 1); + goto end; + } + /* Hard errors break pipe and stop xmit. */ + sk_psock_report_error(psock, ret ? -ret : EPIPE); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + goto end; + } + off += ret; + len -= ret; + } while (len); + + /* The entire skb sent, clear state */ + sk_psock_skb_state(psock, state, 0, 0); + skb = skb_dequeue(&psock->ingress_skb); + kfree_skb(skb); + } +end: + mutex_unlock(&psock->work_mutex); + sk_psock_put(psock->sk, psock); +} + +struct sk_psock *sk_psock_init(struct sock *sk, int node) +{ + struct sk_psock *psock; + struct proto *prot; + + write_lock_bh(&sk->sk_callback_lock); + + if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) { + psock = ERR_PTR(-EINVAL); + goto out; + } + + if (sk->sk_user_data) { + psock = ERR_PTR(-EBUSY); + goto out; + } + + psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node); + if (!psock) { + psock = ERR_PTR(-ENOMEM); + goto out; + } + + prot = READ_ONCE(sk->sk_prot); + psock->sk = sk; + psock->eval = __SK_NONE; + psock->sk_proto = prot; + psock->saved_unhash = prot->unhash; + psock->saved_destroy = prot->destroy; + psock->saved_close = prot->close; + psock->saved_write_space = sk->sk_write_space; + + INIT_LIST_HEAD(&psock->link); + spin_lock_init(&psock->link_lock); + + INIT_DELAYED_WORK(&psock->work, sk_psock_backlog); + mutex_init(&psock->work_mutex); + INIT_LIST_HEAD(&psock->ingress_msg); + spin_lock_init(&psock->ingress_lock); + skb_queue_head_init(&psock->ingress_skb); + + sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); + refcount_set(&psock->refcnt, 1); + + __rcu_assign_sk_user_data_with_flags(sk, psock, + SK_USER_DATA_NOCOPY | + SK_USER_DATA_PSOCK); + sock_hold(sk); + +out: + write_unlock_bh(&sk->sk_callback_lock); + return psock; +} +EXPORT_SYMBOL_GPL(sk_psock_init); + +struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) +{ + struct sk_psock_link *link; + + spin_lock_bh(&psock->link_lock); + link = list_first_entry_or_null(&psock->link, struct sk_psock_link, + list); + if (link) + list_del(&link->list); + spin_unlock_bh(&psock->link_lock); + return link; +} + +static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +{ + struct sk_msg *msg, *tmp; + + list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { + list_del(&msg->list); + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); + sk_msg_free(psock->sk, msg); + kfree(msg); + } +} + +static void __sk_psock_zap_ingress(struct sk_psock *psock) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { + skb_bpf_redirect_clear(skb); + sock_drop(psock->sk, skb); + } + __sk_psock_purge_ingress_msg(psock); +} + +static void sk_psock_link_destroy(struct sk_psock *psock) +{ + struct sk_psock_link *link, *tmp; + + list_for_each_entry_safe(link, tmp, &psock->link, list) { + list_del(&link->list); + sk_psock_free_link(link); + } +} + +void sk_psock_stop(struct sk_psock *psock) +{ + spin_lock_bh(&psock->ingress_lock); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + sk_psock_cork_free(psock); + spin_unlock_bh(&psock->ingress_lock); +} + +static void sk_psock_done_strp(struct sk_psock *psock); + +static void sk_psock_destroy(struct work_struct *work) +{ + struct sk_psock *psock = container_of(to_rcu_work(work), + struct sk_psock, rwork); + /* No sk_callback_lock since already detached. */ + + sk_psock_done_strp(psock); + + cancel_delayed_work_sync(&psock->work); + __sk_psock_zap_ingress(psock); + mutex_destroy(&psock->work_mutex); + + psock_progs_drop(&psock->progs); + + sk_psock_link_destroy(psock); + sk_psock_cork_free(psock); + + if (psock->sk_redir) + sock_put(psock->sk_redir); + if (psock->sk_pair) + sock_put(psock->sk_pair); + sock_put(psock->sk); + kfree(psock); +} + +void sk_psock_drop(struct sock *sk, struct sk_psock *psock) +{ + write_lock_bh(&sk->sk_callback_lock); + sk_psock_restore_proto(sk, psock); + rcu_assign_sk_user_data(sk, NULL); + if (psock->progs.stream_parser) + sk_psock_stop_strp(sk, psock); + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) + sk_psock_stop_verdict(sk, psock); + write_unlock_bh(&sk->sk_callback_lock); + + sk_psock_stop(psock); + + INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); + queue_rcu_work(system_percpu_wq, &psock->rwork); +} +EXPORT_SYMBOL_GPL(sk_psock_drop); + +static int sk_psock_map_verd(int verdict, bool redir) +{ + switch (verdict) { + case SK_PASS: + return redir ? __SK_REDIRECT : __SK_PASS; + case SK_DROP: + default: + break; + } + + return __SK_DROP; +} + +int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg) +{ + struct bpf_prog *prog; + int ret; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.msg_parser); + if (unlikely(!prog)) { + ret = __SK_PASS; + goto out; + } + + sk_msg_compute_data_pointers(msg); + msg->sk = sk; + ret = bpf_prog_run_pin_on_cpu(prog, msg); + ret = sk_psock_map_verd(ret, msg->sk_redir); + psock->apply_bytes = msg->apply_bytes; + if (ret == __SK_REDIRECT) { + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + if (!msg->sk_redir) { + ret = __SK_DROP; + goto out; + } + psock->redir_ingress = sk_msg_to_ingress(msg); + psock->sk_redir = msg->sk_redir; + sock_hold(psock->sk_redir); + } +out: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); + +static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) +{ + struct sk_psock *psock_other; + struct sock *sk_other; + + sk_other = skb_bpf_redirect_fetch(skb); + /* This error is a buggy BPF program, it returned a redirect + * return code, but then didn't set a redirect interface. + */ + if (unlikely(!sk_other)) { + skb_bpf_redirect_clear(skb); + sock_drop(from->sk, skb); + return -EIO; + } + psock_other = sk_psock(sk_other); + /* This error indicates the socket is being torn down or had another + * error that caused the pipe to break. We can't send a packet on + * a socket that is in this state so we drop the skb. + */ + if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { + skb_bpf_redirect_clear(skb); + sock_drop(from->sk, skb); + return -EIO; + } + spin_lock_bh(&psock_other->ingress_lock); + if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + spin_unlock_bh(&psock_other->ingress_lock); + skb_bpf_redirect_clear(skb); + sock_drop(from->sk, skb); + return -EIO; + } + + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_delayed_work(&psock_other->work, 0); + spin_unlock_bh(&psock_other->ingress_lock); + return 0; +} + +static void sk_psock_tls_verdict_apply(struct sk_buff *skb, + struct sk_psock *from, int verdict) +{ + switch (verdict) { + case __SK_REDIRECT: + sk_psock_skb_redirect(from, skb); + break; + case __SK_PASS: + case __SK_DROP: + default: + break; + } +} + +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) +{ + struct bpf_prog *prog; + int ret = __SK_PASS; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.stream_verdict); + if (likely(prog)) { + skb->sk = psock->sk; + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); + skb->sk = NULL; + } + sk_psock_tls_verdict_apply(skb, psock, ret); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); + +static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, + int verdict) +{ + struct sock *sk_other; + int err = 0; + u32 len, off; + + switch (verdict) { + case __SK_PASS: + err = -EIO; + sk_other = psock->sk; + if (sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + goto out_free; + + skb_bpf_set_ingress(skb); + + /* If the queue is empty then we can submit directly + * into the msg queue. If its not empty we have to + * queue work otherwise we may get OOO data. Otherwise, + * if sk_psock_skb_ingress errors will be handled by + * retrying later from workqueue. + */ + if (skb_queue_empty(&psock->ingress_skb)) { + len = skb->len; + off = 0; + if (skb_bpf_strparser(skb)) { + struct strp_msg *stm = strp_msg(skb); + + off = stm->offset; + len = stm->full_len; + } + err = sk_psock_skb_ingress_self(psock, skb, off, len, false); + } + if (err < 0) { + spin_lock_bh(&psock->ingress_lock); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + skb_queue_tail(&psock->ingress_skb, skb); + schedule_delayed_work(&psock->work, 0); + err = 0; + } + spin_unlock_bh(&psock->ingress_lock); + if (err < 0) + goto out_free; + } + break; + case __SK_REDIRECT: + tcp_eat_skb(psock->sk, skb); + err = sk_psock_skb_redirect(psock, skb); + break; + case __SK_DROP: + default: +out_free: + skb_bpf_redirect_clear(skb); + tcp_eat_skb(psock->sk, skb); + sock_drop(psock->sk, skb); + } + + return err; +} + +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk) = NULL; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_delayed_work(&psock->work, 0); + write_space = psock->saved_write_space; + } + rcu_read_unlock(); + if (write_space) + write_space(sk); +} + +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) +static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) +{ + struct sk_psock *psock; + struct bpf_prog *prog; + int ret = __SK_DROP; + struct sock *sk; + + rcu_read_lock(); + sk = strp->sk; + psock = sk_psock(sk); + if (unlikely(!psock)) { + sock_drop(sk, skb); + goto out; + } + prog = READ_ONCE(psock->progs.stream_verdict); + if (likely(prog)) { + skb->sk = sk; + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); + skb_bpf_set_strparser(skb); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); + skb->sk = NULL; + } + sk_psock_verdict_apply(psock, skb, ret); +out: + rcu_read_unlock(); +} + +static int sk_psock_strp_read_done(struct strparser *strp, int err) +{ + return err; +} + +static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) +{ + struct sk_psock *psock = container_of(strp, struct sk_psock, strp); + struct bpf_prog *prog; + int ret = skb->len; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.stream_parser); + if (likely(prog)) { + skb->sk = psock->sk; + ret = bpf_prog_run_pin_on_cpu(prog, skb); + skb->sk = NULL; + } + rcu_read_unlock(); + return ret; +} + +/* Called with socket lock held. */ +static void sk_psock_strp_data_ready(struct sock *sk) +{ + struct sk_psock *psock; + + trace_sk_data_ready(sk); + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + if (tls_sw_has_ctx_rx(sk)) { + psock->saved_data_ready(sk); + } else { + read_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->strp); + read_unlock_bh(&sk->sk_callback_lock); + } + } + rcu_read_unlock(); +} + +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + int ret; + + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + ret = strp_init(&psock->strp, sk, &cb); + if (!ret) + sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + + if (sk_is_tcp(sk)) { + psock->strp.cb.read_sock = tcp_bpf_strp_read_sock; + psock->copied_seq = tcp_sk(sk)->copied_seq; + } + return ret; +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + if (psock->saved_data_ready) + return; + + psock->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; + sk->sk_write_space = sk_psock_write_space; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + psock_set_prog(&psock->progs.stream_parser, NULL); + + if (!psock->saved_data_ready) + return; + + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; + strp_stop(&psock->strp); +} + +static void sk_psock_done_strp(struct sk_psock *psock) +{ + /* Parser has been stopped */ + if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED)) + strp_done(&psock->strp); +} +#else +static void sk_psock_done_strp(struct sk_psock *psock) +{ +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + +static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) +{ + struct sk_psock *psock; + struct bpf_prog *prog; + int ret = __SK_DROP; + int len = skb->len; + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + len = 0; + tcp_eat_skb(sk, skb); + sock_drop(sk, skb); + goto out; + } + prog = READ_ONCE(psock->progs.stream_verdict); + if (!prog) + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); + } + ret = sk_psock_verdict_apply(psock, skb, ret); + if (ret < 0) + len = ret; +out: + rcu_read_unlock(); + return len; +} + +static void sk_psock_verdict_data_ready(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + const struct proto_ops *ops; + int copied; + + trace_sk_data_ready(sk); + + if (unlikely(!sock)) + return; + ops = READ_ONCE(sock->ops); + if (!ops || !ops->read_skb) + return; + copied = ops->read_skb(sk, sk_psock_verdict_recv); + if (copied >= 0) { + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) + sk_psock_data_ready(sk, psock); + rcu_read_unlock(); + } +} + +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) +{ + if (psock->saved_data_ready) + return; + + psock->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_verdict_data_ready; + sk->sk_write_space = sk_psock_write_space; +} + +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) +{ + psock_set_prog(&psock->progs.stream_verdict, NULL); + psock_set_prog(&psock->progs.skb_verdict, NULL); + + if (!psock->saved_data_ready) + return; + + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; +} diff --git a/net/core/sock.c b/net/core/sock.c index 548d716c5f62..45c98bf524b2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -6,7 +7,6 @@ * Generic socket support routines. Memory allocators, socket lock/release * handler for protocols to use and generic option handler. * - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> @@ -81,18 +81,14 @@ * Arnaldo C. Melo : cleanups, use skb_queue_purge * * To Fix: - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> +#include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> @@ -101,6 +97,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> @@ -110,173 +107,189 @@ #include <linux/interrupt.h> #include <linux/poll.h> #include <linux/tcp.h> +#include <linux/udp.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/user_namespace.h> #include <linux/static_key.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> +#include <linux/compat.h> +#include <linux/mroute.h> +#include <linux/mroute6.h> +#include <linux/icmpv6.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> +#include <linux/skbuff_ref.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> +#include <net/proto_memory.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> #include <net/cls_cgroup.h> #include <net/netprio_cgroup.h> +#include <linux/sock_diag.h> #include <linux/filter.h> +#include <net/sock_reuseport.h> +#include <net/bpf_sk_storage.h> #include <trace/events/sock.h> -#ifdef CONFIG_INET #include <net/tcp.h> -#endif - #include <net/busy_poll.h> +#include <net/phonet/phonet.h> + +#include <linux/ethtool.h> + +#include <uapi/linux/pidfd.h> + +#include "dev.h" static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); -#ifdef CONFIG_MEMCG_KMEM -int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - struct proto *proto; - int ret = 0; - - mutex_lock(&proto_list_mutex); - list_for_each_entry(proto, &proto_list, node) { - if (proto->init_cgroup) { - ret = proto->init_cgroup(memcg, ss); - if (ret) - goto out; - } - } +static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc); +static void sock_def_write_space(struct sock *sk); - mutex_unlock(&proto_list_mutex); - return ret; -out: - list_for_each_entry_continue_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); - return ret; +/** + * sk_ns_capable - General socket capability test + * @sk: Socket to use a capability on or through + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in the user + * namespace @user_ns. + */ +bool sk_ns_capable(const struct sock *sk, + struct user_namespace *user_ns, int cap) +{ + return file_ns_capable(sk->sk_socket->file, user_ns, cap) && + ns_capable(user_ns, cap); } +EXPORT_SYMBOL(sk_ns_capable); -void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) +/** + * sk_capable - Socket global capability test + * @sk: Socket to use a capability on or through + * @cap: The global capability to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in all user + * namespaces. + */ +bool sk_capable(const struct sock *sk, int cap) { - struct proto *proto; + return sk_ns_capable(sk, &init_user_ns, cap); +} +EXPORT_SYMBOL(sk_capable); - mutex_lock(&proto_list_mutex); - list_for_each_entry_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); +/** + * sk_net_capable - Network namespace socket capability test + * @sk: Socket to use a capability on or through + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socket was created + * and the current process has the capability @cap over the network namespace + * the socket is a member of. + */ +bool sk_net_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); } -#endif +EXPORT_SYMBOL(sk_net_capable); /* * Each address family might have different locking rules, so we have - * one slock key per address family: + * one slock key per address family and separate keys for internal and + * userspace sockets. */ static struct lock_class_key af_family_keys[AF_MAX]; +static struct lock_class_key af_family_kern_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; - -#if defined(CONFIG_MEMCG_KMEM) -struct static_key memcg_socket_limit_enabled; -EXPORT_SYMBOL(memcg_socket_limit_enabled); -#endif +static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket * locks is fast): */ + +#define _sock_locks(x) \ + x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ + x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ + x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ + x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ + x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ + x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ + x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ + x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ + x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ + x "27" , x "28" , x "AF_CAN" , \ + x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ + x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ + x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ + x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ + x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ + x "AF_MCTP" , \ + x "AF_MAX" + static const char *const af_family_key_strings[AF_MAX+1] = { - "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" , - "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK", - "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" , - "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" , - "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" , - "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" , - "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , - "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , - "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , - "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , - "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , - "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , - "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , - "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX" + _sock_locks("sk_lock-") }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { - "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , - "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK", - "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" , - "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" , - "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" , - "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" , - "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" , - "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" , - "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , - "slock-27" , "slock-28" , "slock-AF_CAN" , - "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , - "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , - "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , - "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX" + _sock_locks("slock-") }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { - "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , - "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", - "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , - "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" , - "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , - "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , - "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , - "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" , - "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , - "clock-27" , "clock-28" , "clock-AF_CAN" , - "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , - "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , - "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , - "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX" + _sock_locks("clock-") +}; + +static const char *const af_family_kern_key_strings[AF_MAX+1] = { + _sock_locks("k-sk_lock-") +}; +static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { + _sock_locks("k-slock-") +}; +static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { + _sock_locks("k-clock-") +}; +static const char *const af_family_rlock_key_strings[AF_MAX+1] = { + _sock_locks("rlock-") +}; +static const char *const af_family_wlock_key_strings[AF_MAX+1] = { + _sock_locks("wlock-") +}; +static const char *const af_family_elock_key_strings[AF_MAX+1] = { + _sock_locks("elock-") }; /* - * sk_callback_lock locking rules are per-address-family, + * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; - -/* Take into consideration the size of the struct sk_buff overhead in the - * determination of these values, since that is non-constant across - * platforms. This makes socket queueing behavior and performance - * not depend upon such differences. - */ -#define _SK_MEM_PACKETS 256 -#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +static struct lock_class_key af_rlock_keys[AF_MAX]; +static struct lock_class_key af_wlock_keys[AF_MAX]; +static struct lock_class_key af_elock_keys[AF_MAX]; +static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. */ -__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; +__u32 sysctl_wmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_wmem_max); -__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; +__u32 sysctl_rmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_rmem_max); -__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; -__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; - -/* Maximal space eaten by iovec or ancillary data plus some space */ -int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); -EXPORT_SYMBOL(sysctl_optmem_max); +__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT; +__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT; -struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; -EXPORT_SYMBOL_GPL(memalloc_socks); +DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); +EXPORT_SYMBOL_GPL(memalloc_socks_key); /** * sk_set_memalloc - sets %SOCK_MEMALLOC @@ -290,7 +303,7 @@ void sk_set_memalloc(struct sock *sk) { sock_set_flag(sk, SOCK_MEMALLOC); sk->sk_allocation |= __GFP_MEMALLOC; - static_key_slow_inc(&memalloc_socks); + static_branch_inc(&memalloc_socks_key); } EXPORT_SYMBOL_GPL(sk_set_memalloc); @@ -298,53 +311,135 @@ void sk_clear_memalloc(struct sock *sk) { sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; - static_key_slow_dec(&memalloc_socks); + static_branch_dec(&memalloc_socks_key); /* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward - * progress of swapping. However, if SOCK_MEMALLOC is cleared while - * it has rmem allocations there is a risk that the user of the - * socket cannot make forward progress due to exceeding the rmem - * limits. By rights, sk_clear_memalloc() should only be called - * on sockets being torn down but warn and reset the accounting if - * that assumption breaks. + * progress of swapping. SOCK_MEMALLOC may be cleared while + * it has rmem allocations due to the last swapfile being deactivated + * but there is a risk that the socket is unusable due to exceeding + * the rmem limits. Reclaim the reserves and obey rmem limits again. */ - if (WARN_ON(sk->sk_forward_alloc)) - sk_mem_reclaim(sk); + sk_mem_reclaim(sk); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); - current->flags |= PF_MEMALLOC; - ret = sk->sk_backlog_rcv(sk, skb); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + noreclaim_flag = memalloc_noreclaim_save(); + ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, + tcp_v6_do_rcv, + tcp_v4_do_rcv, + sk, skb); + memalloc_noreclaim_restore(noreclaim_flag); return ret; } EXPORT_SYMBOL(__sk_backlog_rcv); -static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) +void sk_error_report(struct sock *sk) { - struct timeval tv; + sk->sk_error_report(sk); + + switch (sk->sk_family) { + case AF_INET: + fallthrough; + case AF_INET6: + trace_inet_sk_error_report(sk); + break; + default: + break; + } +} +EXPORT_SYMBOL(sk_error_report); + +int sock_get_timeout(long timeo, void *optval, bool old_timeval) +{ + struct __kernel_sock_timeval tv; + + if (timeo == MAX_SCHEDULE_TIMEOUT) { + tv.tv_sec = 0; + tv.tv_usec = 0; + } else { + tv.tv_sec = timeo / HZ; + tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; + } + + if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { + struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; + *(struct old_timeval32 *)optval = tv32; + return sizeof(tv32); + } + + if (old_timeval) { + struct __kernel_old_timeval old_tv; + old_tv.tv_sec = tv.tv_sec; + old_tv.tv_usec = tv.tv_usec; + *(struct __kernel_old_timeval *)optval = old_tv; + return sizeof(old_tv); + } + + *(struct __kernel_sock_timeval *)optval = tv; + return sizeof(tv); +} +EXPORT_SYMBOL(sock_get_timeout); + +int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, + sockptr_t optval, int optlen, bool old_timeval) +{ + if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { + struct old_timeval32 tv32; + + if (optlen < sizeof(tv32)) + return -EINVAL; + + if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) + return -EFAULT; + tv->tv_sec = tv32.tv_sec; + tv->tv_usec = tv32.tv_usec; + } else if (old_timeval) { + struct __kernel_old_timeval old_tv; + + if (optlen < sizeof(old_tv)) + return -EINVAL; + if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) + return -EFAULT; + tv->tv_sec = old_tv.tv_sec; + tv->tv_usec = old_tv.tv_usec; + } else { + if (optlen < sizeof(*tv)) + return -EINVAL; + if (copy_from_sockptr(tv, optval, sizeof(*tv))) + return -EFAULT; + } + + return 0; +} +EXPORT_SYMBOL(sock_copy_user_timeval); + +static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, + bool old_timeval) +{ + struct __kernel_sock_timeval tv; + int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); + long val; + + if (err) + return err; - if (optlen < sizeof(tv)) - return -EINVAL; - if (copy_from_user(&tv, optval, sizeof(tv))) - return -EFAULT; if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) return -EDOM; if (tv.tv_sec < 0) { static int warned __read_mostly; - *timeo_p = 0; + WRITE_ONCE(*timeo_p, 0); if (warned < 10 && net_ratelimit()) { warned++; pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", @@ -352,97 +447,123 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) } return 0; } - *timeo_p = MAX_SCHEDULE_TIMEOUT; - if (tv.tv_sec == 0 && tv.tv_usec == 0) - return 0; - if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) - *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); + val = MAX_SCHEDULE_TIMEOUT; + if ((tv.tv_sec || tv.tv_usec) && + (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) + val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, + USEC_PER_SEC / HZ); + WRITE_ONCE(*timeo_p, val); return 0; } -static void sock_warn_obsolete_bsdism(const char *name) +static bool sk_set_prio_allowed(const struct sock *sk, int val) { - static int warned; - static char warncomm[TASK_COMM_LEN]; - if (strcmp(warncomm, current->comm) && warned < 5) { - strcpy(warncomm, current->comm); - pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n", - warncomm, name); - warned++; - } + return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); } -#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) +static bool sock_needs_netstamp(const struct sock *sk) +{ + switch (sk->sk_family) { + case AF_UNSPEC: + case AF_UNIX: + return false; + default: + return true; + } +} static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { sk->sk_flags &= ~flags; - if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP)) + if (sock_needs_netstamp(sk) && + !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); } } -int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { - int err; - int skb_len; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { - atomic_inc(&sk->sk_drops); + if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { + sk_drops_inc(sk); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } - err = sk_filter(sk, skb); - if (err) - return err; - if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return -ENOBUFS; } skb->dev = NULL; skb_set_owner_r(skb, sk); - /* Cache the SKB length before we tack it onto the receive - * queue. Once it is added it no longer belongs to us and - * may be freed by other threads of control pulling packets - * from the queue. - */ - skb_len = skb->len; - /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ skb_dst_force(skb); spin_lock_irqsave(&list->lock, flags); - skb->dropcount = atomic_read(&sk->sk_drops); + sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); + sk->sk_data_ready(sk); return 0; } -EXPORT_SYMBOL(sock_queue_rcv_skb); +EXPORT_SYMBOL(__sock_queue_rcv_skb); -int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) +int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason *reason) { + enum skb_drop_reason drop_reason; + int err; + + err = sk_filter_reason(sk, skb, &drop_reason); + if (err) + goto out; + + err = __sock_queue_rcv_skb(sk, skb); + switch (err) { + case -ENOMEM: + drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; + break; + case -ENOBUFS: + drop_reason = SKB_DROP_REASON_PROTO_MEM; + break; + default: + drop_reason = SKB_NOT_DROPPED_YET; + break; + } +out: + if (reason) + *reason = drop_reason; + return err; +} +EXPORT_SYMBOL(sock_queue_rcv_skb_reason); + +int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, + const int nested, unsigned int trim_cap, bool refcounted) +{ + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; int rc = NET_RX_SUCCESS; + int err; - if (sk_filter(sk, skb)) + if (sk_filter_trim_cap(sk, skb, trim_cap, &reason)) goto discard_and_relse; skb->dev = NULL; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { + sk_drops_inc(sk); + reason = SKB_DROP_REASON_SOCKET_RCVBUFF; goto discard_and_relse; } if (nested) @@ -457,35 +578,41 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) rc = sk_backlog_rcv(sk, skb); - mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { + mutex_release(&sk->sk_lock.dep_map, _RET_IP_); + } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) { bh_unlock_sock(sk); - atomic_inc(&sk->sk_drops); + if (err == -ENOMEM) + reason = SKB_DROP_REASON_PFMEMALLOC; + if (err == -ENOBUFS) + reason = SKB_DROP_REASON_SOCKET_BACKLOG; + sk_drops_inc(sk); goto discard_and_relse; } bh_unlock_sock(sk); out: - sock_put(sk); + if (refcounted) + sock_put(sk); return rc; discard_and_relse: - kfree_skb(skb); + sk_skb_reason_drop(sk, skb, reason); goto out; } -EXPORT_SYMBOL(sk_receive_skb); - -void sk_reset_txq(struct sock *sk) -{ - sk_tx_queue_clear(sk); -} -EXPORT_SYMBOL(sk_reset_txq); +EXPORT_SYMBOL(__sk_receive_skb); +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, + u32)); +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, + u32)); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + if (dst && READ_ONCE(dst->obsolete) && + INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, + dst, cookie) == NULL) { sk_tx_queue_clear(sk); + WRITE_ONCE(sk->sk_dst_pending_confirm, 0); RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; @@ -499,7 +626,9 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + if (dst && READ_ONCE(dst->obsolete) && + INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, + dst, cookie) == NULL) { sk_dst_reset(sk); dst_release(dst); return NULL; @@ -509,21 +638,59 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) } EXPORT_SYMBOL(sk_dst_check); -static int sock_setbindtodevice(struct sock *sk, char __user *optval, - int optlen) +static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); - char devname[IFNAMSIZ]; - int index; /* Sorry... */ ret = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_RAW)) + if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; + if (ifindex < 0) + goto out; + + /* Paired with all READ_ONCE() done locklessly. */ + WRITE_ONCE(sk->sk_bound_dev_if, ifindex); + + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + sk_dst_reset(sk); + + ret = 0; + +out: +#endif + + return ret; +} + +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) +{ + int ret; + + if (lock_sk) + lock_sock(sk); + ret = sock_bindtoindex_locked(sk, ifindex); + if (lock_sk) + release_sock(sk); + + return ret; +} +EXPORT_SYMBOL(sock_bindtoindex); + +static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) +{ + int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + struct net *net = sock_net(sk); + char devname[IFNAMSIZ]; + int index; + + ret = -EINVAL; if (optlen < 0) goto out; @@ -537,7 +704,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, memset(devname, 0, sizeof(devname)); ret = -EFAULT; - if (copy_from_user(devname, optval, optlen)) + if (copy_from_sockptr(devname, optval, optlen)) goto out; index = 0; @@ -554,28 +721,25 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; } - lock_sock(sk); - sk->sk_bound_dev_if = index; - sk_dst_reset(sk); - release_sock(sk); - - ret = 0; - + sockopt_lock_sock(sk); + ret = sock_bindtoindex_locked(sk, index); + sockopt_release_sock(sk); out: #endif return ret; } -static int sock_getbindtodevice(struct sock *sk, char __user *optval, - int __user *optlen, int len) +static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES + int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); struct net *net = sock_net(sk); char devname[IFNAMSIZ]; - if (sk->sk_bound_dev_if == 0) { + if (bound_dev_if == 0) { len = 0; goto zero; } @@ -584,19 +748,19 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, if (len < IFNAMSIZ) goto out; - ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); + ret = netdev_get_name(net, devname, bound_dev_if); if (ret) goto out; len = strlen(devname) + 1; ret = -EFAULT; - if (copy_to_user(optval, devname, len)) + if (copy_to_sockptr(optval, devname, len)) goto out; zero: ret = -EFAULT; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) goto out; ret = 0; @@ -607,12 +771,418 @@ out: return ret; } -static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) +bool sk_mc_loop(const struct sock *sk) +{ + if (dev_recursion_level()) + return false; + if (!sk) + return true; + /* IPV6_ADDRFORM can change sk->sk_family under us. */ + switch (READ_ONCE(sk->sk_family)) { + case AF_INET: + return inet_test_bit(MC_LOOP, sk); +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return inet6_test_bit(MC6_LOOP, sk); +#endif + } + WARN_ON_ONCE(1); + return true; +} +EXPORT_SYMBOL(sk_mc_loop); + +void sock_set_reuseaddr(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuse = SK_CAN_REUSE; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseaddr); + +void sock_set_reuseport(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuseport = true; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseport); + +void sock_no_linger(struct sock *sk) +{ + lock_sock(sk); + WRITE_ONCE(sk->sk_lingertime, 0); + sock_set_flag(sk, SOCK_LINGER); + release_sock(sk); +} +EXPORT_SYMBOL(sock_no_linger); + +void sock_set_priority(struct sock *sk, u32 priority) +{ + WRITE_ONCE(sk->sk_priority, priority); +} +EXPORT_SYMBOL(sock_set_priority); + +void sock_set_sndtimeo(struct sock *sk, s64 secs) +{ + if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) + WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); + else + WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); +} +EXPORT_SYMBOL(sock_set_sndtimeo); + +static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) +{ + sock_valbool_flag(sk, SOCK_RCVTSTAMP, val); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns); + if (val) { + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } +} + +void sock_set_timestamp(struct sock *sk, int optname, bool valbool) +{ + switch (optname) { + case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); + break; + case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; + case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; + case SO_TIMESTAMPNS_NEW: + __sock_set_timestamps(sk, valbool, true, true); + break; + } +} + +static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) { - if (valbool) - sock_set_flag(sk, bit); + struct net *net = sock_net(sk); + struct net_device *dev = NULL; + bool match = false; + int *vclock_index; + int i, num; + + if (sk->sk_bound_dev_if) + dev = dev_get_by_index(net, sk->sk_bound_dev_if); + + if (!dev) { + pr_err("%s: sock not bind to device\n", __func__); + return -EOPNOTSUPP; + } + + num = ethtool_get_phc_vclocks(dev, &vclock_index); + dev_put(dev); + + for (i = 0; i < num; i++) { + if (*(vclock_index + i) == phc_index) { + match = true; + break; + } + } + + if (num > 0) + kfree(vclock_index); + + if (!match) + return -EINVAL; + + WRITE_ONCE(sk->sk_bind_phc, phc_index); + + return 0; +} + +int sock_set_timestamping(struct sock *sk, int optname, + struct so_timestamping timestamping) +{ + int val = timestamping.flags; + int ret; + + if (val & ~SOF_TIMESTAMPING_MASK) + return -EINVAL; + + if (val & SOF_TIMESTAMPING_OPT_ID_TCP && + !(val & SOF_TIMESTAMPING_OPT_ID)) + return -EINVAL; + + if (val & SOF_TIMESTAMPING_OPT_ID && + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { + if (sk_is_tcp(sk)) { + if ((1 << sk->sk_state) & + (TCPF_CLOSE | TCPF_LISTEN)) + return -EINVAL; + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) + atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); + else + atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); + } else { + atomic_set(&sk->sk_tskey, 0); + } + } + + if (val & SOF_TIMESTAMPING_OPT_STATS && + !(val & SOF_TIMESTAMPING_OPT_TSONLY)) + return -EINVAL; + + if (val & SOF_TIMESTAMPING_BIND_PHC) { + ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); + if (ret) + return ret; + } + + WRITE_ONCE(sk->sk_tsflags, val); + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY)); + + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) + sock_enable_timestamp(sk, + SOCK_TIMESTAMPING_RX_SOFTWARE); else - sock_reset_flag(sk, bit); + sock_disable_timestamp(sk, + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); + return 0; +} + +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ + struct bpf_sock_ops_kern sock_ops; + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = op; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, 0); + __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); +} +#endif + +void sock_set_keepalive(struct sock *sk) +{ + lock_sock(sk); + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, true); + sock_valbool_flag(sk, SOCK_KEEPOPEN, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_keepalive); + +static void __sock_set_rcvbuf(struct sock *sk, int val) +{ + /* Ensure val * 2 fits into an int, to prevent max_t() from treating it + * as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + + /* We double it on the way in to account for "struct sk_buff" etc. + * overhead. Applications assume that the SO_RCVBUF setting they make + * will allow that much actual data to be received on that socket. + * + * Applications are unaware that "struct sk_buff" and other overheads + * allocate from the receive buffer during socket buffer allocation. + * + * And after considering the possible alternatives, returning the value + * we actually used in getsockopt is the most desirable behavior. + */ + WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); +} + +void sock_set_rcvbuf(struct sock *sk, int val) +{ + lock_sock(sk); + __sock_set_rcvbuf(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_rcvbuf); + +static void __sock_set_mark(struct sock *sk, u32 val) +{ + if (val != sk->sk_mark) { + WRITE_ONCE(sk->sk_mark, val); + sk_dst_reset(sk); + } +} + +void sock_set_mark(struct sock *sk, u32 val) +{ + lock_sock(sk); + __sock_set_mark(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_mark); + +static void sock_release_reserved_memory(struct sock *sk, int bytes) +{ + /* Round down bytes to multiple of pages */ + bytes = round_down(bytes, PAGE_SIZE); + + WARN_ON(bytes > sk->sk_reserved_mem); + WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); + sk_mem_reclaim(sk); +} + +static int sock_reserve_memory(struct sock *sk, int bytes) +{ + long allocated; + bool charged; + int pages; + + if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) + return -EOPNOTSUPP; + + if (!bytes) + return 0; + + pages = sk_mem_pages(bytes); + + /* pre-charge to memcg */ + charged = mem_cgroup_sk_charge(sk, pages, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); + if (!charged) + return -ENOMEM; + + if (sk->sk_bypass_prot_mem) + goto success; + + /* pre-charge to forward_alloc */ + sk_memory_allocated_add(sk, pages); + allocated = sk_memory_allocated(sk); + + /* If the system goes into memory pressure with this + * precharge, give up and return error. + */ + if (allocated > sk_prot_mem_limits(sk, 1)) { + sk_memory_allocated_sub(sk, pages); + mem_cgroup_sk_uncharge(sk, pages); + return -ENOMEM; + } + +success: + sk_forward_alloc_add(sk, pages << PAGE_SHIFT); + + WRITE_ONCE(sk->sk_reserved_mem, + sk->sk_reserved_mem + (pages << PAGE_SHIFT)); + + return 0; +} + +#ifdef CONFIG_PAGE_POOL + +/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED + * in 1 syscall. The limit exists to limit the amount of memory the kernel + * allocates to copy these tokens, and to prevent looping over the frags for + * too long. + */ +#define MAX_DONTNEED_TOKENS 128 +#define MAX_DONTNEED_FRAGS 1024 + +static noinline_for_stack int +sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) +{ + unsigned int num_tokens, i, j, k, netmem_num = 0; + struct dmabuf_token *tokens; + int ret = 0, num_frags = 0; + netmem_ref netmems[16]; + + if (!sk_is_tcp(sk)) + return -EBADF; + + if (optlen % sizeof(*tokens) || + optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) + return -EINVAL; + + num_tokens = optlen / sizeof(*tokens); + tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); + if (!tokens) + return -ENOMEM; + + if (copy_from_sockptr(tokens, optval, optlen)) { + kvfree(tokens); + return -EFAULT; + } + + xa_lock_bh(&sk->sk_user_frags); + for (i = 0; i < num_tokens; i++) { + for (j = 0; j < tokens[i].token_count; j++) { + if (++num_frags > MAX_DONTNEED_FRAGS) + goto frag_limit_reached; + + netmem_ref netmem = (__force netmem_ref)__xa_erase( + &sk->sk_user_frags, tokens[i].token_start + j); + + if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + continue; + + netmems[netmem_num++] = netmem; + if (netmem_num == ARRAY_SIZE(netmems)) { + xa_unlock_bh(&sk->sk_user_frags); + for (k = 0; k < netmem_num; k++) + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); + netmem_num = 0; + xa_lock_bh(&sk->sk_user_frags); + } + ret++; + } + } + +frag_limit_reached: + xa_unlock_bh(&sk->sk_user_frags); + for (k = 0; k < netmem_num; k++) + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); + + kvfree(tokens); + return ret; +} +#endif + +void sockopt_lock_sock(struct sock *sk) +{ + /* When current->bpf_ctx is set, the setsockopt is called from + * a bpf prog. bpf has ensured the sk lock has been + * acquired before calling setsockopt(). + */ + if (has_current_bpf_ctx()) + return; + + lock_sock(sk); +} +EXPORT_SYMBOL(sockopt_lock_sock); + +void sockopt_release_sock(struct sock *sk) +{ + if (has_current_bpf_ctx()) + return; + + release_sock(sk); +} +EXPORT_SYMBOL(sockopt_release_sock); + +bool sockopt_ns_capable(struct user_namespace *ns, int cap) +{ + return has_current_bpf_ctx() || ns_capable(ns, cap); +} +EXPORT_SYMBOL(sockopt_ns_capable); + +bool sockopt_capable(int cap) +{ + return has_current_bpf_ctx() || capable(cap); +} +EXPORT_SYMBOL(sockopt_capable); + +static int sockopt_validate_clockid(__kernel_clockid_t value) +{ + switch (value) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_TAI: + return 0; + } + return -EINVAL; } /* @@ -620,10 +1190,12 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) * at the socket level. Everything here is generic. */ -int sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) +int sk_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { - struct sock *sk = sock->sk; + struct so_timestamping timestamping; + struct socket *sock = sk->sk_socket; + struct sock_txtime sk_txtime; int val; int valbool; struct linger ling; @@ -639,16 +1211,107 @@ int sock_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 1 : 0; - lock_sock(sk); + /* handle options which do not require locking the socket. */ + switch (optname) { + case SO_PRIORITY: + if (sk_set_prio_allowed(sk, val)) { + sock_set_priority(sk, val); + return 0; + } + return -EPERM; + case SO_TYPE: + case SO_PROTOCOL: + case SO_DOMAIN: + case SO_ERROR: + return -ENOPROTOOPT; +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: + if (val < 0) + return -EINVAL; + WRITE_ONCE(sk->sk_ll_usec, val); + return 0; + case SO_PREFER_BUSY_POLL: + if (valbool && !sockopt_capable(CAP_NET_ADMIN)) + return -EPERM; + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + return 0; + case SO_BUSY_POLL_BUDGET: + if (val > READ_ONCE(sk->sk_busy_poll_budget) && + !sockopt_capable(CAP_NET_ADMIN)) + return -EPERM; + if (val < 0 || val > U16_MAX) + return -EINVAL; + WRITE_ONCE(sk->sk_busy_poll_budget, val); + return 0; +#endif + case SO_MAX_PACING_RATE: + { + unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; + unsigned long pacing_rate; + + if (sizeof(ulval) != sizeof(val) && + optlen >= sizeof(ulval) && + copy_from_sockptr(&ulval, optval, sizeof(ulval))) { + return -EFAULT; + } + if (ulval != ~0UL) + cmpxchg(&sk->sk_pacing_status, + SK_PACING_NONE, + SK_PACING_NEEDED); + /* Pairs with READ_ONCE() from sk_getsockopt() */ + WRITE_ONCE(sk->sk_max_pacing_rate, ulval); + pacing_rate = READ_ONCE(sk->sk_pacing_rate); + if (ulval < pacing_rate) + WRITE_ONCE(sk->sk_pacing_rate, ulval); + return 0; + } + case SO_TXREHASH: + if (!sk_is_tcp(sk)) + return -EOPNOTSUPP; + if (val < -1 || val > 1) + return -EINVAL; + if ((u8)val == SOCK_TXREHASH_DEFAULT) + val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); + /* Paired with READ_ONCE() in tcp_rtx_synack() + * and sk_getsockopt(). + */ + WRITE_ONCE(sk->sk_txrehash, (u8)val); + return 0; + case SO_PEEK_OFF: + { + int (*set_peek_off)(struct sock *sk, int val); + + set_peek_off = READ_ONCE(sock->ops)->set_peek_off; + if (set_peek_off) + ret = set_peek_off(sk, val); + else + ret = -EOPNOTSUPP; + return ret; + } +#ifdef CONFIG_PAGE_POOL + case SO_DEVMEM_DONTNEED: + return sock_devmem_dontneed(sk, optval, optlen); +#endif + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + return sock_set_timeout(&sk->sk_sndtimeo, optval, + optlen, optname == SO_SNDTIMEO_OLD); + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + return sock_set_timeout(&sk->sk_rcvtimeo, optval, + optlen, optname == SO_RCVTIMEO_OLD); + } + + sockopt_lock_sock(sk); switch (optname) { case SO_DEBUG: - if (val && !capable(CAP_NET_ADMIN)) + if (val && !sockopt_capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); @@ -657,16 +1320,14 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: - sk->sk_reuseport = valbool; - break; - case SO_TYPE: - case SO_PROTOCOL: - case SO_DOMAIN: - case SO_ERROR: - ret = -ENOPROTOOPT; + if (valbool && !sk_is_inet(sk)) + ret = -EOPNOTSUPP; + else + sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); @@ -677,19 +1338,30 @@ int sock_setsockopt(struct socket *sock, int level, int optname, * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_wmem_max); + val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); set_sndbuf: + /* Ensure val * 2 fits into an int, to prevent max_t() + * from treating it as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF); + WRITE_ONCE(sk->sk_sndbuf, + max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; case SO_SNDBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } + + /* No negative values (to prevent underflow, as val will be + * multiplied by 2). + */ + if (val < 0) + val = 0; goto set_sndbuf; case SO_RCVBUF: @@ -698,40 +1370,24 @@ set_sndbuf: * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_rmem_max); -set_rcvbuf: - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - /* - * We double it on the way in to account for - * "struct sk_buff" etc. overhead. Applications - * assume that the SO_RCVBUF setting they make will - * allow that much actual data to be received on that - * socket. - * - * Applications are unaware that "struct sk_buff" and - * other overheads allocate from the receive buffer - * during socket buffer allocation. - * - * And after considering the possible alternatives, - * returning the value we actually used in getsockopt - * is the most desirable behavior. - */ - sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF); + __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); break; case SO_RCVBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } - goto set_rcvbuf; + + /* No negative values (to prevent underflow, as val will be + * multiplied by 2). + */ + __sock_set_rcvbuf(sk, max(val, 0)); + break; case SO_KEEPALIVE: -#ifdef CONFIG_INET - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) - tcp_set_keepalive(sk, valbool); -#endif + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; @@ -740,15 +1396,7 @@ set_rcvbuf: break; case SO_NO_CHECK: - sk->sk_no_check = valbool; - break; - - case SO_PRIORITY: - if ((val >= 0 && val <= 6) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - sk->sk_priority = val; - else - ret = -EPERM; + sk->sk_no_check_tx = valbool; break; case SO_LINGER: @@ -756,101 +1404,108 @@ set_rcvbuf: ret = -EINVAL; /* 1003.1g */ break; } - if (copy_from_user(&ling, optval, sizeof(ling))) { + if (copy_from_sockptr(&ling, optval, sizeof(ling))) { ret = -EFAULT; break; } - if (!ling.l_onoff) + if (!ling.l_onoff) { sock_reset_flag(sk, SOCK_LINGER); - else { -#if (BITS_PER_LONG == 32) - if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) - sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + } else { + unsigned long t_sec = ling.l_linger; + + if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) + WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); else -#endif - sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; + WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); sock_set_flag(sk, SOCK_LINGER); } break; case SO_BSDCOMPAT: - sock_warn_obsolete_bsdism("setsockopt"); break; - case SO_PASSCRED: - if (valbool) - set_bit(SOCK_PASSCRED, &sock->flags); - else - clear_bit(SOCK_PASSCRED, &sock->flags); + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + sock_set_timestamp(sk, optname, valbool); break; - case SO_TIMESTAMP: - case SO_TIMESTAMPNS: - if (valbool) { - if (optname == SO_TIMESTAMP) - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - else - sock_set_flag(sk, SOCK_RCVTSTAMPNS); - sock_set_flag(sk, SOCK_RCVTSTAMP); - sock_enable_timestamp(sk, SOCK_TIMESTAMP); + case SO_TIMESTAMPING_NEW: + case SO_TIMESTAMPING_OLD: + if (optlen == sizeof(timestamping)) { + if (copy_from_sockptr(×tamping, optval, + sizeof(timestamping))) { + ret = -EFAULT; + break; + } } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + memset(×tamping, 0, sizeof(timestamping)); + timestamping.flags = val; } - break; - - case SO_TIMESTAMPING: - if (val & ~SOF_TIMESTAMPING_MASK) { - ret = -EINVAL; - break; - } - sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE, - val & SOF_TIMESTAMPING_TX_HARDWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE, - val & SOF_TIMESTAMPING_TX_SOFTWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE, - val & SOF_TIMESTAMPING_RX_HARDWARE); - if (val & SOF_TIMESTAMPING_RX_SOFTWARE) - sock_enable_timestamp(sk, - SOCK_TIMESTAMPING_RX_SOFTWARE); - else - sock_disable_timestamp(sk, - (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, - val & SOF_TIMESTAMPING_SOFTWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, - val & SOF_TIMESTAMPING_SYS_HARDWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE, - val & SOF_TIMESTAMPING_RAW_HARDWARE); + ret = sock_set_timestamping(sk, optname, timestamping); break; case SO_RCVLOWAT: + { + int (*set_rcvlowat)(struct sock *sk, int val) = NULL; + if (val < 0) val = INT_MAX; - sk->sk_rcvlowat = val ? : 1; + if (sock) + set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; + if (set_rcvlowat) + ret = set_rcvlowat(sk, val); + else + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; + } + case SO_ATTACH_FILTER: { + struct sock_fprog fprog; - case SO_RCVTIMEO: - ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); + ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); + if (!ret) + ret = sk_attach_filter(&fprog, sk); break; + } + case SO_ATTACH_BPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; - case SO_SNDTIMEO: - ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); + ret = -EFAULT; + if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_attach_bpf(ufd, sk); + } break; - case SO_ATTACH_FILTER: + case SO_ATTACH_REUSEPORT_CBPF: { + struct sock_fprog fprog; + + ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); + if (!ret) + ret = sk_reuseport_attach_filter(&fprog, sk); + break; + } + case SO_ATTACH_REUSEPORT_EBPF: ret = -EINVAL; - if (optlen == sizeof(struct sock_fprog)) { - struct sock_fprog fprog; + if (optlen == sizeof(u32)) { + u32 ufd; ret = -EFAULT; - if (copy_from_user(&fprog, optval, sizeof(fprog))) + if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; - ret = sk_attach_filter(&fprog, sk); + ret = sk_reuseport_attach_bpf(ufd, sk); } break; + case SO_DETACH_REUSEPORT_BPF: + ret = reuseport_detach_prog(sk); + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; @@ -862,21 +1517,23 @@ set_rcvbuf: sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); break; - case SO_PASSSEC: - if (valbool) - set_bit(SOCK_PASSSEC, &sock->flags); - else - clear_bit(SOCK_PASSSEC, &sock->flags); - break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; - else - sk->sk_mark = val; + break; + } + + __sock_set_mark(sk, val); + break; + case SO_RCVMARK: + sock_valbool_flag(sk, SOCK_RCVMARK, valbool); + break; + + case SO_RCVPRIORITY: + sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); break; - /* We implement the SO_SNDLOWAT etc to - not be settable (1003.1g 5.3) */ case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; @@ -885,13 +1542,6 @@ set_rcvbuf: sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break; - case SO_PEEK_OFF: - if (sock->ops->set_peek_off) - sock->ops->set_peek_off(sk, val); - else - ret = -EOPNOTSUPP; - break; - case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; @@ -900,31 +1550,152 @@ set_rcvbuf: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; -#ifdef CONFIG_NET_LL_RX_POLL - case SO_BUSY_POLL: - /* allow unprivileged users to decrease the value */ - if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) - ret = -EPERM; - else { - if (val < 0) + case SO_PASSCRED: + if (sk_may_scm_recv(sk)) + sk->sk_scm_credentials = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSSEC: + if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) + sk->sk_scm_security = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSPIDFD: + if (sk_is_unix(sk)) + sk->sk_scm_pidfd = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSRIGHTS: + if (sk_is_unix(sk)) + sk->sk_scm_rights = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_INCOMING_CPU: + reuseport_update_incoming_cpu(sk, val); + break; + + case SO_CNX_ADVICE: + if (val == 1) + dst_negative_advice(sk); + break; + + case SO_ZEROCOPY: + if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { + if (!(sk_is_tcp(sk) || + (sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP))) + ret = -EOPNOTSUPP; + } else if (sk->sk_family != PF_RDS) { + ret = -EOPNOTSUPP; + } + if (!ret) { + if (val < 0 || val > 1) ret = -EINVAL; else - sk->sk_ll_usec = val; + sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); } break; -#endif + + case SO_TXTIME: + if (optlen != sizeof(struct sock_txtime)) { + ret = -EINVAL; + break; + } else if (copy_from_sockptr(&sk_txtime, optval, + sizeof(struct sock_txtime))) { + ret = -EFAULT; + break; + } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { + ret = -EINVAL; + break; + } + /* CLOCK_MONOTONIC is only used by sch_fq, and this packet + * scheduler has enough safe guards. + */ + if (sk_txtime.clockid != CLOCK_MONOTONIC && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + + ret = sockopt_validate_clockid(sk_txtime.clockid); + if (ret) + break; + + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); + break; + + case SO_BINDTOIFINDEX: + ret = sock_bindtoindex_locked(sk, val); + break; + + case SO_BUF_LOCK: + if (val & ~SOCK_BUF_LOCK_MASK) { + ret = -EINVAL; + break; + } + sk->sk_userlocks = val | (sk->sk_userlocks & + ~SOCK_BUF_LOCK_MASK); + break; + + case SO_RESERVE_MEM: + { + int delta; + + if (val < 0) { + ret = -EINVAL; + break; + } + + delta = val - sk->sk_reserved_mem; + if (delta < 0) + sock_release_reserved_memory(sk, -delta); + else + ret = sock_reserve_memory(sk, delta); + break; + } + default: ret = -ENOPROTOOPT; break; } - release_sock(sk); + sockopt_release_sock(sk); return ret; } + +int sock_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + return sk_setsockopt(sock->sk, level, optname, + optval, optlen); +} EXPORT_SYMBOL(sock_setsockopt); +static const struct cred *sk_get_peer_cred(struct sock *sk) +{ + const struct cred *cred; + + spin_lock(&sk->sk_peer_lock); + cred = get_cred(sk->sk_peer_cred); + spin_unlock(&sk->sk_peer_lock); -void cred_to_ucred(struct pid *pid, const struct cred *cred, - struct ucred *ucred) + return cred; +} + +static void cred_to_ucred(struct pid *pid, const struct cred *cred, + struct ucred *ucred) { ucred->pid = pid_vnr(pid); ucred->uid = ucred->gid = -1; @@ -935,23 +1706,43 @@ void cred_to_ucred(struct pid *pid, const struct cred *cred, ucred->gid = from_kgid_munged(current_ns, cred->egid); } } -EXPORT_SYMBOL_GPL(cred_to_ucred); -int sock_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) +static int groups_to_user(sockptr_t dst, const struct group_info *src) { - struct sock *sk = sock->sk; + struct user_namespace *user_ns = current_user_ns(); + int i; + + for (i = 0; i < src->ngroups; i++) { + gid_t gid = from_kgid_munged(user_ns, src->gid[i]); + + if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) + return -EFAULT; + } + + return 0; +} + +int sk_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen) +{ + struct socket *sock = sk->sk_socket; union { int val; + u64 val64; + unsigned long ulval; struct linger ling; - struct timeval tm; + struct old_timeval32 tm32; + struct __kernel_old_timeval tm; + struct __kernel_sock_timeval stm; + struct sock_txtime txtime; + struct so_timestamping timestamping; } v; int lv = sizeof(int); int len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; @@ -972,11 +1763,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_SNDBUF: - v.val = sk->sk_sndbuf; + v.val = READ_ONCE(sk->sk_sndbuf); break; case SO_RCVBUF: - v.val = sk->sk_rcvbuf; + v.val = READ_ONCE(sk->sk_rcvbuf); break; case SO_REUSEADDR: @@ -1014,74 +1805,67 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_NO_CHECK: - v.val = sk->sk_no_check; + v.val = sk->sk_no_check_tx; break; case SO_PRIORITY: - v.val = sk->sk_priority; + v.val = READ_ONCE(sk->sk_priority); break; case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); - v.ling.l_linger = sk->sk_lingertime / HZ; + v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; break; case SO_BSDCOMPAT: - sock_warn_obsolete_bsdism("getsockopt"); break; - case SO_TIMESTAMP: + case SO_TIMESTAMP_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && + !sock_flag(sk, SOCK_TSTAMP_NEW) && !sock_flag(sk, SOCK_RCVTSTAMPNS); break; - case SO_TIMESTAMPNS: - v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); - break; - - case SO_TIMESTAMPING: - v.val = 0; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) - v.val |= SOF_TIMESTAMPING_TX_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_TX_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE)) - v.val |= SOF_TIMESTAMPING_RX_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_RX_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE)) - v.val |= SOF_TIMESTAMPING_SYS_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) - v.val |= SOF_TIMESTAMPING_RAW_HARDWARE; - break; - - case SO_RCVTIMEO: - lv = sizeof(struct timeval); - if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { - v.tm.tv_sec = 0; - v.tm.tv_usec = 0; - } else { - v.tm.tv_sec = sk->sk_rcvtimeo / HZ; - v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; - } + case SO_TIMESTAMPNS_OLD: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); break; - case SO_SNDTIMEO: - lv = sizeof(struct timeval); - if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { - v.tm.tv_sec = 0; - v.tm.tv_usec = 0; - } else { - v.tm.tv_sec = sk->sk_sndtimeo / HZ; - v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; + case SO_TIMESTAMP_NEW: + v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); + break; + + case SO_TIMESTAMPNS_NEW: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); + break; + + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: + lv = sizeof(v.timestamping); + /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only + * returning the flags when they were set through the same option. + * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. + */ + if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { + v.timestamping.flags = READ_ONCE(sk->sk_tsflags); + v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); } break; + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, + SO_RCVTIMEO_OLD == optname); + break; + + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, + SO_SNDTIMEO_OLD == optname); + break; + case SO_RCVLOWAT: - v.val = sk->sk_rcvlowat; + v.val = READ_ONCE(sk->sk_rcvlowat); break; case SO_SNDLOWAT: @@ -1089,7 +1873,24 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_PASSCRED: - v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); + if (!sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_credentials; + break; + + case SO_PASSPIDFD: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_pidfd; + break; + + case SO_PASSRIGHTS: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_rights; break; case SO_PEERCRED: @@ -1097,21 +1898,91 @@ int sock_getsockopt(struct socket *sock, int level, int optname, struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); + + spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); - if (copy_to_user(optval, &peercred, len)) + spin_unlock(&sk->sk_peer_lock); + + if (copy_to_sockptr(optval, &peercred, len)) return -EFAULT; goto lenout; } + case SO_PEERPIDFD: + { + struct pid *peer_pid; + struct file *pidfd_file = NULL; + unsigned int flags = 0; + int pidfd; + + if (len > sizeof(pidfd)) + len = sizeof(pidfd); + + spin_lock(&sk->sk_peer_lock); + peer_pid = get_pid(sk->sk_peer_pid); + spin_unlock(&sk->sk_peer_lock); + + if (!peer_pid) + return -ENODATA; + + /* The use of PIDFD_STALE requires stashing of struct pid + * on pidfs with pidfs_register_pid() and only AF_UNIX + * were prepared for this. + */ + if (sk->sk_family == AF_UNIX) + flags = PIDFD_STALE; + + pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); + put_pid(peer_pid); + if (pidfd < 0) + return pidfd; + + if (copy_to_sockptr(optval, &pidfd, len) || + copy_to_sockptr(optlen, &len, sizeof(int))) { + put_unused_fd(pidfd); + fput(pidfd_file); + + return -EFAULT; + } + + fd_install(pidfd, pidfd_file); + return 0; + } + + case SO_PEERGROUPS: + { + const struct cred *cred; + int ret, n; + + cred = sk_get_peer_cred(sk); + if (!cred) + return -ENODATA; + + n = cred->group_info->ngroups; + if (len < n * sizeof(gid_t)) { + len = n * sizeof(gid_t); + put_cred(cred); + return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; + } + len = n * sizeof(gid_t); + + ret = groups_to_user(optval, cred->group_info); + put_cred(cred); + if (ret) + return ret; + goto lenout; + } + case SO_PEERNAME: { - char address[128]; + struct sockaddr_storage address; - if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) + lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); + if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; - if (copy_to_user(optval, address, len)) + if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } @@ -1124,14 +1995,26 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_PASSSEC: - v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); + if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_security; break; case SO_PEERSEC: - return security_socket_getpeersec_stream(sock, optval, optlen, len); + return security_socket_getpeersec_stream(sock, + optval, optlen, len); case SO_MARK: - v.val = sk->sk_mark; + v.val = READ_ONCE(sk->sk_mark); + break; + + case SO_RCVMARK: + v.val = sock_flag(sk, SOCK_RCVMARK); + break; + + case SO_RCVPRIORITY: + v.val = sock_flag(sk, SOCK_RCVPRIORITY); break; case SO_RXQ_OVFL: @@ -1143,10 +2026,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_PEEK_OFF: - if (!sock->ops->set_peek_off) + if (!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP; - v.val = sk->sk_peek_off; + v.val = READ_ONCE(sk->sk_peek_off); break; case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); @@ -1156,7 +2039,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: - len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); + len = sk_get_filter(sk, optval, len); if (len < 0) return len; @@ -1166,26 +2049,123 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sock_flag(sk, SOCK_FILTER_LOCKED); break; + case SO_BPF_EXTENSIONS: + v.val = bpf_tell_extensions(); + break; + case SO_SELECT_ERR_QUEUE: v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); break; -#ifdef CONFIG_NET_LL_RX_POLL +#ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: - v.val = sk->sk_ll_usec; + v.val = READ_ONCE(sk->sk_ll_usec); + break; + case SO_PREFER_BUSY_POLL: + v.val = READ_ONCE(sk->sk_prefer_busy_poll); + break; +#endif + + case SO_MAX_PACING_RATE: + /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ + if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { + lv = sizeof(v.ulval); + v.ulval = READ_ONCE(sk->sk_max_pacing_rate); + } else { + /* 32bit version */ + v.val = min_t(unsigned long, ~0U, + READ_ONCE(sk->sk_max_pacing_rate)); + } + break; + + case SO_INCOMING_CPU: + v.val = READ_ONCE(sk->sk_incoming_cpu); + break; + + case SO_MEMINFO: + { + u32 meminfo[SK_MEMINFO_VARS]; + + sk_get_meminfo(sk, meminfo); + + len = min_t(unsigned int, len, sizeof(meminfo)); + if (copy_to_sockptr(optval, &meminfo, len)) + return -EFAULT; + + goto lenout; + } + +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_INCOMING_NAPI_ID: + v.val = READ_ONCE(sk->sk_napi_id); + + /* aggregate non-NAPI IDs down to 0 */ + if (!napi_id_valid(v.val)) + v.val = 0; + break; #endif + case SO_COOKIE: + lv = sizeof(u64); + if (len < lv) + return -EINVAL; + v.val64 = sock_gen_cookie(sk); + break; + + case SO_ZEROCOPY: + v.val = sock_flag(sk, SOCK_ZEROCOPY); + break; + + case SO_TXTIME: + lv = sizeof(v.txtime); + v.txtime.clockid = sk->sk_clockid; + v.txtime.flags |= sk->sk_txtime_deadline_mode ? + SOF_TXTIME_DEADLINE_MODE : 0; + v.txtime.flags |= sk->sk_txtime_report_errors ? + SOF_TXTIME_REPORT_ERRORS : 0; + break; + + case SO_BINDTOIFINDEX: + v.val = READ_ONCE(sk->sk_bound_dev_if); + break; + + case SO_NETNS_COOKIE: + lv = sizeof(u64); + if (len != lv) + return -EINVAL; + v.val64 = sock_net(sk)->net_cookie; + break; + + case SO_BUF_LOCK: + v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; + break; + + case SO_RESERVE_MEM: + v.val = READ_ONCE(sk->sk_reserved_mem); + break; + + case SO_TXREHASH: + if (!sk_is_tcp(sk)) + return -EOPNOTSUPP; + + /* Paired with WRITE_ONCE() in sk_setsockopt() */ + v.val = READ_ONCE(sk->sk_txrehash); + break; + default: + /* We implement the SO_SNDLOWAT etc to not be settable + * (1003.1g 7). + */ return -ENOPROTOOPT; } if (len > lv) len = lv; - if (copy_to_user(optval, &v, len)) + if (copy_to_sockptr(optval, &v, len)) return -EFAULT; lenout: - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } @@ -1197,7 +2177,18 @@ lenout: */ static inline void sock_lock_init(struct sock *sk) { - sock_lock_init_class_and_name(sk, + sk_owner_clear(sk); + + if (sk->sk_kern_sock) + sock_lock_init_class_and_name( + sk, + af_family_kern_slock_key_strings[sk->sk_family], + af_family_kern_slock_keys + sk->sk_family, + af_family_kern_key_strings[sk->sk_family], + af_family_kern_keys + sk->sk_family); + else + sock_lock_init_class_and_name( + sk, af_family_slock_key_strings[sk->sk_family], af_family_slock_keys + sk->sk_family, af_family_key_strings[sk->sk_family], @@ -1206,18 +2197,30 @@ static inline void sock_lock_init(struct sock *sk) /* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, - * even temporarly, because of RCU lookups. sk_node should also be left as is. + * even temporarily, because of RCU lookups. sk_node should also be left as is. * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end */ static void sock_copy(struct sock *nsk, const struct sock *osk) { + const struct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif + + /* If we move sk_tx_queue_mapping out of the private section, + * we must check if sk_tx_queue_clear() is called after + * sock_copy() in sk_clone_lock(). + */ + BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < + offsetof(struct sock, sk_dontcopy_begin) || + offsetof(struct sock, sk_tx_queue_mapping) >= + offsetof(struct sock, sk_dontcopy_end)); + memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); - memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, - osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); + unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, + prot->obj_size - offsetof(struct sock, sk_dontcopy_end), + /* alloc is larger than struct, see sk_prot_alloc() */); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; @@ -1225,24 +2228,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) #endif } -void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) -{ - unsigned long nulls1, nulls2; - - nulls1 = offsetof(struct sock, __sk_common.skc_node.next); - nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next); - if (nulls1 > nulls2) - swap(nulls1, nulls2); - - if (nulls1 != 0) - memset((char *)sk, 0, nulls1); - memset((char *)sk + nulls1 + sizeof(void *), 0, - nulls2 - nulls1 - sizeof(void *)); - memset((char *)sk + nulls2 + sizeof(void *), 0, - size - nulls2 - sizeof(void *)); -} -EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); - static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { @@ -1254,24 +2239,17 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; - if (priority & __GFP_ZERO) { - if (prot->clear_sk) - prot->clear_sk(sk, prot->obj_size); - else - sk_prot_clear_nulls(sk, prot->obj_size); - } + if (want_init_on_alloc(priority)) + sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { - kmemcheck_annotate_bitfield(sk, flags); - if (security_sk_alloc(sk, family, priority)) goto out_free; if (!try_module_get(prot->owner)) goto out_free_sec; - sk_tx_queue_clear(sk); } return sk; @@ -1294,7 +2272,12 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) owner = prot->owner; slab = prot->slab; + cgroup_sk_free(&sk->sk_cgrp_data); + mem_cgroup_sk_free(sk); security_sk_free(sk); + + sk_owner_put(sk); + if (slab != NULL) kmem_cache_free(slab, sk); else @@ -1302,38 +2285,16 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } -#if IS_ENABLED(CONFIG_NET_CLS_CGROUP) -void sock_update_classid(struct sock *sk) -{ - u32 classid; - - classid = task_cls_classid(current); - if (classid != sk->sk_classid) - sk->sk_classid = classid; -} -EXPORT_SYMBOL(sock_update_classid); -#endif - -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) -void sock_update_netprioidx(struct sock *sk) -{ - if (in_interrupt()) - return; - - sk->sk_cgrp_prioidx = task_netprioidx(current); -} -EXPORT_SYMBOL_GPL(sock_update_netprioidx); -#endif - /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance + * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot) + struct proto *prot, int kern) { struct sock *sk; @@ -1345,27 +2306,51 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; + + if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) + sk->sk_bypass_prot_mem = 1; + + sk->sk_kern_sock = kern; sock_lock_init(sk); - sock_net_set(sk, get_net(net)); - atomic_set(&sk->sk_wmem_alloc, 1); - sock_update_classid(sk); - sock_update_netprioidx(sk); + sk->sk_net_refcnt = kern ? 0 : 1; + if (likely(sk->sk_net_refcnt)) { + get_net_track(net, &sk->ns_tracker, priority); + sock_inuse_add(net, 1); + } else { + net_passive_inc(net); + __netns_tracker_alloc(net, &sk->ns_tracker, + false, priority); + } + + sock_net_set(sk, net); + refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); + + mem_cgroup_sk_alloc(sk); + cgroup_sk_alloc(&sk->sk_cgrp_data); + sock_update_classid(&sk->sk_cgrp_data); + sock_update_netprioidx(&sk->sk_cgrp_data); + sk_tx_queue_clear(sk); } return sk; } EXPORT_SYMBOL(sk_alloc); -static void __sk_free(struct sock *sk) +/* Sockets having SOCK_RCU_FREE will call this function after one RCU + * grace period. This is the case for UDP sockets and TCP listeners. + */ +static void __sk_destruct(struct rcu_head *head) { + struct sock *sk = container_of(head, struct sock, sk_rcu); + struct net *net = sock_net(sk); struct sk_filter *filter; if (sk->sk_destruct) sk->sk_destruct(sk); filter = rcu_dereference_check(sk->sk_filter, - atomic_read(&sk->sk_wmem_alloc) == 0); + refcount_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); @@ -1373,17 +2358,71 @@ static void __sk_free(struct sock *sk) sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); +#ifdef CONFIG_BPF_SYSCALL + bpf_sk_storage_free(sk); +#endif + if (atomic_read(&sk->sk_omem_alloc)) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + } + + /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ + put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - put_net(sock_net(sk)); + + if (likely(sk->sk_net_refcnt)) { + put_net_track(net, &sk->ns_tracker); + } else { + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + } sk_prot_free(sk->sk_prot_creator, sk); } +void sk_net_refcnt_upgrade(struct sock *sk) +{ + struct net *net = sock_net(sk); + + WARN_ON_ONCE(sk->sk_net_refcnt); + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); +} +EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); + +void sk_destruct(struct sock *sk) +{ + bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); + + if (rcu_access_pointer(sk->sk_reuseport_cb)) { + reuseport_detach_sock(sk); + use_call_rcu = true; + } + + if (use_call_rcu) + call_rcu(&sk->sk_rcu, __sk_destruct); + else + __sk_destruct(&sk->sk_rcu); +} + +static void __sk_free(struct sock *sk) +{ + if (likely(sk->sk_net_refcnt)) + sock_inuse_add(sock_net(sk), -1); + + if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) + sock_diag_broadcast_destroy(sk); + else + sk_destruct(sk); +} + void sk_free(struct sock *sk) { /* @@ -1391,156 +2430,230 @@ void sk_free(struct sock *sk) * some packets are still in some tx queue. * If not null, sock_wfree() will call __sk_free(sk) later */ - if (atomic_dec_and_test(&sk->sk_wmem_alloc)) + if (refcount_dec_and_test(&sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sk_free); -/* - * Last sock_put should drop reference to sk->sk_net. It has already - * been dropped in sk_change_net. Taking reference to stopping namespace - * is not an option. - * Take reference to a socket to remove it from hash _alive_ and after that - * destroy it in the context of init_net. - */ -void sk_release_kernel(struct sock *sk) +static void sk_init_common(struct sock *sk) { - if (sk == NULL || sk->sk_socket == NULL) - return; - - sock_hold(sk); - sock_release(sk->sk_socket); - release_net(sock_net(sk)); - sock_net_set(sk, get_net(&init_net)); - sock_put(sk); -} -EXPORT_SYMBOL(sk_release_kernel); + skb_queue_head_init(&sk->sk_receive_queue); + skb_queue_head_init(&sk->sk_write_queue); + skb_queue_head_init(&sk->sk_error_queue); -static void sk_update_clone(const struct sock *sk, struct sock *newsk) -{ - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - sock_update_memcg(newsk); + rwlock_init(&sk->sk_callback_lock); + lockdep_set_class_and_name(&sk->sk_receive_queue.lock, + af_rlock_keys + sk->sk_family, + af_family_rlock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_write_queue.lock, + af_wlock_keys + sk->sk_family, + af_family_wlock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_error_queue.lock, + af_elock_keys + sk->sk_family, + af_family_elock_key_strings[sk->sk_family]); + if (sk->sk_kern_sock) + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_kern_callback_keys + sk->sk_family, + af_family_kern_clock_key_strings[sk->sk_family]); + else + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_callback_keys + sk->sk_family, + af_family_clock_key_strings[sk->sk_family]); } /** - * sk_clone_lock - clone a socket, and lock its clone - * @sk: the socket to clone - * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * sk_clone - clone a socket + * @sk: the socket to clone + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * @lock: if true, lock the cloned sk * - * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + * If @lock is true, the clone is locked by bh_lock_sock(), and + * caller must unlock socket even in error path by bh_unlock_sock(). */ -struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) +struct sock *sk_clone(const struct sock *sk, const gfp_t priority, + bool lock) { + struct proto *prot = READ_ONCE(sk->sk_prot); + struct sk_filter *filter; + bool is_charged = true; struct sock *newsk; - newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); - if (newsk != NULL) { - struct sk_filter *filter; + newsk = sk_prot_alloc(prot, priority, sk->sk_family); + if (!newsk) + goto out; - sock_copy(newsk, sk); + sock_copy(newsk, sk); - /* SANITY */ - get_net(sock_net(newsk)); - sk_node_init(&newsk->sk_node); - sock_lock_init(newsk); - bh_lock_sock(newsk); - newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; - newsk->sk_backlog.len = 0; + newsk->sk_prot_creator = prot; - atomic_set(&newsk->sk_rmem_alloc, 0); - /* - * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) + /* SANITY */ + if (likely(newsk->sk_net_refcnt)) { + get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); + sock_inuse_add(sock_net(newsk), 1); + } else { + /* Kernel sockets are not elevating the struct net refcount. + * Instead, use a tracker to more easily detect if a layer + * is not properly dismantling its kernel sockets at netns + * destroy time. */ - atomic_set(&newsk->sk_wmem_alloc, 1); - atomic_set(&newsk->sk_omem_alloc, 0); - skb_queue_head_init(&newsk->sk_receive_queue); - skb_queue_head_init(&newsk->sk_write_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&newsk->sk_async_wait_queue); + net_passive_inc(sock_net(newsk)); + __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, + false, priority); + } + + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); + + if (lock) + bh_lock_sock(newsk); + + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_backlog.len = 0; + + atomic_set(&newsk->sk_rmem_alloc, 0); + + refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); + + atomic_set(&newsk->sk_omem_alloc, 0); + sk_init_common(newsk); + + newsk->sk_dst_cache = NULL; + newsk->sk_dst_pending_confirm = 0; + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + newsk->sk_reserved_mem = 0; + DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); + sk_drops_reset(newsk); + newsk->sk_send_head = NULL; + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + atomic_set(&newsk->sk_zckey, 0); + + sock_reset_flag(newsk, SOCK_DONE); + +#ifdef CONFIG_MEMCG + /* sk->sk_memcg will be populated at accept() time */ + newsk->sk_memcg = NULL; #endif - spin_lock_init(&newsk->sk_dst_lock); - rwlock_init(&newsk->sk_callback_lock); - lockdep_set_class_and_name(&newsk->sk_callback_lock, - af_callback_keys + newsk->sk_family, - af_family_clock_key_strings[newsk->sk_family]); - - newsk->sk_dst_cache = NULL; - newsk->sk_wmem_queued = 0; - newsk->sk_forward_alloc = 0; - newsk->sk_send_head = NULL; - newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; - - sock_reset_flag(newsk, SOCK_DONE); - skb_queue_head_init(&newsk->sk_error_queue); - - filter = rcu_dereference_protected(newsk->sk_filter, 1); - if (filter != NULL) - sk_filter_charge(newsk, filter); - - if (unlikely(xfrm_sk_clone_policy(newsk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - bh_unlock_sock(newsk); - sk_free(newsk); - newsk = NULL; - goto out; - } + cgroup_sk_clone(&newsk->sk_cgrp_data); - newsk->sk_err = 0; - newsk->sk_priority = 0; - /* - * Before updating sk_refcnt, we must commit prior changes to memory - * (Documentation/RCU/rculist_nulls.txt for details) + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); + if (filter != NULL) + /* though it's an empty new sock, the charging may fail + * if sysctl_optmem_max was changed between creation of + * original socket and cloning */ - smp_wmb(); - atomic_set(&newsk->sk_refcnt, 2); + is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); - /* - * Increment the counter in the same struct proto as the master - * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that - * is the same as sk->sk_prot->socks, as this field was copied - * with memcpy). - * - * This _changes_ the previous behaviour, where - * tcp_create_openreq_child always was incrementing the - * equivalent to tcp_prot->socks (inet_sock_nr), so this have - * to be taken into account in all callers. -acme + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { + /* We need to make sure that we don't uncharge the new + * socket if we couldn't charge it in the first place + * as otherwise we uncharge the parent's filter. */ - sk_refcnt_debug_inc(newsk); - sk_set_socket(newsk, NULL); - newsk->sk_wq = NULL; + if (!is_charged) + RCU_INIT_POINTER(newsk->sk_filter, NULL); - sk_update_clone(sk, newsk); + goto free; + } - if (newsk->sk_prot->sockets_allocated) - sk_sockets_allocated_inc(newsk); + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); - if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) - net_enable_timestamp(); - } + if (bpf_sk_storage_clone(sk, newsk)) + goto free; + + /* Clear sk_user_data if parent had the pointer tagged + * as not suitable for copying when cloning. + */ + if (sk_user_data_is_nocopy(newsk)) + newsk->sk_user_data = NULL; + + newsk->sk_err = 0; + newsk->sk_err_soft = 0; + newsk->sk_priority = 0; + newsk->sk_incoming_cpu = raw_smp_processor_id(); + + /* Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.rst for details) + */ + smp_wmb(); + refcount_set(&newsk->sk_refcnt, 2); + + sk_set_socket(newsk, NULL); + sk_tx_queue_clear(newsk); + RCU_INIT_POINTER(newsk->sk_wq, NULL); + + if (newsk->sk_prot->sockets_allocated) + sk_sockets_allocated_inc(newsk); + + if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) + net_enable_timestamp(); out: return newsk; +free: + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() + */ + newsk->sk_destruct = NULL; + if (lock) + bh_unlock_sock(newsk); + sk_free(newsk); + newsk = NULL; + goto out; +} +EXPORT_SYMBOL_GPL(sk_clone); + +static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) +{ + bool is_ipv6 = false; + u32 max_size; + +#if IS_ENABLED(CONFIG_IPV6) + is_ipv6 = (sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); +#endif + /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ + max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) : + READ_ONCE(dev->gso_ipv4_max_size); + if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) + max_size = GSO_LEGACY_MAX_SIZE; + + return max_size - (MAX_TCP_HEADER + 1); } -EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { - __sk_dst_set(sk, dst); - sk->sk_route_caps = dst->dev->features; + const struct net_device *dev; + u32 max_segs = 1; + + rcu_read_lock(); + dev = dst_dev_rcu(dst); + sk->sk_route_caps = dev->features; + if (sk_is_tcp(sk)) { + struct inet_connection_sock *icsk = inet_csk(sk); + + sk->sk_route_caps |= NETIF_F_GSO; + icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); + } if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; - sk->sk_route_caps &= ~sk->sk_route_nocaps; + if (unlikely(sk->sk_gso_disabled)) + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; if (sk_can_gso(sk)) { - if (dst->header_len) { + if (dst->header_len && !xfrm_dst_offload_ok(dst)) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - sk->sk_gso_max_size = dst->dev->gso_max_size; - sk->sk_gso_max_segs = dst->dev->gso_max_segs; + sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); + /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ + max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); } } + sk->sk_gso_max_segs = max_segs; + sk_dst_set(sk, dst); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(sk_setup_caps); @@ -1554,15 +2667,29 @@ EXPORT_SYMBOL_GPL(sk_setup_caps); */ void sock_wfree(struct sk_buff *skb) { - struct sock *sk = skb->sk; unsigned int len = skb->truesize; + struct sock *sk = skb->sk; + bool free; + int old; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { + if (sock_flag(sk, SOCK_RCU_FREE) && + sk->sk_write_space == sock_def_write_space) { + rcu_read_lock(); + free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc, + &old); + sock_def_write_space_wfree(sk, old - len); + rcu_read_unlock(); + if (unlikely(free)) + __sk_free(sk); + return; + } + /* * Keep a reference on sk_wmem_alloc, this will be released * after sk_write_space() call */ - atomic_sub(len - 1, &sk->sk_wmem_alloc); + WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); sk->sk_write_space(sk); len = 1; } @@ -1570,11 +2697,81 @@ void sock_wfree(struct sk_buff *skb) * if sk_wmem_alloc reaches 0, we must finish what sk_free() * could not do because of in-flight packets */ - if (atomic_sub_and_test(len, &sk->sk_wmem_alloc)) + if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sock_wfree); +/* This variant of sock_wfree() is used by TCP, + * since it sets SOCK_USE_WRITE_QUEUE. + */ +void __sock_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) + __sk_free(sk); +} + +void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) +{ + int old_wmem; + + skb_orphan(skb); +#ifdef CONFIG_INET + if (unlikely(!sk_fullsock(sk))) + return skb_set_owner_edemux(skb, sk); +#endif + skb->sk = sk; + skb->destructor = sock_wfree; + skb_set_hash_from_sk(skb, sk); + /* + * We used to take a refcount on sk, but following operation + * is enough to guarantee sk_free() won't free this sock until + * all in-flight packets are completed + */ + __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem); + + /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket + * is in a host queue (qdisc, NIC queue). + * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue + * based on XPS for better performance. + * Otherwise clear ooo_okay to not risk Out Of Order delivery. + */ + skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS); +} +EXPORT_SYMBOL(skb_set_owner_w); + +static bool can_skb_orphan_partial(const struct sk_buff *skb) +{ + /* Drivers depend on in-order delivery for crypto offload, + * partial orphan breaks out-of-order-OK logic. + */ + if (skb_is_decrypted(skb)) + return false; + + return (skb->destructor == sock_wfree || + (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); +} + +/* This helper is used by netem, as it can hold packets in its + * delay queue. We want to allow the owner socket to send more + * packets, as if they were already TX completed by a typical driver. + * But we also want to keep skb->sk set because some packet schedulers + * rely on it (sch_fq for example). + */ +void skb_orphan_partial(struct sk_buff *skb) +{ + if (skb_is_tcp_pure_ack(skb)) + return; + + if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) + return; + + skb_orphan(skb); +} +EXPORT_SYMBOL(skb_orphan_partial); + /* * Read buffer destructor automatically called from kfree_skb. */ @@ -1588,40 +2785,37 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); -void sock_edemux(struct sk_buff *skb) +/* + * Buffer destructor for skbs that are not used directly in read or write + * path, e.g. for error handler skbs. Automatically called from kfree_skb. + */ +void sock_efree(struct sk_buff *skb) { - struct sock *sk = skb->sk; - -#ifdef CONFIG_INET - if (sk->sk_state == TCP_TIME_WAIT) - inet_twsk_put(inet_twsk(sk)); - else -#endif - sock_put(sk); + sock_put(skb->sk); } -EXPORT_SYMBOL(sock_edemux); +EXPORT_SYMBOL(sock_efree); -kuid_t sock_i_uid(struct sock *sk) +/* Buffer destructor for prefetch/receive path where reference count may + * not be held, e.g. for listen sockets. + */ +#ifdef CONFIG_INET +void sock_pfree(struct sk_buff *skb) { - kuid_t uid; + struct sock *sk = skb->sk; - read_lock_bh(&sk->sk_callback_lock); - uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; - read_unlock_bh(&sk->sk_callback_lock); - return uid; -} -EXPORT_SYMBOL(sock_i_uid); + if (!sk_is_refcounted(sk)) + return; -unsigned long sock_i_ino(struct sock *sk) -{ - unsigned long ino; + if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { + inet_reqsk(sk)->rsk_listener = NULL; + reqsk_free(inet_reqsk(sk)); + return; + } - read_lock_bh(&sk->sk_callback_lock); - ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; - read_unlock_bh(&sk->sk_callback_lock); - return ino; + sock_gen_put(sk); } -EXPORT_SYMBOL(sock_i_ino); +EXPORT_SYMBOL(sock_pfree); +#endif /* CONFIG_INET */ /* * Allocate a skb from the socket's send buffer. @@ -1629,8 +2823,10 @@ EXPORT_SYMBOL(sock_i_ino); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { - if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + if (force || + refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { skb_set_owner_w(skb, sk); return skb; @@ -1640,20 +2836,31 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, } EXPORT_SYMBOL(sock_wmalloc); -/* - * Allocate a skb from the socket's receive buffer. - */ -struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, +static void sock_ofree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_omem_alloc); +} + +struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority) { - if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { - struct sk_buff *skb = alloc_skb(size, priority); - if (skb) { - skb_set_owner_r(skb, sk); - return skb; - } - } - return NULL; + struct sk_buff *skb; + + /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ + if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > + READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) + return NULL; + + skb = alloc_skb(size, priority); + if (!skb) + return NULL; + + atomic_add(skb->truesize, &sk->sk_omem_alloc); + skb->sk = sk; + skb->destructor = sock_ofree; + return skb; } /* @@ -1661,8 +2868,10 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { - if ((unsigned int)size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + + if ((unsigned int)size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. @@ -1678,15 +2887,49 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) EXPORT_SYMBOL(sock_kmalloc); /* - * Free an option memory block. + * Duplicate the input "src" memory block using the socket's + * option memory buffer. */ -void sock_kfree_s(struct sock *sk, void *mem, int size) +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority) +{ + void *mem; + + mem = sock_kmalloc(sk, size, priority); + if (mem) + memcpy(mem, src, size); + return mem; +} +EXPORT_SYMBOL(sock_kmemdup); + +/* Free an option memory block. Note, we actually want the inline + * here as this allows gcc to detect the nullify and fold away the + * condition entirely. + */ +static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, + const bool nullify) { - kfree(mem); + if (WARN_ON_ONCE(!mem)) + return; + if (nullify) + kfree_sensitive(mem); + else + kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } + +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ + __sock_kfree_s(sk, mem, size, false); +} EXPORT_SYMBOL(sock_kfree_s); +void sock_kzfree_s(struct sock *sk, void *mem, int size) +{ + __sock_kfree_s(sk, mem, size, true); +} +EXPORT_SYMBOL(sock_kzfree_s); + /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. */ @@ -1694,7 +2937,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; @@ -1702,11 +2945,11 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) + if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; - if (sk->sk_shutdown & SEND_SHUTDOWN) + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) break; - if (sk->sk_err) + if (READ_ONCE(sk->sk_err)) break; timeo = schedule_timeout(timeo); } @@ -1721,69 +2964,26 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, - int *errcode) + int *errcode, int max_page_order) { struct sk_buff *skb; - gfp_t gfp_mask; long timeo; int err; - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; - - err = -EMSGSIZE; - if (npages > MAX_SKB_FRAGS) - goto failure; - - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; timeo = sock_sndtimeo(sk, noblock); - while (1) { + for (;;) { err = sock_error(sk); if (err != 0) goto failure; err = -EPIPE; - if (sk->sk_shutdown & SEND_SHUTDOWN) + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, gfp_mask); - if (skb) { - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - __skb_fill_page_desc(skb, i, - page, 0, - (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len)); - data_len -= PAGE_SIZE; - } - - /* Full success... */ - break; - } - err = -ENOBUFS; - goto failure; - } - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) + break; + + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) @@ -1792,8 +2992,10 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, goto interrupted; timeo = sock_wait_for_wmem(sk, timeo); } - - skb_set_owner_w(skb, sk); + skb = alloc_skb_with_frags(header_len, data_len, max_page_order, + errcode, sk->sk_allocation); + if (skb) + skb_set_owner_w(skb, sk); return skb; interrupted: @@ -1804,53 +3006,176 @@ failure: } EXPORT_SYMBOL(sock_alloc_send_pskb); -struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, - int noblock, int *errcode) +int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, + struct sockcm_cookie *sockc) +{ + u32 tsflags; + + BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); + + switch (cmsg->cmsg_type) { + case SO_MARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->mark = *(u32 *)CMSG_DATA(cmsg); + break; + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + + tsflags = *(u32 *)CMSG_DATA(cmsg); + if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) + return -EINVAL; + + sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; + sockc->tsflags |= tsflags; + break; + case SCM_TXTIME: + if (!sock_flag(sk, SOCK_TXTIME)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) + return -EINVAL; + sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); + break; + case SCM_TS_OPT_ID: + if (sk_is_tcp(sk)) + return -EINVAL; + tsflags = READ_ONCE(sk->sk_tsflags); + if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); + sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; + break; + /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ + case SCM_RIGHTS: + case SCM_CREDENTIALS: + break; + case SO_PRIORITY: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) + return -EPERM; + sockc->priority = *(u32 *)CMSG_DATA(cmsg); + break; + case SCM_DEVMEM_DMABUF: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(__sock_cmsg_send); + +int sock_cmsg_send(struct sock *sk, struct msghdr *msg, + struct sockcm_cookie *sockc) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + struct cmsghdr *cmsg; + int ret; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + ret = __sock_cmsg_send(sk, cmsg, sockc); + if (ret) + return ret; + } + return 0; } -EXPORT_SYMBOL(sock_alloc_send_skb); +EXPORT_SYMBOL(sock_cmsg_send); -/* On 32bit arches, an skb frag is limited to 2^15 */ -#define SKB_FRAG_PAGE_ORDER get_order(32768) +static void sk_enter_memory_pressure(struct sock *sk) +{ + if (!sk->sk_prot->enter_memory_pressure) + return; -bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) + sk->sk_prot->enter_memory_pressure(sk); +} + +static void sk_leave_memory_pressure(struct sock *sk) { - int order; + if (sk->sk_prot->leave_memory_pressure) { + INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, + tcp_leave_memory_pressure, sk); + } else { + unsigned long *memory_pressure = sk->sk_prot->memory_pressure; + + if (memory_pressure && READ_ONCE(*memory_pressure)) + WRITE_ONCE(*memory_pressure, 0); + } +} +DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); + +/** + * skb_page_frag_refill - check that a page_frag contains enough room + * @sz: minimum size of the fragment we want to get + * @pfrag: pointer to page_frag + * @gfp: priority for memory allocation + * + * Note: While this allocator tries to use high order pages, there is + * no guarantee that allocations succeed. Therefore, @sz MUST be + * less or equal than PAGE_SIZE. + */ +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) +{ if (pfrag->page) { - if (atomic_read(&pfrag->page->_count) == 1) { + if (page_ref_count(pfrag->page) == 1) { pfrag->offset = 0; return true; } - if (pfrag->offset < pfrag->size) + if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } - /* We restrict high order allocations to users that can afford to wait */ - order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0; - - do { - gfp_t gfp = sk->sk_allocation; - - if (order) - gfp |= __GFP_COMP | __GFP_NOWARN; - pfrag->page = alloc_pages(gfp, order); + pfrag->offset = 0; + if (SKB_FRAG_PAGE_ORDER && + !static_branch_unlikely(&net_high_order_alloc_disable_key)) { + /* Avoid direct reclaim but allow kswapd to wake */ + pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | __GFP_NOWARN | + __GFP_NORETRY, + SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { - pfrag->offset = 0; - pfrag->size = PAGE_SIZE << order; + pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; return true; } - } while (--order >= 0); + } + pfrag->page = alloc_page(gfp); + if (likely(pfrag->page)) { + pfrag->size = PAGE_SIZE; + return true; + } + return false; +} +EXPORT_SYMBOL(skb_page_frag_refill); + +bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) + return true; + + if (!sk->sk_bypass_prot_mem) + sk_enter_memory_pressure(sk); - sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); + return false; } EXPORT_SYMBOL(sk_page_frag_refill); -static void __lock_sock(struct sock *sk) +void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { @@ -1868,37 +3193,35 @@ static void __lock_sock(struct sock *sk) finish_wait(&sk->sk_lock.wq, &wait); } -static void __release_sock(struct sock *sk) +void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { - struct sk_buff *skb = sk->sk_backlog.head; + struct sk_buff *skb, *next; + int nb = 0; - do { + while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; - bh_unlock_sock(sk); - do { - struct sk_buff *next = skb->next; + spin_unlock_bh(&sk->sk_lock.slock); + while (1) { + next = skb->next; prefetch(next); - WARN_ON_ONCE(skb_dst_is_noref(skb)); - skb->next = NULL; + DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); + skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); - /* - * We are in process context here with softirqs - * disabled, use cond_resched_softirq() to preempt. - * This is safe to do because we've taken the backlog - * queue private: - */ - cond_resched_softirq(); - skb = next; - } while (skb != NULL); + if (!skb) + break; - bh_lock_sock(sk); - } while ((skb = sk->sk_backlog.head) != NULL); + if (!(++nb & 15)) + cond_resched(); + } + + spin_lock_bh(&sk->sk_lock.slock); + } /* * Doing the zeroing here guarantee we can not loop forever @@ -1907,87 +3230,131 @@ static void __release_sock(struct sock *sk) sk->sk_backlog.len = 0; } +void __sk_flush_backlog(struct sock *sk) +{ + spin_lock_bh(&sk->sk_lock.slock); + __release_sock(sk); + + if (sk->sk_prot->release_cb) + INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, + tcp_release_cb, sk); + + spin_unlock_bh(&sk->sk_lock.slock); +} +EXPORT_SYMBOL_GPL(__sk_flush_backlog); + /** * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on * @timeo: for how long + * @skb: last skb seen on sk_receive_queue * * Now socket state including sk->sk_err is changed only under lock, * hence we may omit checks after joining wait queue. * We check receive queue before schedule() only as optimization; * it is very likely that release_sock() added new data. */ -int sk_wait_data(struct sock *sk, long *timeo) +int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { + DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc; - DEFINE_WAIT(wait); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - finish_wait(sk_sleep(sk), &wait); + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); return rc; } EXPORT_SYMBOL(sk_wait_data); /** - * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated + * __sk_mem_raise_allocated - increase memory_allocated * @sk: socket * @size: memory size to allocate + * @amt: pages to allocate * @kind: allocation type * - * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means - * rmem allocation. This function assumes that protocols which have - * memory_pressure use sk_wmem_queued as write buffer accounting. + * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. + * + * Unlike the globally shared limits among the sockets under same protocol, + * consuming the budget of a memcg won't have direct effect on other ones. + * So be optimistic about memcg's tolerance, and leave the callers to decide + * whether or not to raise allocated through sk_under_memory_pressure() or + * its variants. */ -int __sk_mem_schedule(struct sock *sk, int size, int kind) +int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { + bool memcg_enabled = false, charged = false; struct proto *prot = sk->sk_prot; - int amt = sk_mem_pages(size); - long allocated; - int parent_status = UNDER_LIMIT; + long allocated = 0; - sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + if (!sk->sk_bypass_prot_mem) { + sk_memory_allocated_add(sk, amt); + allocated = sk_memory_allocated(sk); + } - allocated = sk_memory_allocated_add(sk, amt, &parent_status); + if (mem_cgroup_sk_enabled(sk)) { + memcg_enabled = true; + charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge()); + if (!charged) + goto suppress_allocation; + } + + if (!allocated) + return 1; /* Under limit. */ - if (parent_status == UNDER_LIMIT && - allocated <= sk_prot_mem_limits(sk, 0)) { + if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } - /* Under pressure. (we or our parents) */ - if ((parent_status > SOFT_LIMIT) || - allocated > sk_prot_mem_limits(sk, 1)) + /* Under pressure. */ + if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); - /* Over hard limit (we or our parents) */ - if ((parent_status == OVER_LIMIT) || - (allocated > sk_prot_mem_limits(sk, 2))) + /* Over hard limit. */ + if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; - /* guarantee minimum buffer size under pressure */ + /* Guarantee minimum buffer size under pressure (either global + * or memcg) to make sure features described in RFC 7323 (TCP + * Extensions for High Performance) work properly. + * + * This rule does NOT stand when exceeds global or memcg's hard + * limit, or else a DoS attack can be taken place by spawning + * lots of sockets whose usage are under minimum buffer size. + */ if (kind == SK_MEM_RECV) { - if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) + if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; } else { /* SK_MEM_SEND */ + int wmem0 = sk_get_wmem0(sk, prot); + if (sk->sk_type == SOCK_STREAM) { - if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) + if (sk->sk_wmem_queued < wmem0) return 1; - } else if (atomic_read(&sk->sk_wmem_alloc) < - prot->sysctl_wmem[0]) + } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { return 1; + } } if (sk_has_memory_pressure(sk)) { - int alloc; + u64 alloc; - if (!sk_under_memory_pressure(sk)) + /* The following 'average' heuristic is within the + * scope of global accounting, so it only makes + * sense for global memory pressure. + */ + if (!sk_under_global_memory_pressure(sk)) return 1; + + /* Try to be fair among all the sockets under global + * pressure by allowing the ones that below average + * usage to raise. + */ alloc = sk_sockets_allocated_read_positive(sk); if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + @@ -2004,37 +3371,107 @@ suppress_allocation: /* Fail only if socket is _under_ its sndbuf. * In this case we cannot block, so that we have to fail. */ - if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) + if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { + /* Force charge with __GFP_NOFAIL */ + if (memcg_enabled && !charged) + mem_cgroup_sk_charge(sk, amt, + gfp_memcg_charge() | __GFP_NOFAIL); return 1; + } } - trace_sock_exceed_buf_limit(sk, prot, allocated); + trace_sock_exceed_buf_limit(sk, prot, allocated, kind); - /* Alas. Undo changes. */ - sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; + if (allocated) + sk_memory_allocated_sub(sk, amt); - sk_memory_allocated_sub(sk, amt); + if (charged) + mem_cgroup_sk_uncharge(sk, amt); return 0; } + +/** + * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated + * @sk: socket + * @size: memory size to allocate + * @kind: allocation type + * + * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means + * rmem allocation. This function assumes that protocols which have + * memory_pressure use sk_wmem_queued as write buffer accounting. + */ +int __sk_mem_schedule(struct sock *sk, int size, int kind) +{ + int ret, amt = sk_mem_pages(size); + + sk_forward_alloc_add(sk, amt << PAGE_SHIFT); + ret = __sk_mem_raise_allocated(sk, size, amt, kind); + if (!ret) + sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); + return ret; +} EXPORT_SYMBOL(__sk_mem_schedule); /** - * __sk_reclaim - reclaim memory_allocated + * __sk_mem_reduce_allocated - reclaim memory_allocated * @sk: socket + * @amount: number of quanta + * + * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc */ -void __sk_mem_reclaim(struct sock *sk) +void __sk_mem_reduce_allocated(struct sock *sk, int amount) { - sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); - sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; + if (mem_cgroup_sk_enabled(sk)) + mem_cgroup_sk_uncharge(sk, amount); - if (sk_under_memory_pressure(sk) && + if (sk->sk_bypass_prot_mem) + return; + + sk_memory_allocated_sub(sk, amount); + + if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } + +/** + * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated + * @sk: socket + * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) + */ +void __sk_mem_reclaim(struct sock *sk, int amount) +{ + amount >>= PAGE_SHIFT; + sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); + __sk_mem_reduce_allocated(sk, amount); +} EXPORT_SYMBOL(__sk_mem_reclaim); +void __sk_charge(struct sock *sk, gfp_t gfp) +{ + int amt; + + gfp |= __GFP_NOFAIL; + if (mem_cgroup_from_sk(sk)) { + /* The socket has not been accepted yet, no need + * to look at newsk->sk_wmem_queued. + */ + amt = sk_mem_pages(sk->sk_forward_alloc + + atomic_read(&sk->sk_rmem_alloc)); + if (amt) + mem_cgroup_sk_charge(sk, amt, gfp); + } + + kmem_cache_charge(sk, gfp); +} + +int sk_set_peek_off(struct sock *sk, int val) +{ + WRITE_ONCE(sk->sk_peek_off, val); + return 0; +} +EXPORT_SYMBOL_GPL(sk_set_peek_off); /* * Set of default routines for initialising struct proto_ops when @@ -2043,13 +3480,13 @@ EXPORT_SYMBOL(__sk_mem_reclaim); * function, some default processing is provided. */ -int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_bind); -int sock_no_connect(struct socket *sock, struct sockaddr *saddr, +int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags) { return -EOPNOTSUPP; @@ -2062,25 +3499,20 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2) } EXPORT_SYMBOL(sock_no_socketpair); -int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) +int sock_no_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_accept); int sock_no_getname(struct socket *sock, struct sockaddr *saddr, - int *len, int peer) + int peer) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_getname); -unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) -{ - return 0; -} -EXPORT_SYMBOL(sock_no_poll); - int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; @@ -2099,29 +3531,20 @@ int sock_no_shutdown(struct socket *sock, int how) } EXPORT_SYMBOL(sock_no_shutdown); -int sock_no_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - return -EOPNOTSUPP; -} -EXPORT_SYMBOL(sock_no_setsockopt); - -int sock_no_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) +int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } -EXPORT_SYMBOL(sock_no_getsockopt); +EXPORT_SYMBOL(sock_no_sendmsg); -int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, - size_t len) +int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } -EXPORT_SYMBOL(sock_no_sendmsg); +EXPORT_SYMBOL(sock_no_sendmsg_locked); -int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, - size_t len, int flags) +int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, + int flags) { return -EOPNOTSUPP; } @@ -2134,19 +3557,20 @@ int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct * } EXPORT_SYMBOL(sock_no_mmap); -ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +/* + * When a file is received (via SCM_RIGHTS, etc), we must bump the + * various sock-based usage counts. + */ +void __receive_sock(struct file *file) { - ssize_t res; - struct msghdr msg = {.msg_flags = flags}; - struct kvec iov; - char *kaddr = kmap(page); - iov.iov_base = kaddr + offset; - iov.iov_len = size; - res = kernel_sendmsg(sock, &msg, &iov, 1, size); - kunmap(page); - return res; + struct socket *sock; + + sock = sock_from_file(file); + if (sock) { + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); + } } -EXPORT_SYMBOL(sock_no_sendpage); /* * Default Socket Callbacks @@ -2158,7 +3582,7 @@ static void sock_def_wakeup(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } @@ -2169,22 +3593,24 @@ static void sock_def_error_report(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) - wake_up_interruptible_poll(&wq->wait, POLLERR); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, EPOLLERR); + sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } -static void sock_def_readable(struct sock *sk, int len) +void sock_def_readable(struct sock *sk) { struct socket_wq *wq; + trace_sk_data_ready(sk); + rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) - wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | - POLLRDNORM | POLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | + EPOLLRDNORM | EPOLLRDBAND); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } @@ -2197,29 +3623,50 @@ static void sock_def_write_space(struct sock *sk) /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ - if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { + if (sock_writeable(sk)) { wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) - wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | - POLLWRNORM | POLLWRBAND); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | + EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - if (sock_writeable(sk)) - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } +/* An optimised version of sock_def_write_space(), should only be called + * for SOCK_RCU_FREE sockets under RCU read section and after putting + * ->sk_wmem_alloc. + */ +static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc) +{ + /* Do not wake up a writer until he can make "significant" + * progress. --DaveM + */ + if (__sock_writeable(sk, wmem_alloc)) { + struct socket_wq *wq = rcu_dereference(sk->sk_wq); + + /* rely on refcount_sub from sock_wfree() */ + smp_mb__after_atomic(); + if (wq && waitqueue_active(&wq->wait)) + wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | + EPOLLWRNORM | EPOLLWRBAND); + + /* Should agree with poll, otherwise some programs break */ + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); + } +} + static void sock_def_destruct(struct sock *sk) { - kfree(sk->sk_protinfo); } void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) - if (send_sigurg(&sk->sk_socket->file->f_owner)) + if (send_sigurg(sk->sk_socket->file)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); @@ -2234,44 +3681,42 @@ EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { - if (del_timer(timer)) + if (timer_delete(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); -void sock_init_data(struct socket *sock, struct sock *sk) +void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) { - skb_queue_head_init(&sk->sk_receive_queue); - skb_queue_head_init(&sk->sk_write_queue); - skb_queue_head_init(&sk->sk_error_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&sk->sk_async_wait_queue); -#endif + if (timer_delete_sync(timer)) + __sock_put(sk); +} +EXPORT_SYMBOL(sk_stop_timer_sync); +void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) +{ + sk_init_common(sk); sk->sk_send_head = NULL; - init_timer(&sk->sk_timer); + timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; - sk->sk_rcvbuf = sysctl_rmem_default; - sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); + sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; + sk->sk_use_task_frag = true; sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); if (sock) { sk->sk_type = sock->type; - sk->sk_wq = sock->wq; + RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; - } else - sk->sk_wq = NULL; - - spin_lock_init(&sk->sk_dst_lock); - rwlock_init(&sk->sk_callback_lock); - lockdep_set_class_and_name(&sk->sk_callback_lock, - af_callback_keys + sk->sk_family, - af_family_clock_key_strings[sk->sk_family]); + } else { + RCU_INIT_POINTER(sk->sk_wq, NULL); + } + sk->sk_uid = uid; sk->sk_state_change = sock_def_wakeup; sk->sk_data_ready = sock_def_readable; @@ -2285,131 +3730,153 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; + spin_lock_init(&sk->sk_peer_lock); + sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; - sk->sk_stamp = ktime_set(-1L, 0); + sk->sk_stamp = SK_DEFAULT_STAMP; +#if BITS_PER_LONG==32 + seqlock_init(&sk->sk_stamp_seq); +#endif + atomic_set(&sk->sk_zckey, 0); -#ifdef CONFIG_NET_LL_RX_POLL +#ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; - sk->sk_ll_usec = sysctl_net_busy_read; + sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif + sk->sk_max_pacing_rate = ~0UL; + sk->sk_pacing_rate = ~0UL; + WRITE_ONCE(sk->sk_pacing_shift, 10); + sk->sk_incoming_cpu = -1; + + sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory - * (Documentation/RCU/rculist_nulls.txt for details) + * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); - atomic_set(&sk->sk_refcnt, 1); - atomic_set(&sk->sk_drops, 0); + refcount_set(&sk->sk_refcnt, 1); + sk_drops_reset(sk); +} +EXPORT_SYMBOL(sock_init_data_uid); + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + kuid_t uid = sock ? + SOCK_INODE(sock)->i_uid : + make_kuid(sock_net(sk)->user_ns, 0); + + sock_init_data_uid(sock, sk, uid); } EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) { + /* The sk_lock has mutex_lock() semantics here. */ + mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); + might_sleep(); spin_lock_bh(&sk->sk_lock.slock); - if (sk->sk_lock.owned) + if (sock_owned_by_user_nocheck(sk)) __lock_sock(sk); sk->sk_lock.owned = 1; - spin_unlock(&sk->sk_lock.slock); - /* - * The sk_lock has mutex_lock() semantics here: - */ - mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); - local_bh_enable(); + spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { - /* - * The sk_lock has mutex_unlock() semantics: - */ - mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); if (sk->sk_prot->release_cb) - sk->sk_prot->release_cb(sk); + INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, + tcp_release_cb, sk); - sk->sk_lock.owned = 0; + sock_release_ownership(sk); if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(release_sock); -/** - * lock_sock_fast - fast version of lock_sock - * @sk: socket - * - * This version should be used for very small section, where process wont block - * return false if fast path is taken - * sk_lock.slock locked, owned = 0, BH disabled - * return true if slow path is taken - * sk_lock.slock unlocked, owned = 1, BH enabled - */ -bool lock_sock_fast(struct sock *sk) +bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); - if (!sk->sk_lock.owned) + if (!sock_owned_by_user_nocheck(sk)) { /* - * Note : We must disable BH + * Fast path return with bottom halves disabled and + * sock::sk_lock.slock held. + * + * The 'mutex' is not contended and holding + * sock::sk_lock.slock prevents all other lockers to + * proceed so the corresponding unlock_sock_fast() can + * avoid the slow path of release_sock() completely and + * just release slock. + * + * From a semantical POV this is equivalent to 'acquiring' + * the 'mutex', hence the corresponding lockdep + * mutex_release() has to happen in the fast path of + * unlock_sock_fast(). */ return false; + } __lock_sock(sk); sk->sk_lock.owned = 1; - spin_unlock(&sk->sk_lock.slock); - /* - * The sk_lock has mutex_lock() semantics here: - */ - mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); - local_bh_enable(); + __acquire(&sk->sk_lock.slock); + spin_unlock_bh(&sk->sk_lock.slock); return true; } -EXPORT_SYMBOL(lock_sock_fast); +EXPORT_SYMBOL(__lock_sock_fast); -int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) +int sock_gettstamp(struct socket *sock, void __user *userstamp, + bool timeval, bool time32) { - struct timeval tv; - if (!sock_flag(sk, SOCK_TIMESTAMP)) - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sk->sk_stamp); - if (tv.tv_sec == -1) - return -ENOENT; - if (tv.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - tv = ktime_to_timeval(sk->sk_stamp); - } - return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; -} -EXPORT_SYMBOL(sock_get_timestamp); + struct sock *sk = sock->sk; + struct timespec64 ts; -int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) -{ - struct timespec ts; - if (!sock_flag(sk, SOCK_TIMESTAMP)) - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sk->sk_stamp); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + ts = ktime_to_timespec64(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - ts = ktime_to_timespec(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + ts = ktime_to_timespec64(kt); + } + + if (timeval) + ts.tv_nsec /= 1000; + +#ifdef CONFIG_COMPAT_32BIT_TIME + if (time32) + return put_old_timespec32(&ts, userstamp); +#endif +#ifdef CONFIG_SPARC64 + /* beware of padding in sparc64 timeval */ + if (timeval && !in_compat_syscall()) { + struct __kernel_old_timeval __user tv = { + .tv_sec = ts.tv_sec, + .tv_usec = ts.tv_nsec, + }; + if (copy_to_user(userstamp, &tv, sizeof(tv))) + return -EFAULT; + return 0; } - return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; +#endif + return put_timespec64(&ts, userstamp); } -EXPORT_SYMBOL(sock_get_timestampns); +EXPORT_SYMBOL(sock_gettstamp); -void sock_enable_timestamp(struct sock *sk, int flag) +void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; @@ -2420,50 +3887,73 @@ void sock_enable_timestamp(struct sock *sk, int flag) * time stamping, but time stamping might have been on * already because of the other one */ - if (!(previous_flags & SK_FLAGS_TIMESTAMP)) + if (sock_needs_netstamp(sk) && + !(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } +int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, + int level, int type) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb; + int copied, err; + + err = -EAGAIN; + skb = sock_dequeue_err_skb(sk); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_msg(skb, 0, msg, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + +out_free_skb: + kfree_skb(skb); +out: + return err; +} +EXPORT_SYMBOL(sock_recv_errqueue); + /* * Get a socket option on an socket. * * FIX: POSIX 1003.1g is very ambiguous here. It states that * asynchronous errors should be reported by getsockopt. We assume - * this means if you specify SO_ERROR (otherwise whats the point of it). + * this means if you specify SO_ERROR (otherwise what is the point of it). */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; - return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt); -#ifdef CONFIG_COMPAT -int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct sock *sk = sock->sk; - - if (sk->sk_prot->compat_getsockopt != NULL) - return sk->sk_prot->compat_getsockopt(sk, level, optname, - optval, optlen); - return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_sock_common_getsockopt); -#endif - -int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; int addr_len = 0; int err; - err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, - flags & ~MSG_DONTWAIT, &addr_len); + err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; @@ -2474,35 +3964,22 @@ EXPORT_SYMBOL(sock_common_recvmsg); * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; - return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt); -#ifdef CONFIG_COMPAT -int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - struct sock *sk = sock->sk; - - if (sk->sk_prot->compat_setsockopt != NULL) - return sk->sk_prot->compat_setsockopt(sk, level, optname, - optval, optlen); - return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_sock_common_setsockopt); -#endif - void sk_common_release(struct sock *sk) { if (sk->sk_prot->destroy) sk->sk_prot->destroy(sk); /* - * Observation: when sock_common_release is called, processes have + * Observation: when sk_common_release is called, processes have * no access to socket. But net still has. * Step one, detach it from networking: * @@ -2527,31 +4004,27 @@ void sk_common_release(struct sock *sk) xfrm_sk_free_policy(sk); - sk_refcnt_debug_release(sk); - - if (sk->sk_frag.page) { - put_page(sk->sk_frag.page); - sk->sk_frag.page = NULL; - } - sock_put(sk); } EXPORT_SYMBOL(sk_common_release); -#ifdef CONFIG_PROC_FS -#define PROTO_INUSE_NR 64 /* should be enough for the first time */ -struct prot_inuse { - int val[PROTO_INUSE_NR]; -}; - -static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); - -#ifdef CONFIG_NET_NS -void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) +void sk_get_meminfo(const struct sock *sk, u32 *mem) { - __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); + memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); + + mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); + mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); + mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); + mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); + mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); + mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); + mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); } -EXPORT_SYMBOL_GPL(sock_prot_inuse_add); + +#ifdef CONFIG_PROC_FS +static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); int sock_prot_inuse_get(struct net *net, struct proto *prot) { @@ -2559,21 +4032,35 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot) int res = 0; for_each_possible_cpu(cpu) - res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; + res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; return res >= 0 ? res : 0; } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); +int sock_inuse_get(struct net *net) +{ + int cpu, res = 0; + + for_each_possible_cpu(cpu) + res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; + + return res; +} + +EXPORT_SYMBOL_GPL(sock_inuse_get); + static int __net_init sock_inuse_init_net(struct net *net) { - net->core.inuse = alloc_percpu(struct prot_inuse); - return net->core.inuse ? 0 : -ENOMEM; + net->core.prot_inuse = alloc_percpu(struct prot_inuse); + if (net->core.prot_inuse == NULL) + return -ENOMEM; + return 0; } static void __net_exit sock_inuse_exit_net(struct net *net) { - free_percpu(net->core.inuse); + free_percpu(net->core.prot_inuse); } static struct pernet_operations net_inuse_ops = { @@ -2590,60 +4077,126 @@ static __init int net_inuse_init(void) } core_initcall(net_inuse_init); -#else -static DEFINE_PER_CPU(struct prot_inuse, prot_inuse); -void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) -{ - __this_cpu_add(prot_inuse.val[prot->inuse_idx], val); -} -EXPORT_SYMBOL_GPL(sock_prot_inuse_add); - -int sock_prot_inuse_get(struct net *net, struct proto *prot) -{ - int cpu, idx = prot->inuse_idx; - int res = 0; - - for_each_possible_cpu(cpu) - res += per_cpu(prot_inuse, cpu).val[idx]; - - return res >= 0 ? res : 0; -} -EXPORT_SYMBOL_GPL(sock_prot_inuse_get); -#endif - -static void assign_proto_idx(struct proto *prot) +static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); - if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { + if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { pr_err("PROTO_INUSE_NR exhausted\n"); - return; + return -ENOSPC; } set_bit(prot->inuse_idx, proto_inuse_idx); + return 0; } static void release_proto_idx(struct proto *prot) { - if (prot->inuse_idx != PROTO_INUSE_NR - 1) + if (prot->inuse_idx != PROTO_INUSE_NR) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else -static inline void assign_proto_idx(struct proto *prot) +static inline int assign_proto_idx(struct proto *prot) { + return 0; } static inline void release_proto_idx(struct proto *prot) { } + #endif +static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) +{ + if (!twsk_prot) + return; + kfree(twsk_prot->twsk_slab_name); + twsk_prot->twsk_slab_name = NULL; + kmem_cache_destroy(twsk_prot->twsk_slab); + twsk_prot->twsk_slab = NULL; +} + +static int tw_prot_init(const struct proto *prot) +{ + struct timewait_sock_ops *twsk_prot = prot->twsk_prot; + + if (!twsk_prot) + return 0; + + twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", + prot->name); + if (!twsk_prot->twsk_slab_name) + return -ENOMEM; + + twsk_prot->twsk_slab = + kmem_cache_create(twsk_prot->twsk_slab_name, + twsk_prot->twsk_obj_size, 0, + SLAB_ACCOUNT | prot->slab_flags, + NULL); + if (!twsk_prot->twsk_slab) { + pr_crit("%s: Can't create timewait sock SLAB cache!\n", + prot->name); + return -ENOMEM; + } + + return 0; +} + +static void req_prot_cleanup(struct request_sock_ops *rsk_prot) +{ + if (!rsk_prot) + return; + kfree(rsk_prot->slab_name); + rsk_prot->slab_name = NULL; + kmem_cache_destroy(rsk_prot->slab); + rsk_prot->slab = NULL; +} + +static int req_prot_init(const struct proto *prot) +{ + struct request_sock_ops *rsk_prot = prot->rsk_prot; + + if (!rsk_prot) + return 0; + + rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", + prot->name); + if (!rsk_prot->slab_name) + return -ENOMEM; + + rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, + rsk_prot->obj_size, 0, + SLAB_ACCOUNT | prot->slab_flags, + NULL); + + if (!rsk_prot->slab) { + pr_crit("%s: Can't create request sock SLAB cache!\n", + prot->name); + return -ENOMEM; + } + return 0; +} + int proto_register(struct proto *prot, int alloc_slab) { + int ret = -ENOBUFS; + + if (prot->memory_allocated && !prot->sysctl_mem) { + pr_err("%s: missing sysctl_mem\n", prot->name); + return -EINVAL; + } + if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { + pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); + return -EINVAL; + } if (alloc_slab) { - prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, - SLAB_HWCACHE_ALIGN | prot->slab_flags, + prot->slab = kmem_cache_create_usercopy(prot->name, + prot->obj_size, 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | + prot->slab_flags, + prot->useroffset, prot->usersize, NULL); if (prot->slab == NULL) { @@ -2652,61 +4205,35 @@ int proto_register(struct proto *prot, int alloc_slab) goto out; } - if (prot->rsk_prot != NULL) { - prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); - if (prot->rsk_prot->slab_name == NULL) - goto out_free_sock_slab; - - prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, - prot->rsk_prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); + if (req_prot_init(prot)) + goto out_free_request_sock_slab; - if (prot->rsk_prot->slab == NULL) { - pr_crit("%s: Can't create request sock SLAB cache!\n", - prot->name); - goto out_free_request_sock_slab_name; - } - } - - if (prot->twsk_prot != NULL) { - prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); - - if (prot->twsk_prot->twsk_slab_name == NULL) - goto out_free_request_sock_slab; - - prot->twsk_prot->twsk_slab = - kmem_cache_create(prot->twsk_prot->twsk_slab_name, - prot->twsk_prot->twsk_obj_size, - 0, - SLAB_HWCACHE_ALIGN | - prot->slab_flags, - NULL); - if (prot->twsk_prot->twsk_slab == NULL) - goto out_free_timewait_sock_slab_name; - } + if (tw_prot_init(prot)) + goto out_free_timewait_sock_slab; } mutex_lock(&proto_list_mutex); + ret = assign_proto_idx(prot); + if (ret) { + mutex_unlock(&proto_list_mutex); + goto out_free_timewait_sock_slab; + } list_add(&prot->node, &proto_list); - assign_proto_idx(prot); mutex_unlock(&proto_list_mutex); - return 0; + return ret; -out_free_timewait_sock_slab_name: - kfree(prot->twsk_prot->twsk_slab_name); +out_free_timewait_sock_slab: + if (alloc_slab) + tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: - if (prot->rsk_prot && prot->rsk_prot->slab) { - kmem_cache_destroy(prot->rsk_prot->slab); - prot->rsk_prot->slab = NULL; - } -out_free_request_sock_slab_name: - if (prot->rsk_prot) - kfree(prot->rsk_prot->slab_name); -out_free_sock_slab: - kmem_cache_destroy(prot->slab); - prot->slab = NULL; + if (alloc_slab) { + req_prot_cleanup(prot->rsk_prot); + + kmem_cache_destroy(prot->slab); + prot->slab = NULL; + } out: - return -ENOBUFS; + return ret; } EXPORT_SYMBOL(proto_register); @@ -2717,24 +4244,36 @@ void proto_unregister(struct proto *prot) list_del(&prot->node); mutex_unlock(&proto_list_mutex); - if (prot->slab != NULL) { - kmem_cache_destroy(prot->slab); - prot->slab = NULL; - } + kmem_cache_destroy(prot->slab); + prot->slab = NULL; - if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) { - kmem_cache_destroy(prot->rsk_prot->slab); - kfree(prot->rsk_prot->slab_name); - prot->rsk_prot->slab = NULL; - } + req_prot_cleanup(prot->rsk_prot); + tw_prot_cleanup(prot->twsk_prot); +} +EXPORT_SYMBOL(proto_unregister); - if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { - kmem_cache_destroy(prot->twsk_prot->twsk_slab); - kfree(prot->twsk_prot->twsk_slab_name); - prot->twsk_prot->twsk_slab = NULL; +int sock_load_diag_module(int family, int protocol) +{ + if (!protocol) { + if (!sock_is_registered(family)) + return -ENOENT; + + return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, family); } + +#ifdef CONFIG_INET + if (family == AF_INET && + protocol != IPPROTO_RAW && + protocol < MAX_INET_PROTOS && + !rcu_access_pointer(inet_protos[protocol])) + return -ENOENT; +#endif + + return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, family, protocol); } -EXPORT_SYMBOL(proto_unregister); +EXPORT_SYMBOL(sock_load_diag_module); #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) @@ -2764,7 +4303,7 @@ static long sock_prot_memory_allocated(struct proto *proto) return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; } -static char *sock_prot_memory_pressure(struct proto *proto) +static const char *sock_prot_memory_pressure(struct proto *proto) { return proto->memory_pressure != NULL ? proto_memory_pressure(proto) ? "yes" : "no" : "NI"; @@ -2774,7 +4313,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " - "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", + "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), @@ -2795,7 +4334,6 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto) proto_method_implemented(proto->getsockopt), proto_method_implemented(proto->sendmsg), proto_method_implemented(proto->recvmsg), - proto_method_implemented(proto->sendpage), proto_method_implemented(proto->bind), proto_method_implemented(proto->backlog_rcv), proto_method_implemented(proto->hash), @@ -2816,7 +4354,7 @@ static int proto_seq_show(struct seq_file *seq, void *v) "maxhdr", "slab", "module", - "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); + "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); else proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; @@ -2829,23 +4367,10 @@ static const struct seq_operations proto_seq_ops = { .show = proto_seq_show, }; -static int proto_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &proto_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations proto_seq_fops = { - .owner = THIS_MODULE, - .open = proto_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - static __net_init int proto_init_net(struct net *net) { - if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops)) + if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, + sizeof(struct seq_net_private))) return -ENOMEM; return 0; @@ -2870,3 +4395,155 @@ static int __init proto_init(void) subsys_initcall(proto_init); #endif /* PROC_FS */ + +#ifdef CONFIG_NET_RX_BUSY_POLL +bool sk_busy_loop_end(void *p, unsigned long start_time) +{ + struct sock *sk = p; + + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) + return true; + + if (sk_is_udp(sk) && + !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) + return true; + + return sk_busy_loop_timeout(sk, start_time); +} +EXPORT_SYMBOL(sk_busy_loop_end); +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len) +{ + if (!sk->sk_prot->bind_add) + return -EOPNOTSUPP; + return sk->sk_prot->bind_add(sk, addr, addr_len); +} +EXPORT_SYMBOL(sock_bind_add); + +/* Copy 'size' bytes from userspace and return `size` back to userspace */ +int sock_ioctl_inout(struct sock *sk, unsigned int cmd, + void __user *arg, void *karg, size_t size) +{ + int ret; + + if (copy_from_user(karg, arg, size)) + return -EFAULT; + + ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); + if (ret) + return ret; + + if (copy_to_user(arg, karg, size)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(sock_ioctl_inout); + +/* This is the most common ioctl prep function, where the result (4 bytes) is + * copied back to userspace if the ioctl() returns successfully. No input is + * copied from userspace as input argument. + */ +static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) +{ + int ret, karg = 0; + + ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); + if (ret) + return ret; + + return put_user(karg, (int __user *)arg); +} + +/* A wrapper around sock ioctls, which copies the data from userspace + * (depending on the protocol/ioctl), and copies back the result to userspace. + * The main motivation for this function is to pass kernel memory to the + * protocol ioctl callbacks, instead of userspace memory. + */ +int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + int rc = 1; + + if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) + rc = ipmr_sk_ioctl(sk, cmd, arg); + else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) + rc = ip6mr_sk_ioctl(sk, cmd, arg); + else if (sk_is_phonet(sk)) + rc = phonet_sk_ioctl(sk, cmd, arg); + + /* If ioctl was processed, returns its value */ + if (rc <= 0) + return rc; + + /* Otherwise call the default handler */ + return sock_ioctl_out(sk, cmd, arg); +} +EXPORT_SYMBOL(sk_ioctl); + +static int __init sock_struct_check(void) +{ + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); +#ifdef CONFIG_MEMCG + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); +#endif + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); + return 0; +} + +core_initcall(sock_struct_check); diff --git a/net/core/sock_destructor.h b/net/core/sock_destructor.h new file mode 100644 index 000000000000..2f396e6bfba5 --- /dev/null +++ b/net/core/sock_destructor.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_CORE_SOCK_DESTRUCTOR_H +#define _NET_CORE_SOCK_DESTRUCTOR_H +#include <net/tcp.h> + +static inline bool is_skb_wmem(const struct sk_buff *skb) +{ + return skb->destructor == sock_wfree || + skb->destructor == __sock_wfree || + (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree); +} +#endif diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a0e9cf6379de..026ce9bd9e5e 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -1,3 +1,6 @@ +/* License: GPL */ + +#include <linux/filter.h> #include <linux/mutex.h> #include <linux/socket.h> #include <linux/skbuff.h> @@ -5,30 +8,58 @@ #include <net/net_namespace.h> #include <linux/module.h> #include <net/sock.h> - +#include <linux/kernel.h> +#include <linux/tcp.h> +#include <linux/workqueue.h> +#include <linux/nospec.h> +#include <linux/cookie.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> -static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; -static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); -static DEFINE_MUTEX(sock_diag_table_mutex); +static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX]; + +static const struct sock_diag_inet_compat __rcu *inet_rcv_compat; -int sock_diag_check_cookie(void *sk, __u32 *cookie) +static struct workqueue_struct *broadcast_wq; + +DEFINE_COOKIE(sock_cookie); + +u64 __sock_gen_cookie(struct sock *sk) { - if ((cookie[0] != INET_DIAG_NOCOOKIE || - cookie[1] != INET_DIAG_NOCOOKIE) && - ((u32)(unsigned long)sk != cookie[0] || - (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1])) - return -ESTALE; - else + u64 res = atomic64_read(&sk->sk_cookie); + + if (!res) { + u64 new = gen_cookie_next(&sock_cookie); + + atomic64_cmpxchg(&sk->sk_cookie, res, new); + + /* Another thread might have changed sk_cookie before us. */ + res = atomic64_read(&sk->sk_cookie); + } + return res; +} + +int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie) +{ + u64 res; + + if (cookie[0] == INET_DIAG_NOCOOKIE && cookie[1] == INET_DIAG_NOCOOKIE) return 0; + + res = sock_gen_cookie(sk); + if ((u32)res != cookie[0] || (u32)(res >> 32) != cookie[1]) + return -ESTALE; + + return 0; } EXPORT_SYMBOL_GPL(sock_diag_check_cookie); -void sock_diag_save_cookie(void *sk, __u32 *cookie) +void sock_diag_save_cookie(struct sock *sk, __u32 *cookie) { - cookie[0] = (u32)(unsigned long)sk; - cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); + u64 res = sock_gen_cookie(sk); + + cookie[0] = (u32)res; + cookie[1] = (u32)(res >> 32); } EXPORT_SYMBOL_GPL(sock_diag_save_cookie); @@ -36,106 +67,165 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) { u32 mem[SK_MEMINFO_VARS]; - mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); - mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; - mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); - mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; - mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; - mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; - mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + sk_get_meminfo(sk, mem); return nla_put(skb, attrtype, sizeof(mem), &mem); } EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); -int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk, +int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, struct sk_buff *skb, int attrtype) { - struct nlattr *attr; + struct sock_fprog_kern *fprog; struct sk_filter *filter; - unsigned int len; + struct nlattr *attr; + unsigned int flen; int err = 0; - if (!ns_capable(user_ns, CAP_NET_ADMIN)) { + if (!may_report_filterinfo) { nla_reserve(skb, attrtype, 0); return 0; } rcu_read_lock(); - filter = rcu_dereference(sk->sk_filter); - len = filter ? filter->len * sizeof(struct sock_filter) : 0; + if (!filter) + goto out; - attr = nla_reserve(skb, attrtype, len); - if (attr == NULL) { - err = -EMSGSIZE; + fprog = filter->prog->orig_prog; + if (!fprog) goto out; - } - if (filter) { - struct sock_filter *fb = (struct sock_filter *)nla_data(attr); - int i; + flen = bpf_classic_proglen(fprog); - for (i = 0; i < filter->len; i++, fb++) - sk_decode_filter(&filter->insns[i], fb); + attr = nla_reserve(skb, attrtype, flen); + if (attr == NULL) { + err = -EMSGSIZE; + goto out; } + memcpy(nla_data(attr), fprog->filter, flen); out: rcu_read_unlock(); return err; } EXPORT_SYMBOL(sock_diag_put_filterinfo); -void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +struct broadcast_sk { + struct sock *sk; + struct work_struct work; +}; + +static size_t sock_diag_nlmsg_size(void) { - mutex_lock(&sock_diag_table_mutex); - inet_rcv_compat = fn; - mutex_unlock(&sock_diag_table_mutex); + return NLMSG_ALIGN(sizeof(struct inet_diag_msg) + + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */ + + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ +} + +static const struct sock_diag_handler *sock_diag_lock_handler(int family) +{ + const struct sock_diag_handler *handler; + + rcu_read_lock(); + handler = rcu_dereference(sock_diag_handlers[family]); + if (handler && !try_module_get(handler->owner)) + handler = NULL; + rcu_read_unlock(); + + return handler; +} + +static void sock_diag_unlock_handler(const struct sock_diag_handler *handler) +{ + module_put(handler->owner); +} + +static void sock_diag_broadcast_destroy_work(struct work_struct *work) +{ + struct broadcast_sk *bsk = + container_of(work, struct broadcast_sk, work); + struct sock *sk = bsk->sk; + const struct sock_diag_handler *hndl; + struct sk_buff *skb; + const enum sknetlink_groups group = sock_diag_destroy_group(sk); + int err = -1; + + WARN_ON(group == SKNLGRP_NONE); + + skb = nlmsg_new(sock_diag_nlmsg_size(), GFP_KERNEL); + if (!skb) + goto out; + + hndl = sock_diag_lock_handler(sk->sk_family); + if (hndl) { + if (hndl->get_info) + err = hndl->get_info(skb, sk); + sock_diag_unlock_handler(hndl); + } + if (!err) + nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group, + GFP_KERNEL); + else + kfree_skb(skb); +out: + sk_destruct(sk); + kfree(bsk); +} + +void sock_diag_broadcast_destroy(struct sock *sk) +{ + /* Note, this function is often called from an interrupt context. */ + struct broadcast_sk *bsk = + kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC); + if (!bsk) + return sk_destruct(sk); + bsk->sk = sk; + INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work); + queue_work(broadcast_wq, &bsk->work); +} + +void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr) +{ + xchg(&inet_rcv_compat, RCU_INITIALIZER(ptr)); } EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); -void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr) { - mutex_lock(&sock_diag_table_mutex); - inet_rcv_compat = NULL; - mutex_unlock(&sock_diag_table_mutex); + const struct sock_diag_inet_compat *old; + + old = unrcu_pointer(xchg(&inet_rcv_compat, NULL)); + WARN_ON_ONCE(old != ptr); } EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); int sock_diag_register(const struct sock_diag_handler *hndl) { - int err = 0; + int family = hndl->family; - if (hndl->family >= AF_MAX) + if (family >= AF_MAX) return -EINVAL; - mutex_lock(&sock_diag_table_mutex); - if (sock_diag_handlers[hndl->family]) - err = -EBUSY; - else - sock_diag_handlers[hndl->family] = hndl; - mutex_unlock(&sock_diag_table_mutex); - - return err; + return !cmpxchg((const struct sock_diag_handler **) + &sock_diag_handlers[family], + NULL, hndl) ? 0 : -EBUSY; } EXPORT_SYMBOL_GPL(sock_diag_register); -void sock_diag_unregister(const struct sock_diag_handler *hnld) +void sock_diag_unregister(const struct sock_diag_handler *hndl) { - int family = hnld->family; + int family = hndl->family; if (family >= AF_MAX) return; - mutex_lock(&sock_diag_table_mutex); - BUG_ON(sock_diag_handlers[family] != hnld); - sock_diag_handlers[family] = NULL; - mutex_unlock(&sock_diag_table_mutex); + xchg((const struct sock_diag_handler **)&sock_diag_handlers[family], + NULL); } EXPORT_SYMBOL_GPL(sock_diag_unregister); -static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) { int err; struct sock_diag_req *req = nlmsg_data(nlh); @@ -146,61 +236,99 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (req->sdiag_family >= AF_MAX) return -EINVAL; + req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX); - if (sock_diag_handlers[req->sdiag_family] == NULL) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, req->sdiag_family); + if (!rcu_access_pointer(sock_diag_handlers[req->sdiag_family])) + sock_load_diag_module(req->sdiag_family, 0); - mutex_lock(&sock_diag_table_mutex); - hndl = sock_diag_handlers[req->sdiag_family]; + hndl = sock_diag_lock_handler(req->sdiag_family); if (hndl == NULL) - err = -ENOENT; - else + return -ENOENT; + + if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) err = hndl->dump(skb, nlh); - mutex_unlock(&sock_diag_table_mutex); + else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy) + err = hndl->destroy(skb, nlh); + else + err = -EOPNOTSUPP; + sock_diag_unlock_handler(hndl); return err; } -static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { + const struct sock_diag_inet_compat *ptr; int ret; switch (nlh->nlmsg_type) { case TCPDIAG_GETSOCK: - case DCCPDIAG_GETSOCK: - if (inet_rcv_compat == NULL) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET); - - mutex_lock(&sock_diag_table_mutex); - if (inet_rcv_compat != NULL) - ret = inet_rcv_compat(skb, nlh); - else - ret = -EOPNOTSUPP; - mutex_unlock(&sock_diag_table_mutex); + if (!rcu_access_pointer(inet_rcv_compat)) + sock_load_diag_module(AF_INET, 0); + + rcu_read_lock(); + ptr = rcu_dereference(inet_rcv_compat); + if (ptr && !try_module_get(ptr->owner)) + ptr = NULL; + rcu_read_unlock(); + + ret = -EOPNOTSUPP; + if (ptr) { + ret = ptr->fn(skb, nlh); + module_put(ptr->owner); + } return ret; case SOCK_DIAG_BY_FAMILY: - return __sock_diag_rcv_msg(skb, nlh); + case SOCK_DESTROY: + return __sock_diag_cmd(skb, nlh); default: return -EINVAL; } } -static DEFINE_MUTEX(sock_diag_mutex); - static void sock_diag_rcv(struct sk_buff *skb) { - mutex_lock(&sock_diag_mutex); netlink_rcv_skb(skb, &sock_diag_rcv_msg); - mutex_unlock(&sock_diag_mutex); } +static int sock_diag_bind(struct net *net, int group) +{ + switch (group) { + case SKNLGRP_INET_TCP_DESTROY: + case SKNLGRP_INET_UDP_DESTROY: + if (!rcu_access_pointer(sock_diag_handlers[AF_INET])) + sock_load_diag_module(AF_INET, 0); + break; + case SKNLGRP_INET6_TCP_DESTROY: + case SKNLGRP_INET6_UDP_DESTROY: + if (!rcu_access_pointer(sock_diag_handlers[AF_INET6])) + sock_load_diag_module(AF_INET6, 0); + break; + } + return 0; +} + +int sock_diag_destroy(struct sock *sk, int err) +{ + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (!sk->sk_prot->diag_destroy) + return -EOPNOTSUPP; + + return sk->sk_prot->diag_destroy(sk, err); +} +EXPORT_SYMBOL_GPL(sock_diag_destroy); + static int __net_init diag_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { + .groups = SKNLGRP_MAX, .input = sock_diag_rcv, + .bind = sock_diag_bind, + .flags = NL_CFG_F_NONROOT_RECV, }; net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg); @@ -220,15 +348,8 @@ static struct pernet_operations diag_net_ops = { static int __init sock_diag_init(void) { + broadcast_wq = alloc_workqueue("sock_diag_events", WQ_PERCPU, 0); + BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } - -static void __exit sock_diag_exit(void) -{ - unregister_pernet_subsys(&diag_net_ops); -} - -module_init(sock_diag_init); -module_exit(sock_diag_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG); +device_initcall(sock_diag_init); diff --git a/net/core/sock_map.c b/net/core/sock_map.c new file mode 100644 index 000000000000..5947b38e4f8b --- /dev/null +++ b/net/core/sock_map.c @@ -0,0 +1,1959 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/filter.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/net.h> +#include <linux/workqueue.h> +#include <linux/skmsg.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/sock_diag.h> +#include <net/udp.h> + +struct bpf_stab { + struct bpf_map map; + struct sock **sks; + struct sk_psock_progs progs; + spinlock_t lock; +}; + +#define SOCK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + +/* This mutex is used to + * - protect race between prog/link attach/detach and link prog update, and + * - protect race between releasing and accessing map in bpf_link. + * A single global mutex lock is used since it is expected contention is low. + */ +static DEFINE_MUTEX(sockmap_mutex); + +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, struct bpf_link *link, + u32 which); +static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); + +static struct bpf_map *sock_map_alloc(union bpf_attr *attr) +{ + struct bpf_stab *stab; + + if (attr->max_entries == 0 || + attr->key_size != 4 || + (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); + if (!stab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&stab->map, attr); + spin_lock_init(&stab->lock); + + stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * + sizeof(struct sock *), + stab->map.numa_node); + if (!stab->sks) { + bpf_map_area_free(stab); + return ERR_PTR(-ENOMEM); + } + + return &stab->map; +} + +int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_map *map; + int ret; + + if (attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + + CLASS(fd, f)(attr->target_fd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); + return ret; +} + +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + struct bpf_prog *prog; + struct bpf_map *map; + int ret; + + if (attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + + CLASS(fd, f)(attr->target_fd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + prog = bpf_prog_get(attr->attach_bpf_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type != ptype) { + ret = -EINVAL; + goto put_prog; + } + + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); +put_prog: + bpf_prog_put(prog); + return ret; +} + +static void sock_map_sk_acquire(struct sock *sk) + __acquires(&sk->sk_lock.slock) +{ + lock_sock(sk); + rcu_read_lock(); +} + +static void sock_map_sk_release(struct sock *sk) + __releases(&sk->sk_lock.slock) +{ + rcu_read_unlock(); + release_sock(sk); +} + +static void sock_map_add_link(struct sk_psock *psock, + struct sk_psock_link *link, + struct bpf_map *map, void *link_raw) +{ + link->link_raw = link_raw; + link->map = map; + spin_lock_bh(&psock->link_lock); + list_add_tail(&link->list, &psock->link); + spin_unlock_bh(&psock->link_lock); +} + +static void sock_map_del_link(struct sock *sk, + struct sk_psock *psock, void *link_raw) +{ + bool strp_stop = false, verdict_stop = false; + struct sk_psock_link *link, *tmp; + + spin_lock_bh(&psock->link_lock); + list_for_each_entry_safe(link, tmp, &psock->link, list) { + if (link->link_raw == link_raw) { + struct bpf_map *map = link->map; + struct sk_psock_progs *progs = sock_map_progs(map); + + if (psock->saved_data_ready && progs->stream_parser) + strp_stop = true; + if (psock->saved_data_ready && progs->stream_verdict) + verdict_stop = true; + if (psock->saved_data_ready && progs->skb_verdict) + verdict_stop = true; + list_del(&link->list); + sk_psock_free_link(link); + break; + } + } + spin_unlock_bh(&psock->link_lock); + if (strp_stop || verdict_stop) { + write_lock_bh(&sk->sk_callback_lock); + if (strp_stop) + sk_psock_stop_strp(sk, psock); + if (verdict_stop) + sk_psock_stop_verdict(sk, psock); + + if (psock->psock_update_sk_prot) + psock->psock_update_sk_prot(sk, psock, false); + write_unlock_bh(&sk->sk_callback_lock); + } +} + +static void sock_map_unref(struct sock *sk, void *link_raw) +{ + struct sk_psock *psock = sk_psock(sk); + + if (likely(psock)) { + sock_map_del_link(sk, psock, link_raw); + sk_psock_put(sk, psock); + } +} + +static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) +{ + if (!sk->sk_prot->psock_update_sk_prot) + return -EINVAL; + psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; + return sk->sk_prot->psock_update_sk_prot(sk, psock, false); +} + +static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) { + if (sk->sk_prot->close != sock_map_close) { + psock = ERR_PTR(-EBUSY); + goto out; + } + + if (!refcount_inc_not_zero(&psock->refcnt)) + psock = ERR_PTR(-EBUSY); + } +out: + rcu_read_unlock(); + return psock; +} + +static int sock_map_link(struct bpf_map *map, struct sock *sk) +{ + struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog *stream_verdict = NULL; + struct bpf_prog *stream_parser = NULL; + struct bpf_prog *skb_verdict = NULL; + struct bpf_prog *msg_parser = NULL; + struct sk_psock *psock; + int ret; + + stream_verdict = READ_ONCE(progs->stream_verdict); + if (stream_verdict) { + stream_verdict = bpf_prog_inc_not_zero(stream_verdict); + if (IS_ERR(stream_verdict)) + return PTR_ERR(stream_verdict); + } + + stream_parser = READ_ONCE(progs->stream_parser); + if (stream_parser) { + stream_parser = bpf_prog_inc_not_zero(stream_parser); + if (IS_ERR(stream_parser)) { + ret = PTR_ERR(stream_parser); + goto out_put_stream_verdict; + } + } + + msg_parser = READ_ONCE(progs->msg_parser); + if (msg_parser) { + msg_parser = bpf_prog_inc_not_zero(msg_parser); + if (IS_ERR(msg_parser)) { + ret = PTR_ERR(msg_parser); + goto out_put_stream_parser; + } + } + + skb_verdict = READ_ONCE(progs->skb_verdict); + if (skb_verdict) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) { + ret = PTR_ERR(skb_verdict); + goto out_put_msg_parser; + } + } + + psock = sock_map_psock_get_checked(sk); + if (IS_ERR(psock)) { + ret = PTR_ERR(psock); + goto out_progs; + } + + if (psock) { + if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || + (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || + (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || + (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || + (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { + sk_psock_put(sk, psock); + ret = -EBUSY; + goto out_progs; + } + } else { + psock = sk_psock_init(sk, map->numa_node); + if (IS_ERR(psock)) { + ret = PTR_ERR(psock); + goto out_progs; + } + } + + if (msg_parser) + psock_set_prog(&psock->progs.msg_parser, msg_parser); + if (stream_parser) + psock_set_prog(&psock->progs.stream_parser, stream_parser); + if (stream_verdict) + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); + if (skb_verdict) + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + + /* msg_* and stream_* programs references tracked in psock after this + * point. Reference dec and cleanup will occur through psock destructor + */ + ret = sock_map_init_proto(sk, psock); + if (ret < 0) { + sk_psock_put(sk, psock); + goto out; + } + + write_lock_bh(&sk->sk_callback_lock); + if (stream_parser && stream_verdict && !psock->saved_data_ready) { + if (sk_is_tcp(sk)) + ret = sk_psock_init_strp(sk, psock); + else + ret = -EOPNOTSUPP; + if (ret) { + write_unlock_bh(&sk->sk_callback_lock); + sk_psock_put(sk, psock); + goto out; + } + sk_psock_start_strp(sk, psock); + } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { + sk_psock_start_verdict(sk,psock); + } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { + sk_psock_start_verdict(sk, psock); + } + write_unlock_bh(&sk->sk_callback_lock); + return 0; +out_progs: + if (skb_verdict) + bpf_prog_put(skb_verdict); +out_put_msg_parser: + if (msg_parser) + bpf_prog_put(msg_parser); +out_put_stream_parser: + if (stream_parser) + bpf_prog_put(stream_parser); +out_put_stream_verdict: + if (stream_verdict) + bpf_prog_put(stream_verdict); +out: + return ret; +} + +static void sock_map_free(struct bpf_map *map) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + int i; + + /* After the sync no updates or deletes will be in-flight so it + * is safe to walk map and remove entries without risking a race + * in EEXIST update case. + */ + synchronize_rcu(); + for (i = 0; i < stab->map.max_entries; i++) { + struct sock **psk = &stab->sks[i]; + struct sock *sk; + + sk = xchg(psk, NULL); + if (sk) { + sock_hold(sk); + lock_sock(sk); + rcu_read_lock(); + sock_map_unref(sk, psk); + rcu_read_unlock(); + release_sock(sk); + sock_put(sk); + } + } + + /* wait for psock readers accessing its map link */ + synchronize_rcu(); + + bpf_map_area_free(stab->sks); + bpf_map_area_free(stab); +} + +static void sock_map_release_progs(struct bpf_map *map) +{ + psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); +} + +static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(key >= map->max_entries)) + return NULL; + return READ_ONCE(stab->sks[key]); +} + +static void *sock_map_lookup(struct bpf_map *map, void *key) +{ + struct sock *sk; + + sk = __sock_map_lookup_elem(map, *(u32 *)key); + if (!sk) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; +} + +static void *sock_map_lookup_sys(struct bpf_map *map, void *key) +{ + struct sock *sk; + + if (map->value_size != sizeof(u64)) + return ERR_PTR(-ENOSPC); + + sk = __sock_map_lookup_elem(map, *(u32 *)key); + if (!sk) + return ERR_PTR(-ENOENT); + + __sock_gen_cookie(sk); + return &sk->sk_cookie; +} + +static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, + struct sock **psk) +{ + struct sock *sk = NULL; + int err = 0; + + spin_lock_bh(&stab->lock); + if (!sk_test || sk_test == *psk) + sk = xchg(psk, NULL); + + if (likely(sk)) + sock_map_unref(sk, psk); + else + err = -EINVAL; + + spin_unlock_bh(&stab->lock); + return err; +} + +static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, + void *link_raw) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + __sock_map_delete(stab, sk, link_raw); +} + +static long sock_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = *(u32 *)key; + struct sock **psk; + + if (unlikely(i >= map->max_entries)) + return -EINVAL; + + psk = &stab->sks[i]; + return __sock_map_delete(stab, NULL, psk); +} + +static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = key ? *(u32 *)key : U32_MAX; + u32 *key_next = next; + + if (i == stab->map.max_entries - 1) + return -ENOENT; + if (i >= stab->map.max_entries) + *key_next = 0; + else + *key_next = i + 1; + return 0; +} + +static int sock_map_update_common(struct bpf_map *map, u32 idx, + struct sock *sk, u64 flags) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct sk_psock_link *link; + struct sk_psock *psock; + struct sock *osk; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(idx >= map->max_entries)) + return -E2BIG; + + link = sk_psock_init_link(); + if (!link) + return -ENOMEM; + + ret = sock_map_link(map, sk); + if (ret < 0) + goto out_free; + + psock = sk_psock(sk); + WARN_ON_ONCE(!psock); + + spin_lock_bh(&stab->lock); + osk = stab->sks[idx]; + if (osk && flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out_unlock; + } else if (!osk && flags == BPF_EXIST) { + ret = -ENOENT; + goto out_unlock; + } + + sock_map_add_link(psock, link, map, &stab->sks[idx]); + stab->sks[idx] = sk; + if (osk) + sock_map_unref(osk, &stab->sks[idx]); + spin_unlock_bh(&stab->lock); + return 0; +out_unlock: + spin_unlock_bh(&stab->lock); + if (psock) + sk_psock_put(sk, psock); +out_free: + sk_psock_free_link(link); + return ret; +} + +static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; +} + +static bool sock_map_redirect_allowed(const struct sock *sk) +{ + if (sk_is_tcp(sk)) + return sk->sk_state != TCP_LISTEN; + else + return sk->sk_state == TCP_ESTABLISHED; +} + +static bool sock_map_sk_is_suitable(const struct sock *sk) +{ + return !!sk->sk_prot->psock_update_sk_prot; +} + +static bool sock_map_sk_state_allowed(const struct sock *sk) +{ + if (sk_is_tcp(sk)) + return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); + if (sk_is_stream_unix(sk)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; + if (sk_is_vsock(sk) && + (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; + return true; +} + +static int sock_hash_update_common(struct bpf_map *map, void *key, + struct sock *sk, u64 flags); + +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + struct socket *sock; + struct sock *sk; + int ret; + u64 ufd; + + if (map->value_size == sizeof(u64)) + ufd = *(u64 *)value; + else + ufd = *(u32 *)value; + if (ufd > S32_MAX) + return -EINVAL; + + sock = sockfd_lookup(ufd, &ret); + if (!sock) + return ret; + sk = sock->sk; + if (!sk) { + ret = -EINVAL; + goto out; + } + if (!sock_map_sk_is_suitable(sk)) { + ret = -EOPNOTSUPP; + goto out; + } + + sock_map_sk_acquire(sk); + if (!sock_map_sk_state_allowed(sk)) + ret = -EOPNOTSUPP; + else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) + ret = sock_map_update_common(map, *(u32 *)key, sk, flags); + else + ret = sock_hash_update_common(map, key, sk, flags); + sock_map_sk_release(sk); +out: + sockfd_put(sock); + return ret; +} + +static long sock_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + struct sock *sk = (struct sock *)value; + int ret; + + if (unlikely(!sk || !sk_fullsock(sk))) + return -EINVAL; + + if (!sock_map_sk_is_suitable(sk)) + return -EOPNOTSUPP; + + local_bh_disable(); + bh_lock_sock(sk); + if (!sock_map_sk_state_allowed(sk)) + ret = -EOPNOTSUPP; + else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) + ret = sock_map_update_common(map, *(u32 *)key, sk, flags); + else + ret = sock_hash_update_common(map, key, sk, flags); + bh_unlock_sock(sk); + local_bh_enable(); + return ret; +} + +BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(sock_map_sk_is_suitable(sops->sk) && + sock_map_op_okay(sops))) + return sock_map_update_common(map, *(u32 *)key, sops->sk, + flags); + return -EOPNOTSUPP; +} + +const struct bpf_func_proto bpf_sock_map_update_proto = { + .func = bpf_sock_map_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, + struct bpf_map *, map, u32, key, u64, flags) +{ + struct sock *sk; + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + sk = __sock_map_lookup_elem(map, key); + if (unlikely(!sk || !sock_map_redirect_allowed(sk))) + return SK_DROP; + if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) + return SK_DROP; + + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); + return SK_PASS; +} + +const struct bpf_func_proto bpf_sk_redirect_map_proto = { + .func = bpf_sk_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, + struct bpf_map *, map, u32, key, u64, flags) +{ + struct sock *sk; + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + sk = __sock_map_lookup_elem(map, key); + if (unlikely(!sk || !sock_map_redirect_allowed(sk))) + return SK_DROP; + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) + return SK_DROP; + if (sk_is_vsock(sk)) + return SK_DROP; + + msg->flags = flags; + msg->sk_redir = sk; + return SK_PASS; +} + +const struct bpf_func_proto bpf_msg_redirect_map_proto = { + .func = bpf_msg_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +struct sock_map_seq_info { + struct bpf_map *map; + struct sock *sk; + u32 index; +}; + +struct bpf_iter__sockmap { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(void *, key); + __bpf_md_ptr(struct sock *, sk); +}; + +DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, + struct sock *sk) + +static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) +{ + if (unlikely(info->index >= info->map->max_entries)) + return NULL; + + info->sk = __sock_map_lookup_elem(info->map, info->index); + + /* can't return sk directly, since that might be NULL */ + return info; +} + +static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + struct sock_map_seq_info *info = seq->private; + + if (*pos == 0) + ++*pos; + + /* pairs with sock_map_seq_stop */ + rcu_read_lock(); + return sock_map_seq_lookup_elem(info); +} + +static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) + __must_hold(rcu) +{ + struct sock_map_seq_info *info = seq->private; + + ++*pos; + ++info->index; + + return sock_map_seq_lookup_elem(info); +} + +static int sock_map_seq_show(struct seq_file *seq, void *v) + __must_hold(rcu) +{ + struct sock_map_seq_info *info = seq->private; + struct bpf_iter__sockmap ctx = {}; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, !v); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (v) { + ctx.key = &info->index; + ctx.sk = info->sk; + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static void sock_map_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + if (!v) + (void)sock_map_seq_show(seq, NULL); + + /* pairs with sock_map_seq_start */ + rcu_read_unlock(); +} + +static const struct seq_operations sock_map_seq_ops = { + .start = sock_map_seq_start, + .next = sock_map_seq_next, + .stop = sock_map_seq_stop, + .show = sock_map_seq_show, +}; + +static int sock_map_init_seq_private(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct sock_map_seq_info *info = priv_data; + + bpf_map_inc_with_uref(aux->map); + info->map = aux->map; + return 0; +} + +static void sock_map_fini_seq_private(void *priv_data) +{ + struct sock_map_seq_info *info = priv_data; + + bpf_map_put_with_uref(info->map); +} + +static u64 sock_map_mem_usage(const struct bpf_map *map) +{ + u64 usage = sizeof(struct bpf_stab); + + usage += (u64)map->max_entries * sizeof(struct sock *); + return usage; +} + +static const struct bpf_iter_seq_info sock_map_iter_seq_info = { + .seq_ops = &sock_map_seq_ops, + .init_seq_private = sock_map_init_seq_private, + .fini_seq_private = sock_map_fini_seq_private, + .seq_priv_size = sizeof(struct sock_map_seq_info), +}; + +BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab) +const struct bpf_map_ops sock_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = sock_map_alloc, + .map_free = sock_map_free, + .map_get_next_key = sock_map_get_next_key, + .map_lookup_elem_sys_only = sock_map_lookup_sys, + .map_update_elem = sock_map_update_elem, + .map_delete_elem = sock_map_delete_elem, + .map_lookup_elem = sock_map_lookup, + .map_release_uref = sock_map_release_progs, + .map_check_btf = map_check_no_btf, + .map_mem_usage = sock_map_mem_usage, + .map_btf_id = &sock_map_btf_ids[0], + .iter_seq_info = &sock_map_iter_seq_info, +}; + +struct bpf_shtab_elem { + struct rcu_head rcu; + u32 hash; + struct sock *sk; + struct hlist_node node; + u8 key[]; +}; + +struct bpf_shtab_bucket { + struct hlist_head head; + spinlock_t lock; +}; + +struct bpf_shtab { + struct bpf_map map; + struct bpf_shtab_bucket *buckets; + u32 buckets_num; + u32 elem_size; + struct sk_psock_progs progs; + atomic_t count; +}; + +static inline u32 sock_hash_bucket_hash(const void *key, u32 len) +{ + return jhash(key, len, 0); +} + +static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab, + u32 hash) +{ + return &htab->buckets[hash & (htab->buckets_num - 1)]; +} + +static struct bpf_shtab_elem * +sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, + u32 key_size) +{ + struct bpf_shtab_elem *elem; + + hlist_for_each_entry_rcu(elem, head, node) { + if (elem->hash == hash && + !memcmp(&elem->key, key, key_size)) + return elem; + } + + return NULL; +} + +static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + u32 key_size = map->key_size, hash; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + + return elem ? elem->sk : NULL; +} + +static void sock_hash_free_elem(struct bpf_shtab *htab, + struct bpf_shtab_elem *elem) +{ + atomic_dec(&htab->count); + kfree_rcu(elem, rcu); +} + +static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, + void *link_raw) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + struct bpf_shtab_elem *elem_probe, *elem = link_raw; + struct bpf_shtab_bucket *bucket; + + WARN_ON_ONCE(!rcu_read_lock_held()); + bucket = sock_hash_select_bucket(htab, elem->hash); + + /* elem may be deleted in parallel from the map, but access here + * is okay since it's going away only after RCU grace period. + * However, we need to check whether it's still present. + */ + spin_lock_bh(&bucket->lock); + elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, + elem->key, map->key_size); + if (elem_probe && elem_probe == elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + } + spin_unlock_bh(&bucket->lock); +} + +static long sock_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + u32 hash, key_size = map->key_size; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; + int ret = -ENOENT; + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + + spin_lock_bh(&bucket->lock); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + if (elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + ret = 0; + } + spin_unlock_bh(&bucket->lock); + return ret; +} + +static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, + void *key, u32 key_size, + u32 hash, struct sock *sk, + struct bpf_shtab_elem *old) +{ + struct bpf_shtab_elem *new; + + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + if (!old) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + } + + new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); + if (!new) { + atomic_dec(&htab->count); + return ERR_PTR(-ENOMEM); + } + memcpy(new->key, key, key_size); + new->sk = sk; + new->hash = hash; + return new; +} + +static int sock_hash_update_common(struct bpf_map *map, void *key, + struct sock *sk, u64 flags) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + u32 key_size = map->key_size, hash; + struct bpf_shtab_elem *elem, *elem_new; + struct bpf_shtab_bucket *bucket; + struct sk_psock_link *link; + struct sk_psock *psock; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + link = sk_psock_init_link(); + if (!link) + return -ENOMEM; + + ret = sock_map_link(map, sk); + if (ret < 0) + goto out_free; + + psock = sk_psock(sk); + WARN_ON_ONCE(!psock); + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + + spin_lock_bh(&bucket->lock); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + if (elem && flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out_unlock; + } else if (!elem && flags == BPF_EXIST) { + ret = -ENOENT; + goto out_unlock; + } + + elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); + if (IS_ERR(elem_new)) { + ret = PTR_ERR(elem_new); + goto out_unlock; + } + + sock_map_add_link(psock, link, map, elem_new); + /* Add new element to the head of the list, so that + * concurrent search will find it before old elem. + */ + hlist_add_head_rcu(&elem_new->node, &bucket->head); + if (elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + } + spin_unlock_bh(&bucket->lock); + return 0; +out_unlock: + spin_unlock_bh(&bucket->lock); + sk_psock_put(sk, psock); +out_free: + sk_psock_free_link(link); + return ret; +} + +static int sock_hash_get_next_key(struct bpf_map *map, void *key, + void *key_next) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + struct bpf_shtab_elem *elem, *elem_next; + u32 hash, key_size = map->key_size; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first_elem; + hash = sock_hash_bucket_hash(key, key_size); + head = &sock_hash_select_bucket(htab, hash)->head; + elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); + if (!elem) + goto find_first_elem; + + elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), + struct bpf_shtab_elem, node); + if (elem_next) { + memcpy(key_next, elem_next->key, key_size); + return 0; + } + + i = hash & (htab->buckets_num - 1); + i++; +find_first_elem: + for (; i < htab->buckets_num; i++) { + head = &sock_hash_select_bucket(htab, i)->head; + elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), + struct bpf_shtab_elem, node); + if (elem_next) { + memcpy(key_next, elem_next->key, key_size); + return 0; + } + } + + return -ENOENT; +} + +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) +{ + struct bpf_shtab *htab; + int i, err; + + if (attr->max_entries == 0 || + attr->key_size == 0 || + (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + if (attr->key_size > MAX_BPF_STACK) + return ERR_PTR(-E2BIG); + + htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); + if (!htab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&htab->map, attr); + + htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); + htab->elem_size = sizeof(struct bpf_shtab_elem) + + round_up(htab->map.key_size, 8); + if (htab->buckets_num == 0 || + htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) { + err = -EINVAL; + goto free_htab; + } + + htab->buckets = bpf_map_area_alloc(htab->buckets_num * + sizeof(struct bpf_shtab_bucket), + htab->map.numa_node); + if (!htab->buckets) { + err = -ENOMEM; + goto free_htab; + } + + for (i = 0; i < htab->buckets_num; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + spin_lock_init(&htab->buckets[i].lock); + } + + return &htab->map; +free_htab: + bpf_map_area_free(htab); + return ERR_PTR(err); +} + +static void sock_hash_free(struct bpf_map *map) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + struct bpf_shtab_bucket *bucket; + struct hlist_head unlink_list; + struct bpf_shtab_elem *elem; + struct hlist_node *node; + int i; + + /* After the sync no updates or deletes will be in-flight so it + * is safe to walk map and remove entries without risking a race + * in EEXIST update case. + */ + synchronize_rcu(); + for (i = 0; i < htab->buckets_num; i++) { + bucket = sock_hash_select_bucket(htab, i); + + /* We are racing with sock_hash_delete_from_link to + * enter the spin-lock critical section. Every socket on + * the list is still linked to sockhash. Since link + * exists, psock exists and holds a ref to socket. That + * lets us to grab a socket ref too. + */ + spin_lock_bh(&bucket->lock); + hlist_for_each_entry(elem, &bucket->head, node) + sock_hold(elem->sk); + hlist_move_list(&bucket->head, &unlink_list); + spin_unlock_bh(&bucket->lock); + + /* Process removed entries out of atomic context to + * block for socket lock before deleting the psock's + * link to sockhash. + */ + hlist_for_each_entry_safe(elem, node, &unlink_list, node) { + hlist_del(&elem->node); + lock_sock(elem->sk); + rcu_read_lock(); + sock_map_unref(elem->sk, elem); + rcu_read_unlock(); + release_sock(elem->sk); + sock_put(elem->sk); + sock_hash_free_elem(htab, elem); + } + cond_resched(); + } + + /* wait for psock readers accessing its map link */ + synchronize_rcu(); + + bpf_map_area_free(htab->buckets); + bpf_map_area_free(htab); +} + +static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) +{ + struct sock *sk; + + if (map->value_size != sizeof(u64)) + return ERR_PTR(-ENOSPC); + + sk = __sock_hash_lookup_elem(map, key); + if (!sk) + return ERR_PTR(-ENOENT); + + __sock_gen_cookie(sk); + return &sk->sk_cookie; +} + +static void *sock_hash_lookup(struct bpf_map *map, void *key) +{ + struct sock *sk; + + sk = __sock_hash_lookup_elem(map, key); + if (!sk) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; +} + +static void sock_hash_release_progs(struct bpf_map *map) +{ + psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs); +} + +BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(sock_map_sk_is_suitable(sops->sk) && + sock_map_op_okay(sops))) + return sock_hash_update_common(map, key, sops->sk, flags); + return -EOPNOTSUPP; +} + +const struct bpf_func_proto bpf_sock_hash_update_proto = { + .func = bpf_sock_hash_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct sock *sk; + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + sk = __sock_hash_lookup_elem(map, key); + if (unlikely(!sk || !sock_map_redirect_allowed(sk))) + return SK_DROP; + if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) + return SK_DROP; + + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); + return SK_PASS; +} + +const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct sock *sk; + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + sk = __sock_hash_lookup_elem(map, key); + if (unlikely(!sk || !sock_map_redirect_allowed(sk))) + return SK_DROP; + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) + return SK_DROP; + if (sk_is_vsock(sk)) + return SK_DROP; + + msg->flags = flags; + msg->sk_redir = sk; + return SK_PASS; +} + +const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +struct sock_hash_seq_info { + struct bpf_map *map; + struct bpf_shtab *htab; + u32 bucket_id; +}; + +static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, + struct bpf_shtab_elem *prev_elem) +{ + const struct bpf_shtab *htab = info->htab; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; + struct hlist_node *node; + + /* try to find next elem in the same bucket */ + if (prev_elem) { + node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); + if (elem) + return elem; + + /* no more elements, continue in the next bucket */ + info->bucket_id++; + } + + for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { + bucket = &htab->buckets[info->bucket_id]; + node = rcu_dereference(hlist_first_rcu(&bucket->head)); + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); + if (elem) + return elem; + } + + return NULL; +} + +static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + struct sock_hash_seq_info *info = seq->private; + + if (*pos == 0) + ++*pos; + + /* pairs with sock_hash_seq_stop */ + rcu_read_lock(); + return sock_hash_seq_find_next(info, NULL); +} + +static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) + __must_hold(rcu) +{ + struct sock_hash_seq_info *info = seq->private; + + ++*pos; + return sock_hash_seq_find_next(info, v); +} + +static int sock_hash_seq_show(struct seq_file *seq, void *v) + __must_hold(rcu) +{ + struct sock_hash_seq_info *info = seq->private; + struct bpf_iter__sockmap ctx = {}; + struct bpf_shtab_elem *elem = v; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, !elem); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + ctx.key = elem->key; + ctx.sk = elem->sk; + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static void sock_hash_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + if (!v) + (void)sock_hash_seq_show(seq, NULL); + + /* pairs with sock_hash_seq_start */ + rcu_read_unlock(); +} + +static const struct seq_operations sock_hash_seq_ops = { + .start = sock_hash_seq_start, + .next = sock_hash_seq_next, + .stop = sock_hash_seq_stop, + .show = sock_hash_seq_show, +}; + +static int sock_hash_init_seq_private(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct sock_hash_seq_info *info = priv_data; + + bpf_map_inc_with_uref(aux->map); + info->map = aux->map; + info->htab = container_of(aux->map, struct bpf_shtab, map); + return 0; +} + +static void sock_hash_fini_seq_private(void *priv_data) +{ + struct sock_hash_seq_info *info = priv_data; + + bpf_map_put_with_uref(info->map); +} + +static u64 sock_hash_mem_usage(const struct bpf_map *map) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + u64 usage = sizeof(*htab); + + usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket); + usage += atomic_read(&htab->count) * (u64)htab->elem_size; + return usage; +} + +static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { + .seq_ops = &sock_hash_seq_ops, + .init_seq_private = sock_hash_init_seq_private, + .fini_seq_private = sock_hash_fini_seq_private, + .seq_priv_size = sizeof(struct sock_hash_seq_info), +}; + +BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab) +const struct bpf_map_ops sock_hash_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = sock_hash_alloc, + .map_free = sock_hash_free, + .map_get_next_key = sock_hash_get_next_key, + .map_update_elem = sock_map_update_elem, + .map_delete_elem = sock_hash_delete_elem, + .map_lookup_elem = sock_hash_lookup, + .map_lookup_elem_sys_only = sock_hash_lookup_sys, + .map_release_uref = sock_hash_release_progs, + .map_check_btf = map_check_no_btf, + .map_mem_usage = sock_hash_mem_usage, + .map_btf_id = &sock_hash_map_btf_ids[0], + .iter_seq_info = &sock_hash_iter_seq_info, +}; + +static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) +{ + switch (map->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + return &container_of(map, struct bpf_stab, map)->progs; + case BPF_MAP_TYPE_SOCKHASH: + return &container_of(map, struct bpf_shtab, map)->progs; + default: + break; + } + + return NULL; +} + +static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + struct bpf_link ***plink, u32 which) +{ + struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **cur_pprog; + struct bpf_link **cur_plink; + + if (!progs) + return -EOPNOTSUPP; + + switch (which) { + case BPF_SK_MSG_VERDICT: + cur_pprog = &progs->msg_parser; + cur_plink = &progs->msg_parser_link; + break; +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + case BPF_SK_SKB_STREAM_PARSER: + cur_pprog = &progs->stream_parser; + cur_plink = &progs->stream_parser_link; + break; +#endif + case BPF_SK_SKB_STREAM_VERDICT: + if (progs->skb_verdict) + return -EBUSY; + cur_pprog = &progs->stream_verdict; + cur_plink = &progs->stream_verdict_link; + break; + case BPF_SK_SKB_VERDICT: + if (progs->stream_verdict) + return -EBUSY; + cur_pprog = &progs->skb_verdict; + cur_plink = &progs->skb_verdict_link; + break; + default: + return -EOPNOTSUPP; + } + + *pprog = cur_pprog; + if (plink) + *plink = cur_plink; + return 0; +} + +/* Handle the following four cases: + * prog_attach: prog != NULL, old == NULL, link == NULL + * prog_detach: prog == NULL, old != NULL, link == NULL + * link_attach: prog != NULL, old == NULL, link != NULL + * link_detach: prog == NULL, old != NULL, link != NULL + */ +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, struct bpf_link *link, + u32 which) +{ + struct bpf_prog **pprog; + struct bpf_link **plink; + int ret; + + ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); + if (ret) + return ret; + + /* for prog_attach/prog_detach/link_attach, return error if a bpf_link + * exists for that prog. + */ + if ((!link || prog) && *plink) + return -EBUSY; + + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (!ret) + *plink = NULL; + } else { + psock_set_prog(pprog, prog); + if (link) + *plink = link; + } + + return ret; +} + +int sock_map_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_cnt = 0, flags = 0; + struct bpf_prog **pprog; + struct bpf_prog *prog; + struct bpf_map *map; + u32 id = 0; + int ret; + + if (attr->query.query_flags) + return -EINVAL; + + CLASS(fd, f)(attr->target_fd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + rcu_read_lock(); + + ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); + if (ret) + goto end; + + prog = *pprog; + prog_cnt = !prog ? 0 : 1; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + goto end; + + /* we do not hold the refcnt, the bpf prog may be released + * asynchronously and the id would be set to 0. + */ + id = data_race(prog->aux->id); + if (id == 0) + prog_cnt = 0; + +end: + rcu_read_unlock(); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || + (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || + copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + ret = -EFAULT; + + return ret; +} + +static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) +{ + switch (link->map->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + return sock_map_delete_from_link(link->map, sk, + link->link_raw); + case BPF_MAP_TYPE_SOCKHASH: + return sock_hash_delete_from_link(link->map, sk, + link->link_raw); + default: + break; + } +} + +static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_link *link; + + while ((link = sk_psock_link_pop(psock))) { + sock_map_unlink(sk, link); + sk_psock_free_link(link); + } +} + +void sock_map_unhash(struct sock *sk) +{ + void (*saved_unhash)(struct sock *sk); + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + saved_unhash = READ_ONCE(sk->sk_prot)->unhash; + } else { + saved_unhash = psock->saved_unhash; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + } + if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) + return; + if (saved_unhash) + saved_unhash(sk); +} +EXPORT_SYMBOL_GPL(sock_map_unhash); + +void sock_map_destroy(struct sock *sk) +{ + void (*saved_destroy)(struct sock *sk); + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock_get(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + saved_destroy = READ_ONCE(sk->sk_prot)->destroy; + } else { + saved_destroy = psock->saved_destroy; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + sk_psock_put(sk, psock); + } + if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) + return; + if (saved_destroy) + saved_destroy(sk); +} +EXPORT_SYMBOL_GPL(sock_map_destroy); + +void sock_map_close(struct sock *sk, long timeout) +{ + void (*saved_close)(struct sock *sk, long timeout); + struct sk_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + saved_close = psock->saved_close; + sock_map_remove_links(sk, psock); + psock = sk_psock_get(sk); + if (unlikely(!psock)) + goto no_psock; + rcu_read_unlock(); + sk_psock_stop(psock); + release_sock(sk); + cancel_delayed_work_sync(&psock->work); + sk_psock_put(sk, psock); + } else { + saved_close = READ_ONCE(sk->sk_prot)->close; +no_psock: + rcu_read_unlock(); + release_sock(sk); + } + + /* Make sure we do not recurse. This is a bug. + * Leak the socket instead of crashing on a stack overflow. + */ + if (WARN_ON_ONCE(saved_close == sock_map_close)) + return; + saved_close(sk, timeout); +} +EXPORT_SYMBOL_GPL(sock_map_close); + +struct sockmap_link { + struct bpf_link link; + struct bpf_map *map; +}; + +static void sock_map_link_release(struct bpf_link *link) +{ + struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + + mutex_lock(&sockmap_mutex); + if (!sockmap_link->map) + goto out; + + WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, + link->attach_type)); + + bpf_map_put_with_uref(sockmap_link->map); + sockmap_link->map = NULL; +out: + mutex_unlock(&sockmap_mutex); +} + +static int sock_map_link_detach(struct bpf_link *link) +{ + sock_map_link_release(link); + return 0; +} + +static void sock_map_link_dealloc(struct bpf_link *link) +{ + kfree(link); +} + +/* Handle the following two cases: + * case 1: link != NULL, prog != NULL, old != NULL + * case 2: link != NULL, prog != NULL, old == NULL + */ +static int sock_map_link_update_prog(struct bpf_link *link, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + struct bpf_prog **pprog, *old_link_prog; + struct bpf_link **plink; + int ret = 0; + + mutex_lock(&sockmap_mutex); + + /* If old prog is not NULL, ensure old prog is the same as link->prog. */ + if (old && link->prog != old) { + ret = -EPERM; + goto out; + } + /* Ensure link->prog has the same type/attach_type as the new prog. */ + if (link->prog->type != prog->type || + link->prog->expected_attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + if (!sockmap_link->map) { + ret = -ENOLINK; + goto out; + } + + ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, + link->attach_type); + if (ret) + goto out; + + /* return error if the stored bpf_link does not match the incoming bpf_link. */ + if (link != *plink) { + ret = -EBUSY; + goto out; + } + + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (ret) + goto out; + } else { + psock_set_prog(pprog, prog); + } + + bpf_prog_inc(prog); + old_link_prog = xchg(&link->prog, prog); + bpf_prog_put(old_link_prog); + +out: + mutex_unlock(&sockmap_mutex); + return ret; +} + +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) +{ + u32 map_id = 0; + + mutex_lock(&sockmap_mutex); + if (sockmap_link->map) + map_id = sockmap_link->map->id; + mutex_unlock(&sockmap_mutex); + return map_id; +} + +static int sock_map_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + info->sockmap.map_id = map_id; + info->sockmap.attach_type = link->attach_type; + return 0; +} + +static void sock_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + seq_printf(seq, "map_id:\t%u\n", map_id); + seq_printf(seq, "attach_type:\t%u\n", link->attach_type); +} + +static const struct bpf_link_ops sock_map_link_ops = { + .release = sock_map_link_release, + .dealloc = sock_map_link_dealloc, + .detach = sock_map_link_detach, + .update_prog = sock_map_link_update_prog, + .fill_link_info = sock_map_link_fill_info, + .show_fdinfo = sock_map_link_show_fdinfo, +}; + +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct sockmap_link *sockmap_link; + enum bpf_attach_type attach_type; + struct bpf_map *map; + int ret; + + if (attr->link_create.flags) + return -EINVAL; + + map = bpf_map_get_with_uref(attr->link_create.target_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { + ret = -EINVAL; + goto out; + } + + sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); + if (!sockmap_link) { + ret = -ENOMEM; + goto out; + } + + attach_type = attr->link_create.attach_type; + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog, + attach_type); + sockmap_link->map = map; + + ret = bpf_link_prime(&sockmap_link->link, &link_primer); + if (ret) { + kfree(sockmap_link); + goto out; + } + + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); + mutex_unlock(&sockmap_mutex); + if (ret) { + bpf_link_cleanup(&link_primer); + goto out; + } + + /* Increase refcnt for the prog since when old prog is replaced with + * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. + * + * Actually, we do not need to increase refcnt for the prog since bpf_link + * will hold a reference. But in order to have less complexity w.r.t. + * replacing/setting prog, let us increase the refcnt to make things simpler. + */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out: + bpf_map_put_with_uref(map); + return ret; +} + +static int sock_map_iter_attach_target(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && + map->map_type != BPF_MAP_TYPE_SOCKHASH) + goto put_map; + + if (prog->aux->max_rdonly_access > map->key_size) { + err = -EACCES; + goto put_map; + } + + aux->map = map; + return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); +} + +static struct bpf_iter_reg sock_map_iter_reg = { + .target = "sockmap", + .attach_target = sock_map_iter_attach_target, + .detach_target = sock_map_iter_detach_target, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__sockmap, key), + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, + { offsetof(struct bpf_iter__sockmap, sk), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init bpf_sockmap_iter_init(void) +{ + sock_map_iter_reg.ctx_arg_info[1].btf_id = + btf_sock_ids[BTF_SOCK_TYPE_SOCK]; + return bpf_iter_reg_target(&sock_map_iter_reg); +} +late_initcall(bpf_sockmap_iter_init); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c new file mode 100644 index 000000000000..4211710393a8 --- /dev/null +++ b/net/core/sock_reuseport.c @@ -0,0 +1,748 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * To speed up listener socket lookup, create an array to store all sockets + * listening on the same port. This allows a decision to be made after finding + * the first socket. An optional BPF program can also be configured for + * selecting the socket index from the array of available sockets. + */ + +#include <net/ip.h> +#include <net/sock_reuseport.h> +#include <linux/bpf.h> +#include <linux/idr.h> +#include <linux/filter.h> +#include <linux/rcupdate.h> + +#define INIT_SOCKS 128 + +DEFINE_SPINLOCK(reuseport_lock); + +static DEFINE_IDA(reuseport_ida); +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany); + +void reuseport_has_conns_set(struct sock *sk) +{ + struct sock_reuseport *reuse; + + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (likely(reuse)) + reuse->has_conns = 1; + spin_unlock_bh(&reuseport_lock); +} +EXPORT_SYMBOL(reuseport_has_conns_set); + +static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse) +{ + /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ + WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1); +} + +static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse) +{ + /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ + WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1); +} + +static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) +{ + if (sk->sk_incoming_cpu >= 0) + __reuseport_get_incoming_cpu(reuse); +} + +static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) +{ + if (sk->sk_incoming_cpu >= 0) + __reuseport_put_incoming_cpu(reuse); +} + +void reuseport_update_incoming_cpu(struct sock *sk, int val) +{ + struct sock_reuseport *reuse; + int old_sk_incoming_cpu; + + if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) { + /* Paired with REAE_ONCE() in sk_incoming_cpu_update() + * and compute_score(). + */ + WRITE_ONCE(sk->sk_incoming_cpu, val); + return; + } + + spin_lock_bh(&reuseport_lock); + + /* This must be done under reuseport_lock to avoid a race with + * reuseport_grow(), which accesses sk->sk_incoming_cpu without + * lock_sock() when detaching a shutdown()ed sk. + * + * Paired with READ_ONCE() in reuseport_select_sock_by_hash(). + */ + old_sk_incoming_cpu = sk->sk_incoming_cpu; + WRITE_ONCE(sk->sk_incoming_cpu, val); + + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + + /* reuseport_grow() has detached a closed sk. */ + if (!reuse) + goto out; + + if (old_sk_incoming_cpu < 0 && val >= 0) + __reuseport_get_incoming_cpu(reuse); + else if (old_sk_incoming_cpu >= 0 && val < 0) + __reuseport_put_incoming_cpu(reuse); + +out: + spin_unlock_bh(&reuseport_lock); +} + +static int reuseport_sock_index(struct sock *sk, + const struct sock_reuseport *reuse, + bool closed) +{ + int left, right; + + if (!closed) { + left = 0; + right = reuse->num_socks; + } else { + left = reuse->max_socks - reuse->num_closed_socks; + right = reuse->max_socks; + } + + for (; left < right; left++) + if (reuse->socks[left] == sk) + return left; + return -1; +} + +static void __reuseport_add_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->num_socks] = sk; + /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ + smp_wmb(); + reuse->num_socks++; + reuseport_get_incoming_cpu(sk, reuse); +} + +static bool __reuseport_detach_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, false); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; + reuse->num_socks--; + reuseport_put_incoming_cpu(sk, reuse); + + return true; +} + +static void __reuseport_add_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1); + reuseport_get_incoming_cpu(sk, reuse); +} + +static bool __reuseport_detach_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, true); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1); + reuseport_put_incoming_cpu(sk, reuse); + + return true; +} + +static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) +{ + struct sock_reuseport *reuse; + + reuse = kzalloc(struct_size(reuse, socks, max_socks), GFP_ATOMIC); + if (!reuse) + return NULL; + + reuse->max_socks = max_socks; + + RCU_INIT_POINTER(reuse->prog, NULL); + return reuse; +} + +int reuseport_alloc(struct sock *sk, bool bind_inany) +{ + struct sock_reuseport *reuse; + int id, ret = 0; + + /* bh lock used since this function call may precede hlist lock in + * soft irq of receive path or setsockopt from process context + */ + spin_lock_bh(&reuseport_lock); + + /* Allocation attempts can occur concurrently via the setsockopt path + * and the bind/hash path. Nothing to do when we lose the race. + */ + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (reuse) { + if (reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + ret = reuseport_resurrect(sk, reuse, NULL, bind_inany); + goto out; + } + + /* Only set reuse->bind_inany if the bind_inany is true. + * Otherwise, it will overwrite the reuse->bind_inany + * which was set by the bind/hash path. + */ + if (bind_inany) + reuse->bind_inany = bind_inany; + goto out; + } + + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) { + ret = -ENOMEM; + goto out; + } + + id = ida_alloc(&reuseport_ida, GFP_ATOMIC); + if (id < 0) { + kfree(reuse); + ret = id; + goto out; + } + + reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; + reuse->socks[0] = sk; + reuse->num_socks = 1; + reuseport_get_incoming_cpu(sk, reuse); + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + +out: + spin_unlock_bh(&reuseport_lock); + + return ret; +} +EXPORT_SYMBOL(reuseport_alloc); + +static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) +{ + struct sock_reuseport *more_reuse; + u32 more_socks_size, i; + + more_socks_size = reuse->max_socks * 2U; + if (more_socks_size > U16_MAX) { + if (reuse->num_closed_socks) { + /* Make room by removing a closed sk. + * The child has already been migrated. + * Only reqsk left at this point. + */ + struct sock *sk; + + sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL); + __reuseport_detach_closed_sock(sk, reuse); + + return reuse; + } + + return NULL; + } + + more_reuse = __reuseport_alloc(more_socks_size); + if (!more_reuse) + return NULL; + + more_reuse->num_socks = reuse->num_socks; + more_reuse->num_closed_socks = reuse->num_closed_socks; + more_reuse->prog = reuse->prog; + more_reuse->reuseport_id = reuse->reuseport_id; + more_reuse->bind_inany = reuse->bind_inany; + more_reuse->has_conns = reuse->has_conns; + more_reuse->incoming_cpu = reuse->incoming_cpu; + + memcpy(more_reuse->socks, reuse->socks, + reuse->num_socks * sizeof(struct sock *)); + memcpy(more_reuse->socks + + (more_reuse->max_socks - more_reuse->num_closed_socks), + reuse->socks + (reuse->max_socks - reuse->num_closed_socks), + reuse->num_closed_socks * sizeof(struct sock *)); + more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); + + for (i = 0; i < reuse->max_socks; ++i) + rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, + more_reuse); + + /* Note: we use kfree_rcu here instead of reuseport_free_rcu so + * that reuse and more_reuse can temporarily share a reference + * to prog. + */ + kfree_rcu(reuse, rcu); + return more_reuse; +} + +static void reuseport_free_rcu(struct rcu_head *head) +{ + struct sock_reuseport *reuse; + + reuse = container_of(head, struct sock_reuseport, rcu); + sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); + ida_free(&reuseport_ida, reuse->reuseport_id); + kfree(reuse); +} + +/** + * reuseport_add_sock - Add a socket to the reuseport group of another. + * @sk: New socket to add to the group. + * @sk2: Socket belonging to the existing reuseport group. + * @bind_inany: Whether or not the group is bound to a local INANY address. + * + * May return ENOMEM and not add socket to group under memory pressure. + */ +int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) +{ + struct sock_reuseport *old_reuse, *reuse; + + if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { + int err = reuseport_alloc(sk2, bind_inany); + + if (err) + return err; + } + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (old_reuse && old_reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany); + + spin_unlock_bh(&reuseport_lock); + return err; + } + + if (old_reuse && old_reuse->num_socks != 1) { + spin_unlock_bh(&reuseport_lock); + return -EBUSY; + } + + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return -ENOMEM; + } + } + + __reuseport_add_sock(sk, reuse); + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + spin_unlock_bh(&reuseport_lock); + + if (old_reuse) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); + return 0; +} +EXPORT_SYMBOL(reuseport_add_sock); + +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany) +{ + if (old_reuse == reuse) { + /* If sk was in the same reuseport group, just pop sk out of + * the closed section and push sk into the listening section. + */ + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, old_reuse); + return 0; + } + + if (!reuse) { + /* In bind()/listen() path, we cannot carry over the eBPF prog + * for the shutdown()ed socket. In setsockopt() path, we should + * not change the eBPF prog of listening sockets by attaching a + * prog to the shutdown()ed socket. Thus, we will allocate a new + * reuseport group and detach sk from the old group. + */ + int id; + + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) + return -ENOMEM; + + id = ida_alloc(&reuseport_ida, GFP_ATOMIC); + if (id < 0) { + kfree(reuse); + return id; + } + + reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; + } else { + /* Move sk from the old group to the new one if + * - all the other listeners in the old group were close()d or + * shutdown()ed, and then sk2 has listen()ed on the same port + * OR + * - sk listen()ed without bind() (or with autobind), was + * shutdown()ed, and then listen()s on another port which + * sk2 listen()s on. + */ + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) + return -ENOMEM; + } + } + + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, reuse); + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + if (old_reuse->num_socks + old_reuse->num_closed_socks == 0) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); + + return 0; +} + +void reuseport_detach_sock(struct sock *sk) +{ + struct sock_reuseport *reuse; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + + /* reuseport_grow() has detached a closed sk */ + if (!reuse) + goto out; + + /* Notify the bpf side. The sk may be added to a sockarray + * map. If so, sockarray logic will remove it from the map. + * + * Other bpf map types that work with reuseport, like sockmap, + * don't need an explicit callback from here. They override sk + * unhash/close ops to remove the sk from the map before we + * get to this point. + */ + bpf_sk_reuseport_detach(sk); + + rcu_assign_pointer(sk->sk_reuseport_cb, NULL); + + if (!__reuseport_detach_closed_sock(sk, reuse)) + __reuseport_detach_sock(sk, reuse); + + if (reuse->num_socks + reuse->num_closed_socks == 0) + call_rcu(&reuse->rcu, reuseport_free_rcu); + +out: + spin_unlock_bh(&reuseport_lock); +} +EXPORT_SYMBOL(reuseport_detach_sock); + +void reuseport_stop_listen_sock(struct sock *sk) +{ + if (sk->sk_protocol == IPPROTO_TCP) { + struct sock_reuseport *reuse; + struct bpf_prog *prog; + + spin_lock_bh(&reuseport_lock); + + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); + + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) || + (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) { + /* Migration capable, move sk from the listening section + * to the closed section. + */ + bpf_sk_reuseport_detach(sk); + + __reuseport_detach_sock(sk, reuse); + __reuseport_add_closed_sock(sk, reuse); + + spin_unlock_bh(&reuseport_lock); + return; + } + + spin_unlock_bh(&reuseport_lock); + } + + /* Not capable to do migration, detach immediately */ + reuseport_detach_sock(sk); +} +EXPORT_SYMBOL(reuseport_stop_listen_sock); + +static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, + struct bpf_prog *prog, struct sk_buff *skb, + int hdr_len) +{ + struct sk_buff *nskb = NULL; + u32 index; + + if (skb_shared(skb)) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return NULL; + skb = nskb; + } + + /* temporarily advance data past protocol header */ + if (!pskb_pull(skb, hdr_len)) { + kfree_skb(nskb); + return NULL; + } + index = bpf_prog_run_save_cb(prog, skb); + __skb_push(skb, hdr_len); + + consume_skb(nskb); + + if (index >= socks) + return NULL; + + return reuse->socks[index]; +} + +static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, + u32 hash, u16 num_socks) +{ + struct sock *first_valid_sk = NULL; + int i, j; + + i = j = reciprocal_scale(hash, num_socks); + do { + struct sock *sk = reuse->socks[i]; + + if (sk->sk_state != TCP_ESTABLISHED) { + /* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */ + if (!READ_ONCE(reuse->incoming_cpu)) + return sk; + + /* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */ + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) + return sk; + + if (!first_valid_sk) + first_valid_sk = sk; + } + + i++; + if (i >= num_socks) + i = 0; + } while (i != j); + + return first_valid_sk; +} + +/** + * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. + * @sk: First socket in the group. + * @hash: When no BPF filter is available, use this hash to select. + * @skb: skb to run through BPF filter. + * @hdr_len: BPF filter expects skb data pointer at payload data. If + * the skb does not yet point at the payload, this parameter represents + * how far the pointer needs to advance to reach the payload. + * Returns a socket that should receive the packet (or NULL on error). + */ +struct sock *reuseport_select_sock(struct sock *sk, + u32 hash, + struct sk_buff *skb, + int hdr_len) +{ + struct sock_reuseport *reuse; + struct bpf_prog *prog; + struct sock *sk2 = NULL; + u16 socks; + + rcu_read_lock(); + reuse = rcu_dereference(sk->sk_reuseport_cb); + + /* if memory allocation failed or add call is not yet complete */ + if (!reuse) + goto out; + + prog = rcu_dereference(reuse->prog); + socks = READ_ONCE(reuse->num_socks); + if (likely(socks)) { + /* paired with smp_wmb() in __reuseport_add_sock() */ + smp_rmb(); + + if (!prog || !skb) + goto select_by_hash; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash); + else + sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); + +select_by_hash: + /* no bpf or invalid bpf result: fall back to hash usage */ + if (!sk2) + sk2 = reuseport_select_sock_by_hash(reuse, hash, socks); + } + +out: + rcu_read_unlock(); + return sk2; +} +EXPORT_SYMBOL(reuseport_select_sock); + +/** + * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group. + * @sk: close()ed or shutdown()ed socket in the group. + * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or + * NEW_SYN_RECV request socket during 3WHS. + * @skb: skb to run through BPF filter. + * Returns a socket (with sk_refcnt +1) that should accept the child socket + * (or NULL on error). + */ +struct sock *reuseport_migrate_sock(struct sock *sk, + struct sock *migrating_sk, + struct sk_buff *skb) +{ + struct sock_reuseport *reuse; + struct sock *nsk = NULL; + bool allocated = false; + struct bpf_prog *prog; + u16 socks; + u32 hash; + + rcu_read_lock(); + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (!reuse) + goto out; + + socks = READ_ONCE(reuse->num_socks); + if (unlikely(!socks)) + goto failure; + + /* paired with smp_wmb() in __reuseport_add_sock() */ + smp_rmb(); + + hash = migrating_sk->sk_hash; + prog = rcu_dereference(reuse->prog); + if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req)) + goto select_by_hash; + goto failure; + } + + if (!skb) { + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) + goto failure; + allocated = true; + } + + nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash); + + if (allocated) + kfree_skb(skb); + +select_by_hash: + if (!nsk) + nsk = reuseport_select_sock_by_hash(reuse, hash, socks); + + if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) { + nsk = NULL; + goto failure; + } + +out: + rcu_read_unlock(); + return nsk; + +failure: + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + goto out; +} +EXPORT_SYMBOL(reuseport_migrate_sock); + +int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) +{ + struct sock_reuseport *reuse; + struct bpf_prog *old_prog; + + if (sk_unhashed(sk)) { + int err; + + if (!sk->sk_reuseport) + return -EINVAL; + + err = reuseport_alloc(sk, false); + if (err) + return err; + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { + /* The socket wasn't bound with SO_REUSEPORT */ + return -EINVAL; + } + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + old_prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); + rcu_assign_pointer(reuse->prog, prog); + spin_unlock_bh(&reuseport_lock); + + sk_reuseport_prog_free(old_prog); + return 0; +} +EXPORT_SYMBOL(reuseport_attach_prog); + +int reuseport_detach_prog(struct sock *sk) +{ + struct sock_reuseport *reuse; + struct bpf_prog *old_prog; + + old_prog = NULL; + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + + /* reuse must be checked after acquiring the reuseport_lock + * because reuseport_grow() can detach a closed sk. + */ + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return sk->sk_reuseport ? -ENOENT : -EINVAL; + } + + if (sk_unhashed(sk) && reuse->num_closed_socks) { + spin_unlock_bh(&reuseport_lock); + return -ENOENT; + } + + old_prog = rcu_replace_pointer(reuse->prog, old_prog, + lockdep_is_held(&reuseport_lock)); + spin_unlock_bh(&reuseport_lock); + + if (!old_prog) + return -ENOENT; + + sk_reuseport_prog_free(old_prog); + return 0; +} +EXPORT_SYMBOL(reuseport_detach_prog); diff --git a/net/core/stream.c b/net/core/stream.c index f5df85dcd20b..7a37e7dd2c43 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * SUCS NET3: * @@ -13,6 +14,7 @@ */ #include <linux/module.h> +#include <linux/sched/signal.h> #include <linux/net.h> #include <linux/signal.h> #include <linux/tcp.h> @@ -21,29 +23,32 @@ /** * sk_stream_write_space - stream socket write_space callback. - * @sk: socket + * @sk: pointer to the socket structure * - * FIXME: write proper description + * This function is invoked when there's space available in the socket's + * send buffer for writing. It first checks if the socket is writable, + * clears the SOCK_NOSPACE flag indicating that memory for writing + * is now available, wakes up any processes waiting for write operations + * and sends asynchronous notifications if needed. */ void sk_stream_write_space(struct sock *sk) { struct socket *sock = sk->sk_socket; struct socket_wq *wq; - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { + if (__sk_stream_is_writeable(sk, 1) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) - wake_up_interruptible_poll(&wq->wait, POLLOUT | - POLLWRNORM | POLLWRBAND); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, EPOLLOUT | + EPOLLWRNORM | EPOLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } -EXPORT_SYMBOL(sk_stream_write_space); /** * sk_stream_wait_connect - Wait for a socket to get into the connected state @@ -54,8 +59,8 @@ EXPORT_SYMBOL(sk_stream_write_space); */ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) { + DEFINE_WAIT_FUNC(wait, woken_wake_function); struct task_struct *tsk = current; - DEFINE_WAIT(wait); int done; do { @@ -69,16 +74,16 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) if (signal_pending(tsk)) return sock_intr_errno(*timeo_p); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, - !sk->sk_err && - !((1 << sk->sk_state) & - ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); - finish_wait(sk_sleep(sk), &wait); + !READ_ONCE(sk->sk_err) && + !((1 << READ_ONCE(sk->sk_state)) & + ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait); + remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending--; } while (!done); - return 0; + return done < 0 ? done : 0; } EXPORT_SYMBOL(sk_stream_wait_connect); @@ -86,25 +91,25 @@ EXPORT_SYMBOL(sk_stream_wait_connect); * sk_stream_closing - Return 1 if we still have things to send in our buffers. * @sk: socket to verify */ -static inline int sk_stream_closing(struct sock *sk) +static int sk_stream_closing(const struct sock *sk) { - return (1 << sk->sk_state) & + return (1 << READ_ONCE(sk->sk_state)) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); } void sk_stream_wait_close(struct sock *sk, long timeout) { if (timeout) { - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + add_wait_queue(sk_sleep(sk), &wait); do { - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); - if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk))) + if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait)) break; } while (!signal_pending(current) && timeout); - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); } } EXPORT_SYMBOL(sk_stream_wait_close); @@ -116,36 +121,38 @@ EXPORT_SYMBOL(sk_stream_wait_close); */ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) { - int err = 0; + int ret, err = 0; long vm_wait = 0; long current_timeo = *timeo_p; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk_stream_memory_free(sk)) - current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2; + current_timeo = vm_wait = get_random_u32_below(HZ / 5) + 2; - while (1) { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + add_wait_queue(sk_sleep(sk), &wait); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + while (1) { + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; if (!*timeo_p) - goto do_nonblock; + goto do_eagain; if (signal_pending(current)) goto do_interrupted; - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk_stream_memory_free(sk) && !vm_wait) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, ¤t_timeo, sk->sk_err || - (sk->sk_shutdown & SEND_SHUTDOWN) || - (sk_stream_memory_free(sk) && - !vm_wait)); + ret = sk_wait_event(sk, ¤t_timeo, READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || + (sk_stream_memory_free(sk) && !vm_wait), + &wait); sk->sk_write_pending--; + if (ret < 0) + goto do_error; if (vm_wait) { vm_wait -= current_timeo; @@ -158,13 +165,20 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) *timeo_p = current_timeo; } out: - finish_wait(sk_sleep(sk), &wait); + if (!sock_flag(sk, SOCK_DEAD)) + remove_wait_queue(sk_sleep(sk), &wait); return err; do_error: err = -EPIPE; goto out; -do_nonblock: +do_eagain: + /* Make sure that whenever EAGAIN is returned, EPOLLOUT event can + * be generated later. + * When TCP receives ACK packets that make room, tcp_check_space() + * only calls tcp_new_space() if SOCK_NOSPACE is set. + */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; goto out; do_interrupted: @@ -188,17 +202,19 @@ void sk_stream_kill_queues(struct sock *sk) /* First the read buffer. */ __skb_queue_purge(&sk->sk_receive_queue); - /* Next, the error queue. */ - __skb_queue_purge(&sk->sk_error_queue); + /* Next, the error queue. + * We need to use queue lock, because other threads might + * add packets to the queue without socket lock being held. + */ + skb_queue_purge(&sk->sk_error_queue); /* Next, the write queue. */ - WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); + WARN_ON_ONCE(!skb_queue_empty(&sk->sk_write_queue)); /* Account for returned memory. */ - sk_mem_reclaim(sk); + sk_mem_reclaim_final(sk); - WARN_ON(sk->sk_wmem_queued); - WARN_ON(sk->sk_forward_alloc); + WARN_ON_ONCE(sk->sk_wmem_queued); /* It is _impossible_ for the backlog to contain anything * when we get here. All user references to this socket diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 660968616637..8d4decb2606f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* -*- linux-c -*- * sysctl_net_core.c: sysctl interface to net core subsystem. * @@ -5,6 +6,7 @@ * Added /proc/sys/net/core directory entry (empty =) ). [MS] */ +#include <linux/filter.h> #include <linux/mm.h> #include <linux/sysctl.h> #include <linux/module.h> @@ -14,18 +16,126 @@ #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> -#include <linux/kmemleak.h> +#include <linux/sched/isolation.h> #include <net/ip.h> #include <net/sock.h> #include <net/net_ratelimit.h> #include <net/busy_poll.h> +#include <net/pkt_sched.h> +#include <net/hotdata.h> +#include <net/proto_memory.h> +#include <net/rps.h> + +#include "dev.h" +#include "net-sysfs.h" + +static int int_3600 = 3600; +static int min_sndbuf = SOCK_MIN_SNDBUF; +static int min_rcvbuf = SOCK_MIN_RCVBUF; +static int max_skb_frags = MAX_SKB_FRAGS; +static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; +static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ; + +static int net_msg_warn; /* Unused, but still a sysctl */ + +int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; +EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); + +/* 0 - Keep current behavior: + * IPv4: inherit all current settings from init_net + * IPv6: reset all settings to default + * 1 - Both inherit all current settings from init_net + * 2 - Both reset all settings to default + * 3 - Both inherit all settings from current netns + */ +int sysctl_devconf_inherit_init_net __read_mostly; +EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); + +#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS) +static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, + struct cpumask *mask) +{ + char *kbuf; + int len; + + if (*ppos || !*lenp) { + *lenp = 0; + return 0; + } + + /* CPUs are displayed as a hex bitmap + a comma between each groups of 8 + * nibbles (except the last one which has a newline instead). + * Guesstimate the buffer size at the group granularity level. + */ + len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp); + kbuf = kmalloc(len, GFP_KERNEL); + if (!kbuf) { + *lenp = 0; + return -ENOMEM; + } + + len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); + if (!len) { + *lenp = 0; + goto free_buf; + } + + /* scnprintf writes a trailing null char not counted in the returned + * length, override it with a newline. + */ + kbuf[len++] = '\n'; + memcpy(buffer, kbuf, len); + *lenp = len; + *ppos += len; -static int one = 1; +free_buf: + kfree(kbuf); + return 0; +} +#endif #ifdef CONFIG_RPS -static int rps_sock_flow_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + +DEFINE_MUTEX(rps_default_mask_mutex); + +static int rps_default_mask_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = (struct net *)table->data; + struct cpumask *mask; + int err = 0; + + mutex_lock(&rps_default_mask_mutex); + mask = net->core.rps_default_mask; + if (write) { + if (!mask) { + mask = kzalloc(cpumask_size(), GFP_KERNEL); + net->core.rps_default_mask = mask; + } + err = -ENOMEM; + if (!mask) + goto done; + + err = cpumask_parse(buffer, mask); + if (err) + goto done; + + err = rps_cpumask_housekeeping(mask); + if (err) + goto done; + } else { + err = dump_cpumask(buffer, lenp, ppos, + mask ?: cpu_none_mask); + } + +done: + mutex_unlock(&rps_default_mask_mutex); + return err; +} + +static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; @@ -39,7 +149,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_lock(&sock_flow_mutex); - orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + orig_sock_table = rcu_dereference_protected( + net_hotdata.rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; @@ -47,7 +158,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (write) { if (size) { - if (size > 1<<30) { + if (size > 1<<29) { /* Enforce limit to prevent overflow */ mutex_unlock(&sock_flow_mutex); return -EINVAL; @@ -60,7 +171,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_unlock(&sock_flow_mutex); return -ENOMEM; } - + net_hotdata.rps_cpu_mask = + roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; @@ -71,13 +183,16 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, sock_table = NULL; if (sock_table != orig_sock_table) { - rcu_assign_pointer(rps_sock_flow_table, sock_table); - if (sock_table) - static_key_slow_inc(&rps_needed); + rcu_assign_pointer(net_hotdata.rps_sock_flow_table, + sock_table); + if (sock_table) { + static_branch_inc(&rps_needed); + static_branch_inc(&rfs_needed); + } if (orig_sock_table) { - static_key_slow_dec(&rps_needed); - synchronize_rcu(); - vfree(orig_sock_table); + static_branch_dec(&rps_needed); + static_branch_dec(&rfs_needed); + kvfree_rcu(orig_sock_table, rcu); } } } @@ -91,9 +206,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_FLOW_LIMIT static DEFINE_MUTEX(flow_limit_update_mutex); -static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; @@ -104,7 +218,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, return -ENOMEM; if (write) { - ret = cpumask_parse_user(buffer, *lenp, mask); + ret = cpumask_parse(buffer, mask); if (ret) goto done; @@ -116,29 +230,22 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - synchronize_rcu(); - kfree(cur); + kfree_rcu(cur, rcu); } else if (!cur && cpumask_test_cpu(i, mask)) { - cur = kzalloc(len, GFP_KERNEL); + cur = kzalloc_node(len, GFP_KERNEL, + cpu_to_node(i)); if (!cur) { /* not unwinding previous changes */ ret = -ENOMEM; goto write_unlock; } - cur->num_buckets = netdev_flow_limit_table_len; + cur->log_buckets = ilog2(netdev_flow_limit_table_len); rcu_assign_pointer(sd->flow_limit, cur); } } write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { - char kbuf[128]; - - if (*ppos || !*lenp) { - *lenp = 0; - goto done; - } - cpumask_clear(mask); rcu_read_lock(); for_each_possible_cpu(i) { @@ -148,20 +255,7 @@ write_unlock: } rcu_read_unlock(); - len = min(sizeof(kbuf) - 1, *lenp); - len = cpumask_scnprintf(kbuf, len, mask); - if (!len) { - *lenp = 0; - goto done; - } - if (len < *lenp) - kbuf[len++] = '\n'; - if (copy_to_user(buffer, kbuf, len)) { - ret = -EFAULT; - goto done; - } - *lenp = len; - *ppos += len; + ret = dump_cpumask(buffer, lenp, ppos, mask); } done: @@ -169,9 +263,8 @@ done: return ret; } -static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; @@ -191,53 +284,157 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, } #endif /* CONFIG_NET_FLOW_LIMIT */ +#ifdef CONFIG_NET_SCHED +static int set_default_qdisc(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + char id[IFNAMSIZ]; + struct ctl_table tbl = { + .data = id, + .maxlen = IFNAMSIZ, + }; + int ret; + + qdisc_get_default(id, IFNAMSIZ); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = qdisc_set_default(id); + return ret; +} +#endif + +static int proc_do_dev_weight(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + static DEFINE_MUTEX(dev_weight_mutex); + int ret, weight; + + mutex_lock(&dev_weight_mutex); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) { + weight = READ_ONCE(weight_p); + WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); + } + mutex_unlock(&dev_weight_mutex); + + return ret; +} + +static int proc_do_rss_key(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table fake_table; + char buf[NETDEV_RSS_KEY_LEN * 3]; + + snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); + fake_table.data = buf; + fake_table.maxlen = sizeof(buf); + return proc_dostring(&fake_table, write, buffer, lenp, ppos); +} + +#ifdef CONFIG_BPF_JIT +static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, jit_enable = *(int *)table->data; + int min = *(int *)table->extra1; + int max = *(int *)table->extra2; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &jit_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (jit_enable < 2 || + (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) { + *(int *)table->data = jit_enable; + if (jit_enable == 2) + pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); + } else { + ret = -EPERM; + } + } + + if (write && ret && min == max) + pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n"); + + return ret; +} + +# ifdef CONFIG_HAVE_EBPF_JIT +static int +proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +# endif /* CONFIG_HAVE_EBPF_JIT */ + +static int +proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + static struct ctl_table net_core_table[] = { -#ifdef CONFIG_NET { - .procname = "wmem_max", - .data = &sysctl_wmem_max, + .procname = "mem_pcpu_rsv", + .data = &net_hotdata.sysctl_mem_pcpu_rsv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_mem_pcpu_rsv, }, { - .procname = "rmem_max", - .data = &sysctl_rmem_max, + .procname = "dev_weight", + .data = &weight_p, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { - .procname = "wmem_default", - .data = &sysctl_wmem_default, + .procname = "dev_weight_rx_bias", + .data = &dev_weight_rx_bias, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { - .procname = "rmem_default", - .data = &sysctl_rmem_default, + .procname = "dev_weight_tx_bias", + .data = &dev_weight_tx_bias, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { - .procname = "dev_weight", - .data = &weight_p, + .procname = "netdev_max_backlog", + .data = &net_hotdata.max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { - .procname = "netdev_max_backlog", - .data = &netdev_max_backlog, + .procname = "netdev_rss_key", + .data = &netdev_rss_key, .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec + .mode = 0444, + .proc_handler = proc_do_rss_key, }, #ifdef CONFIG_BPF_JIT { @@ -245,12 +442,48 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax_bpf_enable, +# ifdef CONFIG_BPF_JIT_ALWAYS_ON + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, +# else + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, +# endif + }, +# ifdef CONFIG_HAVE_EBPF_JIT + { + .procname = "bpf_jit_harden", + .data = &bpf_jit_harden, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "bpf_jit_kallsyms", + .data = &bpf_jit_kallsyms, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +# endif + { + .procname = "bpf_jit_limit", + .data = &bpf_jit_limit, + .maxlen = sizeof(long), + .mode = 0600, + .proc_handler = proc_dolongvec_minmax_bpf_restricted, + .extra1 = SYSCTL_LONG_ONE, + .extra2 = &bpf_jit_limit_max, }, #endif { .procname = "netdev_tstamp_prequeue", - .data = &netdev_tstamp_prequeue, + .data = &net_hotdata.tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -269,13 +502,6 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "optmem_max", - .data = &sysctl_optmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", @@ -298,27 +524,35 @@ static struct ctl_table net_core_table[] = { .proc_handler = flow_limit_table_len_sysctl }, #endif /* CONFIG_NET_FLOW_LIMIT */ -#ifdef CONFIG_NET_LL_RX_POLL +#ifdef CONFIG_NET_RX_BUSY_POLL { .procname = "busy_poll", .data = &sysctl_net_busy_poll, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, }, { .procname = "busy_read", .data = &sysctl_net_busy_read, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#endif +#ifdef CONFIG_NET_SCHED + { + .procname = "default_qdisc", + .mode = 0644, + .maxlen = IFNAMSIZ, + .proc_handler = set_default_qdisc }, -# #endif -#endif /* CONFIG_NET */ { .procname = "netdev_budget", - .data = &netdev_budget, + .data = &net_hotdata.netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -330,41 +564,207 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { } + { + .procname = "max_skb_frags", + .data = &net_hotdata.sysctl_max_skb_frags, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &max_skb_frags, + }, + { + .procname = "netdev_budget_usecs", + .data = &net_hotdata.netdev_budget_usecs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &netdev_budget_usecs_min, + }, + { + .procname = "fb_tunnels_only_for_init_net", + .data = &sysctl_fb_tunnels_only_for_init_net, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "devconf_inherit_init_net", + .data = &sysctl_devconf_inherit_init_net, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { + .procname = "high_order_alloc_disable", + .data = &net_high_order_alloc_disable_key.key, + .maxlen = sizeof(net_high_order_alloc_disable_key), + .mode = 0644, + .proc_handler = proc_do_static_key, + }, + { + .procname = "gro_normal_batch", + .data = &net_hotdata.gro_normal_batch, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "netdev_unregister_timeout_secs", + .data = &netdev_unregister_timeout_secs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &int_3600, + }, + { + .procname = "skb_defer_max", + .data = &net_hotdata.sysctl_skb_defer_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, }; static struct ctl_table netns_core_table[] = { +#if IS_ENABLED(CONFIG_RPS) + { + .procname = "rps_default_mask", + .data = &init_net, + .mode = 0644, + .proc_handler = rps_default_mask_sysctl + }, +#endif { .procname = "somaxconn", .data = &init_net.core.sysctl_somaxconn, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .extra1 = SYSCTL_ZERO, + .proc_handler = proc_dointvec_minmax + }, + { + .procname = "optmem_max", + .data = &init_net.core.sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .proc_handler = proc_dointvec_minmax + }, + { + .procname = "txrehash", + .data = &init_net.core.sysctl_txrehash, + .maxlen = sizeof(u8), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + .proc_handler = proc_dou8vec_minmax, + }, + { + .procname = "txq_reselection_ms", + .data = &init_net.core.sysctl_txq_reselection, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "tstamp_allow_data", + .data = &init_net.core.sysctl_tstamp_allow_data, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + { + .procname = "bypass_prot_mem", + .data = &init_net.core.sysctl_bypass_prot_mem, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + /* sysctl_core_net_init() will set the values after this + * to readonly in network namespaces + */ + { + .procname = "wmem_max", + .data = &sysctl_wmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_max", + .data = &sysctl_rmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, + { + .procname = "wmem_default", + .data = &sysctl_wmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_default", + .data = &sysctl_rmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, }, - { } }; +static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) +{ + /* fallback tunnels for initns only */ + if (!strncmp(str, "initns", 6)) + sysctl_fb_tunnels_only_for_init_net = 1; + /* no fallback tunnels anywhere */ + else if (!strncmp(str, "none", 4)) + sysctl_fb_tunnels_only_for_init_net = 2; + + return 1; +} +__setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); + static __net_init int sysctl_core_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(netns_core_table); struct ctl_table *tbl; - net->core.sysctl_somaxconn = SOMAXCONN; - tbl = netns_core_table; if (!net_eq(net, &init_net)) { + int i; tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; - tbl[0].data = &net->core.sysctl_somaxconn; + for (i = 0; i < table_size; ++i) { + if (tbl[i].data == &sysctl_wmem_max) + break; - /* Don't export any sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - tbl[0].procname = NULL; + tbl[i].data += (char *)net - (char *)&init_net; } + for (; i < table_size; ++i) + tbl[i].mode &= ~0222; } - net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); + net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size); if (net->core.sysctl_hdr == NULL) goto err_reg; @@ -379,11 +779,14 @@ err_dup: static __net_exit void sysctl_core_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); BUG_ON(tbl == netns_core_table); +#if IS_ENABLED(CONFIG_RPS) + kfree(net->core.rps_default_mask); +#endif kfree(tbl); } diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 661b5a40ec10..a50a7ef49ae8 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -1,139 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PTP 1588 clock support - support for timestamping in PHY devices * * Copyright (C) 2010 OMICRON electronics GmbH - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/errqueue.h> #include <linux/phy.h> #include <linux/ptp_classify.h> #include <linux/skbuff.h> #include <linux/export.h> - -static struct sock_filter ptp_filter[] = { - PTP_FILTER -}; +#include <linux/ptp_clock_kernel.h> static unsigned int classify(const struct sk_buff *skb) { - if (likely(skb->dev && - skb->dev->phydev && - skb->dev->phydev->drv)) - return sk_run_filter(skb, ptp_filter); + if (likely(skb->dev && skb->dev->phydev && + skb->dev->phydev->mii_ts)) + return ptp_classify_raw(skb); else return PTP_CLASS_NONE; } void skb_clone_tx_timestamp(struct sk_buff *skb) { + struct hwtstamp_provider *hwprov; + struct mii_timestamper *mii_ts; struct phy_device *phydev; struct sk_buff *clone; - struct sock *sk = skb->sk; unsigned int type; - if (!sk) + if (!skb->sk || !skb->dev) return; - type = classify(skb); + rcu_read_lock(); + hwprov = rcu_dereference(skb->dev->hwprov); + if (hwprov) { + if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB || + !hwprov->phydev) { + rcu_read_unlock(); + return; + } - switch (type) { - case PTP_CLASS_V1_IPV4: - case PTP_CLASS_V1_IPV6: - case PTP_CLASS_V2_IPV4: - case PTP_CLASS_V2_IPV6: - case PTP_CLASS_V2_L2: - case PTP_CLASS_V2_VLAN: + phydev = hwprov->phydev; + } else { phydev = skb->dev->phydev; - if (likely(phydev->drv->txtstamp)) { - if (!atomic_inc_not_zero(&sk->sk_refcnt)) - return; - clone = skb_clone(skb, GFP_ATOMIC); - if (!clone) { - sock_put(sk); - return; - } - clone->sk = sk; - phydev->drv->txtstamp(phydev, clone, type); + if (!phy_is_default_hwtstamp(phydev)) { + rcu_read_unlock(); + return; } - break; - default: - break; } -} -EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); + rcu_read_unlock(); -void skb_complete_tx_timestamp(struct sk_buff *skb, - struct skb_shared_hwtstamps *hwtstamps) -{ - struct sock *sk = skb->sk; - struct sock_exterr_skb *serr; - int err; - - if (!hwtstamps) { - sock_put(sk); - kfree_skb(skb); + type = classify(skb); + if (type == PTP_CLASS_NONE) return; - } - *skb_hwtstamps(skb) = *hwtstamps; - serr = SKB_EXT_ERR(skb); - memset(serr, 0, sizeof(*serr)); - serr->ee.ee_errno = ENOMSG; - serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; - skb->sk = NULL; - err = sock_queue_err_skb(sk, skb); - sock_put(sk); - if (err) - kfree_skb(skb); + mii_ts = phydev->mii_ts; + if (likely(mii_ts->txtstamp)) { + clone = skb_clone_sk(skb); + if (!clone) + return; + mii_ts->txtstamp(mii_ts, clone, type); + } } -EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); +EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); bool skb_defer_rx_timestamp(struct sk_buff *skb) { + struct hwtstamp_provider *hwprov; + struct mii_timestamper *mii_ts; struct phy_device *phydev; unsigned int type; + if (!skb->dev) + return false; + + rcu_read_lock(); + hwprov = rcu_dereference(skb->dev->hwprov); + if (hwprov) { + if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB || + !hwprov->phydev) { + rcu_read_unlock(); + return false; + } + + phydev = hwprov->phydev; + } else { + phydev = skb->dev->phydev; + if (!phy_is_default_hwtstamp(phydev)) { + rcu_read_unlock(); + return false; + } + } + rcu_read_unlock(); + if (skb_headroom(skb) < ETH_HLEN) return false; + __skb_push(skb, ETH_HLEN); - type = classify(skb); + type = ptp_classify_raw(skb); __skb_pull(skb, ETH_HLEN); - switch (type) { - case PTP_CLASS_V1_IPV4: - case PTP_CLASS_V1_IPV6: - case PTP_CLASS_V2_IPV4: - case PTP_CLASS_V2_IPV6: - case PTP_CLASS_V2_L2: - case PTP_CLASS_V2_VLAN: - phydev = skb->dev->phydev; - if (likely(phydev->drv->rxtstamp)) - return phydev->drv->rxtstamp(phydev, skb, type); - break; - default: - break; - } + if (type == PTP_CLASS_NONE) + return false; + + mii_ts = phydev->mii_ts; + if (likely(mii_ts->rxtstamp)) + return mii_ts->rxtstamp(mii_ts, skb, type); return false; } EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp); - -void __init skb_timestamping_init(void) -{ - BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter))); -} diff --git a/net/core/tso.c b/net/core/tso.c new file mode 100644 index 000000000000..6df997b9076e --- /dev/null +++ b/net/core/tso.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/export.h> +#include <linux/if_vlan.h> +#include <net/ip.h> +#include <net/tso.h> +#include <linux/unaligned.h> + +void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, + int size, bool is_last) +{ + int hdr_len = skb_transport_offset(skb) + tso->tlen; + int mac_hdr_len = skb_network_offset(skb); + + memcpy(hdr, skb->data, hdr_len); + if (!tso->ipv6) { + struct iphdr *iph = (void *)(hdr + mac_hdr_len); + + iph->id = htons(tso->ip_id); + iph->tot_len = htons(size + hdr_len - mac_hdr_len); + tso->ip_id++; + } else { + struct ipv6hdr *iph = (void *)(hdr + mac_hdr_len); + + iph->payload_len = htons(size + tso->tlen); + } + hdr += skb_transport_offset(skb); + if (tso->tlen != sizeof(struct udphdr)) { + struct tcphdr *tcph = (struct tcphdr *)hdr; + + put_unaligned_be32(tso->tcp_seq, &tcph->seq); + + if (!is_last) { + /* Clear all special flags for not last packet */ + tcph->psh = 0; + tcph->fin = 0; + tcph->rst = 0; + } + } else { + struct udphdr *uh = (struct udphdr *)hdr; + + uh->len = htons(sizeof(*uh) + size); + } +} +EXPORT_SYMBOL(tso_build_hdr); + +void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size) +{ + tso->tcp_seq += size; /* not worth avoiding this operation for UDP */ + tso->size -= size; + tso->data += size; + + if ((tso->size == 0) && + (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + + /* Move to next segment */ + tso->size = skb_frag_size(frag); + tso->data = skb_frag_address(frag); + tso->next_frag_idx++; + } +} +EXPORT_SYMBOL(tso_build_data); + +int tso_start(struct sk_buff *skb, struct tso_t *tso) +{ + int tlen = skb_is_gso_tcp(skb) ? tcp_hdrlen(skb) : sizeof(struct udphdr); + int hdr_len = skb_transport_offset(skb) + tlen; + + tso->tlen = tlen; + tso->ip_id = ntohs(ip_hdr(skb)->id); + tso->tcp_seq = (tlen != sizeof(struct udphdr)) ? ntohl(tcp_hdr(skb)->seq) : 0; + tso->next_frag_idx = 0; + tso->ipv6 = vlan_get_protocol(skb) == htons(ETH_P_IPV6); + + /* Build first data */ + tso->size = skb_headlen(skb) - hdr_len; + tso->data = skb->data + hdr_len; + if ((tso->size == 0) && + (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + + /* Move to next segment */ + tso->size = skb_frag_size(frag); + tso->data = skb_frag_address(frag); + tso->next_frag_idx++; + } + return hdr_len; +} +EXPORT_SYMBOL(tso_start); diff --git a/net/core/user_dma.c b/net/core/user_dma.c deleted file mode 100644 index 1b5fefdb8198..000000000000 --- a/net/core/user_dma.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * Portions based on net/core/datagram.c and copyrighted by their authors. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ - -/* - * This code allows the net stack to make use of a DMA engine for - * skb to iovec copies. - */ - -#include <linux/dmaengine.h> -#include <linux/socket.h> -#include <linux/export.h> -#include <net/tcp.h> -#include <net/netdma.h> - -#define NET_DMA_DEFAULT_COPYBREAK 4096 - -int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; -EXPORT_SYMBOL(sysctl_tcp_dma_copybreak); - -/** - * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. - * @skb - buffer to copy - * @offset - offset in the buffer to start copying from - * @iovec - io vector to copy to - * @len - amount of data to copy from buffer to iovec - * @pinned_list - locked iovec buffer data - * - * Note: the iovec is modified during the copy. - */ -int dma_skb_copy_datagram_iovec(struct dma_chan *chan, - struct sk_buff *skb, int offset, struct iovec *to, - size_t len, struct dma_pinned_list *pinned_list) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - dma_cookie_t cookie = 0; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_memcpy_to_iovec(chan, to, pinned_list, - skb->data + offset, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - copy = end - offset; - if (copy > 0) { - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - - cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, - frag->page_offset + offset - start, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - copy = end - offset; - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_skb_copy_datagram_iovec(chan, frag_iter, - offset - start, - to, copy, - pinned_list); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - -end: - if (!len) { - skb->dma_cookie = cookie; - return cookie; - } - -fault: - return -EFAULT; -} diff --git a/net/core/utils.c b/net/core/utils.c index aa88e23fc87a..5e63b0ea21f3 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Generic address resultion entity + * Generic address resolution entity * * Authors: * net_random Alan Cox @@ -7,11 +8,6 @@ * in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project * * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> @@ -26,15 +22,14 @@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/ratelimit.h> +#include <linux/socket.h> #include <net/sock.h> #include <net/net_ratelimit.h> +#include <net/ipv6.h> #include <asm/byteorder.h> -#include <asm/uaccess.h> - -int net_msg_warn __read_mostly = 1; -EXPORT_SYMBOL(net_msg_warn); +#include <linux/uaccess.h> DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10); /* @@ -54,7 +49,7 @@ EXPORT_SYMBOL(net_ratelimit); __be32 in_aton(const char *str) { - unsigned long l; + unsigned int l; unsigned int val; int i; @@ -136,7 +131,7 @@ int in4_pton(const char *src, int srclen, s = src; d = dbuf; i = 0; - while(1) { + while (1) { int c; c = xdigit2bin(srclen > 0 ? *s : '\0', delim); if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) { @@ -286,11 +281,11 @@ cont: i = 15; d--; if (dc) { - while(d >= dc) + while (d >= dc) dst[i--] = *d--; - while(i >= dc - dbuf) + while (i >= dc - dbuf) dst[i--] = 0; - while(i >= 0) + while (i >= 0) dst[i--] = *d--; } else memcpy(dst, dbuf, sizeof(dbuf)); @@ -303,25 +298,166 @@ out: } EXPORT_SYMBOL(in6_pton); +static int inet4_pton(const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + size_t srclen = strlen(src); + + if (srclen > INET_ADDRSTRLEN) + return -EINVAL; + + if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr, + '\n', NULL) == 0) + return -EINVAL; + + addr4->sin_family = AF_INET; + addr4->sin_port = htons(port_num); + + return 0; +} + +static int inet6_pton(struct net *net, const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + const char *scope_delim; + size_t srclen = strlen(src); + + if (srclen > INET6_ADDRSTRLEN) + return -EINVAL; + + if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr, + '%', &scope_delim) == 0) + return -EINVAL; + + if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL && + src + srclen != scope_delim && *scope_delim == '%') { + struct net_device *dev; + char scope_id[16]; + size_t scope_len = min_t(size_t, sizeof(scope_id) - 1, + src + srclen - scope_delim - 1); + + memcpy(scope_id, scope_delim + 1, scope_len); + scope_id[scope_len] = '\0'; + + dev = dev_get_by_name(net, scope_id); + if (dev) { + addr6->sin6_scope_id = dev->ifindex; + dev_put(dev); + } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) { + return -EINVAL; + } + } + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(port_num); + + return 0; +} + +/** + * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address + * @net: net namespace (used for scope handling) + * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either + * @src: the start of the address string + * @port: the start of the port string (or NULL for none) + * @addr: output socket address + * + * Return zero on success, return errno when any error occurs. + */ +int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, + const char *src, const char *port, struct sockaddr_storage *addr) +{ + u16 port_num; + int ret = -EINVAL; + + if (port) { + if (kstrtou16(port, 0, &port_num)) + return -EINVAL; + } else { + port_num = 0; + } + + switch (af) { + case AF_INET: + ret = inet4_pton(src, port_num, addr); + break; + case AF_INET6: + ret = inet6_pton(net, src, port_num, addr); + break; + case AF_UNSPEC: + ret = inet4_pton(src, port_num, addr); + if (ret) + ret = inet6_pton(net, src, port_num, addr); + break; + default: + pr_err("unexpected address family %d\n", af); + } + + return ret; +} +EXPORT_SYMBOL(inet_pton_with_scope); + +bool inet_addr_is_any(struct sockaddr_storage *addr) +{ + if (addr->ss_family == AF_INET6) { + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr; + const struct sockaddr_in6 in6_any = + { .sin6_addr = IN6ADDR_ANY_INIT }; + + if (!memcmp(in6->sin6_addr.s6_addr, + in6_any.sin6_addr.s6_addr, 16)) + return true; + } else if (addr->ss_family == AF_INET) { + struct sockaddr_in *in = (struct sockaddr_in *)addr; + + if (in->sin_addr.s_addr == htonl(INADDR_ANY)) + return true; + } else { + pr_warn("unexpected address family %u\n", addr->ss_family); + } + + return false; +} +EXPORT_SYMBOL(inet_addr_is_any); + void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, - __be32 from, __be32 to, int pseudohdr) + __be32 from, __be32 to, bool pseudohdr) { - __be32 diff[] = { ~from, to }; if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_partial(diff, sizeof(diff), - ~csum_unfold(*sum))); + csum_replace4(sum, from, to); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_partial(diff, sizeof(diff), - ~skb->csum); + skb->csum = ~csum_add(csum_sub(~(skb->csum), + (__force __wsum)from), + (__force __wsum)to); } else if (pseudohdr) - *sum = ~csum_fold(csum_partial(diff, sizeof(diff), - csum_unfold(*sum))); + *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), + (__force __wsum)from), + (__force __wsum)to)); } EXPORT_SYMBOL(inet_proto_csum_replace4); +/** + * inet_proto_csum_replace16 - update layer 4 header checksum field + * @sum: Layer 4 header checksum field + * @skb: sk_buff for the packet + * @from: old IPv6 address + * @to: new IPv6 address + * @pseudohdr: True if layer 4 header checksum includes pseudoheader + * + * Update layer 4 header as per the update in IPv6 src/dst address. + * + * There is no need to update skb->csum in this function, because update in two + * fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other + * for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to + * update skb->csum, because update in 3 fields a.) IPv4 src/dst address, + * b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as + * L4 Header checksum for skb->csum calculation. + */ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, - int pseudohdr) + bool pseudohdr) { __be32 diff[] = { ~from[0], ~from[1], ~from[2], ~from[3], @@ -330,11 +466,21 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_PARTIAL) { *sum = csum_fold(csum_partial(diff, sizeof(diff), ~csum_unfold(*sum))); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_partial(diff, sizeof(diff), - ~skb->csum); } else if (pseudohdr) *sum = ~csum_fold(csum_partial(diff, sizeof(diff), csum_unfold(*sum))); } EXPORT_SYMBOL(inet_proto_csum_replace16); + +void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, + __wsum diff, bool pseudohdr, bool ipv6) +{ + if (skb->ip_summed != CHECKSUM_PARTIAL) { + csum_replace_by_diff(sum, diff); + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr && !ipv6) + skb->csum = ~csum_sub(diff, skb->csum); + } else if (pseudohdr) { + *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); + } +} +EXPORT_SYMBOL(inet_proto_csum_replace_by_diff); diff --git a/net/core/xdp.c b/net/core/xdp.c new file mode 100644 index 000000000000..9100e160113a --- /dev/null +++ b/net/core/xdp.c @@ -0,0 +1,1055 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* net/core/xdp.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + */ +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> +#include <linux/filter.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/rhashtable.h> +#include <linux/bug.h> +#include <net/page_pool/helpers.h> + +#include <net/hotdata.h> +#include <net/netdev_lock.h> +#include <net/xdp.h> +#include <net/xdp_priv.h> /* struct xdp_mem_allocator */ +#include <trace/events/xdp.h> +#include <net/xdp_sock_drv.h> + +#define REG_STATE_NEW 0x0 +#define REG_STATE_REGISTERED 0x1 +#define REG_STATE_UNREGISTERED 0x2 +#define REG_STATE_UNUSED 0x3 + +static DEFINE_IDA(mem_id_pool); +static DEFINE_MUTEX(mem_id_lock); +#define MEM_ID_MAX 0xFFFE +#define MEM_ID_MIN 1 +static int mem_id_next = MEM_ID_MIN; + +static bool mem_id_init; /* false */ +static struct rhashtable *mem_id_ht; + +static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) +{ + const u32 *k = data; + const u32 key = *k; + + BUILD_BUG_ON(sizeof_field(struct xdp_mem_allocator, mem.id) + != sizeof(u32)); + + /* Use cyclic increasing ID as direct hash key */ + return key; +} + +static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct xdp_mem_allocator *xa = ptr; + u32 mem_id = *(u32 *)arg->key; + + return xa->mem.id != mem_id; +} + +static const struct rhashtable_params mem_id_rht_params = { + .nelem_hint = 64, + .head_offset = offsetof(struct xdp_mem_allocator, node), + .key_offset = offsetof(struct xdp_mem_allocator, mem.id), + .key_len = sizeof_field(struct xdp_mem_allocator, mem.id), + .max_size = MEM_ID_MAX, + .min_size = 8, + .automatic_shrinking = true, + .hashfn = xdp_mem_id_hashfn, + .obj_cmpfn = xdp_mem_id_cmp, +}; + +static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) +{ + struct xdp_mem_allocator *xa; + + xa = container_of(rcu, struct xdp_mem_allocator, rcu); + + /* Allow this ID to be reused */ + ida_free(&mem_id_pool, xa->mem.id); + + kfree(xa); +} + +static void mem_xa_remove(struct xdp_mem_allocator *xa) +{ + trace_mem_disconnect(xa); + + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); +} + +static void mem_allocator_disconnect(void *allocator) +{ + struct xdp_mem_allocator *xa; + struct rhashtable_iter iter; + + mutex_lock(&mem_id_lock); + + rhashtable_walk_enter(mem_id_ht, &iter); + do { + rhashtable_walk_start(&iter); + + while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) { + if (xa->allocator == allocator) + mem_xa_remove(xa); + } + + rhashtable_walk_stop(&iter); + + } while (xa == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); + + mutex_unlock(&mem_id_lock); +} + +void xdp_unreg_mem_model(struct xdp_mem_info *mem) +{ + struct xdp_mem_allocator *xa; + int type = mem->type; + int id = mem->id; + + /* Reset mem info to defaults */ + mem->id = 0; + mem->type = 0; + + if (id == 0) + return; + + if (type == MEM_TYPE_PAGE_POOL) { + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); + page_pool_destroy(xa->page_pool); + } +} +EXPORT_SYMBOL_GPL(xdp_unreg_mem_model); + +void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return; + } + + xdp_unreg_mem_model(&xdp_rxq->mem); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); + +void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) +{ + /* Simplify driver cleanup code paths, allow unreg "unused" */ + if (xdp_rxq->reg_state == REG_STATE_UNUSED) + return; + + xdp_rxq_info_unreg_mem_model(xdp_rxq); + + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; + xdp_rxq->dev = NULL; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); + +static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) +{ + memset(xdp_rxq, 0, sizeof(*xdp_rxq)); +} + +/* Returns 0 on success, negative on failure */ +int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index, + unsigned int napi_id, u32 frag_size) +{ + if (!dev) { + WARN(1, "Missing net_device from driver"); + return -ENODEV; + } + + if (xdp_rxq->reg_state == REG_STATE_UNUSED) { + WARN(1, "Driver promised not to register this"); + return -EINVAL; + } + + if (xdp_rxq->reg_state == REG_STATE_REGISTERED) { + WARN(1, "Missing unregister, handled but fix driver"); + xdp_rxq_info_unreg(xdp_rxq); + } + + /* State either UNREGISTERED or NEW */ + xdp_rxq_info_init(xdp_rxq); + xdp_rxq->dev = dev; + xdp_rxq->queue_index = queue_index; + xdp_rxq->frag_size = frag_size; + + xdp_rxq->reg_state = REG_STATE_REGISTERED; + return 0; +} +EXPORT_SYMBOL_GPL(__xdp_rxq_info_reg); + +void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->reg_state = REG_STATE_UNUSED; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unused); + +bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) +{ + return (xdp_rxq->reg_state == REG_STATE_REGISTERED); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); + +static int __mem_id_init_hash_table(void) +{ + struct rhashtable *rht; + int ret; + + if (unlikely(mem_id_init)) + return 0; + + rht = kzalloc(sizeof(*rht), GFP_KERNEL); + if (!rht) + return -ENOMEM; + + ret = rhashtable_init(rht, &mem_id_rht_params); + if (ret < 0) { + kfree(rht); + return ret; + } + mem_id_ht = rht; + smp_mb(); /* mutex lock should provide enough pairing */ + mem_id_init = true; + + return 0; +} + +/* Allocate a cyclic ID that maps to allocator pointer. + * See: https://www.kernel.org/doc/html/latest/core-api/idr.html + * + * Caller must lock mem_id_lock. + */ +static int __mem_id_cyclic_get(gfp_t gfp) +{ + int retries = 1; + int id; + +again: + id = ida_alloc_range(&mem_id_pool, mem_id_next, MEM_ID_MAX - 1, gfp); + if (id < 0) { + if (id == -ENOSPC) { + /* Cyclic allocator, reset next id */ + if (retries--) { + mem_id_next = MEM_ID_MIN; + goto again; + } + } + return id; /* errno */ + } + mem_id_next = id + 1; + + return id; +} + +static bool __is_supported_mem_type(enum xdp_mem_type type) +{ + if (type == MEM_TYPE_PAGE_POOL) + return is_page_pool_compiled_in(); + + if (type >= MEM_TYPE_MAX) + return false; + + return true; +} + +static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, + void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + gfp_t gfp = GFP_KERNEL; + int id, errno, ret; + void *ptr; + + if (!__is_supported_mem_type(type)) + return ERR_PTR(-EOPNOTSUPP); + + mem->type = type; + + if (!allocator) { + if (type == MEM_TYPE_PAGE_POOL) + return ERR_PTR(-EINVAL); /* Setup time check page_pool req */ + return NULL; + } + + /* Delay init of rhashtable to save memory if feature isn't used */ + if (!mem_id_init) { + mutex_lock(&mem_id_lock); + ret = __mem_id_init_hash_table(); + mutex_unlock(&mem_id_lock); + if (ret < 0) + return ERR_PTR(ret); + } + + xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); + if (!xdp_alloc) + return ERR_PTR(-ENOMEM); + + mutex_lock(&mem_id_lock); + id = __mem_id_cyclic_get(gfp); + if (id < 0) { + errno = id; + goto err; + } + mem->id = id; + xdp_alloc->mem = *mem; + xdp_alloc->allocator = allocator; + + /* Insert allocator into ID lookup table */ + ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); + if (IS_ERR(ptr)) { + ida_free(&mem_id_pool, mem->id); + mem->id = 0; + errno = PTR_ERR(ptr); + goto err; + } + + if (type == MEM_TYPE_PAGE_POOL) + page_pool_use_xdp_mem(allocator, mem_allocator_disconnect, mem); + + mutex_unlock(&mem_id_lock); + + return xdp_alloc; +err: + mutex_unlock(&mem_id_lock); + kfree(xdp_alloc); + return ERR_PTR(errno); +} + +int xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + + xdp_alloc = __xdp_reg_mem_model(mem, type, allocator); + if (IS_ERR(xdp_alloc)) + return PTR_ERR(xdp_alloc); + return 0; +} +EXPORT_SYMBOL_GPL(xdp_reg_mem_model); + +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return -EFAULT; + } + + xdp_alloc = __xdp_reg_mem_model(&xdp_rxq->mem, type, allocator); + if (IS_ERR(xdp_alloc)) + return PTR_ERR(xdp_alloc); + + if (type == MEM_TYPE_XSK_BUFF_POOL && allocator) + xsk_pool_set_rxq_info(allocator, xdp_rxq); + + if (trace_mem_connect_enabled() && xdp_alloc) + trace_mem_connect(xdp_alloc, xdp_rxq); + return 0; +} + +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); + +/** + * xdp_reg_page_pool - register &page_pool as a memory provider for XDP + * @pool: &page_pool to register + * + * Can be used to register pools manually without connecting to any XDP RxQ + * info, so that the XDP layer will be aware of them. Then, they can be + * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool(). + * + * Return: %0 on success, -errno on error. + */ +int xdp_reg_page_pool(struct page_pool *pool) +{ + struct xdp_mem_info mem; + + return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool); +} +EXPORT_SYMBOL_GPL(xdp_reg_page_pool); + +/** + * xdp_unreg_page_pool - unregister &page_pool from the memory providers list + * @pool: &page_pool to unregister + * + * A shorthand for manual unregistering page pools. If the pool was previously + * attached to an RxQ info, it must be detached first. + */ +void xdp_unreg_page_pool(const struct page_pool *pool) +{ + struct xdp_mem_info mem = { + .type = MEM_TYPE_PAGE_POOL, + .id = pool->xdp_mem_id, + }; + + xdp_unreg_mem_model(&mem); +} +EXPORT_SYMBOL_GPL(xdp_unreg_page_pool); + +/** + * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info + * @xdp_rxq: XDP RxQ info to attach the pool to + * @pool: pool to attach + * + * If the pool was registered manually, this function must be called instead + * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info. + */ +void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, + const struct page_pool *pool) +{ + struct xdp_mem_info mem = { + .type = MEM_TYPE_PAGE_POOL, + .id = pool->xdp_mem_id, + }; + + xdp_rxq_info_attach_mem_model(xdp_rxq, &mem); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool); + +/* XDP RX runs under NAPI protection, and in different delivery error + * scenarios (e.g. queue full), it is possible to return the xdp_frame + * while still leveraging this protection. The @napi_direct boolean + * is used for those calls sites. Thus, allowing for faster recycling + * of xdp_frames/pages in those cases. + */ +void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, + bool napi_direct, struct xdp_buff *xdp) +{ + switch (mem_type) { + case MEM_TYPE_PAGE_POOL: + netmem = netmem_compound_head(netmem); + if (napi_direct && xdp_return_frame_no_direct()) + napi_direct = false; + /* No need to check netmem_is_pp() as mem->type knows this a + * page_pool page + */ + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, + napi_direct); + break; + case MEM_TYPE_PAGE_SHARED: + page_frag_free(__netmem_address(netmem)); + break; + case MEM_TYPE_PAGE_ORDER0: + put_page(__netmem_to_page(netmem)); + break; + case MEM_TYPE_XSK_BUFF_POOL: + /* NB! Only valid from an xdp_buff! */ + xsk_buff_free(xdp); + break; + default: + /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + WARN(1, "Incorrect XDP memory type (%d) usage", mem_type); + break; + } +} + +void xdp_return_frame(struct xdp_frame *xdpf) +{ + struct skb_shared_info *sinfo; + + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, + false, NULL); + +out: + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL); +} +EXPORT_SYMBOL_GPL(xdp_return_frame); + +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) +{ + struct skb_shared_info *sinfo; + + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, + true, NULL); + +out: + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL); +} +EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); + +/* XDP bulk APIs introduce a defer/flush mechanism to return + * pages belonging to the same xdp_mem_allocator object + * (identified via the mem.id field) in bulk to optimize + * I-cache and D-cache. + * The bulk queue size is set to 16 to be aligned to how + * XDP_REDIRECT bulking works. The bulk is flushed when + * it is full or when mem.id changes. + * xdp_frame_bulk is usually stored/allocated on the function + * call-stack to avoid locking penalties. + */ + +/* Must be called with rcu_read_lock held */ +void xdp_return_frame_bulk(struct xdp_frame *xdpf, + struct xdp_frame_bulk *bq) +{ + if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) { + xdp_return_frame(xdpf); + return; + } + + if (bq->count == XDP_BULK_QUEUE_SIZE) + xdp_flush_frame_bulk(bq); + + if (unlikely(xdp_frame_has_frags(xdpf))) { + struct skb_shared_info *sinfo; + int i; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (i = 0; i < sinfo->nr_frags; i++) { + skb_frag_t *frag = &sinfo->frags[i]; + + bq->q[bq->count++] = skb_frag_netmem(frag); + if (bq->count == XDP_BULK_QUEUE_SIZE) + xdp_flush_frame_bulk(bq); + } + } + bq->q[bq->count++] = virt_to_netmem(xdpf->data); +} +EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); + +/** + * xdp_return_frag -- free one XDP frag or decrement its refcount + * @netmem: network memory reference to release + * @xdp: &xdp_buff to release the frag for + */ +void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp) +{ + __xdp_return(netmem, xdp->rxq->mem.type, true, NULL); +} +EXPORT_SYMBOL_GPL(xdp_return_frag); + +void xdp_return_buff(struct xdp_buff *xdp) +{ + struct skb_shared_info *sinfo; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + sinfo = xdp_get_shared_info_from_buff(xdp); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), + xdp->rxq->mem.type, true, xdp); + +out: + __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp); +} +EXPORT_SYMBOL_GPL(xdp_return_buff); + +void xdp_attachment_setup(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + if (info->prog) + bpf_prog_put(info->prog); + info->prog = bpf->prog; + info->flags = bpf->flags; +} +EXPORT_SYMBOL_GPL(xdp_attachment_setup); + +struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) +{ + unsigned int metasize, totsize; + void *addr, *data_to_copy; + struct xdp_frame *xdpf; + struct page *page; + + /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */ + metasize = xdp_data_meta_unsupported(xdp) ? 0 : + xdp->data - xdp->data_meta; + totsize = xdp->data_end - xdp->data + metasize; + + if (sizeof(*xdpf) + totsize > PAGE_SIZE) + return NULL; + + page = dev_alloc_page(); + if (!page) + return NULL; + + addr = page_to_virt(page); + xdpf = addr; + memset(xdpf, 0, sizeof(*xdpf)); + + addr += sizeof(*xdpf); + data_to_copy = metasize ? xdp->data_meta : xdp->data; + memcpy(addr, data_to_copy, totsize); + + xdpf->data = addr + metasize; + xdpf->len = totsize - metasize; + xdpf->headroom = 0; + xdpf->metasize = metasize; + xdpf->frame_sz = PAGE_SIZE; + xdpf->mem_type = MEM_TYPE_PAGE_ORDER0; + + xsk_buff_free(xdp); + return xdpf; +} +EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); + +/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line) +{ + WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); +}; +EXPORT_SYMBOL_GPL(xdp_warn); + +/** + * xdp_build_skb_from_buff - create an skb from &xdp_buff + * @xdp: &xdp_buff to convert to an skb + * + * Perform common operations to create a new skb to pass up the stack from + * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize + * skb data pointers and offsets, set the recycle bit if the buff is + * PP-backed, Rx queue index, protocol and update frags info. + * + * Return: new &sk_buff on success, %NULL on error. + */ +struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) +{ + const struct xdp_rxq_info *rxq = xdp->rxq; + const struct skb_shared_info *sinfo; + struct sk_buff *skb; + u32 nr_frags = 0; + int metalen; + + if (unlikely(xdp_buff_has_frags(xdp))) { + sinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = sinfo->nr_frags; + } + + skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, xdp->data - xdp->data_hard_start); + __skb_put(skb, xdp->data_end - xdp->data); + + metalen = xdp->data - xdp->data_meta; + if (metalen > 0) + skb_metadata_set(skb, metalen); + + if (rxq->mem.type == MEM_TYPE_PAGE_POOL) + skb_mark_for_recycle(skb); + + skb_record_rx_queue(skb, rxq->queue_index); + + if (unlikely(nr_frags)) { + u32 tsize; + + tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz; + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + tsize, xdp_buff_get_skb_flags(xdp)); + } + + skb->protocol = eth_type_trans(skb, rxq->dev); + + return skb; +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff); + +/** + * xdp_copy_frags_from_zc - copy frags from XSk buff to skb + * @skb: skb to copy frags to + * @xdp: XSk &xdp_buff from which the frags will be copied + * @pp: &page_pool backing page allocation, if available + * + * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack. + * Allocate a new buffer for each frag, copy it and attach to the skb. + * + * Return: true on success, false on netmem allocation fail. + */ +static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, + const struct xdp_buff *xdp, + struct page_pool *pp) +{ + struct skb_shared_info *sinfo = skb_shinfo(skb); + const struct skb_shared_info *xinfo; + u32 nr_frags, tsize = 0; + u32 flags = 0; + + xinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = xinfo->nr_frags; + + for (u32 i = 0; i < nr_frags; i++) { + const skb_frag_t *frag = &xinfo->frags[i]; + u32 len = skb_frag_size(frag); + u32 offset, truesize = len; + struct page *page; + + page = page_pool_dev_alloc(pp, &offset, &truesize); + if (unlikely(!page)) { + sinfo->nr_frags = i; + return false; + } + + memcpy(page_address(page) + offset, skb_frag_address(frag), + LARGEST_ALIGN(len)); + __skb_fill_page_desc_noacc(sinfo, i, page, offset, len); + + tsize += truesize; + if (page_is_pfmemalloc(page)) + flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; + } + + xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize, + flags); + + return true; +} + +/** + * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff + * @xdp: source XSk buff + * + * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb + * head, new buffer for the head, copy the data and initialize the skb fields. + * If there are frags, allocate new buffers for them and copy. + * Buffers are allocated from the system percpu pools to try recycling them. + * If new skb was built successfully, @xdp is returned to XSk pool's freelist. + * On error, it remains untouched and the caller must take care of this. + * + * Return: new &sk_buff on success, %NULL on error. + */ +struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) +{ + const struct xdp_rxq_info *rxq = xdp->rxq; + u32 len = xdp->data_end - xdp->data_meta; + u32 truesize = xdp->frame_sz; + struct sk_buff *skb = NULL; + struct page_pool *pp; + int metalen; + void *data; + + if (!IS_ENABLED(CONFIG_PAGE_POOL)) + return NULL; + + local_lock_nested_bh(&system_page_pool.bh_lock); + pp = this_cpu_read(system_page_pool.pool); + data = page_pool_dev_alloc_va(pp, &truesize); + if (unlikely(!data)) + goto out; + + skb = napi_build_skb(data, truesize); + if (unlikely(!skb)) { + page_pool_free_va(pp, data, true); + goto out; + } + + skb_mark_for_recycle(skb); + skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); + + memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len)); + + metalen = xdp->data - xdp->data_meta; + if (metalen > 0) { + skb_metadata_set(skb, metalen); + __skb_pull(skb, metalen); + } + + skb_record_rx_queue(skb, rxq->queue_index); + + if (unlikely(xdp_buff_has_frags(xdp)) && + unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) { + napi_consume_skb(skb, true); + skb = NULL; + goto out; + } + + xsk_buff_free(xdp); + + skb->protocol = eth_type_trans(skb, rxq->dev); + +out: + local_unlock_nested_bh(&system_page_pool.bh_lock); + return skb; +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc); + +struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct sk_buff *skb, + struct net_device *dev) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf); + unsigned int headroom, frame_size; + void *hard_start; + u8 nr_frags; + + /* xdp frags frame */ + if (unlikely(xdp_frame_has_frags(xdpf))) + nr_frags = sinfo->nr_frags; + + /* Part of headroom was reserved to xdpf */ + headroom = sizeof(*xdpf) + xdpf->headroom; + + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. + */ + frame_size = xdpf->frame_sz; + + hard_start = xdpf->data - headroom; + skb = build_skb_around(skb, hard_start, frame_size); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); + + if (unlikely(xdp_frame_has_frags(xdpf))) + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdpf->frame_sz, + xdp_frame_get_skb_flags(xdpf)); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, dev); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + if (xdpf->mem_type == MEM_TYPE_PAGE_POOL) + skb_mark_for_recycle(skb); + + /* Allow SKB to reuse area used by xdp_frame */ + xdp_scrub_frame(xdpf); + + return skb; +} +EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame); + +struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct net_device *dev) +{ + struct sk_buff *skb; + + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + + return __xdp_build_skb_from_frame(xdpf, skb, dev); +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame); + +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) +{ + unsigned int headroom, totalsize; + struct xdp_frame *nxdpf; + struct page *page; + void *addr; + + headroom = xdpf->headroom + sizeof(*xdpf); + totalsize = headroom + xdpf->len; + + if (unlikely(totalsize > PAGE_SIZE)) + return NULL; + page = dev_alloc_page(); + if (!page) + return NULL; + addr = page_to_virt(page); + + memcpy(addr, xdpf, totalsize); + + nxdpf = addr; + nxdpf->data = addr + headroom; + nxdpf->frame_sz = PAGE_SIZE; + nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0; + + return nxdpf; +} + +__bpf_kfunc_start_defs(); + +/** + * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp. + * @ctx: XDP context pointer. + * @timestamp: Return value pointer. + * + * Return: + * * Returns 0 on success or ``-errno`` on error. + * * ``-EOPNOTSUPP`` : means device driver does not implement kfunc + * * ``-ENODATA`` : means no RX-timestamp available for this frame + */ +__bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) +{ + return -EOPNOTSUPP; +} + +/** + * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash. + * @ctx: XDP context pointer. + * @hash: Return value pointer. + * @rss_type: Return value pointer for RSS type. + * + * The RSS hash type (@rss_type) specifies what portion of packet headers NIC + * hardware used when calculating RSS hash value. The RSS type can be decoded + * via &enum xdp_rss_hash_type either matching on individual L3/L4 bits + * ``XDP_RSS_L*`` or by combined traditional *RSS Hashing Types* + * ``XDP_RSS_TYPE_L*``. + * + * Return: + * * Returns 0 on success or ``-errno`` on error. + * * ``-EOPNOTSUPP`` : means device driver doesn't implement kfunc + * * ``-ENODATA`` : means no RX-hash available for this frame + */ +__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) +{ + return -EOPNOTSUPP; +} + +/** + * bpf_xdp_metadata_rx_vlan_tag - Get XDP packet outermost VLAN tag + * @ctx: XDP context pointer. + * @vlan_proto: Destination pointer for VLAN Tag protocol identifier (TPID). + * @vlan_tci: Destination pointer for VLAN TCI (VID + DEI + PCP) + * + * In case of success, ``vlan_proto`` contains *Tag protocol identifier (TPID)*, + * usually ``ETH_P_8021Q`` or ``ETH_P_8021AD``, but some networks can use + * custom TPIDs. ``vlan_proto`` is stored in **network byte order (BE)** + * and should be used as follows: + * ``if (vlan_proto == bpf_htons(ETH_P_8021Q)) do_something();`` + * + * ``vlan_tci`` contains the remaining 16 bits of a VLAN tag. + * Driver is expected to provide those in **host byte order (usually LE)**, + * so the bpf program should not perform byte conversion. + * According to 802.1Q standard, *VLAN TCI (Tag control information)* + * is a bit field that contains: + * *VLAN identifier (VID)* that can be read with ``vlan_tci & 0xfff``, + * *Drop eligible indicator (DEI)* - 1 bit, + * *Priority code point (PCP)* - 3 bits. + * For detailed meaning of DEI and PCP, please refer to other sources. + * + * Return: + * * Returns 0 on success or ``-errno`` on error. + * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc + * * ``-ENODATA`` : VLAN tag was not stripped or is not available + */ +__bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, + __be16 *vlan_proto, u16 *vlan_tci) +{ + return -EOPNOTSUPP; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(xdp_metadata_kfunc_ids) +#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC +BTF_KFUNCS_END(xdp_metadata_kfunc_ids) + +static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { + .owner = THIS_MODULE, + .set = &xdp_metadata_kfunc_ids, +}; + +BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted) +#define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str) +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + +u32 bpf_xdp_metadata_kfunc_id(int id) +{ + /* xdp_metadata_kfunc_ids is sorted and can't be used */ + return xdp_metadata_kfunc_ids_unsorted[id]; +} + +bool bpf_dev_bound_kfunc_id(u32 btf_id) +{ + return btf_id_set8_contains(&xdp_metadata_kfunc_ids, btf_id); +} + +static int __init xdp_metadata_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set); +} +late_initcall(xdp_metadata_init); + +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val) +{ + val &= NETDEV_XDP_ACT_MASK; + if (dev->xdp_features == val) + return; + + netdev_assert_locked_or_invisible(dev); + dev->xdp_features = val; + + if (dev->reg_state == NETREG_REGISTERED) + call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); +} +EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked); + +void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +{ + netdev_lock(dev); + xdp_set_features_flag_locked(dev, val); + netdev_unlock(dev); +} +EXPORT_SYMBOL_GPL(xdp_set_features_flag); + +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg) +{ + xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT); + + if (support_sg) + val |= NETDEV_XDP_ACT_NDO_XMIT_SG; + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked); + +void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +{ + netdev_lock(dev); + xdp_features_set_redirect_target_locked(dev, support_sg); + netdev_unlock(dev); +} +EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target); + +void xdp_features_clear_redirect_target_locked(struct net_device *dev) +{ + xdp_features_t val = dev->xdp_features; + + val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG); + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked); + +void xdp_features_clear_redirect_target(struct net_device *dev) +{ + netdev_lock(dev); + xdp_features_clear_redirect_target_locked(dev); + netdev_unlock(dev); +} +EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target); |
