diff options
author | Petr Mladek <pmladek@suse.com> | 2020-06-01 10:15:16 +0200 |
---|---|---|
committer | Petr Mladek <pmladek@suse.com> | 2020-06-01 10:15:16 +0200 |
commit | d053cf0d771f6547cb0537759a9af63cf402908d (patch) | |
tree | df61806e45c6cf7e9cdd0b271f959f0962f8623e /net/core | |
parent | 6a0af9fc8ccef5304ef88dc7e27362732e047076 (diff) | |
parent | eb012d125a2419786f5bcaaf8a901babc7b6e3d7 (diff) |
Merge branch 'for-5.8' into for-linus
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/bpf_sk_storage.c | 290 | ||||
-rw-r--r-- | net/core/datagram.c | 39 | ||||
-rw-r--r-- | net/core/dev.c | 93 | ||||
-rw-r--r-- | net/core/dev_ioctl.c | 6 | ||||
-rw-r--r-- | net/core/devlink.c | 1344 | ||||
-rw-r--r-- | net/core/drop_monitor.c | 47 | ||||
-rw-r--r-- | net/core/fib_rules.c | 2 | ||||
-rw-r--r-- | net/core/filter.c | 236 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 4 | ||||
-rw-r--r-- | net/core/flow_offload.c | 34 | ||||
-rw-r--r-- | net/core/lwt_bpf.c | 2 | ||||
-rw-r--r-- | net/core/lwtunnel.c | 6 | ||||
-rw-r--r-- | net/core/neighbour.c | 13 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 133 | ||||
-rw-r--r-- | net/core/net-sysfs.h | 2 | ||||
-rw-r--r-- | net/core/net_namespace.c | 15 | ||||
-rw-r--r-- | net/core/netclassid_cgroup.c | 47 | ||||
-rw-r--r-- | net/core/page_pool.c | 100 | ||||
-rw-r--r-- | net/core/pktgen.c | 50 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 62 | ||||
-rw-r--r-- | net/core/skbuff.c | 30 | ||||
-rw-r--r-- | net/core/skmsg.c | 10 | ||||
-rw-r--r-- | net/core/sock.c | 33 | ||||
-rw-r--r-- | net/core/sock_map.c | 342 | ||||
-rw-r--r-- | net/core/sock_reuseport.c | 50 | ||||
-rw-r--r-- | net/core/xdp.c | 2 |
26 files changed, 2437 insertions, 555 deletions
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 458be6b3eda9..756b63b6f7b3 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -8,6 +8,7 @@ #include <linux/bpf.h> #include <net/bpf_sk_storage.h> #include <net/sock.h> +#include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> static atomic_t cache_idx; @@ -60,7 +61,7 @@ struct bpf_sk_storage_data { * the number of cachelines access during the cache hit case. */ struct bpf_sk_storage_map __rcu *smap; - u8 data[0] __aligned(8); + u8 data[] __aligned(8); }; /* Linked to bpf_sk_storage and bpf_sk_storage_map */ @@ -606,6 +607,14 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) kfree(map); } +/* U16_MAX is much more than enough for sk local storage + * considering a tcp_sock is ~2k. + */ +#define MAX_VALUE_SIZE \ + min_t(u32, \ + (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \ + (U16_MAX - sizeof(struct bpf_sk_storage_elem))) + static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || @@ -619,12 +628,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (attr->value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) || - /* U16_MAX is much more than enough for sk local storage - * considering a tcp_sock is ~2k. - */ - attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem)) + if (attr->value_size > MAX_VALUE_SIZE) return -E2BIG; return 0; @@ -643,9 +647,10 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); + nbuckets = roundup_pow_of_two(num_possible_cpus()); /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ - smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus()))); - nbuckets = 1U << smap->bucket_log; + nbuckets = max_t(u32, 2, nbuckets); + smap->bucket_log = ilog2(nbuckets); cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); ret = bpf_map_charge_init(&smap->map.memory, cost); @@ -909,3 +914,270 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_SOCKET, }; + +struct bpf_sk_storage_diag { + u32 nr_maps; + struct bpf_map *maps[]; +}; + +/* The reply will be like: + * INET_DIAG_BPF_SK_STORAGES (nla_nest) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * .... + */ +static int nla_value_size(u32 value_size) +{ + /* SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + */ + return nla_total_size(0) + nla_total_size(sizeof(u32)) + + nla_total_size_64bit(value_size); +} + +void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) +{ + u32 i; + + if (!diag) + return; + + for (i = 0; i < diag->nr_maps; i++) + bpf_map_put(diag->maps[i]); + + kfree(diag); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free); + +static bool diag_check_dup(const struct bpf_sk_storage_diag *diag, + const struct bpf_map *map) +{ + u32 i; + + for (i = 0; i < diag->nr_maps; i++) { + if (diag->maps[i] == map) + return true; + } + + return false; +} + +struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) +{ + struct bpf_sk_storage_diag *diag; + struct nlattr *nla; + u32 nr_maps = 0; + int rem, err; + + /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as + * the map_alloc_check() side also does. + */ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + nla_for_each_nested(nla, nla_stgs, rem) { + if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + nr_maps++; + } + + diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps, + GFP_KERNEL); + if (!diag) + return ERR_PTR(-ENOMEM); + + nla_for_each_nested(nla, nla_stgs, rem) { + struct bpf_map *map; + int map_fd; + + if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + continue; + + map_fd = nla_get_u32(nla); + map = bpf_map_get(map_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto err_free; + } + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) { + bpf_map_put(map); + err = -EINVAL; + goto err_free; + } + if (diag_check_dup(diag, map)) { + bpf_map_put(map); + err = -EEXIST; + goto err_free; + } + diag->maps[diag->nr_maps++] = map; + } + + return diag; + +err_free: + bpf_sk_storage_diag_free(diag); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); + +static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb) +{ + struct nlattr *nla_stg, *nla_value; + struct bpf_sk_storage_map *smap; + + /* It cannot exceed max nlattr's payload */ + BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE); + + nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE); + if (!nla_stg) + return -EMSGSIZE; + + smap = rcu_dereference(sdata->smap); + if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) + goto errout; + + nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE, + smap->map.value_size, + SK_DIAG_BPF_STORAGE_PAD); + if (!nla_value) + goto errout; + + if (map_value_has_spin_lock(&smap->map)) + copy_map_value_locked(&smap->map, nla_data(nla_value), + sdata->data, true); + else + copy_map_value(&smap->map, nla_data(nla_value), sdata->data); + + nla_nest_end(skb, nla_stg); + return 0; + +errout: + nla_nest_cancel(skb, nla_stg); + return -EMSGSIZE; +} + +static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_elem *selem; + struct bpf_sk_storage_map *smap; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + + rcu_read_lock(); + + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + smap = rcu_dereference(SDATA(selem)->smap); + diag_size += nla_value_size(smap->map.value_size); + + if (nla_stgs && diag_get(SDATA(selem), skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} + +int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_data *sdata; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + u32 i; + + *res_diag_size = 0; + + /* No map has been specified. Dump all. */ + if (!diag->nr_maps) + return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type, + res_diag_size); + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + for (i = 0; i < diag->nr_maps; i++) { + sdata = __sk_storage_lookup(sk_storage, + (struct bpf_sk_storage_map *)diag->maps[i], + false); + + if (!sdata) + continue; + + diag_size += nla_value_size(diag->maps[i]->value_size); + + if (nla_stgs && diag_get(sdata, skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put); diff --git a/net/core/datagram.c b/net/core/datagram.c index a78e7f864c1e..639745d4f3b9 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -51,6 +51,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/uio.h> +#include <linux/indirect_call_wrapper.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -166,8 +167,6 @@ done: struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, - void (*destructor)(struct sock *sk, - struct sk_buff *skb), int *off, int *err, struct sk_buff **last) { @@ -198,8 +197,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, refcount_inc(&skb->users); } else { __skb_unlink(skb, queue); - if (destructor) - destructor(sk, skb); } *off = _off; return skb; @@ -212,7 +209,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, * @sk: socket * @queue: socket queue from which to receive * @flags: MSG\_ flags - * @destructor: invoked under the receive lock on successful dequeue * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts * @err: error code returned @@ -245,10 +241,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, */ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, - unsigned int flags, - void (*destructor)(struct sock *sk, - struct sk_buff *skb), - int *off, int *err, + unsigned int flags, int *off, int *err, struct sk_buff **last) { struct sk_buff *skb; @@ -269,8 +262,8 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, * However, this function was correct in any case. 8) */ spin_lock_irqsave(&queue->lock, cpu_flags); - skb = __skb_try_recv_from_queue(sk, queue, flags, destructor, - off, &error, last); + skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, + last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) goto no_packet; @@ -293,10 +286,7 @@ EXPORT_SYMBOL(__skb_try_recv_datagram); struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, - unsigned int flags, - void (*destructor)(struct sock *sk, - struct sk_buff *skb), - int *off, int *err) + unsigned int flags, int *off, int *err) { struct sk_buff *skb, *last; long timeo; @@ -304,8 +294,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - skb = __skb_try_recv_datagram(sk, sk_queue, flags, destructor, - off, err, &last); + skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err, + &last); if (skb) return skb; @@ -326,7 +316,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags | (noblock ? MSG_DONTWAIT : 0), - NULL, &off, err); + &off, err); } EXPORT_SYMBOL(skb_recv_datagram); @@ -414,6 +404,11 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) } EXPORT_SYMBOL(skb_kill_datagram); +INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr, + size_t bytes, + void *data __always_unused, + struct iov_iter *i)); + static int __skb_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, bool fault_short, size_t (*cb)(const void *, size_t, void *, @@ -427,7 +422,8 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - n = cb(skb->data + offset, copy, data, to); + n = INDIRECT_CALL_1(cb, simple_copy_to_iter, + skb->data + offset, copy, data, to); offset += n; if (n != copy) goto short_copy; @@ -449,8 +445,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, if (copy > len) copy = len; - n = cb(vaddr + skb_frag_off(frag) + offset - start, - copy, data, to); + n = INDIRECT_CALL_1(cb, simple_copy_to_iter, + vaddr + skb_frag_off(frag) + offset - start, + copy, data, to); kunmap(page); offset += n; if (n != copy) diff --git a/net/core/dev.c b/net/core/dev.c index 17529d49faec..9c9e763bfe0e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -146,7 +146,6 @@ #include "net-sysfs.h" #define MAX_GRO_SKBS 8 -#define MAX_NEST_DEV 8 /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) @@ -331,6 +330,12 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) name_node = netdev_name_node_lookup(net, name); if (!name_node) return -ENOENT; + /* lookup might have found our primary name or a name belonging + * to another device. + */ + if (name_node == dev->name_node || name_node->dev != dev) + return -EINVAL; + __netdev_name_node_alt_destroy(name_node); return 0; @@ -3071,6 +3076,8 @@ static u16 skb_tx_hash(const struct net_device *dev, if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); + if (hash >= qoffset) + hash -= qoffset; while (unlikely(hash >= qcount)) hash -= qcount; return hash + qoffset; @@ -3259,7 +3266,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. * - * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. + * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. */ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) @@ -3288,7 +3295,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, features &= ~NETIF_F_GSO_PARTIAL; } - BUILD_BUG_ON(SKB_SGO_CB_OFFSET + + BUILD_BUG_ON(SKB_GSO_CB_OFFSET + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); @@ -3657,26 +3664,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { - if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) && - qdisc_run_begin(q)) { - if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, - &q->state))) { - __qdisc_drop(skb, &to_free); - rc = NET_XMIT_DROP; - goto end_run; - } - qdisc_bstats_cpu_update(q, skb); - - rc = NET_XMIT_SUCCESS; - if (sch_direct_xmit(skb, q, dev, txq, NULL, true)) - __qdisc_run(q); - -end_run: - qdisc_run_end(q); - } else { - rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; - qdisc_run(q); - } + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + qdisc_run(q); if (unlikely(to_free)) kfree_skb_list(to_free); @@ -4527,14 +4516,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ - if (skb_cloned(skb) || skb_is_tc_redirected(skb)) + if (skb_is_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also * native XDP provides, thus we need to do it here as well. */ - if (skb_is_nonlinear(skb) || + if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); int troom = skb->tail + skb->data_len - skb->end; @@ -4649,7 +4638,6 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) kfree_skb(skb); } } -EXPORT_SYMBOL_GPL(generic_xdp_tx); static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); @@ -4860,7 +4848,8 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { + switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list, + &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -5074,7 +5063,7 @@ skip_taps: goto out; } #endif - skb_reset_tc(skb); + skb_reset_redirect(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; @@ -5206,7 +5195,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) * * More direct receive version of netif_receive_skb(). It should * only be used by callers that have a need to skip RPS and Generic XDP. - * Caller must also take care of handling if (page_is_)pfmemalloc. + * Caller must also take care of handling if ``(page_is_)pfmemalloc``. * * This function may only be called from softirq context and interrupts * should be enabled. @@ -5792,7 +5781,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (&ptype->list == head) goto normal; - if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) { + if (PTR_ERR(pp) == -EINPROGRESS) { ret = GRO_CONSUMED; goto ok; } @@ -7201,8 +7190,8 @@ static int __netdev_walk_all_lower_dev(struct net_device *dev, return 0; } -static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, - struct list_head **iter) +struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *lower; @@ -7214,6 +7203,7 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, return lower->dev; } +EXPORT_SYMBOL(netdev_next_lower_dev_rcu); static u8 __netdev_upper_depth(struct net_device *dev) { @@ -8665,15 +8655,17 @@ static void dev_xdp_uninstall(struct net_device *dev) * @dev: device * @extack: netlink extended ack * @fd: new program fd or negative value to clear + * @expected_fd: old program fd that userspace expects to replace or clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, u32 flags) + int fd, int expected_fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; enum bpf_netdev_command query; + u32 prog_id, expected_id = 0; struct bpf_prog *prog = NULL; bpf_op_t bpf_op, bpf_chk; bool offload; @@ -8694,15 +8686,29 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, if (bpf_op == bpf_chk) bpf_chk = generic_xdp_install; - if (fd >= 0) { - u32 prog_id; + prog_id = __dev_xdp_query(dev, bpf_op, query); + if (flags & XDP_FLAGS_REPLACE) { + if (expected_fd >= 0) { + prog = bpf_prog_get_type_dev(expected_fd, + BPF_PROG_TYPE_XDP, + bpf_op == ops->ndo_bpf); + if (IS_ERR(prog)) + return PTR_ERR(prog); + expected_id = prog->aux->id; + bpf_prog_put(prog); + } + if (prog_id != expected_id) { + NL_SET_ERR_MSG(extack, "Active program does not match expected"); + return -EEXIST; + } + } + if (fd >= 0) { if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) { NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time"); return -EEXIST; } - prog_id = __dev_xdp_query(dev, bpf_op, query); if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) { NL_SET_ERR_MSG(extack, "XDP program already attached"); return -EBUSY; @@ -8725,7 +8731,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return 0; } } else { - if (!__dev_xdp_query(dev, bpf_op, query)) + if (!prog_id) return 0; } @@ -9293,6 +9299,10 @@ int register_netdevice(struct net_device *dev) BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); + ret = ethtool_check_ops(dev->ethtool_ops); + if (ret) + return ret; + spin_lock_init(&dev->addr_list_lock); lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); @@ -10016,6 +10026,7 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { + struct net *net_old = dev_net(dev); int err, new_nsid, new_ifindex; ASSERT_RTNL(); @@ -10031,7 +10042,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Get out if there is nothing todo */ err = 0; - if (net_eq(dev_net(dev), net)) + if (net_eq(net_old, net)) goto out; /* Pick the destination device name, and ensure @@ -10107,6 +10118,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char err = device_rename(&dev->dev, dev->name); WARN_ON(err); + /* Adapt owner in case owning user namespace of target network + * namespace is different from the original one. + */ + err = netdev_change_owner(dev, net_old, net); + WARN_ON(err); + /* Add the device back in the hashes */ list_netdevice(dev); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index dbaebbe573f0..547b587c1950 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -190,6 +190,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_TX_ONESTEP_P2P: tx_type_valid = 1; break; + case __HWTSTAMP_TX_CNT: + /* not a real value */ + break; } switch (rx_filter) { @@ -211,6 +214,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_FILTER_NTP_ALL: rx_filter_valid = 1; break; + case __HWTSTAMP_FILTER_CNT: + /* not a real value */ + break; } if (!tx_type_valid || !rx_filter_valid) diff --git a/net/core/devlink.c b/net/core/devlink.c index ca1df0ec3c97..80f97722f31f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -344,7 +344,7 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, struct devlink_region { struct devlink *devlink; struct list_head list; - const char *name; + const struct devlink_region_ops *ops; struct list_head snapshot_list; u32 max_snapshots; u32 cur_snapshots; @@ -354,7 +354,6 @@ struct devlink_region { struct devlink_snapshot { struct list_head list; struct devlink_region *region; - devlink_snapshot_data_dest_t *data_destructor; u8 *data; u32 id; }; @@ -365,7 +364,7 @@ devlink_region_get_by_name(struct devlink *devlink, const char *region_name) struct devlink_region *region; list_for_each_entry(region, &devlink->region_list, list) - if (!strcmp(region->name, region_name)) + if (!strcmp(region->ops->name, region_name)) return region; return NULL; @@ -545,6 +544,7 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: + case DEVLINK_PORT_FLAVOUR_VIRTUAL: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->phys.port_number)) return -EMSGSIZE; @@ -2103,11 +2103,11 @@ err_action_values_put: static struct devlink_dpipe_table * devlink_dpipe_table_find(struct list_head *dpipe_tables, - const char *table_name) + const char *table_name, struct devlink *devlink) { struct devlink_dpipe_table *table; - - list_for_each_entry_rcu(table, dpipe_tables, list) { + list_for_each_entry_rcu(table, dpipe_tables, list, + lockdep_is_held(&devlink->lock)) { if (!strcmp(table->name, table_name)) return table; } @@ -2226,7 +2226,7 @@ static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); if (!table) return -EINVAL; @@ -2382,7 +2382,7 @@ static int devlink_dpipe_table_counters_set(struct devlink *devlink, struct devlink_dpipe_table *table; table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); if (!table) return -EINVAL; @@ -2709,7 +2709,7 @@ static struct net *devlink_netns_get(struct sk_buff *skb, struct net *net; if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { - NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified"); + NL_SET_ERR_MSG_MOD(info->extack, "multiple netns identifying attributes specified"); return ERR_PTR(-EINVAL); } @@ -2727,7 +2727,7 @@ static struct net *devlink_netns_get(struct sk_buff *skb, net = ERR_PTR(-EINVAL); } if (IS_ERR(net)) { - NL_SET_ERR_MSG(info->extack, "Unknown network namespace"); + NL_SET_ERR_MSG_MOD(info->extack, "Unknown network namespace"); return ERR_PTR(-EINVAL); } if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { @@ -3352,34 +3352,41 @@ devlink_param_value_get_from_info(const struct devlink_param *param, struct genl_info *info, union devlink_param_value *value) { + struct nlattr *param_data; int len; - if (param->type != DEVLINK_PARAM_TYPE_BOOL && - !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) + param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]; + + if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data) return -EINVAL; switch (param->type) { case DEVLINK_PARAM_TYPE_U8: - value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + if (nla_len(param_data) != sizeof(u8)) + return -EINVAL; + value->vu8 = nla_get_u8(param_data); break; case DEVLINK_PARAM_TYPE_U16: - value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + if (nla_len(param_data) != sizeof(u16)) + return -EINVAL; + value->vu16 = nla_get_u16(param_data); break; case DEVLINK_PARAM_TYPE_U32: - value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + if (nla_len(param_data) != sizeof(u32)) + return -EINVAL; + value->vu32 = nla_get_u32(param_data); break; case DEVLINK_PARAM_TYPE_STRING: - len = strnlen(nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]), - nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])); - if (len == nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) || + len = strnlen(nla_data(param_data), nla_len(param_data)); + if (len == nla_len(param_data) || len >= __DEVLINK_PARAM_MAX_STRING_VALUE) return -EINVAL; - strcpy(value->vstr, - nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])); + strcpy(value->vstr, nla_data(param_data)); break; case DEVLINK_PARAM_TYPE_BOOL: - value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ? - true : false; + if (param_data && nla_len(param_data)) + return -EINVAL; + value->vbool = nla_get_flag(param_data); break; } return 0; @@ -3687,7 +3694,7 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto nla_put_failure; - err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name); + err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); if (err) goto nla_put_failure; @@ -3733,7 +3740,7 @@ static void devlink_nl_region_notify(struct devlink_region *region, goto out_cancel_msg; err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, - region->name); + region->ops->name); if (err) goto out_cancel_msg; @@ -3761,13 +3768,201 @@ out_free_msg: nlmsg_free(msg); } +/** + * __devlink_snapshot_id_increment - Increment number of snapshots using an id + * @devlink: devlink instance + * @id: the snapshot id + * + * Track when a new snapshot begins using an id. Load the count for the + * given id from the snapshot xarray, increment it, and store it back. + * + * Called when a new snapshot is created with the given id. + * + * The id *must* have been previously allocated by + * devlink_region_snapshot_id_get(). + * + * Returns 0 on success, or an error on failure. + */ +static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id) +{ + unsigned long count; + void *p; + + lockdep_assert_held(&devlink->lock); + + p = xa_load(&devlink->snapshot_ids, id); + if (WARN_ON(!p)) + return -EINVAL; + + if (WARN_ON(!xa_is_value(p))) + return -EINVAL; + + count = xa_to_value(p); + count++; + + return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), + GFP_KERNEL)); +} + +/** + * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id + * @devlink: devlink instance + * @id: the snapshot id + * + * Track when a snapshot is deleted and stops using an id. Load the count + * for the given id from the snapshot xarray, decrement it, and store it + * back. + * + * If the count reaches zero, erase this id from the xarray, freeing it + * up for future re-use by devlink_region_snapshot_id_get(). + * + * Called when a snapshot using the given id is deleted, and when the + * initial allocator of the id is finished using it. + */ +static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id) +{ + unsigned long count; + void *p; + + lockdep_assert_held(&devlink->lock); + + p = xa_load(&devlink->snapshot_ids, id); + if (WARN_ON(!p)) + return; + + if (WARN_ON(!xa_is_value(p))) + return; + + count = xa_to_value(p); + + if (count > 1) { + count--; + xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), + GFP_KERNEL); + } else { + /* If this was the last user, we can erase this id */ + xa_erase(&devlink->snapshot_ids, id); + } +} + +/** + * __devlink_snapshot_id_insert - Insert a specific snapshot ID + * @devlink: devlink instance + * @id: the snapshot id + * + * Mark the given snapshot id as used by inserting a zero value into the + * snapshot xarray. + * + * This must be called while holding the devlink instance lock. Unlike + * devlink_snapshot_id_get, the initial reference count is zero, not one. + * It is expected that the id will immediately be used before + * releasing the devlink instance lock. + * + * Returns zero on success, or an error code if the snapshot id could not + * be inserted. + */ +static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id) +{ + lockdep_assert_held(&devlink->lock); + + if (WARN_ON(xa_load(&devlink->snapshot_ids, id))) + return -EEXIST; + + return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(0), + GFP_KERNEL)); +} + +/** + * __devlink_region_snapshot_id_get - get snapshot ID + * @devlink: devlink instance + * @id: storage to return snapshot id + * + * Allocates a new snapshot id. Returns zero on success, or a negative + * error on failure. Must be called while holding the devlink instance + * lock. + * + * Snapshot IDs are tracked using an xarray which stores the number of + * users of the snapshot id. + * + * Note that the caller of this function counts as a 'user', in order to + * avoid race conditions. The caller must release its hold on the + * snapshot by using devlink_region_snapshot_id_put. + */ +static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) +{ + lockdep_assert_held(&devlink->lock); + + return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1), + xa_limit_32b, GFP_KERNEL); +} + +/** + * __devlink_region_snapshot_create - create a new snapshot + * This will add a new snapshot of a region. The snapshot + * will be stored on the region struct and can be accessed + * from devlink. This is useful for future analyses of snapshots. + * Multiple snapshots can be created on a region. + * The @snapshot_id should be obtained using the getter function. + * + * Must be called only while holding the devlink instance lock. + * + * @region: devlink region of the snapshot + * @data: snapshot data + * @snapshot_id: snapshot id to be created + */ +static int +__devlink_region_snapshot_create(struct devlink_region *region, + u8 *data, u32 snapshot_id) +{ + struct devlink *devlink = region->devlink; + struct devlink_snapshot *snapshot; + int err; + + lockdep_assert_held(&devlink->lock); + + /* check if region can hold one more snapshot */ + if (region->cur_snapshots == region->max_snapshots) + return -ENOSPC; + + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) + return -EEXIST; + + snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); + if (!snapshot) + return -ENOMEM; + + err = __devlink_snapshot_id_increment(devlink, snapshot_id); + if (err) + goto err_snapshot_id_increment; + + snapshot->id = snapshot_id; + snapshot->region = region; + snapshot->data = data; + + list_add_tail(&snapshot->list, ®ion->snapshot_list); + + region->cur_snapshots++; + + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW); + return 0; + +err_snapshot_id_increment: + kfree(snapshot); + return err; +} + static void devlink_region_snapshot_del(struct devlink_region *region, struct devlink_snapshot *snapshot) { + struct devlink *devlink = region->devlink; + + lockdep_assert_held(&devlink->lock); + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); region->cur_snapshots--; list_del(&snapshot->list); - (*snapshot->data_destructor)(snapshot->data); + region->ops->destructor(snapshot->data); + __devlink_snapshot_id_decrement(devlink, snapshot->id); kfree(snapshot); } @@ -3870,6 +4065,71 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb, return 0; } +static int +devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_region *region; + const char *region_name; + u32 snapshot_id; + u8 *data; + int err; + + if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) { + NL_SET_ERR_MSG_MOD(info->extack, "No region name provided"); + return -EINVAL; + } + + if (!info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { + NL_SET_ERR_MSG_MOD(info->extack, "No snapshot id provided"); + return -EINVAL; + } + + region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); + region = devlink_region_get_by_name(devlink, region_name); + if (!region) { + NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not exist"); + return -EINVAL; + } + + if (!region->ops->snapshot) { + NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not support taking an immediate snapshot"); + return -EOPNOTSUPP; + } + + if (region->cur_snapshots == region->max_snapshots) { + NL_SET_ERR_MSG_MOD(info->extack, "The region has reached the maximum number of stored snapshots"); + return -ENOSPC; + } + + snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { + NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); + return -EEXIST; + } + + err = __devlink_snapshot_id_insert(devlink, snapshot_id); + if (err) + return err; + + err = region->ops->snapshot(devlink, info->extack, &data); + if (err) + goto err_snapshot_capture; + + err = __devlink_region_snapshot_create(region, data, snapshot_id); + if (err) + goto err_snapshot_create; + + return 0; + +err_snapshot_create: + region->ops->destructor(data); +err_snapshot_capture: + __devlink_snapshot_id_decrement(devlink, snapshot_id); + return err; +} + static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, struct devlink *devlink, u8 *chunk, u32 chunk_size, @@ -3986,6 +4246,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } + /* return 0 if there is no further data to read */ + if (start_offset >= region->size) { + err = 0; + goto out_unlock; + } + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, DEVLINK_CMD_REGION_READ); @@ -4226,11 +4492,17 @@ struct devlink_fmsg_item { int attrtype; u8 nla_type; u16 len; - int value[0]; + int value[]; }; struct devlink_fmsg { struct list_head item_list; + bool putting_binary; /* This flag forces enclosing of binary data + * in an array brackets. It forces using + * of designated API: + * devlink_fmsg_binary_pair_nest_start() + * devlink_fmsg_binary_pair_nest_end() + */ }; static struct devlink_fmsg *devlink_fmsg_alloc(void) @@ -4274,17 +4546,26 @@ static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg) { + if (fmsg->putting_binary) + return -EINVAL; + return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START); } EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start); static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg) { + if (fmsg->putting_binary) + return -EINVAL; + return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END); } int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg) { + if (fmsg->putting_binary) + return -EINVAL; + return devlink_fmsg_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end); @@ -4295,6 +4576,9 @@ static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name) { struct devlink_fmsg_item *item; + if (fmsg->putting_binary) + return -EINVAL; + if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE) return -EMSGSIZE; @@ -4315,6 +4599,9 @@ int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) { int err; + if (fmsg->putting_binary) + return -EINVAL; + err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START); if (err) return err; @@ -4329,6 +4616,9 @@ EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start); int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) { + if (fmsg->putting_binary) + return -EINVAL; + return devlink_fmsg_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end); @@ -4338,6 +4628,9 @@ int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, { int err; + if (fmsg->putting_binary) + return -EINVAL; + err = devlink_fmsg_pair_nest_start(fmsg, name); if (err) return err; @@ -4354,6 +4647,9 @@ int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) { int err; + if (fmsg->putting_binary) + return -EINVAL; + err = devlink_fmsg_nest_end(fmsg); if (err) return err; @@ -4366,6 +4662,30 @@ int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) } EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end); +int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg, + const char *name) +{ + int err; + + err = devlink_fmsg_arr_pair_nest_start(fmsg, name); + if (err) + return err; + + fmsg->putting_binary = true; + return err; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start); + +int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg) +{ + if (!fmsg->putting_binary) + return -EINVAL; + + fmsg->putting_binary = false; + return devlink_fmsg_arr_pair_nest_end(fmsg); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end); + static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg, const void *value, u16 value_len, u8 value_nla_type) @@ -4390,40 +4710,59 @@ static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg, int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) { + if (fmsg->putting_binary) + return -EINVAL; + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG); } EXPORT_SYMBOL_GPL(devlink_fmsg_bool_put); int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) { + if (fmsg->putting_binary) + return -EINVAL; + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8); } EXPORT_SYMBOL_GPL(devlink_fmsg_u8_put); int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) { + if (fmsg->putting_binary) + return -EINVAL; + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32); } EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) { + if (fmsg->putting_binary) + return -EINVAL; + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64); } EXPORT_SYMBOL_GPL(devlink_fmsg_u64_put); int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) { + if (fmsg->putting_binary) + return -EINVAL; + return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, NLA_NUL_STRING); } EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); -static int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len) +int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, + u16 value_len) { + if (!fmsg->putting_binary) + return -EINVAL; + return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); } +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, bool value) @@ -4534,10 +4873,11 @@ int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, const void *value, u32 value_len) { u32 data_size; + int end_err; u32 offset; int err; - err = devlink_fmsg_arr_pair_nest_start(fmsg, name); + err = devlink_fmsg_binary_pair_nest_start(fmsg, name); if (err) return err; @@ -4547,14 +4887,18 @@ int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, data_size = DEVLINK_FMSG_MAX_SIZE; err = devlink_fmsg_binary_put(fmsg, value + offset, data_size); if (err) - return err; + break; + /* Exit from loop with a break (instead of + * return) to make sure putting_binary is turned off in + * devlink_fmsg_binary_pair_nest_end + */ } - err = devlink_fmsg_arr_pair_nest_end(fmsg); - if (err) - return err; + end_err = devlink_fmsg_binary_pair_nest_end(fmsg); + if (end_err) + err = end_err; - return 0; + return err; } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); @@ -4745,6 +5089,7 @@ struct devlink_health_reporter { struct mutex dump_lock; /* lock parallel read/write from dump buffers */ u64 graceful_period; bool auto_recover; + bool auto_dump; u8 health_state; u64 dump_ts; u64 dump_real_ts; @@ -4780,14 +5125,12 @@ devlink_health_reporter_find_by_name(struct devlink *devlink, * @devlink: devlink * @ops: ops * @graceful_period: to avoid recovery loops, in msecs - * @auto_recover: auto recover when error occurs * @priv: priv */ struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, bool auto_recover, - void *priv) + u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; @@ -4797,8 +5140,7 @@ devlink_health_reporter_create(struct devlink *devlink, goto unlock; } - if (WARN_ON(auto_recover && !ops->recover) || - WARN_ON(graceful_period && !ops->recover)) { + if (WARN_ON(graceful_period && !ops->recover)) { reporter = ERR_PTR(-EINVAL); goto unlock; } @@ -4813,7 +5155,8 @@ devlink_health_reporter_create(struct devlink *devlink, reporter->ops = ops; reporter->devlink = devlink; reporter->graceful_period = graceful_period; - reporter->auto_recover = auto_recover; + reporter->auto_recover = !!ops->recover; + reporter->auto_dump = !!ops->dump; mutex_init(&reporter->dump_lock); refcount_set(&reporter->refcount, 1); list_add_tail(&reporter->list, &devlink->reporter_list); @@ -4894,6 +5237,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, reporter->dump_real_ts, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; + if (reporter->ops->dump && + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, + reporter->auto_dump)) + goto reporter_nest_cancel; nla_nest_end(msg, reporter_attr); genlmsg_end(msg, hdr); @@ -5040,10 +5387,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; - mutex_lock(&reporter->dump_lock); - /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx, NULL); - mutex_unlock(&reporter->dump_lock); + if (reporter->auto_dump) { + mutex_lock(&reporter->dump_lock); + /* store current dump of current error, for later analysis */ + devlink_health_do_dump(reporter, priv_ctx, NULL); + mutex_unlock(&reporter->dump_lock); + } if (reporter->auto_recover) return devlink_health_reporter_recover(reporter, @@ -5217,6 +5566,11 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, err = -EOPNOTSUPP; goto out; } + if (!reporter->ops->dump && + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) { + err = -EOPNOTSUPP; + goto out; + } if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) reporter->graceful_period = @@ -5226,6 +5580,10 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, reporter->auto_recover = nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) + reporter->auto_dump = + nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); + devlink_health_reporter_put(reporter); return 0; out: @@ -5362,18 +5720,35 @@ struct devlink_stats { }; /** + * struct devlink_trap_policer_item - Packet trap policer attributes. + * @policer: Immutable packet trap policer attributes. + * @rate: Rate in packets / sec. + * @burst: Burst size in packets. + * @list: trap_policer_list member. + * + * Describes packet trap policer attributes. Created by devlink during trap + * policer registration. + */ +struct devlink_trap_policer_item { + const struct devlink_trap_policer *policer; + u64 rate; + u64 burst; + struct list_head list; +}; + +/** * struct devlink_trap_group_item - Packet trap group attributes. * @group: Immutable packet trap group attributes. - * @refcount: Number of trap items using the group. + * @policer_item: Associated policer item. Can be NULL. * @list: trap_group_list member. * @stats: Trap group statistics. * * Describes packet trap group attributes. Created by devlink during trap - * registration. + * group registration. */ struct devlink_trap_group_item { const struct devlink_trap_group *group; - refcount_t refcount; + struct devlink_trap_policer_item *policer_item; struct list_head list; struct devlink_stats __percpu *stats; }; @@ -5399,6 +5774,19 @@ struct devlink_trap_item { void *priv; }; +static struct devlink_trap_policer_item * +devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id) +{ + struct devlink_trap_policer_item *policer_item; + + list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { + if (policer_item->policer->id == id) + return policer_item; + } + + return NULL; +} + static struct devlink_trap_item * devlink_trap_item_lookup(struct devlink *devlink, const char *name) { @@ -5456,6 +5844,9 @@ static int devlink_trap_metadata_put(struct sk_buff *msg, if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) && nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT)) goto nla_put_failure; + if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) && + nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE)) + goto nla_put_failure; nla_nest_end(msg, attr); @@ -5723,6 +6114,19 @@ devlink_trap_group_item_lookup(struct devlink *devlink, const char *name) } static struct devlink_trap_group_item * +devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id) +{ + struct devlink_trap_group_item *group_item; + + list_for_each_entry(group_item, &devlink->trap_group_list, list) { + if (group_item->group->id == id) + return group_item; + } + + return NULL; +} + +static struct devlink_trap_group_item * devlink_trap_group_item_get_from_info(struct devlink *devlink, struct genl_info *info) { @@ -5759,6 +6163,11 @@ devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) goto nla_put_failure; + if (group_item->policer_item && + nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, + group_item->policer_item->policer->id)) + goto nla_put_failure; + err = devlink_trap_stats_put(msg, group_item->stats); if (err) goto nla_put_failure; @@ -5860,7 +6269,7 @@ __devlink_trap_group_action_set(struct devlink *devlink, int err; list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (strcmp(trap_item->trap->group.name, group_name)) + if (strcmp(trap_item->group_item->group->name, group_name)) continue; err = __devlink_trap_action_set(devlink, trap_item, trap_action, extack); @@ -5874,7 +6283,7 @@ __devlink_trap_group_action_set(struct devlink *devlink, static int devlink_trap_group_action_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, - struct genl_info *info) + struct genl_info *info, bool *p_modified) { enum devlink_trap_action trap_action; int err; @@ -5893,6 +6302,47 @@ devlink_trap_group_action_set(struct devlink *devlink, if (err) return err; + *p_modified = true; + + return 0; +} + +static int devlink_trap_group_set(struct devlink *devlink, + struct devlink_trap_group_item *group_item, + struct genl_info *info) +{ + struct devlink_trap_policer_item *policer_item; + struct netlink_ext_ack *extack = info->extack; + const struct devlink_trap_policer *policer; + struct nlattr **attrs = info->attrs; + int err; + + if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) + return 0; + + if (!devlink->ops->trap_group_set) + return -EOPNOTSUPP; + + policer_item = group_item->policer_item; + if (attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) { + u32 policer_id; + + policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); + policer_item = devlink_trap_policer_item_lookup(devlink, + policer_id); + if (policer_id && !policer_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); + return -ENOENT; + } + } + policer = policer_item ? policer_item->policer : NULL; + + err = devlink->ops->trap_group_set(devlink, group_item->group, policer); + if (err) + return err; + + group_item->policer_item = policer_item; + return 0; } @@ -5902,6 +6352,7 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_group_item *group_item; + bool modified = false; int err; if (list_empty(&devlink->trap_group_list)) @@ -5913,14 +6364,262 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, return -ENOENT; } - err = devlink_trap_group_action_set(devlink, group_item, info); + err = devlink_trap_group_action_set(devlink, group_item, info, + &modified); if (err) return err; + err = devlink_trap_group_set(devlink, group_item, info); + if (err) + goto err_trap_group_set; + return 0; + +err_trap_group_set: + if (modified) + NL_SET_ERR_MSG_MOD(extack, "Trap group set failed, but some changes were committed already"); + return err; +} + +static struct devlink_trap_policer_item * +devlink_trap_policer_item_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + u32 id; + + if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) + return NULL; + id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); + + return devlink_trap_policer_item_lookup(devlink, id); +} + +static int +devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_policer *policer) +{ + struct nlattr *attr; + u64 drops; + int err; + + if (!devlink->ops->trap_policer_counter_get) + return 0; + + err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops); + if (err) + return err; + + attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int +devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_policer_item *policer_item, + enum devlink_command cmd, u32 portid, u32 seq, + int flags) +{ + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, + policer_item->policer->id)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_RATE, + policer_item->rate, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_BURST, + policer_item->burst, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + err = devlink_trap_policer_stats_put(msg, devlink, + policer_item->policer); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_trap_policer_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_trap_policer_item *policer_item; + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + if (list_empty(&devlink->trap_policer_list)) + return -EOPNOTSUPP; + + policer_item = devlink_trap_policer_item_get_from_info(devlink, info); + if (!policer_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); + return -ENOENT; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) + goto err_trap_policer_fill; + + return genlmsg_reply(msg, info); + +err_trap_policer_fill: + nlmsg_free(msg); + return err; +} + +static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + enum devlink_command cmd = DEVLINK_CMD_TRAP_POLICER_NEW; + struct devlink_trap_policer_item *policer_item; + u32 portid = NETLINK_CB(cb->skb).portid; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(policer_item, &devlink->trap_policer_list, + list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_trap_policer_fill(msg, devlink, + policer_item, cmd, + portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int +devlink_trap_policer_set(struct devlink *devlink, + struct devlink_trap_policer_item *policer_item, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct nlattr **attrs = info->attrs; + u64 rate, burst; + int err; + + rate = policer_item->rate; + burst = policer_item->burst; + + if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]) + rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]); + + if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]) + burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]); + + if (rate < policer_item->policer->min_rate) { + NL_SET_ERR_MSG_MOD(extack, "Policer rate lower than limit"); + return -EINVAL; + } + + if (rate > policer_item->policer->max_rate) { + NL_SET_ERR_MSG_MOD(extack, "Policer rate higher than limit"); + return -EINVAL; + } + + if (burst < policer_item->policer->min_burst) { + NL_SET_ERR_MSG_MOD(extack, "Policer burst size lower than limit"); + return -EINVAL; + } + + if (burst > policer_item->policer->max_burst) { + NL_SET_ERR_MSG_MOD(extack, "Policer burst size higher than limit"); + return -EINVAL; + } + + err = devlink->ops->trap_policer_set(devlink, policer_item->policer, + rate, burst, info->extack); + if (err) + return err; + + policer_item->rate = rate; + policer_item->burst = burst; + + return 0; +} + +static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_trap_policer_item *policer_item; + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + + if (list_empty(&devlink->trap_policer_list)) + return -EOPNOTSUPP; + + if (!devlink->ops->trap_policer_set) + return -EOPNOTSUPP; + + policer_item = devlink_trap_policer_item_get_from_info(devlink, info); + if (!policer_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); + return -ENOENT; + } + + return devlink_trap_policer_set(devlink, policer_item, info); } static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { + [DEVLINK_ATTR_UNSPEC] = { .strict_start_type = + DEVLINK_ATTR_TRAP_POLICER_ID }, [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, @@ -5945,6 +6644,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 }, + [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 }, [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, @@ -5956,6 +6657,10 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, + [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, + [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -6079,7 +6784,8 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_eswitch_get_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_ESWITCH_SET, @@ -6178,6 +6884,13 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { + .cmd = DEVLINK_CMD_REGION_NEW, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_region_new, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { .cmd = DEVLINK_CMD_REGION_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_region_del, @@ -6283,6 +6996,19 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_TRAP_POLICER_GET, + .doit = devlink_nl_cmd_trap_policer_get_doit, + .dumpit = devlink_nl_cmd_trap_policer_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_TRAP_POLICER_SET, + .doit = devlink_nl_cmd_trap_policer_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -6320,6 +7046,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) if (!devlink) return NULL; devlink->ops = ops; + xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); __devlink_net_set(devlink, &init_net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); @@ -6330,6 +7057,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->reporter_list); INIT_LIST_HEAD(&devlink->trap_list); INIT_LIST_HEAD(&devlink->trap_group_list); + INIT_LIST_HEAD(&devlink->trap_policer_list); mutex_init(&devlink->lock); mutex_init(&devlink->reporters_lock); return devlink; @@ -6414,6 +7142,7 @@ void devlink_free(struct devlink *devlink) { mutex_destroy(&devlink->reporters_lock); mutex_destroy(&devlink->lock); + WARN_ON(!list_empty(&devlink->trap_policer_list)); WARN_ON(!list_empty(&devlink->trap_group_list)); WARN_ON(!list_empty(&devlink->trap_list)); WARN_ON(!list_empty(&devlink->reporter_list)); @@ -6424,6 +7153,8 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->sb_list)); WARN_ON(!list_empty(&devlink->port_list)); + xa_destroy(&devlink->snapshot_ids); + kfree(devlink); } EXPORT_SYMBOL_GPL(devlink_free); @@ -6719,6 +7450,7 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: + case DEVLINK_PORT_FLAVOUR_VIRTUAL: if (!attrs->split) n = snprintf(name, len, "p%u", attrs->phys.port_number); else @@ -6848,7 +7580,7 @@ bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, rcu_read_lock(); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); enabled = false; if (table) enabled = table->counters_enabled; @@ -6872,26 +7604,34 @@ int devlink_dpipe_table_register(struct devlink *devlink, void *priv, bool counter_control_extern) { struct devlink_dpipe_table *table; - - if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name)) - return -EEXIST; + int err = 0; if (WARN_ON(!table_ops->size_get)) return -EINVAL; + mutex_lock(&devlink->lock); + + if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name, + devlink)) { + err = -EEXIST; + goto unlock; + } + table = kzalloc(sizeof(*table), GFP_KERNEL); - if (!table) - return -ENOMEM; + if (!table) { + err = -ENOMEM; + goto unlock; + } table->name = table_name; table->table_ops = table_ops; table->priv = priv; table->counter_control_extern = counter_control_extern; - mutex_lock(&devlink->lock); list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); +unlock: mutex_unlock(&devlink->lock); - return 0; + return err; } EXPORT_SYMBOL_GPL(devlink_dpipe_table_register); @@ -6908,7 +7648,7 @@ void devlink_dpipe_table_unregister(struct devlink *devlink, mutex_lock(&devlink->lock); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); if (!table) goto unlock; list_del_rcu(&table->list); @@ -7065,7 +7805,7 @@ int devlink_dpipe_table_resource_set(struct devlink *devlink, mutex_lock(&devlink->lock); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); if (!table) { err = -EINVAL; goto out; @@ -7530,21 +8270,24 @@ EXPORT_SYMBOL_GPL(devlink_param_value_str_fill); * devlink_region_create - create a new address region * * @devlink: devlink - * @region_name: region name + * @ops: region operations and name * @region_max_snapshots: Maximum supported number of snapshots for region * @region_size: size of region */ -struct devlink_region *devlink_region_create(struct devlink *devlink, - const char *region_name, - u32 region_max_snapshots, - u64 region_size) +struct devlink_region * +devlink_region_create(struct devlink *devlink, + const struct devlink_region_ops *ops, + u32 region_max_snapshots, u64 region_size) { struct devlink_region *region; int err = 0; + if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) + return ERR_PTR(-EINVAL); + mutex_lock(&devlink->lock); - if (devlink_region_get_by_name(devlink, region_name)) { + if (devlink_region_get_by_name(devlink, ops->name)) { err = -EEXIST; goto unlock; } @@ -7557,7 +8300,7 @@ struct devlink_region *devlink_region_create(struct devlink *devlink, region->devlink = devlink; region->max_snapshots = region_max_snapshots; - region->name = region_name; + region->ops = ops; region->size = region_size; INIT_LIST_HEAD(®ion->snapshot_list); list_add_tail(®ion->list, &devlink->region_list); @@ -7603,75 +8346,66 @@ EXPORT_SYMBOL_GPL(devlink_region_destroy); * Driver should use the same id for multiple snapshots taken * on multiple regions at the same time/by the same trigger. * + * The caller of this function must use devlink_region_snapshot_id_put + * when finished creating regions using this id. + * + * Returns zero on success, or a negative error code on failure. + * * @devlink: devlink + * @id: storage to return id */ -u32 devlink_region_snapshot_id_get(struct devlink *devlink) +int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) { - u32 id; + int err; mutex_lock(&devlink->lock); - id = ++devlink->snapshot_id; + err = __devlink_region_snapshot_id_get(devlink, id); mutex_unlock(&devlink->lock); - return id; + return err; } EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get); /** + * devlink_region_snapshot_id_put - put snapshot ID reference + * + * This should be called by a driver after finishing creating snapshots + * with an id. Doing so ensures that the ID can later be released in the + * event that all snapshots using it have been destroyed. + * + * @devlink: devlink + * @id: id to release reference on + */ +void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id) +{ + mutex_lock(&devlink->lock); + __devlink_snapshot_id_decrement(devlink, id); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put); + +/** * devlink_region_snapshot_create - create a new snapshot * This will add a new snapshot of a region. The snapshot * will be stored on the region struct and can be accessed - * from devlink. This is useful for future analyses of snapshots. + * from devlink. This is useful for future analyses of snapshots. * Multiple snapshots can be created on a region. * The @snapshot_id should be obtained using the getter function. * * @region: devlink region of the snapshot * @data: snapshot data * @snapshot_id: snapshot id to be created - * @data_destructor: pointer to destructor function to free data */ int devlink_region_snapshot_create(struct devlink_region *region, - u8 *data, u32 snapshot_id, - devlink_snapshot_data_dest_t *data_destructor) + u8 *data, u32 snapshot_id) { struct devlink *devlink = region->devlink; - struct devlink_snapshot *snapshot; int err; mutex_lock(&devlink->lock); - - /* check if region can hold one more snapshot */ - if (region->cur_snapshots == region->max_snapshots) { - err = -ENOMEM; - goto unlock; - } - - if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { - err = -EEXIST; - goto unlock; - } - - snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); - if (!snapshot) { - err = -ENOMEM; - goto unlock; - } - - snapshot->id = snapshot_id; - snapshot->region = region; - snapshot->data = data; - snapshot->data_destructor = data_destructor; - - list_add_tail(&snapshot->list, ®ion->snapshot_list); - - region->cur_snapshots++; - - devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW); + err = __devlink_region_snapshot_create(region, data, snapshot_id); mutex_unlock(&devlink->lock); - return 0; -unlock: - mutex_unlock(&devlink->lock); return err; } EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); @@ -7711,6 +8445,8 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(NON_ROUTABLE, DROP), DEVLINK_TRAP(DECAP_ERROR, EXCEPTION), DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), + DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP), + DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -7724,6 +8460,7 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(L3_DROPS), DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), + DEVLINK_TRAP_GROUP(ACL_DROPS), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) @@ -7757,7 +8494,7 @@ static int devlink_trap_driver_verify(const struct devlink_trap *trap) static int devlink_trap_verify(const struct devlink_trap *trap) { - if (!trap || !trap->name || !trap->group.name) + if (!trap || !trap->name) return -EINVAL; if (trap->generic) @@ -7828,108 +8565,22 @@ devlink_trap_group_notify(struct devlink *devlink, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static struct devlink_trap_group_item * -devlink_trap_group_item_create(struct devlink *devlink, - const struct devlink_trap_group *group) -{ - struct devlink_trap_group_item *group_item; - int err; - - err = devlink_trap_group_verify(group); - if (err) - return ERR_PTR(err); - - group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); - if (!group_item) - return ERR_PTR(-ENOMEM); - - group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); - if (!group_item->stats) { - err = -ENOMEM; - goto err_stats_alloc; - } - - group_item->group = group; - refcount_set(&group_item->refcount, 1); - - if (devlink->ops->trap_group_init) { - err = devlink->ops->trap_group_init(devlink, group); - if (err) - goto err_group_init; - } - - list_add_tail(&group_item->list, &devlink->trap_group_list); - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_NEW); - - return group_item; - -err_group_init: - free_percpu(group_item->stats); -err_stats_alloc: - kfree(group_item); - return ERR_PTR(err); -} - -static void -devlink_trap_group_item_destroy(struct devlink *devlink, - struct devlink_trap_group_item *group_item) -{ - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_DEL); - list_del(&group_item->list); - free_percpu(group_item->stats); - kfree(group_item); -} - -static struct devlink_trap_group_item * -devlink_trap_group_item_get(struct devlink *devlink, - const struct devlink_trap_group *group) -{ - struct devlink_trap_group_item *group_item; - - group_item = devlink_trap_group_item_lookup(devlink, group->name); - if (group_item) { - refcount_inc(&group_item->refcount); - return group_item; - } - - return devlink_trap_group_item_create(devlink, group); -} - -static void -devlink_trap_group_item_put(struct devlink *devlink, - struct devlink_trap_group_item *group_item) -{ - if (!refcount_dec_and_test(&group_item->refcount)) - return; - - devlink_trap_group_item_destroy(devlink, group_item); -} - static int devlink_trap_item_group_link(struct devlink *devlink, struct devlink_trap_item *trap_item) { + u16 group_id = trap_item->trap->init_group_id; struct devlink_trap_group_item *group_item; - group_item = devlink_trap_group_item_get(devlink, - &trap_item->trap->group); - if (IS_ERR(group_item)) - return PTR_ERR(group_item); + group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id); + if (WARN_ON_ONCE(!group_item)) + return -EINVAL; trap_item->group_item = group_item; return 0; } -static void -devlink_trap_item_group_unlink(struct devlink *devlink, - struct devlink_trap_item *trap_item) -{ - devlink_trap_group_item_put(devlink, trap_item->group_item); -} - static void devlink_trap_notify(struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd) @@ -7992,7 +8643,6 @@ devlink_trap_register(struct devlink *devlink, return 0; err_trap_init: - devlink_trap_item_group_unlink(devlink, trap_item); err_group_link: free_percpu(trap_item->stats); err_stats_alloc: @@ -8013,7 +8663,6 @@ static void devlink_trap_unregister(struct devlink *devlink, list_del(&trap_item->list); if (devlink->ops->trap_fini) devlink->ops->trap_fini(devlink, trap, trap_item); - devlink_trap_item_group_unlink(devlink, trap_item); free_percpu(trap_item->stats); kfree(trap_item); } @@ -8115,12 +8764,14 @@ devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats, static void devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata, const struct devlink_trap_item *trap_item, - struct devlink_port *in_devlink_port) + struct devlink_port *in_devlink_port, + const struct flow_action_cookie *fa_cookie) { struct devlink_trap_group_item *group_item = trap_item->group_item; hw_metadata->trap_group_name = group_item->group->name; hw_metadata->trap_name = trap_item->trap->name; + hw_metadata->fa_cookie = fa_cookie; spin_lock(&in_devlink_port->type_lock); if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH) @@ -8134,9 +8785,12 @@ devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata, * @skb: Trapped packet. * @trap_ctx: Trap context. * @in_devlink_port: Input devlink port. + * @fa_cookie: Flow action cookie. Could be NULL. */ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, - void *trap_ctx, struct devlink_port *in_devlink_port) + void *trap_ctx, struct devlink_port *in_devlink_port, + const struct flow_action_cookie *fa_cookie) + { struct devlink_trap_item *trap_item = trap_ctx; struct net_dm_hw_metadata hw_metadata = {}; @@ -8145,7 +8799,7 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, devlink_trap_stats_update(trap_item->group_item->stats, skb->len); devlink_trap_report_metadata_fill(&hw_metadata, trap_item, - in_devlink_port); + in_devlink_port, fa_cookie); net_dm_hw_report(skb, &hw_metadata); } EXPORT_SYMBOL_GPL(devlink_trap_report); @@ -8164,6 +8818,288 @@ void *devlink_trap_ctx_priv(void *trap_ctx) } EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv); +static int +devlink_trap_group_item_policer_link(struct devlink *devlink, + struct devlink_trap_group_item *group_item) +{ + u32 policer_id = group_item->group->init_policer_id; + struct devlink_trap_policer_item *policer_item; + + if (policer_id == 0) + return 0; + + policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); + if (WARN_ON_ONCE(!policer_item)) + return -EINVAL; + + group_item->policer_item = policer_item; + + return 0; +} + +static int +devlink_trap_group_register(struct devlink *devlink, + const struct devlink_trap_group *group) +{ + struct devlink_trap_group_item *group_item; + int err; + + if (devlink_trap_group_item_lookup(devlink, group->name)) + return -EEXIST; + + group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); + if (!group_item) + return -ENOMEM; + + group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); + if (!group_item->stats) { + err = -ENOMEM; + goto err_stats_alloc; + } + + group_item->group = group; + + err = devlink_trap_group_item_policer_link(devlink, group_item); + if (err) + goto err_policer_link; + + if (devlink->ops->trap_group_init) { + err = devlink->ops->trap_group_init(devlink, group); + if (err) + goto err_group_init; + } + + list_add_tail(&group_item->list, &devlink->trap_group_list); + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW); + + return 0; + +err_group_init: +err_policer_link: + free_percpu(group_item->stats); +err_stats_alloc: + kfree(group_item); + return err; +} + +static void +devlink_trap_group_unregister(struct devlink *devlink, + const struct devlink_trap_group *group) +{ + struct devlink_trap_group_item *group_item; + + group_item = devlink_trap_group_item_lookup(devlink, group->name); + if (WARN_ON_ONCE(!group_item)) + return; + + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_DEL); + list_del(&group_item->list); + free_percpu(group_item->stats); + kfree(group_item); +} + +/** + * devlink_trap_groups_register - Register packet trap groups with devlink. + * @devlink: devlink. + * @groups: Packet trap groups. + * @groups_count: Count of provided packet trap groups. + * + * Return: Non-zero value on failure. + */ +int devlink_trap_groups_register(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count) +{ + int i, err; + + mutex_lock(&devlink->lock); + for (i = 0; i < groups_count; i++) { + const struct devlink_trap_group *group = &groups[i]; + + err = devlink_trap_group_verify(group); + if (err) + goto err_trap_group_verify; + + err = devlink_trap_group_register(devlink, group); + if (err) + goto err_trap_group_register; + } + mutex_unlock(&devlink->lock); + + return 0; + +err_trap_group_register: +err_trap_group_verify: + for (i--; i >= 0; i--) + devlink_trap_group_unregister(devlink, &groups[i]); + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_trap_groups_register); + +/** + * devlink_trap_groups_unregister - Unregister packet trap groups from devlink. + * @devlink: devlink. + * @groups: Packet trap groups. + * @groups_count: Count of provided packet trap groups. + */ +void devlink_trap_groups_unregister(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count) +{ + int i; + + mutex_lock(&devlink->lock); + for (i = groups_count - 1; i >= 0; i--) + devlink_trap_group_unregister(devlink, &groups[i]); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister); + +static void +devlink_trap_policer_notify(struct devlink *devlink, + const struct devlink_trap_policer_item *policer_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW && + cmd != DEVLINK_CMD_TRAP_POLICER_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0, + 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int +devlink_trap_policer_register(struct devlink *devlink, + const struct devlink_trap_policer *policer) +{ + struct devlink_trap_policer_item *policer_item; + int err; + + if (devlink_trap_policer_item_lookup(devlink, policer->id)) + return -EEXIST; + + policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL); + if (!policer_item) + return -ENOMEM; + + policer_item->policer = policer; + policer_item->rate = policer->init_rate; + policer_item->burst = policer->init_burst; + + if (devlink->ops->trap_policer_init) { + err = devlink->ops->trap_policer_init(devlink, policer); + if (err) + goto err_policer_init; + } + + list_add_tail(&policer_item->list, &devlink->trap_policer_list); + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW); + + return 0; + +err_policer_init: + kfree(policer_item); + return err; +} + +static void +devlink_trap_policer_unregister(struct devlink *devlink, + const struct devlink_trap_policer *policer) +{ + struct devlink_trap_policer_item *policer_item; + + policer_item = devlink_trap_policer_item_lookup(devlink, policer->id); + if (WARN_ON_ONCE(!policer_item)) + return; + + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_DEL); + list_del(&policer_item->list); + if (devlink->ops->trap_policer_fini) + devlink->ops->trap_policer_fini(devlink, policer); + kfree(policer_item); +} + +/** + * devlink_trap_policers_register - Register packet trap policers with devlink. + * @devlink: devlink. + * @policers: Packet trap policers. + * @policers_count: Count of provided packet trap policers. + * + * Return: Non-zero value on failure. + */ +int +devlink_trap_policers_register(struct devlink *devlink, + const struct devlink_trap_policer *policers, + size_t policers_count) +{ + int i, err; + + mutex_lock(&devlink->lock); + for (i = 0; i < policers_count; i++) { + const struct devlink_trap_policer *policer = &policers[i]; + + if (WARN_ON(policer->id == 0 || + policer->max_rate < policer->min_rate || + policer->max_burst < policer->min_burst)) { + err = -EINVAL; + goto err_trap_policer_verify; + } + + err = devlink_trap_policer_register(devlink, policer); + if (err) + goto err_trap_policer_register; + } + mutex_unlock(&devlink->lock); + + return 0; + +err_trap_policer_register: +err_trap_policer_verify: + for (i--; i >= 0; i--) + devlink_trap_policer_unregister(devlink, &policers[i]); + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_trap_policers_register); + +/** + * devlink_trap_policers_unregister - Unregister packet trap policers from devlink. + * @devlink: devlink. + * @policers: Packet trap policers. + * @policers_count: Count of provided packet trap policers. + */ +void +devlink_trap_policers_unregister(struct devlink *devlink, + const struct devlink_trap_policer *policers, + size_t policers_count) +{ + int i; + + mutex_lock(&devlink->lock); + for (i = policers_count - 1; i >= 0; i--) + devlink_trap_policer_unregister(devlink, &policers[i]); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_trap_policers_unregister); + static void __devlink_compat_running_version(struct devlink *devlink, char *buf, size_t len) { diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 536e032d95c8..8e33cec9fc4e 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -29,6 +29,7 @@ #include <net/drop_monitor.h> #include <net/genetlink.h> #include <net/netevent.h> +#include <net/flow_offload.h> #include <trace/events/skb.h> #include <trace/events/napi.h> @@ -67,7 +68,7 @@ struct net_dm_hw_entry { struct net_dm_hw_entries { u32 num_entries; - struct net_dm_hw_entry entries[0]; + struct net_dm_hw_entry entries[]; }; struct per_cpu_dm_data { @@ -701,6 +702,13 @@ static void net_dm_packet_work(struct work_struct *work) } static size_t +net_dm_flow_action_cookie_size(const struct net_dm_hw_metadata *hw_metadata) +{ + return hw_metadata->fa_cookie ? + nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0; +} + +static size_t net_dm_hw_packet_report_size(size_t payload_len, const struct net_dm_hw_metadata *hw_metadata) { @@ -717,6 +725,8 @@ net_dm_hw_packet_report_size(size_t payload_len, nla_total_size(strlen(hw_metadata->trap_name) + 1) + /* NET_DM_ATTR_IN_PORT */ net_dm_in_port_size() + + /* NET_DM_ATTR_FLOW_ACTION_COOKIE */ + net_dm_flow_action_cookie_size(hw_metadata) + /* NET_DM_ATTR_TIMESTAMP */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_ORIG_LEN */ @@ -762,6 +772,12 @@ static int net_dm_hw_packet_report_fill(struct sk_buff *msg, goto nla_put_failure; } + if (hw_metadata->fa_cookie && + nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE, + hw_metadata->fa_cookie->cookie_len, + hw_metadata->fa_cookie->cookie)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) goto nla_put_failure; @@ -794,34 +810,44 @@ nla_put_failure: static struct net_dm_hw_metadata * net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata) { + const struct flow_action_cookie *fa_cookie; struct net_dm_hw_metadata *n_hw_metadata; const char *trap_group_name; const char *trap_name; - n_hw_metadata = kmalloc(sizeof(*hw_metadata), GFP_ATOMIC); + n_hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC); if (!n_hw_metadata) return NULL; - trap_group_name = kmemdup(hw_metadata->trap_group_name, - strlen(hw_metadata->trap_group_name) + 1, - GFP_ATOMIC | __GFP_ZERO); + trap_group_name = kstrdup(hw_metadata->trap_group_name, GFP_ATOMIC); if (!trap_group_name) goto free_hw_metadata; n_hw_metadata->trap_group_name = trap_group_name; - trap_name = kmemdup(hw_metadata->trap_name, - strlen(hw_metadata->trap_name) + 1, - GFP_ATOMIC | __GFP_ZERO); + trap_name = kstrdup(hw_metadata->trap_name, GFP_ATOMIC); if (!trap_name) goto free_trap_group; n_hw_metadata->trap_name = trap_name; + if (hw_metadata->fa_cookie) { + size_t cookie_size = sizeof(*fa_cookie) + + hw_metadata->fa_cookie->cookie_len; + + fa_cookie = kmemdup(hw_metadata->fa_cookie, cookie_size, + GFP_ATOMIC); + if (!fa_cookie) + goto free_trap_name; + n_hw_metadata->fa_cookie = fa_cookie; + } + n_hw_metadata->input_dev = hw_metadata->input_dev; if (n_hw_metadata->input_dev) dev_hold(n_hw_metadata->input_dev); return n_hw_metadata; +free_trap_name: + kfree(trap_name); free_trap_group: kfree(trap_group_name); free_hw_metadata: @@ -834,6 +860,7 @@ net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata) { if (hw_metadata->input_dev) dev_put(hw_metadata->input_dev); + kfree(hw_metadata->fa_cookie); kfree(hw_metadata->trap_name); kfree(hw_metadata->trap_group_name); kfree(hw_metadata); @@ -1004,8 +1031,10 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) { int cpu; - if (!monitor_hw) + if (!monitor_hw) { NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled"); + return; + } monitor_hw = false; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 3e7e15278c46..bd7eba9066f8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -974,7 +974,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh = nlmsg_data(nlh); frh->family = ops->family; - frh->table = rule->table; + frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) diff --git a/net/core/filter.c b/net/core/filter.c index 792e3744b915..7628b947dbc3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1573,7 +1573,7 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) return -EPERM; prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); - if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL) + if (PTR_ERR(prog) == -EINVAL) prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); @@ -2642,6 +2642,19 @@ static const struct bpf_func_proto bpf_msg_pop_data_proto = { .arg4_type = ARG_ANYTHING, }; +#ifdef CONFIG_CGROUP_NET_CLASSID +BPF_CALL_0(bpf_get_cgroup_classid_curr) +{ + return __task_get_classid(current); +} + +static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { + .func = bpf_get_cgroup_classid_curr, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; +#endif + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3626,7 +3639,6 @@ err: _trace_xdp_redirect_err(dev, xdp_prog, index, err); return err; } -EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { @@ -4062,7 +4074,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + if (unlikely(!xdp || + xdp_size > (unsigned long)(xdp->data_end - xdp->data))) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, xdp->data, @@ -4080,6 +4093,19 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static int bpf_xdp_output_btf_ids[5]; +const struct bpf_func_proto bpf_xdp_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_xdp_output_btf_ids, +}; + BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) { return skb->sk ? sock_gen_cookie(skb->sk) : 0; @@ -4104,6 +4130,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx) +{ + return sock_gen_cookie(ctx); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { + .func = bpf_get_socket_cookie_sock, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return sock_gen_cookie(ctx->sk); @@ -4116,6 +4154,39 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static u64 __bpf_get_netns_cookie(struct sock *sk) +{ +#ifdef CONFIG_NET_NS + return net_gen_cookie(sk ? sk->sk_net.net : &init_net); +#else + return 0; +#endif +} + +BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) +{ + return __bpf_get_netns_cookie(ctx); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { + .func = bpf_get_netns_cookie_sock, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { + .func = bpf_get_netns_cookie_sock_addr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -4134,8 +4205,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, u64, flags, void *, data, u64, size) +BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, + void *, data, u64, size) { if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -4143,8 +4214,8 @@ BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock, return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } -static const struct bpf_func_proto bpf_sockopt_event_output_proto = { - .func = bpf_sockopt_event_output, +static const struct bpf_func_proto bpf_event_output_data_proto = { + .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -5330,8 +5401,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { BPF_CALL_1(bpf_sk_release, struct sock *, sk) { - /* Only full sockets have sk->sk_flags. */ - if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE)) + if (sk_is_refcounted(sk)) sock_gen_put(sk); return 0; } @@ -5847,6 +5917,36 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) +{ + if (flags != 0) + return -EINVAL; + if (!skb_at_tc_ingress(skb)) + return -EOPNOTSUPP; + if (unlikely(dev_net(skb->dev) != sock_net(sk))) + return -ENETUNREACH; + if (unlikely(sk->sk_reuseport)) + return -ESOCKTNOSUPPORT; + if (sk_is_refcounted(sk) && + unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) + return -ENOENT; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_pfree; + + return 0; +} + +static const struct bpf_func_proto bpf_sk_assign_proto = { + .func = bpf_sk_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCK_COMMON, + .arg3_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5941,6 +6041,26 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -5965,8 +6085,26 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; @@ -6140,6 +6278,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_ecn_set_ce_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_assign_proto; #endif default: return bpf_base_func_proto(func_id); @@ -6209,7 +6349,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: - return &bpf_sockopt_event_output_proto; + return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: @@ -7140,6 +7280,27 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb_shinfo(SKB); */ +#ifdef NET_SKBUFF_DATA_USES_OFFSET + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, end)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, head)); + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); +#else + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, end)); +#endif + + return insn; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -7462,26 +7623,21 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, gso_segs): - /* si->dst_reg = skb_shinfo(SKB); */ -#ifdef NET_SKBUFF_DATA_USES_OFFSET - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), - BPF_REG_AX, si->src_reg, - offsetof(struct sk_buff, end)); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), - si->dst_reg, si->src_reg, - offsetof(struct sk_buff, head)); - *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); -#else - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), - si->dst_reg, si->src_reg, - offsetof(struct sk_buff, end)); -#endif + insn = bpf_convert_shinfo_access(si, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_segs, 2, target_size)); break; + case offsetof(struct __sk_buff, gso_size): + insn = bpf_convert_shinfo_access(si, insn); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + gso_size, 2, + target_size)); + break; case offsetof(struct __sk_buff, wire_len): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4); @@ -8620,6 +8776,7 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, struct bpf_map *, map, void *, key, u32, flags) { + bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; struct sock_reuseport *reuse; struct sock *selected_sk; @@ -8628,26 +8785,20 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, return -ENOENT; reuse = rcu_dereference(selected_sk->sk_reuseport_cb); - if (!reuse) - /* selected_sk is unhashed (e.g. by close()) after the - * above map_lookup_elem(). Treat selected_sk has already - * been removed from the map. + if (!reuse) { + /* reuseport_array has only sk with non NULL sk_reuseport_cb. + * The only (!reuse) case here is - the sk has already been + * unhashed (e.g. by close()), so treat it as -ENOENT. + * + * Other maps (e.g. sock_map) do not provide this guarantee and + * the sk may never be in the reuseport group to begin with. */ - return -ENOENT; + return is_sockarray ? -ENOENT : -EINVAL; + } if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { - struct sock *sk; - - if (unlikely(!reuse_kern->reuseport_id)) - /* There is a small race between adding the - * sk to the map and setting the - * reuse_kern->reuseport_id. - * Treat it as the sk has not been added to - * the bpf map yet. - */ - return -ENOENT; + struct sock *sk = reuse_kern->sk; - sk = reuse_kern->sk; if (sk->sk_protocol != selected_sk->sk_protocol) return -EPROTOTYPE; else if (sk->sk_family != selected_sk->sk_family) @@ -8835,10 +8986,9 @@ const struct bpf_prog_ops sk_reuseport_prog_ops = { }; #endif /* CONFIG_INET */ -DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp) +DEFINE_BPF_DISPATCHER(xdp) void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { - bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp), - prev_prog, prog); + bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index a1670dff0629..3eff84824c8b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -920,9 +920,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); flow_keys->flags = flags; - preempt_disable(); - result = BPF_PROG_RUN(prog, ctx); - preempt_enable(); + result = bpf_prog_run_pin_on_cpu(prog, ctx); flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 45b6a59ac124..e951b743bed3 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -167,6 +167,34 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_enc_opts); +struct flow_action_cookie *flow_action_cookie_create(void *data, + unsigned int len, + gfp_t gfp) +{ + struct flow_action_cookie *cookie; + + cookie = kmalloc(sizeof(*cookie) + len, gfp); + if (!cookie) + return NULL; + cookie->cookie_len = len; + memcpy(cookie->cookie, data, len); + return cookie; +} +EXPORT_SYMBOL(flow_action_cookie_create); + +void flow_action_cookie_destroy(struct flow_action_cookie *cookie) +{ + kfree(cookie); +} +EXPORT_SYMBOL(flow_action_cookie_destroy); + +void flow_rule_match_ct(const struct flow_rule *rule, + struct flow_match_ct *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CT, out); +} +EXPORT_SYMBOL(flow_rule_match_ct); + struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)) @@ -483,7 +511,8 @@ EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister); void flow_indr_block_call(struct net_device *dev, struct flow_block_offload *bo, - enum flow_block_command command) + enum flow_block_command command, + enum tc_setup_type type) { struct flow_indr_block_cb *indr_block_cb; struct flow_indr_block_dev *indr_dev; @@ -493,8 +522,7 @@ void flow_indr_block_call(struct net_device *dev, return; list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, - bo); + indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo); } EXPORT_SYMBOL_GPL(flow_indr_block_call); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 99a6de52b21d..7d3438215f32 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -367,7 +367,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, }; -static int bpf_build_state(struct nlattr *nla, +static int bpf_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 2f9c0de533c7..8ec7d13d2860 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "BPF"; case LWTUNNEL_ENCAP_SEG6_LOCAL: return "SEG6LOCAL"; + case LWTUNNEL_ENCAP_RPL: + return "RPL"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: @@ -98,7 +100,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, } EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops); -int lwtunnel_build_state(u16 encap_type, +int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack) @@ -122,7 +124,7 @@ int lwtunnel_build_state(u16 encap_type, rcu_read_unlock(); if (found) { - ret = ops->build_state(encap, family, cfg, lws, extack); + ret = ops->build_state(net, encap, family, cfg, lws, extack); if (ret) module_put(ops->owner); } else { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 789a73aa7bd8..39d37d0ef575 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1065,11 +1065,12 @@ static void neigh_timer_handler(struct timer_list *t) neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; - next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ - next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1125,7 +1126,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), - HZ/2); + HZ/100); neigh_add_timer(neigh, next); immediate_probe = true; } else { @@ -1427,7 +1428,8 @@ void __neigh_set_probe_once(struct neighbour *neigh) neigh->nud_state = NUD_INCOMPLETE; atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, - jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME)); + jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100)); } EXPORT_SYMBOL(__neigh_set_probe_once); @@ -3553,9 +3555,6 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, #define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) -#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \ - NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies) - #define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4c826b8bf9b1..cf0215734ceb 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -944,6 +944,24 @@ err: kobject_put(kobj); return error; } + +static int rx_queue_change_owner(struct net_device *dev, int index, kuid_t kuid, + kgid_t kgid) +{ + struct netdev_rx_queue *queue = dev->_rx + index; + struct kobject *kobj = &queue->kobj; + int error; + + error = sysfs_change_owner(kobj, kuid, kgid); + if (error) + return error; + + if (dev->sysfs_rx_queue_group) + error = sysfs_group_change_owner( + kobj, dev->sysfs_rx_queue_group, kuid, kgid); + + return error; +} #endif /* CONFIG_SYSFS */ int @@ -981,6 +999,29 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) #endif } +static int net_rx_queue_change_owner(struct net_device *dev, int num, + kuid_t kuid, kgid_t kgid) +{ +#ifdef CONFIG_SYSFS + int error = 0; + int i; + +#ifndef CONFIG_RPS + if (!dev->sysfs_rx_queue_group) + return 0; +#endif + for (i = 0; i < num; i++) { + error = rx_queue_change_owner(dev, i, kuid, kgid); + if (error) + break; + } + + return error; +#else + return 0; +#endif +} + #ifdef CONFIG_SYSFS /* * netdev_queue sysfs structures and functions. @@ -1486,6 +1527,23 @@ err: kobject_put(kobj); return error; } + +static int tx_queue_change_owner(struct net_device *ndev, int index, + kuid_t kuid, kgid_t kgid) +{ + struct netdev_queue *queue = ndev->_tx + index; + struct kobject *kobj = &queue->kobj; + int error; + + error = sysfs_change_owner(kobj, kuid, kgid); + if (error) + return error; + +#ifdef CONFIG_BQL + error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid); +#endif + return error; +} #endif /* CONFIG_SYSFS */ int @@ -1520,6 +1578,25 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) #endif /* CONFIG_SYSFS */ } +static int net_tx_queue_change_owner(struct net_device *dev, int num, + kuid_t kuid, kgid_t kgid) +{ +#ifdef CONFIG_SYSFS + int error = 0; + int i; + + for (i = 0; i < num; i++) { + error = tx_queue_change_owner(dev, i, kuid, kgid); + if (error) + break; + } + + return error; +#else + return 0; +#endif /* CONFIG_SYSFS */ +} + static int register_queue_kobjects(struct net_device *dev) { int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; @@ -1554,6 +1631,31 @@ error: return error; } +static int queue_change_owner(struct net_device *ndev, kuid_t kuid, kgid_t kgid) +{ + int error = 0, real_rx = 0, real_tx = 0; + +#ifdef CONFIG_SYSFS + if (ndev->queues_kset) { + error = sysfs_change_owner(&ndev->queues_kset->kobj, kuid, kgid); + if (error) + return error; + } + real_rx = ndev->real_num_rx_queues; +#endif + real_tx = ndev->real_num_tx_queues; + + error = net_rx_queue_change_owner(ndev, real_rx, kuid, kgid); + if (error) + return error; + + error = net_tx_queue_change_owner(ndev, real_tx, kuid, kgid); + if (error) + return error; + + return 0; +} + static void remove_queue_kobjects(struct net_device *dev) { int real_rx = 0, real_tx = 0; @@ -1767,6 +1869,37 @@ int netdev_register_kobject(struct net_device *ndev) return error; } +/* Change owner for sysfs entries when moving network devices across network + * namespaces owned by different user namespaces. + */ +int netdev_change_owner(struct net_device *ndev, const struct net *net_old, + const struct net *net_new) +{ + struct device *dev = &ndev->dev; + kuid_t old_uid, new_uid; + kgid_t old_gid, new_gid; + int error; + + net_ns_get_ownership(net_old, &old_uid, &old_gid); + net_ns_get_ownership(net_new, &new_uid, &new_gid); + + /* The network namespace was changed but the owning user namespace is + * identical so there's no need to change the owner of sysfs entries. + */ + if (uid_eq(old_uid, new_uid) && gid_eq(old_gid, new_gid)) + return 0; + + error = device_change_owner(dev, new_uid, new_gid); + if (error) + return error; + + error = queue_change_owner(ndev, new_uid, new_gid); + if (error) + return error; + + return 0; +} + int netdev_class_create_file_ns(const struct class_attribute *class_attr, const void *ns) { diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index 006876c7b78d..8a5b04c2699a 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -8,5 +8,7 @@ void netdev_unregister_kobject(struct net_device *); int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); int netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num); +int netdev_change_owner(struct net_device *, const struct net *net_old, + const struct net *net_new); #endif diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 757cc1d084e7..190ca66a383b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem); static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; +static atomic64_t cookie_gen; + +u64 net_gen_cookie(struct net *net) +{ + while (1) { + u64 res = atomic64_read(&net->net_cookie); + + if (res) + return res; + res = atomic64_inc_return(&cookie_gen); + atomic64_cmpxchg(&net->net_cookie, 0, res); + } +} + static struct net_generic *net_alloc_generic(void) { struct net_generic *ng; @@ -1087,6 +1101,7 @@ static int __init net_ns_init(void) panic("Could not allocate generic netns"); rcu_assign_pointer(init_net.gen, ng); + net_gen_cookie(&init_net); down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 0642f91c4038..b4c87fe31be2 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -53,30 +53,60 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) kfree(css_cls_state(css)); } +/* + * To avoid freezing of sockets creation for tasks with big number of threads + * and opened sockets lets release file_lock every 1000 iterated descriptors. + * New sockets will already have been created with new classid. + */ + +struct update_classid_context { + u32 classid; + unsigned int batch; +}; + +#define UPDATE_CLASSID_BATCH 1000 + static int update_classid_sock(const void *v, struct file *file, unsigned n) { int err; + struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file, &err); if (sock) { spin_lock(&cgroup_sk_update_lock); - sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, - (unsigned long)v); + sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); spin_unlock(&cgroup_sk_update_lock); } + if (--ctx->batch == 0) { + ctx->batch = UPDATE_CLASSID_BATCH; + return n + 1; + } return 0; } +static void update_classid_task(struct task_struct *p, u32 classid) +{ + struct update_classid_context ctx = { + .classid = classid, + .batch = UPDATE_CLASSID_BATCH + }; + unsigned int fd = 0; + + do { + task_lock(p); + fd = iterate_fd(p->files, fd, update_classid_sock, &ctx); + task_unlock(p); + cond_resched(); + } while (fd); +} + static void cgrp_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; struct task_struct *p; cgroup_taskset_for_each(p, css, tset) { - task_lock(p); - iterate_fd(p->files, 0, update_classid_sock, - (void *)(unsigned long)css_cls_state(css)->classid); - task_unlock(p); + update_classid_task(p, css_cls_state(css)->classid); } } @@ -98,10 +128,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, css_task_iter_start(css, 0, &it); while ((p = css_task_iter_next(&it))) { - task_lock(p); - iterate_fd(p->files, 0, update_classid_sock, - (void *)(unsigned long)cs->classid); - task_unlock(p); + update_classid_task(p, cs->classid); cond_resched(); } css_task_iter_end(&it); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9b7cbe35df37..ef98372facf6 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -43,9 +43,11 @@ static int page_pool_init(struct page_pool *pool, * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, * which is the XDP_TX use-case. */ - if ((pool->p.dma_dir != DMA_FROM_DEVICE) && - (pool->p.dma_dir != DMA_BIDIRECTIONAL)) - return -EINVAL; + if (pool->p.flags & PP_FLAG_DMA_MAP) { + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && + (pool->p.dma_dir != DMA_BIDIRECTIONAL)) + return -EINVAL; + } if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { /* In order to request DMA-sync-for-device the page @@ -96,11 +98,10 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) } EXPORT_SYMBOL(page_pool_create); -static void __page_pool_return_page(struct page_pool *pool, struct page *page); +static void page_pool_return_page(struct page_pool *pool, struct page *page); noinline -static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, - bool refill) +static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; struct page *page; @@ -137,12 +138,11 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, * (2) break out to fallthrough to alloc_pages_node. * This limit stress on page buddy alloactor. */ - __page_pool_return_page(pool, page); + page_pool_return_page(pool, page); page = NULL; break; } - } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL && - refill); + } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ if (likely(pool->alloc.count > 0)) @@ -155,20 +155,16 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, /* fast path */ static struct page *__page_pool_get_cached(struct page_pool *pool) { - bool refill = false; struct page *page; - /* Test for safe-context, caller should provide this guarantee */ - if (likely(in_serving_softirq())) { - if (likely(pool->alloc.count)) { - /* Fast-path */ - page = pool->alloc.cache[--pool->alloc.count]; - return page; - } - refill = true; + /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + } else { + page = page_pool_refill_alloc_cache(pool); } - page = page_pool_refill_alloc_cache(pool, refill); return page; } @@ -280,18 +276,25 @@ static s32 page_pool_inflight(struct page_pool *pool) return inflight; } -/* Cleanup page_pool state from page */ -static void __page_pool_clean_page(struct page_pool *pool, - struct page *page) +/* Disconnects a page (from a page_pool). API users can have a need + * to disconnect a page (from a page_pool), to allow it to be used as + * a regular page (that will eventually be returned to the normal + * page-allocator via put_page). + */ +void page_pool_release_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; int count; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + /* Always account for inflight pages, even if we didn't + * map them + */ goto skip_dma_unmap; dma = page->dma_addr; - /* DMA unmap */ + + /* When page is unmapped, it cannot be returned our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); @@ -303,21 +306,12 @@ skip_dma_unmap: count = atomic_inc_return(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); } - -/* unmap the page and clean our state */ -void page_pool_unmap_page(struct page_pool *pool, struct page *page) -{ - /* When page is unmapped, this implies page will not be - * returned to page_pool. - */ - __page_pool_clean_page(pool, page); -} -EXPORT_SYMBOL(page_pool_unmap_page); +EXPORT_SYMBOL(page_pool_release_page); /* Return a page to the page allocator, cleaning up our state */ -static void __page_pool_return_page(struct page_pool *pool, struct page *page) +static void page_pool_return_page(struct page_pool *pool, struct page *page) { - __page_pool_clean_page(pool, page); + page_pool_release_page(pool, page); put_page(page); /* An optimization would be to call __free_pages(page, pool->p.order) @@ -326,8 +320,7 @@ static void __page_pool_return_page(struct page_pool *pool, struct page *page) */ } -static bool __page_pool_recycle_into_ring(struct page_pool *pool, - struct page *page) +static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) { int ret; /* BH protection not needed if current is serving softirq */ @@ -344,7 +337,7 @@ static bool __page_pool_recycle_into_ring(struct page_pool *pool, * * Caller must provide appropriate safe context. */ -static bool __page_pool_recycle_direct(struct page *page, +static bool page_pool_recycle_in_cache(struct page *page, struct page_pool *pool) { if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) @@ -363,8 +356,14 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page) return !page_is_pfmemalloc(page); } -void __page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +/* If the page refcnt == 1, this will try to recycle the page. + * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for + * the configured size min(dma_sync_size, pool->max_len). + * If the page refcnt != 1, then the page will be returned to memory + * subsystem. + */ +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -381,12 +380,12 @@ void __page_pool_put_page(struct page_pool *pool, struct page *page, dma_sync_size); if (allow_direct && in_serving_softirq()) - if (__page_pool_recycle_direct(page, pool)) + if (page_pool_recycle_in_cache(page, pool)) return; - if (!__page_pool_recycle_into_ring(pool, page)) { + if (!page_pool_recycle_in_ring(pool, page)) { /* Cache full, fallback to free pages */ - __page_pool_return_page(pool, page); + page_pool_return_page(pool, page); } return; } @@ -403,12 +402,13 @@ void __page_pool_put_page(struct page_pool *pool, struct page *page, * doing refcnt based recycle tricks, meaning another process * will be invoking put_page. */ - __page_pool_clean_page(pool, page); + /* Do not replace this with page_pool_return_page() */ + page_pool_release_page(pool, page); put_page(page); } -EXPORT_SYMBOL(__page_pool_put_page); +EXPORT_SYMBOL(page_pool_put_page); -static void __page_pool_empty_ring(struct page_pool *pool) +static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; @@ -419,7 +419,7 @@ static void __page_pool_empty_ring(struct page_pool *pool) pr_crit("%s() page_pool refcnt %d violation\n", __func__, page_ref_count(page)); - __page_pool_return_page(pool, page); + page_pool_return_page(pool, page); } } @@ -449,7 +449,7 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) */ while (pool->alloc.count) { page = pool->alloc.cache[--pool->alloc.count]; - __page_pool_return_page(pool, page); + page_pool_return_page(pool, page); } } @@ -461,7 +461,7 @@ static void page_pool_scrub(struct page_pool *pool) /* No more consumers should exist, but producers could still * be in-flight. */ - __page_pool_empty_ring(pool); + page_pool_empty_ring(pool); } static int page_pool_release(struct page_pool *pool) @@ -535,7 +535,7 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) /* Flush pool alloc cache, as refill will check NUMA node */ while (pool->alloc.count) { page = pool->alloc.cache[--pool->alloc.count]; - __page_pool_return_page(pool, page); + page_pool_return_page(pool, page); } } EXPORT_SYMBOL(page_pool_update_nid); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 294bfcf0ce0e..08e2811b5274 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -535,12 +535,12 @@ static int pgctrl_open(struct inode *inode, struct file *file) return single_open(file, pgctrl_show, PDE_DATA(inode)); } -static const struct file_operations pktgen_fops = { - .open = pgctrl_open, - .read = seq_read, - .llseek = seq_lseek, - .write = pgctrl_write, - .release = single_release, +static const struct proc_ops pktgen_proc_ops = { + .proc_open = pgctrl_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = pgctrl_write, + .proc_release = single_release, }; static int pktgen_if_show(struct seq_file *seq, void *v) @@ -1707,12 +1707,12 @@ static int pktgen_if_open(struct inode *inode, struct file *file) return single_open(file, pktgen_if_show, PDE_DATA(inode)); } -static const struct file_operations pktgen_if_fops = { - .open = pktgen_if_open, - .read = seq_read, - .llseek = seq_lseek, - .write = pktgen_if_write, - .release = single_release, +static const struct proc_ops pktgen_if_proc_ops = { + .proc_open = pktgen_if_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = pktgen_if_write, + .proc_release = single_release, }; static int pktgen_thread_show(struct seq_file *seq, void *v) @@ -1844,12 +1844,12 @@ static int pktgen_thread_open(struct inode *inode, struct file *file) return single_open(file, pktgen_thread_show, PDE_DATA(inode)); } -static const struct file_operations pktgen_thread_fops = { - .open = pktgen_thread_open, - .read = seq_read, - .llseek = seq_lseek, - .write = pktgen_thread_write, - .release = single_release, +static const struct proc_ops pktgen_thread_proc_ops = { + .proc_open = pktgen_thread_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = pktgen_thread_write, + .proc_release = single_release, }; /* Think find or remove for NN */ @@ -1926,7 +1926,7 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d pkt_dev->entry = proc_create_data(dev->name, 0600, pn->proc_dir, - &pktgen_if_fops, + &pktgen_if_proc_ops, pkt_dev); if (!pkt_dev->entry) pr_err("can't move proc entry for '%s'\n", @@ -2003,8 +2003,8 @@ static int pktgen_setup_dev(const struct pktgen_net *pn, return -ENODEV; } - if (odev->type != ARPHRD_ETHER) { - pr_err("not an ethernet device: \"%s\"\n", ifname); + if (odev->type != ARPHRD_ETHER && odev->type != ARPHRD_LOOPBACK) { + pr_err("not an ethernet or loopback device: \"%s\"\n", ifname); err = -EINVAL; } else if (!netif_running(odev)) { pr_err("device is down: \"%s\"\n", ifname); @@ -3362,7 +3362,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* skb was 'freed' by stack, so clean few * bits and reuse it */ - skb_reset_tc(skb); + skb_reset_redirect(skb); } while (--burst > 0); goto out; /* Skips xmit_mode M_START_XMIT */ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { @@ -3638,7 +3638,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->clone_skb = pg_clone_skb_d; pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir, - &pktgen_if_fops, pkt_dev); + &pktgen_if_proc_ops, pkt_dev); if (!pkt_dev->entry) { pr_err("cannot create %s/%s procfs entry\n", PG_PROC_DIR, ifname); @@ -3708,7 +3708,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) t->tsk = p; pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir, - &pktgen_thread_fops, t); + &pktgen_thread_proc_ops, t); if (!pe) { pr_err("cannot create %s/%s procfs entry\n", PG_PROC_DIR, t->tsk->comm); @@ -3793,7 +3793,7 @@ static int __net_init pg_net_init(struct net *net) pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR); return -ENODEV; } - pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops); + pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_proc_ops); if (pe == NULL) { pr_err("cannot create %s procfs entry\n", PGCTRL); ret = -EINVAL; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index cdad6ed532c4..709ebbf8ab5b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1242,6 +1242,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, return 0; memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); + memset(&node_guid, 0, sizeof(node_guid)); + memset(&port_guid, 0, sizeof(port_guid)); vf_mac.vf = vf_vlan.vf = @@ -1290,8 +1292,6 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, sizeof(vf_trust), &vf_trust)) goto nla_put_vf_failure; - memset(&node_guid, 0, sizeof(node_guid)); - memset(&port_guid, 0, sizeof(port_guid)); if (dev->netdev_ops->ndo_get_vf_guid && !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid, &port_guid)) { @@ -1872,7 +1872,9 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { }; static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { + [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD }, [IFLA_XDP_FD] = { .type = NLA_S32 }, + [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, @@ -2799,8 +2801,20 @@ static int do_setlink(const struct sk_buff *skb, } if (xdp[IFLA_XDP_FD]) { + int expected_fd = -1; + + if (xdp_flags & XDP_FLAGS_REPLACE) { + if (!xdp[IFLA_XDP_EXPECTED_FD]) { + err = -EINVAL; + goto errout; + } + expected_fd = + nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]); + } + err = dev_change_xdp_fd(dev, extack, nla_get_s32(xdp[IFLA_XDP_FD]), + expected_fd, xdp_flags); if (err) goto errout; @@ -3504,27 +3518,25 @@ static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, if (err) return err; - alt_ifname = nla_data(attr); + alt_ifname = nla_strdup(attr, GFP_KERNEL); + if (!alt_ifname) + return -ENOMEM; + if (cmd == RTM_NEWLINKPROP) { - alt_ifname = kstrdup(alt_ifname, GFP_KERNEL); - if (!alt_ifname) - return -ENOMEM; err = netdev_name_node_alt_create(dev, alt_ifname); - if (err) { - kfree(alt_ifname); - return err; - } + if (!err) + alt_ifname = NULL; } else if (cmd == RTM_DELLINKPROP) { err = netdev_name_node_alt_destroy(dev, alt_ifname); - if (err) - return err; } else { - WARN_ON(1); - return 0; + WARN_ON_ONCE(1); + err = -EINVAL; } - *changed = true; - return 0; + kfree(alt_ifname); + if (!err) + *changed = true; + return err; } static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3911,7 +3923,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && - (dev->priv_flags & IFF_BRIDGE_PORT)) { + netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; @@ -4022,7 +4034,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && - (dev->priv_flags & IFF_BRIDGE_PORT)) { + netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; @@ -4248,13 +4260,13 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; if (!br_idx) { /* user did not specify a specific bridge */ - if (dev->priv_flags & IFF_BRIDGE_PORT) { + if (netif_is_bridge_port(dev)) { br_dev = netdev_master_upper_dev_get(dev); cops = br_dev->netdev_ops; } } else { if (dev != br_dev && - !(dev->priv_flags & IFF_BRIDGE_PORT)) + !netif_is_bridge_port(dev)) continue; if (br_dev != netdev_master_upper_dev_get(dev) && @@ -4266,7 +4278,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) if (idx < s_idx) goto cont; - if (dev->priv_flags & IFF_BRIDGE_PORT) { + if (netif_is_bridge_port(dev)) { if (cops && cops->ndo_fdb_dump) { err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, @@ -4416,7 +4428,7 @@ static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (dev) { if (!ndm_flags || (ndm_flags & NTF_MASTER)) { - if (!(dev->priv_flags & IFF_BRIDGE_PORT)) { + if (!netif_is_bridge_port(dev)) { NL_SET_ERR_MSG(extack, "Device is not a bridge port"); return -EINVAL; } @@ -4555,7 +4567,11 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) || brport_nla_put_flag(skb, flags, mask, - IFLA_BRPORT_PROXYARP, BR_PROXYARP)) { + IFLA_BRPORT_PROXYARP, BR_PROXYARP) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) { nla_nest_cancel(skb, protinfo); goto nla_put_failure; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 864cb9e9622f..7e29590482ce 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -467,7 +467,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, return NULL; } - /* use OR instead of assignment to avoid clearing of bits in mask */ if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -527,7 +526,6 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, return NULL; } - /* use OR instead of assignment to avoid clearing of bits in mask */ if (nc->page.pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -3670,6 +3668,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_push(nskb, -skb_network_offset(nskb) + offset); + skb_release_head_state(nskb); __copy_skb_header(nskb, skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); @@ -3928,14 +3927,21 @@ normal: goto perform_csum_check; if (!sg) { - if (!nskb->remcsum_offload) - nskb->ip_summed = CHECKSUM_NONE; - SKB_GSO_CB(nskb)->csum = - skb_copy_and_csum_bits(head_skb, offset, - skb_put(nskb, len), - len, 0); - SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + doffset; + if (!csum) { + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_copy_and_csum_bits(head_skb, offset, + skb_put(nskb, + len), + len, 0); + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; + } else { + skb_copy_bits(head_skb, offset, + skb_put(nskb, len), + len); + } continue; } @@ -4805,9 +4811,9 @@ static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, typeof(IPPROTO_IP) proto, unsigned int off) { - switch (proto) { - int err; + int err; + switch (proto) { case IPPROTO_TCP: err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), off + MAX_TCP_HDR_LEN); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index ded2d5227678..c479372f2cd2 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -512,7 +512,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); refcount_set(&psock->refcnt, 1); - rcu_assign_sk_user_data(sk, psock); + rcu_assign_sk_user_data_nocopy(sk, psock); sock_hold(sk); return psock; @@ -628,7 +628,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct bpf_prog *prog; int ret; - preempt_disable(); rcu_read_lock(); prog = READ_ONCE(psock->progs.msg_parser); if (unlikely(!prog)) { @@ -638,7 +637,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, sk_msg_compute_data_pointers(msg); msg->sk = sk; - ret = BPF_PROG_RUN(prog, msg); + ret = bpf_prog_run_pin_on_cpu(prog, msg); ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { @@ -653,7 +652,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, } out: rcu_read_unlock(); - preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); @@ -665,9 +663,7 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, skb->sk = psock->sk; bpf_compute_data_end_sk_skb(skb); - preempt_disable(); - ret = BPF_PROG_RUN(prog, skb); - preempt_enable(); + ret = bpf_prog_run_pin_on_cpu(prog, skb); /* strparser clones the skb before handing it to a upper layer, * meaning skb_orphan has been called. We NULL sk on the way out * to ensure we don't trigger a BUG_ON() in skb/sk operations diff --git a/net/core/sock.c b/net/core/sock.c index a4c8fac781ff..ce1d8dce9b7a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -574,7 +574,7 @@ static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) /* Sorry... */ ret = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_RAW)) + if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; @@ -1572,13 +1572,14 @@ static inline void sock_lock_init(struct sock *sk) */ static void sock_copy(struct sock *nsk, const struct sock *osk) { + const struct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, - osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); + prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; @@ -1792,16 +1793,17 @@ static void sk_init_common(struct sock *sk) */ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { + struct proto *prot = READ_ONCE(sk->sk_prot); struct sock *newsk; bool is_charged = true; - newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); + newsk = sk_prot_alloc(prot, priority, sk->sk_family); if (newsk != NULL) { struct sk_filter *filter; sock_copy(newsk, sk); - newsk->sk_prot_creator = sk->sk_prot; + newsk->sk_prot_creator = prot; /* SANITY */ if (likely(newsk->sk_net_refcnt)) @@ -1830,7 +1832,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); - mem_cgroup_sk_alloc(newsk); + + /* sk->sk_memcg will be populated at accept() time */ + newsk->sk_memcg = NULL; + cgroup_sk_alloc(&newsk->sk_cgrp_data); rcu_read_lock(); @@ -1863,6 +1868,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) goto out; } + /* Clear sk_user_data if parent had the pointer tagged + * as not suitable for copying when cloning. + */ + if (sk_user_data_is_nocopy(newsk)) + RCU_INIT_POINTER(newsk->sk_user_data, NULL); + newsk->sk_err = 0; newsk->sk_err_soft = 0; newsk->sk_priority = 0; @@ -2060,6 +2071,18 @@ void sock_efree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_efree); +/* Buffer destructor for prefetch/receive path where reference count may + * not be held, e.g. for listen sockets. + */ +#ifdef CONFIG_INET +void sock_pfree(struct sk_buff *skb) +{ + if (sk_is_refcounted(skb->sk)) + sock_gen_put(skb->sk); +} +EXPORT_SYMBOL(sock_pfree); +#endif /* CONFIG_INET */ + kuid_t sock_i_uid(struct sock *sk) { kuid_t uid; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 8998e356f423..b08dfae10f88 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -10,6 +10,8 @@ #include <linux/skmsg.h> #include <linux/list.h> #include <linux/jhash.h> +#include <linux/sock_diag.h> +#include <net/udp.h> struct bpf_stab { struct bpf_map map; @@ -31,7 +33,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) return ERR_PTR(-EPERM); if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || + (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); @@ -139,12 +142,58 @@ static void sock_map_unref(struct sock *sk, void *link_raw) } } +static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) +{ + struct proto *prot; + + sock_owned_by_me(sk); + + switch (sk->sk_type) { + case SOCK_STREAM: + prot = tcp_bpf_get_proto(sk, psock); + break; + + case SOCK_DGRAM: + prot = udp_bpf_get_proto(sk, psock); + break; + + default: + return -EINVAL; + } + + if (IS_ERR(prot)) + return PTR_ERR(prot); + + sk_psock_update_proto(sk, psock, prot); + return 0; +} + +static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) { + if (sk->sk_prot->close != sock_map_close) { + psock = ERR_PTR(-EBUSY); + goto out; + } + + if (!refcount_inc_not_zero(&psock->refcnt)) + psock = ERR_PTR(-EBUSY); + } +out: + rcu_read_unlock(); + return psock; +} + static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; - bool skb_progs, sk_psock_is_new = false; struct sk_psock *psock; + bool skb_progs; int ret; skb_verdict = READ_ONCE(progs->skb_verdict); @@ -170,7 +219,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, } } - psock = sk_psock_get_checked(sk); + psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; @@ -189,18 +238,14 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, ret = -ENOMEM; goto out_progs; } - sk_psock_is_new = true; } if (msg_parser) psock_set_prog(&psock->progs.msg_parser, msg_parser); - if (sk_psock_is_new) { - ret = tcp_bpf_init(sk); - if (ret < 0) - goto out_drop; - } else { - tcp_bpf_reinit(sk); - } + + ret = sock_map_init_proto(sk, psock); + if (ret < 0) + goto out_drop; write_lock_bh(&sk->sk_callback_lock); if (skb_progs && !psock->parser.enabled) { @@ -228,14 +273,37 @@ out: return ret; } +static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk) +{ + struct sk_psock *psock; + int ret; + + psock = sock_map_psock_get_checked(sk); + if (IS_ERR(psock)) + return PTR_ERR(psock); + + if (!psock) { + psock = sk_psock_init(sk, map->numa_node); + if (!psock) + return -ENOMEM; + } + + ret = sock_map_init_proto(sk, psock); + if (ret < 0) + sk_psock_put(sk, psock); + return ret; +} + static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); int i; + /* After the sync no updates or deletes will be in-flight so it + * is safe to walk map and remove entries without risking a race + * in EEXIST update case. + */ synchronize_rcu(); - rcu_read_lock(); - raw_spin_lock_bh(&stab->lock); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; @@ -243,13 +311,14 @@ static void sock_map_free(struct bpf_map *map) sk = xchg(psk, NULL); if (sk) { lock_sock(sk); + rcu_read_lock(); sock_map_unref(sk, psk); + rcu_read_unlock(); release_sock(sk); } } - raw_spin_unlock_bh(&stab->lock); - rcu_read_unlock(); + /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(stab->sks); @@ -274,7 +343,22 @@ static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) static void *sock_map_lookup(struct bpf_map *map, void *key) { - return ERR_PTR(-EOPNOTSUPP); + return __sock_map_lookup_elem(map, *(u32 *)key); +} + +static void *sock_map_lookup_sys(struct bpf_map *map, void *key) +{ + struct sock *sk; + + if (map->value_size != sizeof(u64)) + return ERR_PTR(-ENOSPC); + + sk = __sock_map_lookup_elem(map, *(u32 *)key); + if (!sk) + return ERR_PTR(-ENOENT); + + sock_gen_cookie(sk); + return &sk->sk_cookie; } static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, @@ -333,11 +417,15 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) return 0; } +static bool sock_map_redirect_allowed(const struct sock *sk) +{ + return sk->sk_state != TCP_LISTEN; +} + static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct inet_connection_sock *icsk = inet_csk(sk); struct sk_psock_link *link; struct sk_psock *psock; struct sock *osk; @@ -348,14 +436,21 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; - if (unlikely(rcu_access_pointer(icsk->icsk_ulp_data))) + if (inet_csk_has_ulp(sk)) return -EINVAL; link = sk_psock_init_link(); if (!link) return -ENOMEM; - ret = sock_map_link(map, &stab->progs, sk); + /* Only sockets we can redirect into/from in BPF need to hold + * refs to parser/verdict progs and have their sk_data_ready + * and sk_write_space callbacks overridden. + */ + if (sock_map_redirect_allowed(sk)) + ret = sock_map_link(map, &stab->progs, sk); + else + ret = sock_map_link_no_progs(map, sk); if (ret < 0) goto out_free; @@ -390,23 +485,52 @@ out_free: static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) { return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || - ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; } -static bool sock_map_sk_is_suitable(const struct sock *sk) +static bool sk_is_tcp(const struct sock *sk) { return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; } +static bool sk_is_udp(const struct sock *sk) +{ + return sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP; +} + +static bool sock_map_sk_is_suitable(const struct sock *sk) +{ + return sk_is_tcp(sk) || sk_is_udp(sk); +} + +static bool sock_map_sk_state_allowed(const struct sock *sk) +{ + if (sk_is_tcp(sk)) + return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); + else if (sk_is_udp(sk)) + return sk_hashed(sk); + + return false; +} + static int sock_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { - u32 ufd = *(u32 *)value; u32 idx = *(u32 *)key; struct socket *sock; struct sock *sk; int ret; + u64 ufd; + + if (map->value_size == sizeof(u64)) + ufd = *(u64 *)value; + else + ufd = *(u32 *)value; + if (ufd > S32_MAX) + return -EINVAL; sock = sockfd_lookup(ufd, &ret); if (!sock) @@ -416,14 +540,16 @@ static int sock_map_update_elem(struct bpf_map *map, void *key, ret = -EINVAL; goto out; } - if (!sock_map_sk_is_suitable(sk) || - sk->sk_state != TCP_ESTABLISHED) { + if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); - ret = sock_map_update_common(map, idx, sk, flags); + if (!sock_map_sk_state_allowed(sk)) + ret = -EOPNOTSUPP; + else + ret = sock_map_update_common(map, idx, sk, flags); sock_map_sk_release(sk); out: fput(sock->file); @@ -457,13 +583,17 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); - if (!tcb->bpf.sk_redir) + + sk = __sock_map_lookup_elem(map, key); + if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = sk; return SK_PASS; } @@ -480,12 +610,17 @@ const struct bpf_func_proto bpf_sk_redirect_map_proto = { BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, struct bpf_map *, map, u32, key, u64, flags) { + struct sock *sk; + if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - msg->flags = flags; - msg->sk_redir = __sock_map_lookup_elem(map, key); - if (!msg->sk_redir) + + sk = __sock_map_lookup_elem(map, key); + if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + + msg->flags = flags; + msg->sk_redir = sk; return SK_PASS; } @@ -503,6 +638,7 @@ const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, .map_get_next_key = sock_map_get_next_key, + .map_lookup_elem_sys_only = sock_map_lookup_sys, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_lookup_elem = sock_map_lookup, @@ -515,7 +651,7 @@ struct bpf_htab_elem { u32 hash; struct sock *sk; struct hlist_node node; - u8 key[0]; + u8 key[]; }; struct bpf_htab_bucket { @@ -659,7 +795,6 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct inet_connection_sock *icsk = inet_csk(sk); u32 key_size = map->key_size, hash; struct bpf_htab_elem *elem, *elem_new; struct bpf_htab_bucket *bucket; @@ -670,14 +805,21 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; - if (unlikely(icsk->icsk_ulp_data)) + if (inet_csk_has_ulp(sk)) return -EINVAL; link = sk_psock_init_link(); if (!link) return -ENOMEM; - ret = sock_map_link(map, &htab->progs, sk); + /* Only sockets we can redirect into/from in BPF need to hold + * refs to parser/verdict progs and have their sk_data_ready + * and sk_write_space callbacks overridden. + */ + if (sock_map_redirect_allowed(sk)) + ret = sock_map_link(map, &htab->progs, sk); + else + ret = sock_map_link_no_progs(map, sk); if (ret < 0) goto out_free; @@ -726,10 +868,17 @@ out_free: static int sock_hash_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { - u32 ufd = *(u32 *)value; struct socket *sock; struct sock *sk; int ret; + u64 ufd; + + if (map->value_size == sizeof(u64)) + ufd = *(u64 *)value; + else + ufd = *(u32 *)value; + if (ufd > S32_MAX) + return -EINVAL; sock = sockfd_lookup(ufd, &ret); if (!sock) @@ -739,14 +888,16 @@ static int sock_hash_update_elem(struct bpf_map *map, void *key, ret = -EINVAL; goto out; } - if (!sock_map_sk_is_suitable(sk) || - sk->sk_state != TCP_ESTABLISHED) { + if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); - ret = sock_hash_update_common(map, key, sk, flags); + if (!sock_map_sk_state_allowed(sk)) + ret = -EOPNOTSUPP; + else + ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: fput(sock->file); @@ -803,7 +954,8 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) return ERR_PTR(-EPERM); if (attr->max_entries == 0 || attr->key_size == 0 || - attr->value_size != 4 || + (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->key_size > MAX_BPF_STACK) @@ -858,25 +1010,50 @@ static void sock_hash_free(struct bpf_map *map) struct hlist_node *node; int i; + /* After the sync no updates or deletes will be in-flight so it + * is safe to walk map and remove entries without risking a race + * in EEXIST update case. + */ synchronize_rcu(); - rcu_read_lock(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); - raw_spin_lock_bh(&bucket->lock); hlist_for_each_entry_safe(elem, node, &bucket->head, node) { hlist_del_rcu(&elem->node); lock_sock(elem->sk); + rcu_read_lock(); sock_map_unref(elem->sk, elem); + rcu_read_unlock(); release_sock(elem->sk); } - raw_spin_unlock_bh(&bucket->lock); } - rcu_read_unlock(); + + /* wait for psock readers accessing its map link */ + synchronize_rcu(); bpf_map_area_free(htab->buckets); kfree(htab); } +static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) +{ + struct sock *sk; + + if (map->value_size != sizeof(u64)) + return ERR_PTR(-ENOSPC); + + sk = __sock_hash_lookup_elem(map, key); + if (!sk) + return ERR_PTR(-ENOENT); + + sock_gen_cookie(sk); + return &sk->sk_cookie; +} + +static void *sock_hash_lookup(struct bpf_map *map, void *key) +{ + return __sock_hash_lookup_elem(map, key); +} + static void sock_hash_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs); @@ -908,13 +1085,17 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); - if (!tcb->bpf.sk_redir) + + sk = __sock_hash_lookup_elem(map, key); + if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = sk; return SK_PASS; } @@ -931,12 +1112,17 @@ const struct bpf_func_proto bpf_sk_redirect_hash_proto = { BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, struct bpf_map *, map, void *, key, u64, flags) { + struct sock *sk; + if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - msg->flags = flags; - msg->sk_redir = __sock_hash_lookup_elem(map, key); - if (!msg->sk_redir) + + sk = __sock_hash_lookup_elem(map, key); + if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + + msg->flags = flags; + msg->sk_redir = sk; return SK_PASS; } @@ -956,7 +1142,8 @@ const struct bpf_map_ops sock_hash_ops = { .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, - .map_lookup_elem = sock_map_lookup, + .map_lookup_elem = sock_hash_lookup, + .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, }; @@ -1000,7 +1187,7 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, return 0; } -void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link) +static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { case BPF_MAP_TYPE_SOCKMAP: @@ -1013,3 +1200,54 @@ void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link) break; } } + +static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_link *link; + + while ((link = sk_psock_link_pop(psock))) { + sock_map_unlink(sk, link); + sk_psock_free_link(link); + } +} + +void sock_map_unhash(struct sock *sk) +{ + void (*saved_unhash)(struct sock *sk); + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + + saved_unhash = psock->saved_unhash; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + saved_unhash(sk); +} + +void sock_map_close(struct sock *sk, long timeout) +{ + void (*saved_close)(struct sock *sk, long timeout); + struct sk_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + + saved_close = psock->saved_close; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + release_sock(sk); + saved_close(sk, timeout); +} diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 91e9f2223c39..adcb3aea576d 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -16,27 +16,8 @@ DEFINE_SPINLOCK(reuseport_lock); -#define REUSEPORT_MIN_ID 1 static DEFINE_IDA(reuseport_ida); -int reuseport_get_id(struct sock_reuseport *reuse) -{ - int id; - - if (reuse->reuseport_id) - return reuse->reuseport_id; - - id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0, - /* Called under reuseport_lock */ - GFP_ATOMIC); - if (id < 0) - return id; - - reuse->reuseport_id = id; - - return reuse->reuseport_id; -} - static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { unsigned int size = sizeof(struct sock_reuseport) + @@ -55,6 +36,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) int reuseport_alloc(struct sock *sk, bool bind_inany) { struct sock_reuseport *reuse; + int id, ret = 0; /* bh lock used since this function call may precede hlist lock in * soft irq of receive path or setsockopt from process context @@ -78,10 +60,18 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) { - spin_unlock_bh(&reuseport_lock); - return -ENOMEM; + ret = -ENOMEM; + goto out; } + id = ida_alloc(&reuseport_ida, GFP_ATOMIC); + if (id < 0) { + kfree(reuse); + ret = id; + goto out; + } + + reuse->reuseport_id = id; reuse->socks[0] = sk; reuse->num_socks = 1; reuse->bind_inany = bind_inany; @@ -90,7 +80,7 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) out: spin_unlock_bh(&reuseport_lock); - return 0; + return ret; } EXPORT_SYMBOL(reuseport_alloc); @@ -134,8 +124,7 @@ static void reuseport_free_rcu(struct rcu_head *head) reuse = container_of(head, struct sock_reuseport, rcu); sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); - if (reuse->reuseport_id) - ida_simple_remove(&reuseport_ida, reuse->reuseport_id); + ida_free(&reuseport_ida, reuse->reuseport_id); kfree(reuse); } @@ -199,12 +188,15 @@ void reuseport_detach_sock(struct sock *sk) reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); - /* At least one of the sk in this reuseport group is added to - * a bpf map. Notify the bpf side. The bpf map logic will - * remove the sk if it is indeed added to a bpf map. + /* Notify the bpf side. The sk may be added to a sockarray + * map. If so, sockarray logic will remove it from the map. + * + * Other bpf map types that work with reuseport, like sockmap, + * don't need an explicit callback from here. They override sk + * unhash/close ops to remove the sk from the map before we + * get to this point. */ - if (reuse->reuseport_id) - bpf_sk_reuseport_detach(sk); + bpf_sk_reuseport_detach(sk); rcu_assign_pointer(sk->sk_reuseport_cb, NULL); diff --git a/net/core/xdp.c b/net/core/xdp.c index 8310714c47fd..4c7ea85486af 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -372,7 +372,7 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); napi_direct &= !xdp_return_frame_no_direct(); - page_pool_put_page(xa->page_pool, page, napi_direct); + page_pool_put_full_page(xa->page_pool, page, napi_direct); rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: |