Diffstat (limited to 'net/core')
-rw-r--r--  net/core/bpf_sk_storage.c    285
-rw-r--r--  net/core/datagram.c           39
-rw-r--r--  net/core/dev.c                47
-rw-r--r--  net/core/dev_ioctl.c           6
-rw-r--r--  net/core/devlink.c          1267
-rw-r--r--  net/core/drop_monitor.c       35
-rw-r--r--  net/core/filter.c            234
-rw-r--r--  net/core/flow_dissector.c      4
-rw-r--r--  net/core/flow_offload.c       34
-rw-r--r--  net/core/lwt_bpf.c             2
-rw-r--r--  net/core/lwtunnel.c            6
-rw-r--r--  net/core/neighbour.c           3
-rw-r--r--  net/core/net-sysfs.c         133
-rw-r--r--  net/core/net-sysfs.h           2
-rw-r--r--  net/core/net_namespace.c      15
-rw-r--r--  net/core/page_pool.c          78
-rw-r--r--  net/core/pktgen.c              4
-rw-r--r--  net/core/rtnetlink.c          32
-rw-r--r--  net/core/skbuff.c             24
-rw-r--r--  net/core/skmsg.c              10
-rw-r--r--  net/core/sock.c               26
-rw-r--r--  net/core/sock_map.c          306
-rw-r--r--  net/core/sock_reuseport.c     50
-rw-r--r--  net/core/xdp.c                 2
24 files changed, 2240 insertions, 404 deletions
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 3ab23f698221..756b63b6f7b3 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -8,6 +8,7 @@
#include <linux/bpf.h>
#include <net/bpf_sk_storage.h>
#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
static atomic_t cache_idx;
@@ -60,7 +61,7 @@ struct bpf_sk_storage_data {
* the number of cachelines access during the cache hit case.
*/
struct bpf_sk_storage_map __rcu *smap;
- u8 data[0] __aligned(8);
+ u8 data[] __aligned(8);
};
/* Linked to bpf_sk_storage and bpf_sk_storage_map */
@@ -606,6 +607,14 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
kfree(map);
}
+/* U16_MAX is much more than enough for sk local storage
+ * considering a tcp_sock is ~2k.
+ */
+#define MAX_VALUE_SIZE \
+ min_t(u32, \
+ (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \
+ (U16_MAX - sizeof(struct bpf_sk_storage_elem)))
+
static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
{
if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
@@ -619,12 +628,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (attr->value_size >= KMALLOC_MAX_SIZE -
- MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) ||
- /* U16_MAX is much more than enough for sk local storage
- * considering a tcp_sock is ~2k.
- */
- attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem))
+ if (attr->value_size > MAX_VALUE_SIZE)
return -E2BIG;
return 0;
@@ -910,3 +914,270 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_SOCKET,
};
+
+struct bpf_sk_storage_diag {
+ u32 nr_maps;
+ struct bpf_map *maps[];
+};
+
+/* The reply will be like:
+ * INET_DIAG_BPF_SK_STORAGES (nla_nest)
+ * SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ * SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ * ....
+ */
+static int nla_value_size(u32 value_size)
+{
+ /* SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ */
+ return nla_total_size(0) + nla_total_size(sizeof(u32)) +
+ nla_total_size_64bit(value_size);
+}
+
+void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
+{
+ u32 i;
+
+ if (!diag)
+ return;
+
+ for (i = 0; i < diag->nr_maps; i++)
+ bpf_map_put(diag->maps[i]);
+
+ kfree(diag);
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free);
+
+static bool diag_check_dup(const struct bpf_sk_storage_diag *diag,
+ const struct bpf_map *map)
+{
+ u32 i;
+
+ for (i = 0; i < diag->nr_maps; i++) {
+ if (diag->maps[i] == map)
+ return true;
+ }
+
+ return false;
+}
+
+struct bpf_sk_storage_diag *
+bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
+{
+ struct bpf_sk_storage_diag *diag;
+ struct nlattr *nla;
+ u32 nr_maps = 0;
+ int rem, err;
+
+ /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as
+ * the map_alloc_check() side also does.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ nla_for_each_nested(nla, nla_stgs, rem) {
+ if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
+ nr_maps++;
+ }
+
+ diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps,
+ GFP_KERNEL);
+ if (!diag)
+ return ERR_PTR(-ENOMEM);
+
+ nla_for_each_nested(nla, nla_stgs, rem) {
+ struct bpf_map *map;
+ int map_fd;
+
+ if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
+ continue;
+
+ map_fd = nla_get_u32(nla);
+ map = bpf_map_get(map_fd);
+ if (IS_ERR(map)) {
+ err = PTR_ERR(map);
+ goto err_free;
+ }
+ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) {
+ bpf_map_put(map);
+ err = -EINVAL;
+ goto err_free;
+ }
+ if (diag_check_dup(diag, map)) {
+ bpf_map_put(map);
+ err = -EEXIST;
+ goto err_free;
+ }
+ diag->maps[diag->nr_maps++] = map;
+ }
+
+ return diag;
+
+err_free:
+ bpf_sk_storage_diag_free(diag);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
+
+static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb)
+{
+ struct nlattr *nla_stg, *nla_value;
+ struct bpf_sk_storage_map *smap;
+
+ /* It cannot exceed max nlattr's payload */
+ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE);
+
+ nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
+ if (!nla_stg)
+ return -EMSGSIZE;
+
+ smap = rcu_dereference(sdata->smap);
+ if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id))
+ goto errout;
+
+ nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE,
+ smap->map.value_size,
+ SK_DIAG_BPF_STORAGE_PAD);
+ if (!nla_value)
+ goto errout;
+
+ if (map_value_has_spin_lock(&smap->map))
+ copy_map_value_locked(&smap->map, nla_data(nla_value),
+ sdata->data, true);
+ else
+ copy_map_value(&smap->map, nla_data(nla_value), sdata->data);
+
+ nla_nest_end(skb, nla_stg);
+ return 0;
+
+errout:
+ nla_nest_cancel(skb, nla_stg);
+ return -EMSGSIZE;
+}
+
+static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
+ int stg_array_type,
+ unsigned int *res_diag_size)
+{
+ /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
+ unsigned int diag_size = nla_total_size(0);
+ struct bpf_sk_storage *sk_storage;
+ struct bpf_sk_storage_elem *selem;
+ struct bpf_sk_storage_map *smap;
+ struct nlattr *nla_stgs;
+ unsigned int saved_len;
+ int err = 0;
+
+ rcu_read_lock();
+
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+ if (!sk_storage || hlist_empty(&sk_storage->list)) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ nla_stgs = nla_nest_start(skb, stg_array_type);
+ if (!nla_stgs)
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+
+ saved_len = skb->len;
+ hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
+ smap = rcu_dereference(SDATA(selem)->smap);
+ diag_size += nla_value_size(smap->map.value_size);
+
+ if (nla_stgs && diag_get(SDATA(selem), skb))
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+ }
+
+ rcu_read_unlock();
+
+ if (nla_stgs) {
+ if (saved_len == skb->len)
+ nla_nest_cancel(skb, nla_stgs);
+ else
+ nla_nest_end(skb, nla_stgs);
+ }
+
+ if (diag_size == nla_total_size(0)) {
+ *res_diag_size = 0;
+ return 0;
+ }
+
+ *res_diag_size = diag_size;
+ return err;
+}
+
+int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
+ struct sock *sk, struct sk_buff *skb,
+ int stg_array_type,
+ unsigned int *res_diag_size)
+{
+ /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
+ unsigned int diag_size = nla_total_size(0);
+ struct bpf_sk_storage *sk_storage;
+ struct bpf_sk_storage_data *sdata;
+ struct nlattr *nla_stgs;
+ unsigned int saved_len;
+ int err = 0;
+ u32 i;
+
+ *res_diag_size = 0;
+
+ /* No map has been specified. Dump all. */
+ if (!diag->nr_maps)
+ return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type,
+ res_diag_size);
+
+ rcu_read_lock();
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+ if (!sk_storage || hlist_empty(&sk_storage->list)) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ nla_stgs = nla_nest_start(skb, stg_array_type);
+ if (!nla_stgs)
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+
+ saved_len = skb->len;
+ for (i = 0; i < diag->nr_maps; i++) {
+ sdata = __sk_storage_lookup(sk_storage,
+ (struct bpf_sk_storage_map *)diag->maps[i],
+ false);
+
+ if (!sdata)
+ continue;
+
+ diag_size += nla_value_size(diag->maps[i]->value_size);
+
+ if (nla_stgs && diag_get(sdata, skb))
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+ }
+ rcu_read_unlock();
+
+ if (nla_stgs) {
+ if (saved_len == skb->len)
+ nla_nest_cancel(skb, nla_stgs);
+ else
+ nla_nest_end(skb, nla_stgs);
+ }
+
+ if (diag_size == nla_total_size(0)) {
+ *res_diag_size = 0;
+ return 0;
+ }
+
+ *res_diag_size = diag_size;
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put);
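
bpf_sk_storage.c now exports a small diag API (alloc/put/free) for sock_diag handlers. A minimal sketch of how an inet_diag-style dumper might drive it, assuming the SK_DIAG_*/INET_DIAG_* constants from the matching uapi change; the example_* wrapper names are hypothetical:

/* Illustrative sock_diag-side flow only; the wrappers are made up, while
 * the attribute/constant names come from this patch and the inet_diag uapi.
 */
#include <net/bpf_sk_storage.h>
#include <linux/inet_diag.h>

/* Request side: userspace nests SK_DIAG_BPF_STORAGE_REQ_MAP_FD attributes. */
static struct bpf_sk_storage_diag *
example_diag_parse(const struct nlattr *bpf_stgs)
{
        return bpf_sk_storage_diag_alloc(bpf_stgs);
}

/* Dump side: append the socket's storages under INET_DIAG_BPF_SK_STORAGES. */
static int example_diag_fill(struct bpf_sk_storage_diag *diag,
                             struct sock *sk, struct sk_buff *skb,
                             unsigned int *res_size)
{
        return bpf_sk_storage_diag_put(diag, sk, skb,
                                       INET_DIAG_BPF_SK_STORAGES, res_size);
}

/* When the dump is finished, drop the map references taken at alloc time. */
static void example_diag_done(struct bpf_sk_storage_diag *diag)
{
        bpf_sk_storage_diag_free(diag);
}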
diff --git a/net/core/datagram.c b/net/core/datagram.c
index a78e7f864c1e..639745d4f3b9 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -51,6 +51,7 @@
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -166,8 +167,6 @@ done:
struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
struct sk_buff_head *queue,
unsigned int flags,
- void (*destructor)(struct sock *sk,
- struct sk_buff *skb),
int *off, int *err,
struct sk_buff **last)
{
@@ -198,8 +197,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
refcount_inc(&skb->users);
} else {
__skb_unlink(skb, queue);
- if (destructor)
- destructor(sk, skb);
}
*off = _off;
return skb;
@@ -212,7 +209,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
* @sk: socket
* @queue: socket queue from which to receive
* @flags: MSG_ flags
- * @destructor: invoked under the receive lock on successful dequeue
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
* @err: error code returned
@@ -245,10 +241,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
*/
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
struct sk_buff_head *queue,
- unsigned int flags,
- void (*destructor)(struct sock *sk,
- struct sk_buff *skb),
- int *off, int *err,
+ unsigned int flags, int *off, int *err,
struct sk_buff **last)
{
struct sk_buff *skb;
@@ -269,8 +262,8 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
* However, this function was correct in any case. 8)
*/
spin_lock_irqsave(&queue->lock, cpu_flags);
- skb = __skb_try_recv_from_queue(sk, queue, flags, destructor,
- off, &error, last);
+ skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error,
+ last);
spin_unlock_irqrestore(&queue->lock, cpu_flags);
if (error)
goto no_packet;
@@ -293,10 +286,7 @@ EXPORT_SYMBOL(__skb_try_recv_datagram);
struct sk_buff *__skb_recv_datagram(struct sock *sk,
struct sk_buff_head *sk_queue,
- unsigned int flags,
- void (*destructor)(struct sock *sk,
- struct sk_buff *skb),
- int *off, int *err)
+ unsigned int flags, int *off, int *err)
{
struct sk_buff *skb, *last;
long timeo;
@@ -304,8 +294,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
- skb = __skb_try_recv_datagram(sk, sk_queue, flags, destructor,
- off, err, &last);
+ skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err,
+ &last);
if (skb)
return skb;
@@ -326,7 +316,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
return __skb_recv_datagram(sk, &sk->sk_receive_queue,
flags | (noblock ? MSG_DONTWAIT : 0),
- NULL, &off, err);
+ &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
@@ -414,6 +404,11 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
}
EXPORT_SYMBOL(skb_kill_datagram);
+INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr,
+ size_t bytes,
+ void *data __always_unused,
+ struct iov_iter *i));
+
static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len, bool fault_short,
size_t (*cb)(const void *, size_t, void *,
@@ -427,7 +422,8 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- n = cb(skb->data + offset, copy, data, to);
+ n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ skb->data + offset, copy, data, to);
offset += n;
if (n != copy)
goto short_copy;
@@ -449,8 +445,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
- n = cb(vaddr + skb_frag_off(frag) + offset - start,
- copy, data, to);
+ n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ vaddr + skb_frag_off(frag) + offset - start,
+ copy, data, to);
kunmap(page);
offset += n;
if (n != copy)
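
datagram.c drops the per-skb destructor callback from the __skb_*recv* helpers. A caller that used to pass NULL simply loses the argument; a hedged before/after sketch (the wrapper is illustrative, the new signature is taken from this diff):

/* Illustrative caller update only. */
struct sk_buff *example_recv(struct sock *sk, unsigned int flags, int *err)
{
        int off = 0;

        /* before: __skb_recv_datagram(sk, &sk->sk_receive_queue, flags,
         *                             NULL, &off, err);
         */
        return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags,
                                   &off, err);
}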
diff --git a/net/core/dev.c b/net/core/dev.c
index 500bba8874b0..9c9e763bfe0e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3266,7 +3266,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
*
- * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
+ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
*/
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path)
@@ -3295,7 +3295,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
features &= ~NETIF_F_GSO_PARTIAL;
}
- BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
+ BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
@@ -4638,7 +4638,6 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
kfree_skb(skb);
}
}
-EXPORT_SYMBOL_GPL(generic_xdp_tx);
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
@@ -4849,7 +4848,8 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
skb->tc_at_ingress = 1;
mini_qdisc_bstats_cpu_update(miniq, skb);
- switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
+ switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
+ &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
@@ -8655,15 +8655,17 @@ static void dev_xdp_uninstall(struct net_device *dev)
* @dev: device
* @extack: netlink extended ack
* @fd: new program fd or negative value to clear
+ * @expected_fd: old program fd that userspace expects to replace or clear
* @flags: xdp-related flags
*
* Set or clear a bpf program for a device
*/
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
- int fd, u32 flags)
+ int fd, int expected_fd, u32 flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
enum bpf_netdev_command query;
+ u32 prog_id, expected_id = 0;
struct bpf_prog *prog = NULL;
bpf_op_t bpf_op, bpf_chk;
bool offload;
@@ -8684,15 +8686,29 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
if (bpf_op == bpf_chk)
bpf_chk = generic_xdp_install;
- if (fd >= 0) {
- u32 prog_id;
+ prog_id = __dev_xdp_query(dev, bpf_op, query);
+ if (flags & XDP_FLAGS_REPLACE) {
+ if (expected_fd >= 0) {
+ prog = bpf_prog_get_type_dev(expected_fd,
+ BPF_PROG_TYPE_XDP,
+ bpf_op == ops->ndo_bpf);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ expected_id = prog->aux->id;
+ bpf_prog_put(prog);
+ }
+ if (prog_id != expected_id) {
+ NL_SET_ERR_MSG(extack, "Active program does not match expected");
+ return -EEXIST;
+ }
+ }
+ if (fd >= 0) {
if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
return -EEXIST;
}
- prog_id = __dev_xdp_query(dev, bpf_op, query);
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
NL_SET_ERR_MSG(extack, "XDP program already attached");
return -EBUSY;
@@ -8715,7 +8731,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
return 0;
}
} else {
- if (!__dev_xdp_query(dev, bpf_op, query))
+ if (!prog_id)
return 0;
}
@@ -9283,6 +9299,10 @@ int register_netdevice(struct net_device *dev)
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
+ ret = ethtool_check_ops(dev->ethtool_ops);
+ if (ret)
+ return ret;
+
spin_lock_init(&dev->addr_list_lock);
lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
@@ -10006,6 +10026,7 @@ EXPORT_SYMBOL(unregister_netdev);
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
+ struct net *net_old = dev_net(dev);
int err, new_nsid, new_ifindex;
ASSERT_RTNL();
@@ -10021,7 +10042,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
/* Get out if there is nothing todo */
err = 0;
- if (net_eq(dev_net(dev), net))
+ if (net_eq(net_old, net))
goto out;
/* Pick the destination device name, and ensure
@@ -10097,6 +10118,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
err = device_rename(&dev->dev, dev->name);
WARN_ON(err);
+ /* Adapt owner in case owning user namespace of target network
+ * namespace is different from the original one.
+ */
+ err = netdev_change_owner(dev, net_old, net);
+ WARN_ON(err);
+
/* Add the device back in the hashes */
list_netdevice(dev);
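
dev.c extends dev_change_xdp_fd() with an expected_fd for atomic replacement. A minimal sketch of the new contract, assuming an illustrative in-kernel caller that already holds rtnl and a XDP_FLAGS_REPLACE flag from the matching uapi change:

/* Fails with -EEXIST unless the program currently attached is the one
 * referred to by old_fd; fds and the wrapper name are illustrative.
 */
int example_replace_xdp(struct net_device *dev, struct netlink_ext_ack *extack,
                        int new_fd, int old_fd)
{
        return dev_change_xdp_fd(dev, extack, new_fd, old_fd,
                                 XDP_FLAGS_REPLACE);
}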
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index dbaebbe573f0..547b587c1950 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -190,6 +190,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
case HWTSTAMP_TX_ONESTEP_P2P:
tx_type_valid = 1;
break;
+ case __HWTSTAMP_TX_CNT:
+ /* not a real value */
+ break;
}
switch (rx_filter) {
@@ -211,6 +214,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
case HWTSTAMP_FILTER_NTP_ALL:
rx_filter_valid = 1;
break;
+ case __HWTSTAMP_FILTER_CNT:
+ /* not a real value */
+ break;
}
if (!tx_type_valid || !rx_filter_valid)
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b831c5545d6a..80f97722f31f 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -344,7 +344,7 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
struct devlink_region {
struct devlink *devlink;
struct list_head list;
- const char *name;
+ const struct devlink_region_ops *ops;
struct list_head snapshot_list;
u32 max_snapshots;
u32 cur_snapshots;
@@ -354,7 +354,6 @@ struct devlink_region {
struct devlink_snapshot {
struct list_head list;
struct devlink_region *region;
- devlink_snapshot_data_dest_t *data_destructor;
u8 *data;
u32 id;
};
@@ -365,7 +364,7 @@ devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
struct devlink_region *region;
list_for_each_entry(region, &devlink->region_list, list)
- if (!strcmp(region->name, region_name))
+ if (!strcmp(region->ops->name, region_name))
return region;
return NULL;
@@ -545,6 +544,7 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
case DEVLINK_PORT_FLAVOUR_CPU:
case DEVLINK_PORT_FLAVOUR_DSA:
+ case DEVLINK_PORT_FLAVOUR_VIRTUAL:
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER,
attrs->phys.port_number))
return -EMSGSIZE;
@@ -2709,7 +2709,7 @@ static struct net *devlink_netns_get(struct sk_buff *skb,
struct net *net;
if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) {
- NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified");
+ NL_SET_ERR_MSG_MOD(info->extack, "multiple netns identifying attributes specified");
return ERR_PTR(-EINVAL);
}
@@ -2727,7 +2727,7 @@ static struct net *devlink_netns_get(struct sk_buff *skb,
net = ERR_PTR(-EINVAL);
}
if (IS_ERR(net)) {
- NL_SET_ERR_MSG(info->extack, "Unknown network namespace");
+ NL_SET_ERR_MSG_MOD(info->extack, "Unknown network namespace");
return ERR_PTR(-EINVAL);
}
if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
@@ -3694,7 +3694,7 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
if (err)
goto nla_put_failure;
- err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name);
+ err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name);
if (err)
goto nla_put_failure;
@@ -3740,7 +3740,7 @@ static void devlink_nl_region_notify(struct devlink_region *region,
goto out_cancel_msg;
err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
- region->name);
+ region->ops->name);
if (err)
goto out_cancel_msg;
@@ -3768,13 +3768,201 @@ out_free_msg:
nlmsg_free(msg);
}
+/**
+ * __devlink_snapshot_id_increment - Increment number of snapshots using an id
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Track when a new snapshot begins using an id. Load the count for the
+ * given id from the snapshot xarray, increment it, and store it back.
+ *
+ * Called when a new snapshot is created with the given id.
+ *
+ * The id *must* have been previously allocated by
+ * devlink_region_snapshot_id_get().
+ *
+ * Returns 0 on success, or an error on failure.
+ */
+static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id)
+{
+ unsigned long count;
+ void *p;
+
+ lockdep_assert_held(&devlink->lock);
+
+ p = xa_load(&devlink->snapshot_ids, id);
+ if (WARN_ON(!p))
+ return -EINVAL;
+
+ if (WARN_ON(!xa_is_value(p)))
+ return -EINVAL;
+
+ count = xa_to_value(p);
+ count++;
+
+ return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(count),
+ GFP_KERNEL));
+}
+
+/**
+ * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Track when a snapshot is deleted and stops using an id. Load the count
+ * for the given id from the snapshot xarray, decrement it, and store it
+ * back.
+ *
+ * If the count reaches zero, erase this id from the xarray, freeing it
+ * up for future re-use by devlink_region_snapshot_id_get().
+ *
+ * Called when a snapshot using the given id is deleted, and when the
+ * initial allocator of the id is finished using it.
+ */
+static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id)
+{
+ unsigned long count;
+ void *p;
+
+ lockdep_assert_held(&devlink->lock);
+
+ p = xa_load(&devlink->snapshot_ids, id);
+ if (WARN_ON(!p))
+ return;
+
+ if (WARN_ON(!xa_is_value(p)))
+ return;
+
+ count = xa_to_value(p);
+
+ if (count > 1) {
+ count--;
+ xa_store(&devlink->snapshot_ids, id, xa_mk_value(count),
+ GFP_KERNEL);
+ } else {
+ /* If this was the last user, we can erase this id */
+ xa_erase(&devlink->snapshot_ids, id);
+ }
+}
+
+/**
+ * __devlink_snapshot_id_insert - Insert a specific snapshot ID
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Mark the given snapshot id as used by inserting a zero value into the
+ * snapshot xarray.
+ *
+ * This must be called while holding the devlink instance lock. Unlike
+ * devlink_snapshot_id_get, the initial reference count is zero, not one.
+ * It is expected that the id will immediately be used before
+ * releasing the devlink instance lock.
+ *
+ * Returns zero on success, or an error code if the snapshot id could not
+ * be inserted.
+ */
+static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id)
+{
+ lockdep_assert_held(&devlink->lock);
+
+ if (WARN_ON(xa_load(&devlink->snapshot_ids, id)))
+ return -EEXIST;
+
+ return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(0),
+ GFP_KERNEL));
+}
+
+/**
+ * __devlink_region_snapshot_id_get - get snapshot ID
+ * @devlink: devlink instance
+ * @id: storage to return snapshot id
+ *
+ * Allocates a new snapshot id. Returns zero on success, or a negative
+ * error on failure. Must be called while holding the devlink instance
+ * lock.
+ *
+ * Snapshot IDs are tracked using an xarray which stores the number of
+ * users of the snapshot id.
+ *
+ * Note that the caller of this function counts as a 'user', in order to
+ * avoid race conditions. The caller must release its hold on the
+ * snapshot by using devlink_region_snapshot_id_put.
+ */
+static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
+{
+ lockdep_assert_held(&devlink->lock);
+
+ return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1),
+ xa_limit_32b, GFP_KERNEL);
+}
+
+/**
+ * __devlink_region_snapshot_create - create a new snapshot
+ * This will add a new snapshot of a region. The snapshot
+ * will be stored on the region struct and can be accessed
+ * from devlink. This is useful for future analyses of snapshots.
+ * Multiple snapshots can be created on a region.
+ * The @snapshot_id should be obtained using the getter function.
+ *
+ * Must be called only while holding the devlink instance lock.
+ *
+ * @region: devlink region of the snapshot
+ * @data: snapshot data
+ * @snapshot_id: snapshot id to be created
+ */
+static int
+__devlink_region_snapshot_create(struct devlink_region *region,
+ u8 *data, u32 snapshot_id)
+{
+ struct devlink *devlink = region->devlink;
+ struct devlink_snapshot *snapshot;
+ int err;
+
+ lockdep_assert_held(&devlink->lock);
+
+ /* check if region can hold one more snapshot */
+ if (region->cur_snapshots == region->max_snapshots)
+ return -ENOSPC;
+
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id))
+ return -EEXIST;
+
+ snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
+ if (!snapshot)
+ return -ENOMEM;
+
+ err = __devlink_snapshot_id_increment(devlink, snapshot_id);
+ if (err)
+ goto err_snapshot_id_increment;
+
+ snapshot->id = snapshot_id;
+ snapshot->region = region;
+ snapshot->data = data;
+
+ list_add_tail(&snapshot->list, &region->snapshot_list);
+
+ region->cur_snapshots++;
+
+ devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
+ return 0;
+
+err_snapshot_id_increment:
+ kfree(snapshot);
+ return err;
+}
+
static void devlink_region_snapshot_del(struct devlink_region *region,
struct devlink_snapshot *snapshot)
{
+ struct devlink *devlink = region->devlink;
+
+ lockdep_assert_held(&devlink->lock);
+
devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
region->cur_snapshots--;
list_del(&snapshot->list);
- (*snapshot->data_destructor)(snapshot->data);
+ region->ops->destructor(snapshot->data);
+ __devlink_snapshot_id_decrement(devlink, snapshot->id);
kfree(snapshot);
}
@@ -3877,6 +4065,71 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb,
return 0;
}
+static int
+devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_region *region;
+ const char *region_name;
+ u32 snapshot_id;
+ u8 *data;
+ int err;
+
+ if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) {
+ NL_SET_ERR_MSG_MOD(info->extack, "No region name provided");
+ return -EINVAL;
+ }
+
+ if (!info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) {
+ NL_SET_ERR_MSG_MOD(info->extack, "No snapshot id provided");
+ return -EINVAL;
+ }
+
+ region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+ region = devlink_region_get_by_name(devlink, region_name);
+ if (!region) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not exist");
+ return -EINVAL;
+ }
+
+ if (!region->ops->snapshot) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not support taking an immediate snapshot");
+ return -EOPNOTSUPP;
+ }
+
+ if (region->cur_snapshots == region->max_snapshots) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The region has reached the maximum number of stored snapshots");
+ return -ENOSPC;
+ }
+
+ snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use");
+ return -EEXIST;
+ }
+
+ err = __devlink_snapshot_id_insert(devlink, snapshot_id);
+ if (err)
+ return err;
+
+ err = region->ops->snapshot(devlink, info->extack, &data);
+ if (err)
+ goto err_snapshot_capture;
+
+ err = __devlink_region_snapshot_create(region, data, snapshot_id);
+ if (err)
+ goto err_snapshot_create;
+
+ return 0;
+
+err_snapshot_create:
+ region->ops->destructor(data);
+err_snapshot_capture:
+ __devlink_snapshot_id_decrement(devlink, snapshot_id);
+ return err;
+}
+
static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
struct devlink *devlink,
u8 *chunk, u32 chunk_size,
@@ -4239,11 +4492,17 @@ struct devlink_fmsg_item {
int attrtype;
u8 nla_type;
u16 len;
- int value[0];
+ int value[];
};
struct devlink_fmsg {
struct list_head item_list;
+ bool putting_binary; /* This flag forces enclosing of binary data
+ * in an array brackets. It forces using
+ * of designated API:
+ * devlink_fmsg_binary_pair_nest_start()
+ * devlink_fmsg_binary_pair_nest_end()
+ */
};
static struct devlink_fmsg *devlink_fmsg_alloc(void)
@@ -4287,17 +4546,26 @@ static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg,
int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start);
static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END);
}
int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_nest_end(fmsg);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end);
@@ -4308,6 +4576,9 @@ static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name)
{
struct devlink_fmsg_item *item;
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE)
return -EMSGSIZE;
@@ -4328,6 +4599,9 @@ int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name)
{
int err;
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START);
if (err)
return err;
@@ -4342,6 +4616,9 @@ EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start);
int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_nest_end(fmsg);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end);
@@ -4351,6 +4628,9 @@ int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
{
int err;
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
err = devlink_fmsg_pair_nest_start(fmsg, name);
if (err)
return err;
@@ -4367,6 +4647,9 @@ int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
{
int err;
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
err = devlink_fmsg_nest_end(fmsg);
if (err)
return err;
@@ -4379,6 +4662,30 @@ int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
}
EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end);
+int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg,
+ const char *name)
+{
+ int err;
+
+ err = devlink_fmsg_arr_pair_nest_start(fmsg, name);
+ if (err)
+ return err;
+
+ fmsg->putting_binary = true;
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start);
+
+int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+ if (!fmsg->putting_binary)
+ return -EINVAL;
+
+ fmsg->putting_binary = false;
+ return devlink_fmsg_arr_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end);
+
static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
const void *value, u16 value_len,
u8 value_nla_type)
@@ -4403,40 +4710,59 @@ static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_bool_put);
int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_u8_put);
int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put);
int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_u64_put);
int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1,
NLA_NUL_STRING);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_string_put);
-static int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
- u16 value_len)
+int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+ u16 value_len)
{
+ if (!fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY);
}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put);
int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
bool value)
@@ -4547,10 +4873,11 @@ int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
const void *value, u32 value_len)
{
u32 data_size;
+ int end_err;
u32 offset;
int err;
- err = devlink_fmsg_arr_pair_nest_start(fmsg, name);
+ err = devlink_fmsg_binary_pair_nest_start(fmsg, name);
if (err)
return err;
@@ -4560,14 +4887,18 @@ int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
data_size = DEVLINK_FMSG_MAX_SIZE;
err = devlink_fmsg_binary_put(fmsg, value + offset, data_size);
if (err)
- return err;
+ break;
+ /* Exit from loop with a break (instead of
+ * return) to make sure putting_binary is turned off in
+ * devlink_fmsg_binary_pair_nest_end
+ */
}
- err = devlink_fmsg_arr_pair_nest_end(fmsg);
- if (err)
- return err;
+ end_err = devlink_fmsg_binary_pair_nest_end(fmsg);
+ if (end_err)
+ err = end_err;
- return 0;
+ return err;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put);
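
With the binary-pair nest helpers now exported, a health reporter can stream chunked binary data itself rather than going through devlink_fmsg_binary_pair_put(). A sketch of such a dump callback; the context struct, names, and 1024-byte chunk size are illustrative:

struct example_dump_ctx {
        const u8 *buf;
        u32 len;
};

static int example_dump(struct devlink_health_reporter *reporter,
                        struct devlink_fmsg *fmsg, void *priv_ctx,
                        struct netlink_ext_ack *extack)
{
        struct example_dump_ctx *ctx = priv_ctx;
        u32 off, chunk = 0;
        int err, end_err;

        err = devlink_fmsg_binary_pair_nest_start(fmsg, "dump data");
        if (err)
                return err;

        for (off = 0; off < ctx->len; off += chunk) {
                chunk = min_t(u32, ctx->len - off, 1024);
                err = devlink_fmsg_binary_put(fmsg, ctx->buf + off, chunk);
                if (err)
                        break;
        }

        /* Close the nest even on error so putting_binary is cleared. */
        end_err = devlink_fmsg_binary_pair_nest_end(fmsg);
        return err ?: end_err;
}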
@@ -4758,6 +5089,7 @@ struct devlink_health_reporter {
struct mutex dump_lock; /* lock parallel read/write from dump buffers */
u64 graceful_period;
bool auto_recover;
+ bool auto_dump;
u8 health_state;
u64 dump_ts;
u64 dump_real_ts;
@@ -4793,14 +5125,12 @@ devlink_health_reporter_find_by_name(struct devlink *devlink,
* @devlink: devlink
* @ops: ops
* @graceful_period: to avoid recovery loops, in msecs
- * @auto_recover: auto recover when error occurs
* @priv: priv
*/
struct devlink_health_reporter *
devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, bool auto_recover,
- void *priv)
+ u64 graceful_period, void *priv)
{
struct devlink_health_reporter *reporter;
@@ -4810,8 +5140,7 @@ devlink_health_reporter_create(struct devlink *devlink,
goto unlock;
}
- if (WARN_ON(auto_recover && !ops->recover) ||
- WARN_ON(graceful_period && !ops->recover)) {
+ if (WARN_ON(graceful_period && !ops->recover)) {
reporter = ERR_PTR(-EINVAL);
goto unlock;
}
@@ -4826,7 +5155,8 @@ devlink_health_reporter_create(struct devlink *devlink,
reporter->ops = ops;
reporter->devlink = devlink;
reporter->graceful_period = graceful_period;
- reporter->auto_recover = auto_recover;
+ reporter->auto_recover = !!ops->recover;
+ reporter->auto_dump = !!ops->dump;
mutex_init(&reporter->dump_lock);
refcount_set(&reporter->refcount, 1);
list_add_tail(&reporter->list, &devlink->reporter_list);
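
devlink_health_reporter_create() loses the auto_recover argument; auto-recover and the new auto-dump behaviour now follow from which ops the reporter provides. A hedged caller-update sketch with a made-up grace period:

/* Illustrative caller update; "example_ops" and the 5s grace period are
 * placeholders supplied by the driver.
 */
static struct devlink_health_reporter *
example_reporter_create(struct devlink *devlink,
                        const struct devlink_health_reporter_ops *example_ops,
                        void *priv)
{
        /* before: devlink_health_reporter_create(devlink, example_ops,
         *                                        5000, true, priv);
         */
        return devlink_health_reporter_create(devlink, example_ops, 5000, priv);
}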
@@ -4907,6 +5237,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS,
reporter->dump_real_ts, DEVLINK_ATTR_PAD))
goto reporter_nest_cancel;
+ if (reporter->ops->dump &&
+ nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
+ reporter->auto_dump))
+ goto reporter_nest_cancel;
nla_nest_end(msg, reporter_attr);
genlmsg_end(msg, hdr);
@@ -5053,10 +5387,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
- mutex_lock(&reporter->dump_lock);
- /* store current dump of current error, for later analysis */
- devlink_health_do_dump(reporter, priv_ctx, NULL);
- mutex_unlock(&reporter->dump_lock);
+ if (reporter->auto_dump) {
+ mutex_lock(&reporter->dump_lock);
+ /* store current dump of current error, for later analysis */
+ devlink_health_do_dump(reporter, priv_ctx, NULL);
+ mutex_unlock(&reporter->dump_lock);
+ }
if (reporter->auto_recover)
return devlink_health_reporter_recover(reporter,
@@ -5230,6 +5566,11 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
err = -EOPNOTSUPP;
goto out;
}
+ if (!reporter->ops->dump &&
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
reporter->graceful_period =
@@ -5239,6 +5580,10 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
reporter->auto_recover =
nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]);
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
+ reporter->auto_dump =
+ nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]);
+
devlink_health_reporter_put(reporter);
return 0;
out:
@@ -5375,18 +5720,35 @@ struct devlink_stats {
};
/**
+ * struct devlink_trap_policer_item - Packet trap policer attributes.
+ * @policer: Immutable packet trap policer attributes.
+ * @rate: Rate in packets / sec.
+ * @burst: Burst size in packets.
+ * @list: trap_policer_list member.
+ *
+ * Describes packet trap policer attributes. Created by devlink during trap
+ * policer registration.
+ */
+struct devlink_trap_policer_item {
+ const struct devlink_trap_policer *policer;
+ u64 rate;
+ u64 burst;
+ struct list_head list;
+};
+
+/**
* struct devlink_trap_group_item - Packet trap group attributes.
* @group: Immutable packet trap group attributes.
- * @refcount: Number of trap items using the group.
+ * @policer_item: Associated policer item. Can be NULL.
* @list: trap_group_list member.
* @stats: Trap group statistics.
*
* Describes packet trap group attributes. Created by devlink during trap
- * registration.
+ * group registration.
*/
struct devlink_trap_group_item {
const struct devlink_trap_group *group;
- refcount_t refcount;
+ struct devlink_trap_policer_item *policer_item;
struct list_head list;
struct devlink_stats __percpu *stats;
};
@@ -5412,6 +5774,19 @@ struct devlink_trap_item {
void *priv;
};
+static struct devlink_trap_policer_item *
+devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id)
+{
+ struct devlink_trap_policer_item *policer_item;
+
+ list_for_each_entry(policer_item, &devlink->trap_policer_list, list) {
+ if (policer_item->policer->id == id)
+ return policer_item;
+ }
+
+ return NULL;
+}
+
static struct devlink_trap_item *
devlink_trap_item_lookup(struct devlink *devlink, const char *name)
{
@@ -5469,6 +5844,9 @@ static int devlink_trap_metadata_put(struct sk_buff *msg,
if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) &&
nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT))
goto nla_put_failure;
+ if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) &&
+ nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE))
+ goto nla_put_failure;
nla_nest_end(msg, attr);
@@ -5736,6 +6114,19 @@ devlink_trap_group_item_lookup(struct devlink *devlink, const char *name)
}
static struct devlink_trap_group_item *
+devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id)
+{
+ struct devlink_trap_group_item *group_item;
+
+ list_for_each_entry(group_item, &devlink->trap_group_list, list) {
+ if (group_item->group->id == id)
+ return group_item;
+ }
+
+ return NULL;
+}
+
+static struct devlink_trap_group_item *
devlink_trap_group_item_get_from_info(struct devlink *devlink,
struct genl_info *info)
{
@@ -5772,6 +6163,11 @@ devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink,
nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC))
goto nla_put_failure;
+ if (group_item->policer_item &&
+ nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
+ group_item->policer_item->policer->id))
+ goto nla_put_failure;
+
err = devlink_trap_stats_put(msg, group_item->stats);
if (err)
goto nla_put_failure;
@@ -5873,7 +6269,7 @@ __devlink_trap_group_action_set(struct devlink *devlink,
int err;
list_for_each_entry(trap_item, &devlink->trap_list, list) {
- if (strcmp(trap_item->trap->group.name, group_name))
+ if (strcmp(trap_item->group_item->group->name, group_name))
continue;
err = __devlink_trap_action_set(devlink, trap_item,
trap_action, extack);
@@ -5887,7 +6283,7 @@ __devlink_trap_group_action_set(struct devlink *devlink,
static int
devlink_trap_group_action_set(struct devlink *devlink,
struct devlink_trap_group_item *group_item,
- struct genl_info *info)
+ struct genl_info *info, bool *p_modified)
{
enum devlink_trap_action trap_action;
int err;
@@ -5906,6 +6302,47 @@ devlink_trap_group_action_set(struct devlink *devlink,
if (err)
return err;
+ *p_modified = true;
+
+ return 0;
+}
+
+static int devlink_trap_group_set(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item,
+ struct genl_info *info)
+{
+ struct devlink_trap_policer_item *policer_item;
+ struct netlink_ext_ack *extack = info->extack;
+ const struct devlink_trap_policer *policer;
+ struct nlattr **attrs = info->attrs;
+ int err;
+
+ if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID])
+ return 0;
+
+ if (!devlink->ops->trap_group_set)
+ return -EOPNOTSUPP;
+
+ policer_item = group_item->policer_item;
+ if (attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) {
+ u32 policer_id;
+
+ policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]);
+ policer_item = devlink_trap_policer_item_lookup(devlink,
+ policer_id);
+ if (policer_id && !policer_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
+ return -ENOENT;
+ }
+ }
+ policer = policer_item ? policer_item->policer : NULL;
+
+ err = devlink->ops->trap_group_set(devlink, group_item->group, policer);
+ if (err)
+ return err;
+
+ group_item->policer_item = policer_item;
+
return 0;
}
@@ -5915,6 +6352,7 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb,
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
struct devlink_trap_group_item *group_item;
+ bool modified = false;
int err;
if (list_empty(&devlink->trap_group_list))
@@ -5926,14 +6364,262 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb,
return -ENOENT;
}
- err = devlink_trap_group_action_set(devlink, group_item, info);
+ err = devlink_trap_group_action_set(devlink, group_item, info,
+ &modified);
+ if (err)
+ return err;
+
+ err = devlink_trap_group_set(devlink, group_item, info);
+ if (err)
+ goto err_trap_group_set;
+
+ return 0;
+
+err_trap_group_set:
+ if (modified)
+ NL_SET_ERR_MSG_MOD(extack, "Trap group set failed, but some changes were committed already");
+ return err;
+}
+
+static struct devlink_trap_policer_item *
+devlink_trap_policer_item_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ u32 id;
+
+ if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID])
+ return NULL;
+ id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]);
+
+ return devlink_trap_policer_item_lookup(devlink, id);
+}
+
+static int
+devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_policer *policer)
+{
+ struct nlattr *attr;
+ u64 drops;
+ int err;
+
+ if (!devlink->ops->trap_policer_counter_get)
+ return 0;
+
+ err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops);
+ if (err)
+ return err;
+
+ attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops,
+ DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int
+devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_policer_item *policer_item,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags)
+{
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
+ policer_item->policer->id))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_RATE,
+ policer_item->rate, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_BURST,
+ policer_item->burst, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ err = devlink_trap_policer_stats_put(msg, devlink,
+ policer_item->policer);
+ if (err)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_trap_policer_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_trap_policer_item *policer_item;
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ if (list_empty(&devlink->trap_policer_list))
+ return -EOPNOTSUPP;
+
+ policer_item = devlink_trap_policer_item_get_from_info(devlink, info);
+ if (!policer_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
+ return -ENOENT;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_trap_policer_fill(msg, devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err)
+ goto err_trap_policer_fill;
+
+ return genlmsg_reply(msg, info);
+
+err_trap_policer_fill:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ enum devlink_command cmd = DEVLINK_CMD_TRAP_POLICER_NEW;
+ struct devlink_trap_policer_item *policer_item;
+ u32 portid = NETLINK_CB(cb->skb).portid;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(policer_item, &devlink->trap_policer_list,
+ list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_trap_policer_fill(msg, devlink,
+ policer_item, cmd,
+ portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int
+devlink_trap_policer_set(struct devlink *devlink,
+ struct devlink_trap_policer_item *policer_item,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct nlattr **attrs = info->attrs;
+ u64 rate, burst;
+ int err;
+
+ rate = policer_item->rate;
+ burst = policer_item->burst;
+
+ if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE])
+ rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]);
+
+ if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST])
+ burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]);
+
+ if (rate < policer_item->policer->min_rate) {
+ NL_SET_ERR_MSG_MOD(extack, "Policer rate lower than limit");
+ return -EINVAL;
+ }
+
+ if (rate > policer_item->policer->max_rate) {
+ NL_SET_ERR_MSG_MOD(extack, "Policer rate higher than limit");
+ return -EINVAL;
+ }
+
+ if (burst < policer_item->policer->min_burst) {
+ NL_SET_ERR_MSG_MOD(extack, "Policer burst size lower than limit");
+ return -EINVAL;
+ }
+
+ if (burst > policer_item->policer->max_burst) {
+ NL_SET_ERR_MSG_MOD(extack, "Policer burst size higher than limit");
+ return -EINVAL;
+ }
+
+ err = devlink->ops->trap_policer_set(devlink, policer_item->policer,
+ rate, burst, info->extack);
if (err)
return err;
+ policer_item->rate = rate;
+ policer_item->burst = burst;
+
return 0;
}
+static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_trap_policer_item *policer_item;
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (list_empty(&devlink->trap_policer_list))
+ return -EOPNOTSUPP;
+
+ if (!devlink->ops->trap_policer_set)
+ return -EOPNOTSUPP;
+
+ policer_item = devlink_trap_policer_item_get_from_info(devlink, info);
+ if (!policer_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
+ return -ENOENT;
+ }
+
+ return devlink_trap_policer_set(devlink, policer_item, info);
+}
+
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
+ [DEVLINK_ATTR_UNSPEC] = { .strict_start_type =
+ DEVLINK_ATTR_TRAP_POLICER_ID },
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
@@ -5971,6 +6657,10 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 },
[DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 },
[DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 },
+ [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 },
};
static const struct genl_ops devlink_nl_ops[] = {
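
Drivers back the new policer commands with trap_policer_set()/trap_policer_counter_get() ops, whose shapes can be read off the call sites above. A sketch of the driver side; the hardware-programming steps are left as comments and the example_* names are placeholders:

static int
example_trap_policer_set(struct devlink *devlink,
                         const struct devlink_trap_policer *policer,
                         u64 rate, u64 burst,
                         struct netlink_ext_ack *extack)
{
        /* The core has already range-checked rate/burst against the
         * policer's min/max limits; program policer->id into hardware here.
         */
        return 0;
}

static int
example_trap_policer_counter_get(struct devlink *devlink,
                                 const struct devlink_trap_policer *policer,
                                 u64 *p_drops)
{
        /* Read the policer's drop counter from hardware here. */
        *p_drops = 0;
        return 0;
}

static const struct devlink_ops example_devlink_ops = {
        .trap_policer_set = example_trap_policer_set,
        .trap_policer_counter_get = example_trap_policer_counter_get,
};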
@@ -6094,7 +6784,8 @@ static const struct genl_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_eswitch_get_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NO_LOCK,
},
{
.cmd = DEVLINK_CMD_ESWITCH_SET,
@@ -6193,6 +6884,13 @@ static const struct genl_ops devlink_nl_ops[] = {
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
+ .cmd = DEVLINK_CMD_REGION_NEW,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = devlink_nl_cmd_region_new,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
.cmd = DEVLINK_CMD_REGION_DEL,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_region_del,
@@ -6298,6 +6996,19 @@ static const struct genl_ops devlink_nl_ops[] = {
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_GET,
+ .doit = devlink_nl_cmd_trap_policer_get_doit,
+ .dumpit = devlink_nl_cmd_trap_policer_get_dumpit,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_SET,
+ .doit = devlink_nl_cmd_trap_policer_set_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
};
static struct genl_family devlink_nl_family __ro_after_init = {
@@ -6335,6 +7046,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
if (!devlink)
return NULL;
devlink->ops = ops;
+ xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
__devlink_net_set(devlink, &init_net);
INIT_LIST_HEAD(&devlink->port_list);
INIT_LIST_HEAD(&devlink->sb_list);
@@ -6345,6 +7057,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
INIT_LIST_HEAD(&devlink->reporter_list);
INIT_LIST_HEAD(&devlink->trap_list);
INIT_LIST_HEAD(&devlink->trap_group_list);
+ INIT_LIST_HEAD(&devlink->trap_policer_list);
mutex_init(&devlink->lock);
mutex_init(&devlink->reporters_lock);
return devlink;
@@ -6429,6 +7142,7 @@ void devlink_free(struct devlink *devlink)
{
mutex_destroy(&devlink->reporters_lock);
mutex_destroy(&devlink->lock);
+ WARN_ON(!list_empty(&devlink->trap_policer_list));
WARN_ON(!list_empty(&devlink->trap_group_list));
WARN_ON(!list_empty(&devlink->trap_list));
WARN_ON(!list_empty(&devlink->reporter_list));
@@ -6439,6 +7153,8 @@ void devlink_free(struct devlink *devlink)
WARN_ON(!list_empty(&devlink->sb_list));
WARN_ON(!list_empty(&devlink->port_list));
+ xa_destroy(&devlink->snapshot_ids);
+
kfree(devlink);
}
EXPORT_SYMBOL_GPL(devlink_free);
@@ -6734,6 +7450,7 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
switch (attrs->flavour) {
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+ case DEVLINK_PORT_FLAVOUR_VIRTUAL:
if (!attrs->split)
n = snprintf(name, len, "p%u", attrs->phys.port_number);
else
@@ -7553,21 +8270,24 @@ EXPORT_SYMBOL_GPL(devlink_param_value_str_fill);
* devlink_region_create - create a new address region
*
* @devlink: devlink
- * @region_name: region name
+ * @ops: region operations and name
* @region_max_snapshots: Maximum supported number of snapshots for region
* @region_size: size of region
*/
-struct devlink_region *devlink_region_create(struct devlink *devlink,
- const char *region_name,
- u32 region_max_snapshots,
- u64 region_size)
+struct devlink_region *
+devlink_region_create(struct devlink *devlink,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
{
struct devlink_region *region;
int err = 0;
+ if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
+ return ERR_PTR(-EINVAL);
+
mutex_lock(&devlink->lock);
- if (devlink_region_get_by_name(devlink, region_name)) {
+ if (devlink_region_get_by_name(devlink, ops->name)) {
err = -EEXIST;
goto unlock;
}
@@ -7580,7 +8300,7 @@ struct devlink_region *devlink_region_create(struct devlink *devlink,
region->devlink = devlink;
region->max_snapshots = region_max_snapshots;
- region->name = region_name;
+ region->ops = ops;
region->size = region_size;
INIT_LIST_HEAD(&region->snapshot_list);
list_add_tail(&region->list, &devlink->region_list);
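
Regions are now registered with a devlink_region_ops (name, destructor, optional snapshot callback for DEVLINK_CMD_REGION_NEW) instead of a bare name string. A hedged registration sketch; EXAMPLE_REGION_SIZE and the "example" region name are stand-ins:

#define EXAMPLE_REGION_SIZE 0x10000

static int example_region_snapshot(struct devlink *devlink,
                                   struct netlink_ext_ack *extack,
                                   u8 **data)
{
        u8 *buf = kzalloc(EXAMPLE_REGION_SIZE, GFP_KERNEL);

        if (!buf)
                return -ENOMEM;
        /* Fill buf from the device here. */
        *data = buf;
        return 0;
}

static const struct devlink_region_ops example_region_ops = {
        .name = "example",
        .destructor = kfree,
        .snapshot = example_region_snapshot, /* enables DEVLINK_CMD_REGION_NEW */
};

/* Registration now passes the ops instead of a name string:
 *      region = devlink_region_create(devlink, &example_region_ops,
 *                                     max_snapshots, EXAMPLE_REGION_SIZE);
 */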
@@ -7626,75 +8346,66 @@ EXPORT_SYMBOL_GPL(devlink_region_destroy);
* Driver should use the same id for multiple snapshots taken
* on multiple regions at the same time/by the same trigger.
*
+ * The caller of this function must use devlink_region_snapshot_id_put
+ * when finished creating regions using this id.
+ *
+ * Returns zero on success, or a negative error code on failure.
+ *
* @devlink: devlink
+ * @id: storage to return id
*/
-u32 devlink_region_snapshot_id_get(struct devlink *devlink)
+int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
{
- u32 id;
+ int err;
mutex_lock(&devlink->lock);
- id = ++devlink->snapshot_id;
+ err = __devlink_region_snapshot_id_get(devlink, id);
mutex_unlock(&devlink->lock);
- return id;
+ return err;
}
EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get);
/**
+ * devlink_region_snapshot_id_put - put snapshot ID reference
+ *
+ * This should be called by a driver after finishing creating snapshots
+ * with an id. Doing so ensures that the ID can later be released in the
+ * event that all snapshots using it have been destroyed.
+ *
+ * @devlink: devlink
+ * @id: id to release reference on
+ */
+void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id)
+{
+ mutex_lock(&devlink->lock);
+ __devlink_snapshot_id_decrement(devlink, id);
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put);
+
+/**
* devlink_region_snapshot_create - create a new snapshot
* This will add a new snapshot of a region. The snapshot
* will be stored on the region struct and can be accessed
- * from devlink. This is useful for future analyses of snapshots.
+ * from devlink. This is useful for future analyses of snapshots.
* Multiple snapshots can be created on a region.
* The @snapshot_id should be obtained using the getter function.
*
* @region: devlink region of the snapshot
* @data: snapshot data
* @snapshot_id: snapshot id to be created
- * @data_destructor: pointer to destructor function to free data
*/
int devlink_region_snapshot_create(struct devlink_region *region,
- u8 *data, u32 snapshot_id,
- devlink_snapshot_data_dest_t *data_destructor)
+ u8 *data, u32 snapshot_id)
{
struct devlink *devlink = region->devlink;
- struct devlink_snapshot *snapshot;
int err;
mutex_lock(&devlink->lock);
-
- /* check if region can hold one more snapshot */
- if (region->cur_snapshots == region->max_snapshots) {
- err = -ENOMEM;
- goto unlock;
- }
-
- if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
- err = -EEXIST;
- goto unlock;
- }
-
- snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
- if (!snapshot) {
- err = -ENOMEM;
- goto unlock;
- }
-
- snapshot->id = snapshot_id;
- snapshot->region = region;
- snapshot->data = data;
- snapshot->data_destructor = data_destructor;
-
- list_add_tail(&snapshot->list, &region->snapshot_list);
-
- region->cur_snapshots++;
-
- devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
+ err = __devlink_region_snapshot_create(region, data, snapshot_id);
mutex_unlock(&devlink->lock);
- return 0;
-unlock:
- mutex_unlock(&devlink->lock);
return err;
}
EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
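
The snapshot id is now reference-counted: devlink_region_snapshot_id_get() must be balanced with devlink_region_snapshot_id_put(), and devlink_region_snapshot_create() no longer takes a destructor (the region ops supply it). A driver-side sketch of the new lifecycle; the wrapper name is illustrative and capture error handling is elided:

static void example_take_snapshot(struct devlink *devlink,
                                  struct devlink_region *region, u8 *data)
{
        u32 snapshot_id;
        int err;

        err = devlink_region_snapshot_id_get(devlink, &snapshot_id);
        if (err)
                return;

        /* The destructor argument is gone; region->ops->destructor is used. */
        err = devlink_region_snapshot_create(region, data, snapshot_id);
        if (err)
                kfree(data);

        /* Drop the reference taken by _id_get() once done with the id. */
        devlink_region_snapshot_id_put(devlink, snapshot_id);
}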
@@ -7734,6 +8445,8 @@ static const struct devlink_trap devlink_trap_generic[] = {
DEVLINK_TRAP(NON_ROUTABLE, DROP),
DEVLINK_TRAP(DECAP_ERROR, EXCEPTION),
DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP),
+ DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP),
+ DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP),
};
#define DEVLINK_TRAP_GROUP(_id) \
@@ -7747,6 +8460,7 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = {
DEVLINK_TRAP_GROUP(L3_DROPS),
DEVLINK_TRAP_GROUP(BUFFER_DROPS),
DEVLINK_TRAP_GROUP(TUNNEL_DROPS),
+ DEVLINK_TRAP_GROUP(ACL_DROPS),
};
static int devlink_trap_generic_verify(const struct devlink_trap *trap)
@@ -7780,7 +8494,7 @@ static int devlink_trap_driver_verify(const struct devlink_trap *trap)
static int devlink_trap_verify(const struct devlink_trap *trap)
{
- if (!trap || !trap->name || !trap->group.name)
+ if (!trap || !trap->name)
return -EINVAL;
if (trap->generic)
@@ -7851,108 +8565,22 @@ devlink_trap_group_notify(struct devlink *devlink,
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
-static struct devlink_trap_group_item *
-devlink_trap_group_item_create(struct devlink *devlink,
- const struct devlink_trap_group *group)
-{
- struct devlink_trap_group_item *group_item;
- int err;
-
- err = devlink_trap_group_verify(group);
- if (err)
- return ERR_PTR(err);
-
- group_item = kzalloc(sizeof(*group_item), GFP_KERNEL);
- if (!group_item)
- return ERR_PTR(-ENOMEM);
-
- group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
- if (!group_item->stats) {
- err = -ENOMEM;
- goto err_stats_alloc;
- }
-
- group_item->group = group;
- refcount_set(&group_item->refcount, 1);
-
- if (devlink->ops->trap_group_init) {
- err = devlink->ops->trap_group_init(devlink, group);
- if (err)
- goto err_group_init;
- }
-
- list_add_tail(&group_item->list, &devlink->trap_group_list);
- devlink_trap_group_notify(devlink, group_item,
- DEVLINK_CMD_TRAP_GROUP_NEW);
-
- return group_item;
-
-err_group_init:
- free_percpu(group_item->stats);
-err_stats_alloc:
- kfree(group_item);
- return ERR_PTR(err);
-}
-
-static void
-devlink_trap_group_item_destroy(struct devlink *devlink,
- struct devlink_trap_group_item *group_item)
-{
- devlink_trap_group_notify(devlink, group_item,
- DEVLINK_CMD_TRAP_GROUP_DEL);
- list_del(&group_item->list);
- free_percpu(group_item->stats);
- kfree(group_item);
-}
-
-static struct devlink_trap_group_item *
-devlink_trap_group_item_get(struct devlink *devlink,
- const struct devlink_trap_group *group)
-{
- struct devlink_trap_group_item *group_item;
-
- group_item = devlink_trap_group_item_lookup(devlink, group->name);
- if (group_item) {
- refcount_inc(&group_item->refcount);
- return group_item;
- }
-
- return devlink_trap_group_item_create(devlink, group);
-}
-
-static void
-devlink_trap_group_item_put(struct devlink *devlink,
- struct devlink_trap_group_item *group_item)
-{
- if (!refcount_dec_and_test(&group_item->refcount))
- return;
-
- devlink_trap_group_item_destroy(devlink, group_item);
-}
-
static int
devlink_trap_item_group_link(struct devlink *devlink,
struct devlink_trap_item *trap_item)
{
+ u16 group_id = trap_item->trap->init_group_id;
struct devlink_trap_group_item *group_item;
- group_item = devlink_trap_group_item_get(devlink,
- &trap_item->trap->group);
- if (IS_ERR(group_item))
- return PTR_ERR(group_item);
+ group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id);
+ if (WARN_ON_ONCE(!group_item))
+ return -EINVAL;
trap_item->group_item = group_item;
return 0;
}
-static void
-devlink_trap_item_group_unlink(struct devlink *devlink,
- struct devlink_trap_item *trap_item)
-{
- devlink_trap_group_item_put(devlink, trap_item->group_item);
-}
-
static void devlink_trap_notify(struct devlink *devlink,
const struct devlink_trap_item *trap_item,
enum devlink_command cmd)
@@ -8015,7 +8643,6 @@ devlink_trap_register(struct devlink *devlink,
return 0;
err_trap_init:
- devlink_trap_item_group_unlink(devlink, trap_item);
err_group_link:
free_percpu(trap_item->stats);
err_stats_alloc:
@@ -8036,7 +8663,6 @@ static void devlink_trap_unregister(struct devlink *devlink,
list_del(&trap_item->list);
if (devlink->ops->trap_fini)
devlink->ops->trap_fini(devlink, trap, trap_item);
- devlink_trap_item_group_unlink(devlink, trap_item);
free_percpu(trap_item->stats);
kfree(trap_item);
}
@@ -8138,12 +8764,14 @@ devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats,
static void
devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata,
const struct devlink_trap_item *trap_item,
- struct devlink_port *in_devlink_port)
+ struct devlink_port *in_devlink_port,
+ const struct flow_action_cookie *fa_cookie)
{
struct devlink_trap_group_item *group_item = trap_item->group_item;
hw_metadata->trap_group_name = group_item->group->name;
hw_metadata->trap_name = trap_item->trap->name;
+ hw_metadata->fa_cookie = fa_cookie;
spin_lock(&in_devlink_port->type_lock);
if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH)
@@ -8157,9 +8785,12 @@ devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata,
* @skb: Trapped packet.
* @trap_ctx: Trap context.
* @in_devlink_port: Input devlink port.
+ * @fa_cookie: Flow action cookie. Could be NULL.
*/
void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb,
- void *trap_ctx, struct devlink_port *in_devlink_port)
+ void *trap_ctx, struct devlink_port *in_devlink_port,
+ const struct flow_action_cookie *fa_cookie)
+
{
struct devlink_trap_item *trap_item = trap_ctx;
struct net_dm_hw_metadata hw_metadata = {};
@@ -8168,7 +8799,7 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb,
devlink_trap_stats_update(trap_item->group_item->stats, skb->len);
devlink_trap_report_metadata_fill(&hw_metadata, trap_item,
- in_devlink_port);
+ in_devlink_port, fa_cookie);
net_dm_hw_report(skb, &hw_metadata);
}
EXPORT_SYMBOL_GPL(devlink_trap_report);
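A minimal, hypothetical caller of the extended signature: a driver that kept a copy of the rule's flow_action_cookie at offload time passes it along when reporting the trapped packet, or NULL when the rule carried none. The my_ names are placeholders for driver state.

#include <linux/skbuff.h>
#include <net/devlink.h>
#include <net/flow_offload.h>

static void my_drv_report_acl_drop(struct devlink *devlink,
				   struct sk_buff *skb, void *trap_ctx,
				   struct devlink_port *my_port,
				   const struct flow_action_cookie *rule_cookie)
{
	/* @rule_cookie may be NULL; drop_monitor only emits
	 * NET_DM_ATTR_FLOW_ACTION_COOKIE when it is set.
	 */
	devlink_trap_report(devlink, skb, trap_ctx, my_port, rule_cookie);
}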
@@ -8187,6 +8818,288 @@ void *devlink_trap_ctx_priv(void *trap_ctx)
}
EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv);
+static int
+devlink_trap_group_item_policer_link(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item)
+{
+ u32 policer_id = group_item->group->init_policer_id;
+ struct devlink_trap_policer_item *policer_item;
+
+ if (policer_id == 0)
+ return 0;
+
+ policer_item = devlink_trap_policer_item_lookup(devlink, policer_id);
+ if (WARN_ON_ONCE(!policer_item))
+ return -EINVAL;
+
+ group_item->policer_item = policer_item;
+
+ return 0;
+}
+
+static int
+devlink_trap_group_register(struct devlink *devlink,
+ const struct devlink_trap_group *group)
+{
+ struct devlink_trap_group_item *group_item;
+ int err;
+
+ if (devlink_trap_group_item_lookup(devlink, group->name))
+ return -EEXIST;
+
+ group_item = kzalloc(sizeof(*group_item), GFP_KERNEL);
+ if (!group_item)
+ return -ENOMEM;
+
+ group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
+ if (!group_item->stats) {
+ err = -ENOMEM;
+ goto err_stats_alloc;
+ }
+
+ group_item->group = group;
+
+ err = devlink_trap_group_item_policer_link(devlink, group_item);
+ if (err)
+ goto err_policer_link;
+
+ if (devlink->ops->trap_group_init) {
+ err = devlink->ops->trap_group_init(devlink, group);
+ if (err)
+ goto err_group_init;
+ }
+
+ list_add_tail(&group_item->list, &devlink->trap_group_list);
+ devlink_trap_group_notify(devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_NEW);
+
+ return 0;
+
+err_group_init:
+err_policer_link:
+ free_percpu(group_item->stats);
+err_stats_alloc:
+ kfree(group_item);
+ return err;
+}
+
+static void
+devlink_trap_group_unregister(struct devlink *devlink,
+ const struct devlink_trap_group *group)
+{
+ struct devlink_trap_group_item *group_item;
+
+ group_item = devlink_trap_group_item_lookup(devlink, group->name);
+ if (WARN_ON_ONCE(!group_item))
+ return;
+
+ devlink_trap_group_notify(devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_DEL);
+ list_del(&group_item->list);
+ free_percpu(group_item->stats);
+ kfree(group_item);
+}
+
+/**
+ * devlink_trap_groups_register - Register packet trap groups with devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devlink_trap_groups_register(struct devlink *devlink,
+ const struct devlink_trap_group *groups,
+ size_t groups_count)
+{
+ int i, err;
+
+ mutex_lock(&devlink->lock);
+ for (i = 0; i < groups_count; i++) {
+ const struct devlink_trap_group *group = &groups[i];
+
+ err = devlink_trap_group_verify(group);
+ if (err)
+ goto err_trap_group_verify;
+
+ err = devlink_trap_group_register(devlink, group);
+ if (err)
+ goto err_trap_group_register;
+ }
+ mutex_unlock(&devlink->lock);
+
+ return 0;
+
+err_trap_group_register:
+err_trap_group_verify:
+ for (i--; i >= 0; i--)
+ devlink_trap_group_unregister(devlink, &groups[i]);
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_trap_groups_register);
+
+/**
+ * devlink_trap_groups_unregister - Unregister packet trap groups from devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ */
+void devlink_trap_groups_unregister(struct devlink *devlink,
+ const struct devlink_trap_group *groups,
+ size_t groups_count)
+{
+ int i;
+
+ mutex_lock(&devlink->lock);
+ for (i = groups_count - 1; i >= 0; i--)
+ devlink_trap_group_unregister(devlink, &groups[i]);
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister);
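A hedged sketch of the new explicit group registration step. It assumes the DEVLINK_TRAP_GROUP_GENERIC() initializer macro from include/net/devlink.h takes the generic group id plus the initial policer id, as elsewhere in this series; my_trap_groups and the init/fini helpers are illustrative.

#include <linux/kernel.h>
#include <net/devlink.h>

static const struct devlink_trap_group my_trap_groups[] = {
	DEVLINK_TRAP_GROUP_GENERIC(L2_DROPS, 1),
	DEVLINK_TRAP_GROUP_GENERIC(ACL_DROPS, 1),
};

static int my_drv_trap_groups_init(struct devlink *devlink)
{
	/* Groups must be registered before any trap that points at them
	 * through init_group_id.
	 */
	return devlink_trap_groups_register(devlink, my_trap_groups,
					    ARRAY_SIZE(my_trap_groups));
}

static void my_drv_trap_groups_fini(struct devlink *devlink)
{
	devlink_trap_groups_unregister(devlink, my_trap_groups,
				       ARRAY_SIZE(my_trap_groups));
}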
+
+static void
+devlink_trap_policer_notify(struct devlink *devlink,
+ const struct devlink_trap_policer_item *policer_item,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW &&
+ cmd != DEVLINK_CMD_TRAP_POLICER_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0,
+ 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int
+devlink_trap_policer_register(struct devlink *devlink,
+ const struct devlink_trap_policer *policer)
+{
+ struct devlink_trap_policer_item *policer_item;
+ int err;
+
+ if (devlink_trap_policer_item_lookup(devlink, policer->id))
+ return -EEXIST;
+
+ policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL);
+ if (!policer_item)
+ return -ENOMEM;
+
+ policer_item->policer = policer;
+ policer_item->rate = policer->init_rate;
+ policer_item->burst = policer->init_burst;
+
+ if (devlink->ops->trap_policer_init) {
+ err = devlink->ops->trap_policer_init(devlink, policer);
+ if (err)
+ goto err_policer_init;
+ }
+
+ list_add_tail(&policer_item->list, &devlink->trap_policer_list);
+ devlink_trap_policer_notify(devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_NEW);
+
+ return 0;
+
+err_policer_init:
+ kfree(policer_item);
+ return err;
+}
+
+static void
+devlink_trap_policer_unregister(struct devlink *devlink,
+ const struct devlink_trap_policer *policer)
+{
+ struct devlink_trap_policer_item *policer_item;
+
+ policer_item = devlink_trap_policer_item_lookup(devlink, policer->id);
+ if (WARN_ON_ONCE(!policer_item))
+ return;
+
+ devlink_trap_policer_notify(devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_DEL);
+ list_del(&policer_item->list);
+ if (devlink->ops->trap_policer_fini)
+ devlink->ops->trap_policer_fini(devlink, policer);
+ kfree(policer_item);
+}
+
+/**
+ * devlink_trap_policers_register - Register packet trap policers with devlink.
+ * @devlink: devlink.
+ * @policers: Packet trap policers.
+ * @policers_count: Count of provided packet trap policers.
+ *
+ * Return: Non-zero value on failure.
+ */
+int
+devlink_trap_policers_register(struct devlink *devlink,
+ const struct devlink_trap_policer *policers,
+ size_t policers_count)
+{
+ int i, err;
+
+ mutex_lock(&devlink->lock);
+ for (i = 0; i < policers_count; i++) {
+ const struct devlink_trap_policer *policer = &policers[i];
+
+ if (WARN_ON(policer->id == 0 ||
+ policer->max_rate < policer->min_rate ||
+ policer->max_burst < policer->min_burst)) {
+ err = -EINVAL;
+ goto err_trap_policer_verify;
+ }
+
+ err = devlink_trap_policer_register(devlink, policer);
+ if (err)
+ goto err_trap_policer_register;
+ }
+ mutex_unlock(&devlink->lock);
+
+ return 0;
+
+err_trap_policer_register:
+err_trap_policer_verify:
+ for (i--; i >= 0; i--)
+ devlink_trap_policer_unregister(devlink, &policers[i]);
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_trap_policers_register);
+
+/**
+ * devlink_trap_policers_unregister - Unregister packet trap policers from devlink.
+ * @devlink: devlink.
+ * @policers: Packet trap policers.
+ * @policers_count: Count of provided packet trap policers.
+ */
+void
+devlink_trap_policers_unregister(struct devlink *devlink,
+ const struct devlink_trap_policer *policers,
+ size_t policers_count)
+{
+ int i;
+
+ mutex_lock(&devlink->lock);
+ for (i = policers_count - 1; i >= 0; i--)
+ devlink_trap_policer_unregister(devlink, &policers[i]);
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_trap_policers_unregister);
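A hedged policer table sketch built only from the fields validated above (non-zero id, max_rate >= min_rate, max_burst >= min_burst, and the init_* values applied at registration). The rate unit of packets per second follows devlink's trap-policer documentation; the table and helper names are illustrative.

#include <linux/kernel.h>
#include <net/devlink.h>

static const struct devlink_trap_policer my_trap_policers[] = {
	{
		.id = 1,
		.init_rate = 1000,	/* packets per second (assumed unit) */
		.init_burst = 128,
		.max_rate = 8000,
		.min_rate = 1,
		.max_burst = 1024,
		.min_burst = 1,
	},
};

static int my_drv_trap_policers_init(struct devlink *devlink)
{
	return devlink_trap_policers_register(devlink, my_trap_policers,
					      ARRAY_SIZE(my_trap_policers));
}

static void my_drv_trap_policers_fini(struct devlink *devlink)
{
	devlink_trap_policers_unregister(devlink, my_trap_policers,
					 ARRAY_SIZE(my_trap_policers));
}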
+
static void __devlink_compat_running_version(struct devlink *devlink,
char *buf, size_t len)
{
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 31700e0c3928..8e33cec9fc4e 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -29,6 +29,7 @@
#include <net/drop_monitor.h>
#include <net/genetlink.h>
#include <net/netevent.h>
+#include <net/flow_offload.h>
#include <trace/events/skb.h>
#include <trace/events/napi.h>
@@ -67,7 +68,7 @@ struct net_dm_hw_entry {
struct net_dm_hw_entries {
u32 num_entries;
- struct net_dm_hw_entry entries[0];
+ struct net_dm_hw_entry entries[];
};
struct per_cpu_dm_data {
@@ -701,6 +702,13 @@ static void net_dm_packet_work(struct work_struct *work)
}
static size_t
+net_dm_flow_action_cookie_size(const struct net_dm_hw_metadata *hw_metadata)
+{
+ return hw_metadata->fa_cookie ?
+ nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0;
+}
+
+static size_t
net_dm_hw_packet_report_size(size_t payload_len,
const struct net_dm_hw_metadata *hw_metadata)
{
@@ -717,6 +725,8 @@ net_dm_hw_packet_report_size(size_t payload_len,
nla_total_size(strlen(hw_metadata->trap_name) + 1) +
/* NET_DM_ATTR_IN_PORT */
net_dm_in_port_size() +
+ /* NET_DM_ATTR_FLOW_ACTION_COOKIE */
+ net_dm_flow_action_cookie_size(hw_metadata) +
/* NET_DM_ATTR_TIMESTAMP */
nla_total_size(sizeof(u64)) +
/* NET_DM_ATTR_ORIG_LEN */
@@ -762,6 +772,12 @@ static int net_dm_hw_packet_report_fill(struct sk_buff *msg,
goto nla_put_failure;
}
+ if (hw_metadata->fa_cookie &&
+ nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE,
+ hw_metadata->fa_cookie->cookie_len,
+ hw_metadata->fa_cookie->cookie))
+ goto nla_put_failure;
+
if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP,
ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD))
goto nla_put_failure;
@@ -794,11 +810,12 @@ nla_put_failure:
static struct net_dm_hw_metadata *
net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata)
{
+ const struct flow_action_cookie *fa_cookie;
struct net_dm_hw_metadata *n_hw_metadata;
const char *trap_group_name;
const char *trap_name;
- n_hw_metadata = kmalloc(sizeof(*hw_metadata), GFP_ATOMIC);
+ n_hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC);
if (!n_hw_metadata)
return NULL;
@@ -812,12 +829,25 @@ net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata)
goto free_trap_group;
n_hw_metadata->trap_name = trap_name;
+ if (hw_metadata->fa_cookie) {
+ size_t cookie_size = sizeof(*fa_cookie) +
+ hw_metadata->fa_cookie->cookie_len;
+
+ fa_cookie = kmemdup(hw_metadata->fa_cookie, cookie_size,
+ GFP_ATOMIC);
+ if (!fa_cookie)
+ goto free_trap_name;
+ n_hw_metadata->fa_cookie = fa_cookie;
+ }
+
n_hw_metadata->input_dev = hw_metadata->input_dev;
if (n_hw_metadata->input_dev)
dev_hold(n_hw_metadata->input_dev);
return n_hw_metadata;
+free_trap_name:
+ kfree(trap_name);
free_trap_group:
kfree(trap_group_name);
free_hw_metadata:
@@ -830,6 +860,7 @@ net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata)
{
if (hw_metadata->input_dev)
dev_put(hw_metadata->input_dev);
+ kfree(hw_metadata->fa_cookie);
kfree(hw_metadata->trap_name);
kfree(hw_metadata->trap_group_name);
kfree(hw_metadata);
diff --git a/net/core/filter.c b/net/core/filter.c
index c180871e606d..7628b947dbc3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2642,6 +2642,19 @@ static const struct bpf_func_proto bpf_msg_pop_data_proto = {
.arg4_type = ARG_ANYTHING,
};
+#ifdef CONFIG_CGROUP_NET_CLASSID
+BPF_CALL_0(bpf_get_cgroup_classid_curr)
+{
+ return __task_get_classid(current);
+}
+
+static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
+ .func = bpf_get_cgroup_classid_curr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+#endif
+
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
return task_get_classid(skb);
@@ -3626,7 +3639,6 @@ err:
_trace_xdp_redirect_err(dev, xdp_prog, index, err);
return err;
}
-EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
@@ -4062,7 +4074,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
- if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
+ if (unlikely(!xdp ||
+ xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
return -EFAULT;
return bpf_event_output(map, flags, meta, meta_size, xdp->data,
@@ -4080,6 +4093,19 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+static int bpf_xdp_output_btf_ids[5];
+const struct bpf_func_proto bpf_xdp_output_proto = {
+ .func = bpf_xdp_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ .btf_id = bpf_xdp_output_btf_ids,
+};
+
BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
{
return skb->sk ? sock_gen_cookie(skb->sk) : 0;
@@ -4104,6 +4130,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
+{
+ return sock_gen_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
+ .func = bpf_get_socket_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
return sock_gen_cookie(ctx->sk);
@@ -4116,6 +4154,39 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static u64 __bpf_get_netns_cookie(struct sock *sk)
+{
+#ifdef CONFIG_NET_NS
+ return net_gen_cookie(sk ? sk->sk_net.net : &init_net);
+#else
+ return 0;
+#endif
+}
+
+BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
+ .func = bpf_get_netns_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
+ .func = bpf_get_netns_cookie_sock_addr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
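A hedged BPF-side sketch of the new helper as enabled for cgroup sock programs: read the owning netns cookie and act on it. The section name, the libbpf-style build, and the special-casing of cookie 1 (init_net is handed the first cookie during net_ns_init(), per the net_namespace.c hunk further down) are assumptions of the sketch.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/sock")
int sock_create_prog(struct bpf_sock *ctx)
{
	__u64 cookie = bpf_get_netns_cookie(ctx);

	/* Sockets created in the initial namespace get a fixed mark here;
	 * a NULL ctx would have been resolved to init_net in the kernel.
	 */
	if (cookie == 1)
		ctx->mark = 0x1;

	return 1;	/* allow the socket */
}

char _license[] SEC("license") = "GPL";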
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
struct sock *sk = sk_to_full_sk(skb->sk);
@@ -4134,8 +4205,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock,
- struct bpf_map *, map, u64, flags, void *, data, u64, size)
+BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags,
+ void *, data, u64, size)
{
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -4143,8 +4214,8 @@ BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock,
return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
}
-static const struct bpf_func_proto bpf_sockopt_event_output_proto = {
- .func = bpf_sockopt_event_output,
+static const struct bpf_func_proto bpf_event_output_data_proto = {
+ .func = bpf_event_output_data,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -5330,8 +5401,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
- /* Only full sockets have sk->sk_flags. */
- if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
+ if (sk_is_refcounted(sk))
sock_gen_put(sk);
return 0;
}
@@ -5847,6 +5917,36 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
.arg5_type = ARG_CONST_SIZE,
};
+BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
+{
+ if (flags != 0)
+ return -EINVAL;
+ if (!skb_at_tc_ingress(skb))
+ return -EOPNOTSUPP;
+ if (unlikely(dev_net(skb->dev) != sock_net(sk)))
+ return -ENETUNREACH;
+ if (unlikely(sk->sk_reuseport))
+ return -ESOCKTNOSUPPORT;
+ if (sk_is_refcounted(sk) &&
+ unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+ return -ENOENT;
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_pfree;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_assign_proto = {
+ .func = bpf_sk_assign,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_SOCK_COMMON,
+ .arg3_type = ARG_ANYTHING,
+};
+
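A hedged tc-ingress BPF sketch of the bpf_sk_assign() contract added above: look up a local TCP socket and steer the packet to it, releasing the reference afterwards. The hard-coded 127.0.0.1:4040 tuple and the section name are illustrative; the error codes in the comment mirror the helper body.

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("classifier")
int steer_to_local(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {};
	struct bpf_sock *sk;
	long err;

	tuple.ipv4.daddr = bpf_htonl(0x7f000001);	/* 127.0.0.1 */
	tuple.ipv4.dport = bpf_htons(4040);

	sk = bpf_skc_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
				BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		return TC_ACT_OK;

	/* -EOPNOTSUPP outside tc ingress, -ESOCKTNOSUPPORT for reuseport
	 * sockets, -ENETUNREACH across netns, as enforced in the helper.
	 */
	err = bpf_sk_assign(skb, sk, 0);
	bpf_sk_release(sk);

	return err ? TC_ACT_SHOT : TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";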
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
@@ -5941,6 +6041,26 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
default:
return bpf_base_func_proto(func_id);
}
@@ -5965,8 +6085,26 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_addr_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_addr_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_tcp:
return &bpf_sock_addr_sk_lookup_tcp_proto;
@@ -6140,6 +6278,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_ecn_set_ce_proto;
case BPF_FUNC_tcp_gen_syncookie:
return &bpf_tcp_gen_syncookie_proto;
+ case BPF_FUNC_sk_assign:
+ return &bpf_sk_assign_proto;
#endif
default:
return bpf_base_func_proto(func_id);
@@ -6209,7 +6349,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
case BPF_FUNC_perf_event_output:
- return &bpf_sockopt_event_output_proto;
+ return &bpf_event_output_data_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
@@ -7140,6 +7280,27 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ /* si->dst_reg = skb_shinfo(SKB); */
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, end));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, head));
+ *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+#else
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, end));
+#endif
+
+ return insn;
+}
+
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -7462,26 +7623,21 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct __sk_buff, gso_segs):
- /* si->dst_reg = skb_shinfo(SKB); */
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
- BPF_REG_AX, si->src_reg,
- offsetof(struct sk_buff, end));
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
- si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, head));
- *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
-#else
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
- si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, end));
-#endif
+ insn = bpf_convert_shinfo_access(si, insn);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
si->dst_reg, si->dst_reg,
bpf_target_off(struct skb_shared_info,
gso_segs, 2,
target_size));
break;
+ case offsetof(struct __sk_buff, gso_size):
+ insn = bpf_convert_shinfo_access(si, insn);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
+ si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ gso_size, 2,
+ target_size));
+ break;
case offsetof(struct __sk_buff, wire_len):
BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);
@@ -8620,6 +8776,7 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
struct bpf_map *, map, void *, key, u32, flags)
{
+ bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
struct sock_reuseport *reuse;
struct sock *selected_sk;
@@ -8628,26 +8785,20 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
return -ENOENT;
reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
- if (!reuse)
- /* selected_sk is unhashed (e.g. by close()) after the
- * above map_lookup_elem(). Treat selected_sk has already
- * been removed from the map.
+ if (!reuse) {
+ /* reuseport_array has only sk with non NULL sk_reuseport_cb.
+ * The only (!reuse) case here is - the sk has already been
+ * unhashed (e.g. by close()), so treat it as -ENOENT.
+ *
+ * Other maps (e.g. sock_map) do not provide this guarantee and
+ * the sk may never be in the reuseport group to begin with.
*/
- return -ENOENT;
+ return is_sockarray ? -ENOENT : -EINVAL;
+ }
if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
- struct sock *sk;
-
- if (unlikely(!reuse_kern->reuseport_id))
- /* There is a small race between adding the
- * sk to the map and setting the
- * reuse_kern->reuseport_id.
- * Treat it as the sk has not been added to
- * the bpf map yet.
- */
- return -ENOENT;
+ struct sock *sk = reuse_kern->sk;
- sk = reuse_kern->sk;
if (sk->sk_protocol != selected_sk->sk_protocol)
return -EPROTOTYPE;
else if (sk->sk_family != selected_sk->sk_family)
@@ -8835,10 +8986,9 @@ const struct bpf_prog_ops sk_reuseport_prog_ops = {
};
#endif /* CONFIG_INET */
-DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp)
+DEFINE_BPF_DISPATCHER(xdp)
void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
{
- bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp),
- prev_prog, prog);
+ bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index a1670dff0629..3eff84824c8b 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -920,9 +920,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
(int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
flow_keys->flags = flags;
- preempt_disable();
- result = BPF_PROG_RUN(prog, ctx);
- preempt_enable();
+ result = bpf_prog_run_pin_on_cpu(prog, ctx);
flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen);
flow_keys->thoff = clamp_t(u16, flow_keys->thoff,
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 45b6a59ac124..e951b743bed3 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -167,6 +167,34 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule,
}
EXPORT_SYMBOL(flow_rule_match_enc_opts);
+struct flow_action_cookie *flow_action_cookie_create(void *data,
+ unsigned int len,
+ gfp_t gfp)
+{
+ struct flow_action_cookie *cookie;
+
+ cookie = kmalloc(sizeof(*cookie) + len, gfp);
+ if (!cookie)
+ return NULL;
+ cookie->cookie_len = len;
+ memcpy(cookie->cookie, data, len);
+ return cookie;
+}
+EXPORT_SYMBOL(flow_action_cookie_create);
+
+void flow_action_cookie_destroy(struct flow_action_cookie *cookie)
+{
+ kfree(cookie);
+}
+EXPORT_SYMBOL(flow_action_cookie_destroy);
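A hedged sketch of the intended use of the new create/destroy pair: a driver copies the user's tc action cookie when offloading a rule so it can later be echoed back (for example via devlink_trap_report() above). struct my_rule is illustrative, and act->cookie is assumed to be the per-action cookie pointer added elsewhere in this series.

#include <linux/gfp.h>
#include <net/flow_offload.h>

struct my_rule {
	struct flow_action_cookie *fa_cookie;
};

static int my_drv_save_cookie(struct my_rule *rule,
			      const struct flow_action_entry *act)
{
	if (!act->cookie)
		return 0;

	rule->fa_cookie = flow_action_cookie_create(act->cookie->cookie,
						    act->cookie->cookie_len,
						    GFP_KERNEL);
	return rule->fa_cookie ? 0 : -ENOMEM;
}

static void my_drv_free_cookie(struct my_rule *rule)
{
	/* NULL-safe, like the kfree() it wraps. */
	flow_action_cookie_destroy(rule->fa_cookie);
	rule->fa_cookie = NULL;
}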
+
+void flow_rule_match_ct(const struct flow_rule *rule,
+ struct flow_match_ct *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CT, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ct);
+
struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb,
void *cb_ident, void *cb_priv,
void (*release)(void *cb_priv))
@@ -483,7 +511,8 @@ EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister);
void flow_indr_block_call(struct net_device *dev,
struct flow_block_offload *bo,
- enum flow_block_command command)
+ enum flow_block_command command,
+ enum tc_setup_type type)
{
struct flow_indr_block_cb *indr_block_cb;
struct flow_indr_block_dev *indr_dev;
@@ -493,8 +522,7 @@ void flow_indr_block_call(struct net_device *dev,
return;
list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
- bo);
+ indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo);
}
EXPORT_SYMBOL_GPL(flow_indr_block_call);
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 99a6de52b21d..7d3438215f32 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -367,7 +367,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
[LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
};
-static int bpf_build_state(struct nlattr *nla,
+static int bpf_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 2f9c0de533c7..8ec7d13d2860 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "BPF";
case LWTUNNEL_ENCAP_SEG6_LOCAL:
return "SEG6LOCAL";
+ case LWTUNNEL_ENCAP_RPL:
+ return "RPL";
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
@@ -98,7 +100,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
}
EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops);
-int lwtunnel_build_state(u16 encap_type,
+int lwtunnel_build_state(struct net *net, u16 encap_type,
struct nlattr *encap, unsigned int family,
const void *cfg, struct lwtunnel_state **lws,
struct netlink_ext_ack *extack)
@@ -122,7 +124,7 @@ int lwtunnel_build_state(u16 encap_type,
rcu_read_unlock();
if (found) {
- ret = ops->build_state(encap, family, cfg, lws, extack);
+ ret = ops->build_state(net, encap, family, cfg, lws, extack);
if (ret)
module_put(ops->owner);
} else {
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 789a73aa7bd8..5bf8d22a47ec 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -3553,9 +3553,6 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \
NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies)
-#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \
- NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
-
#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \
NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 4c826b8bf9b1..cf0215734ceb 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -944,6 +944,24 @@ err:
kobject_put(kobj);
return error;
}
+
+static int rx_queue_change_owner(struct net_device *dev, int index, kuid_t kuid,
+ kgid_t kgid)
+{
+ struct netdev_rx_queue *queue = dev->_rx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error;
+
+ error = sysfs_change_owner(kobj, kuid, kgid);
+ if (error)
+ return error;
+
+ if (dev->sysfs_rx_queue_group)
+ error = sysfs_group_change_owner(
+ kobj, dev->sysfs_rx_queue_group, kuid, kgid);
+
+ return error;
+}
#endif /* CONFIG_SYSFS */
int
@@ -981,6 +999,29 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
#endif
}
+static int net_rx_queue_change_owner(struct net_device *dev, int num,
+ kuid_t kuid, kgid_t kgid)
+{
+#ifdef CONFIG_SYSFS
+ int error = 0;
+ int i;
+
+#ifndef CONFIG_RPS
+ if (!dev->sysfs_rx_queue_group)
+ return 0;
+#endif
+ for (i = 0; i < num; i++) {
+ error = rx_queue_change_owner(dev, i, kuid, kgid);
+ if (error)
+ break;
+ }
+
+ return error;
+#else
+ return 0;
+#endif
+}
+
#ifdef CONFIG_SYSFS
/*
* netdev_queue sysfs structures and functions.
@@ -1486,6 +1527,23 @@ err:
kobject_put(kobj);
return error;
}
+
+static int tx_queue_change_owner(struct net_device *ndev, int index,
+ kuid_t kuid, kgid_t kgid)
+{
+ struct netdev_queue *queue = ndev->_tx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error;
+
+ error = sysfs_change_owner(kobj, kuid, kgid);
+ if (error)
+ return error;
+
+#ifdef CONFIG_BQL
+ error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid);
+#endif
+ return error;
+}
#endif /* CONFIG_SYSFS */
int
@@ -1520,6 +1578,25 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
#endif /* CONFIG_SYSFS */
}
+static int net_tx_queue_change_owner(struct net_device *dev, int num,
+ kuid_t kuid, kgid_t kgid)
+{
+#ifdef CONFIG_SYSFS
+ int error = 0;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ error = tx_queue_change_owner(dev, i, kuid, kgid);
+ if (error)
+ break;
+ }
+
+ return error;
+#else
+ return 0;
+#endif /* CONFIG_SYSFS */
+}
+
static int register_queue_kobjects(struct net_device *dev)
{
int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
@@ -1554,6 +1631,31 @@ error:
return error;
}
+static int queue_change_owner(struct net_device *ndev, kuid_t kuid, kgid_t kgid)
+{
+ int error = 0, real_rx = 0, real_tx = 0;
+
+#ifdef CONFIG_SYSFS
+ if (ndev->queues_kset) {
+ error = sysfs_change_owner(&ndev->queues_kset->kobj, kuid, kgid);
+ if (error)
+ return error;
+ }
+ real_rx = ndev->real_num_rx_queues;
+#endif
+ real_tx = ndev->real_num_tx_queues;
+
+ error = net_rx_queue_change_owner(ndev, real_rx, kuid, kgid);
+ if (error)
+ return error;
+
+ error = net_tx_queue_change_owner(ndev, real_tx, kuid, kgid);
+ if (error)
+ return error;
+
+ return 0;
+}
+
static void remove_queue_kobjects(struct net_device *dev)
{
int real_rx = 0, real_tx = 0;
@@ -1767,6 +1869,37 @@ int netdev_register_kobject(struct net_device *ndev)
return error;
}
+/* Change owner for sysfs entries when moving network devices across network
+ * namespaces owned by different user namespaces.
+ */
+int netdev_change_owner(struct net_device *ndev, const struct net *net_old,
+ const struct net *net_new)
+{
+ struct device *dev = &ndev->dev;
+ kuid_t old_uid, new_uid;
+ kgid_t old_gid, new_gid;
+ int error;
+
+ net_ns_get_ownership(net_old, &old_uid, &old_gid);
+ net_ns_get_ownership(net_new, &new_uid, &new_gid);
+
+ /* The network namespace was changed but the owning user namespace is
+ * identical so there's no need to change the owner of sysfs entries.
+ */
+ if (uid_eq(old_uid, new_uid) && gid_eq(old_gid, new_gid))
+ return 0;
+
+ error = device_change_owner(dev, new_uid, new_gid);
+ if (error)
+ return error;
+
+ error = queue_change_owner(ndev, new_uid, new_gid);
+ if (error)
+ return error;
+
+ return 0;
+}
+
int netdev_class_create_file_ns(const struct class_attribute *class_attr,
const void *ns)
{
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index 006876c7b78d..8a5b04c2699a 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -8,5 +8,7 @@ void netdev_unregister_kobject(struct net_device *);
int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
int netdev_queue_update_kobjects(struct net_device *net,
int old_num, int new_num);
+int netdev_change_owner(struct net_device *, const struct net *net_old,
+ const struct net *net_new);
#endif
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 757cc1d084e7..190ca66a383b 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
+static atomic64_t cookie_gen;
+
+u64 net_gen_cookie(struct net *net)
+{
+ while (1) {
+ u64 res = atomic64_read(&net->net_cookie);
+
+ if (res)
+ return res;
+ res = atomic64_inc_return(&cookie_gen);
+ atomic64_cmpxchg(&net->net_cookie, 0, res);
+ }
+}
+
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
@@ -1087,6 +1101,7 @@ static int __init net_ns_init(void)
panic("Could not allocate generic netns");
rcu_assign_pointer(init_net.gen, ng);
+ net_gen_cookie(&init_net);
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net, &init_user_ns))
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 10d2b255df5e..ef98372facf6 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -43,9 +43,11 @@ static int page_pool_init(struct page_pool *pool,
* DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
* which is the XDP_TX use-case.
*/
- if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
- (pool->p.dma_dir != DMA_BIDIRECTIONAL))
- return -EINVAL;
+ if (pool->p.flags & PP_FLAG_DMA_MAP) {
+ if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+ (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+ return -EINVAL;
+ }
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
/* In order to request DMA-sync-for-device the page
@@ -96,7 +98,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
}
EXPORT_SYMBOL(page_pool_create);
-static void __page_pool_return_page(struct page_pool *pool, struct page *page);
+static void page_pool_return_page(struct page_pool *pool, struct page *page);
noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
@@ -136,7 +138,7 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
* (2) break out to fallthrough to alloc_pages_node.
* This limits stress on the page buddy allocator.
*/
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
page = NULL;
break;
}
@@ -274,18 +276,25 @@ static s32 page_pool_inflight(struct page_pool *pool)
return inflight;
}
-/* Cleanup page_pool state from page */
-static void __page_pool_clean_page(struct page_pool *pool,
- struct page *page)
+/* Disconnects a page (from a page_pool). API users can have a need
+ * to disconnect a page (from a page_pool), to allow it to be used as
+ * a regular page (that will eventually be returned to the normal
+ * page-allocator via put_page).
+ */
+void page_pool_release_page(struct page_pool *pool, struct page *page)
{
dma_addr_t dma;
int count;
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ /* Always account for inflight pages, even if we didn't
+ * map them
+ */
goto skip_dma_unmap;
dma = page->dma_addr;
- /* DMA unmap */
+
+ /* When page is unmapped, it cannot be returned to our pool */
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC);
@@ -297,21 +306,12 @@ skip_dma_unmap:
count = atomic_inc_return(&pool->pages_state_release_cnt);
trace_page_pool_state_release(pool, page, count);
}
-
-/* unmap the page and clean our state */
-void page_pool_unmap_page(struct page_pool *pool, struct page *page)
-{
- /* When page is unmapped, this implies page will not be
- * returned to page_pool.
- */
- __page_pool_clean_page(pool, page);
-}
-EXPORT_SYMBOL(page_pool_unmap_page);
+EXPORT_SYMBOL(page_pool_release_page);
/* Return a page to the page allocator, cleaning up our state */
-static void __page_pool_return_page(struct page_pool *pool, struct page *page)
+static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
- __page_pool_clean_page(pool, page);
+ page_pool_release_page(pool, page);
put_page(page);
/* An optimization would be to call __free_pages(page, pool->p.order)
@@ -320,8 +320,7 @@ static void __page_pool_return_page(struct page_pool *pool, struct page *page)
*/
}
-static bool __page_pool_recycle_into_ring(struct page_pool *pool,
- struct page *page)
+static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
int ret;
/* BH protection not needed if current is serving softirq */
@@ -338,7 +337,7 @@ static bool __page_pool_recycle_into_ring(struct page_pool *pool,
*
* Caller must provide appropriate safe context.
*/
-static bool __page_pool_recycle_direct(struct page *page,
+static bool page_pool_recycle_in_cache(struct page *page,
struct page_pool *pool)
{
if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
@@ -357,8 +356,14 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page)
return !page_is_pfmemalloc(page);
}
-void __page_pool_put_page(struct page_pool *pool, struct page *page,
- unsigned int dma_sync_size, bool allow_direct)
+/* If the page refcnt == 1, this will try to recycle the page.
+ * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
+ * the configured size min(dma_sync_size, pool->max_len).
+ * If the page refcnt != 1, then the page will be returned to the memory
+ * subsystem.
+ */
+void page_pool_put_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
{
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but has fallbacks that act like the
@@ -375,12 +380,12 @@ void __page_pool_put_page(struct page_pool *pool, struct page *page,
dma_sync_size);
if (allow_direct && in_serving_softirq())
- if (__page_pool_recycle_direct(page, pool))
+ if (page_pool_recycle_in_cache(page, pool))
return;
- if (!__page_pool_recycle_into_ring(pool, page)) {
+ if (!page_pool_recycle_in_ring(pool, page)) {
/* Cache full, fallback to free pages */
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
}
return;
}
@@ -397,12 +402,13 @@ void __page_pool_put_page(struct page_pool *pool, struct page *page,
* doing refcnt based recycle tricks, meaning another process
* will be invoking put_page.
*/
- __page_pool_clean_page(pool, page);
+ /* Do not replace this with page_pool_return_page() */
+ page_pool_release_page(pool, page);
put_page(page);
}
-EXPORT_SYMBOL(__page_pool_put_page);
+EXPORT_SYMBOL(page_pool_put_page);
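A hedged driver RX sketch of the renamed entry points: page_pool_put_page() recycles (direct cache when running in the pool's NAPI context, ptr_ring otherwise), while page_pool_release_page() is the old clean/unmap path for pages handed off as ordinary pages. The function and its parameters are illustrative.

#include <linux/bpf.h>
#include <net/page_pool.h>

static void my_rx_handle_page(struct page_pool *pool, struct page *page,
			      u32 act, unsigned int pkt_len, bool in_napi)
{
	switch (act) {
	case XDP_DROP:
		/* Recycle the page; only the packet area needs a
		 * DMA sync-for-device when PP_FLAG_DMA_SYNC_DEV is set.
		 */
		page_pool_put_page(pool, page, pkt_len, in_napi);
		break;
	case XDP_PASS:
		/* Handing the page to the network stack as a regular page:
		 * unmap it and drop it from the pool's accounting first.
		 */
		page_pool_release_page(pool, page);
		break;
	}
}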
-static void __page_pool_empty_ring(struct page_pool *pool)
+static void page_pool_empty_ring(struct page_pool *pool)
{
struct page *page;
@@ -413,7 +419,7 @@ static void __page_pool_empty_ring(struct page_pool *pool)
pr_crit("%s() page_pool refcnt %d violation\n",
__func__, page_ref_count(page));
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
}
}
@@ -443,7 +449,7 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
*/
while (pool->alloc.count) {
page = pool->alloc.cache[--pool->alloc.count];
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
}
}
@@ -455,7 +461,7 @@ static void page_pool_scrub(struct page_pool *pool)
/* No more consumers should exist, but producers could still
* be in-flight.
*/
- __page_pool_empty_ring(pool);
+ page_pool_empty_ring(pool);
}
static int page_pool_release(struct page_pool *pool)
@@ -529,7 +535,7 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
/* Flush pool alloc cache, as refill will check NUMA node */
while (pool->alloc.count) {
page = pool->alloc.cache[--pool->alloc.count];
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
}
}
EXPORT_SYMBOL(page_pool_update_nid);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index d0641bba6b81..08e2811b5274 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2003,8 +2003,8 @@ static int pktgen_setup_dev(const struct pktgen_net *pn,
return -ENODEV;
}
- if (odev->type != ARPHRD_ETHER) {
- pr_err("not an ethernet device: \"%s\"\n", ifname);
+ if (odev->type != ARPHRD_ETHER && odev->type != ARPHRD_LOOPBACK) {
+ pr_err("not an ethernet or loopback device: \"%s\"\n", ifname);
err = -EINVAL;
} else if (!netif_running(odev)) {
pr_err("device is down: \"%s\"\n", ifname);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e1152f4ffe33..709ebbf8ab5b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1872,7 +1872,9 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
};
static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
+ [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD },
[IFLA_XDP_FD] = { .type = NLA_S32 },
+ [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 },
[IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
[IFLA_XDP_FLAGS] = { .type = NLA_U32 },
[IFLA_XDP_PROG_ID] = { .type = NLA_U32 },
@@ -2799,8 +2801,20 @@ static int do_setlink(const struct sk_buff *skb,
}
if (xdp[IFLA_XDP_FD]) {
+ int expected_fd = -1;
+
+ if (xdp_flags & XDP_FLAGS_REPLACE) {
+ if (!xdp[IFLA_XDP_EXPECTED_FD]) {
+ err = -EINVAL;
+ goto errout;
+ }
+ expected_fd =
+ nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]);
+ }
+
err = dev_change_xdp_fd(dev, extack,
nla_get_s32(xdp[IFLA_XDP_FD]),
+ expected_fd,
xdp_flags);
if (err)
goto errout;
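A heavily hedged userspace companion sketch: with XDP_FLAGS_REPLACE set, the kernel above rejects the request unless IFLA_XDP_EXPECTED_FD carries the fd of the program it is supposed to replace. The libbpf _opts variant and its old_fd field are assumed to be the matching userspace interface from the same series; names are illustrative.

#include <bpf/libbpf.h>
#include <linux/if_link.h>

static int replace_xdp_prog(int ifindex, int new_prog_fd, int old_prog_fd)
{
	DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts,
			    .old_fd = old_prog_fd);

	/* Fails with -EEXIST if the currently attached program is not
	 * old_prog_fd, instead of silently replacing it.
	 */
	return bpf_set_link_xdp_fd_opts(ifindex, new_prog_fd,
					XDP_FLAGS_REPLACE, &opts);
}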
@@ -3909,7 +3923,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Support fdb on master device the net/bridge default case */
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
- (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
const struct net_device_ops *ops = br_dev->netdev_ops;
@@ -4020,7 +4034,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Support fdb on master device the net/bridge default case */
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
- (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
const struct net_device_ops *ops = br_dev->netdev_ops;
@@ -4246,13 +4260,13 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
continue;
if (!br_idx) { /* user did not specify a specific bridge */
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (netif_is_bridge_port(dev)) {
br_dev = netdev_master_upper_dev_get(dev);
cops = br_dev->netdev_ops;
}
} else {
if (dev != br_dev &&
- !(dev->priv_flags & IFF_BRIDGE_PORT))
+ !netif_is_bridge_port(dev))
continue;
if (br_dev != netdev_master_upper_dev_get(dev) &&
@@ -4264,7 +4278,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (idx < s_idx)
goto cont;
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (netif_is_bridge_port(dev)) {
if (cops && cops->ndo_fdb_dump) {
err = cops->ndo_fdb_dump(skb, cb,
br_dev, dev,
@@ -4414,7 +4428,7 @@ static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (dev) {
if (!ndm_flags || (ndm_flags & NTF_MASTER)) {
- if (!(dev->priv_flags & IFF_BRIDGE_PORT)) {
+ if (!netif_is_bridge_port(dev)) {
NL_SET_ERR_MSG(extack, "Device is not a bridge port");
return -EINVAL;
}
@@ -4553,7 +4567,11 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
brport_nla_put_flag(skb, flags, mask,
IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
brport_nla_put_flag(skb, flags, mask,
- IFLA_BRPORT_PROXYARP, BR_PROXYARP)) {
+ IFLA_BRPORT_PROXYARP, BR_PROXYARP) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) {
nla_nest_cancel(skb, protinfo);
goto nla_put_failure;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e1101a4f90a6..7e29590482ce 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3668,6 +3668,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
skb_push(nskb, -skb_network_offset(nskb) + offset);
+ skb_release_head_state(nskb);
__copy_skb_header(nskb, skb);
skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
@@ -3926,14 +3927,21 @@ normal:
goto perform_csum_check;
if (!sg) {
- if (!nskb->remcsum_offload)
- nskb->ip_summed = CHECKSUM_NONE;
- SKB_GSO_CB(nskb)->csum =
- skb_copy_and_csum_bits(head_skb, offset,
- skb_put(nskb, len),
- len, 0);
- SKB_GSO_CB(nskb)->csum_start =
- skb_headroom(nskb) + doffset;
+ if (!csum) {
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_copy_and_csum_bits(head_skb, offset,
+ skb_put(nskb,
+ len),
+ len, 0);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ } else {
+ skb_copy_bits(head_skb, offset,
+ skb_put(nskb, len),
+ len);
+ }
continue;
}
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index ded2d5227678..c479372f2cd2 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -512,7 +512,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
refcount_set(&psock->refcnt, 1);
- rcu_assign_sk_user_data(sk, psock);
+ rcu_assign_sk_user_data_nocopy(sk, psock);
sock_hold(sk);
return psock;
@@ -628,7 +628,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
struct bpf_prog *prog;
int ret;
- preempt_disable();
rcu_read_lock();
prog = READ_ONCE(psock->progs.msg_parser);
if (unlikely(!prog)) {
@@ -638,7 +637,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
sk_msg_compute_data_pointers(msg);
msg->sk = sk;
- ret = BPF_PROG_RUN(prog, msg);
+ ret = bpf_prog_run_pin_on_cpu(prog, msg);
ret = sk_psock_map_verd(ret, msg->sk_redir);
psock->apply_bytes = msg->apply_bytes;
if (ret == __SK_REDIRECT) {
@@ -653,7 +652,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
}
out:
rcu_read_unlock();
- preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
@@ -665,9 +663,7 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
skb->sk = psock->sk;
bpf_compute_data_end_sk_skb(skb);
- preempt_disable();
- ret = BPF_PROG_RUN(prog, skb);
- preempt_enable();
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
/* strparser clones the skb before handing it to an upper layer,
* meaning skb_orphan has been called. We NULL sk on the way out
* to ensure we don't trigger a BUG_ON() in skb/sk operations
diff --git a/net/core/sock.c b/net/core/sock.c
index 8f71684305c3..da32d9b6d09f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1572,13 +1572,14 @@ static inline void sock_lock_init(struct sock *sk)
*/
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
+ const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
void *sptr = nsk->sk_security;
#endif
memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
- osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+ prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
#ifdef CONFIG_SECURITY_NETWORK
nsk->sk_security = sptr;
@@ -1792,16 +1793,17 @@ static void sk_init_common(struct sock *sk)
*/
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
+ struct proto *prot = READ_ONCE(sk->sk_prot);
struct sock *newsk;
bool is_charged = true;
- newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
+ newsk = sk_prot_alloc(prot, priority, sk->sk_family);
if (newsk != NULL) {
struct sk_filter *filter;
sock_copy(newsk, sk);
- newsk->sk_prot_creator = sk->sk_prot;
+ newsk->sk_prot_creator = prot;
/* SANITY */
if (likely(newsk->sk_net_refcnt))
@@ -1866,6 +1868,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
goto out;
}
+ /* Clear sk_user_data if parent had the pointer tagged
+ * as not suitable for copying when cloning.
+ */
+ if (sk_user_data_is_nocopy(newsk))
+ RCU_INIT_POINTER(newsk->sk_user_data, NULL);
+
newsk->sk_err = 0;
newsk->sk_err_soft = 0;
newsk->sk_priority = 0;
@@ -2063,6 +2071,18 @@ void sock_efree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_efree);
+/* Buffer destructor for prefetch/receive path where reference count may
+ * not be held, e.g. for listen sockets.
+ */
+#ifdef CONFIG_INET
+void sock_pfree(struct sk_buff *skb)
+{
+ if (sk_is_refcounted(skb->sk))
+ sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_pfree);
+#endif /* CONFIG_INET */
+
kuid_t sock_i_uid(struct sock *sk)
{
kuid_t uid;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index b70c844a88ec..b08dfae10f88 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -10,6 +10,8 @@
#include <linux/skmsg.h>
#include <linux/list.h>
#include <linux/jhash.h>
+#include <linux/sock_diag.h>
+#include <net/udp.h>
struct bpf_stab {
struct bpf_map map;
@@ -31,7 +33,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
return ERR_PTR(-EPERM);
if (attr->max_entries == 0 ||
attr->key_size != 4 ||
- attr->value_size != 4 ||
+ (attr->value_size != sizeof(u32) &&
+ attr->value_size != sizeof(u64)) ||
attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
@@ -139,12 +142,58 @@ static void sock_map_unref(struct sock *sk, void *link_raw)
}
}
+static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
+{
+ struct proto *prot;
+
+ sock_owned_by_me(sk);
+
+ switch (sk->sk_type) {
+ case SOCK_STREAM:
+ prot = tcp_bpf_get_proto(sk, psock);
+ break;
+
+ case SOCK_DGRAM:
+ prot = udp_bpf_get_proto(sk, psock);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (IS_ERR(prot))
+ return PTR_ERR(prot);
+
+ sk_psock_update_proto(sk, psock, prot);
+ return 0;
+}
+
+static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
+{
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock) {
+ if (sk->sk_prot->close != sock_map_close) {
+ psock = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ if (!refcount_inc_not_zero(&psock->refcnt))
+ psock = ERR_PTR(-EBUSY);
+ }
+out:
+ rcu_read_unlock();
+ return psock;
+}
+
static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
struct sock *sk)
{
struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
- bool skb_progs, sk_psock_is_new = false;
struct sk_psock *psock;
+ bool skb_progs;
int ret;
skb_verdict = READ_ONCE(progs->skb_verdict);
@@ -170,7 +219,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
}
}
- psock = sk_psock_get_checked(sk);
+ psock = sock_map_psock_get_checked(sk);
if (IS_ERR(psock)) {
ret = PTR_ERR(psock);
goto out_progs;
@@ -189,18 +238,14 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
ret = -ENOMEM;
goto out_progs;
}
- sk_psock_is_new = true;
}
if (msg_parser)
psock_set_prog(&psock->progs.msg_parser, msg_parser);
- if (sk_psock_is_new) {
- ret = tcp_bpf_init(sk);
- if (ret < 0)
- goto out_drop;
- } else {
- tcp_bpf_reinit(sk);
- }
+
+ ret = sock_map_init_proto(sk, psock);
+ if (ret < 0)
+ goto out_drop;
write_lock_bh(&sk->sk_callback_lock);
if (skb_progs && !psock->parser.enabled) {
@@ -228,6 +273,27 @@ out:
return ret;
}
+static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk)
+{
+ struct sk_psock *psock;
+ int ret;
+
+ psock = sock_map_psock_get_checked(sk);
+ if (IS_ERR(psock))
+ return PTR_ERR(psock);
+
+ if (!psock) {
+ psock = sk_psock_init(sk, map->numa_node);
+ if (!psock)
+ return -ENOMEM;
+ }
+
+ ret = sock_map_init_proto(sk, psock);
+ if (ret < 0)
+ sk_psock_put(sk, psock);
+ return ret;
+}
+
static void sock_map_free(struct bpf_map *map)
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
@@ -277,7 +343,22 @@ static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
static void *sock_map_lookup(struct bpf_map *map, void *key)
{
- return ERR_PTR(-EOPNOTSUPP);
+ return __sock_map_lookup_elem(map, *(u32 *)key);
+}
+
+static void *sock_map_lookup_sys(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ if (map->value_size != sizeof(u64))
+ return ERR_PTR(-ENOSPC);
+
+ sk = __sock_map_lookup_elem(map, *(u32 *)key);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ sock_gen_cookie(sk);
+ return &sk->sk_cookie;
}
static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
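
The new sys-only lookup above returns the socket cookie rather than a kernel pointer, and it insists on value_size == sizeof(u64). From user space that means an 8-byte read, e.g. this sketch using the long-standing libbpf wrapper bpf_map_lookup_elem() (dump_cookie() is hypothetical):

#include <bpf/bpf.h>
#include <stdio.h>

static void dump_cookie(int map_fd, __u32 idx)
{
	__u64 cookie;

	/* Served by .map_lookup_elem_sys_only, so the value read back is
	 * the socket cookie, never a raw struct sock pointer.
	 */
	if (!bpf_map_lookup_elem(map_fd, &idx, &cookie))
		printf("slot %u -> cookie %llu\n", idx,
		       (unsigned long long)cookie);
}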
@@ -336,11 +417,15 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
return 0;
}
+static bool sock_map_redirect_allowed(const struct sock *sk)
+{
+ return sk->sk_state != TCP_LISTEN;
+}
+
static int sock_map_update_common(struct bpf_map *map, u32 idx,
struct sock *sk, u64 flags)
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
- struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_psock_link *link;
struct sk_psock *psock;
struct sock *osk;
@@ -351,14 +436,21 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
return -EINVAL;
if (unlikely(idx >= map->max_entries))
return -E2BIG;
- if (unlikely(rcu_access_pointer(icsk->icsk_ulp_data)))
+ if (inet_csk_has_ulp(sk))
return -EINVAL;
link = sk_psock_init_link();
if (!link)
return -ENOMEM;
- ret = sock_map_link(map, &stab->progs, sk);
+ /* Only sockets we can redirect into/from in BPF need to hold
+ * refs to parser/verdict progs and have their sk_data_ready
+ * and sk_write_space callbacks overridden.
+ */
+ if (sock_map_redirect_allowed(sk))
+ ret = sock_map_link(map, &stab->progs, sk);
+ else
+ ret = sock_map_link_no_progs(map, sk);
if (ret < 0)
goto out_free;
@@ -393,23 +485,52 @@ out_free:
static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
{
return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
- ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
+ ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
+ ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB;
}
-static bool sock_map_sk_is_suitable(const struct sock *sk)
+static bool sk_is_tcp(const struct sock *sk)
{
return sk->sk_type == SOCK_STREAM &&
sk->sk_protocol == IPPROTO_TCP;
}
+static bool sk_is_udp(const struct sock *sk)
+{
+ return sk->sk_type == SOCK_DGRAM &&
+ sk->sk_protocol == IPPROTO_UDP;
+}
+
+static bool sock_map_sk_is_suitable(const struct sock *sk)
+{
+ return sk_is_tcp(sk) || sk_is_udp(sk);
+}
+
+static bool sock_map_sk_state_allowed(const struct sock *sk)
+{
+ if (sk_is_tcp(sk))
+ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
+ else if (sk_is_udp(sk))
+ return sk_hashed(sk);
+
+ return false;
+}
+
static int sock_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
- u32 ufd = *(u32 *)value;
u32 idx = *(u32 *)key;
struct socket *sock;
struct sock *sk;
int ret;
+ u64 ufd;
+
+ if (map->value_size == sizeof(u64))
+ ufd = *(u64 *)value;
+ else
+ ufd = *(u32 *)value;
+ if (ufd > S32_MAX)
+ return -EINVAL;
sock = sockfd_lookup(ufd, &ret);
if (!sock)
@@ -425,7 +546,7 @@ static int sock_map_update_elem(struct bpf_map *map, void *key,
}
sock_map_sk_acquire(sk);
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (!sock_map_sk_state_allowed(sk))
ret = -EOPNOTSUPP;
else
ret = sock_map_update_common(map, idx, sk, flags);
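
With sock_map_sk_state_allowed() accepting TCPF_LISTEN, a listening TCP socket can now be inserted from user space like any established one. A sketch under the assumption that the map was created with value_size = 8 (add_listener() is hypothetical):

#include <bpf/bpf.h>

static int add_listener(int map_fd, __u32 idx, int listen_fd)
{
	__u64 value = listen_fd;	/* 8-byte value: fd widened to u64 */

	return bpf_map_update_elem(map_fd, &idx, &value, BPF_NOEXIST);
}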
@@ -462,13 +583,17 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct bpf_map *, map, u32, key, u64, flags)
{
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+ struct sock *sk;
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
- if (!tcb->bpf.sk_redir)
+
+ sk = __sock_map_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+
+ tcb->bpf.flags = flags;
+ tcb->bpf.sk_redir = sk;
return SK_PASS;
}
@@ -485,12 +610,17 @@ const struct bpf_func_proto bpf_sk_redirect_map_proto = {
BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
struct bpf_map *, map, u32, key, u64, flags)
{
+ struct sock *sk;
+
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- msg->flags = flags;
- msg->sk_redir = __sock_map_lookup_elem(map, key);
- if (!msg->sk_redir)
+
+ sk = __sock_map_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = sk;
return SK_PASS;
}
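
Both redirect helpers now refuse a target that fails sock_map_redirect_allowed(), i.e. a listening socket, instead of blindly storing it in the control block. A BPF-side sketch of the skb path (the map layout and program below are illustrative assumptions, not taken from this patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_SOCKMAP);
	__uint(max_entries, 2);
	__type(key, __u32);
	__type(value, __u64);
} sock_map SEC(".maps");

SEC("sk_skb/stream_verdict")
int prog_stream_verdict(struct __sk_buff *skb)
{
	__u32 idx = 0;

	/* SK_DROP if slot 0 is empty or holds a listening socket. */
	return bpf_sk_redirect_map(skb, &sock_map, idx, 0);
}

char _license[] SEC("license") = "GPL";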
@@ -508,6 +638,7 @@ const struct bpf_map_ops sock_map_ops = {
.map_alloc = sock_map_alloc,
.map_free = sock_map_free,
.map_get_next_key = sock_map_get_next_key,
+ .map_lookup_elem_sys_only = sock_map_lookup_sys,
.map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_map_delete_elem,
.map_lookup_elem = sock_map_lookup,
@@ -520,7 +651,7 @@ struct bpf_htab_elem {
u32 hash;
struct sock *sk;
struct hlist_node node;
- u8 key[0];
+ u8 key[];
};

struct bpf_htab_bucket {
@@ -664,7 +795,6 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
struct sock *sk, u64 flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct inet_connection_sock *icsk = inet_csk(sk);
u32 key_size = map->key_size, hash;
struct bpf_htab_elem *elem, *elem_new;
struct bpf_htab_bucket *bucket;
@@ -675,14 +805,21 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
WARN_ON_ONCE(!rcu_read_lock_held());
if (unlikely(flags > BPF_EXIST))
return -EINVAL;
- if (unlikely(icsk->icsk_ulp_data))
+ if (inet_csk_has_ulp(sk))
return -EINVAL;
link = sk_psock_init_link();
if (!link)
return -ENOMEM;
- ret = sock_map_link(map, &htab->progs, sk);
+ /* Only sockets we can redirect into/from in BPF need to hold
+ * refs to parser/verdict progs and have their sk_data_ready
+ * and sk_write_space callbacks overridden.
+ */
+ if (sock_map_redirect_allowed(sk))
+ ret = sock_map_link(map, &htab->progs, sk);
+ else
+ ret = sock_map_link_no_progs(map, sk);
if (ret < 0)
goto out_free;
@@ -731,10 +868,17 @@ out_free:
static int sock_hash_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
- u32 ufd = *(u32 *)value;
struct socket *sock;
struct sock *sk;
int ret;
+ u64 ufd;
+
+ if (map->value_size == sizeof(u64))
+ ufd = *(u64 *)value;
+ else
+ ufd = *(u32 *)value;
+ if (ufd > S32_MAX)
+ return -EINVAL;
sock = sockfd_lookup(ufd, &ret);
if (!sock)
@@ -750,7 +894,7 @@ static int sock_hash_update_elem(struct bpf_map *map, void *key,
}
sock_map_sk_acquire(sk);
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (!sock_map_sk_state_allowed(sk))
ret = -EOPNOTSUPP;
else
ret = sock_hash_update_common(map, key, sk, flags);
@@ -810,7 +954,8 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
return ERR_PTR(-EPERM);
if (attr->max_entries == 0 ||
attr->key_size == 0 ||
- attr->value_size != 4 ||
+ (attr->value_size != sizeof(u32) &&
+ attr->value_size != sizeof(u64)) ||
attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
if (attr->key_size > MAX_BPF_STACK)
@@ -889,6 +1034,26 @@ static void sock_hash_free(struct bpf_map *map)
kfree(htab);
}
+static void *sock_hash_lookup_sys(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ if (map->value_size != sizeof(u64))
+ return ERR_PTR(-ENOSPC);
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ sock_gen_cookie(sk);
+ return &sk->sk_cookie;
+}
+
+static void *sock_hash_lookup(struct bpf_map *map, void *key)
+{
+ return __sock_hash_lookup_elem(map, key);
+}
+
static void sock_hash_release_progs(struct bpf_map *map)
{
psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs);
@@ -920,13 +1085,17 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
struct bpf_map *, map, void *, key, u64, flags)
{
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+ struct sock *sk;
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
- if (!tcb->bpf.sk_redir)
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+
+ tcb->bpf.flags = flags;
+ tcb->bpf.sk_redir = sk;
return SK_PASS;
}
@@ -943,12 +1112,17 @@ const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
struct bpf_map *, map, void *, key, u64, flags)
{
+ struct sock *sk;
+
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- msg->flags = flags;
- msg->sk_redir = __sock_hash_lookup_elem(map, key);
- if (!msg->sk_redir)
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = sk;
return SK_PASS;
}
@@ -968,7 +1142,8 @@ const struct bpf_map_ops sock_hash_ops = {
.map_get_next_key = sock_hash_get_next_key,
.map_update_elem = sock_hash_update_elem,
.map_delete_elem = sock_hash_delete_elem,
- .map_lookup_elem = sock_map_lookup,
+ .map_lookup_elem = sock_hash_lookup,
+ .map_lookup_elem_sys_only = sock_hash_lookup_sys,
.map_release_uref = sock_hash_release_progs,
.map_check_btf = map_check_no_btf,
};
@@ -1012,7 +1187,7 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
return 0;
}
-void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link)
+static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link)
{
switch (link->map->map_type) {
case BPF_MAP_TYPE_SOCKMAP:
@@ -1025,3 +1200,54 @@ void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link)
break;
}
}
+
+static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_link *link;
+
+ while ((link = sk_psock_link_pop(psock))) {
+ sock_map_unlink(sk, link);
+ sk_psock_free_link(link);
+ }
+}
+
+void sock_map_unhash(struct sock *sk)
+{
+ void (*saved_unhash)(struct sock *sk);
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ if (sk->sk_prot->unhash)
+ sk->sk_prot->unhash(sk);
+ return;
+ }
+
+ saved_unhash = psock->saved_unhash;
+ sock_map_remove_links(sk, psock);
+ rcu_read_unlock();
+ saved_unhash(sk);
+}
+
+void sock_map_close(struct sock *sk, long timeout)
+{
+ void (*saved_close)(struct sock *sk, long timeout);
+ struct sk_psock *psock;
+
+ lock_sock(sk);
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ release_sock(sk);
+ return sk->sk_prot->close(sk, timeout);
+ }
+
+ saved_close = psock->saved_close;
+ sock_map_remove_links(sk, psock);
+ rcu_read_unlock();
+ release_sock(sk);
+ saved_close(sk, timeout);
+}
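
sock_map_unhash() and sock_map_close() only take effect because the psock's proto override points at them. A simplified kernel-side sketch of that wiring (example_bpf_prot and example_bpf_rebuild_proto are illustrative names; the real assignments live in the tcp_bpf/udp_bpf proto builders):

static struct proto example_bpf_prot;

static void example_bpf_rebuild_proto(struct proto *base)
{
	example_bpf_prot        = *base;		/* start from the socket's proto */
	example_bpf_prot.unhash = sock_map_unhash;	/* drop map links before unhash */
	example_bpf_prot.close  = sock_map_close;	/* drop map links, then close */
}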
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 91e9f2223c39..adcb3aea576d 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -16,27 +16,8 @@
DEFINE_SPINLOCK(reuseport_lock);
-#define REUSEPORT_MIN_ID 1
static DEFINE_IDA(reuseport_ida);
-int reuseport_get_id(struct sock_reuseport *reuse)
-{
- int id;
-
- if (reuse->reuseport_id)
- return reuse->reuseport_id;
-
- id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0,
- /* Called under reuseport_lock */
- GFP_ATOMIC);
- if (id < 0)
- return id;
-
- reuse->reuseport_id = id;
-
- return reuse->reuseport_id;
-}
-
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
unsigned int size = sizeof(struct sock_reuseport) +
@@ -55,6 +36,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
int reuseport_alloc(struct sock *sk, bool bind_inany)
{
struct sock_reuseport *reuse;
+ int id, ret = 0;
/* bh lock used since this function call may precede hlist lock in
* soft irq of receive path or setsockopt from process context
@@ -78,10 +60,18 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse = __reuseport_alloc(INIT_SOCKS);
if (!reuse) {
- spin_unlock_bh(&reuseport_lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
+ id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
+ if (id < 0) {
+ kfree(reuse);
+ ret = id;
+ goto out;
+ }
+
+ reuse->reuseport_id = id;
reuse->socks[0] = sk;
reuse->num_socks = 1;
reuse->bind_inany = bind_inany;
@@ -90,7 +80,7 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
out:
spin_unlock_bh(&reuseport_lock);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(reuseport_alloc);
@@ -134,8 +124,7 @@ static void reuseport_free_rcu(struct rcu_head *head)
reuse = container_of(head, struct sock_reuseport, rcu);
sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
- if (reuse->reuseport_id)
- ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
+ ida_free(&reuseport_ida, reuse->reuseport_id);
kfree(reuse);
}
@@ -199,12 +188,15 @@ void reuseport_detach_sock(struct sock *sk)
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
- /* At least one of the sk in this reuseport group is added to
- * a bpf map. Notify the bpf side. The bpf map logic will
- * remove the sk if it is indeed added to a bpf map.
+ /* Notify the bpf side. The sk may be added to a sockarray
+ * map. If so, sockarray logic will remove it from the map.
+ *
+ * Other bpf map types that work with reuseport, like sockmap,
+ * don't need an explicit callback from here. They override sk
+ * unhash/close ops to remove the sk from the map before we
+ * get to this point.
*/
- if (reuse->reuseport_id)
- bpf_sk_reuseport_detach(sk);
+ bpf_sk_reuseport_detach(sk);
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
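
The reuseport hunks drop the lazy, zero-means-unset ID scheme in favour of allocating an ID for the whole lifetime of the group. The underlying pattern being switched to is the plain ida_alloc()/ida_free() pair (the example_* names below are illustrative):

#include <linux/idr.h>

static DEFINE_IDA(example_ida);

static int example_get_id(void)
{
	/* Lowest free ID, >= 0 on success, negative errno on failure. */
	return ida_alloc(&example_ida, GFP_ATOMIC);
}

static void example_put_id(int id)
{
	ida_free(&example_ida, id);
}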
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 8310714c47fd..4c7ea85486af 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -372,7 +372,7 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
napi_direct &= !xdp_return_frame_no_direct();
- page_pool_put_page(xa->page_pool, page, napi_direct);
+ page_pool_put_full_page(xa->page_pool, page, napi_direct);
rcu_read_unlock();
break;
case MEM_TYPE_PAGE_SHARED:
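
page_pool_put_full_page() is the variant for callers that cannot say how much of the page the device touched, and __xdp_return() is exactly such a caller. Assuming the page_pool API of this period (see include/net/page_pool.h), the wrapper amounts to passing -1 as the DMA sync length, roughly:

/* Assumed shape of the convenience wrapper: -1 asks the pool to dma-sync
 * the whole page before recycling it.
 */
static inline void example_put_full_page(struct page_pool *pool,
					 struct page *page, bool allow_direct)
{
	page_pool_put_page(pool, page, -1, allow_direct);
}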