summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/index.rst1
-rw-r--r--Documentation/networking/xdp-rx-metadata.rst110
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_clock.c13
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_netdev.c6
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_rx.c63
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/mlx4_en.h5
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en.h5
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h5
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c31
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h10
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c47
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rx.c99
-rw-r--r--drivers/net/netdevsim/bpf.c4
-rw-r--r--drivers/net/veth.c87
-rw-r--r--include/linux/bpf.h61
-rw-r--r--include/linux/netdevice.h8
-rw-r--r--include/net/xdp.h21
-rw-r--r--include/net/xsk_buff_pool.h5
-rw-r--r--include/uapi/linux/bpf.h5
-rw-r--r--kernel/bpf/core.c12
-rw-r--r--kernel/bpf/offload.c425
-rw-r--r--kernel/bpf/syscall.c34
-rw-r--r--kernel/bpf/verifier.c44
-rw-r--r--net/bpf/test_run.c3
-rw-r--r--net/core/dev.c9
-rw-r--r--net/core/filter.c2
-rw-r--r--net/core/xdp.c64
-rw-r--r--tools/include/uapi/linux/bpf.h5
-rw-r--r--tools/testing/selftests/bpf/.gitignore1
-rw-r--r--tools/testing/selftests/bpf/DENYLIST.s390x1
-rw-r--r--tools/testing/selftests/bpf/Makefile9
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_metadata.c410
-rw-r--r--tools/testing/selftests/bpf/progs/xdp_hw_metadata.c81
-rw-r--r--tools/testing/selftests/bpf/progs/xdp_metadata.c64
-rw-r--r--tools/testing/selftests/bpf/progs/xdp_metadata2.c23
-rwxr-xr-xtools/testing/selftests/bpf/test_offload.py10
-rw-r--r--tools/testing/selftests/bpf/xdp_hw_metadata.c403
-rw-r--r--tools/testing/selftests/bpf/xdp_metadata.h15
40 files changed, 1911 insertions, 293 deletions
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 4f2d1f682a18..4ddcae33c336 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -120,6 +120,7 @@ Contents:
xfrm_proc
xfrm_sync
xfrm_sysctl
+ xdp-rx-metadata
.. only:: subproject and html
diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
new file mode 100644
index 000000000000..aac63fc2d08b
--- /dev/null
+++ b/Documentation/networking/xdp-rx-metadata.rst
@@ -0,0 +1,110 @@
+===============
+XDP RX Metadata
+===============
+
+This document describes how an eXpress Data Path (XDP) program can access
+hardware metadata related to a packet using a set of helper functions,
+and how it can pass that metadata on to other consumers.
+
+General Design
+==============
+
+XDP has access to a set of kfuncs to manipulate the metadata in an XDP frame.
+Every device driver that wishes to expose additional packet metadata can
+implement these kfuncs. The set of kfuncs is declared in ``include/net/xdp.h``
+via ``XDP_METADATA_KFUNC_xxx``.
+
+Currently, the following kfuncs are supported. In the future, as more
+metadata is supported, this set will grow:
+
+.. kernel-doc:: net/core/xdp.c
+ :identifiers: bpf_xdp_metadata_rx_timestamp bpf_xdp_metadata_rx_hash
+
+An XDP program can use these kfuncs to read the metadata into stack
+variables for its own consumption. Or, to pass the metadata on to other
+consumers, an XDP program can store it into the metadata area carried
+ahead of the packet.
+
+Not all kfuncs have to be implemented by the device driver; when not
+implemented, the default ones that return ``-EOPNOTSUPP`` will be used.
+
+Within an XDP frame, the metadata layout (accessed via ``xdp_buff``) is
+as follows::
+
+ +----------+-----------------+------+
+ | headroom | custom metadata | data |
+ +----------+-----------------+------+
+ ^ ^
+ | |
+ xdp_buff->data_meta xdp_buff->data
+
+An XDP program can store individual metadata items into this ``data_meta``
+area in whichever format it chooses. Later consumers of the metadata
+will have to agree on the format by some out of band contract (like for
+the AF_XDP use case, see below).
+
+AF_XDP
+======
+
+:doc:`af_xdp` use-case implies that there is a contract between the BPF
+program that redirects XDP frames into the ``AF_XDP`` socket (``XSK``) and
+the final consumer. Thus the BPF program manually allocates a fixed number of
+bytes out of metadata via ``bpf_xdp_adjust_meta`` and calls a subset
+of kfuncs to populate it. The userspace ``XSK`` consumer computes
+``xsk_umem__get_data() - METADATA_SIZE`` to locate that metadata.
+Note, ``xsk_umem__get_data`` is defined in ``libxdp`` and
+``METADATA_SIZE`` is an application-specific constant (``AF_XDP`` receive
+descriptor does _not_ explicitly carry the size of the metadata).
+
+Here is the ``AF_XDP`` consumer layout (note missing ``data_meta`` pointer)::
+
+ +----------+-----------------+------+
+ | headroom | custom metadata | data |
+ +----------+-----------------+------+
+ ^
+ |
+ rx_desc->address
+
+XDP_PASS
+========
+
+This is the path where the packets processed by the XDP program are passed
+into the kernel. The kernel creates the ``skb`` out of the ``xdp_buff``
+contents. Currently, every driver has custom kernel code to parse
+the descriptors and populate ``skb`` metadata when doing this ``xdp_buff->skb``
+conversion, and the XDP metadata is not used by the kernel when building
+``skbs``. However, TC-BPF programs can access the XDP metadata area using
+the ``data_meta`` pointer.
+
+In the future, we'd like to support a case where an XDP program
+can override some of the metadata used for building ``skbs``.
+
+bpf_redirect_map
+================
+
+``bpf_redirect_map`` can redirect the frame to a different device.
+Some devices (like virtual ethernet links) support running a second XDP
+program after the redirect. However, the final consumer doesn't have
+access to the original hardware descriptor and can't access any of
+the original metadata. The same applies to XDP programs installed
+into devmaps and cpumaps.
+
+This means that for redirected packets only custom metadata is
+currently supported, which has to be prepared by the initial XDP program
+before redirect. If the frame is eventually passed to the kernel, the
+``skb`` created from such a frame won't have any hardware metadata populated
+in its ``skb``. If such a packet is later redirected into an ``XSK``,
+that will also only have access to the custom metadata.
+
+bpf_tail_call
+=============
+
+Adding programs that access metadata kfuncs to the ``BPF_MAP_TYPE_PROG_ARRAY``
+is currently not supported.
+
+Example
+=======
+
+See ``tools/testing/selftests/bpf/progs/xdp_metadata.c`` and
+``tools/testing/selftests/bpf/prog_tests/xdp_metadata.c`` for an example of
+BPF program that handles XDP metadata.
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c
index 98b5ffb4d729..9e3b76182088 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_clock.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c
@@ -58,9 +58,7 @@ u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe)
return hi | lo;
}
-void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev,
- struct skb_shared_hwtstamps *hwts,
- u64 timestamp)
+u64 mlx4_en_get_hwtstamp(struct mlx4_en_dev *mdev, u64 timestamp)
{
unsigned int seq;
u64 nsec;
@@ -70,8 +68,15 @@ void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev,
nsec = timecounter_cyc2time(&mdev->clock, timestamp);
} while (read_seqretry(&mdev->clock_lock, seq));
+ return ns_to_ktime(nsec);
+}
+
+void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev,
+ struct skb_shared_hwtstamps *hwts,
+ u64 timestamp)
+{
memset(hwts, 0, sizeof(struct skb_shared_hwtstamps));
- hwts->hwtstamp = ns_to_ktime(nsec);
+ hwts->hwtstamp = mlx4_en_get_hwtstamp(mdev, timestamp);
}
/**
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 8800d3f1f55c..af4c4858f397 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2889,6 +2889,11 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
.ndo_bpf = mlx4_xdp,
};
+static const struct xdp_metadata_ops mlx4_xdp_metadata_ops = {
+ .xmo_rx_timestamp = mlx4_en_xdp_rx_timestamp,
+ .xmo_rx_hash = mlx4_en_xdp_rx_hash,
+};
+
struct mlx4_en_bond {
struct work_struct work;
struct mlx4_en_priv *priv;
@@ -3310,6 +3315,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
dev->netdev_ops = &mlx4_netdev_ops_master;
else
dev->netdev_ops = &mlx4_netdev_ops;
+ dev->xdp_metadata_ops = &mlx4_xdp_metadata_ops;
dev->watchdog_timeo = MLX4_EN_WATCHDOG_TIMEOUT;
netif_set_real_num_tx_queues(dev, priv->tx_ring_num[TX]);
netif_set_real_num_rx_queues(dev, priv->rx_ring_num);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 8f762fc170b3..0869d4fff17b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -661,9 +661,41 @@ static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va,
#define MLX4_CQE_STATUS_IP_ANY (MLX4_CQE_STATUS_IPV4)
#endif
+struct mlx4_en_xdp_buff {
+ struct xdp_buff xdp;
+ struct mlx4_cqe *cqe;
+ struct mlx4_en_dev *mdev;
+ struct mlx4_en_rx_ring *ring;
+ struct net_device *dev;
+};
+
+int mlx4_en_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
+{
+ struct mlx4_en_xdp_buff *_ctx = (void *)ctx;
+
+ if (unlikely(_ctx->ring->hwtstamp_rx_filter != HWTSTAMP_FILTER_ALL))
+ return -EOPNOTSUPP;
+
+ *timestamp = mlx4_en_get_hwtstamp(_ctx->mdev,
+ mlx4_en_get_cqe_ts(_ctx->cqe));
+ return 0;
+}
+
+int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash)
+{
+ struct mlx4_en_xdp_buff *_ctx = (void *)ctx;
+
+ if (unlikely(!(_ctx->dev->features & NETIF_F_RXHASH)))
+ return -EOPNOTSUPP;
+
+ *hash = be32_to_cpu(_ctx->cqe->immed_rss_invalid);
+ return 0;
+}
+
int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_xdp_buff mxbuf = {};
int factor = priv->cqe_factor;
struct mlx4_en_rx_ring *ring;
struct bpf_prog *xdp_prog;
@@ -671,7 +703,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
bool doorbell_pending;
bool xdp_redir_flush;
struct mlx4_cqe *cqe;
- struct xdp_buff xdp;
int polled = 0;
int index;
@@ -681,7 +712,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
ring = priv->rx_ring[cq_ring];
xdp_prog = rcu_dereference_bh(ring->xdp_prog);
- xdp_init_buff(&xdp, priv->frag_info[0].frag_stride, &ring->xdp_rxq);
+ xdp_init_buff(&mxbuf.xdp, priv->frag_info[0].frag_stride, &ring->xdp_rxq);
doorbell_pending = false;
xdp_redir_flush = false;
@@ -776,24 +807,28 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
priv->frag_info[0].frag_size,
DMA_FROM_DEVICE);
- xdp_prepare_buff(&xdp, va - frags[0].page_offset,
- frags[0].page_offset, length, false);
- orig_data = xdp.data;
-
- act = bpf_prog_run_xdp(xdp_prog, &xdp);
-
- length = xdp.data_end - xdp.data;
- if (xdp.data != orig_data) {
- frags[0].page_offset = xdp.data -
- xdp.data_hard_start;
- va = xdp.data;
+ xdp_prepare_buff(&mxbuf.xdp, va - frags[0].page_offset,
+ frags[0].page_offset, length, true);
+ orig_data = mxbuf.xdp.data;
+ mxbuf.cqe = cqe;
+ mxbuf.mdev = priv->mdev;
+ mxbuf.ring = ring;
+ mxbuf.dev = dev;
+
+ act = bpf_prog_run_xdp(xdp_prog, &mxbuf.xdp);
+
+ length = mxbuf.xdp.data_end - mxbuf.xdp.data;
+ if (mxbuf.xdp.data != orig_data) {
+ frags[0].page_offset = mxbuf.xdp.data -
+ mxbuf.xdp.data_hard_start;
+ va = mxbuf.xdp.data;
}
switch (act) {
case XDP_PASS:
break;
case XDP_REDIRECT:
- if (likely(!xdp_do_redirect(dev, &xdp, xdp_prog))) {
+ if (likely(!xdp_do_redirect(dev, &mxbuf.xdp, xdp_prog))) {
ring->xdp_redirect++;
xdp_redir_flush = true;
frags[0].page = NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 3d4226ddba5e..544e09b97483 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -796,10 +796,15 @@ void mlx4_en_update_pfc_stats_bitmap(struct mlx4_dev *dev,
int mlx4_en_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr);
+struct xdp_md;
+int mlx4_en_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp);
+int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash);
+
/*
* Functions for time stamping
*/
u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe);
+u64 mlx4_en_get_hwtstamp(struct mlx4_en_dev *mdev, u64 timestamp);
void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev,
struct skb_shared_hwtstamps *hwts,
u64 timestamp);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 2d77fb8a8a01..711dc88d8b48 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -626,10 +626,11 @@ struct mlx5e_rq;
typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*);
typedef struct sk_buff *
(*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
- u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+ struct mlx5_cqe64 *cqe, u16 cqe_bcnt,
+ u32 head_offset, u32 page_idx);
typedef struct sk_buff *
(*mlx5e_fp_skb_from_cqe)(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
- u32 cqe_bcnt);
+ struct mlx5_cqe64 *cqe, u32 cqe_bcnt);
typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq);
typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16);
typedef void (*mlx5e_fp_shampo_dealloc_hd)(struct mlx5e_rq*, u16, u16, bool);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index 853f312cd757..757c012ece27 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -73,6 +73,11 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget);
void mlx5e_free_rx_descs(struct mlx5e_rq *rq);
void mlx5e_free_rx_in_progress_descs(struct mlx5e_rq *rq);
+static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config)
+{
+ return config->rx_filter == HWTSTAMP_FILTER_ALL;
+}
+
/* TX */
netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev);
bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 20507ef2f956..f7d52b1d293b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -156,10 +156,39 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
return true;
}
+static int mlx5e_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
+{
+ const struct mlx5e_xdp_buff *_ctx = (void *)ctx;
+
+ if (unlikely(!mlx5e_rx_hw_stamp(_ctx->rq->tstamp)))
+ return -EOPNOTSUPP;
+
+ *timestamp = mlx5e_cqe_ts_to_ns(_ctx->rq->ptp_cyc2time,
+ _ctx->rq->clock, get_cqe_ts(_ctx->cqe));
+ return 0;
+}
+
+static int mlx5e_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash)
+{
+ const struct mlx5e_xdp_buff *_ctx = (void *)ctx;
+
+ if (unlikely(!(_ctx->xdp.rxq->dev->features & NETIF_F_RXHASH)))
+ return -EOPNOTSUPP;
+
+ *hash = be32_to_cpu(_ctx->cqe->rss_hash_result);
+ return 0;
+}
+
+const struct xdp_metadata_ops mlx5e_xdp_metadata_ops = {
+ .xmo_rx_timestamp = mlx5e_xdp_rx_timestamp,
+ .xmo_rx_hash = mlx5e_xdp_rx_hash,
+};
+
/* returns true if packet was consumed by xdp */
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct page *page,
- struct bpf_prog *prog, struct xdp_buff *xdp)
+ struct bpf_prog *prog, struct mlx5e_xdp_buff *mxbuf)
{
+ struct xdp_buff *xdp = &mxbuf->xdp;
u32 act;
int err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
index bc2d9034af5b..69f338bc0633 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
@@ -44,10 +44,16 @@
(MLX5E_XDP_INLINE_WQE_MAX_DS_CNT * MLX5_SEND_WQE_DS - \
sizeof(struct mlx5_wqe_inline_seg))
+struct mlx5e_xdp_buff {
+ struct xdp_buff xdp;
+ struct mlx5_cqe64 *cqe;
+ struct mlx5e_rq *rq;
+};
+
struct mlx5e_xsk_param;
int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk);
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct page *page,
- struct bpf_prog *prog, struct xdp_buff *xdp);
+ struct bpf_prog *prog, struct mlx5e_xdp_buff *mlctx);
void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq);
bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq);
void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq);
@@ -56,6 +62,8 @@ void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq);
int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
u32 flags);
+extern const struct xdp_metadata_ops mlx5e_xdp_metadata_ops;
+
INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq,
struct mlx5e_xmit_data *xdptxd,
struct skb_shared_info *sinfo,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index c91b54d9ff27..b7c84ebe8418 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -8,6 +8,14 @@
/* RX data path */
+static struct mlx5e_xdp_buff *xsk_buff_to_mxbuf(struct xdp_buff *xdp)
+{
+ /* mlx5e_xdp_buff shares its layout with xdp_buff_xsk
+ * and private mlx5e_xdp_buff fields fall into xdp_buff_xsk->cb
+ */
+ return (struct mlx5e_xdp_buff *)xdp;
+}
+
int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
{
struct mlx5e_mpw_info *wi = mlx5e_get_mpw_info(rq, ix);
@@ -22,6 +30,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
goto err;
BUILD_BUG_ON(sizeof(wi->alloc_units[0]) != sizeof(wi->alloc_units[0].xsk));
+ XSK_CHECK_PRIV_TYPE(struct mlx5e_xdp_buff);
batch = xsk_buff_alloc_batch(rq->xsk_pool, (struct xdp_buff **)wi->alloc_units,
rq->mpwqe.pages_per_wqe);
@@ -43,25 +52,30 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED)) {
for (i = 0; i < batch; i++) {
+ struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->alloc_units[i].xsk);
dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
.ptag = cpu_to_be64(addr | MLX5_EN_WR),
};
+ mxbuf->rq = rq;
}
} else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)) {
for (i = 0; i < batch; i++) {
+ struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->alloc_units[i].xsk);
dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
umr_wqe->inline_ksms[i] = (struct mlx5_ksm) {
.key = rq->mkey_be,
.va = cpu_to_be64(addr),
};
+ mxbuf->rq = rq;
}
} else if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE)) {
u32 mapping_size = 1 << (rq->mpwqe.page_shift - 2);
for (i = 0; i < batch; i++) {
+ struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->alloc_units[i].xsk);
dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
umr_wqe->inline_ksms[i << 2] = (struct mlx5_ksm) {
@@ -80,6 +94,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
.key = rq->mkey_be,
.va = cpu_to_be64(rq->wqe_overflow.addr),
};
+ mxbuf->rq = rq;
}
} else {
__be32 pad_size = cpu_to_be32((1 << rq->mpwqe.page_shift) -
@@ -87,6 +102,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
__be32 frame_size = cpu_to_be32(rq->xsk_pool->chunk_size);
for (i = 0; i < batch; i++) {
+ struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->alloc_units[i].xsk);
dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
umr_wqe->inline_klms[i << 1] = (struct mlx5_klm) {
@@ -99,6 +115,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
.va = cpu_to_be64(rq->wqe_overflow.addr),
.bcount = pad_size,
};
+ mxbuf->rq = rq;
}
}
@@ -229,11 +246,12 @@ static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, struct xdp_b
struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
struct mlx5e_mpw_info *wi,
+ struct mlx5_cqe64 *cqe,
u16 cqe_bcnt,
u32 head_offset,
u32 page_idx)
{
- struct xdp_buff *xdp = wi->alloc_units[page_idx].xsk;
+ struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->alloc_units[page_idx].xsk);
struct bpf_prog *prog;
/* Check packet size. Note LRO doesn't use linear SKB */
@@ -249,9 +267,11 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
*/
WARN_ON_ONCE(head_offset);
- xsk_buff_set_size(xdp, cqe_bcnt);
- xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
- net_prefetch(xdp->data);
+ /* mxbuf->rq is set on allocation, but cqe is per-packet so set it here */
+ mxbuf->cqe = cqe;
+ xsk_buff_set_size(&mxbuf->xdp, cqe_bcnt);
+ xsk_buff_dma_sync_for_cpu(&mxbuf->xdp, rq->xsk_pool);
+ net_prefetch(mxbuf->xdp.data);
/* Possible flows:
* - XDP_REDIRECT to XSKMAP:
@@ -269,7 +289,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
*/
prog = rcu_dereference(rq->xdp_prog);
- if (likely(prog && mlx5e_xdp_handle(rq, NULL, prog, xdp))) {
+ if (likely(prog && mlx5e_xdp_handle(rq, NULL, prog, mxbuf))) {
if (likely(__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)))
__set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */
return NULL; /* page/packet was consumed by XDP */
@@ -278,14 +298,15 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
/* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the
* frame. On SKB allocation failure, NULL is returned.
*/
- return mlx5e_xsk_construct_skb(rq, xdp);
+ return mlx5e_xsk_construct_skb(rq, &mxbuf->xdp);
}
struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *wi,
+ struct mlx5_cqe64 *cqe,
u32 cqe_bcnt)
{
- struct xdp_buff *xdp = wi->au->xsk;
+ struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->au->xsk);
struct bpf_prog *prog;
/* wi->offset is not used in this function, because xdp->data and the
@@ -295,17 +316,19 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
*/
WARN_ON_ONCE(wi->offset);
- xsk_buff_set_size(xdp, cqe_bcnt);
- xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
- net_prefetch(xdp->data);
+ /* mxbuf->rq is set on allocation, but cqe is per-packet so set it here */
+ mxbuf->cqe = cqe;
+ xsk_buff_set_size(&mxbuf->xdp, cqe_bcnt);
+ xsk_buff_dma_sync_for_cpu(&mxbuf->xdp, rq->xsk_pool);
+ net_prefetch(mxbuf->xdp.data);
prog = rcu_dereference(rq->xdp_prog);
- if (likely(prog && mlx5e_xdp_handle(rq, NULL, prog, xdp)))
+ if (likely(prog && mlx5e_xdp_handle(rq, NULL, prog, mxbuf)))
return NULL; /* page/packet was consumed by XDP */
/* XDP_PASS: copy the data from the UMEM to a new SKB. The frame reuse
* will be handled by mlx5e_free_rx_wqe.
* On SKB allocation failure, NULL is returned.
*/
- return mlx5e_xsk_construct_skb(rq, xdp);
+ return mlx5e_xsk_construct_skb(rq, &mxbuf->xdp);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
index 087c943bd8e9..cefc0ef6105d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -13,11 +13,13 @@ int mlx5e_xsk_alloc_rx_wqes_batched(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
struct mlx5e_mpw_info *wi,
+ struct mlx5_cqe64 *cqe,
u16 cqe_bcnt,
u32 head_offset,
u32 page_idx);
struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *wi,
+ struct mlx5_cqe64 *cqe,
u32 cqe_bcnt);
#endif /* __MLX5_EN_XSK_RX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index cff5f2e29e1e..3370c8b9f983 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -5053,6 +5053,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
SET_NETDEV_DEV(netdev, mdev->device);
netdev->netdev_ops = &mlx5e_netdev_ops;
+ netdev->xdp_metadata_ops = &mlx5e_xdp_metadata_ops;
mlx5e_dcbnl_build_netdev(netdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index c8820ab22169..7b08653be000 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -62,10 +62,12 @@
static struct sk_buff *
mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
- u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+ struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset,
+ u32 page_idx);
static struct sk_buff *
mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
- u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+ struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset,
+ u32 page_idx);
static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
@@ -76,11 +78,6 @@ const struct mlx5e_rx_handlers mlx5e_rx_handlers_nic = {
.handle_rx_cqe_mpwqe_shampo = mlx5e_handle_rx_cqe_mpwrq_shampo,
};
-static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config)
-{
- return config->rx_filter == HWTSTAMP_FILTER_ALL;
-}
-
static inline void mlx5e_read_cqe_slot(struct mlx5_cqwq *wq,
u32 cqcc, void *data)
{
@@ -1575,16 +1572,19 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
return skb;
}
-static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom,
- u32 len, struct xdp_buff *xdp)
+static void mlx5e_fill_mxbuf(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+ void *va, u16 headroom, u32 len,
+ struct mlx5e_xdp_buff *mxbuf)
{
- xdp_init_buff(xdp, rq->buff.frame0_sz, &rq->xdp_rxq);
- xdp_prepare_buff(xdp, va, headroom, len, true);
+ xdp_init_buff(&mxbuf->xdp, rq->buff.frame0_sz, &rq->xdp_rxq);
+ xdp_prepare_buff(&mxbuf->xdp, va, headroom, len, true);
+ mxbuf->cqe = cqe;
+ mxbuf->rq = rq;
}
static struct sk_buff *
mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
- u32 cqe_bcnt)
+ struct mlx5_cqe64 *cqe, u32 cqe_bcnt)
{
union mlx5e_alloc_unit *au = wi->au;
u16 rx_headroom = rq->buff.headroom;
@@ -1606,16 +1606,16 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
prog = rcu_dereference(rq->xdp_prog);
if (prog) {
- struct xdp_buff xdp;
+ struct mlx5e_xdp_buff mxbuf;
net_prefetchw(va); /* xdp_frame data area */
- mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
- if (mlx5e_xdp_handle(rq, au->page, prog, &xdp))
+ mlx5e_fill_mxbuf(rq, cqe, va, rx_headroom, cqe_bcnt, &mxbuf);
+ if (mlx5e_xdp_handle(rq, au->page, prog, &mxbuf))
return NULL; /* page/packet was consumed by XDP */
- rx_headroom = xdp.data - xdp.data_hard_start;
- metasize = xdp.data - xdp.data_meta;
- cqe_bcnt = xdp.data_end - xdp.data;
+ rx_headroom = mxbuf.xdp.data - mxbuf.xdp.data_hard_start;
+ metasize = mxbuf.xdp.data - mxbuf.xdp.data_meta;
+ cqe_bcnt = mxbuf.xdp.data_end - mxbuf.xdp.data;
}
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt, metasize);
@@ -1630,16 +1630,16 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
static struct sk_buff *
mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
- u32 cqe_bcnt)
+ struct mlx5_cqe64 *cqe, u32 cqe_bcnt)
{
struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
struct mlx5e_wqe_frag_info *head_wi = wi;
union mlx5e_alloc_unit *au = wi->au;
u16 rx_headroom = rq->buff.headroom;
struct skb_shared_info *sinfo;
+ struct mlx5e_xdp_buff mxbuf;
u32 frag_consumed_bytes;
struct bpf_prog *prog;
- struct xdp_buff xdp;
struct sk_buff *skb;
dma_addr_t addr;
u32 truesize;
@@ -1654,8 +1654,8 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
net_prefetchw(va); /* xdp_frame data area */
net_prefetch(va + rx_headroom);
- mlx5e_fill_xdp_buff(rq, va, rx_headroom, frag_consumed_bytes, &xdp);
- sinfo = xdp_get_shared_info_from_buff(&xdp);
+ mlx5e_fill_mxbuf(rq, cqe, va, rx_headroom, frag_consumed_bytes, &mxbuf);
+ sinfo = xdp_get_shared_info_from_buff(&mxbuf.xdp);
truesize = 0;
cqe_bcnt -= frag_consumed_bytes;
@@ -1673,13 +1673,13 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
dma_sync_single_for_cpu(rq->pdev, addr + wi->offset,
frag_consumed_bytes, rq->buff.map_dir);
- if (!xdp_buff_has_frags(&xdp)) {
+ if (!xdp_buff_has_frags(&mxbuf.xdp)) {
/* Init on the first fragment to avoid cold cache access
* when possible.
*/
sinfo->nr_frags = 0;
sinfo->xdp_frags_size = 0;
- xdp_buff_set_frags_flag(&xdp);
+ xdp_buff_set_frags_flag(&mxbuf.xdp);
}
frag = &sinfo->frags[sinfo->nr_frags++];
@@ -1688,7 +1688,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
skb_frag_size_set(frag, frag_consumed_bytes);
if (page_is_pfmemalloc(au->page))
- xdp_buff_set_frag_pfmemalloc(&xdp);
+ xdp_buff_set_frag_pfmemalloc(&mxbuf.xdp);
sinfo->xdp_frags_size += frag_consumed_bytes;
truesize += frag_info->frag_stride;
@@ -1701,7 +1701,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
au = head_wi->au;
prog = rcu_dereference(rq->xdp_prog);
- if (prog && mlx5e_xdp_handle(rq, au->page, prog, &xdp)) {
+ if (prog && mlx5e_xdp_handle(rq, au->page, prog, &mxbuf)) {
if (test_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
int i;
@@ -1711,22 +1711,22 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
return NULL; /* page/packet was consumed by XDP */
}
- skb = mlx5e_build_linear_skb(rq, xdp.data_hard_start, rq->buff.frame0_sz,
- xdp.data - xdp.data_hard_start,
- xdp.data_end - xdp.data,
- xdp.data - xdp.data_meta);
+ skb = mlx5e_build_linear_skb(rq, mxbuf.xdp.data_hard_start, rq->buff.frame0_sz,
+ mxbuf.xdp.data - mxbuf.xdp.data_hard_start,
+ mxbuf.xdp.data_end - mxbuf.xdp.data,
+ mxbuf.xdp.data - mxbuf.xdp.data_meta);
if (unlikely(!skb))
return NULL;
page_ref_inc(au->page);
- if (unlikely(xdp_buff_has_frags(&xdp))) {
+ if (unlikely(xdp_buff_has_frags(&mxbuf.xdp))) {
int i;
/* sinfo->nr_frags is reset by build_skb, calculate again. */
xdp_update_skb_shared_info(skb, wi - head_wi - 1,
sinfo->xdp_frags_size, truesize,
- xdp_buff_is_frag_pfmemalloc(&xdp));
+ xdp_buff_is_frag_pfmemalloc(&mxbuf.xdp));
for (i = 0; i < sinfo->nr_frags; i++) {
skb_frag_t *frag = &sinfo->frags[i];
@@ -1777,7 +1777,7 @@ static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
mlx5e_skb_from_cqe_linear,
mlx5e_skb_from_cqe_nonlinear,
mlx5e_xsk_skb_from_cqe_linear,
- rq, wi, cqe_bcnt);
+ rq, wi, cqe, cqe_bcnt);
if (!skb) {
/* probably for XDP */
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
@@ -1830,7 +1830,7 @@ static void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
skb = INDIRECT_CALL_2(rq->wqe.skb_from_cqe,
mlx5e_skb_from_cqe_linear,
mlx5e_skb_from_cqe_nonlinear,
- rq, wi, cqe_bcnt);
+ rq, wi, cqe, cqe_bcnt);
if (!skb) {
/* probably for XDP */
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
@@ -1889,7 +1889,7 @@ static void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, struct mlx5_cqe64
skb = INDIRECT_CALL_2(rq->mpwqe.skb_from_cqe_mpwrq,
mlx5e_skb_from_cqe_mpwrq_linear,
mlx5e_skb_from_cqe_mpwrq_nonlinear,
- rq, wi, cqe_bcnt, head_offset, page_idx);
+ rq, wi, cqe, cqe_bcnt, head_offset, page_idx);
if (!skb)
goto mpwrq_cqe_out;
@@ -1940,7 +1940,8 @@ mlx5e_fill_skb_data(struct sk_buff *skb, struct mlx5e_rq *rq,
static struct sk_buff *
mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
- u16 cqe_bcnt, u32 head_offset, u32 page_idx)
+ struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset,
+ u32 page_idx)
{
union mlx5e_alloc_unit *au = &wi->alloc_units[page_idx];
u16 headlen = min_t(u16, MLX5E_RX_MAX_HEAD, cqe_bcnt);
@@ -1979,7 +1980,8 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
static struct sk_buff *
mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
- u16 cqe_bcnt, u32 head_offset, u32 page_idx)
+ struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset,
+ u32 page_idx)
{
union mlx5e_alloc_unit *au = &wi->alloc_units[page_idx];
u16 rx_headroom = rq->buff.headroom;
@@ -2007,19 +2009,19 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
prog = rcu_dereference(rq->xdp_prog);
if (prog) {
- struct xdp_buff xdp;
+ struct mlx5e_xdp_buff mxbuf;
net_prefetchw(va); /* xdp_frame data area */
- mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
- if (mlx5e_xdp_handle(rq, au->page, prog, &xdp)) {
+ mlx5e_fill_mxbuf(rq, cqe, va, rx_headroom, cqe_bcnt, &mxbuf);
+ if (mlx5e_xdp_handle(rq, au->page, prog, &mxbuf)) {
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
__set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */
return NULL; /* page/packet was consumed by XDP */
}
- rx_headroom = xdp.data - xdp.data_hard_start;
- metasize = xdp.data - xdp.data_meta;
- cqe_bcnt = xdp.data_end - xdp.data;
+ rx_headroom = mxbuf.xdp.data - mxbuf.xdp.data_hard_start;
+ metasize = mxbuf.xdp.data - mxbuf.xdp.data_meta;
+ cqe_bcnt = mxbuf.xdp.data_end - mxbuf.xdp.data;
}
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt, metasize);
@@ -2174,8 +2176,8 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq
if (likely(head_size))
*skb = mlx5e_skb_from_cqe_shampo(rq, wi, cqe, header_index);
else
- *skb = mlx5e_skb_from_cqe_mpwrq_nonlinear(rq, wi, cqe_bcnt, data_offset,
- page_idx);
+ *skb = mlx5e_skb_from_cqe_mpwrq_nonlinear(rq, wi, cqe, cqe_bcnt,
+ data_offset, page_idx);
if (unlikely(!*skb))
goto free_hd_entry;
@@ -2249,7 +2251,8 @@ static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cq
mlx5e_skb_from_cqe_mpwrq_linear,
mlx5e_skb_from_cqe_mpwrq_nonlinear,
mlx5e_xsk_skb_from_cqe_mpwrq_linear,
- rq, wi, cqe_bcnt, head_offset, page_idx);
+ rq, wi, cqe, cqe_bcnt, head_offset,
+ page_idx);
if (!skb)
goto mpwrq_cqe_out;
@@ -2494,7 +2497,7 @@ static void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
skb = INDIRECT_CALL_2(rq->wqe.skb_from_cqe,
mlx5e_skb_from_cqe_linear,
mlx5e_skb_from_cqe_nonlinear,
- rq, wi, cqe_bcnt);
+ rq, wi, cqe, cqe_bcnt);
if (!skb)
goto wq_free_wqe;
@@ -2586,7 +2589,7 @@ static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe
goto free_wqe;
}
- skb = mlx5e_skb_from_cqe_nonlinear(rq, wi, cqe_bcnt);
+ skb = mlx5e_skb_from_cqe_nonlinear(rq, wi, cqe, cqe_bcnt);
if (!skb)
goto free_wqe;
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 50854265864d..f60eb97e3a62 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -315,10 +315,6 @@ nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf)
NSIM_EA(bpf->extack, "xdpoffload of non-bound program");
return -EINVAL;
}
- if (!bpf_offload_dev_match(bpf->prog, ns->netdev)) {
- NSIM_EA(bpf->extack, "program bound to different dev");
- return -EINVAL;
- }
state = bpf->prog->aux->offload->dev_priv;
if (WARN_ON(strcmp(state->state, "xlated"))) {
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index dfc7d87fad59..ba3e05832843 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -116,6 +116,11 @@ static struct {
{ "peer_ifindex" },
};
+struct veth_xdp_buff {
+ struct xdp_buff xdp;
+ struct sk_buff *skb;
+};
+
static int veth_get_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings *cmd)
{
@@ -592,23 +597,25 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
if (likely(xdp_prog)) {
- struct xdp_buff xdp;
+ struct veth_xdp_buff vxbuf;
+ struct xdp_buff *xdp = &vxbuf.xdp;
u32 act;
- xdp_convert_frame_to_buff(frame, &xdp);
- xdp.rxq = &rq->xdp_rxq;
+ xdp_convert_frame_to_buff(frame, xdp);
+ xdp->rxq = &rq->xdp_rxq;
+ vxbuf.skb = NULL;
- act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
switch (act) {
case XDP_PASS:
- if (xdp_update_frame_from_buff(&xdp, frame))
+ if (xdp_update_frame_from_buff(xdp, frame))
goto err_xdp;
break;
case XDP_TX:
orig_frame = *frame;
- xdp.rxq->mem = frame->mem;
- if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
+ xdp->rxq->mem = frame->mem;
+ if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
trace_xdp_exception(rq->dev, xdp_prog, act);
frame = &orig_frame;
stats->rx_drops++;
@@ -619,8 +626,8 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
goto xdp_xmit;
case XDP_REDIRECT:
orig_frame = *frame;
- xdp.rxq->mem = frame->mem;
- if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
+ xdp->rxq->mem = frame->mem;
+ if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
frame = &orig_frame;
stats->rx_drops++;
goto err_xdp;
@@ -801,7 +808,8 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
{
void *orig_data, *orig_data_end;
struct bpf_prog *xdp_prog;
- struct xdp_buff xdp;
+ struct veth_xdp_buff vxbuf;
+ struct xdp_buff *xdp = &vxbuf.xdp;
u32 act, metalen;
int off;
@@ -815,22 +823,23 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
}
__skb_push(skb, skb->data - skb_mac_header(skb));
- if (veth_convert_skb_to_xdp_buff(rq, &xdp, &skb))
+ if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))
goto drop;
+ vxbuf.skb = skb;
- orig_data = xdp.data;
- orig_data_end = xdp.data_end;
+ orig_data = xdp->data;
+ orig_data_end = xdp->data_end;
- act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
switch (act) {
case XDP_PASS:
break;
case XDP_TX:
- veth_xdp_get(&xdp);
+ veth_xdp_get(xdp);
consume_skb(skb);
- xdp.rxq->mem = rq->xdp_mem;
- if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
+ xdp->rxq->mem = rq->xdp_mem;
+ if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
trace_xdp_exception(rq->dev, xdp_prog, act);
stats->rx_drops++;
goto err_xdp;
@@ -839,10 +848,10 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
rcu_read_unlock();
goto xdp_xmit;
case XDP_REDIRECT:
- veth_xdp_get(&xdp);
+ veth_xdp_get(xdp);
consume_skb(skb);
- xdp.rxq->mem = rq->xdp_mem;
- if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
+ xdp->rxq->mem = rq->xdp_mem;
+ if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
stats->rx_drops++;
goto err_xdp;
}
@@ -862,7 +871,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
rcu_read_unlock();
/* check if bpf_xdp_adjust_head was used */
- off = orig_data - xdp.data;
+ off = orig_data - xdp->data;
if (off > 0)
__skb_push(skb, off);
else if (off < 0)
@@ -871,21 +880,21 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
skb_reset_mac_header(skb);
/* check if bpf_xdp_adjust_tail was used */
- off = xdp.data_end - orig_data_end;
+ off = xdp->data_end - orig_data_end;
if (off != 0)
__skb_put(skb, off); /* positive on grow, negative on shrink */
/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
* (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
*/
- if (xdp_buff_has_frags(&xdp))
+ if (xdp_buff_has_frags(xdp))
skb->data_len = skb_shinfo(skb)->xdp_frags_size;
else
skb->data_len = 0;
skb->protocol = eth_type_trans(skb, rq->dev);
- metalen = xdp.data - xdp.data_meta;
+ metalen = xdp->data - xdp->data_meta;
if (metalen)
skb_metadata_set(skb, metalen);
out:
@@ -898,7 +907,7 @@ xdp_drop:
return NULL;
err_xdp:
rcu_read_unlock();
- xdp_return_buff(&xdp);
+ xdp_return_buff(xdp);
xdp_xmit:
return NULL;
}
@@ -1596,6 +1605,28 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
}
}
+static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
+{
+ struct veth_xdp_buff *_ctx = (void *)ctx;
+
+ if (!_ctx->skb)
+ return -EOPNOTSUPP;
+
+ *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
+ return 0;
+}
+
+static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash)
+{
+ struct veth_xdp_buff *_ctx = (void *)ctx;
+
+ if (!_ctx->skb)
+ return -EOPNOTSUPP;
+
+ *hash = skb_get_hash(_ctx->skb);
+ return 0;
+}
+
static const struct net_device_ops veth_netdev_ops = {
.ndo_init = veth_dev_init,
.ndo_open = veth_open,
@@ -1617,6 +1648,11 @@ static const struct net_device_ops veth_netdev_ops = {
.ndo_get_peer_dev = veth_peer_dev,
};
+static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
+ .xmo_rx_timestamp = veth_xdp_rx_timestamp,
+ .xmo_rx_hash = veth_xdp_rx_hash,
+};
+
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
@@ -1633,6 +1669,7 @@ static void veth_setup(struct net_device *dev)
dev->priv_flags |= IFF_PHONY_HEADROOM;
dev->netdev_ops = &veth_netdev_ops;
+ dev->xdp_metadata_ops = &veth_xdp_metadata_ops;
dev->ethtool_ops = &veth_ethtool_ops;
dev->features |= NETIF_F_LLTX;
dev->features |= VETH_FEATURES;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ae7771c7d750..ad4bb36d4c10 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1261,7 +1261,8 @@ struct bpf_prog_aux {
enum bpf_prog_type saved_dst_prog_type;
enum bpf_attach_type saved_dst_attach_type;
bool verifier_zext; /* Zero extensions has been inserted by verifier. */
- bool offload_requested;
+ bool dev_bound; /* Program is bound to the netdev. */
+ bool offload_requested; /* Program is bound and offloaded to the netdev. */
bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
bool func_proto_unreliable;
bool sleepable;
@@ -2451,7 +2452,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool);
int bpf_prog_offload_compile(struct bpf_prog *prog);
-void bpf_prog_offload_destroy(struct bpf_prog *prog);
+void bpf_prog_dev_bound_destroy(struct bpf_prog *prog);
int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
struct bpf_prog *prog);
@@ -2479,14 +2480,26 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev);
void unpriv_ebpf_notify(int new_state);
#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
-int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
+int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
+ struct bpf_prog_aux *prog_aux);
+void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id);
+int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr);
+int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog);
+void bpf_dev_bound_netdev_unregister(struct net_device *dev);
static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux)
{
+ return aux->dev_bound;
+}
+
+static inline bool bpf_prog_is_offloaded(const struct bpf_prog_aux *aux)
+{
return aux->offload_requested;
}
-static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
+bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs);
+
+static inline bool bpf_map_is_offloaded(struct bpf_map *map)
{
return unlikely(map->ops == &bpf_map_offload_ops);
}
@@ -2507,18 +2520,50 @@ void sock_map_unhash(struct sock *sk);
void sock_map_destroy(struct sock *sk);
void sock_map_close(struct sock *sk, long timeout);
#else
-static inline int bpf_prog_offload_init(struct bpf_prog *prog,
- union bpf_attr *attr)
+static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
+ struct bpf_prog_aux *prog_aux)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog,
+ u32 func_id)
+{
+ return NULL;
+}
+
+static inline int bpf_prog_dev_bound_init(struct bpf_prog *prog,
+ union bpf_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
{
return -EOPNOTSUPP;
}
-static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
+static inline void bpf_dev_bound_netdev_unregister(struct net_device *dev)
+{
+}
+
+static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux)
+{
+ return false;
+}
+
+static inline bool bpf_prog_is_offloaded(struct bpf_prog_aux *aux)
+{
+ return false;
+}
+
+static inline bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs)
{
return false;
}
-static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
+static inline bool bpf_map_is_offloaded(struct bpf_map *map)
{
return false;
}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index aad12a179e54..90f2be194bc5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -74,6 +74,7 @@ struct udp_tunnel_nic_info;
struct udp_tunnel_nic;
struct bpf_prog;
struct xdp_buff;
+struct xdp_md;
void synchronize_net(void);
void netdev_set_default_ethtool_ops(struct net_device *dev,
@@ -1618,6 +1619,11 @@ struct net_device_ops {
bool cycles);
};
+struct xdp_metadata_ops {
+ int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp);
+ int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash);
+};
+
/**
* enum netdev_priv_flags - &struct net_device priv_flags
*
@@ -1801,6 +1807,7 @@ enum netdev_ml_priv_type {
*
* @netdev_ops: Includes several pointers to callbacks,
* if one wants to override the ndo_*() functions
+ * @xdp_metadata_ops: Includes pointers to XDP metadata callbacks.
* @ethtool_ops: Management operations
* @l3mdev_ops: Layer 3 master device operations
* @ndisc_ops: Includes callbacks for different IPv6 neighbour
@@ -2050,6 +2057,7 @@ struct net_device {
unsigned int flags;
unsigned long long priv_flags;
const struct net_device_ops *netdev_ops;
+ const struct xdp_metadata_ops *xdp_metadata_ops;
int ifindex;
unsigned short gflags;
unsigned short hard_header_len;
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 55dbc68bfffc..91292aa13bc0 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -409,4 +409,25 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
#define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE
+#define XDP_METADATA_KFUNC_xxx \
+ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \
+ bpf_xdp_metadata_rx_timestamp) \
+ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \
+ bpf_xdp_metadata_rx_hash) \
+
+enum {
+#define XDP_METADATA_KFUNC(name, _) name,
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+MAX_XDP_METADATA_KFUNC,
+};
+
+#ifdef CONFIG_NET
+u32 bpf_xdp_metadata_kfunc_id(int id);
+bool bpf_dev_bound_kfunc_id(u32 btf_id);
+#else
+static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; }
+static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; }
+#endif
+
#endif /* __LINUX_NET_XDP_H__ */
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index f787c3f524b0..3e952e569418 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -19,8 +19,11 @@ struct xdp_sock;
struct device;
struct page;
+#define XSK_PRIV_MAX 24
+
struct xdp_buff_xsk {
struct xdp_buff xdp;
+ u8 cb[XSK_PRIV_MAX];
dma_addr_t dma;
dma_addr_t frame_dma;
struct xsk_buff_pool *pool;
@@ -28,6 +31,8 @@ struct xdp_buff_xsk {
struct list_head free_list_node;
};
+#define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb))
+
struct xsk_dma_map {
dma_addr_t *dma_pages;
struct device *dev;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index adae5b168f9d..ba0f0cfb5e42 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1156,6 +1156,11 @@ enum bpf_link_type {
*/
#define BPF_F_XDP_HAS_FRAGS (1U << 5)
+/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded
+ * program becomes device-bound but can access XDP metadata.
+ */
+#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6)
+
/* link_create.kprobe_multi.flags used in LINK_CREATE command for
* BPF_TRACE_KPROBE_MULTI attach type to create return probe.
*/
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba3fff17e2f9..16da51093aff 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2096,6 +2096,14 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
if (fp->kprobe_override)
return false;
+ /* XDP programs inserted into maps are not guaranteed to run on
+ * a particular netdev (and can run outside driver context entirely
+ * in the case of devmap and cpumap). Until device checks
+ * are implemented, prohibit adding dev-bound programs to program maps.
+ */
+ if (bpf_prog_is_dev_bound(fp->aux))
+ return false;
+
spin_lock(&map->owner.lock);
if (!map->owner.type) {
/* There's no owner yet where we could check for
@@ -2182,7 +2190,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* valid program, which in this case would simply not
* be JITed, but falls back to the interpreter.
*/
- if (!bpf_prog_is_dev_bound(fp->aux)) {
+ if (!bpf_prog_is_offloaded(fp->aux)) {
*err = bpf_prog_alloc_jited_linfo(fp);
if (*err)
return fp;
@@ -2554,7 +2562,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
- bpf_prog_offload_destroy(aux->prog);
+ bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
if (aux->prog->has_callchain_buf)
put_callchain_buffers();
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 13e4efc971e6..e87cab2ed710 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -41,7 +41,7 @@ struct bpf_offload_dev {
struct bpf_offload_netdev {
struct rhash_head l;
struct net_device *netdev;
- struct bpf_offload_dev *offdev;
+ struct bpf_offload_dev *offdev; /* NULL when bound-only */
struct list_head progs;
struct list_head maps;
struct list_head offdev_netdevs;
@@ -56,7 +56,6 @@ static const struct rhashtable_params offdevs_params = {
};
static struct rhashtable offdevs;
-static bool offdevs_inited;
static int bpf_dev_offload_check(struct net_device *netdev)
{
@@ -72,58 +71,221 @@ bpf_offload_find_netdev(struct net_device *netdev)
{
lockdep_assert_held(&bpf_devs_lock);
- if (!offdevs_inited)
- return NULL;
return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
}
-int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
{
struct bpf_offload_netdev *ondev;
- struct bpf_prog_offload *offload;
int err;
- if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
- attr->prog_type != BPF_PROG_TYPE_XDP)
- return -EINVAL;
+ ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
+ if (!ondev)
+ return -ENOMEM;
- if (attr->prog_flags)
- return -EINVAL;
+ ondev->netdev = netdev;
+ ondev->offdev = offdev;
+ INIT_LIST_HEAD(&ondev->progs);
+ INIT_LIST_HEAD(&ondev->maps);
+
+ err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
+ if (err) {
+ netdev_warn(netdev, "failed to register for BPF offload\n");
+ goto err_free;
+ }
+
+ if (offdev)
+ list_add(&ondev->offdev_netdevs, &offdev->netdevs);
+ return 0;
+
+err_free:
+ kfree(ondev);
+ return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_prog_offload *offload = prog->aux->offload;
+
+ if (offload->dev_state)
+ offload->offdev->ops->destroy(prog);
+
+ /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
+ bpf_prog_free_id(prog, true);
+
+ list_del_init(&offload->offloads);
+ kfree(offload);
+ prog->aux->offload = NULL;
+}
+
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+ enum bpf_netdev_command cmd)
+{
+ struct netdev_bpf data = {};
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ data.command = cmd;
+ data.offmap = offmap;
+ /* Caller must make sure netdev is valid */
+ netdev = offmap->netdev;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+ WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+ /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+ bpf_map_free_id(&offmap->map, true);
+ list_del_init(&offmap->offloads);
+ offmap->netdev = NULL;
+}
+
+static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev, *altdev = NULL;
+ struct bpf_offloaded_map *offmap, *mtmp;
+ struct bpf_prog_offload *offload, *ptmp;
+
+ ASSERT_RTNL();
+
+ ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+ if (WARN_ON(!ondev))
+ return;
+
+ WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
+
+ /* Try to move the objects to another netdev of the device */
+ if (offdev) {
+ list_del(&ondev->offdev_netdevs);
+ altdev = list_first_entry_or_null(&offdev->netdevs,
+ struct bpf_offload_netdev,
+ offdev_netdevs);
+ }
+
+ if (altdev) {
+ list_for_each_entry(offload, &ondev->progs, offloads)
+ offload->netdev = altdev->netdev;
+ list_splice_init(&ondev->progs, &altdev->progs);
+
+ list_for_each_entry(offmap, &ondev->maps, offloads)
+ offmap->netdev = altdev->netdev;
+ list_splice_init(&ondev->maps, &altdev->maps);
+ } else {
+ list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
+ __bpf_prog_offload_destroy(offload->prog);
+ list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
+ __bpf_map_offload_destroy(offmap);
+ }
+
+ WARN_ON(!list_empty(&ondev->progs));
+ WARN_ON(!list_empty(&ondev->maps));
+ kfree(ondev);
+}
+
+static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev;
+ struct bpf_prog_offload *offload;
+ int err;
offload = kzalloc(sizeof(*offload), GFP_USER);
if (!offload)
return -ENOMEM;
offload->prog = prog;
+ offload->netdev = netdev;
- offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
- attr->prog_ifindex);
- err = bpf_dev_offload_check(offload->netdev);
- if (err)
- goto err_maybe_put;
-
- down_write(&bpf_devs_lock);
ondev = bpf_offload_find_netdev(offload->netdev);
if (!ondev) {
- err = -EINVAL;
- goto err_unlock;
+ if (bpf_prog_is_offloaded(prog->aux)) {
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ /* When only binding to the device, explicitly
+ * create an entry in the hashtable.
+ */
+ err = __bpf_offload_dev_netdev_register(NULL, offload->netdev);
+ if (err)
+ goto err_free;
+ ondev = bpf_offload_find_netdev(offload->netdev);
}
offload->offdev = ondev->offdev;
prog->aux->offload = offload;
list_add_tail(&offload->offloads, &ondev->progs);
- dev_put(offload->netdev);
- up_write(&bpf_devs_lock);
return 0;
-err_unlock:
- up_write(&bpf_devs_lock);
-err_maybe_put:
- if (offload->netdev)
- dev_put(offload->netdev);
+err_free:
kfree(offload);
return err;
}
+int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ struct net_device *netdev;
+ int err;
+
+ if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_type != BPF_PROG_TYPE_XDP)
+ return -EINVAL;
+
+ if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY)
+ return -EINVAL;
+
+ if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)
+ return -EINVAL;
+
+ netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex);
+ if (!netdev)
+ return -EINVAL;
+
+ err = bpf_dev_offload_check(netdev);
+ if (err)
+ goto out;
+
+ prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY);
+
+ down_write(&bpf_devs_lock);
+ err = __bpf_prog_dev_bound_init(prog, netdev);
+ up_write(&bpf_devs_lock);
+
+out:
+ dev_put(netdev);
+ return err;
+}
+
+int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog)
+{
+ int err;
+
+ if (!bpf_prog_is_dev_bound(old_prog->aux))
+ return 0;
+
+ if (bpf_prog_is_offloaded(old_prog->aux))
+ return -EINVAL;
+
+ new_prog->aux->dev_bound = old_prog->aux->dev_bound;
+ new_prog->aux->offload_requested = old_prog->aux->offload_requested;
+
+ down_write(&bpf_devs_lock);
+ if (!old_prog->aux->offload) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev);
+
+out:
+ up_write(&bpf_devs_lock);
+ return err;
+}
+
int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload;
@@ -209,27 +371,25 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
up_read(&bpf_devs_lock);
}
-static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+void bpf_prog_dev_bound_destroy(struct bpf_prog *prog)
{
- struct bpf_prog_offload *offload = prog->aux->offload;
-
- if (offload->dev_state)
- offload->offdev->ops->destroy(prog);
-
- /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
- bpf_prog_free_id(prog, true);
-
- list_del_init(&offload->offloads);
- kfree(offload);
- prog->aux->offload = NULL;
-}
+ struct bpf_offload_netdev *ondev;
+ struct net_device *netdev;
-void bpf_prog_offload_destroy(struct bpf_prog *prog)
-{
+ rtnl_lock();
down_write(&bpf_devs_lock);
- if (prog->aux->offload)
+ if (prog->aux->offload) {
+ list_del_init(&prog->aux->offload->offloads);
+
+ netdev = prog->aux->offload->netdev;
__bpf_prog_offload_destroy(prog);
+
+ ondev = bpf_offload_find_netdev(netdev);
+ if (!ondev->offdev && list_empty(&ondev->progs))
+ __bpf_offload_dev_netdev_unregister(NULL, netdev);
+ }
up_write(&bpf_devs_lock);
+ rtnl_unlock();
}
static int bpf_prog_offload_translate(struct bpf_prog *prog)
@@ -343,22 +503,6 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
const struct bpf_prog_ops bpf_offload_prog_ops = {
};
-static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
- enum bpf_netdev_command cmd)
-{
- struct netdev_bpf data = {};
- struct net_device *netdev;
-
- ASSERT_RTNL();
-
- data.command = cmd;
- data.offmap = offmap;
- /* Caller must make sure netdev is valid */
- netdev = offmap->netdev;
-
- return netdev->netdev_ops->ndo_bpf(netdev, &data);
-}
-
struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
{
struct net *net = current->nsproxy->net_ns;
@@ -408,15 +552,6 @@ err_unlock:
return ERR_PTR(err);
}
-static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
-{
- WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
- /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
- bpf_map_free_id(&offmap->map, true);
- list_del_init(&offmap->offloads);
- offmap->netdev = NULL;
-}
-
void bpf_map_offload_map_free(struct bpf_map *map)
{
struct bpf_offloaded_map *offmap = map_to_offmap(map);
@@ -576,12 +711,28 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev)
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_match);
+bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs)
+{
+ bool ret;
+
+ if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux))
+ return false;
+
+ down_read(&bpf_devs_lock);
+ ret = lhs->aux->offload && rhs->aux->offload &&
+ lhs->aux->offload->netdev &&
+ lhs->aux->offload->netdev == rhs->aux->offload->netdev;
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
{
struct bpf_offloaded_map *offmap;
bool ret;
- if (!bpf_map_is_dev_bound(map))
+ if (!bpf_map_is_offloaded(map))
return bpf_map_offload_neutral(map);
offmap = map_to_offmap(map);
@@ -595,32 +746,11 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev;
int err;
- ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
- if (!ondev)
- return -ENOMEM;
-
- ondev->netdev = netdev;
- ondev->offdev = offdev;
- INIT_LIST_HEAD(&ondev->progs);
- INIT_LIST_HEAD(&ondev->maps);
-
down_write(&bpf_devs_lock);
- err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
- if (err) {
- netdev_warn(netdev, "failed to register for BPF offload\n");
- goto err_unlock_free;
- }
-
- list_add(&ondev->offdev_netdevs, &offdev->netdevs);
- up_write(&bpf_devs_lock);
- return 0;
-
-err_unlock_free:
+ err = __bpf_offload_dev_netdev_register(offdev, netdev);
up_write(&bpf_devs_lock);
- kfree(ondev);
return err;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
@@ -628,43 +758,8 @@ EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev, *altdev;
- struct bpf_offloaded_map *offmap, *mtmp;
- struct bpf_prog_offload *offload, *ptmp;
-
- ASSERT_RTNL();
-
down_write(&bpf_devs_lock);
- ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
- if (WARN_ON(!ondev))
- goto unlock;
-
- WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
- list_del(&ondev->offdev_netdevs);
-
- /* Try to move the objects to another netdev of the device */
- altdev = list_first_entry_or_null(&offdev->netdevs,
- struct bpf_offload_netdev,
- offdev_netdevs);
- if (altdev) {
- list_for_each_entry(offload, &ondev->progs, offloads)
- offload->netdev = altdev->netdev;
- list_splice_init(&ondev->progs, &altdev->progs);
-
- list_for_each_entry(offmap, &ondev->maps, offloads)
- offmap->netdev = altdev->netdev;
- list_splice_init(&ondev->maps, &altdev->maps);
- } else {
- list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
- __bpf_prog_offload_destroy(offload->prog);
- list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
- __bpf_map_offload_destroy(offmap);
- }
-
- WARN_ON(!list_empty(&ondev->progs));
- WARN_ON(!list_empty(&ondev->maps));
- kfree(ondev);
-unlock:
+ __bpf_offload_dev_netdev_unregister(offdev, netdev);
up_write(&bpf_devs_lock);
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
@@ -673,18 +768,6 @@ struct bpf_offload_dev *
bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
{
struct bpf_offload_dev *offdev;
- int err;
-
- down_write(&bpf_devs_lock);
- if (!offdevs_inited) {
- err = rhashtable_init(&offdevs, &offdevs_params);
- if (err) {
- up_write(&bpf_devs_lock);
- return ERR_PTR(err);
- }
- offdevs_inited = true;
- }
- up_write(&bpf_devs_lock);
offdev = kzalloc(sizeof(*offdev), GFP_KERNEL);
if (!offdev)
@@ -710,3 +793,67 @@ void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
return offdev->priv;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
+
+void bpf_dev_bound_netdev_unregister(struct net_device *dev)
+{
+ struct bpf_offload_netdev *ondev;
+
+ ASSERT_RTNL();
+
+ down_write(&bpf_devs_lock);
+ ondev = bpf_offload_find_netdev(dev);
+ if (ondev && !ondev->offdev)
+ __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev);
+ up_write(&bpf_devs_lock);
+}
+
+int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
+ struct bpf_prog_aux *prog_aux)
+{
+ if (!bpf_prog_is_dev_bound(prog_aux)) {
+ bpf_log(log, "metadata kfuncs require device-bound program\n");
+ return -EINVAL;
+ }
+
+ if (bpf_prog_is_offloaded(prog_aux)) {
+ bpf_log(log, "metadata kfuncs can't be offloaded\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
+{
+ const struct xdp_metadata_ops *ops;
+ void *p = NULL;
+
+ /* We don't hold bpf_devs_lock while resolving several
+ * kfuncs and can race with the unregister_netdevice().
+ * We rely on bpf_dev_bound_match() check at attach
+ * to render this program unusable.
+ */
+ down_read(&bpf_devs_lock);
+ if (!prog->aux->offload)
+ goto out;
+
+ ops = prog->aux->offload->netdev->xdp_metadata_ops;
+ if (!ops)
+ goto out;
+
+ if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP))
+ p = ops->xmo_rx_timestamp;
+ else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH))
+ p = ops->xmo_rx_hash;
+out:
+ up_read(&bpf_devs_lock);
+
+ return p;
+}
+
+static int __init bpf_offload_init(void)
+{
+ return rhashtable_init(&offdevs, &offdevs_params);
+}
+
+late_initcall(bpf_offload_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35ffd808f281..d5ffa7a01dfb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -181,7 +181,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
int err;
/* Need to create a kthread, thus must support schedule */
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
@@ -238,7 +238,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
void *ptr;
int err;
- if (bpf_map_is_dev_bound(map))
+ if (bpf_map_is_offloaded(map))
return bpf_map_offload_lookup_elem(map, key, value);
bpf_disable_instrumentation();
@@ -1483,7 +1483,7 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
} else if (IS_FD_PROG_ARRAY(map) ||
@@ -1547,7 +1547,7 @@ static int map_get_next_key(union bpf_attr *attr)
if (!next_key)
goto free_key;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_get_next_key(map, key, next_key);
goto out;
}
@@ -1605,7 +1605,7 @@ int generic_map_delete_batch(struct bpf_map *map,
map->key_size))
break;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
break;
}
@@ -1851,7 +1851,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- if (!bpf_map_is_dev_bound(map)) {
+ if (!bpf_map_is_offloaded(map)) {
bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
@@ -1944,7 +1944,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
if (!ops)
return -EINVAL;
- if (!bpf_prog_is_dev_bound(prog->aux))
+ if (!bpf_prog_is_offloaded(prog->aux))
prog->aux->ops = ops;
else
prog->aux->ops = &bpf_offload_prog_ops;
@@ -2255,7 +2255,7 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,
if (prog->type != *attach_type)
return false;
- if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
+ if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)
return false;
return true;
@@ -2491,7 +2491,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
BPF_F_TEST_STATE_FREQ |
BPF_F_SLEEPABLE |
BPF_F_TEST_RND_HI32 |
- BPF_F_XDP_HAS_FRAGS))
+ BPF_F_XDP_HAS_FRAGS |
+ BPF_F_XDP_DEV_BOUND_ONLY))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2575,7 +2576,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
prog->aux->dst_prog = dst_prog;
- prog->aux->offload_requested = !!attr->prog_ifindex;
+ prog->aux->dev_bound = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
@@ -2599,7 +2600,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
- err = bpf_prog_offload_init(prog, attr);
+ err = bpf_prog_dev_bound_init(prog, attr);
+ if (err)
+ goto free_prog_sec;
+ }
+
+ if (type == BPF_PROG_TYPE_EXT && dst_prog &&
+ bpf_prog_is_dev_bound(dst_prog->aux)) {
+ err = bpf_prog_dev_bound_inherit(prog, dst_prog);
if (err)
goto free_prog_sec;
}
@@ -3997,7 +4005,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
return -EFAULT;
}
- if (bpf_prog_is_dev_bound(prog->aux)) {
+ if (bpf_prog_is_offloaded(prog->aux)) {
err = bpf_prog_offload_info_fill(&info, prog);
if (err)
return err;
@@ -4225,7 +4233,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
}
info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_info_fill(&info, map);
if (err)
return err;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ecf7fed7881c..800488289297 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2333,6 +2333,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return -EINVAL;
}
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ err = bpf_dev_bound_kfunc_check(&env->log, prog_aux);
+ if (err)
+ return err;
+ }
+
desc = &tab->descs[tab->nr_descs++];
desc->func_id = func_id;
desc->imm = call_imm;
@@ -14099,7 +14105,7 @@ static int do_check(struct bpf_verifier_env *env)
env->prev_log_len = env->log.len_used;
}
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
err = bpf_prog_offload_verify_insn(env, env->insn_idx,
env->prev_insn_idx);
if (err)
@@ -14579,7 +14585,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
- if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+ if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
return -EINVAL;
@@ -15060,7 +15066,7 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
unsigned int orig_prog_len = env->prog->len;
int err;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_remove_insns(env, off, cnt);
err = bpf_remove_insns(env->prog, off, cnt);
@@ -15141,7 +15147,7 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
else
continue;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_replace_insn(env, i, &ja);
memcpy(insn, &ja, sizeof(ja));
@@ -15328,7 +15334,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
}
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
return 0;
insn = env->prog->insnsi + delta;
@@ -15728,7 +15734,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
int err = 0;
if (env->prog->jit_requested &&
- !bpf_prog_is_dev_bound(env->prog->aux)) {
+ !bpf_prog_is_offloaded(env->prog->aux)) {
err = jit_subprogs(env);
if (err == 0)
return 0;
@@ -15772,12 +15778,25 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
const struct bpf_kfunc_desc *desc;
+ void *xdp_kfunc;
if (!insn->imm) {
verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
return -EINVAL;
}
+ *cnt = 0;
+
+ if (bpf_dev_bound_kfunc_id(insn->imm)) {
+ xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm);
+ if (xdp_kfunc) {
+ insn->imm = BPF_CALL_IMM(xdp_kfunc);
+ return 0;
+ }
+
+ /* fallback to default kfunc when not supported by netdev */
+ }
+
/* insn->imm has the btf func_id. Replace it with
* an address (relative to __bpf_call_base).
*/
@@ -15788,7 +15807,6 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EFAULT;
}
- *cnt = 0;
insn->imm = desc->imm;
if (insn->off)
return 0;
@@ -16795,6 +16813,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (tgt_prog) {
struct bpf_prog_aux *aux = tgt_prog->aux;
+ if (bpf_prog_is_dev_bound(prog->aux) &&
+ !bpf_prog_dev_bound_match(prog, tgt_prog)) {
+ bpf_log(log, "Target program bound device mismatch");
+ return -EINVAL;
+ }
+
for (i = 0; i < aux->func_info_cnt; i++)
if (aux->func_info[i].type_id == btf_id) {
subprog = i;
@@ -17231,7 +17255,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (ret < 0)
goto skip_full_check;
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
goto skip_full_check;
@@ -17244,7 +17268,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
ret = do_check_subprogs(env);
ret = ret ?: do_check_main(env);
- if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
+ if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
skip_full_check:
@@ -17279,7 +17303,7 @@ skip_full_check:
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
*/
- if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {
ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
: false;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 2723623429ac..8da0d73b368e 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -1300,6 +1300,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES)
return -EINVAL;
+ if (bpf_prog_is_dev_bound(prog->aux))
+ return -EINVAL;
+
if (do_live) {
if (!batch_size)
batch_size = NAPI_POLL_WEIGHT;
diff --git a/net/core/dev.c b/net/core/dev.c
index cf78f35bc0b9..e66da626df84 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9224,8 +9224,12 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
return -EEXIST;
}
- if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
- NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
+ if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
+ NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
+ return -EINVAL;
+ }
+ if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
+ NL_SET_ERR_MSG(extack, "Program bound to different device");
return -EINVAL;
}
if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
@@ -10830,6 +10834,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
dev_shutdown(dev);
dev_xdp_uninstall(dev);
+ bpf_dev_bound_netdev_unregister(dev);
netdev_offload_xstats_disable_all(dev);
diff --git a/net/core/filter.c b/net/core/filter.c
index b4547a2c02f4..ed08dbf10338 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8760,7 +8760,7 @@ static bool xdp_is_valid_access(int off, int size,
}
if (type == BPF_WRITE) {
- if (bpf_prog_is_dev_bound(prog->aux)) {
+ if (bpf_prog_is_offloaded(prog->aux)) {
switch (off) {
case offsetof(struct xdp_md, rx_queue_index):
return __is_valid_xdp_access(off, size);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 844c9d99dc0e..a5a7ecf6391c 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -4,6 +4,7 @@
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
*/
#include <linux/bpf.h>
+#include <linux/btf_ids.h>
#include <linux/filter.h>
#include <linux/types.h>
#include <linux/mm.h>
@@ -709,3 +710,66 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
return nxdpf;
}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+/**
+ * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp.
+ * @ctx: XDP context pointer.
+ * @timestamp: Return value pointer.
+ *
+ * Returns 0 on success or ``-errno`` on error.
+ */
+int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
+{
+ return -EOPNOTSUPP;
+}
+
+/**
+ * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash.
+ * @ctx: XDP context pointer.
+ * @hash: Return value pointer.
+ *
+ * Returns 0 on success or ``-errno`` on error.
+ */
+int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash)
+{
+ return -EOPNOTSUPP;
+}
+
+__diag_pop();
+
+BTF_SET8_START(xdp_metadata_kfunc_ids)
+#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, 0)
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+BTF_SET8_END(xdp_metadata_kfunc_ids)
+
+static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &xdp_metadata_kfunc_ids,
+};
+
+BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted)
+#define XDP_METADATA_KFUNC(name, str) BTF_ID(func, str)
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
+u32 bpf_xdp_metadata_kfunc_id(int id)
+{
+ /* xdp_metadata_kfunc_ids is sorted and can't be used */
+ return xdp_metadata_kfunc_ids_unsorted[id];
+}
+
+bool bpf_dev_bound_kfunc_id(u32 btf_id)
+{
+ return btf_id_set8_contains(&xdp_metadata_kfunc_ids, btf_id);
+}
+
+static int __init xdp_metadata_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set);
+}
+late_initcall(xdp_metadata_init);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 142b81bcbb2e..7f024ac22edd 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1156,6 +1156,11 @@ enum bpf_link_type {
*/
#define BPF_F_XDP_HAS_FRAGS (1U << 5)
+/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded
+ * program becomes device-bound but can access XDP metadata.
+ */
+#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6)
+
/* link_create.kprobe_multi.flags used in LINK_CREATE command for
* BPF_TRACE_KPROBE_MULTI attach type to create return probe.
*/
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 401a75844cc0..4aa5bba956ff 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -47,3 +47,4 @@ test_cpp
xskxceiver
xdp_redirect_multi
xdp_synproxy
+xdp_hw_metadata
diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index 96e8371f5c2a..b9ef9cc61d36 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -86,5 +86,6 @@ xdp_adjust_tail # case-128 err 0 errno 28 retval 1 size
xdp_bonding # failed to auto-attach program 'trace_on_entry': -524 (trampoline)
xdp_bpf2bpf # failed to auto-attach program 'trace_on_entry': -524 (trampoline)
xdp_do_redirect # prog_run_max_size unexpected error: -22 (errno 22)
+xdp_metadata # JIT does not support calling kernel function (kfunc)
xdp_synproxy # JIT does not support calling kernel function (kfunc)
xfrm_info # JIT does not support calling kernel function (kfunc)
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 22533a18705e..a554b41fe872 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -83,7 +83,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
- xskxceiver xdp_redirect_multi xdp_synproxy veristat
+ xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata
TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/sign-file
TEST_GEN_FILES += liburandom_read.so
@@ -383,6 +383,7 @@ test_subskeleton.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton_lib
test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton_lib.bpf.o
test_usdt.skel.h-deps := test_usdt.bpf.o test_usdt_multispec.bpf.o
xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o
+xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o
LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps)))
@@ -527,7 +528,7 @@ TRUNNER_BPF_PROGS_DIR := progs
TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \
network_helpers.c testing_helpers.c \
btf_helpers.c flow_dissector_load.h \
- cap_helpers.c test_loader.c
+ cap_helpers.c test_loader.c xsk.c
TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \
$(OUTPUT)/liburandom_read.so \
$(OUTPUT)/xdp_synproxy \
@@ -580,6 +581,10 @@ $(OUTPUT)/xskxceiver: xskxceiver.c $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.
$(call msg,BINARY,,$@)
$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
+$(OUTPUT)/xdp_hw_metadata: xdp_hw_metadata.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xdp_hw_metadata.skel.h | $(OUTPUT)
+ $(call msg,BINARY,,$@)
+ $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
+
# Make sure we are able to include and link libbpf against c++.
$(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ)
$(call msg,CXX,,$@)
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
new file mode 100644
index 000000000000..e033d48288c0
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "xdp_metadata.skel.h"
+#include "xdp_metadata2.skel.h"
+#include "xdp_metadata.h"
+#include "xsk.h"
+
+#include <bpf/btf.h>
+#include <linux/errqueue.h>
+#include <linux/if_link.h>
+#include <linux/net_tstamp.h>
+#include <linux/udp.h>
+#include <sys/mman.h>
+#include <net/if.h>
+#include <poll.h>
+
+#define TX_NAME "veTX"
+#define RX_NAME "veRX"
+
+#define UDP_PAYLOAD_BYTES 4
+
+#define AF_XDP_SOURCE_PORT 1234
+#define AF_XDP_CONSUMER_PORT 8080
+
+#define UMEM_NUM 16
+#define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
+#define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM)
+#define XDP_FLAGS XDP_FLAGS_DRV_MODE
+#define QUEUE_ID 0
+
+#define TX_ADDR "10.0.0.1"
+#define RX_ADDR "10.0.0.2"
+#define PREFIX_LEN "8"
+#define FAMILY AF_INET
+
+#define SYS(cmd) ({ \
+ if (!ASSERT_OK(system(cmd), (cmd))) \
+ goto out; \
+})
+
+struct xsk {
+ void *umem_area;
+ struct xsk_umem *umem;
+ struct xsk_ring_prod fill;
+ struct xsk_ring_cons comp;
+ struct xsk_ring_prod tx;
+ struct xsk_ring_cons rx;
+ struct xsk_socket *socket;
+};
+
+static int open_xsk(int ifindex, struct xsk *xsk)
+{
+ int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
+ const struct xsk_socket_config socket_config = {
+ .rx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ .bind_flags = XDP_COPY,
+ };
+ const struct xsk_umem_config umem_config = {
+ .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+ .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
+ .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
+ };
+ __u32 idx;
+ u64 addr;
+ int ret;
+ int i;
+
+ xsk->umem_area = mmap(NULL, UMEM_SIZE, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
+ if (!ASSERT_NEQ(xsk->umem_area, MAP_FAILED, "mmap"))
+ return -1;
+
+ ret = xsk_umem__create(&xsk->umem,
+ xsk->umem_area, UMEM_SIZE,
+ &xsk->fill,
+ &xsk->comp,
+ &umem_config);
+ if (!ASSERT_OK(ret, "xsk_umem__create"))
+ return ret;
+
+ ret = xsk_socket__create(&xsk->socket, ifindex, QUEUE_ID,
+ xsk->umem,
+ &xsk->rx,
+ &xsk->tx,
+ &socket_config);
+ if (!ASSERT_OK(ret, "xsk_socket__create"))
+ return ret;
+
+ /* First half of umem is for TX. This way address matches 1-to-1
+ * to the completion queue index.
+ */
+
+ for (i = 0; i < UMEM_NUM / 2; i++) {
+ addr = i * UMEM_FRAME_SIZE;
+ printf("%p: tx_desc[%d] -> %lx\n", xsk, i, addr);
+ }
+
+ /* Second half of umem is for RX. */
+
+ ret = xsk_ring_prod__reserve(&xsk->fill, UMEM_NUM / 2, &idx);
+ if (!ASSERT_EQ(UMEM_NUM / 2, ret, "xsk_ring_prod__reserve"))
+ return ret;
+ if (!ASSERT_EQ(idx, 0, "fill idx != 0"))
+ return -1;
+
+ for (i = 0; i < UMEM_NUM / 2; i++) {
+ addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE;
+ printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr);
+ *xsk_ring_prod__fill_addr(&xsk->fill, i) = addr;
+ }
+ xsk_ring_prod__submit(&xsk->fill, ret);
+
+ return 0;
+}
+
+static void close_xsk(struct xsk *xsk)
+{
+ if (xsk->umem)
+ xsk_umem__delete(xsk->umem);
+ if (xsk->socket)
+ xsk_socket__delete(xsk->socket);
+ munmap(xsk->umem, UMEM_SIZE);
+}
+
+static void ip_csum(struct iphdr *iph)
+{
+ __u32 sum = 0;
+ __u16 *p;
+ int i;
+
+ iph->check = 0;
+ p = (void *)iph;
+ for (i = 0; i < sizeof(*iph) / sizeof(*p); i++)
+ sum += p[i];
+
+ while (sum >> 16)
+ sum = (sum & 0xffff) + (sum >> 16);
+
+ iph->check = ~sum;
+}
+
+static int generate_packet(struct xsk *xsk, __u16 dst_port)
+{
+ struct xdp_desc *tx_desc;
+ struct udphdr *udph;
+ struct ethhdr *eth;
+ struct iphdr *iph;
+ void *data;
+ __u32 idx;
+ int ret;
+
+ ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx);
+ if (!ASSERT_EQ(ret, 1, "xsk_ring_prod__reserve"))
+ return -1;
+
+ tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
+ tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE;
+ printf("%p: tx_desc[%u]->addr=%llx\n", xsk, idx, tx_desc->addr);
+ data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);
+
+ eth = data;
+ iph = (void *)(eth + 1);
+ udph = (void *)(iph + 1);
+
+ memcpy(eth->h_dest, "\x00\x00\x00\x00\x00\x02", ETH_ALEN);
+ memcpy(eth->h_source, "\x00\x00\x00\x00\x00\x01", ETH_ALEN);
+ eth->h_proto = htons(ETH_P_IP);
+
+ iph->version = 0x4;
+ iph->ihl = 0x5;
+ iph->tos = 0x9;
+ iph->tot_len = htons(sizeof(*iph) + sizeof(*udph) + UDP_PAYLOAD_BYTES);
+ iph->id = 0;
+ iph->frag_off = 0;
+ iph->ttl = 0;
+ iph->protocol = IPPROTO_UDP;
+ ASSERT_EQ(inet_pton(FAMILY, TX_ADDR, &iph->saddr), 1, "inet_pton(TX_ADDR)");
+ ASSERT_EQ(inet_pton(FAMILY, RX_ADDR, &iph->daddr), 1, "inet_pton(RX_ADDR)");
+ ip_csum(iph);
+
+ udph->source = htons(AF_XDP_SOURCE_PORT);
+ udph->dest = htons(dst_port);
+ udph->len = htons(sizeof(*udph) + UDP_PAYLOAD_BYTES);
+ udph->check = 0;
+
+ memset(udph + 1, 0xAA, UDP_PAYLOAD_BYTES);
+
+ tx_desc->len = sizeof(*eth) + sizeof(*iph) + sizeof(*udph) + UDP_PAYLOAD_BYTES;
+ xsk_ring_prod__submit(&xsk->tx, 1);
+
+ ret = sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0);
+ if (!ASSERT_GE(ret, 0, "sendto"))
+ return ret;
+
+ return 0;
+}
+
+static void complete_tx(struct xsk *xsk)
+{
+ __u32 idx;
+ __u64 addr;
+
+ if (ASSERT_EQ(xsk_ring_cons__peek(&xsk->comp, 1, &idx), 1, "xsk_ring_cons__peek")) {
+ addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
+
+ printf("%p: refill idx=%u addr=%llx\n", xsk, idx, addr);
+ *xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
+ xsk_ring_prod__submit(&xsk->fill, 1);
+ }
+}
+
+static void refill_rx(struct xsk *xsk, __u64 addr)
+{
+ __u32 idx;
+
+ if (ASSERT_EQ(xsk_ring_prod__reserve(&xsk->fill, 1, &idx), 1, "xsk_ring_prod__reserve")) {
+ printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr);
+ *xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
+ xsk_ring_prod__submit(&xsk->fill, 1);
+ }
+}
+
+static int verify_xsk_metadata(struct xsk *xsk)
+{
+ const struct xdp_desc *rx_desc;
+ struct pollfd fds = {};
+ struct xdp_meta *meta;
+ struct ethhdr *eth;
+ struct iphdr *iph;
+ __u64 comp_addr;
+ void *data;
+ __u64 addr;
+ __u32 idx;
+ int ret;
+
+ ret = recvfrom(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, NULL);
+ if (!ASSERT_EQ(ret, 0, "recvfrom"))
+ return -1;
+
+ fds.fd = xsk_socket__fd(xsk->socket);
+ fds.events = POLLIN;
+
+ ret = poll(&fds, 1, 1000);
+ if (!ASSERT_GT(ret, 0, "poll"))
+ return -1;
+
+ ret = xsk_ring_cons__peek(&xsk->rx, 1, &idx);
+ if (!ASSERT_EQ(ret, 1, "xsk_ring_cons__peek"))
+ return -2;
+
+ rx_desc = xsk_ring_cons__rx_desc(&xsk->rx, idx);
+ comp_addr = xsk_umem__extract_addr(rx_desc->addr);
+ addr = xsk_umem__add_offset_to_addr(rx_desc->addr);
+ printf("%p: rx_desc[%u]->addr=%llx addr=%llx comp_addr=%llx\n",
+ xsk, idx, rx_desc->addr, addr, comp_addr);
+ data = xsk_umem__get_data(xsk->umem_area, addr);
+
+ /* Make sure we got the packet offset correctly. */
+
+ eth = data;
+ ASSERT_EQ(eth->h_proto, htons(ETH_P_IP), "eth->h_proto");
+ iph = (void *)(eth + 1);
+ ASSERT_EQ((int)iph->version, 4, "iph->version");
+
+ /* custom metadata */
+
+ meta = data - sizeof(struct xdp_meta);
+
+ if (!ASSERT_NEQ(meta->rx_timestamp, 0, "rx_timestamp"))
+ return -1;
+
+ if (!ASSERT_NEQ(meta->rx_hash, 0, "rx_hash"))
+ return -1;
+
+ xsk_ring_cons__release(&xsk->rx, 1);
+ refill_rx(xsk, comp_addr);
+
+ return 0;
+}
+
+void test_xdp_metadata(void)
+{
+ struct xdp_metadata2 *bpf_obj2 = NULL;
+ struct xdp_metadata *bpf_obj = NULL;
+ struct bpf_program *new_prog, *prog;
+ struct nstoken *tok = NULL;
+ __u32 queue_id = QUEUE_ID;
+ struct bpf_map *prog_arr;
+ struct xsk tx_xsk = {};
+ struct xsk rx_xsk = {};
+ __u32 val, key = 0;
+ int retries = 10;
+ int rx_ifindex;
+ int tx_ifindex;
+ int sock_fd;
+ int ret;
+
+ /* Setup new networking namespace, with a veth pair. */
+
+ SYS("ip netns add xdp_metadata");
+ tok = open_netns("xdp_metadata");
+ SYS("ip link add numtxqueues 1 numrxqueues 1 " TX_NAME
+ " type veth peer " RX_NAME " numtxqueues 1 numrxqueues 1");
+ SYS("ip link set dev " TX_NAME " address 00:00:00:00:00:01");
+ SYS("ip link set dev " RX_NAME " address 00:00:00:00:00:02");
+ SYS("ip link set dev " TX_NAME " up");
+ SYS("ip link set dev " RX_NAME " up");
+ SYS("ip addr add " TX_ADDR "/" PREFIX_LEN " dev " TX_NAME);
+ SYS("ip addr add " RX_ADDR "/" PREFIX_LEN " dev " RX_NAME);
+
+ rx_ifindex = if_nametoindex(RX_NAME);
+ tx_ifindex = if_nametoindex(TX_NAME);
+
+ /* Setup separate AF_XDP for TX and RX interfaces. */
+
+ ret = open_xsk(tx_ifindex, &tx_xsk);
+ if (!ASSERT_OK(ret, "open_xsk(TX_NAME)"))
+ goto out;
+
+ ret = open_xsk(rx_ifindex, &rx_xsk);
+ if (!ASSERT_OK(ret, "open_xsk(RX_NAME)"))
+ goto out;
+
+ bpf_obj = xdp_metadata__open();
+ if (!ASSERT_OK_PTR(bpf_obj, "open skeleton"))
+ goto out;
+
+ prog = bpf_object__find_program_by_name(bpf_obj->obj, "rx");
+ bpf_program__set_ifindex(prog, rx_ifindex);
+ bpf_program__set_flags(prog, BPF_F_XDP_DEV_BOUND_ONLY);
+
+ if (!ASSERT_OK(xdp_metadata__load(bpf_obj), "load skeleton"))
+ goto out;
+
+ /* Make sure we can't add dev-bound programs to prog maps. */
+ prog_arr = bpf_object__find_map_by_name(bpf_obj->obj, "prog_arr");
+ if (!ASSERT_OK_PTR(prog_arr, "no prog_arr map"))
+ goto out;
+
+ val = bpf_program__fd(prog);
+ if (!ASSERT_ERR(bpf_map__update_elem(prog_arr, &key, sizeof(key),
+ &val, sizeof(val), BPF_ANY),
+ "update prog_arr"))
+ goto out;
+
+ /* Attach BPF program to RX interface. */
+
+ ret = bpf_xdp_attach(rx_ifindex,
+ bpf_program__fd(bpf_obj->progs.rx),
+ XDP_FLAGS, NULL);
+ if (!ASSERT_GE(ret, 0, "bpf_xdp_attach"))
+ goto out;
+
+ sock_fd = xsk_socket__fd(rx_xsk.socket);
+ ret = bpf_map_update_elem(bpf_map__fd(bpf_obj->maps.xsk), &queue_id, &sock_fd, 0);
+ if (!ASSERT_GE(ret, 0, "bpf_map_update_elem"))
+ goto out;
+
+ /* Send packet destined to RX AF_XDP socket. */
+ if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0,
+ "generate AF_XDP_CONSUMER_PORT"))
+ goto out;
+
+ /* Verify AF_XDP RX packet has proper metadata. */
+ if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk), 0,
+ "verify_xsk_metadata"))
+ goto out;
+
+ complete_tx(&tx_xsk);
+
+ /* Make sure freplace correctly picks up original bound device
+ * and doesn't crash.
+ */
+
+ bpf_obj2 = xdp_metadata2__open();
+ if (!ASSERT_OK_PTR(bpf_obj2, "open skeleton"))
+ goto out;
+
+ new_prog = bpf_object__find_program_by_name(bpf_obj2->obj, "freplace_rx");
+ bpf_program__set_attach_target(new_prog, bpf_program__fd(prog), "rx");
+
+ if (!ASSERT_OK(xdp_metadata2__load(bpf_obj2), "load freplace skeleton"))
+ goto out;
+
+ if (!ASSERT_OK(xdp_metadata2__attach(bpf_obj2), "attach freplace"))
+ goto out;
+
+ /* Send packet to trigger . */
+ if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0,
+ "generate freplace packet"))
+ goto out;
+
+ while (!retries--) {
+ if (bpf_obj2->bss->called)
+ break;
+ usleep(10);
+ }
+ ASSERT_GT(bpf_obj2->bss->called, 0, "not called");
+
+out:
+ close_xsk(&rx_xsk);
+ close_xsk(&tx_xsk);
+ xdp_metadata2__destroy(bpf_obj2);
+ xdp_metadata__destroy(bpf_obj);
+ if (tok)
+ close_netns(tok);
+ system("ip netns del xdp_metadata");
+}
diff --git a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c
new file mode 100644
index 000000000000..25b8178735ee
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include "xdp_metadata.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_XSKMAP);
+ __uint(max_entries, 256);
+ __type(key, __u32);
+ __type(value, __u32);
+} xsk SEC(".maps");
+
+extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx,
+ __u64 *timestamp) __ksym;
+extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx,
+ __u32 *hash) __ksym;
+
+SEC("xdp")
+int rx(struct xdp_md *ctx)
+{
+ void *data, *data_meta, *data_end;
+ struct ipv6hdr *ip6h = NULL;
+ struct ethhdr *eth = NULL;
+ struct udphdr *udp = NULL;
+ struct iphdr *iph = NULL;
+ struct xdp_meta *meta;
+ int ret;
+
+ data = (void *)(long)ctx->data;
+ data_end = (void *)(long)ctx->data_end;
+ eth = data;
+ if (eth + 1 < data_end) {
+ if (eth->h_proto == bpf_htons(ETH_P_IP)) {
+ iph = (void *)(eth + 1);
+ if (iph + 1 < data_end && iph->protocol == IPPROTO_UDP)
+ udp = (void *)(iph + 1);
+ }
+ if (eth->h_proto == bpf_htons(ETH_P_IPV6)) {
+ ip6h = (void *)(eth + 1);
+ if (ip6h + 1 < data_end && ip6h->nexthdr == IPPROTO_UDP)
+ udp = (void *)(ip6h + 1);
+ }
+ if (udp && udp + 1 > data_end)
+ udp = NULL;
+ }
+
+ if (!udp)
+ return XDP_PASS;
+
+ if (udp->dest != bpf_htons(9091))
+ return XDP_PASS;
+
+ bpf_printk("forwarding UDP:9091 to AF_XDP");
+
+ ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(struct xdp_meta));
+ if (ret != 0) {
+ bpf_printk("bpf_xdp_adjust_meta returned %d", ret);
+ return XDP_PASS;
+ }
+
+ data = (void *)(long)ctx->data;
+ data_meta = (void *)(long)ctx->data_meta;
+ meta = data_meta;
+
+ if (meta + 1 > data) {
+ bpf_printk("bpf_xdp_adjust_meta doesn't appear to work");
+ return XDP_PASS;
+ }
+
+ if (!bpf_xdp_metadata_rx_timestamp(ctx, &meta->rx_timestamp))
+ bpf_printk("populated rx_timestamp with %u", meta->rx_timestamp);
+
+ if (!bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash))
+ bpf_printk("populated rx_hash with %u", meta->rx_hash);
+
+ return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata.c b/tools/testing/selftests/bpf/progs/xdp_metadata.c
new file mode 100644
index 000000000000..77678b034389
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_metadata.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include "xdp_metadata.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_XSKMAP);
+ __uint(max_entries, 4);
+ __type(key, __u32);
+ __type(value, __u32);
+} xsk SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u32);
+} prog_arr SEC(".maps");
+
+extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx,
+ __u64 *timestamp) __ksym;
+extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx,
+ __u32 *hash) __ksym;
+
+SEC("xdp")
+int rx(struct xdp_md *ctx)
+{
+ void *data, *data_meta;
+ struct xdp_meta *meta;
+ u64 timestamp = -1;
+ int ret;
+
+ /* Reserve enough for all custom metadata. */
+
+ ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(struct xdp_meta));
+ if (ret != 0)
+ return XDP_DROP;
+
+ data = (void *)(long)ctx->data;
+ data_meta = (void *)(long)ctx->data_meta;
+
+ if (data_meta + sizeof(struct xdp_meta) > data)
+ return XDP_DROP;
+
+ meta = data_meta;
+
+ /* Export metadata. */
+
+ /* We expect veth bpf_xdp_metadata_rx_timestamp to return 0 HW
+ * timestamp, so put some non-zero value into AF_XDP frame for
+ * the userspace.
+ */
+ bpf_xdp_metadata_rx_timestamp(ctx, &timestamp);
+ if (timestamp == 0)
+ meta->rx_timestamp = 1;
+
+ bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash);
+
+ return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata2.c b/tools/testing/selftests/bpf/progs/xdp_metadata2.c
new file mode 100644
index 000000000000..cf69d05451c3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_metadata2.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include "xdp_metadata.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx,
+ __u32 *hash) __ksym;
+
+int called;
+
+SEC("freplace/rx")
+int freplace_rx(struct xdp_md *ctx)
+{
+ u32 hash = 0;
+ /* Call _any_ metadata function to make sure we don't crash. */
+ bpf_xdp_metadata_rx_hash(ctx, &hash);
+ called++;
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py
index 7cb1bc05e5cf..40cba8d368d9 100755
--- a/tools/testing/selftests/bpf/test_offload.py
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -1039,7 +1039,7 @@ try:
offload = bpf_pinned("/sys/fs/bpf/offload")
ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True)
fail(ret == 0, "attached offloaded XDP program to drv")
- check_extack(err, "Using device-bound program without HW_MODE flag is not supported.", args)
+ check_extack(err, "Using offloaded program without HW_MODE flag is not supported.", args)
rm("/sys/fs/bpf/offload")
sim.wait_for_flush()
@@ -1088,12 +1088,12 @@ try:
ret, _, err = sim.set_xdp(pinned, "offload",
fail=False, include_stderr=True)
fail(ret == 0, "Pinned program loaded for a different device accepted")
- check_extack_nsim(err, "program bound to different dev.", args)
+ check_extack(err, "Program bound to different device.", args)
simdev2.remove()
ret, _, err = sim.set_xdp(pinned, "offload",
fail=False, include_stderr=True)
fail(ret == 0, "Pinned program loaded for a removed device accepted")
- check_extack_nsim(err, "xdpoffload of non-bound program.", args)
+ check_extack(err, "Program bound to different device.", args)
rm(pin_file)
bpftool_prog_list_wait(expected=0)
@@ -1334,12 +1334,12 @@ try:
ret, _, err = simA.set_xdp(progB, "offload", force=True, JSON=False,
fail=False, include_stderr=True)
fail(ret == 0, "cross-ASIC program allowed")
- check_extack_nsim(err, "program bound to different dev.", args)
+ check_extack(err, "Program bound to different device.", args)
for d in simdevB.nsims:
ret, _, err = d.set_xdp(progA, "offload", force=True, JSON=False,
fail=False, include_stderr=True)
fail(ret == 0, "cross-ASIC program allowed")
- check_extack_nsim(err, "program bound to different dev.", args)
+ check_extack(err, "Program bound to different device.", args)
start_test("Test multi-dev ASIC cross-dev map reuse...")
diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
new file mode 100644
index 000000000000..0008f0f239e8
--- /dev/null
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -0,0 +1,403 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Reference program for verifying XDP metadata on real HW. Functional test
+ * only, doesn't test the performance.
+ *
+ * RX:
+ * - UDP 9091 packets are diverted into AF_XDP
+ * - Metadata verified:
+ * - rx_timestamp
+ * - rx_hash
+ *
+ * TX:
+ * - TBD
+ */
+
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "xdp_hw_metadata.skel.h"
+#include "xsk.h"
+
+#include <error.h>
+#include <linux/errqueue.h>
+#include <linux/if_link.h>
+#include <linux/net_tstamp.h>
+#include <linux/udp.h>
+#include <linux/sockios.h>
+#include <sys/mman.h>
+#include <net/if.h>
+#include <poll.h>
+
+#include "xdp_metadata.h"
+
+#define UMEM_NUM 16
+#define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
+#define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM)
+#define XDP_FLAGS (XDP_FLAGS_DRV_MODE | XDP_FLAGS_REPLACE)
+
+struct xsk {
+ void *umem_area;
+ struct xsk_umem *umem;
+ struct xsk_ring_prod fill;
+ struct xsk_ring_cons comp;
+ struct xsk_ring_prod tx;
+ struct xsk_ring_cons rx;
+ struct xsk_socket *socket;
+};
+
+struct xdp_hw_metadata *bpf_obj;
+struct xsk *rx_xsk;
+const char *ifname;
+int ifindex;
+int rxq;
+
+void test__fail(void) { /* for network_helpers.c */ }
+
+static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
+{
+ int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
+ const struct xsk_socket_config socket_config = {
+ .rx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ .bind_flags = XDP_COPY,
+ };
+ const struct xsk_umem_config umem_config = {
+ .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+ .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
+ .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
+ };
+ __u32 idx;
+ u64 addr;
+ int ret;
+ int i;
+
+ xsk->umem_area = mmap(NULL, UMEM_SIZE, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
+ if (xsk->umem_area == MAP_FAILED)
+ return -ENOMEM;
+
+ ret = xsk_umem__create(&xsk->umem,
+ xsk->umem_area, UMEM_SIZE,
+ &xsk->fill,
+ &xsk->comp,
+ &umem_config);
+ if (ret)
+ return ret;
+
+ ret = xsk_socket__create(&xsk->socket, ifindex, queue_id,
+ xsk->umem,
+ &xsk->rx,
+ &xsk->tx,
+ &socket_config);
+ if (ret)
+ return ret;
+
+ /* First half of umem is for TX. This way address matches 1-to-1
+ * to the completion queue index.
+ */
+
+ for (i = 0; i < UMEM_NUM / 2; i++) {
+ addr = i * UMEM_FRAME_SIZE;
+ printf("%p: tx_desc[%d] -> %lx\n", xsk, i, addr);
+ }
+
+ /* Second half of umem is for RX. */
+
+ ret = xsk_ring_prod__reserve(&xsk->fill, UMEM_NUM / 2, &idx);
+ for (i = 0; i < UMEM_NUM / 2; i++) {
+ addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE;
+ printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr);
+ *xsk_ring_prod__fill_addr(&xsk->fill, i) = addr;
+ }
+ xsk_ring_prod__submit(&xsk->fill, ret);
+
+ return 0;
+}
+
+static void close_xsk(struct xsk *xsk)
+{
+ if (xsk->umem)
+ xsk_umem__delete(xsk->umem);
+ if (xsk->socket)
+ xsk_socket__delete(xsk->socket);
+ munmap(xsk->umem, UMEM_SIZE);
+}
+
+static void refill_rx(struct xsk *xsk, __u64 addr)
+{
+ __u32 idx;
+
+ if (xsk_ring_prod__reserve(&xsk->fill, 1, &idx) == 1) {
+ printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr);
+ *xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
+ xsk_ring_prod__submit(&xsk->fill, 1);
+ }
+}
+
+static void verify_xdp_metadata(void *data)
+{
+ struct xdp_meta *meta;
+
+ meta = data - sizeof(*meta);
+
+ printf("rx_timestamp: %llu\n", meta->rx_timestamp);
+ printf("rx_hash: %u\n", meta->rx_hash);
+}
+
+static void verify_skb_metadata(int fd)
+{
+ char cmsg_buf[1024];
+ char packet_buf[128];
+
+ struct scm_timestamping *ts;
+ struct iovec packet_iov;
+ struct cmsghdr *cmsg;
+ struct msghdr hdr;
+
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.msg_iov = &packet_iov;
+ hdr.msg_iovlen = 1;
+ packet_iov.iov_base = packet_buf;
+ packet_iov.iov_len = sizeof(packet_buf);
+
+ hdr.msg_control = cmsg_buf;
+ hdr.msg_controllen = sizeof(cmsg_buf);
+
+ if (recvmsg(fd, &hdr, 0) < 0)
+ error(-1, errno, "recvmsg");
+
+ for (cmsg = CMSG_FIRSTHDR(&hdr); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&hdr, cmsg)) {
+
+ if (cmsg->cmsg_level != SOL_SOCKET)
+ continue;
+
+ switch (cmsg->cmsg_type) {
+ case SCM_TIMESTAMPING:
+ ts = (struct scm_timestamping *)CMSG_DATA(cmsg);
+ if (ts->ts[2].tv_sec || ts->ts[2].tv_nsec) {
+ printf("found skb hwtstamp = %lu.%lu\n",
+ ts->ts[2].tv_sec, ts->ts[2].tv_nsec);
+ return;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ printf("skb hwtstamp is not found!\n");
+}
+
+static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd)
+{
+ const struct xdp_desc *rx_desc;
+ struct pollfd fds[rxq + 1];
+ __u64 comp_addr;
+ __u64 addr;
+ __u32 idx;
+ int ret;
+ int i;
+
+ for (i = 0; i < rxq; i++) {
+ fds[i].fd = xsk_socket__fd(rx_xsk[i].socket);
+ fds[i].events = POLLIN;
+ fds[i].revents = 0;
+ }
+
+ fds[rxq].fd = server_fd;
+ fds[rxq].events = POLLIN;
+ fds[rxq].revents = 0;
+
+ while (true) {
+ errno = 0;
+ ret = poll(fds, rxq + 1, 1000);
+ printf("poll: %d (%d)\n", ret, errno);
+ if (ret < 0)
+ break;
+ if (ret == 0)
+ continue;
+
+ if (fds[rxq].revents)
+ verify_skb_metadata(server_fd);
+
+ for (i = 0; i < rxq; i++) {
+ if (fds[i].revents == 0)
+ continue;
+
+ struct xsk *xsk = &rx_xsk[i];
+
+ ret = xsk_ring_cons__peek(&xsk->rx, 1, &idx);
+ printf("xsk_ring_cons__peek: %d\n", ret);
+ if (ret != 1)
+ continue;
+
+ rx_desc = xsk_ring_cons__rx_desc(&xsk->rx, idx);
+ comp_addr = xsk_umem__extract_addr(rx_desc->addr);
+ addr = xsk_umem__add_offset_to_addr(rx_desc->addr);
+ printf("%p: rx_desc[%u]->addr=%llx addr=%llx comp_addr=%llx\n",
+ xsk, idx, rx_desc->addr, addr, comp_addr);
+ verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr));
+ xsk_ring_cons__release(&xsk->rx, 1);
+ refill_rx(xsk, comp_addr);
+ }
+ }
+
+ return 0;
+}
+
+struct ethtool_channels {
+ __u32 cmd;
+ __u32 max_rx;
+ __u32 max_tx;
+ __u32 max_other;
+ __u32 max_combined;
+ __u32 rx_count;
+ __u32 tx_count;
+ __u32 other_count;
+ __u32 combined_count;
+};
+
+#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */
+
+static int rxq_num(const char *ifname)
+{
+ struct ethtool_channels ch = {
+ .cmd = ETHTOOL_GCHANNELS,
+ };
+
+ struct ifreq ifr = {
+ .ifr_data = (void *)&ch,
+ };
+ strcpy(ifr.ifr_name, ifname);
+ int fd, ret;
+
+ fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+ if (fd < 0)
+ error(-1, errno, "socket");
+
+ ret = ioctl(fd, SIOCETHTOOL, &ifr);
+ if (ret < 0)
+ error(-1, errno, "socket");
+
+ close(fd);
+
+ return ch.rx_count + ch.combined_count;
+}
+
+static void cleanup(void)
+{
+ LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
+ int ret;
+ int i;
+
+ if (bpf_obj) {
+ opts.old_prog_fd = bpf_program__fd(bpf_obj->progs.rx);
+ if (opts.old_prog_fd >= 0) {
+ printf("detaching bpf program....\n");
+ ret = bpf_xdp_detach(ifindex, XDP_FLAGS, &opts);
+ if (ret)
+ printf("failed to detach XDP program: %d\n", ret);
+ }
+ }
+
+ for (i = 0; i < rxq; i++)
+ close_xsk(&rx_xsk[i]);
+
+ if (bpf_obj)
+ xdp_hw_metadata__destroy(bpf_obj);
+}
+
+static void handle_signal(int sig)
+{
+ /* interrupting poll() is all we need */
+}
+
+static void timestamping_enable(int fd, int val)
+{
+ int ret;
+
+ ret = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
+ if (ret < 0)
+ error(-1, errno, "setsockopt(SO_TIMESTAMPING)");
+}
+
+int main(int argc, char *argv[])
+{
+ int server_fd = -1;
+ int ret;
+ int i;
+
+ struct bpf_program *prog;
+
+ if (argc != 2) {
+ fprintf(stderr, "pass device name\n");
+ return -1;
+ }
+
+ ifname = argv[1];
+ ifindex = if_nametoindex(ifname);
+ rxq = rxq_num(ifname);
+
+ printf("rxq: %d\n", rxq);
+
+ rx_xsk = malloc(sizeof(struct xsk) * rxq);
+ if (!rx_xsk)
+ error(-1, ENOMEM, "malloc");
+
+ for (i = 0; i < rxq; i++) {
+ printf("open_xsk(%s, %p, %d)\n", ifname, &rx_xsk[i], i);
+ ret = open_xsk(ifindex, &rx_xsk[i], i);
+ if (ret)
+ error(-1, -ret, "open_xsk");
+
+ printf("xsk_socket__fd() -> %d\n", xsk_socket__fd(rx_xsk[i].socket));
+ }
+
+ printf("open bpf program...\n");
+ bpf_obj = xdp_hw_metadata__open();
+ if (libbpf_get_error(bpf_obj))
+ error(-1, libbpf_get_error(bpf_obj), "xdp_hw_metadata__open");
+
+ prog = bpf_object__find_program_by_name(bpf_obj->obj, "rx");
+ bpf_program__set_ifindex(prog, ifindex);
+ bpf_program__set_flags(prog, BPF_F_XDP_DEV_BOUND_ONLY);
+
+ printf("load bpf program...\n");
+ ret = xdp_hw_metadata__load(bpf_obj);
+ if (ret)
+ error(-1, -ret, "xdp_hw_metadata__load");
+
+ printf("prepare skb endpoint...\n");
+ server_fd = start_server(AF_INET6, SOCK_DGRAM, NULL, 9092, 1000);
+ if (server_fd < 0)
+ error(-1, errno, "start_server");
+ timestamping_enable(server_fd,
+ SOF_TIMESTAMPING_SOFTWARE |
+ SOF_TIMESTAMPING_RAW_HARDWARE);
+
+ printf("prepare xsk map...\n");
+ for (i = 0; i < rxq; i++) {
+ int sock_fd = xsk_socket__fd(rx_xsk[i].socket);
+ __u32 queue_id = i;
+
+ printf("map[%d] = %d\n", queue_id, sock_fd);
+ ret = bpf_map_update_elem(bpf_map__fd(bpf_obj->maps.xsk), &queue_id, &sock_fd, 0);
+ if (ret)
+ error(-1, -ret, "bpf_map_update_elem");
+ }
+
+ printf("attach bpf program...\n");
+ ret = bpf_xdp_attach(ifindex,
+ bpf_program__fd(bpf_obj->progs.rx),
+ XDP_FLAGS, NULL);
+ if (ret)
+ error(-1, -ret, "bpf_xdp_attach");
+
+ signal(SIGINT, handle_signal);
+ ret = verify_metadata(rx_xsk, rxq, server_fd);
+ close(server_fd);
+ cleanup();
+ if (ret)
+ error(-1, -ret, "verify_metadata");
+}
diff --git a/tools/testing/selftests/bpf/xdp_metadata.h b/tools/testing/selftests/bpf/xdp_metadata.h
new file mode 100644
index 000000000000..f6780fbb0a21
--- /dev/null
+++ b/tools/testing/selftests/bpf/xdp_metadata.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#pragma once
+
+#ifndef ETH_P_IP
+#define ETH_P_IP 0x0800
+#endif
+
+#ifndef ETH_P_IPV6
+#define ETH_P_IPV6 0x86DD
+#endif
+
+struct xdp_meta {
+ __u64 rx_timestamp;
+ __u32 rx_hash;
+};