author    Jakub Kicinski <kuba@kernel.org>    2023-11-30 16:56:09 -0800
committer Jakub Kicinski <kuba@kernel.org>    2023-11-30 16:58:42 -0800
commit    753c8608f3e579307493a63b9242667aee35a751 (patch)
tree      4197358069e8db7bc0d36a474612f7ffefc7ba72
parent    975f2d73a99f35b57ffa2ad7bff8562225cdcfcb (diff)
parent    f690ff9122d2ca8e38769f3bcf217bd3df681a36 (diff)
Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:

====================
pull-request: bpf-next 2023-11-30

We've added 30 non-merge commits during the last 7 day(s) which contain
a total of 58 files changed, 1598 insertions(+), 154 deletions(-).

The main changes are:

1) Add initial TX metadata implementation for AF_XDP with support in
   mlx5 and stmmac drivers. Two types of offloads are supported right
   now, that is, TX timestamp and TX checksum offload, from Stanislav
   Fomichev with stmmac implementation from Song Yoong Siang.

2) Change BPF verifier logic to validate global subprograms lazily
   instead of unconditionally before the main program, so they can be
   guarded using BPF CO-RE techniques, from Andrii Nakryiko.

3) Add BPF link_info support for uprobe multi link along with bpftool
   integration for the latter, from Jiri Olsa.

4) Use pkg-config in BPF selftests to determine ld flags which is in
   particular needed for linking statically, from Akihiko Odaki.

5) Fix a few BPF selftest failures to adapt to the upcoming LLVM18,
   from Yonghong Song.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (30 commits)
  bpf/tests: Remove duplicate JSGT tests
  selftests/bpf: Add TX side to xdp_hw_metadata
  selftests/bpf: Convert xdp_hw_metadata to XDP_USE_NEED_WAKEUP
  selftests/bpf: Add TX side to xdp_metadata
  selftests/bpf: Add csum helpers
  selftests/xsk: Support tx_metadata_len
  xsk: Add option to calculate TX checksum in SW
  xsk: Validate xsk_tx_metadata flags
  xsk: Document tx_metadata_len layout
  net: stmmac: Add Tx HWTS support to XDP ZC
  net/mlx5e: Implement AF_XDP TX timestamp and checksum offload
  tools: ynl: Print xsk-features from the sample
  xsk: Add TX timestamp and TX checksum offload support
  xsk: Support tx_metadata_len
  selftests/bpf: Use pkg-config for libelf
  selftests/bpf: Override PKG_CONFIG for static builds
  selftests/bpf: Choose pkg-config for the target
  bpftool: Add support to display uprobe_multi links
  selftests/bpf: Add link_info test for uprobe_multi link
  selftests/bpf: Use bpf_link__destroy in fill_link_info tests
  ...
====================

Conflicts:

Documentation/netlink/specs/netdev.yaml:
  839ff60df3ab ("net: page_pool: add nlspec for basic access to page pools")
  48eb03dd2630 ("xsk: Add TX timestamp and TX checksum offload support")
https://lore.kernel.org/all/20231201094705.1ee3cab8@canb.auug.org.au/

While at it also regen, tree is dirty after:
  48eb03dd2630 ("xsk: Add TX timestamp and TX checksum offload support")
looks like code wasn't re-rendered after "render-max" was removed.

Link: https://lore.kernel.org/r/20231130145708.32573-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--Documentation/netlink/specs/netdev.yaml19
-rw-r--r--Documentation/networking/index.rst1
-rw-r--r--Documentation/networking/xdp-rx-metadata.rst2
-rw-r--r--Documentation/networking/xsk-tx-metadata.rst79
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en.h4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c72
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h11
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c17
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c1
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac.h12
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_main.c64
-rw-r--r--include/linux/bpf.h2
-rw-r--r--include/linux/netdevice.h2
-rw-r--r--include/linux/skbuff.h14
-rw-r--r--include/net/xdp_sock.h111
-rw-r--r--include/net/xdp_sock_drv.h34
-rw-r--r--include/net/xsk_buff_pool.h8
-rw-r--r--include/uapi/linux/bpf.h10
-rw-r--r--include/uapi/linux/if_xdp.h47
-rw-r--r--include/uapi/linux/netdev.h14
-rw-r--r--kernel/bpf/verifier.c83
-rw-r--r--kernel/trace/bpf_trace.c86
-rw-r--r--lib/test_bpf.c2
-rw-r--r--net/bpf/test_run.c2
-rw-r--r--net/core/netdev-genl.c13
-rw-r--r--net/xdp/xdp_umem.c11
-rw-r--r--net/xdp/xsk.c56
-rw-r--r--net/xdp/xsk_buff_pool.c2
-rw-r--r--net/xdp/xsk_queue.h19
-rw-r--r--tools/bpf/bpftool/link.c105
-rw-r--r--tools/bpf/bpftool/prog.c14
-rw-r--r--tools/include/uapi/linux/bpf.h10
-rw-r--r--tools/include/uapi/linux/if_xdp.h61
-rw-r--r--tools/include/uapi/linux/netdev.h14
-rw-r--r--tools/lib/bpf/elf.c5
-rw-r--r--tools/lib/bpf/libbpf.c2
-rw-r--r--tools/lib/bpf/libbpf.map3
-rw-r--r--tools/lib/bpf/libbpf_internal.h3
-rw-r--r--tools/lib/bpf/libbpf_version.h2
-rw-r--r--tools/net/ynl/generated/netdev-user.c19
-rw-r--r--tools/net/ynl/generated/netdev-user.h3
-rw-r--r--tools/net/ynl/samples/netdev.c10
-rw-r--r--tools/testing/selftests/bpf/Makefile14
-rw-r--r--tools/testing/selftests/bpf/README.rst2
-rw-r--r--tools/testing/selftests/bpf/network_helpers.h43
-rw-r--r--tools/testing/selftests/bpf/prog_tests/fill_link_info.c242
-rw-r--r--tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/verifier.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_metadata.c33
-rw-r--r--tools/testing/selftests/bpf/progs/test_fill_link_info.c6
-rw-r--r--tools/testing/selftests/bpf/progs/test_global_func12.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_global_func17.c1
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_global_subprogs.c92
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_subprog_precision.c4
-rwxr-xr-xtools/testing/selftests/bpf/test_offload.py15
-rw-r--r--tools/testing/selftests/bpf/xdp_hw_metadata.c235
-rw-r--r--tools/testing/selftests/bpf/xsk.c3
-rw-r--r--tools/testing/selftests/bpf/xsk.h1
58 files changed, 1590 insertions, 158 deletions
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 20f75b7d3240..eef6358ec587 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -45,7 +45,6 @@ definitions:
-
type: flags
name: xdp-rx-metadata
- render-max: true
entries:
-
name: timestamp
@@ -55,6 +54,18 @@ definitions:
name: hash
doc:
Device is capable of exposing receive packet hash via bpf_xdp_metadata_rx_hash().
+ -
+ type: flags
+ name: xsk-flags
+ entries:
+ -
+ name: tx-timestamp
+ doc:
+ HW timestamping egress packets is supported by the driver.
+ -
+ name: tx-checksum
+ doc:
+ L3 checksum HW offload is supported by the driver.
attribute-sets:
-
@@ -86,6 +97,11 @@ attribute-sets:
See Documentation/networking/xdp-rx-metadata.rst for more details.
type: u64
enum: xdp-rx-metadata
+ -
+ name: xsk-features
+ doc: Bitmask of enabled AF_XDP features.
+ type: u64
+ enum: xsk-flags
-
name: page-pool
attributes:
@@ -209,6 +225,7 @@ operations:
- xdp-features
- xdp-zc-max-segs
- xdp-rx-metadata-features
+ - xsk-features
dump:
reply: *dev-all
-
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index cb435c141794..d9424dcb19bf 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -124,6 +124,7 @@ Contents:
xfrm_sync
xfrm_sysctl
xdp-rx-metadata
+ xsk-tx-metadata
.. only:: subproject and html
diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
index 205696780b78..e3e9420fd817 100644
--- a/Documentation/networking/xdp-rx-metadata.rst
+++ b/Documentation/networking/xdp-rx-metadata.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
===============
XDP RX Metadata
===============
diff --git a/Documentation/networking/xsk-tx-metadata.rst b/Documentation/networking/xsk-tx-metadata.rst
new file mode 100644
index 000000000000..97ecfa480d00
--- /dev/null
+++ b/Documentation/networking/xsk-tx-metadata.rst
@@ -0,0 +1,79 @@
+==================
+AF_XDP TX Metadata
+==================
+
+This document describes how to enable offloads when transmitting packets
+via :doc:`af_xdp`. Refer to :doc:`xdp-rx-metadata` on how to access similar
+metadata on the receive side.
+
+General Design
+==============
+
+The headroom for the metadata is reserved via ``tx_metadata_len`` in
+``struct xdp_umem_reg``. The metadata length is therefore the same for
+every socket that shares the same umem. The metadata layout is a fixed UAPI;
+refer to ``struct xsk_tx_metadata`` in ``include/uapi/linux/if_xdp.h``.
+Thus, generally, the ``tx_metadata_len`` field above should contain
+``sizeof(struct xsk_tx_metadata)``.
+
+The headroom and the metadata itself should be located right before
+``xdp_desc->addr`` in the umem frame. Within a frame, the metadata
+layout is as follows::
+
+ tx_metadata_len
+ / \
+ +-----------------+---------+----------------------------+
+ | xsk_tx_metadata | padding | payload |
+ +-----------------+---------+----------------------------+
+ ^
+ |
+ xdp_desc->addr
+
+An AF_XDP application can request headrooms larger than ``sizeof(struct
+xsk_tx_metadata)``. The kernel will ignore the padding (and will still
+use ``xdp_desc->addr - tx_metadata_len`` to locate
+the ``xsk_tx_metadata``). For the frames that shouldn't carry
+any metadata (i.e., the ones that don't have ``XDP_TX_METADATA`` option),
+the metadata area is ignored by the kernel as well.
+
+The flags field enables the particular offload:
+
+- ``XDP_TXMD_FLAGS_TIMESTAMP``: requests the device to put the transmission
+ timestamp into the ``tx_timestamp`` field of ``struct xsk_tx_metadata``.
+- ``XDP_TXMD_FLAGS_CHECKSUM``: requests the device to calculate the L4
+ checksum. ``csum_start`` specifies the byte offset where checksumming
+ should start and ``csum_offset`` specifies the byte offset where the
+ device should store the computed checksum.
+
+Besides the flags above, in order to trigger the offloads, the first
+packet's ``struct xdp_desc`` descriptor should set the ``XDP_TX_METADATA``
+bit in the ``options`` field. Also note that in a multi-buffer packet
+only the first chunk should carry the metadata.
+
+Software TX Checksum
+====================
+
+For development and testing purposes it's possible to pass the
+``XDP_UMEM_TX_SW_CSUM`` flag to the ``XDP_UMEM_REG`` UMEM registration call.
+In this case, when running in ``XDP_COPY`` mode, the TX checksum
+is calculated on the CPU. Do not enable this option in production because
+it will negatively affect performance.
+
+Querying Device Capabilities
+============================
+
+Every device exports its offload capabilities via the netlink netdev family.
+Refer to the ``xsk-flags`` feature bitmask in
+``Documentation/netlink/specs/netdev.yaml``.
+
+- ``tx-timestamp``: device supports ``XDP_TXMD_FLAGS_TIMESTAMP``
+- ``tx-checksum``: device supports ``XDP_TXMD_FLAGS_CHECKSUM``
+
+See ``tools/net/ynl/samples/netdev.c`` for an example of how to query this information.
+
+Example
+=======
+
+See ``tools/testing/selftests/bpf/xdp_hw_metadata.c`` for an example
+program that handles TX metadata. Also see https://github.com/fomichev/xskgen
+for a more bare-bones example.
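
For illustration, a minimal user-space sketch of the steps described above (the helper names are made up, and the usual AF_XDP socket and ring setup is assumed to happen elsewhere)::

  /* Hypothetical sketch, not part of this patch: reserve the TX metadata
   * headroom at UMEM registration time, then flag one frame for a TX
   * timestamp. `fd` is an AF_XDP socket, `umem_area` the UMEM base.
   */
  #include <linux/if_xdp.h>
  #include <string.h>
  #include <sys/socket.h>

  #ifndef SOL_XDP
  #define SOL_XDP 283
  #endif

  static int reg_umem_with_tx_metadata(int fd, void *umem_area, __u64 size)
  {
          struct xdp_umem_reg mr = {
                  .addr = (__u64)(unsigned long)umem_area,
                  .len = size,
                  .chunk_size = 4096,
                  .headroom = 0,
                  /* fixed UAPI layout, so simply sizeof() */
                  .tx_metadata_len = sizeof(struct xsk_tx_metadata),
          };

          return setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
  }

  /* The metadata lives right before desc->addr; flag the descriptor so
   * the kernel actually looks at it.
   */
  static void request_tx_timestamp(void *umem_area, struct xdp_desc *desc)
  {
          struct xsk_tx_metadata *meta;

          meta = (struct xsk_tx_metadata *)((char *)umem_area + desc->addr) - 1;
          memset(meta, 0, sizeof(*meta));
          meta->flags = XDP_TXMD_FLAGS_TIMESTAMP;
          desc->options |= XDP_TX_METADATA;
  }
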
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index b2a5da9739d2..43f027bf2da3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -484,10 +484,12 @@ struct mlx5e_xdp_info_fifo {
struct mlx5e_xdpsq;
struct mlx5e_xmit_data;
+struct xsk_tx_metadata;
typedef int (*mlx5e_fp_xmit_xdp_frame_check)(struct mlx5e_xdpsq *);
typedef bool (*mlx5e_fp_xmit_xdp_frame)(struct mlx5e_xdpsq *,
struct mlx5e_xmit_data *,
- int);
+ int,
+ struct xsk_tx_metadata *);
struct mlx5e_xdpsq {
/* data path */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 7decc81ed33a..e2e7d82cfca4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -103,7 +103,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
xdptxd->dma_addr = dma_addr;
if (unlikely(!INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe,
- mlx5e_xmit_xdp_frame, sq, xdptxd, 0)))
+ mlx5e_xmit_xdp_frame, sq, xdptxd, 0, NULL)))
return false;
/* xmit_mode == MLX5E_XDP_XMIT_MODE_FRAME */
@@ -145,7 +145,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
xdptxd->dma_addr = dma_addr;
if (unlikely(!INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe,
- mlx5e_xmit_xdp_frame, sq, xdptxd, 0)))
+ mlx5e_xmit_xdp_frame, sq, xdptxd, 0, NULL)))
return false;
/* xmit_mode == MLX5E_XDP_XMIT_MODE_PAGE */
@@ -261,6 +261,37 @@ const struct xdp_metadata_ops mlx5e_xdp_metadata_ops = {
.xmo_rx_hash = mlx5e_xdp_rx_hash,
};
+struct mlx5e_xsk_tx_complete {
+ struct mlx5_cqe64 *cqe;
+ struct mlx5e_cq *cq;
+};
+
+static u64 mlx5e_xsk_fill_timestamp(void *_priv)
+{
+ struct mlx5e_xsk_tx_complete *priv = _priv;
+ u64 ts;
+
+ ts = get_cqe_ts(priv->cqe);
+
+ if (mlx5_is_real_time_rq(priv->cq->mdev) || mlx5_is_real_time_sq(priv->cq->mdev))
+ return mlx5_real_time_cyc2time(&priv->cq->mdev->clock, ts);
+
+ return mlx5_timecounter_cyc2time(&priv->cq->mdev->clock, ts);
+}
+
+static void mlx5e_xsk_request_checksum(u16 csum_start, u16 csum_offset, void *priv)
+{
+ struct mlx5_wqe_eth_seg *eseg = priv;
+
+ /* HW/FW is doing parsing, so offsets are largely ignored. */
+ eseg->cs_flags |= MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+}
+
+const struct xsk_tx_metadata_ops mlx5e_xsk_tx_metadata_ops = {
+ .tmo_fill_timestamp = mlx5e_xsk_fill_timestamp,
+ .tmo_request_checksum = mlx5e_xsk_request_checksum,
+};
+
/* returns true if packet was consumed by xdp */
bool mlx5e_xdp_handle(struct mlx5e_rq *rq,
struct bpf_prog *prog, struct mlx5e_xdp_buff *mxbuf)
@@ -398,11 +429,11 @@ INDIRECT_CALLABLE_SCOPE int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq
INDIRECT_CALLABLE_SCOPE bool
mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
- int check_result);
+ int check_result, struct xsk_tx_metadata *meta);
INDIRECT_CALLABLE_SCOPE bool
mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
- int check_result)
+ int check_result, struct xsk_tx_metadata *meta)
{
struct mlx5e_tx_mpwqe *session = &sq->mpwqe;
struct mlx5e_xdpsq_stats *stats = sq->stats;
@@ -420,7 +451,7 @@ mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptx
*/
if (unlikely(sq->mpwqe.wqe))
mlx5e_xdp_mpwqe_complete(sq);
- return mlx5e_xmit_xdp_frame(sq, xdptxd, 0);
+ return mlx5e_xmit_xdp_frame(sq, xdptxd, 0, meta);
}
if (!xdptxd->len) {
skb_frag_t *frag = &xdptxdf->sinfo->frags[0];
@@ -450,6 +481,7 @@ mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptx
* and it's safe to complete it at any time.
*/
mlx5e_xdp_mpwqe_session_start(sq);
+ xsk_tx_metadata_request(meta, &mlx5e_xsk_tx_metadata_ops, &session->wqe->eth);
}
mlx5e_xdp_mpwqe_add_dseg(sq, p, stats);
@@ -480,7 +512,7 @@ INDIRECT_CALLABLE_SCOPE int mlx5e_xmit_xdp_frame_check(struct mlx5e_xdpsq *sq)
INDIRECT_CALLABLE_SCOPE bool
mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
- int check_result)
+ int check_result, struct xsk_tx_metadata *meta)
{
struct mlx5e_xmit_data_frags *xdptxdf =
container_of(xdptxd, struct mlx5e_xmit_data_frags, xd);
@@ -599,6 +631,8 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
sq->pc++;
}
+ xsk_tx_metadata_request(meta, &mlx5e_xsk_tx_metadata_ops, eseg);
+
sq->doorbell_cseg = cseg;
stats->xmit++;
@@ -608,7 +642,9 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
struct mlx5e_xdp_wqe_info *wi,
u32 *xsk_frames,
- struct xdp_frame_bulk *bq)
+ struct xdp_frame_bulk *bq,
+ struct mlx5e_cq *cq,
+ struct mlx5_cqe64 *cqe)
{
struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo;
u16 i;
@@ -668,10 +704,24 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
break;
}
- case MLX5E_XDP_XMIT_MODE_XSK:
+ case MLX5E_XDP_XMIT_MODE_XSK: {
/* AF_XDP send */
+ struct xsk_tx_metadata_compl *compl = NULL;
+ struct mlx5e_xsk_tx_complete priv = {
+ .cqe = cqe,
+ .cq = cq,
+ };
+
+ if (xp_tx_metadata_enabled(sq->xsk_pool)) {
+ xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo);
+ compl = &xdpi.xsk_meta;
+
+ xsk_tx_metadata_complete(compl, &mlx5e_xsk_tx_metadata_ops, &priv);
+ }
+
(*xsk_frames)++;
break;
+ }
default:
WARN_ON_ONCE(true);
}
@@ -720,7 +770,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq)
sqcc += wi->num_wqebbs;
- mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq);
+ mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq, cq, cqe);
} while (!last_wqe);
if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) {
@@ -767,7 +817,7 @@ void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq)
sq->cc += wi->num_wqebbs;
- mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq);
+ mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq, NULL, NULL);
}
xdp_flush_frame_bulk(&bq);
@@ -840,7 +890,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
}
ret = INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe,
- mlx5e_xmit_xdp_frame, sq, xdptxd, 0);
+ mlx5e_xmit_xdp_frame, sq, xdptxd, 0, NULL);
if (unlikely(!ret)) {
int j;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
index ecfe93a479da..e054db1e10f8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
@@ -33,6 +33,7 @@
#define __MLX5_EN_XDP_H__
#include <linux/indirect_call_wrapper.h>
+#include <net/xdp_sock.h>
#include "en.h"
#include "en/txrx.h"
@@ -82,7 +83,7 @@ enum mlx5e_xdp_xmit_mode {
* num, page_1, page_2, ... , page_num.
*
* MLX5E_XDP_XMIT_MODE_XSK:
- * none.
+ * frame.xsk_meta.
*/
#define MLX5E_XDP_FIFO_ENTRIES2DS_MAX_RATIO 4
@@ -97,6 +98,7 @@ union mlx5e_xdp_info {
u8 num;
struct page *page;
} page;
+ struct xsk_tx_metadata_compl xsk_meta;
};
struct mlx5e_xsk_param;
@@ -112,13 +114,16 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
u32 flags);
extern const struct xdp_metadata_ops mlx5e_xdp_metadata_ops;
+extern const struct xsk_tx_metadata_ops mlx5e_xsk_tx_metadata_ops;
INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq,
struct mlx5e_xmit_data *xdptxd,
- int check_result));
+ int check_result,
+ struct xsk_tx_metadata *meta));
INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq,
struct mlx5e_xmit_data *xdptxd,
- int check_result));
+ int check_result,
+ struct xsk_tx_metadata *meta));
INDIRECT_CALLABLE_DECLARE(int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq *sq));
INDIRECT_CALLABLE_DECLARE(int mlx5e_xmit_xdp_frame_check(struct mlx5e_xdpsq *sq));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
index 597f319d4770..a59199ed590d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
@@ -55,12 +55,16 @@ static void mlx5e_xsk_tx_post_err(struct mlx5e_xdpsq *sq,
nopwqe = mlx5e_post_nop(&sq->wq, sq->sqn, &sq->pc);
mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, *xdpi);
+ if (xp_tx_metadata_enabled(sq->xsk_pool))
+ mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo,
+ (union mlx5e_xdp_info) { .xsk_meta = {} });
sq->doorbell_cseg = &nopwqe->ctrl;
}
bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
{
struct xsk_buff_pool *pool = sq->xsk_pool;
+ struct xsk_tx_metadata *meta = NULL;
union mlx5e_xdp_info xdpi;
bool work_done = true;
bool flush = false;
@@ -93,12 +97,13 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
xdptxd.dma_addr = xsk_buff_raw_get_dma(pool, desc.addr);
xdptxd.data = xsk_buff_raw_get_data(pool, desc.addr);
xdptxd.len = desc.len;
+ meta = xsk_buff_get_metadata(pool, desc.addr);
xsk_buff_raw_dma_sync_for_device(pool, xdptxd.dma_addr, xdptxd.len);
ret = INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe,
mlx5e_xmit_xdp_frame, sq, &xdptxd,
- check_result);
+ check_result, meta);
if (unlikely(!ret)) {
if (sq->mpwqe.wqe)
mlx5e_xdp_mpwqe_complete(sq);
@@ -106,6 +111,16 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
mlx5e_xsk_tx_post_err(sq, &xdpi);
} else {
mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi);
+ if (xp_tx_metadata_enabled(sq->xsk_pool)) {
+ struct xsk_tx_metadata_compl compl;
+
+ xsk_tx_metadata_to_compl(meta, &compl);
+ XSK_TX_COMPL_FITS(void *);
+
+ mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo,
+ (union mlx5e_xdp_info)
+ { .xsk_meta = compl });
+ }
}
flush = true;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index af5e0d3c8ee1..26a98cfb0a59 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -5165,6 +5165,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
netdev->netdev_ops = &mlx5e_netdev_ops;
netdev->xdp_metadata_ops = &mlx5e_xdp_metadata_ops;
+ netdev->xsk_tx_metadata_ops = &mlx5e_xsk_tx_metadata_ops;
mlx5e_dcbnl_build_netdev(netdev);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index cd7a9768de5f..686c94c2e8a7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -51,6 +51,7 @@ struct stmmac_tx_info {
bool last_segment;
bool is_jumbo;
enum stmmac_txbuf_type buf_type;
+ struct xsk_tx_metadata_compl xsk_meta;
};
#define STMMAC_TBS_AVAIL BIT(0)
@@ -100,6 +101,17 @@ struct stmmac_xdp_buff {
struct dma_desc *ndesc;
};
+struct stmmac_metadata_request {
+ struct stmmac_priv *priv;
+ struct dma_desc *tx_desc;
+ bool *set_ic;
+};
+
+struct stmmac_xsk_tx_complete {
+ struct stmmac_priv *priv;
+ struct dma_desc *desc;
+};
+
struct stmmac_rx_queue {
u32 rx_count_frames;
u32 queue_index;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 8964fc8a955f..c2ac88aaffed 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2430,6 +2430,46 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv)
}
}
+static void stmmac_xsk_request_timestamp(void *_priv)
+{
+ struct stmmac_metadata_request *meta_req = _priv;
+
+ stmmac_enable_tx_timestamp(meta_req->priv, meta_req->tx_desc);
+ *meta_req->set_ic = true;
+}
+
+static u64 stmmac_xsk_fill_timestamp(void *_priv)
+{
+ struct stmmac_xsk_tx_complete *tx_compl = _priv;
+ struct stmmac_priv *priv = tx_compl->priv;
+ struct dma_desc *desc = tx_compl->desc;
+ bool found = false;
+ u64 ns = 0;
+
+ if (!priv->hwts_tx_en)
+ return 0;
+
+ /* check tx tstamp status */
+ if (stmmac_get_tx_timestamp_status(priv, desc)) {
+ stmmac_get_timestamp(priv, desc, priv->adv_ts, &ns);
+ found = true;
+ } else if (!stmmac_get_mac_tx_timestamp(priv, priv->hw, &ns)) {
+ found = true;
+ }
+
+ if (found) {
+ ns -= priv->plat->cdc_error_adj;
+ return ns_to_ktime(ns);
+ }
+
+ return 0;
+}
+
+static const struct xsk_tx_metadata_ops stmmac_xsk_tx_metadata_ops = {
+ .tmo_request_timestamp = stmmac_xsk_request_timestamp,
+ .tmo_fill_timestamp = stmmac_xsk_fill_timestamp,
+};
+
static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
{
struct netdev_queue *nq = netdev_get_tx_queue(priv->dev, queue);
@@ -2449,6 +2489,8 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
budget = min(budget, stmmac_tx_avail(priv, queue));
while (budget-- > 0) {
+ struct stmmac_metadata_request meta_req;
+ struct xsk_tx_metadata *meta = NULL;
dma_addr_t dma_addr;
bool set_ic;
@@ -2472,6 +2514,7 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
tx_desc = tx_q->dma_tx + entry;
dma_addr = xsk_buff_raw_get_dma(pool, xdp_desc.addr);
+ meta = xsk_buff_get_metadata(pool, xdp_desc.addr);
xsk_buff_raw_dma_sync_for_device(pool, dma_addr, xdp_desc.len);
tx_q->tx_skbuff_dma[entry].buf_type = STMMAC_TXBUF_T_XSK_TX;
@@ -2499,6 +2542,11 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
else
set_ic = false;
+ meta_req.priv = priv;
+ meta_req.tx_desc = tx_desc;
+ meta_req.set_ic = &set_ic;
+ xsk_tx_metadata_request(meta, &stmmac_xsk_tx_metadata_ops,
+ &meta_req);
if (set_ic) {
tx_q->tx_count_frames = 0;
stmmac_set_tx_ic(priv, tx_desc);
@@ -2511,6 +2559,9 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
stmmac_enable_dma_transmission(priv, priv->ioaddr);
+ xsk_tx_metadata_to_compl(meta,
+ &tx_q->tx_skbuff_dma[entry].xsk_meta);
+
tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx, priv->dma_conf.dma_tx_size);
entry = tx_q->cur_tx;
}
@@ -2620,8 +2671,18 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue,
} else {
tx_packets++;
}
- if (skb)
+ if (skb) {
stmmac_get_tx_hwtstamp(priv, p, skb);
+ } else {
+ struct stmmac_xsk_tx_complete tx_compl = {
+ .priv = priv,
+ .desc = p,
+ };
+
+ xsk_tx_metadata_complete(&tx_q->tx_skbuff_dma[entry].xsk_meta,
+ &stmmac_xsk_tx_metadata_ops,
+ &tx_compl);
+ }
}
if (likely(tx_q->tx_skbuff_dma[entry].buf &&
@@ -7464,6 +7525,7 @@ int stmmac_dvr_probe(struct device *device,
ndev->netdev_ops = &stmmac_netdev_ops;
ndev->xdp_metadata_ops = &stmmac_xdp_metadata_ops;
+ ndev->xsk_tx_metadata_ops = &stmmac_xsk_tx_metadata_ops;
ndev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
NETIF_F_RXCSUM;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 258ba232e302..eb447b0a9423 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1347,6 +1347,8 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
struct bpf_func_info_aux {
u16 linkage;
bool unreliable;
+ bool called : 1;
+ bool verified : 1;
};
enum bpf_jit_poke_reason {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 998c7aaa98b8..c2d74bc112dd 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1865,6 +1865,7 @@ enum netdev_stat_type {
* @netdev_ops: Includes several pointers to callbacks,
* if one wants to override the ndo_*() functions
* @xdp_metadata_ops: Includes pointers to XDP metadata callbacks.
+ * @xsk_tx_metadata_ops: Includes pointers to AF_XDP TX metadata callbacks.
* @ethtool_ops: Management operations
* @l3mdev_ops: Layer 3 master device operations
* @ndisc_ops: Includes callbacks for different IPv6 neighbour
@@ -2128,6 +2129,7 @@ struct net_device {
unsigned long long priv_flags;
const struct net_device_ops *netdev_ops;
const struct xdp_metadata_ops *xdp_metadata_ops;
+ const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops;
int ifindex;
unsigned short gflags;
unsigned short hard_header_len;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 27998f73183e..b370eb8d70f7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -566,6 +566,15 @@ struct ubuf_info_msgzc {
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);
+/* Preserve some data across TX submission and completion.
+ *
+ * Note, this state is stored in the driver. Extending the layout
+ * might need some special care.
+ */
+struct xsk_tx_metadata_compl {
+ __u64 *tx_timestamp;
+};
+
/* This data is invariant across clones and lives at
* the end of the header data, ie. at skb->end.
*/
@@ -578,7 +587,10 @@ struct skb_shared_info {
/* Warning: this field is not always filled in (UFO)! */
unsigned short gso_segs;
struct sk_buff *frag_list;
- struct skb_shared_hwtstamps hwtstamps;
+ union {
+ struct skb_shared_hwtstamps hwtstamps;
+ struct xsk_tx_metadata_compl xsk_meta;
+ };
unsigned int gso_type;
u32 tskey;
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index f83128007fb0..3cb4dc9bd70e 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -30,6 +30,7 @@ struct xdp_umem {
struct user_struct *user;
refcount_t users;
u8 flags;
+ u8 tx_metadata_len;
bool zc;
struct page **pgs;
int id;
@@ -92,12 +93,105 @@ struct xdp_sock {
struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
};
+/*
+ * AF_XDP TX metadata hooks for network devices.
+ * The following hooks can be defined; unless noted otherwise, they are
+ * optional and can be filled with a null pointer.
+ *
+ * void (*tmo_request_timestamp)(void *priv)
+ * Called when an AF_XDP frame requests an egress timestamp.
+ *
+ * u64 (*tmo_fill_timestamp)(void *priv)
+ * Called when an AF_XDP frame that requested an egress timestamp
+ * receives a completion. The hook needs to return the actual HW timestamp.
+ *
+ * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv)
+ * Called when an AF_XDP frame requests HW checksum offload. csum_start
+ * indicates the position where checksumming should start.
+ * csum_offset indicates the position where the checksum should be stored.
+ *
+ */
+struct xsk_tx_metadata_ops {
+ void (*tmo_request_timestamp)(void *priv);
+ u64 (*tmo_fill_timestamp)(void *priv);
+ void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv);
+};
+
#ifdef CONFIG_XDP_SOCKETS
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
void __xsk_map_flush(void);
+/**
+ * xsk_tx_metadata_to_compl - Save enough relevant metadata information
+ * to perform tx completion in the future.
+ * @meta: pointer to AF_XDP metadata area
+ * @compl: pointer to output struct xsk_tx_metadata_compl
+ *
+ * This function should be called by the networking device when
+ * it prepares AF_XDP egress packet. The value of @compl should be stored
+ * and passed to xsk_tx_metadata_complete upon TX completion.
+ */
+static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
+ struct xsk_tx_metadata_compl *compl)
+{
+ if (!meta)
+ return;
+
+ if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
+ compl->tx_timestamp = &meta->completion.tx_timestamp;
+ else
+ compl->tx_timestamp = NULL;
+}
+
+/**
+ * xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission
+ * and call appropriate xsk_tx_metadata_ops operation.
+ * @meta: pointer to AF_XDP metadata area
+ * @ops: pointer to struct xsk_tx_metadata_ops
+ * @priv: pointer to driver-private area
+ *
+ * This function should be called by the networking device when
+ * it prepares AF_XDP egress packet.
+ */
+static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta,
+ const struct xsk_tx_metadata_ops *ops,
+ void *priv)
+{
+ if (!meta)
+ return;
+
+ if (ops->tmo_request_timestamp)
+ if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
+ ops->tmo_request_timestamp(priv);
+
+ if (ops->tmo_request_checksum)
+ if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM)
+ ops->tmo_request_checksum(meta->request.csum_start,
+ meta->request.csum_offset, priv);
+}
+
+/**
+ * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion
+ * and call appropriate xsk_tx_metadata_ops operation.
+ * @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl
+ * @ops: pointer to struct xsk_tx_metadata_ops
+ * @priv: pointer to driver-private area
+ *
+ * This function should be called by the networking device upon
+ * AF_XDP egress completion.
+ */
+static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
+ const struct xsk_tx_metadata_ops *ops,
+ void *priv)
+{
+ if (!compl)
+ return;
+
+ *compl->tx_timestamp = ops->tmo_fill_timestamp(priv);
+}
+
#else
static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
@@ -114,6 +208,23 @@ static inline void __xsk_map_flush(void)
{
}
+static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
+ struct xsk_tx_metadata_compl *compl)
+{
+}
+
+static inline void xsk_tx_metadata_request(struct xsk_tx_metadata *meta,
+ const struct xsk_tx_metadata_ops *ops,
+ void *priv)
+{
+}
+
+static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
+ const struct xsk_tx_metadata_ops *ops,
+ void *priv)
+{
+}
+
#endif /* CONFIG_XDP_SOCKETS */
#if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET)
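
A rough sketch of the submit/complete pairing the kernel-doc above describes; every ``foo_*`` identifier is a placeholder rather than a real driver API, and the ops table is left empty. The real wiring for mlx5 and stmmac appears later in this patch::

  #include <net/xdp_sock_drv.h>

  /* Placeholder per-queue state; only the xsk_* calls come from this patch. */
  struct foo_tx_buf {
          struct xsk_tx_metadata_compl xsk_meta;
  };

  struct foo_txq {
          struct xsk_buff_pool *pool;
          struct foo_tx_buf *bufs;
          void *hw_desc;
          u32 tail;
  };

  struct foo_compl_priv {
          struct foo_txq *txq;
          u32 idx;
  };

  static const struct xsk_tx_metadata_ops foo_xsk_tx_metadata_ops; /* hooks elided */

  static void foo_xmit_zc(struct foo_txq *txq, struct xdp_desc *desc)
  {
          struct xsk_tx_metadata *meta;

          meta = xsk_buff_get_metadata(txq->pool, desc->addr);

          /* Invoke the tmo_request_* hooks for whatever flags user space
           * set; priv is whatever those hooks need (e.g. the HW TX desc).
           */
          xsk_tx_metadata_request(meta, &foo_xsk_tx_metadata_ops, txq->hw_desc);

          /* Stash completion state next to the in-flight descriptor. */
          xsk_tx_metadata_to_compl(meta, &txq->bufs[txq->tail].xsk_meta);
  }

  static void foo_tx_complete(struct foo_txq *txq, u32 idx)
  {
          struct foo_compl_priv priv = { .txq = txq, .idx = idx };

          /* Writes the HW timestamp back into the umem metadata area. */
          xsk_tx_metadata_complete(&txq->bufs[idx].xsk_meta,
                                   &foo_xsk_tx_metadata_ops, &priv);
  }
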
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 1f6fc8c7a84c..81e02de3f453 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -165,6 +165,30 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
return xp_raw_get_data(pool, addr);
}
+#define XDP_TXMD_FLAGS_VALID ( \
+ XDP_TXMD_FLAGS_TIMESTAMP | \
+ XDP_TXMD_FLAGS_CHECKSUM | \
+ 0)
+
+static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta)
+{
+ return !(meta->flags & ~XDP_TXMD_FLAGS_VALID);
+}
+
+static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+{
+ struct xsk_tx_metadata *meta;
+
+ if (!pool->tx_metadata_len)
+ return NULL;
+
+ meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len;
+ if (unlikely(!xsk_buff_valid_tx_metadata(meta)))
+ return NULL; /* no way to signal the error to the user */
+
+ return meta;
+}
+
static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
@@ -324,6 +348,16 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
return NULL;
}
+static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta)
+{
+ return false;
+}
+
+static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+{
+ return NULL;
+}
+
static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
{
}
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index b0bdff26fc88..8d48d37ab7c0 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -33,6 +33,7 @@ struct xdp_buff_xsk {
};
#define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb))
+#define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t))
struct xsk_dma_map {
dma_addr_t *dma_pages;
@@ -77,10 +78,12 @@ struct xsk_buff_pool {
u32 chunk_size;
u32 chunk_shift;
u32 frame_len;
+ u8 tx_metadata_len; /* inherited from umem */
u8 cached_need_wakeup;
bool uses_need_wakeup;
bool dma_need_sync;
bool unaligned;
+ bool tx_sw_csum;
void *addrs;
/* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
* NAPI TX thread and sendmsg error paths in the SKB destructor callback and when
@@ -233,4 +236,9 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb)
return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
}
+static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool)
+{
+ return pool->tx_metadata_len > 0;
+}
+
#endif /* XSK_BUFF_POOL_H_ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7a5498242eaa..e88746ba7d21 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6563,6 +6563,16 @@ struct bpf_link_info {
__u64 missed;
} kprobe_multi;
struct {
+ __aligned_u64 path;
+ __aligned_u64 offsets;
+ __aligned_u64 ref_ctr_offsets;
+ __aligned_u64 cookies;
+ __u32 path_size; /* in/out: real path size on success, including zero byte */
+ __u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */
+ __u32 flags;
+ __u32 pid;
+ } uprobe_multi;
+ struct {
__u32 type; /* enum bpf_perf_event_type */
__u32 :32;
union {
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 8d48863472b9..d31698410410 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -33,7 +33,13 @@
#define XDP_USE_SG (1 << 4)
/* Flags for xsk_umem_config flags */
-#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
+#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
+
+/* Force checksum calculation in software. Can be used for testing or
+ * working around potential HW issues. This option causes performance
+ * degradation and only works in XDP_COPY mode.
+ */
+#define XDP_UMEM_TX_SW_CSUM (1 << 1)
struct sockaddr_xdp {
__u16 sxdp_family;
@@ -76,6 +82,7 @@ struct xdp_umem_reg {
__u32 chunk_size;
__u32 headroom;
__u32 flags;
+ __u32 tx_metadata_len;
};
struct xdp_statistics {
@@ -105,6 +112,41 @@ struct xdp_options {
#define XSK_UNALIGNED_BUF_ADDR_MASK \
((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
+/* Request transmit timestamp. Upon completion, put it into tx_timestamp
+ * field of struct xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0)
+
+/* Request transmit checksum offload. Checksum start position and offset
+ * are communicated via csum_start and csum_offset fields of struct
+ * xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1)
+
+/* AF_XDP offloads request. 'request' union member is consumed by the driver
+ * when the packet is being transmitted. 'completion' union member is
+ * filled by the driver when the transmit completion arrives.
+ */
+struct xsk_tx_metadata {
+ __u64 flags;
+
+ union {
+ struct {
+ /* XDP_TXMD_FLAGS_CHECKSUM */
+
+ /* Offset from desc->addr where checksumming should start. */
+ __u16 csum_start;
+ /* Offset from csum_start where checksum should be stored. */
+ __u16 csum_offset;
+ } request;
+
+ struct {
+ /* XDP_TXMD_FLAGS_TIMESTAMP */
+ __u64 tx_timestamp;
+ } completion;
+ };
+};
+
/* Rx/Tx descriptor */
struct xdp_desc {
__u64 addr;
@@ -121,4 +163,7 @@ struct xdp_desc {
*/
#define XDP_PKT_CONTD (1 << 0)
+/* TX packet carries valid metadata. */
+#define XDP_TX_METADATA (1 << 1)
+
#endif /* _LINUX_IF_XDP_H */
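
As a worked example of the new checksum fields (a sketch, not part of this patch): for a plain Ethernet/IPv4/UDP frame the offsets relative to ``desc->addr`` come out as below. Pre-seeding the checksum field with the pseudo-header sum mirrors the stack's usual partial-checksum convention; the csum helpers added to the selftests in this series can compute that seed::

  #include <linux/if_xdp.h>
  #include <linux/udp.h>

  /* 14 bytes of Ethernet plus 20 bytes of IPv4 put the UDP header at
   * byte 34; the UDP checksum field sits 6 bytes into that header.
   */
  static void request_udp_csum_offload(struct xsk_tx_metadata *meta,
                                       struct udphdr *udp,
                                       __sum16 pseudo_hdr_csum)
  {
          meta->flags |= XDP_TXMD_FLAGS_CHECKSUM;
          meta->request.csum_start = 14 + 20; /* checksumming starts at the UDP header */
          meta->request.csum_offset = 6;      /* result stored at csum_start + 6 */

          /* Folded (but not inverted) pseudo-header checksum, computed by
           * the caller; the device folds the rest of the payload into it.
           */
          udp->check = pseudo_hdr_csum;
  }
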
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 2b37233e00c0..6244c0164976 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -48,9 +48,18 @@ enum netdev_xdp_act {
enum netdev_xdp_rx_metadata {
NETDEV_XDP_RX_METADATA_TIMESTAMP = 1,
NETDEV_XDP_RX_METADATA_HASH = 2,
+};
- /* private: */
- NETDEV_XDP_RX_METADATA_MASK = 3,
+/**
+ * enum netdev_xsk_flags
+ * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported
+ * by the driver.
+ * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the
+ * driver.
+ */
+enum netdev_xsk_flags {
+ NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1,
+ NETDEV_XSK_FLAGS_TX_CHECKSUM = 2,
};
enum {
@@ -59,6 +68,7 @@ enum {
NETDEV_A_DEV_XDP_FEATURES,
NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+ NETDEV_A_DEV_XSK_FEATURES,
__NETDEV_A_DEV_MAX,
NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 405da1f9e724..8e7b6072e3f4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -339,6 +339,11 @@ struct bpf_kfunc_call_arg_meta {
struct btf *btf_vmlinux;
+static const char *btf_type_name(const struct btf *btf, u32 id)
+{
+ return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
+}
+
static DEFINE_MUTEX(bpf_verifier_lock);
static DEFINE_MUTEX(bpf_percpu_ma_lock);
@@ -418,6 +423,22 @@ static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
}
+static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_func_info *info;
+
+ if (!env->prog->aux->func_info)
+ return "";
+
+ info = &env->prog->aux->func_info[subprog];
+ return btf_type_name(env->prog->aux->btf, info->type_id);
+}
+
+static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env, int subprog)
+{
+ return &env->prog->aux->func_info_aux[subprog];
+}
+
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
@@ -587,11 +608,6 @@ static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
}
-static const char *btf_type_name(const struct btf *btf, u32 id)
-{
- return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
-}
-
static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
{
switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
@@ -9269,13 +9285,18 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (err == -EFAULT)
return err;
if (subprog_is_global(env, subprog)) {
+ const char *sub_name = subprog_name(env, subprog);
+
if (err) {
- verbose(env, "Caller passes invalid args into func#%d\n", subprog);
+ verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
+ subprog, sub_name);
return err;
}
- if (env->log.level & BPF_LOG_LEVEL)
- verbose(env, "Func#%d is global and valid. Skipping.\n", subprog);
+ verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
+ subprog, sub_name);
+ /* mark global subprog for verifying after main prog */
+ subprog_aux(env, subprog)->called = true;
clear_caller_saved_regs(env, caller->regs);
/* All global functions return a 64-bit SCALAR_VALUE */
@@ -19859,8 +19880,11 @@ out:
return ret;
}
-/* Verify all global functions in a BPF program one by one based on their BTF.
- * All global functions must pass verification. Otherwise the whole program is rejected.
+/* Lazily verify all global functions based on their BTF, if they are called
+ * from main BPF program or any of subprograms transitively.
+ * BPF global subprogs called from dead code are not validated.
+ * All callable global functions must pass verification.
+ * Otherwise the whole program is rejected.
* Consider:
* int bar(int);
* int foo(int f)
@@ -19879,25 +19903,50 @@ out:
static int do_check_subprogs(struct bpf_verifier_env *env)
{
struct bpf_prog_aux *aux = env->prog->aux;
- int i, ret;
+ struct bpf_func_info_aux *sub_aux;
+ int i, ret, new_cnt;
if (!aux->func_info)
return 0;
+ /* exception callback is presumed to be always called */
+ if (env->exception_callback_subprog)
+ subprog_aux(env, env->exception_callback_subprog)->called = true;
+
+again:
+ new_cnt = 0;
for (i = 1; i < env->subprog_cnt; i++) {
- if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
+ if (!subprog_is_global(env, i))
+ continue;
+
+ sub_aux = subprog_aux(env, i);
+ if (!sub_aux->called || sub_aux->verified)
continue;
+
env->insn_idx = env->subprog_info[i].start;
WARN_ON_ONCE(env->insn_idx == 0);
ret = do_check_common(env, i, env->exception_callback_subprog == i);
if (ret) {
return ret;
} else if (env->log.level & BPF_LOG_LEVEL) {
- verbose(env,
- "Func#%d is safe for any args that match its prototype\n",
- i);
+ verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n",
+ i, subprog_name(env, i));
}
+
+ /* We verified a new global subprog; it might have called some
+ * more global subprogs that we haven't verified yet, so we
+ * need to do another pass over subprogs to verify those.
+ */
+ sub_aux->verified = true;
+ new_cnt++;
}
+
+ /* We can't loop forever as we verify at least one global subprog on
+ * each pass.
+ */
+ if (new_cnt)
+ goto again;
+
return 0;
}
@@ -20543,8 +20592,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
- ret = do_check_subprogs(env);
- ret = ret ?: do_check_main(env);
+ ret = do_check_main(env);
+ ret = ret ?: do_check_subprogs(env);
if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
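
For context, a sketch (not taken from this series) of the guarding pattern that lazy validation enables: a global subprog that uses a possibly-missing kfunc is now verified only if a live code path actually calls it. ``bpf_some_new_kfunc`` is a made-up name::

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  extern void bpf_some_new_kfunc(void) __ksym __weak; /* hypothetical kfunc */

  /* Non-static, so this is a global subprog: with this patch it is
   * validated only if some verified code path actually calls it.
   */
  __noinline int use_new_kfunc(int x)
  {
          bpf_some_new_kfunc();
          return x;
  }

  SEC("tc")
  int main_prog(struct __sk_buff *skb)
  {
          /* On kernels without the kfunc this branch is dead code, the
           * call is never visited, and use_new_kfunc() is never verified.
           */
          if (bpf_ksym_exists(bpf_some_new_kfunc))
                  return use_new_kfunc(skb->len);
          return 0;
  }

  char _license[] SEC("license") = "GPL";
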
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f0b8b7c29126..c284a4ad0315 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3033,6 +3033,7 @@ struct bpf_uprobe_multi_link;
struct bpf_uprobe {
struct bpf_uprobe_multi_link *link;
loff_t offset;
+ unsigned long ref_ctr_offset;
u64 cookie;
struct uprobe_consumer consumer;
};
@@ -3041,6 +3042,7 @@ struct bpf_uprobe_multi_link {
struct path path;
struct bpf_link link;
u32 cnt;
+ u32 flags;
struct bpf_uprobe *uprobes;
struct task_struct *task;
};
@@ -3082,9 +3084,79 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
kfree(umulti_link);
}
+static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets);
+ u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies);
+ u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets);
+ u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path);
+ u32 upath_size = info->uprobe_multi.path_size;
+ struct bpf_uprobe_multi_link *umulti_link;
+ u32 ucount = info->uprobe_multi.count;
+ int err = 0, i;
+ long left;
+
+ if (!upath ^ !upath_size)
+ return -EINVAL;
+
+ if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount)
+ return -EINVAL;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+ info->uprobe_multi.count = umulti_link->cnt;
+ info->uprobe_multi.flags = umulti_link->flags;
+ info->uprobe_multi.pid = umulti_link->task ?
+ task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
+
+ if (upath) {
+ char *p, *buf;
+
+ upath_size = min_t(u32, upath_size, PATH_MAX);
+
+ buf = kmalloc(upath_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = d_path(&umulti_link->path, buf, upath_size);
+ if (IS_ERR(p)) {
+ kfree(buf);
+ return PTR_ERR(p);
+ }
+ upath_size = buf + upath_size - p;
+ left = copy_to_user(upath, p, upath_size);
+ kfree(buf);
+ if (left)
+ return -EFAULT;
+ info->uprobe_multi.path_size = upath_size;
+ }
+
+ if (!uoffsets && !ucookies && !uref_ctr_offsets)
+ return 0;
+
+ if (ucount < umulti_link->cnt)
+ err = -ENOSPC;
+ else
+ ucount = umulti_link->cnt;
+
+ for (i = 0; i < ucount; i++) {
+ if (uoffsets &&
+ put_user(umulti_link->uprobes[i].offset, uoffsets + i))
+ return -EFAULT;
+ if (uref_ctr_offsets &&
+ put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i))
+ return -EFAULT;
+ if (ucookies &&
+ put_user(umulti_link->uprobes[i].cookie, ucookies + i))
+ return -EFAULT;
+ }
+
+ return err;
+}
+
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
.release = bpf_uprobe_multi_link_release,
.dealloc = bpf_uprobe_multi_link_dealloc,
+ .fill_link_info = bpf_uprobe_multi_link_fill_link_info,
};
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
@@ -3172,7 +3244,6 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
{
struct bpf_uprobe_multi_link *link = NULL;
unsigned long __user *uref_ctr_offsets;
- unsigned long *ref_ctr_offsets = NULL;
struct bpf_link_primer link_primer;
struct bpf_uprobe *uprobes = NULL;
struct task_struct *task = NULL;
@@ -3245,18 +3316,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (!uprobes || !link)
goto error_free;
- if (uref_ctr_offsets) {
- ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL);
- if (!ref_ctr_offsets)
- goto error_free;
- }
-
for (i = 0; i < cnt; i++) {
if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
err = -EFAULT;
goto error_free;
}
- if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) {
+ if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) {
err = -EFAULT;
goto error_free;
}
@@ -3280,6 +3345,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
link->uprobes = uprobes;
link->path = path;
link->task = task;
+ link->flags = flags;
bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
&bpf_uprobe_multi_link_lops, prog);
@@ -3287,7 +3353,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
for (i = 0; i < cnt; i++) {
err = uprobe_register_refctr(d_real_inode(link->path.dentry),
uprobes[i].offset,
- ref_ctr_offsets ? ref_ctr_offsets[i] : 0,
+ uprobes[i].ref_ctr_offset,
&uprobes[i].consumer);
if (err) {
bpf_uprobe_unregister(&path, uprobes, i);
@@ -3299,11 +3365,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (err)
goto error_free;
- kvfree(ref_ctr_offsets);
return bpf_link_settle(&link_primer);
error_free:
- kvfree(ref_ctr_offsets);
kvfree(uprobes);
kfree(link);
if (task)
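
For reference, a user-space sketch of consuming the new uprobe_multi link info (array sizes are illustrative; it assumes libbpf's ``bpf_link_get_info_by_fd()``, the UAPI headers from this series, and a ``link_fd`` that refers to a uprobe_multi link)::

  #include <bpf/bpf.h>
  #include <linux/bpf.h>
  #include <linux/limits.h>
  #include <stdio.h>

  static int dump_uprobe_multi(int link_fd)
  {
          __u64 offsets[64], ref_ctr_offsets[64], cookies[64];
          char path[PATH_MAX];
          struct bpf_link_info info = {};
          __u32 len = sizeof(info);
          int err;

          info.uprobe_multi.path = (__u64)(unsigned long)path;
          info.uprobe_multi.path_size = sizeof(path);
          info.uprobe_multi.offsets = (__u64)(unsigned long)offsets;
          info.uprobe_multi.ref_ctr_offsets = (__u64)(unsigned long)ref_ctr_offsets;
          info.uprobe_multi.cookies = (__u64)(unsigned long)cookies;
          info.uprobe_multi.count = 64; /* in: array capacity, out: actual count */

          err = bpf_link_get_info_by_fd(link_fd, &info, &len);
          if (err) /* -ENOSPC when the arrays were too small */
                  return err;

          printf("%s: %u uprobes, flags 0x%x, pid %d\n", path,
                 info.uprobe_multi.count, info.uprobe_multi.flags,
                 (int)info.uprobe_multi.pid);
          return 0;
  }
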
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index c148f8d1e564..e380fdf756db 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -12199,7 +12199,6 @@ static struct bpf_test tests[] = {
BPF_JMP32_IMM_ZEXT(JLE),
BPF_JMP32_IMM_ZEXT(JSGT),
BPF_JMP32_IMM_ZEXT(JSGE),
- BPF_JMP32_IMM_ZEXT(JSGT),
BPF_JMP32_IMM_ZEXT(JSLT),
BPF_JMP32_IMM_ZEXT(JSLE),
#undef BPF_JMP2_IMM_ZEXT
@@ -12235,7 +12234,6 @@ static struct bpf_test tests[] = {
BPF_JMP32_REG_ZEXT(JLE),
BPF_JMP32_REG_ZEXT(JSGT),
BPF_JMP32_REG_ZEXT(JSGE),
- BPF_JMP32_REG_ZEXT(JSGT),
BPF_JMP32_REG_ZEXT(JSLT),
BPF_JMP32_REG_ZEXT(JSLE),
#undef BPF_JMP2_REG_ZEXT
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index c9fdcc5cdce1..711cf5d59816 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -542,7 +542,7 @@ struct bpf_fentry_test_t {
int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg)
{
- asm volatile ("");
+ asm volatile ("": "+r"(arg));
return (long)arg;
}
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index fe61f85bcf33..10f2124e9e23 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -6,6 +6,7 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/xdp.h>
+#include <net/xdp_sock.h>
#include "netdev-genl-gen.h"
@@ -13,6 +14,7 @@ static int
netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
const struct genl_info *info)
{
+ u64 xsk_features = 0;
u64 xdp_rx_meta = 0;
void *hdr;
@@ -26,11 +28,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC
+ if (netdev->xsk_tx_metadata_ops) {
+ if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP;
+ if (netdev->xsk_tx_metadata_ops->tmo_request_checksum)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM;
+ }
+
if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
netdev->xdp_features, NETDEV_A_DEV_PAD) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
- xdp_rx_meta, NETDEV_A_DEV_PAD)) {
+ xdp_rx_meta, NETDEV_A_DEV_PAD) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES,
+ xsk_features, NETDEV_A_DEV_PAD)) {
genlmsg_cancel(rsp, hdr);
return -EINVAL;
}
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 06cead2b8e34..caa340134b0e 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -148,6 +148,11 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
return 0;
}
+#define XDP_UMEM_FLAGS_VALID ( \
+ XDP_UMEM_UNALIGNED_CHUNK_FLAG | \
+ XDP_UMEM_TX_SW_CSUM | \
+ 0)
+
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
@@ -167,7 +172,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
return -EINVAL;
}
- if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
+ if (mr->flags & ~XDP_UMEM_FLAGS_VALID)
return -EINVAL;
if (!unaligned_chunks && !is_power_of_2(chunk_size))
@@ -199,6 +204,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
return -EINVAL;
+ if (mr->tx_metadata_len >= 256 || mr->tx_metadata_len % 8)
+ return -EINVAL;
+
umem->size = size;
umem->headroom = headroom;
umem->chunk_size = chunk_size;
@@ -207,6 +215,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
umem->pgs = NULL;
umem->user = NULL;
umem->flags = mr->flags;
+ umem->tx_metadata_len = mr->tx_metadata_len;
INIT_LIST_HEAD(&umem->xsk_dma_list);
refcount_set(&umem->users, 1);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ae9f8cb611f6..281d49b4fca4 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -571,6 +571,13 @@ static u32 xsk_get_num_desc(struct sk_buff *skb)
static void xsk_destruct_skb(struct sk_buff *skb)
{
+ struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
+
+ if (compl->tx_timestamp) {
+ /* sw completion timestamp, not a real one */
+ *compl->tx_timestamp = ktime_get_tai_fast_ns();
+ }
+
xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb));
sock_wfree(skb);
}
@@ -655,8 +662,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
struct xdp_desc *desc)
{
+ struct xsk_tx_metadata *meta = NULL;
struct net_device *dev = xs->dev;
struct sk_buff *skb = xs->skb;
+ bool first_frag = false;
int err;
if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
@@ -687,6 +696,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
kfree_skb(skb);
goto free_err;
}
+
+ first_frag = true;
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
struct page *page;
@@ -709,12 +720,45 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
skb_add_rx_frag(skb, nr_frags, page, 0, len, 0);
}
+
+ if (first_frag && desc->options & XDP_TX_METADATA) {
+ if (unlikely(xs->pool->tx_metadata_len == 0)) {
+ err = -EINVAL;
+ goto free_err;
+ }
+
+ meta = buffer - xs->pool->tx_metadata_len;
+ if (unlikely(!xsk_buff_valid_tx_metadata(meta))) {
+ err = -EINVAL;
+ goto free_err;
+ }
+
+ if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
+ if (unlikely(meta->request.csum_start +
+ meta->request.csum_offset +
+ sizeof(__sum16) > len)) {
+ err = -EINVAL;
+ goto free_err;
+ }
+
+ skb->csum_start = hr + meta->request.csum_start;
+ skb->csum_offset = meta->request.csum_offset;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ if (unlikely(xs->pool->tx_sw_csum)) {
+ err = skb_checksum_help(skb);
+ if (err)
+ goto free_err;
+ }
+ }
+ }
}
skb->dev = dev;
skb->priority = READ_ONCE(xs->sk.sk_priority);
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
+ xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
xsk_set_destructor_arg(skb);
return skb;
@@ -1283,6 +1327,14 @@ struct xdp_umem_reg_v1 {
__u32 headroom;
};
+struct xdp_umem_reg_v2 {
+ __u64 addr; /* Start of packet data area */
+ __u64 len; /* Length of packet data area */
+ __u32 chunk_size;
+ __u32 headroom;
+ __u32 flags;
+};
+
static int xsk_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
@@ -1326,8 +1378,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
if (optlen < sizeof(struct xdp_umem_reg_v1))
return -EINVAL;
- else if (optlen < sizeof(mr))
+ else if (optlen < sizeof(struct xdp_umem_reg_v2))
mr_size = sizeof(struct xdp_umem_reg_v1);
+ else if (optlen < sizeof(mr))
+ mr_size = sizeof(struct xdp_umem_reg_v2);
if (copy_from_sockptr(&mr, optval, mr_size))
return -EFAULT;
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 49cb9f9a09be..4f6f538a5462 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -85,6 +85,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
XDP_PACKET_HEADROOM;
pool->umem = umem;
pool->addrs = umem->addrs;
+ pool->tx_metadata_len = umem->tx_metadata_len;
+ pool->tx_sw_csum = umem->flags & XDP_UMEM_TX_SW_CSUM;
INIT_LIST_HEAD(&pool->free_list);
INIT_LIST_HEAD(&pool->xskb_list);
INIT_LIST_HEAD(&pool->xsk_tx_list);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 13354a1e4280..6f2d1621c992 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -137,21 +137,23 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
static inline bool xp_unused_options_set(u32 options)
{
- return options & ~XDP_PKT_CONTD;
+ return options & ~(XDP_PKT_CONTD | XDP_TX_METADATA);
}
static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
- u64 offset = desc->addr & (pool->chunk_size - 1);
+ u64 addr = desc->addr - pool->tx_metadata_len;
+ u64 len = desc->len + pool->tx_metadata_len;
+ u64 offset = addr & (pool->chunk_size - 1);
if (!desc->len)
return false;
- if (offset + desc->len > pool->chunk_size)
+ if (offset + len > pool->chunk_size)
return false;
- if (desc->addr >= pool->addrs_cnt)
+ if (addr >= pool->addrs_cnt)
return false;
if (xp_unused_options_set(desc->options))
@@ -162,16 +164,17 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
- u64 addr = xp_unaligned_add_offset_to_addr(desc->addr);
+ u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len;
+ u64 len = desc->len + pool->tx_metadata_len;
if (!desc->len)
return false;
- if (desc->len > pool->chunk_size)
+ if (len > pool->chunk_size)
return false;
- if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt ||
- xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
+ if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt ||
+ xp_desc_crosses_non_contig_pg(pool, addr, len))
return false;
if (xp_unused_options_set(desc->options))
diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c
index a1528cde81ab..cb46667a6b2e 100644
--- a/tools/bpf/bpftool/link.c
+++ b/tools/bpf/bpftool/link.c
@@ -294,6 +294,37 @@ show_kprobe_multi_json(struct bpf_link_info *info, json_writer_t *wtr)
jsonw_end_array(json_wtr);
}
+static __u64 *u64_to_arr(__u64 val)
+{
+ return (__u64 *) u64_to_ptr(val);
+}
+
+static void
+show_uprobe_multi_json(struct bpf_link_info *info, json_writer_t *wtr)
+{
+ __u32 i;
+
+ jsonw_bool_field(json_wtr, "retprobe",
+ info->uprobe_multi.flags & BPF_F_UPROBE_MULTI_RETURN);
+ jsonw_string_field(json_wtr, "path", (char *) u64_to_ptr(info->uprobe_multi.path));
+ jsonw_uint_field(json_wtr, "func_cnt", info->uprobe_multi.count);
+ jsonw_int_field(json_wtr, "pid", (int) info->uprobe_multi.pid);
+ jsonw_name(json_wtr, "funcs");
+ jsonw_start_array(json_wtr);
+
+ for (i = 0; i < info->uprobe_multi.count; i++) {
+ jsonw_start_object(json_wtr);
+ jsonw_uint_field(json_wtr, "offset",
+ u64_to_arr(info->uprobe_multi.offsets)[i]);
+ jsonw_uint_field(json_wtr, "ref_ctr_offset",
+ u64_to_arr(info->uprobe_multi.ref_ctr_offsets)[i]);
+ jsonw_uint_field(json_wtr, "cookie",
+ u64_to_arr(info->uprobe_multi.cookies)[i]);
+ jsonw_end_object(json_wtr);
+ }
+ jsonw_end_array(json_wtr);
+}
+
static void
show_perf_event_kprobe_json(struct bpf_link_info *info, json_writer_t *wtr)
{
@@ -465,6 +496,9 @@ static int show_link_close_json(int fd, struct bpf_link_info *info)
case BPF_LINK_TYPE_KPROBE_MULTI:
show_kprobe_multi_json(info, json_wtr);
break;
+ case BPF_LINK_TYPE_UPROBE_MULTI:
+ show_uprobe_multi_json(info, json_wtr);
+ break;
case BPF_LINK_TYPE_PERF_EVENT:
switch (info->perf_event.type) {
case BPF_PERF_EVENT_EVENT:
@@ -674,6 +708,33 @@ static void show_kprobe_multi_plain(struct bpf_link_info *info)
}
}
+static void show_uprobe_multi_plain(struct bpf_link_info *info)
+{
+ __u32 i;
+
+ if (!info->uprobe_multi.count)
+ return;
+
+ if (info->uprobe_multi.flags & BPF_F_UPROBE_MULTI_RETURN)
+ printf("\n\turetprobe.multi ");
+ else
+ printf("\n\tuprobe.multi ");
+
+ printf("path %s ", (char *) u64_to_ptr(info->uprobe_multi.path));
+ printf("func_cnt %u ", info->uprobe_multi.count);
+
+ if (info->uprobe_multi.pid)
+ printf("pid %d ", info->uprobe_multi.pid);
+
+ printf("\n\t%-16s %-16s %-16s", "offset", "ref_ctr_offset", "cookies");
+ for (i = 0; i < info->uprobe_multi.count; i++) {
+ printf("\n\t0x%-16llx 0x%-16llx 0x%-16llx",
+ u64_to_arr(info->uprobe_multi.offsets)[i],
+ u64_to_arr(info->uprobe_multi.ref_ctr_offsets)[i],
+ u64_to_arr(info->uprobe_multi.cookies)[i]);
+ }
+}
+
static void show_perf_event_kprobe_plain(struct bpf_link_info *info)
{
const char *buf;
@@ -807,6 +868,9 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info)
case BPF_LINK_TYPE_KPROBE_MULTI:
show_kprobe_multi_plain(info);
break;
+ case BPF_LINK_TYPE_UPROBE_MULTI:
+ show_uprobe_multi_plain(info);
+ break;
case BPF_LINK_TYPE_PERF_EVENT:
switch (info->perf_event.type) {
case BPF_PERF_EVENT_EVENT:
@@ -846,8 +910,10 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info)
static int do_show_link(int fd)
{
+ __u64 *ref_ctr_offsets = NULL, *offsets = NULL, *cookies = NULL;
struct bpf_link_info info;
__u32 len = sizeof(info);
+ char path_buf[PATH_MAX];
__u64 *addrs = NULL;
char buf[PATH_MAX];
int count;
@@ -889,6 +955,39 @@ again:
goto again;
}
}
+ if (info.type == BPF_LINK_TYPE_UPROBE_MULTI &&
+ !info.uprobe_multi.offsets) {
+ count = info.uprobe_multi.count;
+ if (count) {
+ offsets = calloc(count, sizeof(__u64));
+ if (!offsets) {
+ p_err("mem alloc failed");
+ close(fd);
+ return -ENOMEM;
+ }
+ info.uprobe_multi.offsets = ptr_to_u64(offsets);
+ ref_ctr_offsets = calloc(count, sizeof(__u64));
+ if (!ref_ctr_offsets) {
+ p_err("mem alloc failed");
+ free(offsets);
+ close(fd);
+ return -ENOMEM;
+ }
+ info.uprobe_multi.ref_ctr_offsets = ptr_to_u64(ref_ctr_offsets);
+ cookies = calloc(count, sizeof(__u64));
+ if (!cookies) {
+ p_err("mem alloc failed");
+ free(cookies);
+				free(ref_ctr_offsets);
+ free(offsets);
+ close(fd);
+ return -ENOMEM;
+ }
+ info.uprobe_multi.cookies = ptr_to_u64(cookies);
+ info.uprobe_multi.path = ptr_to_u64(path_buf);
+ info.uprobe_multi.path_size = sizeof(path_buf);
+ goto again;
+ }
+ }
if (info.type == BPF_LINK_TYPE_PERF_EVENT) {
switch (info.perf_event.type) {
case BPF_PERF_EVENT_TRACEPOINT:
@@ -924,8 +1023,10 @@ again:
else
show_link_close_plain(fd, &info);
- if (addrs)
- free(addrs);
+ free(ref_ctr_offsets);
+ free(cookies);
+ free(offsets);
+ free(addrs);
close(fd);
return 0;
}
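The two-pass pattern above (query the count first, then re-issue the call with user buffers plugged into bpf_link_info) can be reused outside bpftool. A minimal sketch, assuming only the bpf_link_get_info_by_fd() API and the new uprobe_multi fields from this series -- the function name and output format are illustrative, not part of the patch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <limits.h>
#include <bpf/bpf.h>

/* Pass 1 learns uprobe_multi.count; pass 2 re-submits the same struct with
 * user buffers hooked up, and the kernel fills path and offsets.
 */
static int dump_uprobe_multi(int link_fd)
{
	struct bpf_link_info info;
	__u32 len = sizeof(info);
	char path[PATH_MAX];
	__u64 *offsets;
	__u32 i, count;
	int err;

	memset(&info, 0, sizeof(info));
	err = bpf_link_get_info_by_fd(link_fd, &info, &len);
	if (err || info.type != BPF_LINK_TYPE_UPROBE_MULTI)
		return err ? err : -EINVAL;

	count = info.uprobe_multi.count;
	if (!count)
		return 0;

	offsets = calloc(count, sizeof(*offsets));
	if (!offsets)
		return -ENOMEM;

	info.uprobe_multi.offsets = (uintptr_t)offsets;
	info.uprobe_multi.path = (uintptr_t)path;
	info.uprobe_multi.path_size = sizeof(path);
	err = bpf_link_get_info_by_fd(link_fd, &info, &len);	/* pass 2 */
	if (!err) {
		printf("%s\n", path);
		for (i = 0; i < count; i++)
			printf("  offset 0x%llx\n",
			       (unsigned long long)offsets[i]);
	}
	free(offsets);
	return err;
}

bpftool additionally retrieves ref_ctr_offsets and cookies the same way, one calloc() per array.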
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 7ec4f5671e7a..feb8e305804f 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -442,7 +442,7 @@ static void print_prog_header_json(struct bpf_prog_info *info, int fd)
jsonw_uint_field(json_wtr, "recursion_misses", info->recursion_misses);
}
-static void print_prog_json(struct bpf_prog_info *info, int fd)
+static void print_prog_json(struct bpf_prog_info *info, int fd, bool orphaned)
{
char *memlock;
@@ -461,6 +461,7 @@ static void print_prog_json(struct bpf_prog_info *info, int fd)
jsonw_uint_field(json_wtr, "uid", info->created_by_uid);
}
+ jsonw_bool_field(json_wtr, "orphaned", orphaned);
jsonw_uint_field(json_wtr, "bytes_xlated", info->xlated_prog_len);
if (info->jited_prog_len) {
@@ -527,7 +528,7 @@ static void print_prog_header_plain(struct bpf_prog_info *info, int fd)
printf("\n");
}
-static void print_prog_plain(struct bpf_prog_info *info, int fd)
+static void print_prog_plain(struct bpf_prog_info *info, int fd, bool orphaned)
{
char *memlock;
@@ -554,6 +555,9 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd)
printf(" memlock %sB", memlock);
free(memlock);
+ if (orphaned)
+ printf(" orphaned");
+
if (info->nr_map_ids)
show_prog_maps(fd, info->nr_map_ids);
@@ -581,15 +585,15 @@ static int show_prog(int fd)
int err;
err = bpf_prog_get_info_by_fd(fd, &info, &len);
- if (err) {
+ if (err && err != -ENODEV) {
p_err("can't get prog info: %s", strerror(errno));
return -1;
}
if (json_output)
- print_prog_json(&info, fd);
+ print_prog_json(&info, fd, err == -ENODEV);
else
- print_prog_plain(&info, fd);
+ print_prog_plain(&info, fd, err == -ENODEV);
return 0;
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7a5498242eaa..e88746ba7d21 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6563,6 +6563,16 @@ struct bpf_link_info {
__u64 missed;
} kprobe_multi;
struct {
+ __aligned_u64 path;
+ __aligned_u64 offsets;
+ __aligned_u64 ref_ctr_offsets;
+ __aligned_u64 cookies;
+ __u32 path_size; /* in/out: real path size on success, including zero byte */
+ __u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */
+ __u32 flags;
+ __u32 pid;
+ } uprobe_multi;
+ struct {
__u32 type; /* enum bpf_perf_event_type */
__u32 :32;
union {
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index 73a47da885dc..638c606dfa74 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -26,14 +26,20 @@
*/
#define XDP_USE_NEED_WAKEUP (1 << 3)
/* By setting this option, userspace application indicates that it can
- * handle multiple descriptors per packet thus enabling xsk core to split
+ * handle multiple descriptors per packet thus enabling AF_XDP to split
* multi-buffer XDP frames into multiple Rx descriptors. Without this set
- * such frames will be dropped by xsk.
+ * such frames will be dropped.
*/
-#define XDP_USE_SG (1 << 4)
+#define XDP_USE_SG (1 << 4)
/* Flags for xsk_umem_config flags */
-#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
+#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
+
+/* Force checksum calculation in software. Can be used for testing or
+ * working around potential HW issues. This option causes performance
+ * degradation and only works in XDP_COPY mode.
+ */
+#define XDP_UMEM_TX_SW_CSUM (1 << 1)
struct sockaddr_xdp {
__u16 sxdp_family;
@@ -76,6 +82,7 @@ struct xdp_umem_reg {
__u32 chunk_size;
__u32 headroom;
__u32 flags;
+ __u32 tx_metadata_len;
};
struct xdp_statistics {
@@ -105,6 +112,41 @@ struct xdp_options {
#define XSK_UNALIGNED_BUF_ADDR_MASK \
((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
+/* Request transmit timestamp. Upon completion, put it into tx_timestamp
+ * field of union xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0)
+
+/* Request transmit checksum offload. Checksum start position and offset
+ * are communicated via csum_start and csum_offset fields of union
+ * xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1)
+
+/* AF_XDP offloads request. 'request' union member is consumed by the driver
+ * when the packet is being transmitted. 'completion' union member is
+ * filled by the driver when the transmit completion arrives.
+ */
+struct xsk_tx_metadata {
+ __u64 flags;
+
+ union {
+ struct {
+ /* XDP_TXMD_FLAGS_CHECKSUM */
+
+ /* Offset from desc->addr where checksumming should start. */
+ __u16 csum_start;
+ /* Offset from csum_start where checksum should be stored. */
+ __u16 csum_offset;
+ } request;
+
+ struct {
+ /* XDP_TXMD_FLAGS_TIMESTAMP */
+ __u64 tx_timestamp;
+ } completion;
+ };
+};
+
/* Rx/Tx descriptor */
struct xdp_desc {
__u64 addr;
@@ -112,9 +154,16 @@ struct xdp_desc {
__u32 options;
};
-/* Flag indicating packet constitutes of multiple buffers*/
+/* UMEM descriptor is __u64 */
+
+/* Flag indicating that the packet continues with the buffer pointed out by the
+ * next frame in the ring. The end of the packet is signalled by setting this
+ * bit to zero. For single buffer packets, every descriptor has 'options' set
+ * to 0 and this maintains backward compatibility.
+ */
#define XDP_PKT_CONTD (1 << 0)
-/* UMEM descriptor is __u64 */
+/* TX packet carries valid metadata. */
+#define XDP_TX_METADATA (1 << 1)
#endif /* _LINUX_IF_XDP_H */
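Putting the new uAPI together from the application side -- a minimal sketch, assuming umem_fd, umem_area, UMEM_SIZE, data and tx_desc exist in the caller; it condenses what the xdp_metadata and xdp_hw_metadata selftests further down in this series do:

/* 1) Register the UMEM with room for per-packet TX metadata. */
struct xdp_umem_reg mr = {
	.addr            = (__u64)(uintptr_t)umem_area,
	.len             = UMEM_SIZE,
	.chunk_size      = XSK_UMEM__DEFAULT_FRAME_SIZE,
	.headroom        = 0,
	.flags           = 0,		/* or XDP_UMEM_TX_SW_CSUM in copy mode */
	.tx_metadata_len = sizeof(struct xsk_tx_metadata),
};
setsockopt(umem_fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));

/* 2) Per TX descriptor: metadata sits immediately before desc->addr. */
struct xsk_tx_metadata *meta = data - sizeof(struct xsk_tx_metadata);

memset(meta, 0, sizeof(*meta));
meta->flags = XDP_TXMD_FLAGS_TIMESTAMP | XDP_TXMD_FLAGS_CHECKSUM;
meta->request.csum_start = sizeof(struct ethhdr) + sizeof(struct iphdr);
meta->request.csum_offset = offsetof(struct udphdr, check);
tx_desc->options |= XDP_TX_METADATA;

/* 3) After reaping the completion, meta->completion.tx_timestamp carries the
 *    HW timestamp if the driver filled it in.
 */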
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 2b37233e00c0..6244c0164976 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -48,9 +48,18 @@ enum netdev_xdp_act {
enum netdev_xdp_rx_metadata {
NETDEV_XDP_RX_METADATA_TIMESTAMP = 1,
NETDEV_XDP_RX_METADATA_HASH = 2,
+};
- /* private: */
- NETDEV_XDP_RX_METADATA_MASK = 3,
+/**
+ * enum netdev_xsk_flags
+ * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported
+ * by the driver.
+ * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the
+ * driver.
+ */
+enum netdev_xsk_flags {
+ NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1,
+ NETDEV_XSK_FLAGS_TX_CHECKSUM = 2,
};
enum {
@@ -59,6 +68,7 @@ enum {
NETDEV_A_DEV_XDP_FEATURES,
NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+ NETDEV_A_DEV_XSK_FEATURES,
__NETDEV_A_DEV_MAX,
NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
diff --git a/tools/lib/bpf/elf.c b/tools/lib/bpf/elf.c
index 2a62bf411bb3..b02faec748a5 100644
--- a/tools/lib/bpf/elf.c
+++ b/tools/lib/bpf/elf.c
@@ -407,7 +407,8 @@ static int symbol_cmp(const void *a, const void *b)
* size, that needs to be released by the caller.
*/
int elf_resolve_syms_offsets(const char *binary_path, int cnt,
- const char **syms, unsigned long **poffsets)
+ const char **syms, unsigned long **poffsets,
+ int st_type)
{
int sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB };
int err = 0, i, cnt_done = 0;
@@ -438,7 +439,7 @@ int elf_resolve_syms_offsets(const char *binary_path, int cnt,
struct elf_sym_iter iter;
struct elf_sym *sym;
- err = elf_sym_iter_new(&iter, elf_fd.elf, binary_path, sh_types[i], STT_FUNC);
+ err = elf_sym_iter_new(&iter, elf_fd.elf, binary_path, sh_types[i], st_type);
if (err == -ENOENT)
continue;
if (err)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e067be95da3c..ea9b8158c20d 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -11447,7 +11447,7 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog,
return libbpf_err_ptr(err);
offsets = resolved_offsets;
} else if (syms) {
- err = elf_resolve_syms_offsets(path, cnt, syms, &resolved_offsets);
+ err = elf_resolve_syms_offsets(path, cnt, syms, &resolved_offsets, STT_FUNC);
if (err < 0)
return libbpf_err_ptr(err);
offsets = resolved_offsets;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index b52dc28dc8af..91c5aef7dae7 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -409,3 +409,6 @@ LIBBPF_1.3.0 {
ring__size;
ring_buffer__ring;
} LIBBPF_1.2.0;
+
+LIBBPF_1.4.0 {
+} LIBBPF_1.3.0;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index f0f08635adb0..b5d334754e5d 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -594,7 +594,8 @@ int elf_open(const char *binary_path, struct elf_fd *elf_fd);
void elf_close(struct elf_fd *elf_fd);
int elf_resolve_syms_offsets(const char *binary_path, int cnt,
- const char **syms, unsigned long **poffsets);
+ const char **syms, unsigned long **poffsets,
+ int st_type);
int elf_resolve_pattern_offsets(const char *binary_path, const char *pattern,
unsigned long **poffsets, size_t *pcnt);
diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h
index 290411ddb39e..e783a47da815 100644
--- a/tools/lib/bpf/libbpf_version.h
+++ b/tools/lib/bpf/libbpf_version.h
@@ -4,6 +4,6 @@
#define __LIBBPF_VERSION_H
#define LIBBPF_MAJOR_VERSION 1
-#define LIBBPF_MINOR_VERSION 3
+#define LIBBPF_MINOR_VERSION 4
#endif /* __LIBBPF_VERSION_H */
diff --git a/tools/net/ynl/generated/netdev-user.c b/tools/net/ynl/generated/netdev-user.c
index a7b7019d00f1..3b9dee94d4ce 100644
--- a/tools/net/ynl/generated/netdev-user.c
+++ b/tools/net/ynl/generated/netdev-user.c
@@ -63,6 +63,19 @@ const char *netdev_xdp_rx_metadata_str(enum netdev_xdp_rx_metadata value)
return netdev_xdp_rx_metadata_strmap[value];
}
+static const char * const netdev_xsk_flags_strmap[] = {
+ [0] = "tx-timestamp",
+ [1] = "tx-checksum",
+};
+
+const char *netdev_xsk_flags_str(enum netdev_xsk_flags value)
+{
+ value = ffs(value) - 1;
+ if (value < 0 || value >= (int)MNL_ARRAY_SIZE(netdev_xsk_flags_strmap))
+ return NULL;
+ return netdev_xsk_flags_strmap[value];
+}
+
/* Policies */
struct ynl_policy_attr netdev_page_pool_info_policy[NETDEV_A_PAGE_POOL_MAX + 1] = {
[NETDEV_A_PAGE_POOL_ID] = { .name = "id", .type = YNL_PT_UINT, },
@@ -80,6 +93,7 @@ struct ynl_policy_attr netdev_dev_policy[NETDEV_A_DEV_MAX + 1] = {
[NETDEV_A_DEV_XDP_FEATURES] = { .name = "xdp-features", .type = YNL_PT_U64, },
[NETDEV_A_DEV_XDP_ZC_MAX_SEGS] = { .name = "xdp-zc-max-segs", .type = YNL_PT_U32, },
[NETDEV_A_DEV_XDP_RX_METADATA_FEATURES] = { .name = "xdp-rx-metadata-features", .type = YNL_PT_U64, },
+ [NETDEV_A_DEV_XSK_FEATURES] = { .name = "xsk-features", .type = YNL_PT_U64, },
};
struct ynl_policy_nest netdev_dev_nest = {
@@ -209,6 +223,11 @@ int netdev_dev_get_rsp_parse(const struct nlmsghdr *nlh, void *data)
return MNL_CB_ERROR;
dst->_present.xdp_rx_metadata_features = 1;
dst->xdp_rx_metadata_features = mnl_attr_get_u64(attr);
+ } else if (type == NETDEV_A_DEV_XSK_FEATURES) {
+ if (ynl_attr_validate(yarg, attr))
+ return MNL_CB_ERROR;
+ dst->_present.xsk_features = 1;
+ dst->xsk_features = mnl_attr_get_u64(attr);
}
}
diff --git a/tools/net/ynl/generated/netdev-user.h b/tools/net/ynl/generated/netdev-user.h
index 4093602c9b6c..cc3d80d1cf8c 100644
--- a/tools/net/ynl/generated/netdev-user.h
+++ b/tools/net/ynl/generated/netdev-user.h
@@ -19,6 +19,7 @@ extern const struct ynl_family ynl_netdev_family;
const char *netdev_op_str(int op);
const char *netdev_xdp_act_str(enum netdev_xdp_act value);
const char *netdev_xdp_rx_metadata_str(enum netdev_xdp_rx_metadata value);
+const char *netdev_xsk_flags_str(enum netdev_xsk_flags value);
/* Common nested types */
struct netdev_page_pool_info {
@@ -60,12 +61,14 @@ struct netdev_dev_get_rsp {
__u32 xdp_features:1;
__u32 xdp_zc_max_segs:1;
__u32 xdp_rx_metadata_features:1;
+ __u32 xsk_features:1;
} _present;
__u32 ifindex;
__u64 xdp_features;
__u32 xdp_zc_max_segs;
__u64 xdp_rx_metadata_features;
+ __u64 xsk_features;
};
void netdev_dev_get_rsp_free(struct netdev_dev_get_rsp *rsp);
diff --git a/tools/net/ynl/samples/netdev.c b/tools/net/ynl/samples/netdev.c
index b828225daad0..591b90e21890 100644
--- a/tools/net/ynl/samples/netdev.c
+++ b/tools/net/ynl/samples/netdev.c
@@ -33,17 +33,23 @@ static void netdev_print_device(struct netdev_dev_get_rsp *d, unsigned int op)
return;
printf("xdp-features (%llx):", d->xdp_features);
- for (int i = 0; d->xdp_features > 1U << i; i++) {
+ for (int i = 0; d->xdp_features >= 1U << i; i++) {
if (d->xdp_features & (1U << i))
printf(" %s", netdev_xdp_act_str(1 << i));
}
printf(" xdp-rx-metadata-features (%llx):", d->xdp_rx_metadata_features);
- for (int i = 0; d->xdp_rx_metadata_features > 1U << i; i++) {
+ for (int i = 0; d->xdp_rx_metadata_features >= 1U << i; i++) {
if (d->xdp_rx_metadata_features & (1U << i))
printf(" %s", netdev_xdp_rx_metadata_str(1 << i));
}
+ printf(" xsk-features (%llx):", d->xsk_features);
+ for (int i = 0; d->xsk_features >= 1U << i; i++) {
+ if (d->xsk_features & (1U << i))
+ printf(" %s", netdev_xsk_flags_str(1 << i));
+ }
+
printf(" xdp-zc-max-segs=%u", d->xdp_zc_max_segs);
name = netdev_op_str(op);
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 9c27b67bc7b1..617ae55c3bb5 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -18,7 +18,7 @@ else
GENDIR := $(abspath ../../../../include/generated)
endif
GENHDR := $(GENDIR)/autoconf.h
-HOSTPKG_CONFIG := pkg-config
+PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config
ifneq ($(wildcard $(GENHDR)),)
GENFLAGS := -DHAVE_GENHDR
@@ -29,13 +29,17 @@ SAN_CFLAGS ?=
SAN_LDFLAGS ?= $(SAN_CFLAGS)
RELEASE ?=
OPT_FLAGS ?= $(if $(RELEASE),-O2,-O0)
+
+LIBELF_CFLAGS := $(shell $(PKG_CONFIG) libelf --cflags 2>/dev/null)
+LIBELF_LIBS := $(shell $(PKG_CONFIG) libelf --libs 2>/dev/null || echo -lelf)
+
CFLAGS += -g $(OPT_FLAGS) -rdynamic \
-Wall -Werror \
- $(GENFLAGS) $(SAN_CFLAGS) \
+ $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \
-I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \
-I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT)
LDFLAGS += $(SAN_LDFLAGS)
-LDLIBS += -lelf -lz -lrt -lpthread
+LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread
ifneq ($(LLVM),)
# Silence some warnings when compiled with clang
@@ -219,9 +223,9 @@ $(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_r
$(OUTPUT)/sign-file: ../../../../scripts/sign-file.c
$(call msg,SIGN-FILE,,$@)
- $(Q)$(CC) $(shell $(HOSTPKG_CONFIG) --cflags libcrypto 2> /dev/null) \
+ $(Q)$(CC) $(shell $(PKG_CONFIG) --cflags libcrypto 2> /dev/null) \
$< -o $@ \
- $(shell $(HOSTPKG_CONFIG) --libs libcrypto 2> /dev/null || echo -lcrypto)
+ $(shell $(PKG_CONFIG) --libs libcrypto 2> /dev/null || echo -lcrypto)
$(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch])
$(call msg,MOD,,$@)
diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst
index cb9b95702ac6..9af79c7a9b58 100644
--- a/tools/testing/selftests/bpf/README.rst
+++ b/tools/testing/selftests/bpf/README.rst
@@ -77,7 +77,7 @@ In case of linker errors when running selftests, try using static linking:
.. code-block:: console
- $ LDLIBS=-static vmtest.sh
+ $ LDLIBS=-static PKG_CONFIG='pkg-config --static' vmtest.sh
.. note:: Some distros may not support static linking.
diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
index 34f1200a781b..94b9be24e39b 100644
--- a/tools/testing/selftests/bpf/network_helpers.h
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -71,4 +71,47 @@ struct nstoken;
*/
struct nstoken *open_netns(const char *name);
void close_netns(struct nstoken *token);
+
+static inline __u16 csum_fold(__u32 csum)
+{
+ csum = (csum & 0xffff) + (csum >> 16);
+ csum = (csum & 0xffff) + (csum >> 16);
+
+ return (__u16)~csum;
+}
+
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+ __u32 len, __u8 proto,
+ __wsum csum)
+{
+ __u64 s = csum;
+
+ s += (__u32)saddr;
+ s += (__u32)daddr;
+ s += htons(proto + len);
+ s = (s & 0xffffffff) + (s >> 32);
+ s = (s & 0xffffffff) + (s >> 32);
+
+ return csum_fold((__u32)s);
+}
+
+static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, __u8 proto,
+ __wsum csum)
+{
+ __u64 s = csum;
+ int i;
+
+ for (i = 0; i < 4; i++)
+ s += (__u32)saddr->s6_addr32[i];
+ for (i = 0; i < 4; i++)
+ s += (__u32)daddr->s6_addr32[i];
+ s += htons(proto + len);
+ s = (s & 0xffffffff) + (s >> 32);
+ s = (s & 0xffffffff) + (s >> 32);
+
+ return csum_fold((__u32)s);
+}
+
#endif
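Usage note for the helpers above -- a minimal sketch assuming iph/ip6h and udph already point at a fully built header chain, as in the xdp_metadata and xdp_hw_metadata tests below: before requesting XDP_TXMD_FLAGS_CHECKSUM the application seeds the UDP checksum with the pseudo-header sum only, and the device (or the software fallback) folds in the payload:

udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
				 ntohs(udph->len), IPPROTO_UDP, 0);
/* or, for IPv6: */
udph->check = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
			       ntohs(udph->len), IPPROTO_UDP, 0);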
diff --git a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c
index 97142a4db374..d4b1901f7879 100644
--- a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c
+++ b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c
@@ -7,6 +7,7 @@
#include <test_progs.h>
#include "trace_helpers.h"
#include "test_fill_link_info.skel.h"
+#include "bpf/libbpf_internal.h"
#define TP_CAT "sched"
#define TP_NAME "sched_switch"
@@ -140,14 +141,14 @@ static void test_kprobe_fill_link_info(struct test_fill_link_info *skel,
.retprobe = type == BPF_PERF_EVENT_KRETPROBE,
);
ssize_t entry_offset = 0;
+ struct bpf_link *link;
int link_fd, err;
- skel->links.kprobe_run = bpf_program__attach_kprobe_opts(skel->progs.kprobe_run,
- KPROBE_FUNC, &opts);
- if (!ASSERT_OK_PTR(skel->links.kprobe_run, "attach_kprobe"))
+ link = bpf_program__attach_kprobe_opts(skel->progs.kprobe_run, KPROBE_FUNC, &opts);
+ if (!ASSERT_OK_PTR(link, "attach_kprobe"))
return;
- link_fd = bpf_link__fd(skel->links.kprobe_run);
+ link_fd = bpf_link__fd(link);
if (!invalid) {
/* See also arch_adjust_kprobe_addr(). */
if (skel->kconfig->CONFIG_X86_KERNEL_IBT)
@@ -157,39 +158,41 @@ static void test_kprobe_fill_link_info(struct test_fill_link_info *skel,
} else {
kprobe_fill_invalid_user_buffer(link_fd);
}
- bpf_link__detach(skel->links.kprobe_run);
+ bpf_link__destroy(link);
}
static void test_tp_fill_link_info(struct test_fill_link_info *skel)
{
+ struct bpf_link *link;
int link_fd, err;
- skel->links.tp_run = bpf_program__attach_tracepoint(skel->progs.tp_run, TP_CAT, TP_NAME);
- if (!ASSERT_OK_PTR(skel->links.tp_run, "attach_tp"))
+ link = bpf_program__attach_tracepoint(skel->progs.tp_run, TP_CAT, TP_NAME);
+ if (!ASSERT_OK_PTR(link, "attach_tp"))
return;
- link_fd = bpf_link__fd(skel->links.tp_run);
+ link_fd = bpf_link__fd(link);
err = verify_perf_link_info(link_fd, BPF_PERF_EVENT_TRACEPOINT, 0, 0, 0);
ASSERT_OK(err, "verify_perf_link_info");
- bpf_link__detach(skel->links.tp_run);
+ bpf_link__destroy(link);
}
static void test_uprobe_fill_link_info(struct test_fill_link_info *skel,
enum bpf_perf_event_type type)
{
+ struct bpf_link *link;
int link_fd, err;
- skel->links.uprobe_run = bpf_program__attach_uprobe(skel->progs.uprobe_run,
- type == BPF_PERF_EVENT_URETPROBE,
- 0, /* self pid */
- UPROBE_FILE, uprobe_offset);
- if (!ASSERT_OK_PTR(skel->links.uprobe_run, "attach_uprobe"))
+ link = bpf_program__attach_uprobe(skel->progs.uprobe_run,
+ type == BPF_PERF_EVENT_URETPROBE,
+ 0, /* self pid */
+ UPROBE_FILE, uprobe_offset);
+ if (!ASSERT_OK_PTR(link, "attach_uprobe"))
return;
- link_fd = bpf_link__fd(skel->links.uprobe_run);
+ link_fd = bpf_link__fd(link);
err = verify_perf_link_info(link_fd, type, 0, uprobe_offset, 0);
ASSERT_OK(err, "verify_perf_link_info");
- bpf_link__detach(skel->links.uprobe_run);
+ bpf_link__destroy(link);
}
static int verify_kmulti_link_info(int fd, bool retprobe)
@@ -278,24 +281,214 @@ static void test_kprobe_multi_fill_link_info(struct test_fill_link_info *skel,
bool retprobe, bool invalid)
{
LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
+ struct bpf_link *link;
int link_fd, err;
opts.syms = kmulti_syms;
opts.cnt = KMULTI_CNT;
opts.retprobe = retprobe;
- skel->links.kmulti_run = bpf_program__attach_kprobe_multi_opts(skel->progs.kmulti_run,
- NULL, &opts);
- if (!ASSERT_OK_PTR(skel->links.kmulti_run, "attach_kprobe_multi"))
+ link = bpf_program__attach_kprobe_multi_opts(skel->progs.kmulti_run, NULL, &opts);
+ if (!ASSERT_OK_PTR(link, "attach_kprobe_multi"))
return;
- link_fd = bpf_link__fd(skel->links.kmulti_run);
+ link_fd = bpf_link__fd(link);
if (!invalid) {
err = verify_kmulti_link_info(link_fd, retprobe);
ASSERT_OK(err, "verify_kmulti_link_info");
} else {
verify_kmulti_invalid_user_buffer(link_fd);
}
- bpf_link__detach(skel->links.kmulti_run);
+ bpf_link__destroy(link);
+}
+
+#define SEC(name) __attribute__((section(name), used))
+
+static short uprobe_link_info_sema_1 SEC(".probes");
+static short uprobe_link_info_sema_2 SEC(".probes");
+static short uprobe_link_info_sema_3 SEC(".probes");
+
+noinline void uprobe_link_info_func_1(void)
+{
+ asm volatile ("");
+ uprobe_link_info_sema_1++;
+}
+
+noinline void uprobe_link_info_func_2(void)
+{
+ asm volatile ("");
+ uprobe_link_info_sema_2++;
+}
+
+noinline void uprobe_link_info_func_3(void)
+{
+ asm volatile ("");
+ uprobe_link_info_sema_3++;
+}
+
+static int
+verify_umulti_link_info(int fd, bool retprobe, __u64 *offsets,
+ __u64 *cookies, __u64 *ref_ctr_offsets)
+{
+ char path[PATH_MAX], path_buf[PATH_MAX];
+ struct bpf_link_info info;
+ __u32 len = sizeof(info);
+ __u64 ref_ctr_offsets_buf[3];
+ __u64 offsets_buf[3];
+ __u64 cookies_buf[3];
+ int i, err, bit;
+ __u32 count = 0;
+
+ memset(path, 0, sizeof(path));
+ err = readlink("/proc/self/exe", path, sizeof(path));
+ if (!ASSERT_NEQ(err, -1, "readlink"))
+ return -1;
+
+ for (bit = 0; bit < 8; bit++) {
+ memset(&info, 0, sizeof(info));
+ info.uprobe_multi.path = ptr_to_u64(path_buf);
+ info.uprobe_multi.path_size = sizeof(path_buf);
+ info.uprobe_multi.count = count;
+
+ if (bit & 0x1)
+ info.uprobe_multi.offsets = ptr_to_u64(offsets_buf);
+ if (bit & 0x2)
+ info.uprobe_multi.cookies = ptr_to_u64(cookies_buf);
+ if (bit & 0x4)
+ info.uprobe_multi.ref_ctr_offsets = ptr_to_u64(ref_ctr_offsets_buf);
+
+ err = bpf_link_get_info_by_fd(fd, &info, &len);
+ if (!ASSERT_OK(err, "bpf_link_get_info_by_fd"))
+ return -1;
+
+ if (!ASSERT_EQ(info.type, BPF_LINK_TYPE_UPROBE_MULTI, "info.type"))
+ return -1;
+
+ ASSERT_EQ(info.uprobe_multi.pid, getpid(), "info.uprobe_multi.pid");
+ ASSERT_EQ(info.uprobe_multi.count, 3, "info.uprobe_multi.count");
+		ASSERT_EQ(info.uprobe_multi.flags & BPF_F_UPROBE_MULTI_RETURN,
+ retprobe, "info.uprobe_multi.flags.retprobe");
+ ASSERT_EQ(info.uprobe_multi.path_size, strlen(path) + 1, "info.uprobe_multi.path_size");
+ ASSERT_STREQ(path_buf, path, "info.uprobe_multi.path");
+
+ for (i = 0; i < info.uprobe_multi.count; i++) {
+ if (info.uprobe_multi.offsets)
+ ASSERT_EQ(offsets_buf[i], offsets[i], "info.uprobe_multi.offsets");
+ if (info.uprobe_multi.cookies)
+ ASSERT_EQ(cookies_buf[i], cookies[i], "info.uprobe_multi.cookies");
+ if (info.uprobe_multi.ref_ctr_offsets) {
+ ASSERT_EQ(ref_ctr_offsets_buf[i], ref_ctr_offsets[i],
+ "info.uprobe_multi.ref_ctr_offsets");
+ }
+ }
+ count = count ?: info.uprobe_multi.count;
+ }
+
+ return 0;
+}
+
+static void verify_umulti_invalid_user_buffer(int fd)
+{
+ struct bpf_link_info info;
+ __u32 len = sizeof(info);
+ __u64 buf[3];
+ int err;
+
+ /* upath_size defined, not path */
+ memset(&info, 0, sizeof(info));
+ info.uprobe_multi.path_size = 3;
+ err = bpf_link_get_info_by_fd(fd, &info, &len);
+ ASSERT_EQ(err, -EINVAL, "failed_upath_size");
+
+ /* path defined, but small */
+ memset(&info, 0, sizeof(info));
+ info.uprobe_multi.path = ptr_to_u64(buf);
+ info.uprobe_multi.path_size = 3;
+ err = bpf_link_get_info_by_fd(fd, &info, &len);
+ ASSERT_LT(err, 0, "failed_upath_small");
+
+ /* path has wrong pointer */
+ memset(&info, 0, sizeof(info));
+ info.uprobe_multi.path_size = PATH_MAX;
+ info.uprobe_multi.path = 123;
+ err = bpf_link_get_info_by_fd(fd, &info, &len);
+ ASSERT_EQ(err, -EFAULT, "failed_bad_path_ptr");
+
+ /* count zero, with offsets */
+ memset(&info, 0, sizeof(info));
+ info.uprobe_multi.offsets = ptr_to_u64(buf);
+ err = bpf_link_get_info_by_fd(fd, &info, &len);
+ ASSERT_EQ(err, -EINVAL, "failed_count");
+
+ /* offsets not big enough */
+ memset(&info, 0, sizeof(info));
+ info.uprobe_multi.offsets = ptr_to_u64(buf);
+ info.uprobe_multi.count = 2;
+ err = bpf_link_get_info_by_fd(fd, &info, &len);
+ ASSERT_EQ(err, -ENOSPC, "failed_small_count");
+
+ /* offsets has wrong pointer */
+ memset(&info, 0, sizeof(info));
+ info.uprobe_multi.offsets = 123;
+ info.uprobe_multi.count = 3;
+ err = bpf_link_get_info_by_fd(fd, &info, &len);
+ ASSERT_EQ(err, -EFAULT, "failed_wrong_offsets");
+}
+
+static void test_uprobe_multi_fill_link_info(struct test_fill_link_info *skel,
+ bool retprobe, bool invalid)
+{
+ LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
+ .retprobe = retprobe,
+ );
+ const char *syms[3] = {
+ "uprobe_link_info_func_1",
+ "uprobe_link_info_func_2",
+ "uprobe_link_info_func_3",
+ };
+ __u64 cookies[3] = {
+ 0xdead,
+ 0xbeef,
+ 0xcafe,
+ };
+ const char *sema[3] = {
+ "uprobe_link_info_sema_1",
+ "uprobe_link_info_sema_2",
+ "uprobe_link_info_sema_3",
+ };
+ __u64 *offsets = NULL, *ref_ctr_offsets;
+ struct bpf_link *link;
+ int link_fd, err;
+
+ err = elf_resolve_syms_offsets("/proc/self/exe", 3, sema,
+ (unsigned long **) &ref_ctr_offsets, STT_OBJECT);
+ if (!ASSERT_OK(err, "elf_resolve_syms_offsets_object"))
+ return;
+
+ err = elf_resolve_syms_offsets("/proc/self/exe", 3, syms,
+ (unsigned long **) &offsets, STT_FUNC);
+ if (!ASSERT_OK(err, "elf_resolve_syms_offsets_func"))
+ goto out;
+
+ opts.syms = syms;
+ opts.cookies = &cookies[0];
+ opts.ref_ctr_offsets = (unsigned long *) &ref_ctr_offsets[0];
+ opts.cnt = ARRAY_SIZE(syms);
+
+ link = bpf_program__attach_uprobe_multi(skel->progs.umulti_run, 0,
+ "/proc/self/exe", NULL, &opts);
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
+ goto out;
+
+ link_fd = bpf_link__fd(link);
+ if (invalid)
+ verify_umulti_invalid_user_buffer(link_fd);
+ else
+ verify_umulti_link_info(link_fd, retprobe, offsets, cookies, ref_ctr_offsets);
+
+ bpf_link__destroy(link);
+out:
+ free(ref_ctr_offsets);
+ free(offsets);
}
void test_fill_link_info(void)
@@ -337,6 +530,13 @@ void test_fill_link_info(void)
if (test__start_subtest("kprobe_multi_invalid_ubuff"))
test_kprobe_multi_fill_link_info(skel, true, true);
+ if (test__start_subtest("uprobe_multi_link_info"))
+ test_uprobe_multi_fill_link_info(skel, false, false);
+ if (test__start_subtest("uretprobe_multi_link_info"))
+ test_uprobe_multi_fill_link_info(skel, true, false);
+ if (test__start_subtest("uprobe_multi_invalid"))
+ test_uprobe_multi_fill_link_info(skel, false, true);
+
cleanup:
test_fill_link_info__destroy(skel);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index cd051d3901a9..ece260cf2c0b 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -249,7 +249,7 @@ static void __test_link_api(struct child *child)
int link_extra_fd = -1;
int err;
- err = elf_resolve_syms_offsets(path, 3, syms, (unsigned long **) &offsets);
+ err = elf_resolve_syms_offsets(path, 3, syms, (unsigned long **) &offsets, STT_FUNC);
if (!ASSERT_OK(err, "elf_resolve_syms_offsets"))
return;
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 5cfa7a6316b6..8d746642cbd7 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -25,6 +25,7 @@
#include "verifier_direct_stack_access_wraparound.skel.h"
#include "verifier_div0.skel.h"
#include "verifier_div_overflow.skel.h"
+#include "verifier_global_subprogs.skel.h"
#include "verifier_gotol.skel.h"
#include "verifier_helper_access_var_len.skel.h"
#include "verifier_helper_packet_access.skel.h"
@@ -134,6 +135,7 @@ void test_verifier_direct_packet_access(void) { RUN(verifier_direct_packet_acces
void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_stack_access_wraparound); }
void test_verifier_div0(void) { RUN(verifier_div0); }
void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); }
+void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); }
void test_verifier_gotol(void) { RUN(verifier_gotol); }
void test_verifier_helper_access_var_len(void) { RUN(verifier_helper_access_var_len); }
void test_verifier_helper_packet_access(void) { RUN(verifier_helper_packet_access); }
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
index 4439ba9392f8..33cdf88efa6b 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
@@ -56,7 +56,8 @@ static int open_xsk(int ifindex, struct xsk *xsk)
.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
- .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
+ .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG | XDP_UMEM_TX_SW_CSUM,
+ .tx_metadata_len = sizeof(struct xsk_tx_metadata),
};
__u32 idx;
u64 addr;
@@ -138,6 +139,7 @@ static void ip_csum(struct iphdr *iph)
static int generate_packet(struct xsk *xsk, __u16 dst_port)
{
+ struct xsk_tx_metadata *meta;
struct xdp_desc *tx_desc;
struct udphdr *udph;
struct ethhdr *eth;
@@ -151,10 +153,14 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port)
return -1;
tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
- tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE;
+ tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE + sizeof(struct xsk_tx_metadata);
printf("%p: tx_desc[%u]->addr=%llx\n", xsk, idx, tx_desc->addr);
data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);
+ meta = data - sizeof(struct xsk_tx_metadata);
+ memset(meta, 0, sizeof(*meta));
+ meta->flags = XDP_TXMD_FLAGS_TIMESTAMP;
+
eth = data;
iph = (void *)(eth + 1);
udph = (void *)(iph + 1);
@@ -178,11 +184,17 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port)
udph->source = htons(AF_XDP_SOURCE_PORT);
udph->dest = htons(dst_port);
udph->len = htons(sizeof(*udph) + UDP_PAYLOAD_BYTES);
- udph->check = 0;
+ udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ ntohs(udph->len), IPPROTO_UDP, 0);
memset(udph + 1, 0xAA, UDP_PAYLOAD_BYTES);
+ meta->flags |= XDP_TXMD_FLAGS_CHECKSUM;
+ meta->request.csum_start = sizeof(*eth) + sizeof(*iph);
+ meta->request.csum_offset = offsetof(struct udphdr, check);
+
tx_desc->len = sizeof(*eth) + sizeof(*iph) + sizeof(*udph) + UDP_PAYLOAD_BYTES;
+ tx_desc->options |= XDP_TX_METADATA;
xsk_ring_prod__submit(&xsk->tx, 1);
ret = sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0);
@@ -194,13 +206,21 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port)
static void complete_tx(struct xsk *xsk)
{
- __u32 idx;
+ struct xsk_tx_metadata *meta;
__u64 addr;
+ void *data;
+ __u32 idx;
if (ASSERT_EQ(xsk_ring_cons__peek(&xsk->comp, 1, &idx), 1, "xsk_ring_cons__peek")) {
addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr);
+
+ data = xsk_umem__get_data(xsk->umem_area, addr);
+ meta = data - sizeof(struct xsk_tx_metadata);
+
+ ASSERT_NEQ(meta->completion.tx_timestamp, 0, "tx_timestamp");
+
xsk_ring_cons__release(&xsk->comp, 1);
}
}
@@ -221,6 +241,7 @@ static int verify_xsk_metadata(struct xsk *xsk)
const struct xdp_desc *rx_desc;
struct pollfd fds = {};
struct xdp_meta *meta;
+ struct udphdr *udph;
struct ethhdr *eth;
struct iphdr *iph;
__u64 comp_addr;
@@ -257,6 +278,7 @@ static int verify_xsk_metadata(struct xsk *xsk)
ASSERT_EQ(eth->h_proto, htons(ETH_P_IP), "eth->h_proto");
iph = (void *)(eth + 1);
ASSERT_EQ((int)iph->version, 4, "iph->version");
+ udph = (void *)(iph + 1);
/* custom metadata */
@@ -270,6 +292,9 @@ static int verify_xsk_metadata(struct xsk *xsk)
ASSERT_EQ(meta->rx_hash_type, 0, "rx_hash_type");
+ /* checksum offload */
+ ASSERT_EQ(udph->check, htons(0x721c), "csum");
+
xsk_ring_cons__release(&xsk->rx, 1);
refill_rx(xsk, comp_addr);
diff --git a/tools/testing/selftests/bpf/progs/test_fill_link_info.c b/tools/testing/selftests/bpf/progs/test_fill_link_info.c
index 564f402d56fe..69509f8bb680 100644
--- a/tools/testing/selftests/bpf/progs/test_fill_link_info.c
+++ b/tools/testing/selftests/bpf/progs/test_fill_link_info.c
@@ -39,4 +39,10 @@ int BPF_PROG(kmulti_run)
return 0;
}
+SEC("uprobe.multi")
+int BPF_PROG(umulti_run)
+{
+ return 0;
+}
+
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_global_func12.c b/tools/testing/selftests/bpf/progs/test_global_func12.c
index 7f159d83c6f6..6e03d42519a6 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func12.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func12.c
@@ -19,5 +19,7 @@ int global_func12(struct __sk_buff *skb)
{
const struct S s = {.x = skb->len };
- return foo(&s);
+ foo(&s);
+
+ return 1;
}
diff --git a/tools/testing/selftests/bpf/progs/test_global_func17.c b/tools/testing/selftests/bpf/progs/test_global_func17.c
index a32e11c7d933..5de44b09e8ec 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func17.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func17.c
@@ -5,6 +5,7 @@
__noinline int foo(int *p)
{
+ barrier_var(p);
return p ? (*p = 42) : 0;
}
diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c
new file mode 100644
index 000000000000..a0a5efd1caa1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <stdbool.h>
+#include <errno.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+int arr[1];
+int unkn_idx;
+
+__noinline long global_bad(void)
+{
+ return arr[unkn_idx]; /* BOOM */
+}
+
+__noinline long global_good(void)
+{
+ return arr[0];
+}
+
+__noinline long global_calls_bad(void)
+{
+ return global_good() + global_bad() /* does BOOM indirectly */;
+}
+
+__noinline long global_calls_good_only(void)
+{
+ return global_good();
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+/* main prog is validated completely first */
+__msg("('global_calls_good_only') is global and assumed valid.")
+__msg("1: (95) exit")
+/* eventually global_good() is transitively validated as well */
+__msg("Validating global_good() func")
+__msg("('global_good') is safe for any args that match its prototype")
+int chained_global_func_calls_success(void)
+{
+ return global_calls_good_only();
+}
+
+SEC("?raw_tp")
+__failure __log_level(2)
+/* main prog validated successfully first */
+__msg("1: (95) exit")
+/* eventually we validate global_bad() and fail */
+__msg("Validating global_bad() func")
+__msg("math between map_value pointer and register") /* BOOM */
+int chained_global_func_calls_bad(void)
+{
+ return global_calls_bad();
+}
+
+/* do out of bounds access forcing verifier to fail verification if this
+ * global func is called
+ */
+__noinline int global_unsupp(const int *mem)
+{
+ if (!mem)
+ return 0;
+ return mem[100]; /* BOOM */
+}
+
+const volatile bool skip_unsupp_global = true;
+
+SEC("?raw_tp")
+__success
+int guarded_unsupp_global_called(void)
+{
+ if (!skip_unsupp_global)
+ return global_unsupp(NULL);
+ return 0;
+}
+
+SEC("?raw_tp")
+__failure __log_level(2)
+__msg("Func#1 ('global_unsupp') is global and assumed valid.")
+__msg("Validating global_unsupp() func#1...")
+__msg("value is outside of the allowed memory range")
+int unguarded_unsupp_global_called(void)
+{
+ int x = 0;
+
+ return global_unsupp(&x);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index f61d623b1ce8..b5efcaeaa1ae 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -370,12 +370,10 @@ __naked int parent_stack_slot_precise(void)
SEC("?raw_tp")
__success __log_level(2)
__msg("9: (0f) r1 += r6")
-__msg("mark_precise: frame0: last_idx 9 first_idx 6")
+__msg("mark_precise: frame0: last_idx 9 first_idx 0")
__msg("mark_precise: frame0: regs=r6 stack= before 8: (bf) r1 = r7")
__msg("mark_precise: frame0: regs=r6 stack= before 7: (27) r6 *= 4")
__msg("mark_precise: frame0: regs=r6 stack= before 6: (79) r6 = *(u64 *)(r10 -8)")
-__msg("mark_precise: frame0: parent state regs= stack=-8:")
-__msg("mark_precise: frame0: last_idx 5 first_idx 0")
__msg("mark_precise: frame0: regs= stack=-8 before 5: (85) call pc+6")
__msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r1 = 0")
__msg("mark_precise: frame0: regs= stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r6")
diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py
index 40cba8d368d9..6157f884d091 100755
--- a/tools/testing/selftests/bpf/test_offload.py
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -169,12 +169,14 @@ def bpftool(args, JSON=True, ns="", fail=True, include_stderr=False):
return tool("bpftool", args, {"json":"-p"}, JSON=JSON, ns=ns,
fail=fail, include_stderr=include_stderr)
-def bpftool_prog_list(expected=None, ns=""):
+def bpftool_prog_list(expected=None, ns="", exclude_orphaned=True):
_, progs = bpftool("prog show", JSON=True, ns=ns, fail=True)
# Remove the base progs
for p in base_progs:
if p in progs:
progs.remove(p)
+ if exclude_orphaned:
+ progs = [ p for p in progs if not p['orphaned'] ]
if expected is not None:
if len(progs) != expected:
fail(True, "%d BPF programs loaded, expected %d" %
@@ -612,11 +614,9 @@ def pin_map(file_name, idx=0, expected=1):
def check_dev_info_removed(prog_file=None, map_file=None):
bpftool_prog_list(expected=0)
+ bpftool_prog_list(expected=1, exclude_orphaned=False)
ret, err = bpftool("prog show pin %s" % (prog_file), fail=False)
- fail(ret == 0, "Showing prog with removed device did not fail")
- fail(err["error"].find("No such device") == -1,
- "Showing prog with removed device expected ENODEV, error is %s" %
- (err["error"]))
+ fail(ret != 0, "failed to show prog with removed device")
bpftool_map_list(expected=0)
ret, err = bpftool("map show pin %s" % (map_file), fail=False)
@@ -1395,10 +1395,7 @@ try:
start_test("Test multi-dev ASIC cross-dev destruction - orphaned...")
ret, out = bpftool("prog show %s" % (progB), fail=False)
- fail(ret == 0, "got information about orphaned program")
- fail("error" not in out, "no error reported for get info on orphaned")
- fail(out["error"] != "can't get prog info: No such device",
- "wrong error for get info on orphaned")
+ fail(ret != 0, "couldn't get information about orphaned program")
print("%s: OK" % (os.path.basename(__file__)))
diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index c3ba40d0b9de..3291625ba4fb 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -10,7 +10,9 @@
* - rx_hash
*
* TX:
- * - TBD
+ * - UDP 9091 packets trigger TX reply
+ * - TX HW timestamp is requested and reported back upon completion
+ * - TX checksum is requested
*/
#include <test_progs.h>
@@ -24,15 +26,18 @@
#include <linux/net_tstamp.h>
#include <linux/udp.h>
#include <linux/sockios.h>
+#include <linux/if_xdp.h>
#include <sys/mman.h>
#include <net/if.h>
#include <ctype.h>
#include <poll.h>
#include <time.h>
+#include <unistd.h>
+#include <libgen.h>
#include "xdp_metadata.h"
-#define UMEM_NUM 16
+#define UMEM_NUM 256
#define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
#define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM)
#define XDP_FLAGS (XDP_FLAGS_DRV_MODE | XDP_FLAGS_REPLACE)
@@ -48,11 +53,14 @@ struct xsk {
};
struct xdp_hw_metadata *bpf_obj;
-__u16 bind_flags = XDP_COPY;
+__u16 bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY;
struct xsk *rx_xsk;
const char *ifname;
int ifindex;
int rxq;
+bool skip_tx;
+__u64 last_hw_rx_timestamp;
+__u64 last_xdp_rx_timestamp;
void test__fail(void) { /* for network_helpers.c */ }
@@ -68,7 +76,8 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
- .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
+ .flags = XSK_UMEM__DEFAULT_FLAGS,
+ .tx_metadata_len = sizeof(struct xsk_tx_metadata),
};
__u32 idx;
u64 addr;
@@ -110,7 +119,7 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
for (i = 0; i < UMEM_NUM / 2; i++) {
addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE;
printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr);
- *xsk_ring_prod__fill_addr(&xsk->fill, i) = addr;
+ *xsk_ring_prod__fill_addr(&xsk->fill, idx + i) = addr;
}
xsk_ring_prod__submit(&xsk->fill, ret);
@@ -131,12 +140,22 @@ static void refill_rx(struct xsk *xsk, __u64 addr)
__u32 idx;
if (xsk_ring_prod__reserve(&xsk->fill, 1, &idx) == 1) {
- printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr);
+ printf("%p: complete rx idx=%u addr=%llx\n", xsk, idx, addr);
*xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
xsk_ring_prod__submit(&xsk->fill, 1);
}
}
+static int kick_tx(struct xsk *xsk)
+{
+ return sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0);
+}
+
+static int kick_rx(struct xsk *xsk)
+{
+ return recvfrom(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, NULL);
+}
+
#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
static __u64 gettime(clockid_t clock_id)
{
@@ -152,6 +171,17 @@ static __u64 gettime(clockid_t clock_id)
return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
}
+static void print_tstamp_delta(const char *name, const char *refname,
+ __u64 tstamp, __u64 reference)
+{
+ __s64 delta = (__s64)reference - (__s64)tstamp;
+
+ printf("%s: %llu (sec:%0.4f) delta to %s sec:%0.4f (%0.3f usec)\n",
+ name, tstamp, (double)tstamp / NANOSEC_PER_SEC, refname,
+ (double)delta / NANOSEC_PER_SEC,
+ (double)delta / 1000);
+}
+
static void verify_xdp_metadata(void *data, clockid_t clock_id)
{
struct xdp_meta *meta;
@@ -164,25 +194,20 @@ static void verify_xdp_metadata(void *data, clockid_t clock_id)
printf("rx_hash: 0x%X with RSS type:0x%X\n",
meta->rx_hash, meta->rx_hash_type);
- printf("rx_timestamp: %llu (sec:%0.4f)\n", meta->rx_timestamp,
- (double)meta->rx_timestamp / NANOSEC_PER_SEC);
if (meta->rx_timestamp) {
- __u64 usr_clock = gettime(clock_id);
- __u64 xdp_clock = meta->xdp_timestamp;
- __s64 delta_X = xdp_clock - meta->rx_timestamp;
- __s64 delta_X2U = usr_clock - xdp_clock;
-
- printf("XDP RX-time: %llu (sec:%0.4f) delta sec:%0.4f (%0.3f usec)\n",
- xdp_clock, (double)xdp_clock / NANOSEC_PER_SEC,
- (double)delta_X / NANOSEC_PER_SEC,
- (double)delta_X / 1000);
-
- printf("AF_XDP time: %llu (sec:%0.4f) delta sec:%0.4f (%0.3f usec)\n",
- usr_clock, (double)usr_clock / NANOSEC_PER_SEC,
- (double)delta_X2U / NANOSEC_PER_SEC,
- (double)delta_X2U / 1000);
+ __u64 ref_tstamp = gettime(clock_id);
+
+ /* store received timestamps to calculate a delta at tx */
+ last_hw_rx_timestamp = meta->rx_timestamp;
+ last_xdp_rx_timestamp = meta->xdp_timestamp;
+
+ print_tstamp_delta("HW RX-time", "User RX-time",
+ meta->rx_timestamp, ref_tstamp);
+ print_tstamp_delta("XDP RX-time", "User RX-time",
+ meta->xdp_timestamp, ref_tstamp);
+ } else {
+ printf("No rx_timestamp\n");
}
-
}
static void verify_skb_metadata(int fd)
@@ -230,6 +255,129 @@ static void verify_skb_metadata(int fd)
printf("skb hwtstamp is not found!\n");
}
+static bool complete_tx(struct xsk *xsk, clockid_t clock_id)
+{
+ struct xsk_tx_metadata *meta;
+ __u64 addr;
+ void *data;
+ __u32 idx;
+
+ if (!xsk_ring_cons__peek(&xsk->comp, 1, &idx))
+ return false;
+
+ addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
+ data = xsk_umem__get_data(xsk->umem_area, addr);
+ meta = data - sizeof(struct xsk_tx_metadata);
+
+ printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr);
+
+ if (meta->completion.tx_timestamp) {
+ __u64 ref_tstamp = gettime(clock_id);
+
+ print_tstamp_delta("HW TX-complete-time", "User TX-complete-time",
+ meta->completion.tx_timestamp, ref_tstamp);
+ print_tstamp_delta("XDP RX-time", "User TX-complete-time",
+ last_xdp_rx_timestamp, ref_tstamp);
+ print_tstamp_delta("HW RX-time", "HW TX-complete-time",
+ last_hw_rx_timestamp, meta->completion.tx_timestamp);
+ } else {
+ printf("No tx_timestamp\n");
+ }
+
+ xsk_ring_cons__release(&xsk->comp, 1);
+
+ return true;
+}
+
+#define swap(a, b, len) do { \
+ for (int i = 0; i < len; i++) { \
+ __u8 tmp = ((__u8 *)a)[i]; \
+ ((__u8 *)a)[i] = ((__u8 *)b)[i]; \
+ ((__u8 *)b)[i] = tmp; \
+ } \
+} while (0)
+
+static void ping_pong(struct xsk *xsk, void *rx_packet, clockid_t clock_id)
+{
+ struct xsk_tx_metadata *meta;
+ struct ipv6hdr *ip6h = NULL;
+ struct iphdr *iph = NULL;
+ struct xdp_desc *tx_desc;
+ struct udphdr *udph;
+ struct ethhdr *eth;
+ __sum16 want_csum;
+ void *data;
+ __u32 idx;
+ int ret;
+ int len;
+
+ ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx);
+ if (ret != 1) {
+ printf("%p: failed to reserve tx slot\n", xsk);
+ return;
+ }
+
+ tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
+ tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE + sizeof(struct xsk_tx_metadata);
+ data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);
+
+ meta = data - sizeof(struct xsk_tx_metadata);
+ memset(meta, 0, sizeof(*meta));
+ meta->flags = XDP_TXMD_FLAGS_TIMESTAMP;
+
+ eth = rx_packet;
+
+ if (eth->h_proto == htons(ETH_P_IP)) {
+ iph = (void *)(eth + 1);
+ udph = (void *)(iph + 1);
+ } else if (eth->h_proto == htons(ETH_P_IPV6)) {
+ ip6h = (void *)(eth + 1);
+ udph = (void *)(ip6h + 1);
+ } else {
+ printf("%p: failed to detect IP version for ping pong %04x\n", xsk, eth->h_proto);
+ xsk_ring_prod__cancel(&xsk->tx, 1);
+ return;
+ }
+
+ len = ETH_HLEN;
+ if (ip6h)
+ len += sizeof(*ip6h) + ntohs(ip6h->payload_len);
+ if (iph)
+ len += ntohs(iph->tot_len);
+
+ swap(eth->h_dest, eth->h_source, ETH_ALEN);
+ if (iph)
+ swap(&iph->saddr, &iph->daddr, 4);
+ else
+ swap(&ip6h->saddr, &ip6h->daddr, 16);
+ swap(&udph->source, &udph->dest, 2);
+
+ want_csum = udph->check;
+ if (ip6h)
+ udph->check = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ ntohs(udph->len), IPPROTO_UDP, 0);
+ else
+ udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ ntohs(udph->len), IPPROTO_UDP, 0);
+
+ meta->flags |= XDP_TXMD_FLAGS_CHECKSUM;
+ if (iph)
+ meta->request.csum_start = sizeof(*eth) + sizeof(*iph);
+ else
+ meta->request.csum_start = sizeof(*eth) + sizeof(*ip6h);
+ meta->request.csum_offset = offsetof(struct udphdr, check);
+
+ printf("%p: ping-pong with csum=%04x (want %04x) csum_start=%d csum_offset=%d\n",
+ xsk, ntohs(udph->check), ntohs(want_csum),
+ meta->request.csum_start, meta->request.csum_offset);
+
+ memcpy(data, rx_packet, len); /* don't share umem chunk for simplicity */
+ tx_desc->options |= XDP_TX_METADATA;
+ tx_desc->len = len;
+
+ xsk_ring_prod__submit(&xsk->tx, 1);
+}
+
static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t clock_id)
{
const struct xdp_desc *rx_desc;
@@ -252,6 +400,13 @@ static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t
while (true) {
errno = 0;
+
+ for (i = 0; i < rxq; i++) {
+ ret = kick_rx(&rx_xsk[i]);
+ if (ret)
+ printf("kick_rx ret=%d\n", ret);
+ }
+
ret = poll(fds, rxq + 1, 1000);
printf("poll: %d (%d) skip=%llu fail=%llu redir=%llu\n",
ret, errno, bpf_obj->bss->pkts_skip,
@@ -288,6 +443,22 @@ peek:
verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr),
clock_id);
first_seg = false;
+
+ if (!skip_tx) {
+ /* mirror first chunk back */
+ ping_pong(xsk, xsk_umem__get_data(xsk->umem_area, addr),
+ clock_id);
+
+ ret = kick_tx(xsk);
+ if (ret)
+ printf("kick_tx ret=%d\n", ret);
+
+ for (int j = 0; j < 500; j++) {
+ if (complete_tx(xsk, clock_id))
+ break;
+ usleep(10*1000);
+ }
+ }
}
xsk_ring_cons__release(&xsk->rx, 1);
@@ -420,8 +591,10 @@ static void print_usage(void)
{
const char *usage =
"Usage: xdp_hw_metadata [OPTIONS] [IFNAME]\n"
- " -m Enable multi-buffer XDP for larger MTU\n"
+ " -c Run in copy mode (zerocopy is default)\n"
" -h Display this help and exit\n\n"
+ " -m Enable multi-buffer XDP for larger MTU\n"
+ " -r Don't generate AF_XDP reply (rx metadata only)\n"
"Generate test packets on the other machine with:\n"
" echo -n xdp | nc -u -q1 <dst_ip> 9091\n";
@@ -432,14 +605,22 @@ static void read_args(int argc, char *argv[])
{
int opt;
- while ((opt = getopt(argc, argv, "mh")) != -1) {
+ while ((opt = getopt(argc, argv, "chmr")) != -1) {
switch (opt) {
- case 'm':
- bind_flags |= XDP_USE_SG;
+ case 'c':
+ bind_flags &= ~XDP_USE_NEED_WAKEUP;
+ bind_flags &= ~XDP_ZEROCOPY;
+ bind_flags |= XDP_COPY;
break;
case 'h':
print_usage();
exit(0);
+ case 'm':
+ bind_flags |= XDP_USE_SG;
+ break;
+ case 'r':
+ skip_tx = true;
+ break;
case '?':
if (isprint(optopt))
fprintf(stderr, "Unknown option: -%c\n", optopt);
diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c
index e574711eeb84..25d568abf0f2 100644
--- a/tools/testing/selftests/bpf/xsk.c
+++ b/tools/testing/selftests/bpf/xsk.c
@@ -115,6 +115,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg,
cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
+ cfg->tx_metadata_len = 0;
return;
}
@@ -123,6 +124,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg,
cfg->frame_size = usr_cfg->frame_size;
cfg->frame_headroom = usr_cfg->frame_headroom;
cfg->flags = usr_cfg->flags;
+ cfg->tx_metadata_len = usr_cfg->tx_metadata_len;
}
static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
@@ -252,6 +254,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
mr.chunk_size = umem->config.frame_size;
mr.headroom = umem->config.frame_headroom;
mr.flags = umem->config.flags;
+ mr.tx_metadata_len = umem->config.tx_metadata_len;
err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
if (err) {
diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h
index 771570bc3731..93c2cc413cfc 100644
--- a/tools/testing/selftests/bpf/xsk.h
+++ b/tools/testing/selftests/bpf/xsk.h
@@ -200,6 +200,7 @@ struct xsk_umem_config {
__u32 frame_size;
__u32 frame_headroom;
__u32 flags;
+ __u32 tx_metadata_len;
};
int xsk_attach_xdp_program(struct bpf_program *prog, int ifindex, u32 xdp_flags);