diff options
40 files changed, 1911 insertions, 293 deletions
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 4f2d1f682a18..4ddcae33c336 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -120,6 +120,7 @@ Contents: xfrm_proc xfrm_sync xfrm_sysctl + xdp-rx-metadata .. only:: subproject and html diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst new file mode 100644 index 000000000000..aac63fc2d08b --- /dev/null +++ b/Documentation/networking/xdp-rx-metadata.rst @@ -0,0 +1,110 @@ +=============== +XDP RX Metadata +=============== + +This document describes how an eXpress Data Path (XDP) program can access +hardware metadata related to a packet using a set of helper functions, +and how it can pass that metadata on to other consumers. + +General Design +============== + +XDP has access to a set of kfuncs to manipulate the metadata in an XDP frame. +Every device driver that wishes to expose additional packet metadata can +implement these kfuncs. The set of kfuncs is declared in ``include/net/xdp.h`` +via ``XDP_METADATA_KFUNC_xxx``. + +Currently, the following kfuncs are supported. In the future, as more +metadata is supported, this set will grow: + +.. kernel-doc:: net/core/xdp.c + :identifiers: bpf_xdp_metadata_rx_timestamp bpf_xdp_metadata_rx_hash + +An XDP program can use these kfuncs to read the metadata into stack +variables for its own consumption. Or, to pass the metadata on to other +consumers, an XDP program can store it into the metadata area carried +ahead of the packet. + +Not all kfuncs have to be implemented by the device driver; when not +implemented, the default ones that return ``-EOPNOTSUPP`` will be used. + +Within an XDP frame, the metadata layout (accessed via ``xdp_buff``) is +as follows:: + + +----------+-----------------+------+ + | headroom | custom metadata | data | + +----------+-----------------+------+ + ^ ^ + | | + xdp_buff->data_meta xdp_buff->data + +An XDP program can store individual metadata items into this ``data_meta`` +area in whichever format it chooses. Later consumers of the metadata +will have to agree on the format by some out of band contract (like for +the AF_XDP use case, see below). + +AF_XDP +====== + +:doc:`af_xdp` use-case implies that there is a contract between the BPF +program that redirects XDP frames into the ``AF_XDP`` socket (``XSK``) and +the final consumer. Thus the BPF program manually allocates a fixed number of +bytes out of metadata via ``bpf_xdp_adjust_meta`` and calls a subset +of kfuncs to populate it. The userspace ``XSK`` consumer computes +``xsk_umem__get_data() - METADATA_SIZE`` to locate that metadata. +Note, ``xsk_umem__get_data`` is defined in ``libxdp`` and +``METADATA_SIZE`` is an application-specific constant (``AF_XDP`` receive +descriptor does _not_ explicitly carry the size of the metadata). + +Here is the ``AF_XDP`` consumer layout (note missing ``data_meta`` pointer):: + + +----------+-----------------+------+ + | headroom | custom metadata | data | + +----------+-----------------+------+ + ^ + | + rx_desc->address + +XDP_PASS +======== + +This is the path where the packets processed by the XDP program are passed +into the kernel. The kernel creates the ``skb`` out of the ``xdp_buff`` +contents. Currently, every driver has custom kernel code to parse +the descriptors and populate ``skb`` metadata when doing this ``xdp_buff->skb`` +conversion, and the XDP metadata is not used by the kernel when building +``skbs``. However, TC-BPF programs can access the XDP metadata area using +the ``data_meta`` pointer. + +In the future, we'd like to support a case where an XDP program +can override some of the metadata used for building ``skbs``. + +bpf_redirect_map +================ + +``bpf_redirect_map`` can redirect the frame to a different device. +Some devices (like virtual ethernet links) support running a second XDP +program after the redirect. However, the final consumer doesn't have +access to the original hardware descriptor and can't access any of +the original metadata. The same applies to XDP programs installed +into devmaps and cpumaps. + +This means that for redirected packets only custom metadata is +currently supported, which has to be prepared by the initial XDP program +before redirect. If the frame is eventually passed to the kernel, the +``skb`` created from such a frame won't have any hardware metadata populated +in its ``skb``. If such a packet is later redirected into an ``XSK``, +that will also only have access to the custom metadata. + +bpf_tail_call +============= + +Adding programs that access metadata kfuncs to the ``BPF_MAP_TYPE_PROG_ARRAY`` +is currently not supported. + +Example +======= + +See ``tools/testing/selftests/bpf/progs/xdp_metadata.c`` and +``tools/testing/selftests/bpf/prog_tests/xdp_metadata.c`` for an example of +BPF program that handles XDP metadata. diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c index 98b5ffb4d729..9e3b76182088 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c @@ -58,9 +58,7 @@ u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe) return hi | lo; } -void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev, - struct skb_shared_hwtstamps *hwts, - u64 timestamp) +u64 mlx4_en_get_hwtstamp(struct mlx4_en_dev *mdev, u64 timestamp) { unsigned int seq; u64 nsec; @@ -70,8 +68,15 @@ void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev, nsec = timecounter_cyc2time(&mdev->clock, timestamp); } while (read_seqretry(&mdev->clock_lock, seq)); + return ns_to_ktime(nsec); +} + +void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev, + struct skb_shared_hwtstamps *hwts, + u64 timestamp) +{ memset(hwts, 0, sizeof(struct skb_shared_hwtstamps)); - hwts->hwtstamp = ns_to_ktime(nsec); + hwts->hwtstamp = mlx4_en_get_hwtstamp(mdev, timestamp); } /** diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 8800d3f1f55c..af4c4858f397 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2889,6 +2889,11 @@ static const struct net_device_ops mlx4_netdev_ops_master = { .ndo_bpf = mlx4_xdp, }; +static const struct xdp_metadata_ops mlx4_xdp_metadata_ops = { + .xmo_rx_timestamp = mlx4_en_xdp_rx_timestamp, + .xmo_rx_hash = mlx4_en_xdp_rx_hash, +}; + struct mlx4_en_bond { struct work_struct work; struct mlx4_en_priv *priv; @@ -3310,6 +3315,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, dev->netdev_ops = &mlx4_netdev_ops_master; else dev->netdev_ops = &mlx4_netdev_ops; + dev->xdp_metadata_ops = &mlx4_xdp_metadata_ops; dev->watchdog_timeo = MLX4_EN_WATCHDOG_TIMEOUT; netif_set_real_num_tx_queues(dev, priv->tx_ring_num[TX]); netif_set_real_num_rx_queues(dev, priv->rx_ring_num); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 8f762fc170b3..0869d4fff17b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -661,9 +661,41 @@ static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va, #define MLX4_CQE_STATUS_IP_ANY (MLX4_CQE_STATUS_IPV4) #endif +struct mlx4_en_xdp_buff { + struct xdp_buff xdp; + struct mlx4_cqe *cqe; + struct mlx4_en_dev *mdev; + struct mlx4_en_rx_ring *ring; + struct net_device *dev; +}; + +int mlx4_en_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) +{ + struct mlx4_en_xdp_buff *_ctx = (void *)ctx; + + if (unlikely(_ctx->ring->hwtstamp_rx_filter != HWTSTAMP_FILTER_ALL)) + return -EOPNOTSUPP; + + *timestamp = mlx4_en_get_hwtstamp(_ctx->mdev, + mlx4_en_get_cqe_ts(_ctx->cqe)); + return 0; +} + +int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash) +{ + struct mlx4_en_xdp_buff *_ctx = (void *)ctx; + + if (unlikely(!(_ctx->dev->features & NETIF_F_RXHASH))) + return -EOPNOTSUPP; + + *hash = be32_to_cpu(_ctx->cqe->immed_rss_invalid); + return 0; +} + int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget) { struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_xdp_buff mxbuf = {}; int factor = priv->cqe_factor; struct mlx4_en_rx_ring *ring; struct bpf_prog *xdp_prog; @@ -671,7 +703,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud bool doorbell_pending; bool xdp_redir_flush; struct mlx4_cqe *cqe; - struct xdp_buff xdp; int polled = 0; int index; @@ -681,7 +712,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud ring = priv->rx_ring[cq_ring]; xdp_prog = rcu_dereference_bh(ring->xdp_prog); - xdp_init_buff(&xdp, priv->frag_info[0].frag_stride, &ring->xdp_rxq); + xdp_init_buff(&mxbuf.xdp, priv->frag_info[0].frag_stride, &ring->xdp_rxq); doorbell_pending = false; xdp_redir_flush = false; @@ -776,24 +807,28 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud priv->frag_info[0].frag_size, DMA_FROM_DEVICE); - xdp_prepare_buff(&xdp, va - frags[0].page_offset, - frags[0].page_offset, length, false); - orig_data = xdp.data; - - act = bpf_prog_run_xdp(xdp_prog, &xdp); - - length = xdp.data_end - xdp.data; - if (xdp.data != orig_data) { - frags[0].page_offset = xdp.data - - xdp.data_hard_start; - va = xdp.data; + xdp_prepare_buff(&mxbuf.xdp, va - frags[0].page_offset, + frags[0].page_offset, length, true); + orig_data = mxbuf.xdp.data; + mxbuf.cqe = cqe; + mxbuf.mdev = priv->mdev; + mxbuf.ring = ring; + mxbuf.dev = dev; + + act = bpf_prog_run_xdp(xdp_prog, &mxbuf.xdp); + + length = mxbuf.xdp.data_end - mxbuf.xdp.data; + if (mxbuf.xdp.data != orig_data) { + frags[0].page_offset = mxbuf.xdp.data - + mxbuf.xdp.data_hard_start; + va = mxbuf.xdp.data; } switch (act) { case XDP_PASS: break; case XDP_REDIRECT: - if (likely(!xdp_do_redirect(dev, &xdp, xdp_prog))) { + if (likely(!xdp_do_redirect(dev, &mxbuf.xdp, xdp_prog))) { ring->xdp_redirect++; xdp_redir_flush = true; frags[0].page = NULL; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 3d4226ddba5e..544e09b97483 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -796,10 +796,15 @@ void mlx4_en_update_pfc_stats_bitmap(struct mlx4_dev *dev, int mlx4_en_netdev_event(struct notifier_block *this, unsigned long event, void *ptr); +struct xdp_md; +int mlx4_en_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp); +int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash); + /* * Functions for time stamping */ u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe); +u64 mlx4_en_get_hwtstamp(struct mlx4_en_dev *mdev, u64 timestamp); void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev, struct skb_shared_hwtstamps *hwts, u64 timestamp); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 2d77fb8a8a01..711dc88d8b48 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -626,10 +626,11 @@ struct mlx5e_rq; typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*); typedef struct sk_buff * (*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, - u16 cqe_bcnt, u32 head_offset, u32 page_idx); + struct mlx5_cqe64 *cqe, u16 cqe_bcnt, + u32 head_offset, u32 page_idx); typedef struct sk_buff * (*mlx5e_fp_skb_from_cqe)(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi, - u32 cqe_bcnt); + struct mlx5_cqe64 *cqe, u32 cqe_bcnt); typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq); typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16); typedef void (*mlx5e_fp_shampo_dealloc_hd)(struct mlx5e_rq*, u16, u16, bool); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 853f312cd757..757c012ece27 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -73,6 +73,11 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget); void mlx5e_free_rx_descs(struct mlx5e_rq *rq); void mlx5e_free_rx_in_progress_descs(struct mlx5e_rq *rq); +static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config) +{ + return config->rx_filter == HWTSTAMP_FILTER_ALL; +} + /* TX */ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev); bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 20507ef2f956..f7d52b1d293b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -156,10 +156,39 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq, return true; } +static int mlx5e_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) +{ + const struct mlx5e_xdp_buff *_ctx = (void *)ctx; + + if (unlikely(!mlx5e_rx_hw_stamp(_ctx->rq->tstamp))) + return -EOPNOTSUPP; + + *timestamp = mlx5e_cqe_ts_to_ns(_ctx->rq->ptp_cyc2time, + _ctx->rq->clock, get_cqe_ts(_ctx->cqe)); + return 0; +} + +static int mlx5e_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash) +{ + const struct mlx5e_xdp_buff *_ctx = (void *)ctx; + + if (unlikely(!(_ctx->xdp.rxq->dev->features & NETIF_F_RXHASH))) + return -EOPNOTSUPP; + + *hash = be32_to_cpu(_ctx->cqe->rss_hash_result); + return 0; +} + +const struct xdp_metadata_ops mlx5e_xdp_metadata_ops = { + .xmo_rx_timestamp = mlx5e_xdp_rx_timestamp, + .xmo_rx_hash = mlx5e_xdp_rx_hash, +}; + /* returns true if packet was consumed by xdp */ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct page *page, - struct bpf_prog *prog, struct xdp_buff *xdp) + struct bpf_prog *prog, struct mlx5e_xdp_buff *mxbuf) { + struct xdp_buff *xdp = &mxbuf->xdp; u32 act; int err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h index bc2d9034af5b..69f338bc0633 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -44,10 +44,16 @@ (MLX5E_XDP_INLINE_WQE_MAX_DS_CNT * MLX5_SEND_WQE_DS - \ sizeof(struct mlx5_wqe_inline_seg)) +struct mlx5e_xdp_buff { + struct xdp_buff xdp; + struct mlx5_cqe64 *cqe; + struct mlx5e_rq *rq; +}; + struct mlx5e_xsk_param; int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk); bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct page *page, - struct bpf_prog *prog, struct xdp_buff *xdp); + struct bpf_prog *prog, struct mlx5e_xdp_buff *mlctx); void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq); bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq); void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq); @@ -56,6 +62,8 @@ void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq); int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); +extern const struct xdp_metadata_ops mlx5e_xdp_metadata_ops; + INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, struct skb_shared_info *sinfo, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c index c91b54d9ff27..b7c84ebe8418 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c @@ -8,6 +8,14 @@ /* RX data path */ +static struct mlx5e_xdp_buff *xsk_buff_to_mxbuf(struct xdp_buff *xdp) +{ + /* mlx5e_xdp_buff shares its layout with xdp_buff_xsk + * and private mlx5e_xdp_buff fields fall into xdp_buff_xsk->cb + */ + return (struct mlx5e_xdp_buff *)xdp; +} + int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) { struct mlx5e_mpw_info *wi = mlx5e_get_mpw_info(rq, ix); @@ -22,6 +30,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) goto err; BUILD_BUG_ON(sizeof(wi->alloc_units[0]) != sizeof(wi->alloc_units[0].xsk)); + XSK_CHECK_PRIV_TYPE(struct mlx5e_xdp_buff); batch = xsk_buff_alloc_batch(rq->xsk_pool, (struct xdp_buff **)wi->alloc_units, rq->mpwqe.pages_per_wqe); @@ -43,25 +52,30 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED)) { for (i = 0; i < batch; i++) { + struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->alloc_units[i].xsk); dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk); umr_wqe->inline_mtts[i] = (struct mlx5_mtt) { .ptag = cpu_to_be64(addr | MLX5_EN_WR), }; + mxbuf->rq = rq; } } else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)) { for (i = 0; i < batch; i++) { + struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->alloc_units[i].xsk); dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk); umr_wqe->inline_ksms[i] = (struct mlx5_ksm) { .key = rq->mkey_be, .va = cpu_to_be64(addr), }; + mxbuf->rq = rq; } } else if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE)) { u32 mapping_size = 1 << (rq->mpwqe.page_shift - 2); for (i = 0; i < batch; i++) { + struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->alloc_units[i].xsk); dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk); umr_wqe->inline_ksms[i << 2] = (struct mlx5_ksm) { @@ -80,6 +94,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) .key = rq->mkey_be, .va = cpu_to_be64(rq->wqe_overflow.addr), }; + mxbuf->rq = rq; } } else { __be32 pad_size = cpu_to_be32((1 << rq->mpwqe.page_shift) - @@ -87,6 +102,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) __be32 frame_size = cpu_to_be32(rq->xsk_pool->chunk_size); for (i = 0; i < batch; i++) { + struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->alloc_units[i].xsk); dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk); umr_wqe->inline_klms[i << 1] = (struct mlx5_klm) { @@ -99,6 +115,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) .va = cpu_to_be64(rq->wqe_overflow.addr), .bcount = pad_size, }; + mxbuf->rq = rq; } } @@ -229,11 +246,12 @@ static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, struct xdp_b struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset, u32 page_idx) { - struct xdp_buff *xdp = wi->alloc_units[page_idx].xsk; + struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->alloc_units[page_idx].xsk); struct bpf_prog *prog; /* Check packet size. Note LRO doesn't use linear SKB */ @@ -249,9 +267,11 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, */ WARN_ON_ONCE(head_offset); - xsk_buff_set_size(xdp, cqe_bcnt); - xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool); - net_prefetch(xdp->data); + /* mxbuf->rq is set on allocation, but cqe is per-packet so set it here */ + mxbuf->cqe = cqe; + xsk_buff_set_size(&mxbuf->xdp, cqe_bcnt); + xsk_buff_dma_sync_for_cpu(&mxbuf->xdp, rq->xsk_pool); + net_prefetch(mxbuf->xdp.data); /* Possible flows: * - XDP_REDIRECT to XSKMAP: @@ -269,7 +289,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, */ prog = rcu_dereference(rq->xdp_prog); - if (likely(prog && mlx5e_xdp_handle(rq, NULL, prog, xdp))) { + if (likely(prog && mlx5e_xdp_handle(rq, NULL, prog, mxbuf))) { if (likely(__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))) __set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */ return NULL; /* page/packet was consumed by XDP */ @@ -278,14 +298,15 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, /* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the * frame. On SKB allocation failure, NULL is returned. */ - return mlx5e_xsk_construct_skb(rq, xdp); + return mlx5e_xsk_construct_skb(rq, &mxbuf->xdp); } struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi, + struct mlx5_cqe64 *cqe, u32 cqe_bcnt) { - struct xdp_buff *xdp = wi->au->xsk; + struct mlx5e_xdp_buff *mxbuf = xsk_buff_to_mxbuf(wi->au->xsk); struct bpf_prog *prog; /* wi->offset is not used in this function, because xdp->data and the @@ -295,17 +316,19 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, */ WARN_ON_ONCE(wi->offset); - xsk_buff_set_size(xdp, cqe_bcnt); - xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool); - net_prefetch(xdp->data); + /* mxbuf->rq is set on allocation, but cqe is per-packet so set it here */ + mxbuf->cqe = cqe; + xsk_buff_set_size(&mxbuf->xdp, cqe_bcnt); + xsk_buff_dma_sync_for_cpu(&mxbuf->xdp, rq->xsk_pool); + net_prefetch(mxbuf->xdp.data); prog = rcu_dereference(rq->xdp_prog); - if (likely(prog && mlx5e_xdp_handle(rq, NULL, prog, xdp))) + if (likely(prog && mlx5e_xdp_handle(rq, NULL, prog, mxbuf))) return NULL; /* page/packet was consumed by XDP */ /* XDP_PASS: copy the data from the UMEM to a new SKB. The frame reuse * will be handled by mlx5e_free_rx_wqe. * On SKB allocation failure, NULL is returned. */ - return mlx5e_xsk_construct_skb(rq, xdp); + return mlx5e_xsk_construct_skb(rq, &mxbuf->xdp); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h index 087c943bd8e9..cefc0ef6105d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h @@ -13,11 +13,13 @@ int mlx5e_xsk_alloc_rx_wqes_batched(struct mlx5e_rq *rq, u16 ix, int wqe_bulk); int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk); struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset, u32 page_idx); struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi, + struct mlx5_cqe64 *cqe, u32 cqe_bcnt); #endif /* __MLX5_EN_XSK_RX_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index cff5f2e29e1e..3370c8b9f983 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5053,6 +5053,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) SET_NETDEV_DEV(netdev, mdev->device); netdev->netdev_ops = &mlx5e_netdev_ops; + netdev->xdp_metadata_ops = &mlx5e_xdp_metadata_ops; mlx5e_dcbnl_build_netdev(netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index c8820ab22169..7b08653be000 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -62,10 +62,12 @@ static struct sk_buff * mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, - u16 cqe_bcnt, u32 head_offset, u32 page_idx); + struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset, + u32 page_idx); static struct sk_buff * mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, - u16 cqe_bcnt, u32 head_offset, u32 page_idx); + struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset, + u32 page_idx); static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); @@ -76,11 +78,6 @@ const struct mlx5e_rx_handlers mlx5e_rx_handlers_nic = { .handle_rx_cqe_mpwqe_shampo = mlx5e_handle_rx_cqe_mpwrq_shampo, }; -static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config) -{ - return config->rx_filter == HWTSTAMP_FILTER_ALL; -} - static inline void mlx5e_read_cqe_slot(struct mlx5_cqwq *wq, u32 cqcc, void *data) { @@ -1575,16 +1572,19 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va, return skb; } -static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom, - u32 len, struct xdp_buff *xdp) +static void mlx5e_fill_mxbuf(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, + void *va, u16 headroom, u32 len, + struct mlx5e_xdp_buff *mxbuf) { - xdp_init_buff(xdp, rq->buff.frame0_sz, &rq->xdp_rxq); - xdp_prepare_buff(xdp, va, headroom, len, true); + xdp_init_buff(&mxbuf->xdp, rq->buff.frame0_sz, &rq->xdp_rxq); + xdp_prepare_buff(&mxbuf->xdp, va, headroom, len, true); + mxbuf->cqe = cqe; + mxbuf->rq = rq; } static struct sk_buff * mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi, - u32 cqe_bcnt) + struct mlx5_cqe64 *cqe, u32 cqe_bcnt) { union mlx5e_alloc_unit *au = wi->au; u16 rx_headroom = rq->buff.headroom; @@ -1606,16 +1606,16 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi, prog = rcu_dereference(rq->xdp_prog); if (prog) { - struct xdp_buff xdp; + struct mlx5e_xdp_buff mxbuf; net_prefetchw(va); /* xdp_frame data area */ - mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp); - if (mlx5e_xdp_handle(rq, au->page, prog, &xdp)) + mlx5e_fill_mxbuf(rq, cqe, va, rx_headroom, cqe_bcnt, &mxbuf); + if (mlx5e_xdp_handle(rq, au->page, prog, &mxbuf)) return NULL; /* page/packet was consumed by XDP */ - rx_headroom = xdp.data - xdp.data_hard_start; - metasize = xdp.data - xdp.data_meta; - cqe_bcnt = xdp.data_end - xdp.data; + rx_headroom = mxbuf.xdp.data - mxbuf.xdp.data_hard_start; + metasize = mxbuf.xdp.data - mxbuf.xdp.data_meta; + cqe_bcnt = mxbuf.xdp.data_end - mxbuf.xdp.data; } frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt); skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt, metasize); @@ -1630,16 +1630,16 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi, static struct sk_buff * mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi, - u32 cqe_bcnt) + struct mlx5_cqe64 *cqe, u32 cqe_bcnt) { struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0]; struct mlx5e_wqe_frag_info *head_wi = wi; union mlx5e_alloc_unit *au = wi->au; u16 rx_headroom = rq->buff.headroom; struct skb_shared_info *sinfo; + struct mlx5e_xdp_buff mxbuf; u32 frag_consumed_bytes; struct bpf_prog *prog; - struct xdp_buff xdp; struct sk_buff *skb; dma_addr_t addr; u32 truesize; @@ -1654,8 +1654,8 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi net_prefetchw(va); /* xdp_frame data area */ net_prefetch(va + rx_headroom); - mlx5e_fill_xdp_buff(rq, va, rx_headroom, frag_consumed_bytes, &xdp); - sinfo = xdp_get_shared_info_from_buff(&xdp); + mlx5e_fill_mxbuf(rq, cqe, va, rx_headroom, frag_consumed_bytes, &mxbuf); + sinfo = xdp_get_shared_info_from_buff(&mxbuf.xdp); truesize = 0; cqe_bcnt -= frag_consumed_bytes; @@ -1673,13 +1673,13 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi dma_sync_single_for_cpu(rq->pdev, addr + wi->offset, frag_consumed_bytes, rq->buff.map_dir); - if (!xdp_buff_has_frags(&xdp)) { + if (!xdp_buff_has_frags(&mxbuf.xdp)) { /* Init on the first fragment to avoid cold cache access * when possible. */ sinfo->nr_frags = 0; sinfo->xdp_frags_size = 0; - xdp_buff_set_frags_flag(&xdp); + xdp_buff_set_frags_flag(&mxbuf.xdp); } frag = &sinfo->frags[sinfo->nr_frags++]; @@ -1688,7 +1688,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi skb_frag_size_set(frag, frag_consumed_bytes); if (page_is_pfmemalloc(au->page)) - xdp_buff_set_frag_pfmemalloc(&xdp); + xdp_buff_set_frag_pfmemalloc(&mxbuf.xdp); sinfo->xdp_frags_size += frag_consumed_bytes; truesize += frag_info->frag_stride; @@ -1701,7 +1701,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi au = head_wi->au; prog = rcu_dereference(rq->xdp_prog); - if (prog && mlx5e_xdp_handle(rq, au->page, prog, &xdp)) { + if (prog && mlx5e_xdp_handle(rq, au->page, prog, &mxbuf)) { if (test_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { int i; @@ -1711,22 +1711,22 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi return NULL; /* page/packet was consumed by XDP */ } - skb = mlx5e_build_linear_skb(rq, xdp.data_hard_start, rq->buff.frame0_sz, - xdp.data - xdp.data_hard_start, - xdp.data_end - xdp.data, - xdp.data - xdp.data_meta); + skb = mlx5e_build_linear_skb(rq, mxbuf.xdp.data_hard_start, rq->buff.frame0_sz, + mxbuf.xdp.data - mxbuf.xdp.data_hard_start, + mxbuf.xdp.data_end - mxbuf.xdp.data, + mxbuf.xdp.data - mxbuf.xdp.data_meta); if (unlikely(!skb)) return NULL; page_ref_inc(au->page); - if (unlikely(xdp_buff_has_frags(&xdp))) { + if (unlikely(xdp_buff_has_frags(&mxbuf.xdp))) { int i; /* sinfo->nr_frags is reset by build_skb, calculate again. */ xdp_update_skb_shared_info(skb, wi - head_wi - 1, sinfo->xdp_frags_size, truesize, - xdp_buff_is_frag_pfmemalloc(&xdp)); + xdp_buff_is_frag_pfmemalloc(&mxbuf.xdp)); for (i = 0; i < sinfo->nr_frags; i++) { skb_frag_t *frag = &sinfo->frags[i]; @@ -1777,7 +1777,7 @@ static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) mlx5e_skb_from_cqe_linear, mlx5e_skb_from_cqe_nonlinear, mlx5e_xsk_skb_from_cqe_linear, - rq, wi, cqe_bcnt); + rq, wi, cqe, cqe_bcnt); if (!skb) { /* probably for XDP */ if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { @@ -1830,7 +1830,7 @@ static void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) skb = INDIRECT_CALL_2(rq->wqe.skb_from_cqe, mlx5e_skb_from_cqe_linear, mlx5e_skb_from_cqe_nonlinear, - rq, wi, cqe_bcnt); + rq, wi, cqe, cqe_bcnt); if (!skb) { /* probably for XDP */ if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { @@ -1889,7 +1889,7 @@ static void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 skb = INDIRECT_CALL_2(rq->mpwqe.skb_from_cqe_mpwrq, mlx5e_skb_from_cqe_mpwrq_linear, mlx5e_skb_from_cqe_mpwrq_nonlinear, - rq, wi, cqe_bcnt, head_offset, page_idx); + rq, wi, cqe, cqe_bcnt, head_offset, page_idx); if (!skb) goto mpwrq_cqe_out; @@ -1940,7 +1940,8 @@ mlx5e_fill_skb_data(struct sk_buff *skb, struct mlx5e_rq *rq, static struct sk_buff * mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, - u16 cqe_bcnt, u32 head_offset, u32 page_idx) + struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset, + u32 page_idx) { union mlx5e_alloc_unit *au = &wi->alloc_units[page_idx]; u16 headlen = min_t(u16, MLX5E_RX_MAX_HEAD, cqe_bcnt); @@ -1979,7 +1980,8 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w static struct sk_buff * mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, - u16 cqe_bcnt, u32 head_offset, u32 page_idx) + struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset, + u32 page_idx) { union mlx5e_alloc_unit *au = &wi->alloc_units[page_idx]; u16 rx_headroom = rq->buff.headroom; @@ -2007,19 +2009,19 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, prog = rcu_dereference(rq->xdp_prog); if (prog) { - struct xdp_buff xdp; + struct mlx5e_xdp_buff mxbuf; net_prefetchw(va); /* xdp_frame data area */ - mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp); - if (mlx5e_xdp_handle(rq, au->page, prog, &xdp)) { + mlx5e_fill_mxbuf(rq, cqe, va, rx_headroom, cqe_bcnt, &mxbuf); + if (mlx5e_xdp_handle(rq, au->page, prog, &mxbuf)) { if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) __set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */ return NULL; /* page/packet was consumed by XDP */ } - rx_headroom = xdp.data - xdp.data_hard_start; - metasize = xdp.data - xdp.data_meta; - cqe_bcnt = xdp.data_end - xdp.data; + rx_headroom = mxbuf.xdp.data - mxbuf.xdp.data_hard_start; + metasize = mxbuf.xdp.data - mxbuf.xdp.data_meta; + cqe_bcnt = mxbuf.xdp.data_end - mxbuf.xdp.data; } frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt); skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt, metasize); @@ -2174,8 +2176,8 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq if (likely(head_size)) *skb = mlx5e_skb_from_cqe_shampo(rq, wi, cqe, header_index); else - *skb = mlx5e_skb_from_cqe_mpwrq_nonlinear(rq, wi, cqe_bcnt, data_offset, - page_idx); + *skb = mlx5e_skb_from_cqe_mpwrq_nonlinear(rq, wi, cqe, cqe_bcnt, + data_offset, page_idx); if (unlikely(!*skb)) goto free_hd_entry; @@ -2249,7 +2251,8 @@ static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cq mlx5e_skb_from_cqe_mpwrq_linear, mlx5e_skb_from_cqe_mpwrq_nonlinear, mlx5e_xsk_skb_from_cqe_mpwrq_linear, - rq, wi, cqe_bcnt, head_offset, page_idx); + rq, wi, cqe, cqe_bcnt, head_offset, + page_idx); if (!skb) goto mpwrq_cqe_out; @@ -2494,7 +2497,7 @@ static void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) skb = INDIRECT_CALL_2(rq->wqe.skb_from_cqe, mlx5e_skb_from_cqe_linear, mlx5e_skb_from_cqe_nonlinear, - rq, wi, cqe_bcnt); + rq, wi, cqe, cqe_bcnt); if (!skb) goto wq_free_wqe; @@ -2586,7 +2589,7 @@ static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe goto free_wqe; } - skb = mlx5e_skb_from_cqe_nonlinear(rq, wi, cqe_bcnt); + skb = mlx5e_skb_from_cqe_nonlinear(rq, wi, cqe, cqe_bcnt); if (!skb) goto free_wqe; diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 50854265864d..f60eb97e3a62 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -315,10 +315,6 @@ nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf) NSIM_EA(bpf->extack, "xdpoffload of non-bound program"); return -EINVAL; } - if (!bpf_offload_dev_match(bpf->prog, ns->netdev)) { - NSIM_EA(bpf->extack, "program bound to different dev"); - return -EINVAL; - } state = bpf->prog->aux->offload->dev_priv; if (WARN_ON(strcmp(state->state, "xlated"))) { diff --git a/drivers/net/veth.c b/drivers/net/veth.c index dfc7d87fad59..ba3e05832843 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -116,6 +116,11 @@ static struct { { "peer_ifindex" }, }; +struct veth_xdp_buff { + struct xdp_buff xdp; + struct sk_buff *skb; +}; + static int veth_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { @@ -592,23 +597,25 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (likely(xdp_prog)) { - struct xdp_buff xdp; + struct veth_xdp_buff vxbuf; + struct xdp_buff *xdp = &vxbuf.xdp; u32 act; - xdp_convert_frame_to_buff(frame, &xdp); - xdp.rxq = &rq->xdp_rxq; + xdp_convert_frame_to_buff(frame, xdp); + xdp->rxq = &rq->xdp_rxq; + vxbuf.skb = NULL; - act = bpf_prog_run_xdp(xdp_prog, &xdp); + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: - if (xdp_update_frame_from_buff(&xdp, frame)) + if (xdp_update_frame_from_buff(xdp, frame)) goto err_xdp; break; case XDP_TX: orig_frame = *frame; - xdp.rxq->mem = frame->mem; - if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { + xdp->rxq->mem = frame->mem; + if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); frame = &orig_frame; stats->rx_drops++; @@ -619,8 +626,8 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, goto xdp_xmit; case XDP_REDIRECT: orig_frame = *frame; - xdp.rxq->mem = frame->mem; - if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { + xdp->rxq->mem = frame->mem; + if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { frame = &orig_frame; stats->rx_drops++; goto err_xdp; @@ -801,7 +808,8 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, { void *orig_data, *orig_data_end; struct bpf_prog *xdp_prog; - struct xdp_buff xdp; + struct veth_xdp_buff vxbuf; + struct xdp_buff *xdp = &vxbuf.xdp; u32 act, metalen; int off; @@ -815,22 +823,23 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, } __skb_push(skb, skb->data - skb_mac_header(skb)); - if (veth_convert_skb_to_xdp_buff(rq, &xdp, &skb)) + if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb)) goto drop; + vxbuf.skb = skb; - orig_data = xdp.data; - orig_data_end = xdp.data_end; + orig_data = xdp->data; + orig_data_end = xdp->data_end; - act = bpf_prog_run_xdp(xdp_prog, &xdp); + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: break; case XDP_TX: - veth_xdp_get(&xdp); + veth_xdp_get(xdp); consume_skb(skb); - xdp.rxq->mem = rq->xdp_mem; - if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { + xdp->rxq->mem = rq->xdp_mem; + if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); stats->rx_drops++; goto err_xdp; @@ -839,10 +848,10 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, rcu_read_unlock(); goto xdp_xmit; case XDP_REDIRECT: - veth_xdp_get(&xdp); + veth_xdp_get(xdp); consume_skb(skb); - xdp.rxq->mem = rq->xdp_mem; - if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { + xdp->rxq->mem = rq->xdp_mem; + if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { stats->rx_drops++; goto err_xdp; } @@ -862,7 +871,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, rcu_read_unlock(); /* check if bpf_xdp_adjust_head was used */ - off = orig_data - xdp.data; + off = orig_data - xdp->data; if (off > 0) __skb_push(skb, off); else if (off < 0) @@ -871,21 +880,21 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, skb_reset_mac_header(skb); /* check if bpf_xdp_adjust_tail was used */ - off = xdp.data_end - orig_data_end; + off = xdp->data_end - orig_data_end; if (off != 0) __skb_put(skb, off); /* positive on grow, negative on shrink */ /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. */ - if (xdp_buff_has_frags(&xdp)) + if (xdp_buff_has_frags(xdp)) skb->data_len = skb_shinfo(skb)->xdp_frags_size; else skb->data_len = 0; skb->protocol = eth_type_trans(skb, rq->dev); - metalen = xdp.data - xdp.data_meta; + metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); out: @@ -898,7 +907,7 @@ xdp_drop: return NULL; err_xdp: rcu_read_unlock(); - xdp_return_buff(&xdp); + xdp_return_buff(xdp); xdp_xmit: return NULL; } @@ -1596,6 +1605,28 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } +static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) +{ + struct veth_xdp_buff *_ctx = (void *)ctx; + + if (!_ctx->skb) + return -EOPNOTSUPP; + + *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp; + return 0; +} + +static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash) +{ + struct veth_xdp_buff *_ctx = (void *)ctx; + + if (!_ctx->skb) + return -EOPNOTSUPP; + + *hash = skb_get_hash(_ctx->skb); + return 0; +} + static const struct net_device_ops veth_netdev_ops = { .ndo_init = veth_dev_init, .ndo_open = veth_open, @@ -1617,6 +1648,11 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_get_peer_dev = veth_peer_dev, }; +static const struct xdp_metadata_ops veth_xdp_metadata_ops = { + .xmo_rx_timestamp = veth_xdp_rx_timestamp, + .xmo_rx_hash = veth_xdp_rx_hash, +}; + #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ @@ -1633,6 +1669,7 @@ static void veth_setup(struct net_device *dev) dev->priv_flags |= IFF_PHONY_HEADROOM; dev->netdev_ops = &veth_netdev_ops; + dev->xdp_metadata_ops = &veth_xdp_metadata_ops; dev->ethtool_ops = &veth_ethtool_ops; dev->features |= NETIF_F_LLTX; dev->features |= VETH_FEATURES; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ae7771c7d750..ad4bb36d4c10 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1261,7 +1261,8 @@ struct bpf_prog_aux { enum bpf_prog_type saved_dst_prog_type; enum bpf_attach_type saved_dst_attach_type; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ - bool offload_requested; + bool dev_bound; /* Program is bound to the netdev. */ + bool offload_requested; /* Program is bound and offloaded to the netdev. */ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; bool sleepable; @@ -2451,7 +2452,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux, bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); int bpf_prog_offload_compile(struct bpf_prog *prog); -void bpf_prog_offload_destroy(struct bpf_prog *prog); +void bpf_prog_dev_bound_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); @@ -2479,14 +2480,26 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev); void unpriv_ebpf_notify(int new_state); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) -int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); +int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, + struct bpf_prog_aux *prog_aux); +void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id); +int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr); +int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog); +void bpf_dev_bound_netdev_unregister(struct net_device *dev); static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) { + return aux->dev_bound; +} + +static inline bool bpf_prog_is_offloaded(const struct bpf_prog_aux *aux) +{ return aux->offload_requested; } -static inline bool bpf_map_is_dev_bound(struct bpf_map *map) +bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs); + +static inline bool bpf_map_is_offloaded(struct bpf_map *map) { return unlikely(map->ops == &bpf_map_offload_ops); } @@ -2507,18 +2520,50 @@ void sock_map_unhash(struct sock *sk); void sock_map_destroy(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else -static inline int bpf_prog_offload_init(struct bpf_prog *prog, - union bpf_attr *attr) +static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, + struct bpf_prog_aux *prog_aux) +{ + return -EOPNOTSUPP; +} + +static inline void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, + u32 func_id) +{ + return NULL; +} + +static inline int bpf_prog_dev_bound_init(struct bpf_prog *prog, + union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + +static inline int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, + struct bpf_prog *old_prog) { return -EOPNOTSUPP; } -static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) +static inline void bpf_dev_bound_netdev_unregister(struct net_device *dev) +{ +} + +static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) +{ + return false; +} + +static inline bool bpf_prog_is_offloaded(struct bpf_prog_aux *aux) +{ + return false; +} + +static inline bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs) { return false; } -static inline bool bpf_map_is_dev_bound(struct bpf_map *map) +static inline bool bpf_map_is_offloaded(struct bpf_map *map) { return false; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index aad12a179e54..90f2be194bc5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -74,6 +74,7 @@ struct udp_tunnel_nic_info; struct udp_tunnel_nic; struct bpf_prog; struct xdp_buff; +struct xdp_md; void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, @@ -1618,6 +1619,11 @@ struct net_device_ops { bool cycles); }; +struct xdp_metadata_ops { + int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); + int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash); +}; + /** * enum netdev_priv_flags - &struct net_device priv_flags * @@ -1801,6 +1807,7 @@ enum netdev_ml_priv_type { * * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions + * @xdp_metadata_ops: Includes pointers to XDP metadata callbacks. * @ethtool_ops: Management operations * @l3mdev_ops: Layer 3 master device operations * @ndisc_ops: Includes callbacks for different IPv6 neighbour @@ -2050,6 +2057,7 @@ struct net_device { unsigned int flags; unsigned long long priv_flags; const struct net_device_ops *netdev_ops; + const struct xdp_metadata_ops *xdp_metadata_ops; int ifindex; unsigned short gflags; unsigned short hard_header_len; diff --git a/include/net/xdp.h b/include/net/xdp.h index 55dbc68bfffc..91292aa13bc0 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -409,4 +409,25 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE +#define XDP_METADATA_KFUNC_xxx \ + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ + bpf_xdp_metadata_rx_timestamp) \ + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ + bpf_xdp_metadata_rx_hash) \ + +enum { +#define XDP_METADATA_KFUNC(name, _) name, +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC +MAX_XDP_METADATA_KFUNC, +}; + +#ifdef CONFIG_NET +u32 bpf_xdp_metadata_kfunc_id(int id); +bool bpf_dev_bound_kfunc_id(u32 btf_id); +#else +static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } +static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } +#endif + #endif /* __LINUX_NET_XDP_H__ */ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index f787c3f524b0..3e952e569418 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -19,8 +19,11 @@ struct xdp_sock; struct device; struct page; +#define XSK_PRIV_MAX 24 + struct xdp_buff_xsk { struct xdp_buff xdp; + u8 cb[XSK_PRIV_MAX]; dma_addr_t dma; dma_addr_t frame_dma; struct xsk_buff_pool *pool; @@ -28,6 +31,8 @@ struct xdp_buff_xsk { struct list_head free_list_node; }; +#define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) + struct xsk_dma_map { dma_addr_t *dma_pages; struct device *dev; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index adae5b168f9d..ba0f0cfb5e42 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1156,6 +1156,11 @@ enum bpf_link_type { */ #define BPF_F_XDP_HAS_FRAGS (1U << 5) +/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded + * program becomes device-bound but can access XDP metadata. + */ +#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) + /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ba3fff17e2f9..16da51093aff 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2096,6 +2096,14 @@ bool bpf_prog_map_compatible(struct bpf_map *map, if (fp->kprobe_override) return false; + /* XDP programs inserted into maps are not guaranteed to run on + * a particular netdev (and can run outside driver context entirely + * in the case of devmap and cpumap). Until device checks + * are implemented, prohibit adding dev-bound programs to program maps. + */ + if (bpf_prog_is_dev_bound(fp->aux)) + return false; + spin_lock(&map->owner.lock); if (!map->owner.type) { /* There's no owner yet where we could check for @@ -2182,7 +2190,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. */ - if (!bpf_prog_is_dev_bound(fp->aux)) { + if (!bpf_prog_is_offloaded(fp->aux)) { *err = bpf_prog_alloc_jited_linfo(fp); if (*err) return fp; @@ -2554,7 +2562,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) bpf_free_used_maps(aux); bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) - bpf_prog_offload_destroy(aux->prog); + bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS if (aux->prog->has_callchain_buf) put_callchain_buffers(); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 13e4efc971e6..e87cab2ed710 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -41,7 +41,7 @@ struct bpf_offload_dev { struct bpf_offload_netdev { struct rhash_head l; struct net_device *netdev; - struct bpf_offload_dev *offdev; + struct bpf_offload_dev *offdev; /* NULL when bound-only */ struct list_head progs; struct list_head maps; struct list_head offdev_netdevs; @@ -56,7 +56,6 @@ static const struct rhashtable_params offdevs_params = { }; static struct rhashtable offdevs; -static bool offdevs_inited; static int bpf_dev_offload_check(struct net_device *netdev) { @@ -72,58 +71,221 @@ bpf_offload_find_netdev(struct net_device *netdev) { lockdep_assert_held(&bpf_devs_lock); - if (!offdevs_inited) - return NULL; return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); } -int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) +static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev) { struct bpf_offload_netdev *ondev; - struct bpf_prog_offload *offload; int err; - if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && - attr->prog_type != BPF_PROG_TYPE_XDP) - return -EINVAL; + ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); + if (!ondev) + return -ENOMEM; - if (attr->prog_flags) - return -EINVAL; + ondev->netdev = netdev; + ondev->offdev = offdev; + INIT_LIST_HEAD(&ondev->progs); + INIT_LIST_HEAD(&ondev->maps); + + err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); + if (err) { + netdev_warn(netdev, "failed to register for BPF offload\n"); + goto err_free; + } + + if (offdev) + list_add(&ondev->offdev_netdevs, &offdev->netdevs); + return 0; + +err_free: + kfree(ondev); + return err; +} + +static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_prog_offload *offload = prog->aux->offload; + + if (offload->dev_state) + offload->offdev->ops->destroy(prog); + + /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ + bpf_prog_free_id(prog, true); + + list_del_init(&offload->offloads); + kfree(offload); + prog->aux->offload = NULL; +} + +static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, + enum bpf_netdev_command cmd) +{ + struct netdev_bpf data = {}; + struct net_device *netdev; + + ASSERT_RTNL(); + + data.command = cmd; + data.offmap = offmap; + /* Caller must make sure netdev is valid */ + netdev = offmap->netdev; + + return netdev->netdev_ops->ndo_bpf(netdev, &data); +} + +static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) +{ + WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); + /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ + bpf_map_free_id(&offmap->map, true); + list_del_init(&offmap->offloads); + offmap->netdev = NULL; +} + +static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev) +{ + struct bpf_offload_netdev *ondev, *altdev = NULL; + struct bpf_offloaded_map *offmap, *mtmp; + struct bpf_prog_offload *offload, *ptmp; + + ASSERT_RTNL(); + + ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); + if (WARN_ON(!ondev)) + return; + + WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); + + /* Try to move the objects to another netdev of the device */ + if (offdev) { + list_del(&ondev->offdev_netdevs); + altdev = list_first_entry_or_null(&offdev->netdevs, + struct bpf_offload_netdev, + offdev_netdevs); + } + + if (altdev) { + list_for_each_entry(offload, &ondev->progs, offloads) + offload->netdev = altdev->netdev; + list_splice_init(&ondev->progs, &altdev->progs); + + list_for_each_entry(offmap, &ondev->maps, offloads) + offmap->netdev = altdev->netdev; + list_splice_init(&ondev->maps, &altdev->maps); + } else { + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); + } + + WARN_ON(!list_empty(&ondev->progs)); + WARN_ON(!list_empty(&ondev->maps)); + kfree(ondev); +} + +static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev) +{ + struct bpf_offload_netdev *ondev; + struct bpf_prog_offload *offload; + int err; offload = kzalloc(sizeof(*offload), GFP_USER); if (!offload) return -ENOMEM; offload->prog = prog; + offload->netdev = netdev; - offload->netdev = dev_get_by_index(current->nsproxy->net_ns, - attr->prog_ifindex); - err = bpf_dev_offload_check(offload->netdev); - if (err) - goto err_maybe_put; - - down_write(&bpf_devs_lock); ondev = bpf_offload_find_netdev(offload->netdev); if (!ondev) { - err = -EINVAL; - goto err_unlock; + if (bpf_prog_is_offloaded(prog->aux)) { + err = -EINVAL; + goto err_free; + } + + /* When only binding to the device, explicitly + * create an entry in the hashtable. + */ + err = __bpf_offload_dev_netdev_register(NULL, offload->netdev); + if (err) + goto err_free; + ondev = bpf_offload_find_netdev(offload->netdev); } offload->offdev = ondev->offdev; prog->aux->offload = offload; list_add_tail(&offload->offloads, &ondev->progs); - dev_put(offload->netdev); - up_write(&bpf_devs_lock); return 0; -err_unlock: - up_write(&bpf_devs_lock); -err_maybe_put: - if (offload->netdev) - dev_put(offload->netdev); +err_free: kfree(offload); return err; } +int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) +{ + struct net_device *netdev; + int err; + + if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && + attr->prog_type != BPF_PROG_TYPE_XDP) + return -EINVAL; + + if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY) + return -EINVAL; + + if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS && + attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY) + return -EINVAL; + + netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); + if (!netdev) + return -EINVAL; + + err = bpf_dev_offload_check(netdev); + if (err) + goto out; + + prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY); + + down_write(&bpf_devs_lock); + err = __bpf_prog_dev_bound_init(prog, netdev); + up_write(&bpf_devs_lock); + +out: + dev_put(netdev); + return err; +} + +int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog) +{ + int err; + + if (!bpf_prog_is_dev_bound(old_prog->aux)) + return 0; + + if (bpf_prog_is_offloaded(old_prog->aux)) + return -EINVAL; + + new_prog->aux->dev_bound = old_prog->aux->dev_bound; + new_prog->aux->offload_requested = old_prog->aux->offload_requested; + + down_write(&bpf_devs_lock); + if (!old_prog->aux->offload) { + err = -EINVAL; + goto out; + } + + err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev); + +out: + up_write(&bpf_devs_lock); + return err; +} + int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) { struct bpf_prog_offload *offload; @@ -209,27 +371,25 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) up_read(&bpf_devs_lock); } -static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +void bpf_prog_dev_bound_destroy(struct bpf_prog *prog) { - struct bpf_prog_offload *offload = prog->aux->offload; - - if (offload->dev_state) - offload->offdev->ops->destroy(prog); - - /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ - bpf_prog_free_id(prog, true); - - list_del_init(&offload->offloads); - kfree(offload); - prog->aux->offload = NULL; -} + struct bpf_offload_netdev *ondev; + struct net_device *netdev; -void bpf_prog_offload_destroy(struct bpf_prog *prog) -{ + rtnl_lock(); down_write(&bpf_devs_lock); - if (prog->aux->offload) + if (prog->aux->offload) { + list_del_init(&prog->aux->offload->offloads); + + netdev = prog->aux->offload->netdev; __bpf_prog_offload_destroy(prog); + + ondev = bpf_offload_find_netdev(netdev); + if (!ondev->offdev && list_empty(&ondev->progs)) + __bpf_offload_dev_netdev_unregister(NULL, netdev); + } up_write(&bpf_devs_lock); + rtnl_unlock(); } static int bpf_prog_offload_translate(struct bpf_prog *prog) @@ -343,22 +503,6 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, const struct bpf_prog_ops bpf_offload_prog_ops = { }; -static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, - enum bpf_netdev_command cmd) -{ - struct netdev_bpf data = {}; - struct net_device *netdev; - - ASSERT_RTNL(); - - data.command = cmd; - data.offmap = offmap; - /* Caller must make sure netdev is valid */ - netdev = offmap->netdev; - - return netdev->netdev_ops->ndo_bpf(netdev, &data); -} - struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { struct net *net = current->nsproxy->net_ns; @@ -408,15 +552,6 @@ err_unlock: return ERR_PTR(err); } -static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) -{ - WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); - /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ - bpf_map_free_id(&offmap->map, true); - list_del_init(&offmap->offloads); - offmap->netdev = NULL; -} - void bpf_map_offload_map_free(struct bpf_map *map) { struct bpf_offloaded_map *offmap = map_to_offmap(map); @@ -576,12 +711,28 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) } EXPORT_SYMBOL_GPL(bpf_offload_dev_match); +bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs) +{ + bool ret; + + if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux)) + return false; + + down_read(&bpf_devs_lock); + ret = lhs->aux->offload && rhs->aux->offload && + lhs->aux->offload->netdev && + lhs->aux->offload->netdev == rhs->aux->offload->netdev; + up_read(&bpf_devs_lock); + + return ret; +} + bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; bool ret; - if (!bpf_map_is_dev_bound(map)) + if (!bpf_map_is_offloaded(map)) return bpf_map_offload_neutral(map); offmap = map_to_offmap(map); @@ -595,32 +746,11 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev) { - struct bpf_offload_netdev *ondev; int err; - ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); - if (!ondev) - return -ENOMEM; - - ondev->netdev = netdev; - ondev->offdev = offdev; - INIT_LIST_HEAD(&ondev->progs); - INIT_LIST_HEAD(&ondev->maps); - down_write(&bpf_devs_lock); - err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); - if (err) { - netdev_warn(netdev, "failed to register for BPF offload\n"); - goto err_unlock_free; - } - - list_add(&ondev->offdev_netdevs, &offdev->netdevs); - up_write(&bpf_devs_lock); - return 0; - -err_unlock_free: + err = __bpf_offload_dev_netdev_register(offdev, netdev); up_write(&bpf_devs_lock); - kfree(ondev); return err; } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); @@ -628,43 +758,8 @@ EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev) { - struct bpf_offload_netdev *ondev, *altdev; - struct bpf_offloaded_map *offmap, *mtmp; - struct bpf_prog_offload *offload, *ptmp; - - ASSERT_RTNL(); - down_write(&bpf_devs_lock); - ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); - if (WARN_ON(!ondev)) - goto unlock; - - WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); - list_del(&ondev->offdev_netdevs); - - /* Try to move the objects to another netdev of the device */ - altdev = list_first_entry_or_null(&offdev->netdevs, - struct bpf_offload_netdev, - offdev_netdevs); - if (altdev) { - list_for_each_entry(offload, &ondev->progs, offloads) - offload->netdev = altdev->netdev; - list_splice_init(&ondev->progs, &altdev->progs); - - list_for_each_entry(offmap, &ondev->maps, offloads) - offmap->netdev = altdev->netdev; - list_splice_init(&ondev->maps, &altdev->maps); - } else { - list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) - __bpf_prog_offload_destroy(offload->prog); - list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) - __bpf_map_offload_destroy(offmap); - } - - WARN_ON(!list_empty(&ondev->progs)); - WARN_ON(!list_empty(&ondev->maps)); - kfree(ondev); -unlock: + __bpf_offload_dev_netdev_unregister(offdev, netdev); up_write(&bpf_devs_lock); } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); @@ -673,18 +768,6 @@ struct bpf_offload_dev * bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) { struct bpf_offload_dev *offdev; - int err; - - down_write(&bpf_devs_lock); - if (!offdevs_inited) { - err = rhashtable_init(&offdevs, &offdevs_params); - if (err) { - up_write(&bpf_devs_lock); - return ERR_PTR(err); - } - offdevs_inited = true; - } - up_write(&bpf_devs_lock); offdev = kzalloc(sizeof(*offdev), GFP_KERNEL); if (!offdev) @@ -710,3 +793,67 @@ void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev) return offdev->priv; } EXPORT_SYMBOL_GPL(bpf_offload_dev_priv); + +void bpf_dev_bound_netdev_unregister(struct net_device *dev) +{ + struct bpf_offload_netdev *ondev; + + ASSERT_RTNL(); + + down_write(&bpf_devs_lock); + ondev = bpf_offload_find_netdev(dev); + if (ondev && !ondev->offdev) + __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev); + up_write(&bpf_devs_lock); +} + +int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, + struct bpf_prog_aux *prog_aux) +{ + if (!bpf_prog_is_dev_bound(prog_aux)) { + bpf_log(log, "metadata kfuncs require device-bound program\n"); + return -EINVAL; + } + + if (bpf_prog_is_offloaded(prog_aux)) { + bpf_log(log, "metadata kfuncs can't be offloaded\n"); + return -EINVAL; + } + + return 0; +} + +void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) +{ + const struct xdp_metadata_ops *ops; + void *p = NULL; + + /* We don't hold bpf_devs_lock while resolving several + * kfuncs and can race with the unregister_netdevice(). + * We rely on bpf_dev_bound_match() check at attach + * to render this program unusable. + */ + down_read(&bpf_devs_lock); + if (!prog->aux->offload) + goto out; + + ops = prog->aux->offload->netdev->xdp_metadata_ops; + if (!ops) + goto out; + + if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) + p = ops->xmo_rx_timestamp; + else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH)) + p = ops->xmo_rx_hash; +out: + up_read(&bpf_devs_lock); + + return p; +} + +static int __init bpf_offload_init(void) +{ + return rhashtable_init(&offdevs, &offdevs_params); +} + +late_initcall(bpf_offload_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 35ffd808f281..d5ffa7a01dfb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -181,7 +181,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, int err; /* Need to create a kthread, thus must support schedule */ - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { @@ -238,7 +238,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, void *ptr; int err; - if (bpf_map_is_dev_bound(map)) + if (bpf_map_is_offloaded(map)) return bpf_map_offload_lookup_elem(map, key, value); bpf_disable_instrumentation(); @@ -1483,7 +1483,7 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; } else if (IS_FD_PROG_ARRAY(map) || @@ -1547,7 +1547,7 @@ static int map_get_next_key(union bpf_attr *attr) if (!next_key) goto free_key; - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_get_next_key(map, key, next_key); goto out; } @@ -1605,7 +1605,7 @@ int generic_map_delete_batch(struct bpf_map *map, map->key_size)) break; - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); break; } @@ -1851,7 +1851,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - if (!bpf_map_is_dev_bound(map)) { + if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); @@ -1944,7 +1944,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) if (!ops) return -EINVAL; - if (!bpf_prog_is_dev_bound(prog->aux)) + if (!bpf_prog_is_offloaded(prog->aux)) prog->aux->ops = ops; else prog->aux->ops = &bpf_offload_prog_ops; @@ -2255,7 +2255,7 @@ bool bpf_prog_get_ok(struct bpf_prog *prog, if (prog->type != *attach_type) return false; - if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) + if (bpf_prog_is_offloaded(prog->aux) && !attach_drv) return false; return true; @@ -2491,7 +2491,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) BPF_F_TEST_STATE_FREQ | BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32 | - BPF_F_XDP_HAS_FRAGS)) + BPF_F_XDP_HAS_FRAGS | + BPF_F_XDP_DEV_BOUND_ONLY)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -2575,7 +2576,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) prog->aux->attach_btf = attach_btf; prog->aux->attach_btf_id = attr->attach_btf_id; prog->aux->dst_prog = dst_prog; - prog->aux->offload_requested = !!attr->prog_ifindex; + prog->aux->dev_bound = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; @@ -2599,7 +2600,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) prog->gpl_compatible = is_gpl ? 1 : 0; if (bpf_prog_is_dev_bound(prog->aux)) { - err = bpf_prog_offload_init(prog, attr); + err = bpf_prog_dev_bound_init(prog, attr); + if (err) + goto free_prog_sec; + } + + if (type == BPF_PROG_TYPE_EXT && dst_prog && + bpf_prog_is_dev_bound(dst_prog->aux)) { + err = bpf_prog_dev_bound_inherit(prog, dst_prog); if (err) goto free_prog_sec; } @@ -3997,7 +4005,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, return -EFAULT; } - if (bpf_prog_is_dev_bound(prog->aux)) { + if (bpf_prog_is_offloaded(prog->aux)) { err = bpf_prog_offload_info_fill(&info, prog); if (err) return err; @@ -4225,7 +4233,7 @@ static int bpf_map_get_info_by_fd(struct file *file, } info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; - if (bpf_map_is_dev_bound(map)) { + if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_info_fill(&info, map); if (err) return err; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ecf7fed7881c..800488289297 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2333,6 +2333,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -EINVAL; } + if (bpf_dev_bound_kfunc_id(func_id)) { + err = bpf_dev_bound_kfunc_check(&env->log, prog_aux); + if (err) + return err; + } + desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; desc->imm = call_imm; @@ -14099,7 +14105,7 @@ static int do_check(struct bpf_verifier_env *env) env->prev_log_len = env->log.len_used; } - if (bpf_prog_is_dev_bound(env->prog->aux)) { + if (bpf_prog_is_offloaded(env->prog->aux)) { err = bpf_prog_offload_verify_insn(env, env->insn_idx, env->prev_insn_idx); if (err) @@ -14579,7 +14585,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && + if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); return -EINVAL; @@ -15060,7 +15066,7 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) unsigned int orig_prog_len = env->prog->len; int err; - if (bpf_prog_is_dev_bound(env->prog->aux)) + if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_remove_insns(env, off, cnt); err = bpf_remove_insns(env->prog, off, cnt); @@ -15141,7 +15147,7 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) else continue; - if (bpf_prog_is_dev_bound(env->prog->aux)) + if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_replace_insn(env, i, &ja); memcpy(insn, &ja, sizeof(ja)); @@ -15328,7 +15334,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - if (bpf_prog_is_dev_bound(env->prog->aux)) + if (bpf_prog_is_offloaded(env->prog->aux)) return 0; insn = env->prog->insnsi + delta; @@ -15728,7 +15734,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) int err = 0; if (env->prog->jit_requested && - !bpf_prog_is_dev_bound(env->prog->aux)) { + !bpf_prog_is_offloaded(env->prog->aux)) { err = jit_subprogs(env); if (err == 0) return 0; @@ -15772,12 +15778,25 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt) { const struct bpf_kfunc_desc *desc; + void *xdp_kfunc; if (!insn->imm) { verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); return -EINVAL; } + *cnt = 0; + + if (bpf_dev_bound_kfunc_id(insn->imm)) { + xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm); + if (xdp_kfunc) { + insn->imm = BPF_CALL_IMM(xdp_kfunc); + return 0; + } + + /* fallback to default kfunc when not supported by netdev */ + } + /* insn->imm has the btf func_id. Replace it with * an address (relative to __bpf_call_base). */ @@ -15788,7 +15807,6 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } - *cnt = 0; insn->imm = desc->imm; if (insn->off) return 0; @@ -16795,6 +16813,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; + if (bpf_prog_is_dev_bound(prog->aux) && + !bpf_prog_dev_bound_match(prog, tgt_prog)) { + bpf_log(log, "Target program bound device mismatch"); + return -EINVAL; + } + for (i = 0; i < aux->func_info_cnt; i++) if (aux->func_info[i].type_id == btf_id) { subprog = i; @@ -17231,7 +17255,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) if (ret < 0) goto skip_full_check; - if (bpf_prog_is_dev_bound(env->prog->aux)) { + if (bpf_prog_is_offloaded(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) goto skip_full_check; @@ -17244,7 +17268,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) ret = do_check_subprogs(env); ret = ret ?: do_check_main(env); - if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux)) + if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux)) ret = bpf_prog_offload_finalize(env); skip_full_check: @@ -17279,7 +17303,7 @@ skip_full_check: /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. */ - if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) { + if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) { ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret : false; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2723623429ac..8da0d73b368e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1300,6 +1300,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES) return -EINVAL; + if (bpf_prog_is_dev_bound(prog->aux)) + return -EINVAL; + if (do_live) { if (!batch_size) batch_size = NAPI_POLL_WEIGHT; diff --git a/net/core/dev.c b/net/core/dev.c index cf78f35bc0b9..e66da626df84 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9224,8 +9224,12 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } - if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { - NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); + if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { + NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); + return -EINVAL; + } + if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { + NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { @@ -10830,6 +10834,7 @@ void unregister_netdevice_many_notify(struct list_head *head, dev_shutdown(dev); dev_xdp_uninstall(dev); + bpf_dev_bound_netdev_unregister(dev); netdev_offload_xstats_disable_all(dev); diff --git a/net/core/filter.c b/net/core/filter.c index b4547a2c02f4..ed08dbf10338 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8760,7 +8760,7 @@ static bool xdp_is_valid_access(int off, int size, } if (type == BPF_WRITE) { - if (bpf_prog_is_dev_bound(prog->aux)) { + if (bpf_prog_is_offloaded(prog->aux)) { switch (off) { case offsetof(struct xdp_md, rx_queue_index): return __is_valid_xdp_access(off, size); diff --git a/net/core/xdp.c b/net/core/xdp.c index 844c9d99dc0e..a5a7ecf6391c 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -4,6 +4,7 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #include <linux/bpf.h> +#include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> @@ -709,3 +710,66 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) return nxdpf; } + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +/** + * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp. + * @ctx: XDP context pointer. + * @timestamp: Return value pointer. + * + * Returns 0 on success or ``-errno`` on error. + */ +int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) +{ + return -EOPNOTSUPP; +} + +/** + * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash. + * @ctx: XDP context pointer. + * @hash: Return value pointer. + * + * Returns 0 on success or ``-errno`` on error. + */ +int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash) +{ + return -EOPNOTSUPP; +} + +__diag_pop(); + +BTF_SET8_START(xdp_metadata_kfunc_ids) +#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, 0) +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC +BTF_SET8_END(xdp_metadata_kfunc_ids) + +static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { + .owner = THIS_MODULE, + .set = &xdp_metadata_kfunc_ids, +}; + +BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted) +#define XDP_METADATA_KFUNC(name, str) BTF_ID(func, str) +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + +u32 bpf_xdp_metadata_kfunc_id(int id) +{ + /* xdp_metadata_kfunc_ids is sorted and can't be used */ + return xdp_metadata_kfunc_ids_unsorted[id]; +} + +bool bpf_dev_bound_kfunc_id(u32 btf_id) +{ + return btf_id_set8_contains(&xdp_metadata_kfunc_ids, btf_id); +} + +static int __init xdp_metadata_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set); +} +late_initcall(xdp_metadata_init); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 142b81bcbb2e..7f024ac22edd 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1156,6 +1156,11 @@ enum bpf_link_type { */ #define BPF_F_XDP_HAS_FRAGS (1U << 5) +/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded + * program becomes device-bound but can access XDP metadata. + */ +#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) + /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. */ diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 401a75844cc0..4aa5bba956ff 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -47,3 +47,4 @@ test_cpp xskxceiver xdp_redirect_multi xdp_synproxy +xdp_hw_metadata diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index 96e8371f5c2a..b9ef9cc61d36 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -86,5 +86,6 @@ xdp_adjust_tail # case-128 err 0 errno 28 retval 1 size xdp_bonding # failed to auto-attach program 'trace_on_entry': -524 (trampoline) xdp_bpf2bpf # failed to auto-attach program 'trace_on_entry': -524 (trampoline) xdp_do_redirect # prog_run_max_size unexpected error: -22 (errno 22) +xdp_metadata # JIT does not support calling kernel function (kfunc) xdp_synproxy # JIT does not support calling kernel function (kfunc) xfrm_info # JIT does not support calling kernel function (kfunc) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 22533a18705e..a554b41fe872 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -83,7 +83,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ - xskxceiver xdp_redirect_multi xdp_synproxy veristat + xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/sign-file TEST_GEN_FILES += liburandom_read.so @@ -383,6 +383,7 @@ test_subskeleton.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton_lib test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.bpf.o test_subskeleton_lib.bpf.o test_usdt.skel.h-deps := test_usdt.bpf.o test_usdt_multispec.bpf.o xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o +xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) @@ -527,7 +528,7 @@ TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ network_helpers.c testing_helpers.c \ btf_helpers.c flow_dissector_load.h \ - cap_helpers.c test_loader.c + cap_helpers.c test_loader.c xsk.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ $(OUTPUT)/liburandom_read.so \ $(OUTPUT)/xdp_synproxy \ @@ -580,6 +581,10 @@ $(OUTPUT)/xskxceiver: xskxceiver.c $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel. $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ +$(OUTPUT)/xdp_hw_metadata: xdp_hw_metadata.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xdp_hw_metadata.skel.h | $(OUTPUT) + $(call msg,BINARY,,$@) + $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ + # Make sure we are able to include and link libbpf against c++. $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(call msg,CXX,,$@) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c new file mode 100644 index 000000000000..e033d48288c0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <network_helpers.h> +#include "xdp_metadata.skel.h" +#include "xdp_metadata2.skel.h" +#include "xdp_metadata.h" +#include "xsk.h" + +#include <bpf/btf.h> +#include <linux/errqueue.h> +#include <linux/if_link.h> +#include <linux/net_tstamp.h> +#include <linux/udp.h> +#include <sys/mman.h> +#include <net/if.h> +#include <poll.h> + +#define TX_NAME "veTX" +#define RX_NAME "veRX" + +#define UDP_PAYLOAD_BYTES 4 + +#define AF_XDP_SOURCE_PORT 1234 +#define AF_XDP_CONSUMER_PORT 8080 + +#define UMEM_NUM 16 +#define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE +#define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM) +#define XDP_FLAGS XDP_FLAGS_DRV_MODE +#define QUEUE_ID 0 + +#define TX_ADDR "10.0.0.1" +#define RX_ADDR "10.0.0.2" +#define PREFIX_LEN "8" +#define FAMILY AF_INET + +#define SYS(cmd) ({ \ + if (!ASSERT_OK(system(cmd), (cmd))) \ + goto out; \ +}) + +struct xsk { + void *umem_area; + struct xsk_umem *umem; + struct xsk_ring_prod fill; + struct xsk_ring_cons comp; + struct xsk_ring_prod tx; + struct xsk_ring_cons rx; + struct xsk_socket *socket; +}; + +static int open_xsk(int ifindex, struct xsk *xsk) +{ + int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; + const struct xsk_socket_config socket_config = { + .rx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .bind_flags = XDP_COPY, + }; + const struct xsk_umem_config umem_config = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, + .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG, + }; + __u32 idx; + u64 addr; + int ret; + int i; + + xsk->umem_area = mmap(NULL, UMEM_SIZE, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); + if (!ASSERT_NEQ(xsk->umem_area, MAP_FAILED, "mmap")) + return -1; + + ret = xsk_umem__create(&xsk->umem, + xsk->umem_area, UMEM_SIZE, + &xsk->fill, + &xsk->comp, + &umem_config); + if (!ASSERT_OK(ret, "xsk_umem__create")) + return ret; + + ret = xsk_socket__create(&xsk->socket, ifindex, QUEUE_ID, + xsk->umem, + &xsk->rx, + &xsk->tx, + &socket_config); + if (!ASSERT_OK(ret, "xsk_socket__create")) + return ret; + + /* First half of umem is for TX. This way address matches 1-to-1 + * to the completion queue index. + */ + + for (i = 0; i < UMEM_NUM / 2; i++) { + addr = i * UMEM_FRAME_SIZE; + printf("%p: tx_desc[%d] -> %lx\n", xsk, i, addr); + } + + /* Second half of umem is for RX. */ + + ret = xsk_ring_prod__reserve(&xsk->fill, UMEM_NUM / 2, &idx); + if (!ASSERT_EQ(UMEM_NUM / 2, ret, "xsk_ring_prod__reserve")) + return ret; + if (!ASSERT_EQ(idx, 0, "fill idx != 0")) + return -1; + + for (i = 0; i < UMEM_NUM / 2; i++) { + addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE; + printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr); + *xsk_ring_prod__fill_addr(&xsk->fill, i) = addr; + } + xsk_ring_prod__submit(&xsk->fill, ret); + + return 0; +} + +static void close_xsk(struct xsk *xsk) +{ + if (xsk->umem) + xsk_umem__delete(xsk->umem); + if (xsk->socket) + xsk_socket__delete(xsk->socket); + munmap(xsk->umem, UMEM_SIZE); +} + +static void ip_csum(struct iphdr *iph) +{ + __u32 sum = 0; + __u16 *p; + int i; + + iph->check = 0; + p = (void *)iph; + for (i = 0; i < sizeof(*iph) / sizeof(*p); i++) + sum += p[i]; + + while (sum >> 16) + sum = (sum & 0xffff) + (sum >> 16); + + iph->check = ~sum; +} + +static int generate_packet(struct xsk *xsk, __u16 dst_port) +{ + struct xdp_desc *tx_desc; + struct udphdr *udph; + struct ethhdr *eth; + struct iphdr *iph; + void *data; + __u32 idx; + int ret; + + ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx); + if (!ASSERT_EQ(ret, 1, "xsk_ring_prod__reserve")) + return -1; + + tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx); + tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE; + printf("%p: tx_desc[%u]->addr=%llx\n", xsk, idx, tx_desc->addr); + data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr); + + eth = data; + iph = (void *)(eth + 1); + udph = (void *)(iph + 1); + + memcpy(eth->h_dest, "\x00\x00\x00\x00\x00\x02", ETH_ALEN); + memcpy(eth->h_source, "\x00\x00\x00\x00\x00\x01", ETH_ALEN); + eth->h_proto = htons(ETH_P_IP); + + iph->version = 0x4; + iph->ihl = 0x5; + iph->tos = 0x9; + iph->tot_len = htons(sizeof(*iph) + sizeof(*udph) + UDP_PAYLOAD_BYTES); + iph->id = 0; + iph->frag_off = 0; + iph->ttl = 0; + iph->protocol = IPPROTO_UDP; + ASSERT_EQ(inet_pton(FAMILY, TX_ADDR, &iph->saddr), 1, "inet_pton(TX_ADDR)"); + ASSERT_EQ(inet_pton(FAMILY, RX_ADDR, &iph->daddr), 1, "inet_pton(RX_ADDR)"); + ip_csum(iph); + + udph->source = htons(AF_XDP_SOURCE_PORT); + udph->dest = htons(dst_port); + udph->len = htons(sizeof(*udph) + UDP_PAYLOAD_BYTES); + udph->check = 0; + + memset(udph + 1, 0xAA, UDP_PAYLOAD_BYTES); + + tx_desc->len = sizeof(*eth) + sizeof(*iph) + sizeof(*udph) + UDP_PAYLOAD_BYTES; + xsk_ring_prod__submit(&xsk->tx, 1); + + ret = sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0); + if (!ASSERT_GE(ret, 0, "sendto")) + return ret; + + return 0; +} + +static void complete_tx(struct xsk *xsk) +{ + __u32 idx; + __u64 addr; + + if (ASSERT_EQ(xsk_ring_cons__peek(&xsk->comp, 1, &idx), 1, "xsk_ring_cons__peek")) { + addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx); + + printf("%p: refill idx=%u addr=%llx\n", xsk, idx, addr); + *xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr; + xsk_ring_prod__submit(&xsk->fill, 1); + } +} + +static void refill_rx(struct xsk *xsk, __u64 addr) +{ + __u32 idx; + + if (ASSERT_EQ(xsk_ring_prod__reserve(&xsk->fill, 1, &idx), 1, "xsk_ring_prod__reserve")) { + printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr); + *xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr; + xsk_ring_prod__submit(&xsk->fill, 1); + } +} + +static int verify_xsk_metadata(struct xsk *xsk) +{ + const struct xdp_desc *rx_desc; + struct pollfd fds = {}; + struct xdp_meta *meta; + struct ethhdr *eth; + struct iphdr *iph; + __u64 comp_addr; + void *data; + __u64 addr; + __u32 idx; + int ret; + + ret = recvfrom(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, NULL); + if (!ASSERT_EQ(ret, 0, "recvfrom")) + return -1; + + fds.fd = xsk_socket__fd(xsk->socket); + fds.events = POLLIN; + + ret = poll(&fds, 1, 1000); + if (!ASSERT_GT(ret, 0, "poll")) + return -1; + + ret = xsk_ring_cons__peek(&xsk->rx, 1, &idx); + if (!ASSERT_EQ(ret, 1, "xsk_ring_cons__peek")) + return -2; + + rx_desc = xsk_ring_cons__rx_desc(&xsk->rx, idx); + comp_addr = xsk_umem__extract_addr(rx_desc->addr); + addr = xsk_umem__add_offset_to_addr(rx_desc->addr); + printf("%p: rx_desc[%u]->addr=%llx addr=%llx comp_addr=%llx\n", + xsk, idx, rx_desc->addr, addr, comp_addr); + data = xsk_umem__get_data(xsk->umem_area, addr); + + /* Make sure we got the packet offset correctly. */ + + eth = data; + ASSERT_EQ(eth->h_proto, htons(ETH_P_IP), "eth->h_proto"); + iph = (void *)(eth + 1); + ASSERT_EQ((int)iph->version, 4, "iph->version"); + + /* custom metadata */ + + meta = data - sizeof(struct xdp_meta); + + if (!ASSERT_NEQ(meta->rx_timestamp, 0, "rx_timestamp")) + return -1; + + if (!ASSERT_NEQ(meta->rx_hash, 0, "rx_hash")) + return -1; + + xsk_ring_cons__release(&xsk->rx, 1); + refill_rx(xsk, comp_addr); + + return 0; +} + +void test_xdp_metadata(void) +{ + struct xdp_metadata2 *bpf_obj2 = NULL; + struct xdp_metadata *bpf_obj = NULL; + struct bpf_program *new_prog, *prog; + struct nstoken *tok = NULL; + __u32 queue_id = QUEUE_ID; + struct bpf_map *prog_arr; + struct xsk tx_xsk = {}; + struct xsk rx_xsk = {}; + __u32 val, key = 0; + int retries = 10; + int rx_ifindex; + int tx_ifindex; + int sock_fd; + int ret; + + /* Setup new networking namespace, with a veth pair. */ + + SYS("ip netns add xdp_metadata"); + tok = open_netns("xdp_metadata"); + SYS("ip link add numtxqueues 1 numrxqueues 1 " TX_NAME + " type veth peer " RX_NAME " numtxqueues 1 numrxqueues 1"); + SYS("ip link set dev " TX_NAME " address 00:00:00:00:00:01"); + SYS("ip link set dev " RX_NAME " address 00:00:00:00:00:02"); + SYS("ip link set dev " TX_NAME " up"); + SYS("ip link set dev " RX_NAME " up"); + SYS("ip addr add " TX_ADDR "/" PREFIX_LEN " dev " TX_NAME); + SYS("ip addr add " RX_ADDR "/" PREFIX_LEN " dev " RX_NAME); + + rx_ifindex = if_nametoindex(RX_NAME); + tx_ifindex = if_nametoindex(TX_NAME); + + /* Setup separate AF_XDP for TX and RX interfaces. */ + + ret = open_xsk(tx_ifindex, &tx_xsk); + if (!ASSERT_OK(ret, "open_xsk(TX_NAME)")) + goto out; + + ret = open_xsk(rx_ifindex, &rx_xsk); + if (!ASSERT_OK(ret, "open_xsk(RX_NAME)")) + goto out; + + bpf_obj = xdp_metadata__open(); + if (!ASSERT_OK_PTR(bpf_obj, "open skeleton")) + goto out; + + prog = bpf_object__find_program_by_name(bpf_obj->obj, "rx"); + bpf_program__set_ifindex(prog, rx_ifindex); + bpf_program__set_flags(prog, BPF_F_XDP_DEV_BOUND_ONLY); + + if (!ASSERT_OK(xdp_metadata__load(bpf_obj), "load skeleton")) + goto out; + + /* Make sure we can't add dev-bound programs to prog maps. */ + prog_arr = bpf_object__find_map_by_name(bpf_obj->obj, "prog_arr"); + if (!ASSERT_OK_PTR(prog_arr, "no prog_arr map")) + goto out; + + val = bpf_program__fd(prog); + if (!ASSERT_ERR(bpf_map__update_elem(prog_arr, &key, sizeof(key), + &val, sizeof(val), BPF_ANY), + "update prog_arr")) + goto out; + + /* Attach BPF program to RX interface. */ + + ret = bpf_xdp_attach(rx_ifindex, + bpf_program__fd(bpf_obj->progs.rx), + XDP_FLAGS, NULL); + if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) + goto out; + + sock_fd = xsk_socket__fd(rx_xsk.socket); + ret = bpf_map_update_elem(bpf_map__fd(bpf_obj->maps.xsk), &queue_id, &sock_fd, 0); + if (!ASSERT_GE(ret, 0, "bpf_map_update_elem")) + goto out; + + /* Send packet destined to RX AF_XDP socket. */ + if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0, + "generate AF_XDP_CONSUMER_PORT")) + goto out; + + /* Verify AF_XDP RX packet has proper metadata. */ + if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk), 0, + "verify_xsk_metadata")) + goto out; + + complete_tx(&tx_xsk); + + /* Make sure freplace correctly picks up original bound device + * and doesn't crash. + */ + + bpf_obj2 = xdp_metadata2__open(); + if (!ASSERT_OK_PTR(bpf_obj2, "open skeleton")) + goto out; + + new_prog = bpf_object__find_program_by_name(bpf_obj2->obj, "freplace_rx"); + bpf_program__set_attach_target(new_prog, bpf_program__fd(prog), "rx"); + + if (!ASSERT_OK(xdp_metadata2__load(bpf_obj2), "load freplace skeleton")) + goto out; + + if (!ASSERT_OK(xdp_metadata2__attach(bpf_obj2), "attach freplace")) + goto out; + + /* Send packet to trigger . */ + if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0, + "generate freplace packet")) + goto out; + + while (!retries--) { + if (bpf_obj2->bss->called) + break; + usleep(10); + } + ASSERT_GT(bpf_obj2->bss->called, 0, "not called"); + +out: + close_xsk(&rx_xsk); + close_xsk(&tx_xsk); + xdp_metadata2__destroy(bpf_obj2); + xdp_metadata__destroy(bpf_obj); + if (tok) + close_netns(tok); + system("ip netns del xdp_metadata"); +} diff --git a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c new file mode 100644 index 000000000000..25b8178735ee --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include "xdp_metadata.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +struct { + __uint(type, BPF_MAP_TYPE_XSKMAP); + __uint(max_entries, 256); + __type(key, __u32); + __type(value, __u32); +} xsk SEC(".maps"); + +extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, + __u64 *timestamp) __ksym; +extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, + __u32 *hash) __ksym; + +SEC("xdp") +int rx(struct xdp_md *ctx) +{ + void *data, *data_meta, *data_end; + struct ipv6hdr *ip6h = NULL; + struct ethhdr *eth = NULL; + struct udphdr *udp = NULL; + struct iphdr *iph = NULL; + struct xdp_meta *meta; + int ret; + + data = (void *)(long)ctx->data; + data_end = (void *)(long)ctx->data_end; + eth = data; + if (eth + 1 < data_end) { + if (eth->h_proto == bpf_htons(ETH_P_IP)) { + iph = (void *)(eth + 1); + if (iph + 1 < data_end && iph->protocol == IPPROTO_UDP) + udp = (void *)(iph + 1); + } + if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { + ip6h = (void *)(eth + 1); + if (ip6h + 1 < data_end && ip6h->nexthdr == IPPROTO_UDP) + udp = (void *)(ip6h + 1); + } + if (udp && udp + 1 > data_end) + udp = NULL; + } + + if (!udp) + return XDP_PASS; + + if (udp->dest != bpf_htons(9091)) + return XDP_PASS; + + bpf_printk("forwarding UDP:9091 to AF_XDP"); + + ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(struct xdp_meta)); + if (ret != 0) { + bpf_printk("bpf_xdp_adjust_meta returned %d", ret); + return XDP_PASS; + } + + data = (void *)(long)ctx->data; + data_meta = (void *)(long)ctx->data_meta; + meta = data_meta; + + if (meta + 1 > data) { + bpf_printk("bpf_xdp_adjust_meta doesn't appear to work"); + return XDP_PASS; + } + + if (!bpf_xdp_metadata_rx_timestamp(ctx, &meta->rx_timestamp)) + bpf_printk("populated rx_timestamp with %u", meta->rx_timestamp); + + if (!bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash)) + bpf_printk("populated rx_hash with %u", meta->rx_hash); + + return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata.c b/tools/testing/selftests/bpf/progs/xdp_metadata.c new file mode 100644 index 000000000000..77678b034389 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xdp_metadata.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include "xdp_metadata.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +struct { + __uint(type, BPF_MAP_TYPE_XSKMAP); + __uint(max_entries, 4); + __type(key, __u32); + __type(value, __u32); +} xsk SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); +} prog_arr SEC(".maps"); + +extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, + __u64 *timestamp) __ksym; +extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, + __u32 *hash) __ksym; + +SEC("xdp") +int rx(struct xdp_md *ctx) +{ + void *data, *data_meta; + struct xdp_meta *meta; + u64 timestamp = -1; + int ret; + + /* Reserve enough for all custom metadata. */ + + ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(struct xdp_meta)); + if (ret != 0) + return XDP_DROP; + + data = (void *)(long)ctx->data; + data_meta = (void *)(long)ctx->data_meta; + + if (data_meta + sizeof(struct xdp_meta) > data) + return XDP_DROP; + + meta = data_meta; + + /* Export metadata. */ + + /* We expect veth bpf_xdp_metadata_rx_timestamp to return 0 HW + * timestamp, so put some non-zero value into AF_XDP frame for + * the userspace. + */ + bpf_xdp_metadata_rx_timestamp(ctx, ×tamp); + if (timestamp == 0) + meta->rx_timestamp = 1; + + bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash); + + return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata2.c b/tools/testing/selftests/bpf/progs/xdp_metadata2.c new file mode 100644 index 000000000000..cf69d05451c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xdp_metadata2.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include "xdp_metadata.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, + __u32 *hash) __ksym; + +int called; + +SEC("freplace/rx") +int freplace_rx(struct xdp_md *ctx) +{ + u32 hash = 0; + /* Call _any_ metadata function to make sure we don't crash. */ + bpf_xdp_metadata_rx_hash(ctx, &hash); + called++; + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index 7cb1bc05e5cf..40cba8d368d9 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -1039,7 +1039,7 @@ try: offload = bpf_pinned("/sys/fs/bpf/offload") ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True) fail(ret == 0, "attached offloaded XDP program to drv") - check_extack(err, "Using device-bound program without HW_MODE flag is not supported.", args) + check_extack(err, "Using offloaded program without HW_MODE flag is not supported.", args) rm("/sys/fs/bpf/offload") sim.wait_for_flush() @@ -1088,12 +1088,12 @@ try: ret, _, err = sim.set_xdp(pinned, "offload", fail=False, include_stderr=True) fail(ret == 0, "Pinned program loaded for a different device accepted") - check_extack_nsim(err, "program bound to different dev.", args) + check_extack(err, "Program bound to different device.", args) simdev2.remove() ret, _, err = sim.set_xdp(pinned, "offload", fail=False, include_stderr=True) fail(ret == 0, "Pinned program loaded for a removed device accepted") - check_extack_nsim(err, "xdpoffload of non-bound program.", args) + check_extack(err, "Program bound to different device.", args) rm(pin_file) bpftool_prog_list_wait(expected=0) @@ -1334,12 +1334,12 @@ try: ret, _, err = simA.set_xdp(progB, "offload", force=True, JSON=False, fail=False, include_stderr=True) fail(ret == 0, "cross-ASIC program allowed") - check_extack_nsim(err, "program bound to different dev.", args) + check_extack(err, "Program bound to different device.", args) for d in simdevB.nsims: ret, _, err = d.set_xdp(progA, "offload", force=True, JSON=False, fail=False, include_stderr=True) fail(ret == 0, "cross-ASIC program allowed") - check_extack_nsim(err, "program bound to different dev.", args) + check_extack(err, "Program bound to different device.", args) start_test("Test multi-dev ASIC cross-dev map reuse...") diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c new file mode 100644 index 000000000000..0008f0f239e8 --- /dev/null +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -0,0 +1,403 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Reference program for verifying XDP metadata on real HW. Functional test + * only, doesn't test the performance. + * + * RX: + * - UDP 9091 packets are diverted into AF_XDP + * - Metadata verified: + * - rx_timestamp + * - rx_hash + * + * TX: + * - TBD + */ + +#include <test_progs.h> +#include <network_helpers.h> +#include "xdp_hw_metadata.skel.h" +#include "xsk.h" + +#include <error.h> +#include <linux/errqueue.h> +#include <linux/if_link.h> +#include <linux/net_tstamp.h> +#include <linux/udp.h> +#include <linux/sockios.h> +#include <sys/mman.h> +#include <net/if.h> +#include <poll.h> + +#include "xdp_metadata.h" + +#define UMEM_NUM 16 +#define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE +#define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM) +#define XDP_FLAGS (XDP_FLAGS_DRV_MODE | XDP_FLAGS_REPLACE) + +struct xsk { + void *umem_area; + struct xsk_umem *umem; + struct xsk_ring_prod fill; + struct xsk_ring_cons comp; + struct xsk_ring_prod tx; + struct xsk_ring_cons rx; + struct xsk_socket *socket; +}; + +struct xdp_hw_metadata *bpf_obj; +struct xsk *rx_xsk; +const char *ifname; +int ifindex; +int rxq; + +void test__fail(void) { /* for network_helpers.c */ } + +static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id) +{ + int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; + const struct xsk_socket_config socket_config = { + .rx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .bind_flags = XDP_COPY, + }; + const struct xsk_umem_config umem_config = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, + .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG, + }; + __u32 idx; + u64 addr; + int ret; + int i; + + xsk->umem_area = mmap(NULL, UMEM_SIZE, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); + if (xsk->umem_area == MAP_FAILED) + return -ENOMEM; + + ret = xsk_umem__create(&xsk->umem, + xsk->umem_area, UMEM_SIZE, + &xsk->fill, + &xsk->comp, + &umem_config); + if (ret) + return ret; + + ret = xsk_socket__create(&xsk->socket, ifindex, queue_id, + xsk->umem, + &xsk->rx, + &xsk->tx, + &socket_config); + if (ret) + return ret; + + /* First half of umem is for TX. This way address matches 1-to-1 + * to the completion queue index. + */ + + for (i = 0; i < UMEM_NUM / 2; i++) { + addr = i * UMEM_FRAME_SIZE; + printf("%p: tx_desc[%d] -> %lx\n", xsk, i, addr); + } + + /* Second half of umem is for RX. */ + + ret = xsk_ring_prod__reserve(&xsk->fill, UMEM_NUM / 2, &idx); + for (i = 0; i < UMEM_NUM / 2; i++) { + addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE; + printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr); + *xsk_ring_prod__fill_addr(&xsk->fill, i) = addr; + } + xsk_ring_prod__submit(&xsk->fill, ret); + + return 0; +} + +static void close_xsk(struct xsk *xsk) +{ + if (xsk->umem) + xsk_umem__delete(xsk->umem); + if (xsk->socket) + xsk_socket__delete(xsk->socket); + munmap(xsk->umem, UMEM_SIZE); +} + +static void refill_rx(struct xsk *xsk, __u64 addr) +{ + __u32 idx; + + if (xsk_ring_prod__reserve(&xsk->fill, 1, &idx) == 1) { + printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr); + *xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr; + xsk_ring_prod__submit(&xsk->fill, 1); + } +} + +static void verify_xdp_metadata(void *data) +{ + struct xdp_meta *meta; + + meta = data - sizeof(*meta); + + printf("rx_timestamp: %llu\n", meta->rx_timestamp); + printf("rx_hash: %u\n", meta->rx_hash); +} + +static void verify_skb_metadata(int fd) +{ + char cmsg_buf[1024]; + char packet_buf[128]; + + struct scm_timestamping *ts; + struct iovec packet_iov; + struct cmsghdr *cmsg; + struct msghdr hdr; + + memset(&hdr, 0, sizeof(hdr)); + hdr.msg_iov = &packet_iov; + hdr.msg_iovlen = 1; + packet_iov.iov_base = packet_buf; + packet_iov.iov_len = sizeof(packet_buf); + + hdr.msg_control = cmsg_buf; + hdr.msg_controllen = sizeof(cmsg_buf); + + if (recvmsg(fd, &hdr, 0) < 0) + error(-1, errno, "recvmsg"); + + for (cmsg = CMSG_FIRSTHDR(&hdr); cmsg != NULL; + cmsg = CMSG_NXTHDR(&hdr, cmsg)) { + + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + + switch (cmsg->cmsg_type) { + case SCM_TIMESTAMPING: + ts = (struct scm_timestamping *)CMSG_DATA(cmsg); + if (ts->ts[2].tv_sec || ts->ts[2].tv_nsec) { + printf("found skb hwtstamp = %lu.%lu\n", + ts->ts[2].tv_sec, ts->ts[2].tv_nsec); + return; + } + break; + default: + break; + } + } + + printf("skb hwtstamp is not found!\n"); +} + +static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd) +{ + const struct xdp_desc *rx_desc; + struct pollfd fds[rxq + 1]; + __u64 comp_addr; + __u64 addr; + __u32 idx; + int ret; + int i; + + for (i = 0; i < rxq; i++) { + fds[i].fd = xsk_socket__fd(rx_xsk[i].socket); + fds[i].events = POLLIN; + fds[i].revents = 0; + } + + fds[rxq].fd = server_fd; + fds[rxq].events = POLLIN; + fds[rxq].revents = 0; + + while (true) { + errno = 0; + ret = poll(fds, rxq + 1, 1000); + printf("poll: %d (%d)\n", ret, errno); + if (ret < 0) + break; + if (ret == 0) + continue; + + if (fds[rxq].revents) + verify_skb_metadata(server_fd); + + for (i = 0; i < rxq; i++) { + if (fds[i].revents == 0) + continue; + + struct xsk *xsk = &rx_xsk[i]; + + ret = xsk_ring_cons__peek(&xsk->rx, 1, &idx); + printf("xsk_ring_cons__peek: %d\n", ret); + if (ret != 1) + continue; + + rx_desc = xsk_ring_cons__rx_desc(&xsk->rx, idx); + comp_addr = xsk_umem__extract_addr(rx_desc->addr); + addr = xsk_umem__add_offset_to_addr(rx_desc->addr); + printf("%p: rx_desc[%u]->addr=%llx addr=%llx comp_addr=%llx\n", + xsk, idx, rx_desc->addr, addr, comp_addr); + verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr)); + xsk_ring_cons__release(&xsk->rx, 1); + refill_rx(xsk, comp_addr); + } + } + + return 0; +} + +struct ethtool_channels { + __u32 cmd; + __u32 max_rx; + __u32 max_tx; + __u32 max_other; + __u32 max_combined; + __u32 rx_count; + __u32 tx_count; + __u32 other_count; + __u32 combined_count; +}; + +#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ + +static int rxq_num(const char *ifname) +{ + struct ethtool_channels ch = { + .cmd = ETHTOOL_GCHANNELS, + }; + + struct ifreq ifr = { + .ifr_data = (void *)&ch, + }; + strcpy(ifr.ifr_name, ifname); + int fd, ret; + + fd = socket(AF_UNIX, SOCK_DGRAM, 0); + if (fd < 0) + error(-1, errno, "socket"); + + ret = ioctl(fd, SIOCETHTOOL, &ifr); + if (ret < 0) + error(-1, errno, "socket"); + + close(fd); + + return ch.rx_count + ch.combined_count; +} + +static void cleanup(void) +{ + LIBBPF_OPTS(bpf_xdp_attach_opts, opts); + int ret; + int i; + + if (bpf_obj) { + opts.old_prog_fd = bpf_program__fd(bpf_obj->progs.rx); + if (opts.old_prog_fd >= 0) { + printf("detaching bpf program....\n"); + ret = bpf_xdp_detach(ifindex, XDP_FLAGS, &opts); + if (ret) + printf("failed to detach XDP program: %d\n", ret); + } + } + + for (i = 0; i < rxq; i++) + close_xsk(&rx_xsk[i]); + + if (bpf_obj) + xdp_hw_metadata__destroy(bpf_obj); +} + +static void handle_signal(int sig) +{ + /* interrupting poll() is all we need */ +} + +static void timestamping_enable(int fd, int val) +{ + int ret; + + ret = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); + if (ret < 0) + error(-1, errno, "setsockopt(SO_TIMESTAMPING)"); +} + +int main(int argc, char *argv[]) +{ + int server_fd = -1; + int ret; + int i; + + struct bpf_program *prog; + + if (argc != 2) { + fprintf(stderr, "pass device name\n"); + return -1; + } + + ifname = argv[1]; + ifindex = if_nametoindex(ifname); + rxq = rxq_num(ifname); + + printf("rxq: %d\n", rxq); + + rx_xsk = malloc(sizeof(struct xsk) * rxq); + if (!rx_xsk) + error(-1, ENOMEM, "malloc"); + + for (i = 0; i < rxq; i++) { + printf("open_xsk(%s, %p, %d)\n", ifname, &rx_xsk[i], i); + ret = open_xsk(ifindex, &rx_xsk[i], i); + if (ret) + error(-1, -ret, "open_xsk"); + + printf("xsk_socket__fd() -> %d\n", xsk_socket__fd(rx_xsk[i].socket)); + } + + printf("open bpf program...\n"); + bpf_obj = xdp_hw_metadata__open(); + if (libbpf_get_error(bpf_obj)) + error(-1, libbpf_get_error(bpf_obj), "xdp_hw_metadata__open"); + + prog = bpf_object__find_program_by_name(bpf_obj->obj, "rx"); + bpf_program__set_ifindex(prog, ifindex); + bpf_program__set_flags(prog, BPF_F_XDP_DEV_BOUND_ONLY); + + printf("load bpf program...\n"); + ret = xdp_hw_metadata__load(bpf_obj); + if (ret) + error(-1, -ret, "xdp_hw_metadata__load"); + + printf("prepare skb endpoint...\n"); + server_fd = start_server(AF_INET6, SOCK_DGRAM, NULL, 9092, 1000); + if (server_fd < 0) + error(-1, errno, "start_server"); + timestamping_enable(server_fd, + SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_RAW_HARDWARE); + + printf("prepare xsk map...\n"); + for (i = 0; i < rxq; i++) { + int sock_fd = xsk_socket__fd(rx_xsk[i].socket); + __u32 queue_id = i; + + printf("map[%d] = %d\n", queue_id, sock_fd); + ret = bpf_map_update_elem(bpf_map__fd(bpf_obj->maps.xsk), &queue_id, &sock_fd, 0); + if (ret) + error(-1, -ret, "bpf_map_update_elem"); + } + + printf("attach bpf program...\n"); + ret = bpf_xdp_attach(ifindex, + bpf_program__fd(bpf_obj->progs.rx), + XDP_FLAGS, NULL); + if (ret) + error(-1, -ret, "bpf_xdp_attach"); + + signal(SIGINT, handle_signal); + ret = verify_metadata(rx_xsk, rxq, server_fd); + close(server_fd); + cleanup(); + if (ret) + error(-1, -ret, "verify_metadata"); +} diff --git a/tools/testing/selftests/bpf/xdp_metadata.h b/tools/testing/selftests/bpf/xdp_metadata.h new file mode 100644 index 000000000000..f6780fbb0a21 --- /dev/null +++ b/tools/testing/selftests/bpf/xdp_metadata.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#pragma once + +#ifndef ETH_P_IP +#define ETH_P_IP 0x0800 +#endif + +#ifndef ETH_P_IPV6 +#define ETH_P_IPV6 0x86DD +#endif + +struct xdp_meta { + __u64 rx_timestamp; + __u32 rx_hash; +}; |
