summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/infiniband/hw/mlx4/qp.c11
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_ethtool.c9
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_netdev.c125
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_rx.c124
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_tx.c274
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/mlx4_en.h44
-rw-r--r--include/linux/bpf.h1
-rw-r--r--include/linux/filter.h18
-rw-r--r--include/linux/mlx4/qp.h18
-rw-r--r--include/linux/netdevice.h34
-rw-r--r--include/uapi/linux/bpf.h21
-rw-r--r--include/uapi/linux/if_link.h12
-rw-r--r--kernel/bpf/syscall.c12
-rw-r--r--kernel/bpf/verifier.c18
-rw-r--r--net/core/dev.c33
-rw-r--r--net/core/filter.c79
-rw-r--r--net/core/rtnetlink.c64
-rw-r--r--samples/bpf/Makefile9
-rw-r--r--samples/bpf/bpf_load.c8
-rw-r--r--samples/bpf/xdp1_kern.c93
-rw-r--r--samples/bpf/xdp1_user.c181
-rw-r--r--samples/bpf/xdp2_kern.c114
22 files changed, 1206 insertions, 96 deletions
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 8db8405c1e99..768085f59566 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -232,7 +232,7 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
}
} else {
ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
- s = (ctrl->fence_size & 0x3f) << 4;
+ s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4;
for (i = 64; i < s; i += 64) {
wqe = buf + i;
*wqe = cpu_to_be32(0xffffffff);
@@ -264,7 +264,7 @@ static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
}
ctrl->srcrb_flags = 0;
- ctrl->fence_size = size / 16;
+ ctrl->qpn_vlan.fence_size = size / 16;
/*
* Make sure descriptor is fully written before setting ownership bit
* (because HW can start executing as soon as we do).
@@ -1992,7 +1992,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
ctrl = get_send_wqe(qp, i);
ctrl->owner_opcode = cpu_to_be32(1 << 31);
if (qp->sq_max_wqes_per_wr == 1)
- ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
+ ctrl->qpn_vlan.fence_size =
+ 1 << (qp->sq.wqe_shift - 4);
stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
}
@@ -3169,8 +3170,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
wmb();
*lso_wqe = lso_hdr_sz;
- ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
- MLX4_WQE_CTRL_FENCE : 0) | size;
+ ctrl->qpn_vlan.fence_size = (wr->send_flags & IB_SEND_FENCE ?
+ MLX4_WQE_CTRL_FENCE : 0) | size;
/*
* Make sure descriptor is fully written before
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 51a2e8252b82..f32e272c83dd 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -1722,6 +1722,12 @@ static int mlx4_en_set_channels(struct net_device *dev,
!channel->tx_count || !channel->rx_count)
return -EINVAL;
+ if (channel->tx_count * MLX4_EN_NUM_UP <= priv->xdp_ring_num) {
+ en_err(priv, "Minimum %d tx channels required with XDP on\n",
+ priv->xdp_ring_num / MLX4_EN_NUM_UP + 1);
+ return -EINVAL;
+ }
+
mutex_lock(&mdev->state_lock);
if (priv->port_up) {
port_up = 1;
@@ -1740,7 +1746,8 @@ static int mlx4_en_set_channels(struct net_device *dev,
goto out;
}
- netif_set_real_num_tx_queues(dev, priv->tx_ring_num);
+ netif_set_real_num_tx_queues(dev, priv->tx_ring_num -
+ priv->xdp_ring_num);
netif_set_real_num_rx_queues(dev, priv->rx_ring_num);
if (dev->num_tc)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 6083775dae16..9abbba6c1475 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -31,6 +31,7 @@
*
*/
+#include <linux/bpf.h>
#include <linux/etherdevice.h>
#include <linux/tcp.h>
#include <linux/if_vlan.h>
@@ -1521,6 +1522,24 @@ static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx)
free_cpumask_var(priv->rx_ring[ring_idx]->affinity_mask);
}
+static void mlx4_en_init_recycle_ring(struct mlx4_en_priv *priv,
+ int tx_ring_idx)
+{
+ struct mlx4_en_tx_ring *tx_ring = priv->tx_ring[tx_ring_idx];
+ int rr_index;
+
+ rr_index = (priv->xdp_ring_num - priv->tx_ring_num) + tx_ring_idx;
+ if (rr_index >= 0) {
+ tx_ring->free_tx_desc = mlx4_en_recycle_tx_desc;
+ tx_ring->recycle_ring = priv->rx_ring[rr_index];
+ en_dbg(DRV, priv,
+ "Set tx_ring[%d]->recycle_ring = rx_ring[%d]\n",
+ tx_ring_idx, rr_index);
+ } else {
+ tx_ring->recycle_ring = NULL;
+ }
+}
+
int mlx4_en_start_port(struct net_device *dev)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -1643,6 +1662,8 @@ int mlx4_en_start_port(struct net_device *dev)
}
tx_ring->tx_queue = netdev_get_tx_queue(dev, i);
+ mlx4_en_init_recycle_ring(priv, i);
+
/* Arm CQ for TX completions */
mlx4_en_arm_cq(priv, cq);
@@ -2112,6 +2133,11 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
en_err(priv, "Bad MTU size:%d.\n", new_mtu);
return -EPERM;
}
+ if (priv->xdp_ring_num && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) {
+ en_err(priv, "MTU size:%d requires frags but XDP running\n",
+ new_mtu);
+ return -EOPNOTSUPP;
+ }
dev->mtu = new_mtu;
if (netif_running(dev)) {
@@ -2520,6 +2546,103 @@ static int mlx4_en_set_tx_maxrate(struct net_device *dev, int queue_index, u32 m
return err;
}
+static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct bpf_prog *old_prog;
+ int xdp_ring_num;
+ int port_up = 0;
+ int err;
+ int i;
+
+ xdp_ring_num = prog ? ALIGN(priv->rx_ring_num, MLX4_EN_NUM_UP) : 0;
+
+ /* No need to reconfigure buffers when simply swapping the
+ * program for a new one.
+ */
+ if (priv->xdp_ring_num == xdp_ring_num) {
+ if (prog) {
+ prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ }
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ /* This xchg is paired with READ_ONCE in the fastpath */
+ old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+ }
+ return 0;
+ }
+
+ if (priv->num_frags > 1) {
+ en_err(priv, "Cannot set XDP if MTU requires multiple frags\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (priv->tx_ring_num < xdp_ring_num + MLX4_EN_NUM_UP) {
+ en_err(priv,
+ "Minimum %d tx channels required to run XDP\n",
+ (xdp_ring_num + MLX4_EN_NUM_UP) / MLX4_EN_NUM_UP);
+ return -EINVAL;
+ }
+
+ if (prog) {
+ prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ }
+
+ mutex_lock(&mdev->state_lock);
+ if (priv->port_up) {
+ port_up = 1;
+ mlx4_en_stop_port(dev, 1);
+ }
+
+ priv->xdp_ring_num = xdp_ring_num;
+ netif_set_real_num_tx_queues(dev, priv->tx_ring_num -
+ priv->xdp_ring_num);
+
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+ }
+
+ if (port_up) {
+ err = mlx4_en_start_port(dev);
+ if (err) {
+ en_err(priv, "Failed starting port %d for XDP change\n",
+ priv->port);
+ queue_work(mdev->workqueue, &priv->watchdog_task);
+ }
+ }
+
+ mutex_unlock(&mdev->state_lock);
+ return 0;
+}
+
+static bool mlx4_xdp_attached(struct net_device *dev)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+
+ return !!priv->xdp_ring_num;
+}
+
+static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+ switch (xdp->command) {
+ case XDP_SETUP_PROG:
+ return mlx4_xdp_set(dev, xdp->prog);
+ case XDP_QUERY_PROG:
+ xdp->prog_attached = mlx4_xdp_attached(dev);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
static const struct net_device_ops mlx4_netdev_ops = {
.ndo_open = mlx4_en_open,
.ndo_stop = mlx4_en_close,
@@ -2548,6 +2671,7 @@ static const struct net_device_ops mlx4_netdev_ops = {
.ndo_udp_tunnel_del = mlx4_en_del_vxlan_port,
.ndo_features_check = mlx4_en_features_check,
.ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate,
+ .ndo_xdp = mlx4_xdp,
};
static const struct net_device_ops mlx4_netdev_ops_master = {
@@ -2584,6 +2708,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
.ndo_udp_tunnel_del = mlx4_en_del_vxlan_port,
.ndo_features_check = mlx4_en_features_check,
.ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate,
+ .ndo_xdp = mlx4_xdp,
};
struct mlx4_en_bond {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index c1b3a9c8cf3b..11d88c817137 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -32,6 +32,7 @@
*/
#include <net/busy_poll.h>
+#include <linux/bpf.h>
#include <linux/mlx4/cq.h>
#include <linux/slab.h>
#include <linux/mlx4/qp.h>
@@ -57,7 +58,7 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
struct page *page;
dma_addr_t dma;
- for (order = MLX4_EN_ALLOC_PREFER_ORDER; ;) {
+ for (order = frag_info->order; ;) {
gfp_t gfp = _gfp;
if (order)
@@ -70,7 +71,7 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
return -ENOMEM;
}
dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order,
- PCI_DMA_FROMDEVICE);
+ frag_info->dma_dir);
if (dma_mapping_error(priv->ddev, dma)) {
put_page(page);
return -ENOMEM;
@@ -124,7 +125,8 @@ out:
while (i--) {
if (page_alloc[i].page != ring_alloc[i].page) {
dma_unmap_page(priv->ddev, page_alloc[i].dma,
- page_alloc[i].page_size, PCI_DMA_FROMDEVICE);
+ page_alloc[i].page_size,
+ priv->frag_info[i].dma_dir);
page = page_alloc[i].page;
/* Revert changes done by mlx4_alloc_pages */
page_ref_sub(page, page_alloc[i].page_size /
@@ -145,7 +147,7 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
if (next_frag_end > frags[i].page_size)
dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
- PCI_DMA_FROMDEVICE);
+ frag_info->dma_dir);
if (frags[i].page)
put_page(frags[i].page);
@@ -176,7 +178,8 @@ out:
page_alloc = &ring->page_alloc[i];
dma_unmap_page(priv->ddev, page_alloc->dma,
- page_alloc->page_size, PCI_DMA_FROMDEVICE);
+ page_alloc->page_size,
+ priv->frag_info[i].dma_dir);
page = page_alloc->page;
/* Revert changes done by mlx4_alloc_pages */
page_ref_sub(page, page_alloc->page_size /
@@ -201,7 +204,7 @@ static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
i, page_count(page_alloc->page));
dma_unmap_page(priv->ddev, page_alloc->dma,
- page_alloc->page_size, PCI_DMA_FROMDEVICE);
+ page_alloc->page_size, frag_info->dma_dir);
while (page_alloc->page_offset + frag_info->frag_stride <
page_alloc->page_size) {
put_page(page_alloc->page);
@@ -244,6 +247,12 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
struct mlx4_en_rx_alloc *frags = ring->rx_info +
(index << priv->log_rx_info);
+ if (ring->page_cache.index > 0) {
+ frags[0] = ring->page_cache.buf[--ring->page_cache.index];
+ rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
+ return 0;
+ }
+
return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc, gfp);
}
@@ -502,6 +511,24 @@ void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv)
}
}
+/* When the rx ring is running in page-per-packet mode, a released frame can go
+ * directly into a small cache, to avoid unmapping or touching the page
+ * allocator. In bpf prog performance scenarios, buffers are either forwarded
+ * or dropped, never converted to skbs, so every page can come directly from
+ * this cache when it is sized to be a multiple of the napi budget.
+ */
+bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring,
+ struct mlx4_en_rx_alloc *frame)
+{
+ struct mlx4_en_page_cache *cache = &ring->page_cache;
+
+ if (cache->index >= MLX4_EN_CACHE_SIZE)
+ return false;
+
+ cache->buf[cache->index++] = *frame;
+ return true;
+}
+
void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
struct mlx4_en_rx_ring **pring,
u32 size, u16 stride)
@@ -509,6 +536,8 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
struct mlx4_en_dev *mdev = priv->mdev;
struct mlx4_en_rx_ring *ring = *pring;
+ if (ring->xdp_prog)
+ bpf_prog_put(ring->xdp_prog);
mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
vfree(ring->rx_info);
ring->rx_info = NULL;
@@ -522,6 +551,16 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
struct mlx4_en_rx_ring *ring)
{
+ int i;
+
+ for (i = 0; i < ring->page_cache.index; i++) {
+ struct mlx4_en_rx_alloc *frame = &ring->page_cache.buf[i];
+
+ dma_unmap_page(priv->ddev, frame->dma, frame->page_size,
+ priv->frag_info[0].dma_dir);
+ put_page(frame->page);
+ }
+ ring->page_cache.index = 0;
mlx4_en_free_rx_buf(priv, ring);
if (ring->stride <= TXBB_SIZE)
ring->buf -= TXBB_SIZE;
@@ -743,7 +782,10 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring];
struct mlx4_en_rx_alloc *frags;
struct mlx4_en_rx_desc *rx_desc;
+ struct bpf_prog *xdp_prog;
+ int doorbell_pending;
struct sk_buff *skb;
+ int tx_index;
int index;
int nr;
unsigned int length;
@@ -759,6 +801,10 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
if (budget <= 0)
return polled;
+ xdp_prog = READ_ONCE(ring->xdp_prog);
+ doorbell_pending = 0;
+ tx_index = (priv->tx_ring_num - priv->xdp_ring_num) + cq->ring;
+
/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
* descriptor offset can be deduced from the CQE index instead of
* reading 'cqe->index' */
@@ -835,6 +881,43 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
+ /* A bpf program gets first chance to drop the packet. It may
+ * read bytes but not past the end of the frag.
+ */
+ if (xdp_prog) {
+ struct xdp_buff xdp;
+ dma_addr_t dma;
+ u32 act;
+
+ dma = be64_to_cpu(rx_desc->data[0].addr);
+ dma_sync_single_for_cpu(priv->ddev, dma,
+ priv->frag_info[0].frag_size,
+ DMA_FROM_DEVICE);
+
+ xdp.data = page_address(frags[0].page) +
+ frags[0].page_offset;
+ xdp.data_end = xdp.data + length;
+
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ switch (act) {
+ case XDP_PASS:
+ break;
+ case XDP_TX:
+ if (!mlx4_en_xmit_frame(frags, dev,
+ length, tx_index,
+ &doorbell_pending))
+ goto consumed;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ case XDP_ABORTED:
+ case XDP_DROP:
+ if (mlx4_en_rx_recycle(ring, frags))
+ goto consumed;
+ goto next;
+ }
+ }
+
if (likely(dev->features & NETIF_F_RXCSUM)) {
if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
MLX4_CQE_STATUS_UDP)) {
@@ -986,6 +1069,7 @@ next:
for (nr = 0; nr < priv->num_frags; nr++)
mlx4_en_free_frag(priv, frags, nr);
+consumed:
++cq->mcq.cons_index;
index = (cq->mcq.cons_index) & ring->size_mask;
cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
@@ -994,6 +1078,9 @@ next:
}
out:
+ if (doorbell_pending)
+ mlx4_en_xmit_doorbell(priv->tx_ring[tx_index]);
+
AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled);
mlx4_cq_set_ci(&cq->mcq);
wmb(); /* ensure HW sees CQ consumer before we post new buffers */
@@ -1061,22 +1148,35 @@ static const int frag_sizes[] = {
void mlx4_en_calc_rx_buf(struct net_device *dev)
{
+ enum dma_data_direction dma_dir = PCI_DMA_FROMDEVICE;
struct mlx4_en_priv *priv = netdev_priv(dev);
- /* VLAN_HLEN is added twice,to support skb vlan tagged with multiple
- * headers. (For example: ETH_P_8021Q and ETH_P_8021AD).
- */
- int eff_mtu = dev->mtu + ETH_HLEN + (2 * VLAN_HLEN);
+ int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
+ int order = MLX4_EN_ALLOC_PREFER_ORDER;
+ u32 align = SMP_CACHE_BYTES;
int buf_size = 0;
int i = 0;
+ /* bpf requires buffers to be set up as 1 packet per page.
+ * This only works when num_frags == 1.
+ */
+ if (priv->xdp_ring_num) {
+ dma_dir = PCI_DMA_BIDIRECTIONAL;
+ /* This will gain efficient xdp frame recycling at the expense
+ * of more costly truesize accounting
+ */
+ align = PAGE_SIZE;
+ order = 0;
+ }
+
while (buf_size < eff_mtu) {
+ priv->frag_info[i].order = order;
priv->frag_info[i].frag_size =
(eff_mtu > buf_size + frag_sizes[i]) ?
frag_sizes[i] : eff_mtu - buf_size;
priv->frag_info[i].frag_prefix_size = buf_size;
priv->frag_info[i].frag_stride =
- ALIGN(priv->frag_info[i].frag_size,
- SMP_CACHE_BYTES);
+ ALIGN(priv->frag_info[i].frag_size, align);
+ priv->frag_info[i].dma_dir = dma_dir;
buf_size += priv->frag_info[i].frag_size;
i++;
}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 76aa4d27183c..9df87ca0515a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -196,6 +196,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
ring->last_nr_txbb = 1;
memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
memset(ring->buf, 0, ring->buf_size);
+ ring->free_tx_desc = mlx4_en_free_tx_desc;
ring->qp_state = MLX4_QP_STATE_RST;
ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8);
@@ -265,10 +266,10 @@ static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
}
-static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
- struct mlx4_en_tx_ring *ring,
- int index, u8 owner, u64 timestamp,
- int napi_mode)
+u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring,
+ int index, u8 owner, u64 timestamp,
+ int napi_mode)
{
struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
@@ -344,6 +345,27 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
return tx_info->nr_txbb;
}
+u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring,
+ int index, u8 owner, u64 timestamp,
+ int napi_mode)
+{
+ struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
+ struct mlx4_en_rx_alloc frame = {
+ .page = tx_info->page,
+ .dma = tx_info->map0_dma,
+ .page_offset = 0,
+ .page_size = PAGE_SIZE,
+ };
+
+ if (!mlx4_en_rx_recycle(ring->recycle_ring, &frame)) {
+ dma_unmap_page(priv->ddev, tx_info->map0_dma,
+ PAGE_SIZE, priv->frag_info[0].dma_dir);
+ put_page(tx_info->page);
+ }
+
+ return tx_info->nr_txbb;
+}
int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
{
@@ -362,7 +384,7 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
}
while (ring->cons != ring->prod) {
- ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
+ ring->last_nr_txbb = ring->free_tx_desc(priv, ring,
ring->cons & ring->size_mask,
!!(ring->cons & ring->size), 0,
0 /* Non-NAPI caller */);
@@ -444,7 +466,7 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
timestamp = mlx4_en_get_cqe_ts(cqe);
/* free next descriptor */
- last_nr_txbb = mlx4_en_free_tx_desc(
+ last_nr_txbb = ring->free_tx_desc(
priv, ring, ring_index,
!!((ring_cons + txbbs_skipped) &
ring->size), timestamp, napi_budget);
@@ -476,6 +498,9 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
+ if (ring->free_tx_desc == mlx4_en_recycle_tx_desc)
+ return done < budget;
+
netdev_tx_completed_queue(ring->tx_queue, packets, bytes);
/* Wakeup Tx queue if this stopped, and ring is not full.
@@ -631,8 +656,7 @@ static int get_real_size(const struct sk_buff *skb,
static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
const struct sk_buff *skb,
const struct skb_shared_info *shinfo,
- int real_size, u16 *vlan_tag,
- int tx_ind, void *fragptr)
+ void *fragptr)
{
struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
@@ -700,10 +724,66 @@ static void mlx4_bf_copy(void __iomem *dst, const void *src,
__iowrite64_copy(dst, src, bytecnt / 8);
}
+void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring)
+{
+ wmb();
+ /* Since there is no iowrite*_native() that writes the
+ * value as is, without byteswapping - using the one
+ * the doesn't do byteswapping in the relevant arch
+ * endianness.
+ */
+#if defined(__LITTLE_ENDIAN)
+ iowrite32(
+#else
+ iowrite32be(
+#endif
+ ring->doorbell_qpn,
+ ring->bf.uar->map + MLX4_SEND_DOORBELL);
+}
+
+static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring,
+ struct mlx4_en_tx_desc *tx_desc,
+ union mlx4_wqe_qpn_vlan qpn_vlan,
+ int desc_size, int bf_index,
+ __be32 op_own, bool bf_ok,
+ bool send_doorbell)
+{
+ tx_desc->ctrl.qpn_vlan = qpn_vlan;
+
+ if (bf_ok) {
+ op_own |= htonl((bf_index & 0xffff) << 8);
+ /* Ensure new descriptor hits memory
+ * before setting ownership of this descriptor to HW
+ */
+ dma_wmb();
+ tx_desc->ctrl.owner_opcode = op_own;
+
+ wmb();
+
+ mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
+ desc_size);
+
+ wmb();
+
+ ring->bf.offset ^= ring->bf.buf_size;
+ } else {
+ /* Ensure new descriptor hits memory
+ * before setting ownership of this descriptor to HW
+ */
+ dma_wmb();
+ tx_desc->ctrl.owner_opcode = op_own;
+ if (send_doorbell)
+ mlx4_en_xmit_doorbell(ring);
+ else
+ ring->xmit_more++;
+ }
+}
+
netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct mlx4_en_priv *priv = netdev_priv(dev);
+ union mlx4_wqe_qpn_vlan qpn_vlan = {};
struct device *ddev = priv->ddev;
struct mlx4_en_tx_ring *ring;
struct mlx4_en_tx_desc *tx_desc;
@@ -715,7 +795,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
int real_size;
u32 index, bf_index;
__be32 op_own;
- u16 vlan_tag = 0;
u16 vlan_proto = 0;
int i_frag;
int lso_header_size;
@@ -725,6 +804,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
bool stop_queue;
bool inline_ok;
u32 ring_cons;
+ bool bf_ok;
tx_ind = skb_get_queue_mapping(skb);
ring = priv->tx_ring[tx_ind];
@@ -749,9 +829,17 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
goto tx_drop;
}
+ bf_ok = ring->bf_enabled;
if (skb_vlan_tag_present(skb)) {
- vlan_tag = skb_vlan_tag_get(skb);
+ qpn_vlan.vlan_tag = cpu_to_be16(skb_vlan_tag_get(skb));
vlan_proto = be16_to_cpu(skb->vlan_proto);
+ if (vlan_proto == ETH_P_8021AD)
+ qpn_vlan.ins_vlan = MLX4_WQE_CTRL_INS_SVLAN;
+ else if (vlan_proto == ETH_P_8021Q)
+ qpn_vlan.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN;
+ else
+ qpn_vlan.ins_vlan = 0;
+ bf_ok = false;
}
netdev_txq_bql_enqueue_prefetchw(ring->tx_queue);
@@ -771,6 +859,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
else {
tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
bounce = true;
+ bf_ok = false;
}
/* Save skb in tx_info ring */
@@ -907,8 +996,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len);
if (tx_info->inl)
- build_inline_wqe(tx_desc, skb, shinfo, real_size, &vlan_tag,
- tx_ind, fragptr);
+ build_inline_wqe(tx_desc, skb, shinfo, fragptr);
if (skb->encapsulation) {
union {
@@ -946,60 +1034,15 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
real_size = (real_size / 16) & 0x3f;
- if (ring->bf_enabled && desc_size <= MAX_BF && !bounce &&
- !skb_vlan_tag_present(skb) && send_doorbell) {
- tx_desc->ctrl.bf_qpn = ring->doorbell_qpn |
- cpu_to_be32(real_size);
-
- op_own |= htonl((bf_index & 0xffff) << 8);
- /* Ensure new descriptor hits memory
- * before setting ownership of this descriptor to HW
- */
- dma_wmb();
- tx_desc->ctrl.owner_opcode = op_own;
-
- wmb();
+ bf_ok &= desc_size <= MAX_BF && send_doorbell;
- mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
- desc_size);
-
- wmb();
-
- ring->bf.offset ^= ring->bf.buf_size;
- } else {
- tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
- if (vlan_proto == ETH_P_8021AD)
- tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_SVLAN;
- else if (vlan_proto == ETH_P_8021Q)
- tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN;
- else
- tx_desc->ctrl.ins_vlan = 0;
-
- tx_desc->ctrl.fence_size = real_size;
+ if (bf_ok)
+ qpn_vlan.bf_qpn = ring->doorbell_qpn | cpu_to_be32(real_size);
+ else
+ qpn_vlan.fence_size = real_size;
- /* Ensure new descriptor hits memory
- * before setting ownership of this descriptor to HW
- */
- dma_wmb();
- tx_desc->ctrl.owner_opcode = op_own;
- if (send_doorbell) {
- wmb();
- /* Since there is no iowrite*_native() that writes the
- * value as is, without byteswapping - using the one
- * the doesn't do byteswapping in the relevant arch
- * endianness.
- */
-#if defined(__LITTLE_ENDIAN)
- iowrite32(
-#else
- iowrite32be(
-#endif
- ring->doorbell_qpn,
- ring->bf.uar->map + MLX4_SEND_DOORBELL);
- } else {
- ring->xmit_more++;
- }
- }
+ mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, desc_size, bf_index,
+ op_own, bf_ok, send_doorbell);
if (unlikely(stop_queue)) {
/* If queue was emptied after the if (stop_queue) , and before
@@ -1034,3 +1077,106 @@ tx_drop:
return NETDEV_TX_OK;
}
+netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_alloc *frame,
+ struct net_device *dev, unsigned int length,
+ int tx_ind, int *doorbell_pending)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ union mlx4_wqe_qpn_vlan qpn_vlan = {};
+ struct mlx4_en_tx_ring *ring;
+ struct mlx4_en_tx_desc *tx_desc;
+ struct mlx4_wqe_data_seg *data;
+ struct mlx4_en_tx_info *tx_info;
+ int index, bf_index;
+ bool send_doorbell;
+ int nr_txbb = 1;
+ bool stop_queue;
+ dma_addr_t dma;
+ int real_size;
+ __be32 op_own;
+ u32 ring_cons;
+ bool bf_ok;
+
+ BUILD_BUG_ON_MSG(ALIGN(CTRL_SIZE + DS_SIZE, TXBB_SIZE) != TXBB_SIZE,
+ "mlx4_en_xmit_frame requires minimum size tx desc");
+
+ ring = priv->tx_ring[tx_ind];
+
+ if (!priv->port_up)
+ goto tx_drop;
+
+ if (mlx4_en_is_tx_ring_full(ring))
+ goto tx_drop;
+
+ /* fetch ring->cons far ahead before needing it to avoid stall */
+ ring_cons = READ_ONCE(ring->cons);
+
+ index = ring->prod & ring->size_mask;
+ tx_info = &ring->tx_info[index];
+
+ bf_ok = ring->bf_enabled;
+
+ /* Track current inflight packets for performance analysis */
+ AVG_PERF_COUNTER(priv->pstats.inflight_avg,
+ (u32)(ring->prod - ring_cons - 1));
+
+ bf_index = ring->prod;
+ tx_desc = ring->buf + index * TXBB_SIZE;
+ data = &tx_desc->data;
+
+ dma = frame->dma;
+
+ tx_info->page = frame->page;
+ frame->page = NULL;
+ tx_info->map0_dma = dma;
+ tx_info->map0_byte_count = length;
+ tx_info->nr_txbb = nr_txbb;
+ tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN);
+ tx_info->data_offset = (void *)data - (void *)tx_desc;
+ tx_info->ts_requested = 0;
+ tx_info->nr_maps = 1;
+ tx_info->linear = 1;
+ tx_info->inl = 0;
+
+ dma_sync_single_for_device(priv->ddev, dma, length, PCI_DMA_TODEVICE);
+
+ data->addr = cpu_to_be64(dma);
+ data->lkey = ring->mr_key;
+ dma_wmb();
+ data->byte_count = cpu_to_be32(length);
+
+ /* tx completion can avoid cache line miss for common cases */
+ tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
+
+ op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
+ ((ring->prod & ring->size) ?
+ cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+
+ ring->packets++;
+ ring->bytes += tx_info->nr_bytes;
+ AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, length);
+
+ ring->prod += nr_txbb;
+
+ stop_queue = mlx4_en_is_tx_ring_full(ring);
+ send_doorbell = stop_queue ||
+ *doorbell_pending > MLX4_EN_DOORBELL_BUDGET;
+ bf_ok &= send_doorbell;
+
+ real_size = ((CTRL_SIZE + nr_txbb * DS_SIZE) / 16) & 0x3f;
+
+ if (bf_ok)
+ qpn_vlan.bf_qpn = ring->doorbell_qpn | cpu_to_be32(real_size);
+ else
+ qpn_vlan.fence_size = real_size;
+
+ mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, TXBB_SIZE, bf_index,
+ op_own, bf_ok, send_doorbell);
+ *doorbell_pending = send_doorbell ? 0 : *doorbell_pending + 1;
+
+ return NETDEV_TX_OK;
+
+tx_drop:
+ ring->tx_dropped++;
+ return NETDEV_TX_BUSY;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index d39bf594abe4..29c81d26f9f5 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -132,6 +132,7 @@ enum {
MLX4_EN_NUM_UP)
#define MLX4_EN_DEFAULT_TX_WORK 256
+#define MLX4_EN_DOORBELL_BUDGET 8
/* Target number of packets to coalesce with interrupt moderation */
#define MLX4_EN_RX_COAL_TARGET 44
@@ -164,6 +165,10 @@ enum {
#define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN)
#define MLX4_EN_MIN_MTU 46
+/* VLAN_HLEN is added twice,to support skb vlan tagged with multiple
+ * headers. (For example: ETH_P_8021Q and ETH_P_8021AD).
+ */
+#define MLX4_EN_EFF_MTU(mtu) ((mtu) + ETH_HLEN + (2 * VLAN_HLEN))
#define ETH_BCAST 0xffffffffffffULL
#define MLX4_EN_LOOPBACK_RETRIES 5
@@ -215,7 +220,10 @@ enum cq_type {
struct mlx4_en_tx_info {
- struct sk_buff *skb;
+ union {
+ struct sk_buff *skb;
+ struct page *page;
+ };
dma_addr_t map0_dma;
u32 map0_byte_count;
u32 nr_txbb;
@@ -255,6 +263,14 @@ struct mlx4_en_rx_alloc {
u32 page_size;
};
+#define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT)
+struct mlx4_en_page_cache {
+ u32 index;
+ struct mlx4_en_rx_alloc buf[MLX4_EN_CACHE_SIZE];
+};
+
+struct mlx4_en_priv;
+
struct mlx4_en_tx_ring {
/* cache line used and dirtied in tx completion
* (mlx4_en_free_tx_buf())
@@ -288,6 +304,11 @@ struct mlx4_en_tx_ring {
__be32 mr_key;
void *buf;
struct mlx4_en_tx_info *tx_info;
+ struct mlx4_en_rx_ring *recycle_ring;
+ u32 (*free_tx_desc)(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring,
+ int index, u8 owner,
+ u64 timestamp, int napi_mode);
u8 *bounce_buf;
struct mlx4_qp_context context;
int qpn;
@@ -319,6 +340,8 @@ struct mlx4_en_rx_ring {
u8 fcs_del;
void *buf;
void *rx_info;
+ struct bpf_prog *xdp_prog;
+ struct mlx4_en_page_cache page_cache;
unsigned long bytes;
unsigned long packets;
unsigned long csum_ok;
@@ -438,7 +461,9 @@ struct mlx4_en_mc_list {
struct mlx4_en_frag_info {
u16 frag_size;
u16 frag_prefix_size;
- u16 frag_stride;
+ u32 frag_stride;
+ enum dma_data_direction dma_dir;
+ int order;
};
#ifdef CONFIG_MLX4_EN_DCB
@@ -558,6 +583,7 @@ struct mlx4_en_priv {
struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS];
u16 num_frags;
u16 log_rx_info;
+ int xdp_ring_num;
struct mlx4_en_tx_ring **tx_ring;
struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS];
@@ -663,6 +689,12 @@ void mlx4_en_tx_irq(struct mlx4_cq *mcq);
u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
void *accel_priv, select_queue_fallback_t fallback);
netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev);
+netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_alloc *frame,
+ struct net_device *dev, unsigned int length,
+ int tx_ind, int *doorbell_pending);
+void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring);
+bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring,
+ struct mlx4_en_rx_alloc *frame);
int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
struct mlx4_en_tx_ring **pring,
@@ -691,6 +723,14 @@ int mlx4_en_process_rx_cq(struct net_device *dev,
int budget);
int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget);
int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget);
+u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring,
+ int index, u8 owner, u64 timestamp,
+ int napi_mode);
+u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring,
+ int index, u8 owner, u64 timestamp,
+ int napi_mode);
void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
int is_tx, int rss, int qpn, int cqn, int user_prio,
struct mlx4_qp_context *context);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c13e92b00bf5..75a5ae6bee07 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -224,6 +224,7 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
struct bpf_prog *bpf_prog_get(u32 ufd);
struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
+struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i);
struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog);
void bpf_prog_put(struct bpf_prog *prog);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6fc31ef1da2d..15d816a8b755 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -368,6 +368,11 @@ struct bpf_skb_data_end {
void *data_end;
};
+struct xdp_buff {
+ void *data;
+ void *data_end;
+};
+
/* compute the linear packet data range [data, data_end) which
* will be accessed by cls_bpf and act_bpf programs
*/
@@ -429,6 +434,18 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
return BPF_PROG_RUN(prog, skb);
}
+static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
+ struct xdp_buff *xdp)
+{
+ u32 ret;
+
+ rcu_read_lock();
+ ret = BPF_PROG_RUN(prog, (void *)xdp);
+ rcu_read_unlock();
+
+ return ret;
+}
+
static inline unsigned int bpf_prog_size(unsigned int proglen)
{
return max(sizeof(struct bpf_prog),
@@ -509,6 +526,7 @@ bool bpf_helper_changes_skb_data(void *func);
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len);
+void bpf_warn_invalid_xdp_action(u32 act);
#ifdef CONFIG_BPF_JIT
extern int bpf_jit_enable;
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 587cdf943b52..deaa2217214d 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -291,16 +291,18 @@ enum {
MLX4_WQE_CTRL_FORCE_LOOPBACK = 1 << 0,
};
+union mlx4_wqe_qpn_vlan {
+ struct {
+ __be16 vlan_tag;
+ u8 ins_vlan;
+ u8 fence_size;
+ };
+ __be32 bf_qpn;
+};
+
struct mlx4_wqe_ctrl_seg {
__be32 owner_opcode;
- union {
- struct {
- __be16 vlan_tag;
- u8 ins_vlan;
- u8 fence_size;
- };
- __be32 bf_qpn;
- };
+ union mlx4_wqe_qpn_vlan qpn_vlan;
/*
* High 24 bits are SRC remote buffer; low 8 bits are flags:
* [7] SO (strong ordering)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 49736a31acaa..fab9a1c2a2ac 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -63,6 +63,7 @@ struct wpan_dev;
struct mpls_dev;
/* UDP Tunnel offloads */
struct udp_tunnel_info;
+struct bpf_prog;
void netdev_set_default_ethtool_ops(struct net_device *dev,
const struct ethtool_ops *ops);
@@ -799,6 +800,33 @@ struct tc_to_netdev {
};
};
+/* These structures hold the attributes of xdp state that are being passed
+ * to the netdevice through the xdp op.
+ */
+enum xdp_netdev_command {
+ /* Set or clear a bpf program used in the earliest stages of packet
+ * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee
+ * is responsible for calling bpf_prog_put on any old progs that are
+ * stored. In case of error, the callee need not release the new prog
+ * reference, but on success it takes ownership and must bpf_prog_put
+ * when it is no longer used.
+ */
+ XDP_SETUP_PROG,
+ /* Check if a bpf program is set on the device. The callee should
+ * return true if a program is currently attached and running.
+ */
+ XDP_QUERY_PROG,
+};
+
+struct netdev_xdp {
+ enum xdp_netdev_command command;
+ union {
+ /* XDP_SETUP_PROG */
+ struct bpf_prog *prog;
+ /* XDP_QUERY_PROG */
+ bool prog_attached;
+ };
+};
/*
* This structure defines the management hooks for network devices.
@@ -1087,6 +1115,9 @@ struct tc_to_netdev {
* appropriate rx headroom value allows avoiding skb head copy on
* forward. Setting a negative value resets the rx headroom to the
* default value.
+ * int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp);
+ * This function is used to set or query state related to XDP on the
+ * netdevice. See definition of enum xdp_netdev_command for details.
*
*/
struct net_device_ops {
@@ -1271,6 +1302,8 @@ struct net_device_ops {
struct sk_buff *skb);
void (*ndo_set_rx_headroom)(struct net_device *dev,
int needed_headroom);
+ int (*ndo_xdp)(struct net_device *dev,
+ struct netdev_xdp *xdp);
};
/**
@@ -3257,6 +3290,7 @@ int dev_get_phys_port_id(struct net_device *dev,
int dev_get_phys_port_name(struct net_device *dev,
char *name, size_t len);
int dev_change_proto_down(struct net_device *dev, bool proto_down);
+int dev_change_xdp_fd(struct net_device *dev, int fd);
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq, int *ret);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c4d922439d20..2b7076f5b5ad 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -94,6 +94,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SCHED_CLS,
BPF_PROG_TYPE_SCHED_ACT,
BPF_PROG_TYPE_TRACEPOINT,
+ BPF_PROG_TYPE_XDP,
};
#define BPF_PSEUDO_MAP_FD 1
@@ -439,4 +440,24 @@ struct bpf_tunnel_key {
__u32 tunnel_label;
};
+/* User return codes for XDP prog type.
+ * A valid XDP program must return one of these defined values. All other
+ * return codes are reserved for future use. Unknown return codes will result
+ * in packet drop.
+ */
+enum xdp_action {
+ XDP_ABORTED = 0,
+ XDP_DROP,
+ XDP_PASS,
+ XDP_TX,
+};
+
+/* user accessible metadata for XDP packet hook
+ * new fields must be added to the end of this structure
+ */
+struct xdp_md {
+ __u32 data;
+ __u32 data_end;
+};
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 4285ac31e865..a1b5202c5f6b 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -156,6 +156,7 @@ enum {
IFLA_GSO_MAX_SEGS,
IFLA_GSO_MAX_SIZE,
IFLA_PAD,
+ IFLA_XDP,
__IFLA_MAX
};
@@ -843,4 +844,15 @@ enum {
};
#define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1)
+/* XDP section */
+
+enum {
+ IFLA_XDP_UNSPEC,
+ IFLA_XDP_FD,
+ IFLA_XDP_ATTACHED,
+ __IFLA_XDP_MAX,
+};
+
+#define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1)
+
#endif /* _UAPI_LINUX_IF_LINK_H */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 96d938a22050..228f962447a5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -670,14 +670,20 @@ static struct bpf_prog *____bpf_prog_get(struct fd f)
return f.file->private_data;
}
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
{
- if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
- atomic_dec(&prog->aux->refcnt);
+ if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
+ atomic_sub(i, &prog->aux->refcnt);
return ERR_PTR(-EBUSY);
}
return prog;
}
+EXPORT_SYMBOL_GPL(bpf_prog_add);
+
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+ return bpf_prog_add(prog, 1);
+}
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e206c2181412..f72f23b8fdab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
#define MAX_PACKET_OFF 0xffff
+static bool may_write_pkt_data(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_XDP:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int check_packet_access(struct verifier_env *env, u32 regno, int off,
int size)
{
@@ -713,6 +723,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
switch (env->prog->type) {
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
break;
default:
verbose("verifier is misconfigured\n");
@@ -805,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
err = check_stack_read(state, off, size, value_regno);
}
} else if (state->regs[regno].type == PTR_TO_PACKET) {
- if (t == BPF_WRITE) {
+ if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
verbose("cannot write into packet\n");
return -EACCES;
}
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into packet\n", value_regno);
+ return -EACCES;
+ }
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
diff --git a/net/core/dev.c b/net/core/dev.c
index 7894e406c806..2a9c39f8824e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -94,6 +94,7 @@
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
+#include <linux/bpf.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
@@ -6615,6 +6616,38 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
EXPORT_SYMBOL(dev_change_proto_down);
/**
+ * dev_change_xdp_fd - set or clear a bpf program for a device rx path
+ * @dev: device
+ * @fd: new program fd or negative value to clear
+ *
+ * Set or clear a bpf program for a device
+ */
+int dev_change_xdp_fd(struct net_device *dev, int fd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct bpf_prog *prog = NULL;
+ struct netdev_xdp xdp = {};
+ int err;
+
+ if (!ops->ndo_xdp)
+ return -EOPNOTSUPP;
+ if (fd >= 0) {
+ prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ }
+
+ xdp.command = XDP_SETUP_PROG;
+ xdp.prog = prog;
+ err = ops->ndo_xdp(dev, &xdp);
+ if (err < 0 && prog)
+ bpf_prog_put(prog);
+
+ return err;
+}
+EXPORT_SYMBOL(dev_change_xdp_fd);
+
+/**
* dev_new_index - allocate an ifindex
* @net: the applicable net namespace
*
diff --git a/net/core/filter.c b/net/core/filter.c
index 22e3992c8b48..6c627bc4be6e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2410,6 +2410,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
}
}
+static const struct bpf_func_proto *
+xdp_func_proto(enum bpf_func_id func_id)
+{
+ return sk_filter_func_proto(func_id);
+}
+
static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -2477,6 +2483,44 @@ static bool tc_cls_act_is_valid_access(int off, int size,
return __is_valid_access(off, size, type);
}
+static bool __is_valid_xdp_access(int off, int size,
+ enum bpf_access_type type)
+{
+ if (off < 0 || off >= sizeof(struct xdp_md))
+ return false;
+ if (off % size != 0)
+ return false;
+ if (size != 4)
+ return false;
+
+ return true;
+}
+
+static bool xdp_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ if (type == BPF_WRITE)
+ return false;
+
+ switch (off) {
+ case offsetof(struct xdp_md, data):
+ *reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct xdp_md, data_end):
+ *reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return __is_valid_xdp_access(off, size, type);
+}
+
+void bpf_warn_invalid_xdp_action(u32 act)
+{
+ WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act);
+}
+EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
+
static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
int src_reg, int ctx_off,
struct bpf_insn *insn_buf,
@@ -2628,6 +2672,29 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
return insn - insn_buf;
}
+static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+ int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct xdp_md, data):
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)),
+ dst_reg, src_reg,
+ offsetof(struct xdp_buff, data));
+ break;
+ case offsetof(struct xdp_md, data_end):
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)),
+ dst_reg, src_reg,
+ offsetof(struct xdp_buff, data_end));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
static const struct bpf_verifier_ops sk_filter_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
@@ -2640,6 +2707,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
.convert_ctx_access = bpf_net_convert_ctx_access,
};
+static const struct bpf_verifier_ops xdp_ops = {
+ .get_func_proto = xdp_func_proto,
+ .is_valid_access = xdp_is_valid_access,
+ .convert_ctx_access = xdp_convert_ctx_access,
+};
+
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
.ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
@@ -2655,11 +2728,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = {
.type = BPF_PROG_TYPE_SCHED_ACT,
};
+static struct bpf_prog_type_list xdp_type __read_mostly = {
+ .ops = &xdp_ops,
+ .type = BPF_PROG_TYPE_XDP,
+};
+
static int __init register_sk_filter_ops(void)
{
bpf_register_prog_type(&sk_filter_type);
bpf_register_prog_type(&sched_cls_type);
bpf_register_prog_type(&sched_act_type);
+ bpf_register_prog_type(&xdp_type);
return 0;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a9e3805af739..eba2b8260dbd 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -891,6 +891,16 @@ static size_t rtnl_port_size(const struct net_device *dev,
return port_self_size;
}
+static size_t rtnl_xdp_size(const struct net_device *dev)
+{
+ size_t xdp_size = nla_total_size(1); /* XDP_ATTACHED */
+
+ if (!dev->netdev_ops->ndo_xdp)
+ return 0;
+ else
+ return xdp_size;
+}
+
static noinline size_t if_nlmsg_size(const struct net_device *dev,
u32 ext_filter_mask)
{
@@ -927,6 +937,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+ nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
+ + rtnl_xdp_size(dev) /* IFLA_XDP */
+ nla_total_size(1); /* IFLA_PROTO_DOWN */
}
@@ -1211,6 +1222,33 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
return 0;
}
+static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct netdev_xdp xdp_op = {};
+ struct nlattr *xdp;
+ int err;
+
+ if (!dev->netdev_ops->ndo_xdp)
+ return 0;
+ xdp = nla_nest_start(skb, IFLA_XDP);
+ if (!xdp)
+ return -EMSGSIZE;
+ xdp_op.command = XDP_QUERY_PROG;
+ err = dev->netdev_ops->ndo_xdp(dev, &xdp_op);
+ if (err)
+ goto err_cancel;
+ err = nla_put_u8(skb, IFLA_XDP_ATTACHED, xdp_op.prog_attached);
+ if (err)
+ goto err_cancel;
+
+ nla_nest_end(skb, xdp);
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, xdp);
+ return err;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
unsigned int flags, u32 ext_filter_mask)
@@ -1307,6 +1345,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
if (rtnl_port_fill(skb, dev, ext_filter_mask))
goto nla_put_failure;
+ if (rtnl_xdp_fill(skb, dev))
+ goto nla_put_failure;
+
if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {
if (rtnl_link_fill(skb, dev) < 0)
goto nla_put_failure;
@@ -1392,6 +1433,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_LINK_NETNSID] = { .type = NLA_S32 },
[IFLA_PROTO_DOWN] = { .type = NLA_U8 },
+ [IFLA_XDP] = { .type = NLA_NESTED },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1429,6 +1471,11 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
[IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
};
+static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
+ [IFLA_XDP_FD] = { .type = NLA_S32 },
+ [IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
+};
+
static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
{
const struct rtnl_link_ops *ops = NULL;
@@ -2054,6 +2101,23 @@ static int do_setlink(const struct sk_buff *skb,
status |= DO_SETLINK_NOTIFY;
}
+ if (tb[IFLA_XDP]) {
+ struct nlattr *xdp[IFLA_XDP_MAX + 1];
+
+ err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP],
+ ifla_xdp_policy);
+ if (err < 0)
+ goto errout;
+
+ if (xdp[IFLA_XDP_FD]) {
+ err = dev_change_xdp_fd(dev,
+ nla_get_s32(xdp[IFLA_XDP_FD]));
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
+
errout:
if (status & DO_SETLINK_MODIFIED) {
if (status & DO_SETLINK_NOTIFY)
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index a98b780e974c..d2d2b35c67eb 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -21,6 +21,8 @@ hostprogs-y += spintest
hostprogs-y += map_perf_test
hostprogs-y += test_overhead
hostprogs-y += test_cgrp2_array_pin
+hostprogs-y += xdp1
+hostprogs-y += xdp2
test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
@@ -42,6 +44,9 @@ spintest-objs := bpf_load.o libbpf.o spintest_user.o
map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
+xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
+# reuse xdp1 source intentionally
+xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -64,6 +69,8 @@ always += test_overhead_tp_kern.o
always += test_overhead_kprobe_kern.o
always += parse_varlen.o parse_simple.o parse_ldabs.o
always += test_cgrp2_tc_kern.o
+always += xdp1_kern.o
+always += xdp2_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -84,6 +91,8 @@ HOSTLOADLIBES_offwaketime += -lelf
HOSTLOADLIBES_spintest += -lelf
HOSTLOADLIBES_map_perf_test += -lelf -lrt
HOSTLOADLIBES_test_overhead += -lelf -lrt
+HOSTLOADLIBES_xdp1 += -lelf
+HOSTLOADLIBES_xdp2 += -lelf
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 022af71c2bb5..0cfda2320320 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -50,6 +50,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
+ bool is_xdp = strncmp(event, "xdp", 3) == 0;
enum bpf_prog_type prog_type;
char buf[256];
int fd, efd, err, id;
@@ -66,6 +67,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_type = BPF_PROG_TYPE_KPROBE;
} else if (is_tracepoint) {
prog_type = BPF_PROG_TYPE_TRACEPOINT;
+ } else if (is_xdp) {
+ prog_type = BPF_PROG_TYPE_XDP;
} else {
printf("Unknown event '%s'\n", event);
return -1;
@@ -79,6 +82,9 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_fd[prog_cnt++] = fd;
+ if (is_xdp)
+ return 0;
+
if (is_socket) {
event += 6;
if (*event != '/')
@@ -319,6 +325,7 @@ int load_bpf_file(char *path)
if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
memcmp(shname_prog, "kretprobe/", 10) == 0 ||
memcmp(shname_prog, "tracepoint/", 11) == 0 ||
+ memcmp(shname_prog, "xdp", 3) == 0 ||
memcmp(shname_prog, "socket", 6) == 0)
load_and_attach(shname_prog, insns, data_prog->d_size);
}
@@ -336,6 +343,7 @@ int load_bpf_file(char *path)
if (memcmp(shname, "kprobe/", 7) == 0 ||
memcmp(shname, "kretprobe/", 10) == 0 ||
memcmp(shname, "tracepoint/", 11) == 0 ||
+ memcmp(shname, "xdp", 3) == 0 ||
memcmp(shname, "socket", 6) == 0)
load_and_attach(shname, data->d_buf, data->d_size);
}
diff --git a/samples/bpf/xdp1_kern.c b/samples/bpf/xdp1_kern.c
new file mode 100644
index 000000000000..e7dd8ac40d12
--- /dev/null
+++ b/samples/bpf/xdp1_kern.c
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PLUMgrid
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") dropcnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 256,
+};
+
+static int parse_ipv4(void *data, u64 nh_off, void *data_end)
+{
+ struct iphdr *iph = data + nh_off;
+
+ if (iph + 1 > data_end)
+ return 0;
+ return iph->protocol;
+}
+
+static int parse_ipv6(void *data, u64 nh_off, void *data_end)
+{
+ struct ipv6hdr *ip6h = data + nh_off;
+
+ if (ip6h + 1 > data_end)
+ return 0;
+ return ip6h->nexthdr;
+}
+
+SEC("xdp1")
+int xdp_prog1(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ int rc = XDP_DROP;
+ long *value;
+ u16 h_proto;
+ u64 nh_off;
+ u32 index;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return rc;
+
+ h_proto = eth->h_proto;
+
+ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ return rc;
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ return rc;
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+
+ if (h_proto == htons(ETH_P_IP))
+ index = parse_ipv4(data, nh_off, data_end);
+ else if (h_proto == htons(ETH_P_IPV6))
+ index = parse_ipv6(data, nh_off, data_end);
+ else
+ index = 0;
+
+ value = bpf_map_lookup_elem(&dropcnt, &index);
+ if (value)
+ *value += 1;
+
+ return rc;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
new file mode 100644
index 000000000000..a5e109e398a1
--- /dev/null
+++ b/samples/bpf/xdp1_user.c
@@ -0,0 +1,181 @@
+/* Copyright (c) 2016 PLUMgrid
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "bpf_load.h"
+#include "libbpf.h"
+
+static int set_link_xdp_fd(int ifindex, int fd)
+{
+ struct sockaddr_nl sa;
+ int sock, seq = 0, len, ret = -1;
+ char buf[4096];
+ struct nlattr *nla, *nla_xdp;
+ struct {
+ struct nlmsghdr nh;
+ struct ifinfomsg ifinfo;
+ char attrbuf[64];
+ } req;
+ struct nlmsghdr *nh;
+ struct nlmsgerr *err;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.nl_family = AF_NETLINK;
+
+ sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock < 0) {
+ printf("open netlink socket: %s\n", strerror(errno));
+ return -1;
+ }
+
+ if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+ printf("bind to netlink: %s\n", strerror(errno));
+ goto cleanup;
+ }
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_type = RTM_SETLINK;
+ req.nh.nlmsg_pid = 0;
+ req.nh.nlmsg_seq = ++seq;
+ req.ifinfo.ifi_family = AF_UNSPEC;
+ req.ifinfo.ifi_index = ifindex;
+ nla = (struct nlattr *)(((char *)&req)
+ + NLMSG_ALIGN(req.nh.nlmsg_len));
+ nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
+
+ nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
+ nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
+ nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
+ memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
+ nla->nla_len = NLA_HDRLEN + nla_xdp->nla_len;
+
+ req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
+
+ if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ printf("send to netlink: %s\n", strerror(errno));
+ goto cleanup;
+ }
+
+ len = recv(sock, buf, sizeof(buf), 0);
+ if (len < 0) {
+ printf("recv from netlink: %s\n", strerror(errno));
+ goto cleanup;
+ }
+
+ for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+ nh = NLMSG_NEXT(nh, len)) {
+ if (nh->nlmsg_pid != getpid()) {
+ printf("Wrong pid %d, expected %d\n",
+ nh->nlmsg_pid, getpid());
+ goto cleanup;
+ }
+ if (nh->nlmsg_seq != seq) {
+ printf("Wrong seq %d, expected %d\n",
+ nh->nlmsg_seq, seq);
+ goto cleanup;
+ }
+ switch (nh->nlmsg_type) {
+ case NLMSG_ERROR:
+ err = (struct nlmsgerr *)NLMSG_DATA(nh);
+ if (!err->error)
+ continue;
+ printf("nlmsg error %s\n", strerror(-err->error));
+ goto cleanup;
+ case NLMSG_DONE:
+ break;
+ }
+ }
+
+ ret = 0;
+
+cleanup:
+ close(sock);
+ return ret;
+}
+
+static int ifindex;
+
+static void int_exit(int sig)
+{
+ set_link_xdp_fd(ifindex, -1);
+ exit(0);
+}
+
+/* simple per-protocol drop counter
+ */
+static void poll_stats(int interval)
+{
+ unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+ const unsigned int nr_keys = 256;
+ __u64 values[nr_cpus], prev[nr_keys][nr_cpus];
+ __u32 key;
+ int i;
+
+ memset(prev, 0, sizeof(prev));
+
+ while (1) {
+ sleep(interval);
+
+ for (key = 0; key < nr_keys; key++) {
+ __u64 sum = 0;
+
+ assert(bpf_lookup_elem(map_fd[0], &key, values) == 0);
+ for (i = 0; i < nr_cpus; i++)
+ sum += (values[i] - prev[key][i]);
+ if (sum)
+ printf("proto %u: %10llu pkt/s\n",
+ key, sum / interval);
+ memcpy(prev[key], values, sizeof(values));
+ }
+ }
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (ac != 2) {
+ printf("usage: %s IFINDEX\n", argv[0]);
+ return 1;
+ }
+
+ ifindex = strtoul(argv[1], NULL, 0);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ if (!prog_fd[0]) {
+ printf("load_bpf_file: %s\n", strerror(errno));
+ return 1;
+ }
+
+ signal(SIGINT, int_exit);
+
+ if (set_link_xdp_fd(ifindex, prog_fd[0]) < 0) {
+ printf("link set xdp fd failed\n");
+ return 1;
+ }
+
+ poll_stats(2);
+
+ return 0;
+}
diff --git a/samples/bpf/xdp2_kern.c b/samples/bpf/xdp2_kern.c
new file mode 100644
index 000000000000..38fe7e1d0db4
--- /dev/null
+++ b/samples/bpf/xdp2_kern.c
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PLUMgrid
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") dropcnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(long),
+ .max_entries = 256,
+};
+
+static void swap_src_dst_mac(void *data)
+{
+ unsigned short *p = data;
+ unsigned short dst[3];
+
+ dst[0] = p[0];
+ dst[1] = p[1];
+ dst[2] = p[2];
+ p[0] = p[3];
+ p[1] = p[4];
+ p[2] = p[5];
+ p[3] = dst[0];
+ p[4] = dst[1];
+ p[5] = dst[2];
+}
+
+static int parse_ipv4(void *data, u64 nh_off, void *data_end)
+{
+ struct iphdr *iph = data + nh_off;
+
+ if (iph + 1 > data_end)
+ return 0;
+ return iph->protocol;
+}
+
+static int parse_ipv6(void *data, u64 nh_off, void *data_end)
+{
+ struct ipv6hdr *ip6h = data + nh_off;
+
+ if (ip6h + 1 > data_end)
+ return 0;
+ return ip6h->nexthdr;
+}
+
+SEC("xdp1")
+int xdp_prog1(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ int rc = XDP_DROP;
+ long *value;
+ u16 h_proto;
+ u64 nh_off;
+ u32 index;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return rc;
+
+ h_proto = eth->h_proto;
+
+ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ return rc;
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ return rc;
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+
+ if (h_proto == htons(ETH_P_IP))
+ index = parse_ipv4(data, nh_off, data_end);
+ else if (h_proto == htons(ETH_P_IPV6))
+ index = parse_ipv6(data, nh_off, data_end);
+ else
+ index = 0;
+
+ value = bpf_map_lookup_elem(&dropcnt, &index);
+ if (value)
+ *value += 1;
+
+ if (index == 17) {
+ swap_src_dst_mac(data);
+ rc = XDP_TX;
+ }
+
+ return rc;
+}
+
+char _license[] SEC("license") = "GPL";