diff options
Diffstat (limited to 'drivers/net/veth.c')
| -rw-r--r-- | drivers/net/veth.c | 486 |
1 files changed, 290 insertions, 196 deletions
diff --git a/drivers/net/veth.c b/drivers/net/veth.c index dfc7d87fad59..14e6f2a2fb77 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -17,6 +17,7 @@ #include <net/rtnetlink.h> #include <net/dst.h> +#include <net/netdev_lock.h> #include <net/xfrm.h> #include <net/xdp.h> #include <linux/veth.h> @@ -26,6 +27,8 @@ #include <linux/ptr_ring.h> #include <linux/bpf_trace.h> #include <linux/net_tstamp.h> +#include <linux/skbuff_ref.h> +#include <net/page_pool/helpers.h> #define DRV_NAME "veth" #define DRV_VERSION "1.0" @@ -65,6 +68,7 @@ struct veth_rq { bool rx_notify_masked; struct ptr_ring xdp_ring; struct xdp_rxq_info xdp_rxq; + struct page_pool *page_pool; }; struct veth_priv { @@ -116,6 +120,11 @@ static struct { { "peer_ifindex" }, }; +struct veth_xdp_buff { + struct xdp_buff xdp; + struct sk_buff *skb; +}; + static int veth_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { @@ -150,6 +159,8 @@ static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) for (j = 0; j < VETH_TQ_STATS_LEN; j++) ethtool_sprintf(&p, "tx_queue_%u_%.18s", i, veth_tq_stats_desc[j].desc); + + page_pool_ethtool_stats_get_strings(p); break; } } @@ -160,18 +171,35 @@ static int veth_get_sset_count(struct net_device *dev, int sset) case ETH_SS_STATS: return ARRAY_SIZE(ethtool_stats_keys) + VETH_RQ_STATS_LEN * dev->real_num_rx_queues + - VETH_TQ_STATS_LEN * dev->real_num_tx_queues; + VETH_TQ_STATS_LEN * dev->real_num_tx_queues + + page_pool_ethtool_stats_get_count(); default: return -EOPNOTSUPP; } } +static void veth_get_page_pool_stats(struct net_device *dev, u64 *data) +{ +#ifdef CONFIG_PAGE_POOL_STATS + struct veth_priv *priv = netdev_priv(dev); + struct page_pool_stats pp_stats = {}; + int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + if (!priv->rq[i].page_pool) + continue; + page_pool_get_stats(priv->rq[i].page_pool, &pp_stats); + } + page_pool_ethtool_stats_get(data, &pp_stats); +#endif /* CONFIG_PAGE_POOL_STATS */ +} + static void veth_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); - int i, j, idx; + int i, j, idx, pp_idx; data[0] = peer ? peer->ifindex : 0; idx = 1; @@ -190,9 +218,10 @@ static void veth_get_ethtool_stats(struct net_device *dev, } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); idx += VETH_RQ_STATS_LEN; } + pp_idx = idx; if (!peer) - return; + goto page_pool_stats; rcv_priv = netdev_priv(peer); for (i = 0; i < peer->real_num_rx_queues; i++) { @@ -210,6 +239,10 @@ static void veth_get_ethtool_stats(struct net_device *dev, } } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); } + pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; + +page_pool_stats: + veth_get_page_pool_stats(dev, &data[pp_idx]); } static void veth_get_channels(struct net_device *dev, @@ -274,12 +307,10 @@ static void __veth_xdp_flush(struct veth_rq *rq) static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) { - if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { - dev_kfree_skb_any(skb); - return NET_RX_DROP; - } + if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) + return NETDEV_TX_BUSY; /* signal qdisc layer */ - return NET_RX_SUCCESS; + return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */ } static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, @@ -313,10 +344,11 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); struct veth_rq *rq = NULL; + struct netdev_queue *txq; struct net_device *rcv; int length = skb->len; bool use_napi = false; - int rxq; + int ret, rxq; rcu_read_lock(); rcv = rcu_dereference(priv->peer); @@ -339,28 +371,46 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) } skb_tx_timestamp(skb); - if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { + + ret = veth_forward_skb(rcv, skb, rq, use_napi); + switch (ret) { + case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */ if (!use_napi) - dev_lstats_add(dev, length); - } else { + dev_sw_netstats_tx_add(dev, 1, length); + else + __veth_xdp_flush(rq); + break; + case NETDEV_TX_BUSY: + /* If a qdisc is attached to our virtual device, returning + * NETDEV_TX_BUSY is allowed. + */ + txq = netdev_get_tx_queue(dev, rxq); + + if (qdisc_txq_has_no_queue(txq)) { + dev_kfree_skb_any(skb); + goto drop; + } + /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */ + __skb_push(skb, ETH_HLEN); + netif_tx_stop_queue(txq); + /* Makes sure NAPI peer consumer runs. Consumer is responsible + * for starting txq again, until then ndo_start_xmit (this + * function) will not be invoked by the netstack again. + */ + __veth_xdp_flush(rq); + break; + case NET_RX_DROP: /* same as NET_XMIT_DROP */ drop: atomic64_inc(&priv->dropped); + ret = NET_XMIT_DROP; + break; + default: + net_crit_ratelimited("%s(%s): Invalid return code(%d)", + __func__, dev->name, ret); } - - if (use_napi) - __veth_xdp_flush(rq); - rcu_read_unlock(); - return NETDEV_TX_OK; -} - -static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) -{ - struct veth_priv *priv = netdev_priv(dev); - - dev_lstats_read(dev, packets, bytes); - return atomic64_read(&priv->dropped); + return ret; } static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) @@ -400,24 +450,24 @@ static void veth_get_stats64(struct net_device *dev, struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; struct veth_stats rx; - u64 packets, bytes; - tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); - tot->tx_bytes = bytes; - tot->tx_packets = packets; + tot->tx_dropped = atomic64_read(&priv->dropped); + dev_fetch_sw_netstats(tot, dev->tstats); veth_stats_rx(&rx, dev); tot->tx_dropped += rx.xdp_tx_err; tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; - tot->rx_bytes = rx.xdp_bytes; - tot->rx_packets = rx.xdp_packets; + tot->rx_bytes += rx.xdp_bytes; + tot->rx_packets += rx.xdp_packets; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (peer) { - veth_stats_tx(peer, &packets, &bytes); - tot->rx_bytes += bytes; - tot->rx_packets += packets; + struct rtnl_link_stats64 tot_peer = {}; + + dev_fetch_sw_netstats(&tot_peer, peer->tstats); + tot->rx_bytes += tot_peer.tx_bytes; + tot->rx_packets += tot_peer.tx_packets; veth_stats_rx(&rx, peer); tot->tx_dropped += rx.peer_tq_xdp_xmit_err; @@ -592,23 +642,25 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (likely(xdp_prog)) { - struct xdp_buff xdp; + struct veth_xdp_buff vxbuf; + struct xdp_buff *xdp = &vxbuf.xdp; u32 act; - xdp_convert_frame_to_buff(frame, &xdp); - xdp.rxq = &rq->xdp_rxq; + xdp_convert_frame_to_buff(frame, xdp); + xdp->rxq = &rq->xdp_rxq; + vxbuf.skb = NULL; - act = bpf_prog_run_xdp(xdp_prog, &xdp); + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: - if (xdp_update_frame_from_buff(&xdp, frame)) + if (xdp_update_frame_from_buff(xdp, frame)) goto err_xdp; break; case XDP_TX: orig_frame = *frame; - xdp.rxq->mem = frame->mem; - if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { + xdp->rxq->mem.type = frame->mem_type; + if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); frame = &orig_frame; stats->rx_drops++; @@ -619,8 +671,8 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, goto xdp_xmit; case XDP_REDIRECT: orig_frame = *frame; - xdp.rxq->mem = frame->mem; - if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { + xdp->rxq->mem.type = frame->mem_type; + if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { frame = &orig_frame; stats->rx_drops++; goto err_xdp; @@ -657,8 +709,7 @@ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, void *skbs[VETH_XDP_BATCH]; int i; - if (xdp_alloc_skb_bulk(skbs, n_xdpf, - GFP_ATOMIC | __GFP_ZERO) < 0) { + if (unlikely(!napi_skb_cache_get_bulk(skbs, n_xdpf))) { for (i = 0; i < n_xdpf; i++) xdp_return_frame(frames[i]); stats->rx_drops += n_xdpf; @@ -701,74 +752,12 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, u32 frame_sz; if (skb_shared(skb) || skb_head_is_locked(skb) || - skb_shinfo(skb)->nr_frags) { - u32 size, len, max_head_size, off; - struct sk_buff *nskb; - struct page *page; - int i, head_off; - - /* We need a private copy of the skb and data buffers since - * the ebpf program can modify it. We segment the original skb - * into order-0 pages without linearize it. - * - * Make sure we have enough space for linear and paged area - */ - max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - - VETH_XDP_HEADROOM); - if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size) - goto drop; - - /* Allocate skb head */ - page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); - if (!page) - goto drop; - - nskb = build_skb(page_address(page), PAGE_SIZE); - if (!nskb) { - put_page(page); + skb_shinfo(skb)->nr_frags || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM)) goto drop; - } - - skb_reserve(nskb, VETH_XDP_HEADROOM); - size = min_t(u32, skb->len, max_head_size); - if (skb_copy_bits(skb, 0, nskb->data, size)) { - consume_skb(nskb); - goto drop; - } - skb_put(nskb, size); - - skb_copy_header(nskb, skb); - head_off = skb_headroom(nskb) - skb_headroom(skb); - skb_headers_offset_update(nskb, head_off); - /* Allocate paged area of new skb */ - off = size; - len = skb->len - off; - - for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { - page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); - if (!page) { - consume_skb(nskb); - goto drop; - } - - size = min_t(u32, len, PAGE_SIZE); - skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE); - if (skb_copy_bits(skb, off, page_address(page), - size)) { - consume_skb(nskb); - goto drop; - } - - len -= size; - off += size; - } - - consume_skb(skb); - skb = nskb; - } else if (skb_headroom(skb) < XDP_PACKET_HEADROOM && - pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) { - goto drop; + skb = *pskb; } /* SKB "head" area always have tailroom for skb_shared_info */ @@ -801,7 +790,8 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, { void *orig_data, *orig_data_end; struct bpf_prog *xdp_prog; - struct xdp_buff xdp; + struct veth_xdp_buff vxbuf; + struct xdp_buff *xdp = &vxbuf.xdp; u32 act, metalen; int off; @@ -815,22 +805,23 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, } __skb_push(skb, skb->data - skb_mac_header(skb)); - if (veth_convert_skb_to_xdp_buff(rq, &xdp, &skb)) + if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb)) goto drop; + vxbuf.skb = skb; - orig_data = xdp.data; - orig_data_end = xdp.data_end; + orig_data = xdp->data; + orig_data_end = xdp->data_end; - act = bpf_prog_run_xdp(xdp_prog, &xdp); + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: break; case XDP_TX: - veth_xdp_get(&xdp); + veth_xdp_get(xdp); consume_skb(skb); - xdp.rxq->mem = rq->xdp_mem; - if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { + xdp->rxq->mem = rq->xdp_mem; + if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); stats->rx_drops++; goto err_xdp; @@ -839,10 +830,10 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, rcu_read_unlock(); goto xdp_xmit; case XDP_REDIRECT: - veth_xdp_get(&xdp); + veth_xdp_get(xdp); consume_skb(skb); - xdp.rxq->mem = rq->xdp_mem; - if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { + xdp->rxq->mem = rq->xdp_mem; + if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { stats->rx_drops++; goto err_xdp; } @@ -862,7 +853,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, rcu_read_unlock(); /* check if bpf_xdp_adjust_head was used */ - off = orig_data - xdp.data; + off = orig_data - xdp->data; if (off > 0) __skb_push(skb, off); else if (off < 0) @@ -871,21 +862,21 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, skb_reset_mac_header(skb); /* check if bpf_xdp_adjust_tail was used */ - off = xdp.data_end - orig_data_end; + off = xdp->data_end - orig_data_end; if (off != 0) __skb_put(skb, off); /* positive on grow, negative on shrink */ /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. */ - if (xdp_buff_has_frags(&xdp)) + if (xdp_buff_has_frags(xdp)) skb->data_len = skb_shinfo(skb)->xdp_frags_size; else skb->data_len = 0; skb->protocol = eth_type_trans(skb, rq->dev); - metalen = xdp.data - xdp.data_meta; + metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); out: @@ -898,7 +889,7 @@ xdp_drop: return NULL; err_xdp: rcu_read_unlock(); - xdp_return_buff(&xdp); + xdp_return_buff(xdp); xdp_xmit: return NULL; } @@ -965,17 +956,28 @@ static int veth_poll(struct napi_struct *napi, int budget) { struct veth_rq *rq = container_of(napi, struct veth_rq, xdp_napi); + struct veth_priv *priv = netdev_priv(rq->dev); + int queue_idx = rq->xdp_rxq.queue_index; + struct netdev_queue *peer_txq; struct veth_stats stats = {}; + struct net_device *peer_dev; struct veth_xdp_tx_bq bq; int done; bq.count = 0; + /* NAPI functions as RCU section */ + peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); + peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL; + xdp_set_return_frame_no_direct(); done = veth_xdp_rcv(rq, budget, &bq, &stats); if (stats.xdp_redirect > 0) xdp_do_flush(); + if (stats.xdp_tx > 0) + veth_xdp_flush(rq, &bq); + xdp_clear_return_frame_no_direct(); if (done < budget && napi_complete_done(napi, done)) { /* Write rx_notify_masked before reading ptr_ring */ @@ -988,19 +990,48 @@ static int veth_poll(struct napi_struct *napi, int budget) } } - if (stats.xdp_tx > 0) - veth_xdp_flush(rq, &bq); - xdp_clear_return_frame_no_direct(); + /* Release backpressure per NAPI poll */ + smp_rmb(); /* Paired with netif_tx_stop_queue set_bit */ + if (peer_txq && netif_tx_queue_stopped(peer_txq)) { + txq_trans_cond_update(peer_txq); + netif_tx_wake_queue(peer_txq); + } return done; } +static int veth_create_page_pool(struct veth_rq *rq) +{ + struct page_pool_params pp_params = { + .order = 0, + .pool_size = VETH_RING_SIZE, + .nid = NUMA_NO_NODE, + .dev = &rq->dev->dev, + }; + + rq->page_pool = page_pool_create(&pp_params); + if (IS_ERR(rq->page_pool)) { + int err = PTR_ERR(rq->page_pool); + + rq->page_pool = NULL; + return err; + } + + return 0; +} + static int __veth_napi_enable_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err, i; for (i = start; i < end; i++) { + err = veth_create_page_pool(&priv->rq[i]); + if (err) + goto err_page_pool; + } + + for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); @@ -1020,6 +1051,12 @@ static int __veth_napi_enable_range(struct net_device *dev, int start, int end) err_xdp_ring: for (i--; i >= start; i--) ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); + i = end; +err_page_pool: + for (i--; i >= start; i--) { + page_pool_destroy(priv->rq[i].page_pool); + priv->rq[i].page_pool = NULL; + } return err; } @@ -1049,6 +1086,11 @@ static void veth_napi_del_range(struct net_device *dev, int start, int end) rq->rx_notify_masked = false; ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); } + + for (i = start; i < end; i++) { + page_pool_destroy(priv->rq[i].page_pool); + priv->rq[i].page_pool = NULL; + } } static void veth_napi_del(struct net_device *dev) @@ -1135,14 +1177,6 @@ static int veth_enable_xdp(struct net_device *dev) veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); return err; } - - if (!veth_gro_requested(dev)) { - /* user-space did not require GRO, but adding XDP - * is supposed to get GRO working - */ - dev->features |= NETIF_F_GRO; - netdev_features_change(dev); - } } } @@ -1162,18 +1196,9 @@ static void veth_disable_xdp(struct net_device *dev) for (i = 0; i < dev->real_num_rx_queues; i++) rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); - if (!netif_running(dev) || !veth_gro_requested(dev)) { + if (!netif_running(dev) || !veth_gro_requested(dev)) veth_napi_del(dev); - /* if user-space did not require GRO, since adding XDP - * enabled it, clear it now - */ - if (!veth_gro_requested(dev) && netif_running(dev)) { - dev->features &= ~NETIF_F_GRO; - netdev_features_change(dev); - } - } - veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); } @@ -1248,6 +1273,27 @@ static int veth_enable_range_safe(struct net_device *dev, int start, int end) return 0; } +static void veth_set_xdp_features(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + struct net_device *peer; + + peer = rtnl_dereference(priv->peer); + if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { + struct veth_priv *priv_peer = netdev_priv(peer); + xdp_features_t val = NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_RX_SG; + + if (priv_peer->_xdp_prog || veth_gro_requested(peer)) + val |= NETDEV_XDP_ACT_NDO_XMIT | + NETDEV_XDP_ACT_NDO_XMIT_SG; + xdp_set_features_flag(dev, val); + } else { + xdp_clear_features_flag(dev); + } +} + static int veth_set_channels(struct net_device *dev, struct ethtool_channels *ch) { @@ -1278,7 +1324,7 @@ static int veth_set_channels(struct net_device *dev, if (peer) netif_carrier_off(peer); - /* try to allocate new resurces, as needed*/ + /* try to allocate new resources, as needed*/ err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); if (err) goto out; @@ -1314,6 +1360,12 @@ out: if (peer) netif_carrier_on(peer); } + + /* update XDP supported features */ + veth_set_xdp_features(dev); + if (peer) + veth_set_xdp_features(peer); + return err; revert: @@ -1346,6 +1398,8 @@ static int veth_open(struct net_device *dev) netif_carrier_on(peer); } + veth_set_xdp_features(dev); + return 0; } @@ -1376,7 +1430,8 @@ static int veth_alloc_queues(struct net_device *dev) struct veth_priv *priv = netdev_priv(dev); int i; - priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT); + priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq), + GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!priv->rq) return -ENOMEM; @@ -1392,30 +1447,18 @@ static void veth_free_queues(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); - kfree(priv->rq); + kvfree(priv->rq); } static int veth_dev_init(struct net_device *dev) { - int err; - - dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); - if (!dev->lstats) - return -ENOMEM; - - err = veth_alloc_queues(dev); - if (err) { - free_percpu(dev->lstats); - return err; - } - - return 0; + netdev_lockdep_set_classes(dev); + return veth_alloc_queues(dev); } static void veth_dev_free(struct net_device *dev) { veth_free_queues(dev); - free_percpu(dev->lstats); } #ifdef CONFIG_NET_POLL_CONTROLLER @@ -1441,7 +1484,7 @@ static int veth_get_iflink(const struct net_device *dev) rcu_read_lock(); peer = rcu_dereference(priv->peer); - iflink = peer ? peer->ifindex : 0; + iflink = peer ? READ_ONCE(peer->ifindex) : 0; rcu_read_unlock(); return iflink; @@ -1460,8 +1503,6 @@ static netdev_features_t veth_fix_features(struct net_device *dev, if (peer_priv->_xdp_prog) features &= ~NETIF_F_GSO_SOFTWARE; } - if (priv->_xdp_prog) - features |= NETIF_F_GRO; return features; } @@ -1471,16 +1512,23 @@ static int veth_set_features(struct net_device *dev, { netdev_features_t changed = features ^ dev->features; struct veth_priv *priv = netdev_priv(dev); + struct net_device *peer; int err; if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) return 0; + peer = rtnl_dereference(priv->peer); if (features & NETIF_F_GRO) { err = veth_napi_enable(dev); if (err) return err; + + if (peer) + xdp_features_set_redirect_target(peer, true); } else { + if (peer) + xdp_features_clear_redirect_target(peer); veth_napi_del(dev); } return 0; @@ -1561,10 +1609,15 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; peer->max_mtu = max_mtu; } + + xdp_features_set_redirect_target(peer, true); } if (old_prog) { if (!prog) { + if (peer && !veth_gro_requested(dev)) + xdp_features_clear_redirect_target(peer); + if (dev->flags & IFF_UP) veth_disable_xdp(dev); @@ -1596,6 +1649,50 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } +static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) +{ + struct veth_xdp_buff *_ctx = (void *)ctx; + + if (!_ctx->skb) + return -ENODATA; + + *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp; + return 0; +} + +static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) +{ + struct veth_xdp_buff *_ctx = (void *)ctx; + struct sk_buff *skb = _ctx->skb; + + if (!skb) + return -ENODATA; + + *hash = skb_get_hash(skb); + *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE; + + return 0; +} + +static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto, + u16 *vlan_tci) +{ + const struct veth_xdp_buff *_ctx = (void *)ctx; + const struct sk_buff *skb = _ctx->skb; + int err; + + if (!skb) + return -ENODATA; + + err = __vlan_hwaccel_get_tag(skb, vlan_tci); + if (err) + return err; + + *vlan_proto = skb->vlan_proto; + return err; +} + static const struct net_device_ops veth_netdev_ops = { .ndo_init = veth_dev_init, .ndo_open = veth_open, @@ -1617,6 +1714,12 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_get_peer_dev = veth_peer_dev, }; +static const struct xdp_metadata_ops veth_xdp_metadata_ops = { + .xmo_rx_timestamp = veth_xdp_rx_timestamp, + .xmo_rx_hash = veth_xdp_rx_hash, + .xmo_rx_vlan_tag = veth_xdp_rx_vlan_tag, +}; + #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ @@ -1631,10 +1734,12 @@ static void veth_setup(struct net_device *dev) dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_PHONY_HEADROOM; + dev->priv_flags |= IFF_DISABLE_NETPOLL; + dev->lltx = true; dev->netdev_ops = &veth_netdev_ops; + dev->xdp_metadata_ops = &veth_xdp_metadata_ops; dev->ethtool_ops = &veth_ethtool_ops; - dev->features |= NETIF_F_LLTX; dev->features |= VETH_FEATURES; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | @@ -1643,6 +1748,7 @@ static void veth_setup(struct net_device *dev) NETIF_F_HW_VLAN_STAG_RX); dev->needs_free_netdev = true; dev->priv_destructor = veth_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->max_mtu = ETH_MAX_MTU; dev->hw_features = VETH_FEATURES; @@ -1697,10 +1803,13 @@ static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) return 0; } -static int veth_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int veth_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *peer_net = rtnl_newlink_peer_net(params); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; int err; struct net_device *peer; struct veth_priv *priv; @@ -1708,27 +1817,15 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; unsigned char name_assign_type; struct ifinfomsg *ifmp; - struct net *net; /* * create and register peer first */ - if (data != NULL && data[VETH_INFO_PEER] != NULL) { - struct nlattr *nla_peer; + if (data && data[VETH_INFO_PEER]) { + struct nlattr *nla_peer = data[VETH_INFO_PEER]; - nla_peer = data[VETH_INFO_PEER]; ifmp = nla_data(nla_peer); - err = rtnl_nla_parse_ifla(peer_tb, - nla_data(nla_peer) + sizeof(struct ifinfomsg), - nla_len(nla_peer) - sizeof(struct ifinfomsg), - NULL); - if (err < 0) - return err; - - err = veth_validate(peer_tb, NULL, extack); - if (err < 0) - return err; - + rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); tbp = peer_tb; } else { ifmp = NULL; @@ -1743,16 +1840,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, name_assign_type = NET_NAME_ENUM; } - net = rtnl_link_get_net(src_net, tbp); - if (IS_ERR(net)) - return PTR_ERR(net); - - peer = rtnl_create_link(net, ifname, name_assign_type, + peer = rtnl_create_link(peer_net, ifname, name_assign_type, &veth_link_ops, tbp, extack); - if (IS_ERR(peer)) { - put_net(net); + if (IS_ERR(peer)) return PTR_ERR(peer); - } if (!ifmp || !tbp[IFLA_ADDRESS]) eth_hw_addr_random(peer); @@ -1763,8 +1854,6 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, netif_inherit_tso_max(peer, dev); err = register_netdevice(peer); - put_net(net); - net = NULL; if (err < 0) goto err_register_peer; @@ -1816,6 +1905,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, goto err_queues; veth_disable_gro(dev); + /* update XDP supported features */ + veth_set_xdp_features(dev); + veth_set_xdp_features(peer); + return 0; err_queues: @@ -1883,6 +1976,7 @@ static struct rtnl_link_ops veth_link_ops = { .newlink = veth_newlink, .dellink = veth_dellink, .policy = veth_policy, + .peer_type = VETH_INFO_PEER, .maxtype = VETH_INFO_MAX, .get_link_net = veth_get_link_net, .get_num_tx_queues = veth_get_num_queues, |
