Diffstat (limited to 'drivers/net/veth.c')
| -rw-r--r-- | drivers/net/veth.c | 1190 |
1 file changed, 854 insertions, 336 deletions
diff --git a/drivers/net/veth.c b/drivers/net/veth.c index b594f03eeddb..14e6f2a2fb77 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -17,6 +17,7 @@ #include <net/rtnetlink.h> #include <net/dst.h> +#include <net/netdev_lock.h> #include <net/xfrm.h> #include <net/xdp.h> #include <linux/veth.h> @@ -26,6 +27,8 @@ #include <linux/ptr_ring.h> #include <linux/bpf_trace.h> #include <linux/net_tstamp.h> +#include <linux/skbuff_ref.h> +#include <net/page_pool/helpers.h> #define DRV_NAME "veth" #define DRV_VERSION "1.0" @@ -35,6 +38,7 @@ #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) #define VETH_XDP_TX_BULK_SIZE 16 +#define VETH_XDP_BATCH 16 struct veth_stats { u64 rx_drops; @@ -56,6 +60,7 @@ struct veth_rq_stats { struct veth_rq { struct napi_struct xdp_napi; + struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ struct net_device *dev; struct bpf_prog __rcu *xdp_prog; struct xdp_mem_info xdp_mem; @@ -63,6 +68,7 @@ struct veth_rq { bool rx_notify_masked; struct ptr_ring xdp_ring; struct xdp_rxq_info xdp_rxq; + struct page_pool *page_pool; }; struct veth_priv { @@ -114,6 +120,11 @@ static struct { { "peer_ifindex" }, }; +struct veth_xdp_buff { + struct xdp_buff xdp; + struct sk_buff *skb; +}; + static int veth_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { @@ -126,35 +137,30 @@ static int veth_get_link_ksettings(struct net_device *dev, static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); - strlcpy(info->version, DRV_VERSION, sizeof(info->version)); + strscpy(info->driver, DRV_NAME, sizeof(info->driver)); + strscpy(info->version, DRV_VERSION, sizeof(info->version)); } static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) { - char *p = (char *)buf; + u8 *p = buf; int i, j; switch(stringset) { case ETH_SS_STATS: memcpy(p, ðtool_stats_keys, sizeof(ethtool_stats_keys)); p += sizeof(ethtool_stats_keys); - for (i = 0; i < dev->real_num_rx_queues; i++) { - for (j = 0; j < VETH_RQ_STATS_LEN; j++) { - snprintf(p, ETH_GSTRING_LEN, - "rx_queue_%u_%.18s", - i, veth_rq_stats_desc[j].desc); - p += ETH_GSTRING_LEN; - } - } - for (i = 0; i < dev->real_num_tx_queues; i++) { - for (j = 0; j < VETH_TQ_STATS_LEN; j++) { - snprintf(p, ETH_GSTRING_LEN, - "tx_queue_%u_%.18s", - i, veth_tq_stats_desc[j].desc); - p += ETH_GSTRING_LEN; - } - } + for (i = 0; i < dev->real_num_rx_queues; i++) + for (j = 0; j < VETH_RQ_STATS_LEN; j++) + ethtool_sprintf(&p, "rx_queue_%u_%.18s", + i, veth_rq_stats_desc[j].desc); + + for (i = 0; i < dev->real_num_tx_queues; i++) + for (j = 0; j < VETH_TQ_STATS_LEN; j++) + ethtool_sprintf(&p, "tx_queue_%u_%.18s", + i, veth_tq_stats_desc[j].desc); + + page_pool_ethtool_stats_get_strings(p); break; } } @@ -165,18 +171,35 @@ static int veth_get_sset_count(struct net_device *dev, int sset) case ETH_SS_STATS: return ARRAY_SIZE(ethtool_stats_keys) + VETH_RQ_STATS_LEN * dev->real_num_rx_queues + - VETH_TQ_STATS_LEN * dev->real_num_tx_queues; + VETH_TQ_STATS_LEN * dev->real_num_tx_queues + + page_pool_ethtool_stats_get_count(); default: return -EOPNOTSUPP; } } +static void veth_get_page_pool_stats(struct net_device *dev, u64 *data) +{ +#ifdef CONFIG_PAGE_POOL_STATS + struct veth_priv *priv = netdev_priv(dev); + struct page_pool_stats pp_stats = {}; + int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + if (!priv->rq[i].page_pool) + continue; + page_pool_get_stats(priv->rq[i].page_pool, 
&pp_stats); + } + page_pool_ethtool_stats_get(data, &pp_stats); +#endif /* CONFIG_PAGE_POOL_STATS */ +} + static void veth_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); - int i, j, idx; + int i, j, idx, pp_idx; data[0] = peer ? peer->ifindex : 0; idx = 1; @@ -187,17 +210,18 @@ static void veth_get_ethtool_stats(struct net_device *dev, size_t offset; do { - start = u64_stats_fetch_begin_irq(&rq_stats->syncp); + start = u64_stats_fetch_begin(&rq_stats->syncp); for (j = 0; j < VETH_RQ_STATS_LEN; j++) { offset = veth_rq_stats_desc[j].offset; data[idx + j] = *(u64 *)(stats_base + offset); } - } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); + } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); idx += VETH_RQ_STATS_LEN; } + pp_idx = idx; if (!peer) - return; + goto page_pool_stats; rcv_priv = netdev_priv(peer); for (i = 0; i < peer->real_num_rx_queues; i++) { @@ -208,15 +232,31 @@ static void veth_get_ethtool_stats(struct net_device *dev, tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN; do { - start = u64_stats_fetch_begin_irq(&rq_stats->syncp); + start = u64_stats_fetch_begin(&rq_stats->syncp); for (j = 0; j < VETH_TQ_STATS_LEN; j++) { offset = veth_tq_stats_desc[j].offset; data[tx_idx + j] += *(u64 *)(base + offset); } - } while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start)); + } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); } + pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; + +page_pool_stats: + veth_get_page_pool_stats(dev, &data[pp_idx]); +} + +static void veth_get_channels(struct net_device *dev, + struct ethtool_channels *channels) +{ + channels->tx_count = dev->real_num_tx_queues; + channels->rx_count = dev->real_num_rx_queues; + channels->max_tx = dev->num_tx_queues; + channels->max_rx = dev->num_rx_queues; } +static int veth_set_channels(struct net_device *dev, + struct ethtool_channels *ch); + static const struct ethtool_ops veth_ethtool_ops = { .get_drvinfo = veth_get_drvinfo, .get_link = ethtool_op_get_link, @@ -225,6 +265,8 @@ static const struct ethtool_ops veth_ethtool_ops = { .get_ethtool_stats = veth_get_ethtool_stats, .get_link_ksettings = veth_get_link_ksettings, .get_ts_info = ethtool_op_get_ts_info, + .get_channels = veth_get_channels, + .set_channels = veth_set_channels, }; /* general routines */ @@ -234,14 +276,14 @@ static bool veth_is_xdp_frame(void *ptr) return (unsigned long)ptr & VETH_XDP_FLAG; } -static void *veth_ptr_to_xdp(void *ptr) +static struct xdp_frame *veth_ptr_to_xdp(void *ptr) { return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); } -static void *veth_xdp_to_ptr(void *ptr) +static void *veth_xdp_to_ptr(struct xdp_frame *xdp) { - return (void *)((unsigned long)ptr | VETH_XDP_FLAG); + return (void *)((unsigned long)xdp | VETH_XDP_FLAG); } static void veth_ptr_free(void *ptr) @@ -256,20 +298,19 @@ static void __veth_xdp_flush(struct veth_rq *rq) { /* Write ptr_ring before reading rx_notify_masked */ smp_mb(); - if (!rq->rx_notify_masked) { - rq->rx_notify_masked = true; - napi_schedule(&rq->xdp_napi); + if (!READ_ONCE(rq->rx_notify_masked) && + napi_schedule_prep(&rq->xdp_napi)) { + WRITE_ONCE(rq->rx_notify_masked, true); + __napi_schedule(&rq->xdp_napi); } } static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) { - if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { - dev_kfree_skb_any(skb); - return NET_RX_DROP; - } + if 
(unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) + return NETDEV_TX_BUSY; /* signal qdisc layer */ - return NET_RX_SUCCESS; + return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */ } static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, @@ -277,21 +318,41 @@ static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, { return __dev_forward_skb(dev, skb) ?: xdp ? veth_xdp_rx(rq, skb) : - netif_rx(skb); + __netif_rx(skb); +} + +/* return true if the specified skb has chances of GRO aggregation + * Don't strive for accuracy, but try to avoid GRO overhead in the most + * common scenarios. + * When XDP is enabled, all traffic is considered eligible, as the xmit + * device has TSO off. + * When TSO is enabled on the xmit device, we are likely interested only + * in UDP aggregation, explicitly check for that if the skb is suspected + * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - + * to belong to locally generated UDP traffic. + */ +static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, + const struct net_device *rcv, + const struct sk_buff *skb) +{ + return !(dev->features & NETIF_F_ALL_TSO) || + (skb->destructor == sock_wfree && + rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); } static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); struct veth_rq *rq = NULL; + struct netdev_queue *txq; struct net_device *rcv; int length = skb->len; - bool rcv_xdp = false; - int rxq; + bool use_napi = false; + int ret, rxq; rcu_read_lock(); rcv = rcu_dereference(priv->peer); - if (unlikely(!rcv)) { + if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) { kfree_skb(skb); goto drop; } @@ -300,34 +361,56 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) rxq = skb_get_queue_mapping(skb); if (rxq < rcv->real_num_rx_queues) { rq = &rcv_priv->rq[rxq]; - rcv_xdp = rcu_access_pointer(rq->xdp_prog); - if (rcv_xdp) - skb_record_rx_queue(skb, rxq); + + /* The napi pointer is available when an XDP program is + * attached or when GRO is enabled + * Don't bother with napi/GRO if the skb can't be aggregated + */ + use_napi = rcu_access_pointer(rq->napi) && + veth_skb_is_eligible_for_gro(dev, rcv, skb); } skb_tx_timestamp(skb); - if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { - if (!rcv_xdp) - dev_lstats_add(dev, length); - } else { + + ret = veth_forward_skb(rcv, skb, rq, use_napi); + switch (ret) { + case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */ + if (!use_napi) + dev_sw_netstats_tx_add(dev, 1, length); + else + __veth_xdp_flush(rq); + break; + case NETDEV_TX_BUSY: + /* If a qdisc is attached to our virtual device, returning + * NETDEV_TX_BUSY is allowed. + */ + txq = netdev_get_tx_queue(dev, rxq); + + if (qdisc_txq_has_no_queue(txq)) { + dev_kfree_skb_any(skb); + goto drop; + } + /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */ + __skb_push(skb, ETH_HLEN); + netif_tx_stop_queue(txq); + /* Makes sure NAPI peer consumer runs. Consumer is responsible + * for starting txq again, until then ndo_start_xmit (this + * function) will not be invoked by the netstack again. 
+ */ + __veth_xdp_flush(rq); + break; + case NET_RX_DROP: /* same as NET_XMIT_DROP */ drop: atomic64_inc(&priv->dropped); + ret = NET_XMIT_DROP; + break; + default: + net_crit_ratelimited("%s(%s): Invalid return code(%d)", + __func__, dev->name, ret); } - - if (rcv_xdp) - __veth_xdp_flush(rq); - rcu_read_unlock(); - return NETDEV_TX_OK; -} - -static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) -{ - struct veth_priv *priv = netdev_priv(dev); - - dev_lstats_read(dev, packets, bytes); - return atomic64_read(&priv->dropped); + return ret; } static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) @@ -346,13 +429,13 @@ static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) unsigned int start; do { - start = u64_stats_fetch_begin_irq(&stats->syncp); + start = u64_stats_fetch_begin(&stats->syncp); peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err; xdp_tx_err = stats->vs.xdp_tx_err; packets = stats->vs.xdp_packets; bytes = stats->vs.xdp_bytes; drops = stats->vs.rx_drops; - } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + } while (u64_stats_fetch_retry(&stats->syncp, start)); result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err; result->xdp_tx_err += xdp_tx_err; result->xdp_packets += packets; @@ -367,24 +450,24 @@ static void veth_get_stats64(struct net_device *dev, struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; struct veth_stats rx; - u64 packets, bytes; - tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); - tot->tx_bytes = bytes; - tot->tx_packets = packets; + tot->tx_dropped = atomic64_read(&priv->dropped); + dev_fetch_sw_netstats(tot, dev->tstats); veth_stats_rx(&rx, dev); tot->tx_dropped += rx.xdp_tx_err; tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; - tot->rx_bytes = rx.xdp_bytes; - tot->rx_packets = rx.xdp_packets; + tot->rx_bytes += rx.xdp_bytes; + tot->rx_packets += rx.xdp_packets; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (peer) { - veth_stats_tx(peer, &packets, &bytes); - tot->rx_bytes += bytes; - tot->rx_packets += packets; + struct rtnl_link_stats64 tot_peer = {}; + + dev_fetch_sw_netstats(&tot_peer, peer->tstats); + tot->rx_bytes += tot_peer.tx_bytes; + tot->rx_packets += tot_peer.tx_packets; veth_stats_rx(&rx, peer); tot->tx_dropped += rx.peer_tq_xdp_xmit_err; @@ -400,24 +483,17 @@ static void veth_set_multicast_list(struct net_device *dev) { } -static struct sk_buff *veth_build_skb(void *head, int headroom, int len, - int buflen) +static int veth_select_rxq(struct net_device *dev) { - struct sk_buff *skb; - - skb = build_skb(head, buflen); - if (!skb) - return NULL; - - skb_reserve(skb, headroom); - skb_put(skb, len); - - return skb; + return smp_processor_id() % dev->real_num_rx_queues; } -static int veth_select_rxq(struct net_device *dev) +static struct net_device *veth_peer_dev(struct net_device *dev) { - return smp_processor_id() % dev->real_num_rx_queues; + struct veth_priv *priv = netdev_priv(dev); + + /* Callers must be under RCU read side. 
*/ + return rcu_dereference(priv->peer); } static int veth_xdp_xmit(struct net_device *dev, int n, @@ -425,7 +501,7 @@ static int veth_xdp_xmit(struct net_device *dev, int n, u32 flags, bool ndo_xmit) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); - int i, ret = -ENXIO, drops = 0; + int i, ret = -ENXIO, nxmit = 0; struct net_device *rcv; unsigned int max_len; struct veth_rq *rq; @@ -440,11 +516,10 @@ static int veth_xdp_xmit(struct net_device *dev, int n, rcv_priv = netdev_priv(rcv); rq = &rcv_priv->rq[veth_select_rxq(rcv)]; - /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive - * side. This means an XDP program is loaded on the peer and the peer - * device is up. + /* The napi pointer is set if NAPI is enabled, which ensures that + * xdp_ring is initialized on receive side and the peer device is up. */ - if (!rcu_access_pointer(rq->xdp_prog)) + if (!rcu_access_pointer(rq->napi)) goto out; max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; @@ -454,22 +529,21 @@ static int veth_xdp_xmit(struct net_device *dev, int n, struct xdp_frame *frame = frames[i]; void *ptr = veth_xdp_to_ptr(frame); - if (unlikely(frame->len > max_len || - __ptr_ring_produce(&rq->xdp_ring, ptr))) { - xdp_return_frame_rx_napi(frame); - drops++; - } + if (unlikely(xdp_get_frame_len(frame) > max_len || + __ptr_ring_produce(&rq->xdp_ring, ptr))) + break; + nxmit++; } spin_unlock(&rq->xdp_ring.producer_lock); if (flags & XDP_XMIT_FLUSH) __veth_xdp_flush(rq); - ret = n - drops; + ret = nxmit; if (ndo_xmit) { u64_stats_update_begin(&rq->stats.syncp); - rq->stats.vs.peer_tq_xdp_xmit += n - drops; - rq->stats.vs.peer_tq_xdp_xmit_err += drops; + rq->stats.vs.peer_tq_xdp_xmit += nxmit; + rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit; u64_stats_update_end(&rq->stats.syncp); } @@ -496,20 +570,23 @@ static int veth_ndo_xdp_xmit(struct net_device *dev, int n, static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq) { - int sent, i, err = 0; + int sent, i, err = 0, drops; sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false); if (sent < 0) { err = sent; sent = 0; - for (i = 0; i < bq->count; i++) - xdp_return_frame(bq->q[i]); } - trace_xdp_bulk_tx(rq->dev, sent, bq->count - sent, err); + + for (i = sent; unlikely(i < bq->count); i++) + xdp_return_frame(bq->q[i]); + + drops = bq->count - sent; + trace_xdp_bulk_tx(rq->dev, sent, drops, err); u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.xdp_tx += sent; - rq->stats.vs.xdp_tx_err += bq->count - sent; + rq->stats.vs.xdp_tx_err += drops; u64_stats_update_end(&rq->stats.syncp); bq->count = 0; @@ -554,41 +631,36 @@ static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp, return 0; } -static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, - struct xdp_frame *frame, - struct veth_xdp_tx_bq *bq, - struct veth_stats *stats) +static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, + struct xdp_frame *frame, + struct veth_xdp_tx_bq *bq, + struct veth_stats *stats) { - void *hard_start = frame->data - frame->headroom; - int len = frame->len, delta = 0; struct xdp_frame orig_frame; struct bpf_prog *xdp_prog; - unsigned int headroom; - struct sk_buff *skb; - - /* bpf_xdp_adjust_head() assures BPF cannot access xdp_frame area */ - hard_start -= sizeof(struct xdp_frame); rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (likely(xdp_prog)) { - struct xdp_buff xdp; + struct veth_xdp_buff vxbuf; + struct xdp_buff *xdp = &vxbuf.xdp; u32 act; - xdp_convert_frame_to_buff(frame, &xdp); - xdp.rxq = &rq->xdp_rxq; + 
xdp_convert_frame_to_buff(frame, xdp); + xdp->rxq = &rq->xdp_rxq; + vxbuf.skb = NULL; - act = bpf_prog_run_xdp(xdp_prog, &xdp); + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: - delta = frame->data - xdp.data; - len = xdp.data_end - xdp.data; + if (xdp_update_frame_from_buff(xdp, frame)) + goto err_xdp; break; case XDP_TX: orig_frame = *frame; - xdp.rxq->mem = frame->mem; - if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { + xdp->rxq->mem.type = frame->mem_type; + if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); frame = &orig_frame; stats->rx_drops++; @@ -599,8 +671,8 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, goto xdp_xmit; case XDP_REDIRECT: orig_frame = *frame; - xdp.rxq->mem = frame->mem; - if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { + xdp->rxq->mem.type = frame->mem_type; + if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { frame = &orig_frame; stats->rx_drops++; goto err_xdp; @@ -609,11 +681,11 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, rcu_read_unlock(); goto xdp_xmit; default: - bpf_warn_invalid_xdp_action(act); - /* fall through */ + bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); + fallthrough; case XDP_ABORTED: trace_xdp_exception(rq->dev, xdp_prog, act); - /* fall through */ + fallthrough; case XDP_DROP: stats->xdp_drops++; goto err_xdp; @@ -621,19 +693,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, } rcu_read_unlock(); - headroom = sizeof(struct xdp_frame) + frame->headroom - delta; - skb = veth_build_skb(hard_start, headroom, len, frame->frame_sz); - if (!skb) { - xdp_return_frame(frame); - stats->rx_drops++; - goto err; - } - - xdp_release_frame(frame); - xdp_scrub_frame(frame); - skb->protocol = eth_type_trans(skb, rq->dev); -err: - return skb; + return frame; err_xdp: rcu_read_unlock(); xdp_return_frame(frame); @@ -641,18 +701,101 @@ xdp_xmit: return NULL; } +/* frames array contains VETH_XDP_BATCH at most */ +static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, + int n_xdpf, struct veth_xdp_tx_bq *bq, + struct veth_stats *stats) +{ + void *skbs[VETH_XDP_BATCH]; + int i; + + if (unlikely(!napi_skb_cache_get_bulk(skbs, n_xdpf))) { + for (i = 0; i < n_xdpf; i++) + xdp_return_frame(frames[i]); + stats->rx_drops += n_xdpf; + + return; + } + + for (i = 0; i < n_xdpf; i++) { + struct sk_buff *skb = skbs[i]; + + skb = __xdp_build_skb_from_frame(frames[i], skb, + rq->dev); + if (!skb) { + xdp_return_frame(frames[i]); + stats->rx_drops++; + continue; + } + napi_gro_receive(&rq->xdp_napi, skb); + } +} + +static void veth_xdp_get(struct xdp_buff *xdp) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i; + + get_page(virt_to_page(xdp->data)); + if (likely(!xdp_buff_has_frags(xdp))) + return; + + for (i = 0; i < sinfo->nr_frags; i++) + __skb_frag_ref(&sinfo->frags[i]); +} + +static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, + struct xdp_buff *xdp, + struct sk_buff **pskb) +{ + struct sk_buff *skb = *pskb; + u32 frame_sz; + + if (skb_shared(skb) || skb_head_is_locked(skb) || + skb_shinfo(skb)->nr_frags || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM)) + goto drop; + + skb = *pskb; + } + + /* SKB "head" area always have tailroom for skb_shared_info */ + frame_sz = skb_end_pointer(skb) - skb->head; + frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); + xdp_prepare_buff(xdp, skb->head, 
skb_headroom(skb), + skb_headlen(skb), true); + + if (skb_is_nonlinear(skb)) { + skb_shinfo(skb)->xdp_frags_size = skb->data_len; + xdp_buff_set_frags_flag(xdp); + } else { + xdp_buff_clear_frags_flag(xdp); + } + *pskb = skb; + + return 0; +drop: + consume_skb(skb); + *pskb = NULL; + + return -ENOMEM; +} + static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { - u32 pktlen, headroom, act, metalen; void *orig_data, *orig_data_end; struct bpf_prog *xdp_prog; - int mac_len, delta, off; - struct xdp_buff xdp; + struct veth_xdp_buff vxbuf; + struct xdp_buff *xdp = &vxbuf.xdp; + u32 act, metalen; + int off; - skb_orphan(skb); + skb_prepare_for_gro(skb); rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); @@ -661,70 +804,24 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, goto out; } - mac_len = skb->data - skb_mac_header(skb); - pktlen = skb->len + mac_len; - headroom = skb_headroom(skb) - mac_len; - - if (skb_shared(skb) || skb_head_is_locked(skb) || - skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { - struct sk_buff *nskb; - int size, head_off; - void *head, *start; - struct page *page; - - size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - if (size > PAGE_SIZE) - goto drop; - - page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); - if (!page) - goto drop; - - head = page_address(page); - start = head + VETH_XDP_HEADROOM; - if (skb_copy_bits(skb, -mac_len, start, pktlen)) { - page_frag_free(head); - goto drop; - } - - nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len, - skb->len, PAGE_SIZE); - if (!nskb) { - page_frag_free(head); - goto drop; - } - - skb_copy_header(nskb, skb); - head_off = skb_headroom(nskb) - skb_headroom(skb); - skb_headers_offset_update(nskb, head_off); - consume_skb(skb); - skb = nskb; - } - - xdp.data_hard_start = skb->head; - xdp.data = skb_mac_header(skb); - xdp.data_end = xdp.data + pktlen; - xdp.data_meta = xdp.data; - xdp.rxq = &rq->xdp_rxq; - - /* SKB "head" area always have tailroom for skb_shared_info */ - xdp.frame_sz = (void *)skb_end_pointer(skb) - xdp.data_hard_start; - xdp.frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + __skb_push(skb, skb->data - skb_mac_header(skb)); + if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb)) + goto drop; + vxbuf.skb = skb; - orig_data = xdp.data; - orig_data_end = xdp.data_end; + orig_data = xdp->data; + orig_data_end = xdp->data_end; - act = bpf_prog_run_xdp(xdp_prog, &xdp); + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: break; case XDP_TX: - get_page(virt_to_page(xdp.data)); + veth_xdp_get(xdp); consume_skb(skb); - xdp.rxq->mem = rq->xdp_mem; - if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { + xdp->rxq->mem = rq->xdp_mem; + if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); stats->rx_drops++; goto err_xdp; @@ -733,10 +830,10 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, rcu_read_unlock(); goto xdp_xmit; case XDP_REDIRECT: - get_page(virt_to_page(xdp.data)); + veth_xdp_get(xdp); consume_skb(skb); - xdp.rxq->mem = rq->xdp_mem; - if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { + xdp->rxq->mem = rq->xdp_mem; + if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { stats->rx_drops++; goto err_xdp; } @@ -744,11 +841,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, rcu_read_unlock(); goto xdp_xmit; default: - bpf_warn_invalid_xdp_action(act); - /* fall through */ + 
bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); + fallthrough; case XDP_ABORTED: trace_xdp_exception(rq->dev, xdp_prog, act); - /* fall through */ + fallthrough; case XDP_DROP: stats->xdp_drops++; goto xdp_drop; @@ -756,21 +853,30 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, rcu_read_unlock(); /* check if bpf_xdp_adjust_head was used */ - delta = orig_data - xdp.data; - off = mac_len + delta; + off = orig_data - xdp->data; if (off > 0) __skb_push(skb, off); else if (off < 0) __skb_pull(skb, -off); - skb->mac_header -= delta; + + skb_reset_mac_header(skb); /* check if bpf_xdp_adjust_tail was used */ - off = xdp.data_end - orig_data_end; + off = xdp->data_end - orig_data_end; if (off != 0) __skb_put(skb, off); /* positive on grow, negative on shrink */ + + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. + */ + if (xdp_buff_has_frags(xdp)) + skb->data_len = skb_shinfo(skb)->xdp_frags_size; + else + skb->data_len = 0; + skb->protocol = eth_type_trans(skb, rq->dev); - metalen = xdp.data - xdp.data_meta; + metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); out: @@ -783,7 +889,7 @@ xdp_drop: return NULL; err_xdp: rcu_read_unlock(); - page_frag_free(xdp.data); + xdp_return_buff(xdp); xdp_xmit: return NULL; } @@ -792,32 +898,49 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { - int i, done = 0; + int i, done = 0, n_xdpf = 0; + void *xdpf[VETH_XDP_BATCH]; for (i = 0; i < budget; i++) { void *ptr = __ptr_ring_consume(&rq->xdp_ring); - struct sk_buff *skb; if (!ptr) break; if (veth_is_xdp_frame(ptr)) { + /* ndo_xdp_xmit */ struct xdp_frame *frame = veth_ptr_to_xdp(ptr); - stats->xdp_bytes += frame->len; - skb = veth_xdp_rcv_one(rq, frame, bq, stats); + stats->xdp_bytes += xdp_get_frame_len(frame); + frame = veth_xdp_rcv_one(rq, frame, bq, stats); + if (frame) { + /* XDP_PASS */ + xdpf[n_xdpf++] = frame; + if (n_xdpf == VETH_XDP_BATCH) { + veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, + bq, stats); + n_xdpf = 0; + } + } } else { - skb = ptr; + /* ndo_start_xmit */ + struct sk_buff *skb = ptr; + stats->xdp_bytes += skb->len; skb = veth_xdp_rcv_skb(rq, skb, bq, stats); + if (skb) { + if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC)) + netif_receive_skb(skb); + else + napi_gro_receive(&rq->xdp_napi, skb); + } } - - if (skb) - napi_gro_receive(&rq->xdp_napi, skb); - done++; } + if (n_xdpf) + veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats); + u64_stats_update_begin(&rq->stats.syncp); rq->stats.vs.xdp_redirect += stats->xdp_redirect; rq->stats.vs.xdp_bytes += stats->xdp_bytes; @@ -833,39 +956,82 @@ static int veth_poll(struct napi_struct *napi, int budget) { struct veth_rq *rq = container_of(napi, struct veth_rq, xdp_napi); + struct veth_priv *priv = netdev_priv(rq->dev); + int queue_idx = rq->xdp_rxq.queue_index; + struct netdev_queue *peer_txq; struct veth_stats stats = {}; + struct net_device *peer_dev; struct veth_xdp_tx_bq bq; int done; bq.count = 0; + /* NAPI functions as RCU section */ + peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); + peer_txq = peer_dev ? 
netdev_get_tx_queue(peer_dev, queue_idx) : NULL; + xdp_set_return_frame_no_direct(); done = veth_xdp_rcv(rq, budget, &bq, &stats); + if (stats.xdp_redirect > 0) + xdp_do_flush(); + if (stats.xdp_tx > 0) + veth_xdp_flush(rq, &bq); + xdp_clear_return_frame_no_direct(); + if (done < budget && napi_complete_done(napi, done)) { /* Write rx_notify_masked before reading ptr_ring */ smp_store_mb(rq->rx_notify_masked, false); if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { - rq->rx_notify_masked = true; - napi_schedule(&rq->xdp_napi); + if (napi_schedule_prep(&rq->xdp_napi)) { + WRITE_ONCE(rq->rx_notify_masked, true); + __napi_schedule(&rq->xdp_napi); + } } } - if (stats.xdp_tx > 0) - veth_xdp_flush(rq, &bq); - if (stats.xdp_redirect > 0) - xdp_do_flush(); - xdp_clear_return_frame_no_direct(); + /* Release backpressure per NAPI poll */ + smp_rmb(); /* Paired with netif_tx_stop_queue set_bit */ + if (peer_txq && netif_tx_queue_stopped(peer_txq)) { + txq_trans_cond_update(peer_txq); + netif_tx_wake_queue(peer_txq); + } return done; } -static int veth_napi_add(struct net_device *dev) +static int veth_create_page_pool(struct veth_rq *rq) +{ + struct page_pool_params pp_params = { + .order = 0, + .pool_size = VETH_RING_SIZE, + .nid = NUMA_NO_NODE, + .dev = &rq->dev->dev, + }; + + rq->page_pool = page_pool_create(&pp_params); + if (IS_ERR(rq->page_pool)) { + int err = PTR_ERR(rq->page_pool); + + rq->page_pool = NULL; + return err; + } + + return 0; +} + +static int __veth_napi_enable_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int err, i; - for (i = 0; i < dev->real_num_rx_queues; i++) { + for (i = start; i < end; i++) { + err = veth_create_page_pool(&priv->rq[i]); + if (err) + goto err_page_pool; + } + + for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); @@ -873,84 +1039,155 @@ static int veth_napi_add(struct net_device *dev) goto err_xdp_ring; } - for (i = 0; i < dev->real_num_rx_queues; i++) { + for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; - netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); napi_enable(&rq->xdp_napi); + rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); } return 0; + err_xdp_ring: - for (i--; i >= 0; i--) + for (i--; i >= start; i--) ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); + i = end; +err_page_pool: + for (i--; i >= start; i--) { + page_pool_destroy(priv->rq[i].page_pool); + priv->rq[i].page_pool = NULL; + } return err; } -static void veth_napi_del(struct net_device *dev) +static int __veth_napi_enable(struct net_device *dev) +{ + return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); +} + +static void veth_napi_del_range(struct net_device *dev, int start, int end) { struct veth_priv *priv = netdev_priv(dev); int i; - for (i = 0; i < dev->real_num_rx_queues; i++) { + for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; + rcu_assign_pointer(priv->rq[i].napi, NULL); napi_disable(&rq->xdp_napi); - napi_hash_del(&rq->xdp_napi); + __netif_napi_del(&rq->xdp_napi); } synchronize_net(); - for (i = 0; i < dev->real_num_rx_queues; i++) { + for (i = start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; - netif_napi_del(&rq->xdp_napi); rq->rx_notify_masked = false; ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); } + + for (i = start; i < end; i++) { + page_pool_destroy(priv->rq[i].page_pool); + priv->rq[i].page_pool = NULL; + } } -static int veth_enable_xdp(struct net_device 
*dev) +static void veth_napi_del(struct net_device *dev) { - struct veth_priv *priv = netdev_priv(dev); - int err, i; - - if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { - for (i = 0; i < dev->real_num_rx_queues; i++) { - struct veth_rq *rq = &priv->rq[i]; + veth_napi_del_range(dev, 0, dev->real_num_rx_queues); +} - err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); - if (err < 0) - goto err_rxq_reg; +static bool veth_gro_requested(const struct net_device *dev) +{ + return !!(dev->wanted_features & NETIF_F_GRO); +} - err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, - MEM_TYPE_PAGE_SHARED, - NULL); - if (err < 0) - goto err_reg_mem; +static int veth_enable_xdp_range(struct net_device *dev, int start, int end, + bool napi_already_on) +{ + struct veth_priv *priv = netdev_priv(dev); + int err, i; - /* Save original mem info as it can be overwritten */ - rq->xdp_mem = rq->xdp_rxq.mem; - } + for (i = start; i < end; i++) { + struct veth_rq *rq = &priv->rq[i]; - err = veth_napi_add(dev); - if (err) + if (!napi_already_on) + netif_napi_add(dev, &rq->xdp_napi, veth_poll); + err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); + if (err < 0) goto err_rxq_reg; - } - for (i = 0; i < dev->real_num_rx_queues; i++) - rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, + MEM_TYPE_PAGE_SHARED, + NULL); + if (err < 0) + goto err_reg_mem; + /* Save original mem info as it can be overwritten */ + rq->xdp_mem = rq->xdp_rxq.mem; + } return 0; + err_reg_mem: xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); err_rxq_reg: - for (i--; i >= 0; i--) - xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); + for (i--; i >= start; i--) { + struct veth_rq *rq = &priv->rq[i]; + + xdp_rxq_info_unreg(&rq->xdp_rxq); + if (!napi_already_on) + netif_napi_del(&rq->xdp_napi); + } return err; } +static void veth_disable_xdp_range(struct net_device *dev, int start, int end, + bool delete_napi) +{ + struct veth_priv *priv = netdev_priv(dev); + int i; + + for (i = start; i < end; i++) { + struct veth_rq *rq = &priv->rq[i]; + + rq->xdp_rxq.mem = rq->xdp_mem; + xdp_rxq_info_unreg(&rq->xdp_rxq); + + if (delete_napi) + netif_napi_del(&rq->xdp_napi); + } +} + +static int veth_enable_xdp(struct net_device *dev) +{ + bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); + struct veth_priv *priv = netdev_priv(dev); + int err, i; + + if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { + err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on); + if (err) + return err; + + if (!napi_already_on) { + err = __veth_napi_enable(dev); + if (err) { + veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true); + return err; + } + } + } + + for (i = 0; i < dev->real_num_rx_queues; i++) { + rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); + rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); + } + + return 0; +} + static void veth_disable_xdp(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); @@ -958,13 +1195,183 @@ static void veth_disable_xdp(struct net_device *dev) for (i = 0; i < dev->real_num_rx_queues; i++) rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); - veth_napi_del(dev); - for (i = 0; i < dev->real_num_rx_queues; i++) { + + if (!netif_running(dev) || !veth_gro_requested(dev)) + veth_napi_del(dev); + + veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false); +} + +static int veth_napi_enable_range(struct net_device *dev, int start, int end) +{ + struct veth_priv *priv = netdev_priv(dev); + int err, i; + + for (i = 
start; i < end; i++) { struct veth_rq *rq = &priv->rq[i]; - rq->xdp_rxq.mem = rq->xdp_mem; - xdp_rxq_info_unreg(&rq->xdp_rxq); + netif_napi_add(dev, &rq->xdp_napi, veth_poll); } + + err = __veth_napi_enable_range(dev, start, end); + if (err) { + for (i = start; i < end; i++) { + struct veth_rq *rq = &priv->rq[i]; + + netif_napi_del(&rq->xdp_napi); + } + return err; + } + return err; +} + +static int veth_napi_enable(struct net_device *dev) +{ + return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues); +} + +static void veth_disable_range_safe(struct net_device *dev, int start, int end) +{ + struct veth_priv *priv = netdev_priv(dev); + + if (start >= end) + return; + + if (priv->_xdp_prog) { + veth_napi_del_range(dev, start, end); + veth_disable_xdp_range(dev, start, end, false); + } else if (veth_gro_requested(dev)) { + veth_napi_del_range(dev, start, end); + } +} + +static int veth_enable_range_safe(struct net_device *dev, int start, int end) +{ + struct veth_priv *priv = netdev_priv(dev); + int err; + + if (start >= end) + return 0; + + if (priv->_xdp_prog) { + /* these channels are freshly initialized, napi is not on there even + * when GRO is requeste + */ + err = veth_enable_xdp_range(dev, start, end, false); + if (err) + return err; + + err = __veth_napi_enable_range(dev, start, end); + if (err) { + /* on error always delete the newly added napis */ + veth_disable_xdp_range(dev, start, end, true); + return err; + } + } else if (veth_gro_requested(dev)) { + return veth_napi_enable_range(dev, start, end); + } + return 0; +} + +static void veth_set_xdp_features(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + struct net_device *peer; + + peer = rtnl_dereference(priv->peer); + if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { + struct veth_priv *priv_peer = netdev_priv(peer); + xdp_features_t val = NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_RX_SG; + + if (priv_peer->_xdp_prog || veth_gro_requested(peer)) + val |= NETDEV_XDP_ACT_NDO_XMIT | + NETDEV_XDP_ACT_NDO_XMIT_SG; + xdp_set_features_flag(dev, val); + } else { + xdp_clear_features_flag(dev); + } +} + +static int veth_set_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct veth_priv *priv = netdev_priv(dev); + unsigned int old_rx_count, new_rx_count; + struct veth_priv *peer_priv; + struct net_device *peer; + int err; + + /* sanity check. Upper bounds are already enforced by the caller */ + if (!ch->rx_count || !ch->tx_count) + return -EINVAL; + + /* avoid braking XDP, if that is enabled */ + peer = rtnl_dereference(priv->peer); + peer_priv = peer ? netdev_priv(peer) : NULL; + if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues) + return -EINVAL; + + if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues) + return -EINVAL; + + old_rx_count = dev->real_num_rx_queues; + new_rx_count = ch->rx_count; + if (netif_running(dev)) { + /* turn device off */ + netif_carrier_off(dev); + if (peer) + netif_carrier_off(peer); + + /* try to allocate new resources, as needed*/ + err = veth_enable_range_safe(dev, old_rx_count, new_rx_count); + if (err) + goto out; + } + + err = netif_set_real_num_rx_queues(dev, ch->rx_count); + if (err) + goto revert; + + err = netif_set_real_num_tx_queues(dev, ch->tx_count); + if (err) { + int err2 = netif_set_real_num_rx_queues(dev, old_rx_count); + + /* this error condition could happen only if rx and tx change + * in opposite directions (e.g. 
tx nr raises, rx nr decreases) + * and we can't do anything to fully restore the original + * status + */ + if (err2) + pr_warn("Can't restore rx queues config %d -> %d %d", + new_rx_count, old_rx_count, err2); + else + goto revert; + } + +out: + if (netif_running(dev)) { + /* note that we need to swap the arguments WRT the enable part + * to identify the range we have to disable + */ + veth_disable_range_safe(dev, new_rx_count, old_rx_count); + netif_carrier_on(dev); + if (peer) + netif_carrier_on(peer); + } + + /* update XDP supported features */ + veth_set_xdp_features(dev); + if (peer) + veth_set_xdp_features(peer); + + return err; + +revert: + new_rx_count = old_rx_count; + old_rx_count = ch->rx_count; + goto out; } static int veth_open(struct net_device *dev) @@ -980,6 +1387,10 @@ static int veth_open(struct net_device *dev) err = veth_enable_xdp(dev); if (err) return err; + } else if (veth_gro_requested(dev)) { + err = veth_napi_enable(dev); + if (err) + return err; } if (peer->flags & IFF_UP) { @@ -987,6 +1398,8 @@ static int veth_open(struct net_device *dev) netif_carrier_on(peer); } + veth_set_xdp_features(dev); + return 0; } @@ -1001,6 +1414,8 @@ static int veth_close(struct net_device *dev) if (priv->_xdp_prog) veth_disable_xdp(dev); + else if (veth_gro_requested(dev)) + veth_napi_del(dev); return 0; } @@ -1015,7 +1430,8 @@ static int veth_alloc_queues(struct net_device *dev) struct veth_priv *priv = netdev_priv(dev); int i; - priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); + priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq), + GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!priv->rq) return -ENOMEM; @@ -1031,30 +1447,18 @@ static void veth_free_queues(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); - kfree(priv->rq); + kvfree(priv->rq); } static int veth_dev_init(struct net_device *dev) { - int err; - - dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); - if (!dev->lstats) - return -ENOMEM; - - err = veth_alloc_queues(dev); - if (err) { - free_percpu(dev->lstats); - return err; - } - - return 0; + netdev_lockdep_set_classes(dev); + return veth_alloc_queues(dev); } static void veth_dev_free(struct net_device *dev) { veth_free_queues(dev); - free_percpu(dev->lstats); } #ifdef CONFIG_NET_POLL_CONTROLLER @@ -1080,7 +1484,7 @@ static int veth_get_iflink(const struct net_device *dev) rcu_read_lock(); peer = rcu_dereference(priv->peer); - iflink = peer ? peer->ifindex : 0; + iflink = peer ? 
READ_ONCE(peer->ifindex) : 0; rcu_read_unlock(); return iflink; @@ -1103,6 +1507,33 @@ static netdev_features_t veth_fix_features(struct net_device *dev, return features; } +static int veth_set_features(struct net_device *dev, + netdev_features_t features) +{ + netdev_features_t changed = features ^ dev->features; + struct veth_priv *priv = netdev_priv(dev); + struct net_device *peer; + int err; + + if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) + return 0; + + peer = rtnl_dereference(priv->peer); + if (features & NETIF_F_GRO) { + err = veth_napi_enable(dev); + if (err) + return err; + + if (peer) + xdp_features_set_redirect_target(peer, true); + } else { + if (peer) + xdp_features_clear_redirect_target(peer); + veth_napi_del(dev); + } + return 0; +} + static void veth_set_rx_headroom(struct net_device *dev, int new_hr) { struct veth_priv *peer_priv, *priv = netdev_priv(dev); @@ -1146,9 +1577,14 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, goto err; } - max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - - peer->hard_header_len - - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) - + peer->hard_header_len; + /* Allow increasing the max_mtu if the program supports + * XDP fragments. + */ + if (prog->aux->xdp_has_frags) + max_mtu += PAGE_SIZE * MAX_SKB_FRAGS; + if (peer->mtu > max_mtu) { NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); err = -ERANGE; @@ -1173,10 +1609,15 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; peer->max_mtu = max_mtu; } + + xdp_features_set_redirect_target(peer, true); } if (old_prog) { if (!prog) { + if (peer && !veth_gro_requested(dev)) + xdp_features_clear_redirect_target(peer); + if (dev->flags & IFF_UP) veth_disable_xdp(dev); @@ -1198,31 +1639,60 @@ err: return err; } -static u32 veth_xdp_query(struct net_device *dev) -{ - struct veth_priv *priv = netdev_priv(dev); - const struct bpf_prog *xdp_prog; - - xdp_prog = priv->_xdp_prog; - if (xdp_prog) - return xdp_prog->aux->id; - - return 0; -} - static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return veth_xdp_set(dev, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = veth_xdp_query(dev); - return 0; default: return -EINVAL; } } +static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) +{ + struct veth_xdp_buff *_ctx = (void *)ctx; + + if (!_ctx->skb) + return -ENODATA; + + *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp; + return 0; +} + +static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) +{ + struct veth_xdp_buff *_ctx = (void *)ctx; + struct sk_buff *skb = _ctx->skb; + + if (!skb) + return -ENODATA; + + *hash = skb_get_hash(skb); + *rss_type = skb->l4_hash ? 
XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE; + + return 0; +} + +static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto, + u16 *vlan_tci) +{ + const struct veth_xdp_buff *_ctx = (void *)ctx; + const struct sk_buff *skb = _ctx->skb; + int err; + + if (!skb) + return -ENODATA; + + err = __vlan_hwaccel_get_tag(skb, vlan_tci); + if (err) + return err; + + *vlan_proto = skb->vlan_proto; + return err; +} + static const struct net_device_ops veth_netdev_ops = { .ndo_init = veth_dev_init, .ndo_open = veth_open, @@ -1236,10 +1706,18 @@ static const struct net_device_ops veth_netdev_ops = { #endif .ndo_get_iflink = veth_get_iflink, .ndo_fix_features = veth_fix_features, + .ndo_set_features = veth_set_features, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = veth_set_rx_headroom, .ndo_bpf = veth_xdp, .ndo_xdp_xmit = veth_ndo_xdp_xmit, + .ndo_get_peer_dev = veth_peer_dev, +}; + +static const struct xdp_metadata_ops veth_xdp_metadata_ops = { + .xmo_rx_timestamp = veth_xdp_rx_timestamp, + .xmo_rx_hash = veth_xdp_rx_hash, + .xmo_rx_vlan_tag = veth_xdp_rx_vlan_tag, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ @@ -1256,10 +1734,12 @@ static void veth_setup(struct net_device *dev) dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_PHONY_HEADROOM; + dev->priv_flags |= IFF_DISABLE_NETPOLL; + dev->lltx = true; dev->netdev_ops = &veth_netdev_ops; + dev->xdp_metadata_ops = &veth_xdp_metadata_ops; dev->ethtool_ops = &veth_ethtool_ops; - dev->features |= NETIF_F_LLTX; dev->features |= VETH_FEATURES; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | @@ -1268,11 +1748,13 @@ static void veth_setup(struct net_device *dev) NETIF_F_HW_VLAN_STAG_RX); dev->needs_free_netdev = true; dev->priv_destructor = veth_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->max_mtu = ETH_MAX_MTU; dev->hw_features = VETH_FEATURES; dev->hw_enc_features = VETH_FEATURES; dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; + netif_set_tso_max_size(dev, GSO_MAX_SIZE); } /* @@ -1297,10 +1779,37 @@ static int veth_validate(struct nlattr *tb[], struct nlattr *data[], static struct rtnl_link_ops veth_link_ops; -static int veth_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static void veth_disable_gro(struct net_device *dev) +{ + dev->features &= ~NETIF_F_GRO; + dev->wanted_features &= ~NETIF_F_GRO; + netdev_update_features(dev); +} + +static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) +{ + int err; + + if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) { + err = netif_set_real_num_tx_queues(dev, 1); + if (err) + return err; + } + if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) { + err = netif_set_real_num_rx_queues(dev, 1); + if (err) + return err; + } + return 0; +} + +static int veth_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *peer_net = rtnl_newlink_peer_net(params); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; int err; struct net_device *peer; struct veth_priv *priv; @@ -1308,27 +1817,15 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; unsigned char name_assign_type; struct ifinfomsg *ifmp; - struct net *net; /* * create and register peer first */ - if (data != NULL && data[VETH_INFO_PEER] != NULL) { - struct nlattr *nla_peer; + if 
(data && data[VETH_INFO_PEER]) { + struct nlattr *nla_peer = data[VETH_INFO_PEER]; - nla_peer = data[VETH_INFO_PEER]; ifmp = nla_data(nla_peer); - err = rtnl_nla_parse_ifla(peer_tb, - nla_data(nla_peer) + sizeof(struct ifinfomsg), - nla_len(nla_peer) - sizeof(struct ifinfomsg), - NULL); - if (err < 0) - return err; - - err = veth_validate(peer_tb, NULL, extack); - if (err < 0) - return err; - + rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack); tbp = peer_tb; } else { ifmp = NULL; @@ -1336,23 +1833,17 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, } if (ifmp && tbp[IFLA_IFNAME]) { - nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); name_assign_type = NET_NAME_ENUM; } - net = rtnl_link_get_net(src_net, tbp); - if (IS_ERR(net)) - return PTR_ERR(net); - - peer = rtnl_create_link(net, ifname, name_assign_type, + peer = rtnl_create_link(peer_net, ifname, name_assign_type, &veth_link_ops, tbp, extack); - if (IS_ERR(peer)) { - put_net(net); + if (IS_ERR(peer)) return PTR_ERR(peer); - } if (!ifmp || !tbp[IFLA_ADDRESS]) eth_hw_addr_random(peer); @@ -1360,18 +1851,19 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, if (ifmp && (dev->ifindex != 0)) peer->ifindex = ifmp->ifi_index; - peer->gso_max_size = dev->gso_max_size; - peer->gso_max_segs = dev->gso_max_segs; + netif_inherit_tso_max(peer, dev); err = register_netdevice(peer); - put_net(net); - net = NULL; if (err < 0) goto err_register_peer; + /* keep GRO disabled by default to be consistent with the established + * veth behavior + */ + veth_disable_gro(peer); netif_carrier_off(peer); - err = rtnl_configure_link(peer, ifmp); + err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto err_configure_peer; @@ -1386,7 +1878,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, eth_hw_addr_random(dev); if (tb[IFLA_IFNAME]) - nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); @@ -1402,12 +1894,25 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, priv = netdev_priv(dev); rcu_assign_pointer(priv->peer, peer); + err = veth_init_queues(dev, tb); + if (err) + goto err_queues; priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); + err = veth_init_queues(peer, tb); + if (err) + goto err_queues; + + veth_disable_gro(dev); + /* update XDP supported features */ + veth_set_xdp_features(dev); + veth_set_xdp_features(peer); return 0; +err_queues: + unregister_netdevice(dev); err_register_dev: /* nothing to do */ err_configure_peer: @@ -1453,6 +1958,16 @@ static struct net *veth_get_link_net(const struct net_device *dev) return peer ? dev_net(peer) : dev_net(dev); } +static unsigned int veth_get_num_queues(void) +{ + /* enforce the same queue limit as rtnl_create_link */ + int queues = num_possible_cpus(); + + if (queues > 4096) + queues = 4096; + return queues; +} + static struct rtnl_link_ops veth_link_ops = { .kind = DRV_NAME, .priv_size = sizeof(struct veth_priv), @@ -1461,8 +1976,11 @@ static struct rtnl_link_ops veth_link_ops = { .newlink = veth_newlink, .dellink = veth_dellink, .policy = veth_policy, + .peer_type = VETH_INFO_PEER, .maxtype = VETH_INFO_MAX, .get_link_net = veth_get_link_net, + .get_num_tx_queues = veth_get_num_queues, + .get_num_rx_queues = veth_get_num_queues, }; /* |
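A minimal userspace sketch (not part of the patch) for exercising the veth_get_channels()/veth_set_channels() hooks added above: it reads a veth device's channel counts through the classic SIOCETHTOOL ioctl. The device name veth0 is only an assumption; any existing veth interface works, and "ethtool -l <dev>" / "ethtool -L <dev> rx N tx N" drive the same code paths.

/* query_channels.c - read rx/tx channel counts from a veth device.
 * Build: cc -o query_channels query_channels.c
 * Usage: ./query_channels [ifname]     (ifname defaults to "veth0")
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(int argc, char **argv)
{
	const char *ifname = argc > 1 ? argv[1] : "veth0";
	struct ethtool_channels ch = { .cmd = ETHTOOL_GCHANNELS };
	struct ifreq ifr = { 0 };
	int fd;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_data = (void *)&ch;

	/* veth_get_channels() fills the current counts and per-device maxima */
	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
		perror("ETHTOOL_GCHANNELS");
		close(fd);
		return 1;
	}
	printf("%s: rx %u (max %u), tx %u (max %u)\n", ifname,
	       ch.rx_count, ch.max_rx, ch.tx_count, ch.max_tx);
	close(fd);
	return 0;
}

Note that, per veth_set_channels() above, shrinking rx below the peer's real_num_tx_queues while an XDP program is attached is rejected with -EINVAL.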

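The per-queue and page-pool counters appended by veth_get_strings()/veth_get_ethtool_stats() are what "ethtool -S" prints; the sketch below (again an illustration, not part of the patch) fetches them by hand with the usual GSSET_INFO/GSTRINGS/GSTATS triple. The page-pool rows only appear when CONFIG_PAGE_POOL_STATS is enabled, matching the #ifdef in veth_get_page_pool_stats().

/* dump_stats.c - print a device's ETH_SS_STATS counters, including the
 * rx/tx per-queue rows and (with CONFIG_PAGE_POOL_STATS) the page-pool rows.
 * Build: cc -o dump_stats dump_stats.c
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

static int ethtool_call(int fd, const char *ifname, void *cmd)
{
	struct ifreq ifr = { 0 };

	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_data = cmd;
	return ioctl(fd, SIOCETHTOOL, &ifr);
}

int main(int argc, char **argv)
{
	const char *ifname = argc > 1 ? argv[1] : "veth0";
	struct ethtool_sset_info *sset;
	struct ethtool_gstrings *names;
	struct ethtool_stats *vals;
	unsigned int i, n;
	int fd;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return 1;

	/* 1) how many ETH_SS_STATS entries does the device expose? */
	sset = calloc(1, sizeof(*sset) + sizeof(__u32));
	sset->cmd = ETHTOOL_GSSET_INFO;
	sset->sset_mask = 1ULL << ETH_SS_STATS;
	if (ethtool_call(fd, ifname, sset) < 0 || !sset->sset_mask)
		return 1;
	n = sset->data[0];

	/* 2) fetch the names and 3) the values, then print them side by side */
	names = calloc(1, sizeof(*names) + n * ETH_GSTRING_LEN);
	names->cmd = ETHTOOL_GSTRINGS;
	names->string_set = ETH_SS_STATS;
	names->len = n;
	vals = calloc(1, sizeof(*vals) + n * sizeof(__u64));
	vals->cmd = ETHTOOL_GSTATS;
	vals->n_stats = n;
	if (ethtool_call(fd, ifname, names) < 0 ||
	    ethtool_call(fd, ifname, vals) < 0)
		return 1;

	for (i = 0; i < n; i++)
		printf("%-32.32s %llu\n",
		       (char *)&names->data[i * ETH_GSTRING_LEN],
		       (unsigned long long)vals->data[i]);
	close(fd);
	return 0;
}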