summaryrefslogtreecommitdiff
path: root/drivers/net/veth.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/veth.c')
-rw-r--r--drivers/net/veth.c277
1 files changed, 114 insertions, 163 deletions
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index d43e62ebc2fc..14e6f2a2fb77 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -17,6 +17,7 @@
#include <net/rtnetlink.h>
#include <net/dst.h>
+#include <net/netdev_lock.h>
#include <net/xfrm.h>
#include <net/xdp.h>
#include <linux/veth.h>
@@ -26,6 +27,7 @@
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>
+#include <linux/skbuff_ref.h>
#include <net/page_pool/helpers.h>
#define DRV_NAME "veth"
@@ -236,8 +238,8 @@ static void veth_get_ethtool_stats(struct net_device *dev,
data[tx_idx + j] += *(u64 *)(base + offset);
}
} while (u64_stats_fetch_retry(&rq_stats->syncp, start));
- pp_idx = tx_idx + VETH_TQ_STATS_LEN;
}
+ pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN;
page_pool_stats:
veth_get_page_pool_stats(dev, &data[pp_idx]);
@@ -305,12 +307,10 @@ static void __veth_xdp_flush(struct veth_rq *rq)
static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
{
- if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
- dev_kfree_skb_any(skb);
- return NET_RX_DROP;
- }
+ if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb)))
+ return NETDEV_TX_BUSY; /* signal qdisc layer */
- return NET_RX_SUCCESS;
+ return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */
}
static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
@@ -344,10 +344,11 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
struct veth_rq *rq = NULL;
+ struct netdev_queue *txq;
struct net_device *rcv;
int length = skb->len;
bool use_napi = false;
- int rxq;
+ int ret, rxq;
rcu_read_lock();
rcv = rcu_dereference(priv->peer);
@@ -370,27 +371,46 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
}
skb_tx_timestamp(skb);
- if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
+
+ ret = veth_forward_skb(rcv, skb, rq, use_napi);
+ switch (ret) {
+ case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */
if (!use_napi)
- dev_lstats_add(dev, length);
+ dev_sw_netstats_tx_add(dev, 1, length);
else
__veth_xdp_flush(rq);
- } else {
+ break;
+ case NETDEV_TX_BUSY:
+ /* If a qdisc is attached to our virtual device, returning
+ * NETDEV_TX_BUSY is allowed.
+ */
+ txq = netdev_get_tx_queue(dev, rxq);
+
+ if (qdisc_txq_has_no_queue(txq)) {
+ dev_kfree_skb_any(skb);
+ goto drop;
+ }
+ /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */
+ __skb_push(skb, ETH_HLEN);
+ netif_tx_stop_queue(txq);
+ /* Makes sure NAPI peer consumer runs. Consumer is responsible
+ * for starting txq again, until then ndo_start_xmit (this
+ * function) will not be invoked by the netstack again.
+ */
+ __veth_xdp_flush(rq);
+ break;
+ case NET_RX_DROP: /* same as NET_XMIT_DROP */
drop:
atomic64_inc(&priv->dropped);
+ ret = NET_XMIT_DROP;
+ break;
+ default:
+ net_crit_ratelimited("%s(%s): Invalid return code(%d)",
+ __func__, dev->name, ret);
}
-
rcu_read_unlock();
- return NETDEV_TX_OK;
-}
-
-static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes)
-{
- struct veth_priv *priv = netdev_priv(dev);
-
- dev_lstats_read(dev, packets, bytes);
- return atomic64_read(&priv->dropped);
+ return ret;
}
static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
@@ -430,24 +450,24 @@ static void veth_get_stats64(struct net_device *dev,
struct veth_priv *priv = netdev_priv(dev);
struct net_device *peer;
struct veth_stats rx;
- u64 packets, bytes;
- tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes);
- tot->tx_bytes = bytes;
- tot->tx_packets = packets;
+ tot->tx_dropped = atomic64_read(&priv->dropped);
+ dev_fetch_sw_netstats(tot, dev->tstats);
veth_stats_rx(&rx, dev);
tot->tx_dropped += rx.xdp_tx_err;
tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
- tot->rx_bytes = rx.xdp_bytes;
- tot->rx_packets = rx.xdp_packets;
+ tot->rx_bytes += rx.xdp_bytes;
+ tot->rx_packets += rx.xdp_packets;
rcu_read_lock();
peer = rcu_dereference(priv->peer);
if (peer) {
- veth_stats_tx(peer, &packets, &bytes);
- tot->rx_bytes += bytes;
- tot->rx_packets += packets;
+ struct rtnl_link_stats64 tot_peer = {};
+
+ dev_fetch_sw_netstats(&tot_peer, peer->tstats);
+ tot->rx_bytes += tot_peer.tx_bytes;
+ tot->rx_packets += tot_peer.tx_packets;
veth_stats_rx(&rx, peer);
tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
@@ -639,7 +659,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
break;
case XDP_TX:
orig_frame = *frame;
- xdp->rxq->mem = frame->mem;
+ xdp->rxq->mem.type = frame->mem_type;
if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
trace_xdp_exception(rq->dev, xdp_prog, act);
frame = &orig_frame;
@@ -651,7 +671,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
goto xdp_xmit;
case XDP_REDIRECT:
orig_frame = *frame;
- xdp->rxq->mem = frame->mem;
+ xdp->rxq->mem.type = frame->mem_type;
if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
frame = &orig_frame;
stats->rx_drops++;
@@ -689,8 +709,7 @@ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
void *skbs[VETH_XDP_BATCH];
int i;
- if (xdp_alloc_skb_bulk(skbs, n_xdpf,
- GFP_ATOMIC | __GFP_ZERO) < 0) {
+ if (unlikely(!napi_skb_cache_get_bulk(skbs, n_xdpf))) {
for (i = 0; i < n_xdpf; i++)
xdp_return_frame(frames[i]);
stats->rx_drops += n_xdpf;
@@ -735,72 +754,10 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
if (skb_shared(skb) || skb_head_is_locked(skb) ||
skb_shinfo(skb)->nr_frags ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
- u32 size, len, max_head_size, off;
- struct sk_buff *nskb;
- struct page *page;
- int i, head_off;
-
- /* We need a private copy of the skb and data buffers since
- * the ebpf program can modify it. We segment the original skb
- * into order-0 pages without linearize it.
- *
- * Make sure we have enough space for linear and paged area
- */
- max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
- VETH_XDP_HEADROOM);
- if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
+ if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM))
goto drop;
- /* Allocate skb head */
- page = page_pool_dev_alloc_pages(rq->page_pool);
- if (!page)
- goto drop;
-
- nskb = napi_build_skb(page_address(page), PAGE_SIZE);
- if (!nskb) {
- page_pool_put_full_page(rq->page_pool, page, true);
- goto drop;
- }
-
- skb_reserve(nskb, VETH_XDP_HEADROOM);
- skb_copy_header(nskb, skb);
- skb_mark_for_recycle(nskb);
-
- size = min_t(u32, skb->len, max_head_size);
- if (skb_copy_bits(skb, 0, nskb->data, size)) {
- consume_skb(nskb);
- goto drop;
- }
- skb_put(nskb, size);
-
- head_off = skb_headroom(nskb) - skb_headroom(skb);
- skb_headers_offset_update(nskb, head_off);
-
- /* Allocate paged area of new skb */
- off = size;
- len = skb->len - off;
-
- for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
- page = page_pool_dev_alloc_pages(rq->page_pool);
- if (!page) {
- consume_skb(nskb);
- goto drop;
- }
-
- size = min_t(u32, len, PAGE_SIZE);
- skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
- if (skb_copy_bits(skb, off, page_address(page),
- size)) {
- consume_skb(nskb);
- goto drop;
- }
-
- len -= size;
- off += size;
- }
-
- consume_skb(skb);
- skb = nskb;
+ skb = *pskb;
}
/* SKB "head" area always have tailroom for skb_shared_info */
@@ -999,17 +956,28 @@ static int veth_poll(struct napi_struct *napi, int budget)
{
struct veth_rq *rq =
container_of(napi, struct veth_rq, xdp_napi);
+ struct veth_priv *priv = netdev_priv(rq->dev);
+ int queue_idx = rq->xdp_rxq.queue_index;
+ struct netdev_queue *peer_txq;
struct veth_stats stats = {};
+ struct net_device *peer_dev;
struct veth_xdp_tx_bq bq;
int done;
bq.count = 0;
+ /* NAPI functions as RCU section */
+ peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held());
+ peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL;
+
xdp_set_return_frame_no_direct();
done = veth_xdp_rcv(rq, budget, &bq, &stats);
if (stats.xdp_redirect > 0)
xdp_do_flush();
+ if (stats.xdp_tx > 0)
+ veth_xdp_flush(rq, &bq);
+ xdp_clear_return_frame_no_direct();
if (done < budget && napi_complete_done(napi, done)) {
/* Write rx_notify_masked before reading ptr_ring */
@@ -1022,9 +990,12 @@ static int veth_poll(struct napi_struct *napi, int budget)
}
}
- if (stats.xdp_tx > 0)
- veth_xdp_flush(rq, &bq);
- xdp_clear_return_frame_no_direct();
+ /* Release backpressure per NAPI poll */
+ smp_rmb(); /* Paired with netif_tx_stop_queue set_bit */
+ if (peer_txq && netif_tx_queue_stopped(peer_txq)) {
+ txq_trans_cond_update(peer_txq);
+ netif_tx_wake_queue(peer_txq);
+ }
return done;
}
@@ -1206,14 +1177,6 @@ static int veth_enable_xdp(struct net_device *dev)
veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);
return err;
}
-
- if (!veth_gro_requested(dev)) {
- /* user-space did not require GRO, but adding XDP
- * is supposed to get GRO working
- */
- dev->features |= NETIF_F_GRO;
- netdev_features_change(dev);
- }
}
}
@@ -1233,18 +1196,9 @@ static void veth_disable_xdp(struct net_device *dev)
for (i = 0; i < dev->real_num_rx_queues; i++)
rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
- if (!netif_running(dev) || !veth_gro_requested(dev)) {
+ if (!netif_running(dev) || !veth_gro_requested(dev))
veth_napi_del(dev);
- /* if user-space did not require GRO, since adding XDP
- * enabled it, clear it now
- */
- if (!veth_gro_requested(dev) && netif_running(dev)) {
- dev->features &= ~NETIF_F_GRO;
- netdev_features_change(dev);
- }
- }
-
veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
}
@@ -1370,7 +1324,7 @@ static int veth_set_channels(struct net_device *dev,
if (peer)
netif_carrier_off(peer);
- /* try to allocate new resurces, as needed*/
+ /* try to allocate new resources, as needed*/
err = veth_enable_range_safe(dev, old_rx_count, new_rx_count);
if (err)
goto out;
@@ -1444,6 +1398,8 @@ static int veth_open(struct net_device *dev)
netif_carrier_on(peer);
}
+ veth_set_xdp_features(dev);
+
return 0;
}
@@ -1474,7 +1430,8 @@ static int veth_alloc_queues(struct net_device *dev)
struct veth_priv *priv = netdev_priv(dev);
int i;
- priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT);
+ priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq),
+ GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!priv->rq)
return -ENOMEM;
@@ -1490,30 +1447,18 @@ static void veth_free_queues(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
- kfree(priv->rq);
+ kvfree(priv->rq);
}
static int veth_dev_init(struct net_device *dev)
{
- int err;
-
- dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
- if (!dev->lstats)
- return -ENOMEM;
-
- err = veth_alloc_queues(dev);
- if (err) {
- free_percpu(dev->lstats);
- return err;
- }
-
- return 0;
+ netdev_lockdep_set_classes(dev);
+ return veth_alloc_queues(dev);
}
static void veth_dev_free(struct net_device *dev)
{
veth_free_queues(dev);
- free_percpu(dev->lstats);
}
#ifdef CONFIG_NET_POLL_CONTROLLER
@@ -1539,7 +1484,7 @@ static int veth_get_iflink(const struct net_device *dev)
rcu_read_lock();
peer = rcu_dereference(priv->peer);
- iflink = peer ? peer->ifindex : 0;
+ iflink = peer ? READ_ONCE(peer->ifindex) : 0;
rcu_read_unlock();
return iflink;
@@ -1558,8 +1503,6 @@ static netdev_features_t veth_fix_features(struct net_device *dev,
if (peer_priv->_xdp_prog)
features &= ~NETIF_F_GSO_SOFTWARE;
}
- if (priv->_xdp_prog)
- features |= NETIF_F_GRO;
return features;
}
@@ -1732,6 +1675,24 @@ static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
return 0;
}
+static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto,
+ u16 *vlan_tci)
+{
+ const struct veth_xdp_buff *_ctx = (void *)ctx;
+ const struct sk_buff *skb = _ctx->skb;
+ int err;
+
+ if (!skb)
+ return -ENODATA;
+
+ err = __vlan_hwaccel_get_tag(skb, vlan_tci);
+ if (err)
+ return err;
+
+ *vlan_proto = skb->vlan_proto;
+ return err;
+}
+
static const struct net_device_ops veth_netdev_ops = {
.ndo_init = veth_dev_init,
.ndo_open = veth_open,
@@ -1756,6 +1717,7 @@ static const struct net_device_ops veth_netdev_ops = {
static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
.xmo_rx_timestamp = veth_xdp_rx_timestamp,
.xmo_rx_hash = veth_xdp_rx_hash,
+ .xmo_rx_vlan_tag = veth_xdp_rx_vlan_tag,
};
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
@@ -1772,11 +1734,12 @@ static void veth_setup(struct net_device *dev)
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
dev->priv_flags |= IFF_NO_QUEUE;
dev->priv_flags |= IFF_PHONY_HEADROOM;
+ dev->priv_flags |= IFF_DISABLE_NETPOLL;
+ dev->lltx = true;
dev->netdev_ops = &veth_netdev_ops;
dev->xdp_metadata_ops = &veth_xdp_metadata_ops;
dev->ethtool_ops = &veth_ethtool_ops;
- dev->features |= NETIF_F_LLTX;
dev->features |= VETH_FEATURES;
dev->vlan_features = dev->features &
~(NETIF_F_HW_VLAN_CTAG_TX |
@@ -1785,6 +1748,7 @@ static void veth_setup(struct net_device *dev)
NETIF_F_HW_VLAN_STAG_RX);
dev->needs_free_netdev = true;
dev->priv_destructor = veth_dev_free;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
dev->max_mtu = ETH_MAX_MTU;
dev->hw_features = VETH_FEATURES;
@@ -1839,10 +1803,13 @@ static int veth_init_queues(struct net_device *dev, struct nlattr *tb[])
return 0;
}
-static int veth_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int veth_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
+ struct net *peer_net = rtnl_newlink_peer_net(params);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
int err;
struct net_device *peer;
struct veth_priv *priv;
@@ -1850,24 +1817,15 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
unsigned char name_assign_type;
struct ifinfomsg *ifmp;
- struct net *net;
/*
* create and register peer first
*/
- if (data != NULL && data[VETH_INFO_PEER] != NULL) {
- struct nlattr *nla_peer;
+ if (data && data[VETH_INFO_PEER]) {
+ struct nlattr *nla_peer = data[VETH_INFO_PEER];
- nla_peer = data[VETH_INFO_PEER];
ifmp = nla_data(nla_peer);
- err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
- if (err < 0)
- return err;
-
- err = veth_validate(peer_tb, NULL, extack);
- if (err < 0)
- return err;
-
+ rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
tbp = peer_tb;
} else {
ifmp = NULL;
@@ -1882,16 +1840,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
name_assign_type = NET_NAME_ENUM;
}
- net = rtnl_link_get_net(src_net, tbp);
- if (IS_ERR(net))
- return PTR_ERR(net);
-
- peer = rtnl_create_link(net, ifname, name_assign_type,
+ peer = rtnl_create_link(peer_net, ifname, name_assign_type,
&veth_link_ops, tbp, extack);
- if (IS_ERR(peer)) {
- put_net(net);
+ if (IS_ERR(peer))
return PTR_ERR(peer);
- }
if (!ifmp || !tbp[IFLA_ADDRESS])
eth_hw_addr_random(peer);
@@ -1902,8 +1854,6 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
netif_inherit_tso_max(peer, dev);
err = register_netdevice(peer);
- put_net(net);
- net = NULL;
if (err < 0)
goto err_register_peer;
@@ -2026,6 +1976,7 @@ static struct rtnl_link_ops veth_link_ops = {
.newlink = veth_newlink,
.dellink = veth_dellink,
.policy = veth_policy,
+ .peer_type = VETH_INFO_PEER,
.maxtype = VETH_INFO_MAX,
.get_link_net = veth_get_link_net,
.get_num_tx_queues = veth_get_num_queues,