summaryrefslogtreecommitdiff
path: root/drivers/net/netdevsim/netdev.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-07-30 08:58:55 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-07-30 08:58:55 -0700
commit8be4d31cb8aaeea27bde4b7ddb26e28a89062ebf (patch)
treefec3039a08284cd87f4ec9c3bea5b5a439f1859f /drivers/net/netdevsim/netdev.c
parent4b290aae788e06561754b28c6842e4080957d3f7 (diff)
parentfa582ca7e187a15e772e6a72fe035f649b387a60 (diff)
Merge tag 'net-next-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Wrap datapath globals into net_aligned_data, to avoid false sharing - Preserve MSG_ZEROCOPY in forwarding (e.g. out of a container) - Add SO_INQ and SCM_INQ support to AF_UNIX - Add SIOCINQ support to AF_VSOCK - Add TCP_MAXSEG sockopt to MPTCP - Add IPv6 force_forwarding sysctl to enable forwarding per interface - Make TCP validation of whether packet fully fits in the receive window and the rcv_buf more strict. With increased use of HW aggregation a single "packet" can be multiple 100s of kB - Add MSG_MORE flag to optimize large TCP transmissions via sockmap, improves latency up to 33% for sockmap users - Convert TCP send queue handling from tasklet to BH workque - Improve BPF iteration over TCP sockets to see each socket exactly once - Remove obsolete and unused TCP RFC3517/RFC6675 loss recovery code - Support enabling kernel threads for NAPI processing on per-NAPI instance basis rather than a whole device. Fully stop the kernel NAPI thread when threaded NAPI gets disabled. Previously thread would stick around until ifdown due to tricky synchronization - Allow multicast routing to take effect on locally-generated packets - Add output interface argument for End.X in segment routing - MCTP: add support for gateway routing, improve bind() handling - Don't require rtnl_lock when fetching an IPv6 neighbor over Netlink - Add a new neighbor flag ("extern_valid"), which cedes refresh responsibilities to userspace. This is needed for EVPN multi-homing where a neighbor entry for a multi-homed host needs to be synced across all the VTEPs among which the host is multi-homed - Support NUD_PERMANENT for proxy neighbor entries - Add a new queuing discipline for IETF RFC9332 DualQ Coupled AQM - Add sequence numbers to netconsole messages. Unregister netconsole's console when all net targets are removed. Code refactoring. Add a number of selftests - Align IPSec inbound SA lookup to RFC 4301. Only SPI and protocol should be used for an inbound SA lookup - Support inspecting ref_tracker state via DebugFS - Don't force bonding advertisement frames tx to ~333 ms boundaries. Add broadcast_neighbor option to send ARP/ND on all bonded links - Allow providing upcall pid for the 'execute' command in openvswitch - Remove DCCP support from Netfilter's conntrack - Disallow multiple packet duplications in the queuing layer - Prevent use of deprecated iptables code on PREEMPT_RT Driver API: - Support RSS and hashing configuration over ethtool Netlink - Add dedicated ethtool callbacks for getting and setting hashing fields - Add support for power budget evaluation strategy in PSE / Power-over-Ethernet. Generate Netlink events for overcurrent etc - Support DPLL phase offset monitoring across all device inputs. Support providing clock reference and SYNC over separate DPLL inputs - Support traffic classes in devlink rate API for bandwidth management - Remove rtnl_lock dependency from UDP tunnel port configuration Device drivers: - Add a new Broadcom driver for 800G Ethernet (bnge) - Add a standalone driver for Microchip ZL3073x DPLL - Remove IBM's NETIUCV device driver - Ethernet high-speed NICs: - Broadcom (bnxt): - support zero-copy Tx of DMABUF memory - take page size into account for page pool recycling rings - Intel (100G, ice, idpf): - idpf: XDP and AF_XDP support preparations - idpf: add flow steering - add link_down_events statistic - clean up the TSPLL code - preparations for live VM migration - nVidia/Mellanox: - support zero-copy Rx/Tx interfaces (DMABUF and io_uring) - optimize context memory usage for matchers - expose serial numbers in devlink info - support PCIe congestion metrics - Meta (fbnic): - add 25G, 50G, and 100G link modes to phylink - support dumping FW logs - Marvell/Cavium: - support for CN20K generation of the Octeon chips - Amazon: - add HW clock (without timestamping, just hypervisor time access) - Ethernet virtual: - VirtIO net: - support segmentation of UDP-tunnel-encapsulated packets - Google (gve): - support packet timestamping and clock synchronization - Microsoft vNIC: - add handler for device-originated servicing events - allow dynamic MSI-X vector allocation - support Tx bandwidth clamping - Ethernet NICs consumer, and embedded: - AMD: - amd-xgbe: hardware timestamping and PTP clock support - Broadcom integrated MACs (bcmgenet, bcmasp): - use napi_complete_done() return value to support NAPI polling - add support for re-starting auto-negotiation - Broadcom switches (b53): - support BCM5325 switches - add bcm63xx EPHY power control - Synopsys (stmmac): - lots of code refactoring and cleanups - TI: - icssg-prueth: read firmware-names from device tree - icssg: PRP offload support - Microchip: - lan78xx: convert to PHYLINK for improved PHY and MAC management - ksz: add KSZ8463 switch support - Intel: - support similar queue priority scheme in multi-queue and time-sensitive networking (taprio) - support packet pre-emption in both - RealTek (r8169): - enable EEE at 5Gbps on RTL8126 - Airoha: - add PPPoE offload support - MDIO bus controller for Airoha AN7583 - Ethernet PHYs: - support for the IPQ5018 internal GE PHY - micrel KSZ9477 switch-integrated PHYs: - add MDI/MDI-X control support - add RX error counters - add cable test support - add Signal Quality Indicator (SQI) reporting - dp83tg720: improve reset handling and reduce link recovery time - support bcm54811 (and its MII-Lite interface type) - air_en8811h: support resume/suspend - support PHY counters for QCA807x and QCA808x - support WoL for QCA807x - CAN drivers: - rcar_canfd: support for Transceiver Delay Compensation - kvaser: report FW versions via devlink dev info - WiFi: - extended regulatory info support (6 GHz) - add statistics and beacon monitor for Multi-Link Operation (MLO) - support S1G aggregation, improve S1G support - add Radio Measurement action fields - support per-radio RTS threshold - some work around how FIPS affects wifi, which was wrong (RC4 is used by TKIP, not only WEP) - improvements for unsolicited probe response handling - WiFi drivers: - RealTek (rtw88): - IBSS mode for SDIO devices - RealTek (rtw89): - BT coexistence for MLO/WiFi7 - concurrent station + P2P support - support for USB devices RTL8851BU/RTL8852BU - Intel (iwlwifi): - use embedded PNVM in (to be released) FW images to fix compatibility issues - many cleanups (unused FW APIs, PCIe code, WoWLAN) - some FIPS interoperability - MediaTek (mt76): - firmware recovery improvements - more MLO work - Qualcomm/Atheros (ath12k): - fix scan on multi-radio devices - more EHT/Wi-Fi 7 features - encapsulation/decapsulation offload - Broadcom (brcm80211): - support SDIO 43751 device - Bluetooth: - hci_event: add support for handling LE BIG Sync Lost event - ISO: add socket option to report packet seqnum via CMSG - ISO: support SCM_TIMESTAMPING for ISO TS - Bluetooth drivers: - intel_pcie: support Function Level Reset - nxpuart: add support for 4M baudrate - nxpuart: implement powerup sequence, reset, FW dump, and FW loading" * tag 'net-next-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1742 commits) dpll: zl3073x: Fix build failure selftests: bpf: fix legacy netfilter options ipv6: annotate data-races around rt->fib6_nsiblings ipv6: fix possible infinite loop in fib6_info_uses_dev() ipv6: prevent infinite loop in rt6_nlmsg_size() ipv6: add a retry logic in net6_rt_notify() vrf: Drop existing dst reference in vrf_ip6_input_dst net/sched: taprio: align entry index attr validation with mqprio net: fsl_pq_mdio: use dev_err_probe selftests: rtnetlink.sh: remove esp4_offload after test vsock: remove unnecessary null check in vsock_getname() igb: xsk: solve negative overflow of nb_pkts in zerocopy mode stmmac: xsk: fix negative overflow of budget in zerocopy mode dt-bindings: ieee802154: Convert at86rf230.txt yaml format net: dsa: microchip: Disable PTP function of KSZ8463 net: dsa: microchip: Setup fiber ports for KSZ8463 net: dsa: microchip: Write switch MAC address differently for KSZ8463 net: dsa: microchip: Use different registers for KSZ8463 net: dsa: microchip: Add KSZ8463 switch support to KSZ DSA driver dt-bindings: net: dsa: microchip: Add KSZ8463 switch support ...
Diffstat (limited to 'drivers/net/netdevsim/netdev.c')
-rw-r--r--drivers/net/netdevsim/netdev.c160
1 files changed, 114 insertions, 46 deletions
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index fa5fbd97ad69..39fe28af48b9 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -37,7 +37,53 @@ MODULE_IMPORT_NS("NETDEV_INTERNAL");
#define NSIM_RING_SIZE 256
-static int nsim_napi_rx(struct nsim_rq *rq, struct sk_buff *skb)
+static void nsim_start_peer_tx_queue(struct net_device *dev, struct nsim_rq *rq)
+{
+ struct netdevsim *ns = netdev_priv(dev);
+ struct net_device *peer_dev;
+ struct netdevsim *peer_ns;
+ struct netdev_queue *txq;
+ u16 idx;
+
+ idx = rq->napi.index;
+ rcu_read_lock();
+ peer_ns = rcu_dereference(ns->peer);
+ if (!peer_ns)
+ goto out;
+
+ /* TX device */
+ peer_dev = peer_ns->netdev;
+ if (dev->real_num_tx_queues != peer_dev->num_rx_queues)
+ goto out;
+
+ txq = netdev_get_tx_queue(peer_dev, idx);
+ if (!netif_tx_queue_stopped(txq))
+ goto out;
+
+ netif_tx_wake_queue(txq);
+out:
+ rcu_read_unlock();
+}
+
+static void nsim_stop_tx_queue(struct net_device *tx_dev,
+ struct net_device *rx_dev,
+ struct nsim_rq *rq,
+ u16 idx)
+{
+ /* If different queues size, do not stop, since it is not
+ * easy to find which TX queue is mapped here
+ */
+ if (rx_dev->real_num_tx_queues != tx_dev->num_rx_queues)
+ return;
+
+ /* rq is the queue on the receive side */
+ netif_subqueue_try_stop(tx_dev, idx,
+ NSIM_RING_SIZE - skb_queue_len(&rq->skb_queue),
+ NSIM_RING_SIZE / 2);
+}
+
+static int nsim_napi_rx(struct net_device *tx_dev, struct net_device *rx_dev,
+ struct nsim_rq *rq, struct sk_buff *skb)
{
if (skb_queue_len(&rq->skb_queue) > NSIM_RING_SIZE) {
dev_kfree_skb_any(skb);
@@ -45,13 +91,22 @@ static int nsim_napi_rx(struct nsim_rq *rq, struct sk_buff *skb)
}
skb_queue_tail(&rq->skb_queue, skb);
+
+ /* Stop the peer TX queue avoiding dropping packets later */
+ if (skb_queue_len(&rq->skb_queue) >= NSIM_RING_SIZE)
+ nsim_stop_tx_queue(tx_dev, rx_dev, rq,
+ skb_get_queue_mapping(skb));
+
return NET_RX_SUCCESS;
}
-static int nsim_forward_skb(struct net_device *dev, struct sk_buff *skb,
+static int nsim_forward_skb(struct net_device *tx_dev,
+ struct net_device *rx_dev,
+ struct sk_buff *skb,
struct nsim_rq *rq)
{
- return __dev_forward_skb(dev, skb) ?: nsim_napi_rx(rq, skb);
+ return __dev_forward_skb(rx_dev, skb) ?:
+ nsim_napi_rx(tx_dev, rx_dev, rq, skb);
}
static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -86,26 +141,21 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
skb_linearize(skb);
skb_tx_timestamp(skb);
- if (unlikely(nsim_forward_skb(peer_dev, skb, rq) == NET_RX_DROP))
+ if (unlikely(nsim_forward_skb(dev, peer_dev, skb, rq) == NET_RX_DROP))
goto out_drop_cnt;
if (!hrtimer_active(&rq->napi_timer))
hrtimer_start(&rq->napi_timer, us_to_ktime(5), HRTIMER_MODE_REL);
rcu_read_unlock();
- u64_stats_update_begin(&ns->syncp);
- ns->tx_packets++;
- ns->tx_bytes += len;
- u64_stats_update_end(&ns->syncp);
+ dev_dstats_tx_add(dev, len);
return NETDEV_TX_OK;
out_drop_free:
dev_kfree_skb(skb);
out_drop_cnt:
rcu_read_unlock();
- u64_stats_update_begin(&ns->syncp);
- ns->tx_dropped++;
- u64_stats_update_end(&ns->syncp);
+ dev_dstats_tx_dropped(dev);
return NETDEV_TX_OK;
}
@@ -126,20 +176,6 @@ static int nsim_change_mtu(struct net_device *dev, int new_mtu)
return 0;
}
-static void
-nsim_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
-{
- struct netdevsim *ns = netdev_priv(dev);
- unsigned int start;
-
- do {
- start = u64_stats_fetch_begin(&ns->syncp);
- stats->tx_bytes = ns->tx_bytes;
- stats->tx_packets = ns->tx_packets;
- stats->tx_dropped = ns->tx_dropped;
- } while (u64_stats_fetch_retry(&ns->syncp, start));
-}
-
static int
nsim_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
{
@@ -350,18 +386,46 @@ static int nsim_get_iflink(const struct net_device *dev)
static int nsim_rcv(struct nsim_rq *rq, int budget)
{
+ struct net_device *dev = rq->napi.dev;
+ struct bpf_prog *xdp_prog;
+ struct netdevsim *ns;
struct sk_buff *skb;
- int i;
+ unsigned int skblen;
+ int i, ret;
+
+ ns = netdev_priv(dev);
+ xdp_prog = READ_ONCE(ns->xdp.prog);
for (i = 0; i < budget; i++) {
if (skb_queue_empty(&rq->skb_queue))
break;
skb = skb_dequeue(&rq->skb_queue);
+
+ if (xdp_prog) {
+ /* skb might be freed directly by XDP, save the len */
+ skblen = skb->len;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ skb_checksum_help(skb);
+ ret = do_xdp_generic(xdp_prog, &skb);
+ if (ret != XDP_PASS) {
+ dev_dstats_rx_add(dev, skblen);
+ continue;
+ }
+ }
+
+ /* skb might be discard at netif_receive_skb, save the len */
+ skblen = skb->len;
skb_mark_napi_id(skb, &rq->napi);
- netif_receive_skb(skb);
+ ret = netif_receive_skb(skb);
+ if (ret == NET_RX_SUCCESS)
+ dev_dstats_rx_add(dev, skblen);
+ else
+ dev_dstats_rx_dropped(dev);
}
+ nsim_start_peer_tx_queue(dev, rq);
return i;
}
@@ -556,7 +620,6 @@ static const struct net_device_ops nsim_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = nsim_change_mtu,
- .ndo_get_stats64 = nsim_get_stats64,
.ndo_set_vf_mac = nsim_set_vf_mac,
.ndo_set_vf_vlan = nsim_set_vf_vlan,
.ndo_set_vf_rate = nsim_set_vf_rate,
@@ -580,7 +643,6 @@ static const struct net_device_ops nsim_vf_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = nsim_change_mtu,
- .ndo_get_stats64 = nsim_get_stats64,
.ndo_setup_tc = nsim_setup_tc,
.ndo_set_features = nsim_set_features,
};
@@ -594,7 +656,7 @@ static void nsim_get_queue_stats_rx(struct net_device *dev, int idx,
struct rtnl_link_stats64 rtstats = {};
if (!idx)
- nsim_get_stats64(dev, &rtstats);
+ dev_get_stats(dev, &rtstats);
stats->packets = rtstats.rx_packets - !!rtstats.rx_packets;
stats->bytes = rtstats.rx_bytes;
@@ -606,7 +668,7 @@ static void nsim_get_queue_stats_tx(struct net_device *dev, int idx,
struct rtnl_link_stats64 rtstats = {};
if (!idx)
- nsim_get_stats64(dev, &rtstats);
+ dev_get_stats(dev, &rtstats);
stats->packets = rtstats.tx_packets - !!rtstats.tx_packets;
stats->bytes = rtstats.tx_bytes;
@@ -618,7 +680,7 @@ static void nsim_get_base_stats(struct net_device *dev,
{
struct rtnl_link_stats64 rtstats = {};
- nsim_get_stats64(dev, &rtstats);
+ dev_get_stats(dev, &rtstats);
rx->packets = !!rtstats.rx_packets;
rx->bytes = 0;
@@ -645,9 +707,12 @@ static struct nsim_rq *nsim_queue_alloc(void)
return rq;
}
-static void nsim_queue_free(struct nsim_rq *rq)
+static void nsim_queue_free(struct net_device *dev, struct nsim_rq *rq)
{
hrtimer_cancel(&rq->napi_timer);
+ local_bh_disable();
+ dev_dstats_rx_dropped_add(dev, rq->skb_queue.qlen);
+ local_bh_enable();
skb_queue_purge_reason(&rq->skb_queue, SKB_DROP_REASON_QUEUE_PURGE);
kfree(rq);
}
@@ -694,7 +759,7 @@ nsim_queue_mem_alloc(struct net_device *dev, void *per_queue_mem, int idx)
return 0;
err_free:
- nsim_queue_free(qmem->rq);
+ nsim_queue_free(dev, qmem->rq);
return err;
}
@@ -708,7 +773,7 @@ static void nsim_queue_mem_free(struct net_device *dev, void *per_queue_mem)
if (!ns->rq_reset_mode)
netif_napi_del_locked(&qmem->rq->napi);
page_pool_destroy(qmem->rq->page_pool);
- nsim_queue_free(qmem->rq);
+ nsim_queue_free(dev, qmem->rq);
}
}
@@ -852,7 +917,8 @@ nsim_pp_hold_write(struct file *file, const char __user *data,
if (!ns->page)
ret = -ENOMEM;
} else {
- page_pool_put_full_page(ns->page->pp, ns->page, false);
+ page_pool_put_full_page(pp_page_to_nmdesc(ns->page)->pp,
+ ns->page, false);
ns->page = NULL;
}
@@ -874,10 +940,8 @@ static void nsim_setup(struct net_device *dev)
ether_setup(dev);
eth_hw_addr_random(dev);
- dev->tx_queue_len = 0;
dev->flags &= ~IFF_MULTICAST;
- dev->priv_flags |= IFF_LIVE_ADDR_CHANGE |
- IFF_NO_QUEUE;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
dev->features |= NETIF_F_HIGHDMA |
NETIF_F_SG |
NETIF_F_FRAGLIST |
@@ -890,8 +954,9 @@ static void nsim_setup(struct net_device *dev)
NETIF_F_HW_CSUM |
NETIF_F_LRO |
NETIF_F_TSO;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
dev->max_mtu = ETH_MAX_MTU;
- dev->xdp_features = NETDEV_XDP_ACT_HW_OFFLOAD;
+ dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_HW_OFFLOAD;
}
static int nsim_queue_init(struct netdevsim *ns)
@@ -925,7 +990,7 @@ static void nsim_queue_uninit(struct netdevsim *ns)
int i;
for (i = 0; i < dev->num_rx_queues; i++)
- nsim_queue_free(ns->rq[i]);
+ nsim_queue_free(dev, ns->rq[i]);
kfree(ns->rq);
ns->rq = NULL;
@@ -1007,8 +1072,9 @@ static void nsim_exit_netdevsim(struct netdevsim *ns)
mock_phc_destroy(ns->phc);
}
-struct netdevsim *
-nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port)
+struct netdevsim *nsim_create(struct nsim_dev *nsim_dev,
+ struct nsim_dev_port *nsim_dev_port,
+ u8 perm_addr[ETH_ALEN])
{
struct net_device *dev;
struct netdevsim *ns;
@@ -1019,10 +1085,12 @@ nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port)
if (!dev)
return ERR_PTR(-ENOMEM);
+ if (perm_addr)
+ memcpy(dev->perm_addr, perm_addr, ETH_ALEN);
+
dev_net_set(dev, nsim_dev_net(nsim_dev));
ns = netdev_priv(dev);
ns->netdev = dev;
- u64_stats_init(&ns->syncp);
ns->nsim_dev = nsim_dev;
ns->nsim_dev_port = nsim_dev_port;
ns->nsim_bus_dev = nsim_dev->nsim_bus_dev;
@@ -1041,7 +1109,6 @@ nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port)
ns->qr_dfs = debugfs_create_file("queue_reset", 0200,
nsim_dev_port->ddir, ns,
&nsim_qreset_fops);
-
return ns;
err_free_netdev:
@@ -1079,7 +1146,8 @@ void nsim_destroy(struct netdevsim *ns)
/* Put this intentionally late to exercise the orphaning path */
if (ns->page) {
- page_pool_put_full_page(ns->page->pp, ns->page, false);
+ page_pool_put_full_page(pp_page_to_nmdesc(ns->page)->pp,
+ ns->page, false);
ns->page = NULL;
}