summary refs log tree commit diff
path: root/drivers/pci/controller/pci-hyperv.c
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2025-07-30 08:58:55 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2025-07-30 08:58:55 -0700
commit 8be4d31cb8aaeea27bde4b7ddb26e28a89062ebf (patch)
tree fec3039a08284cd87f4ec9c3bea5b5a439f1859f /drivers/pci/controller/pci-hyperv.c
parent 4b290aae788e06561754b28c6842e4080957d3f7 (diff)
parent fa582ca7e187a15e772e6a72fe035f649b387a60 (diff)
Merge tag 'net-next-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Wrap datapath globals into net_aligned_data, to avoid false sharing - Preserve MSG_ZEROCOPY in forwarding (e.g. out of a container) - Add SO_INQ and SCM_INQ support to AF_UNIX - Add SIOCINQ support to AF_VSOCK - Add TCP_MAXSEG sockopt to MPTCP - Add IPv6 force_forwarding sysctl to enable forwarding per interface - Make TCP validation of whether packet fully fits in the receive window and the rcv_buf more strict. With increased use of HW aggregation a single "packet" can be multiple 100s of kB - Add MSG_MORE flag to optimize large TCP transmissions via sockmap, improves latency up to 33% for sockmap users - Convert TCP send queue handling from tasklet to BH workqueue - Improve BPF iteration over TCP sockets to see each socket exactly once - Remove obsolete and unused TCP RFC3517/RFC6675 loss recovery code - Support enabling kernel threads for NAPI processing on per-NAPI instance basis rather than a whole device. Fully stop the kernel NAPI thread when threaded NAPI gets disabled. Previously thread would stick around until ifdown due to tricky synchronization - Allow multicast routing to take effect on locally-generated packets - Add output interface argument for End.X in segment routing - MCTP: add support for gateway routing, improve bind() handling - Don't require rtnl_lock when fetching an IPv6 neighbor over Netlink - Add a new neighbor flag ("extern_valid"), which cedes refresh responsibilities to userspace. This is needed for EVPN multi-homing where a neighbor entry for a multi-homed host needs to be synced across all the VTEPs among which the host is multi-homed - Support NUD_PERMANENT for proxy neighbor entries - Add a new queuing discipline for IETF RFC9332 DualQ Coupled AQM - Add sequence numbers to netconsole messages. Unregister netconsole's console when all net targets are removed. Code refactoring. Add a number of selftests - Align IPSec inbound SA lookup to RFC 4301. 
Only SPI and protocol should be used for an inbound SA lookup - Support inspecting ref_tracker state via DebugFS - Don't force bonding advertisement frames tx to ~333 ms boundaries. Add broadcast_neighbor option to send ARP/ND on all bonded links - Allow providing upcall pid for the 'execute' command in openvswitch - Remove DCCP support from Netfilter's conntrack - Disallow multiple packet duplications in the queuing layer - Prevent use of deprecated iptables code on PREEMPT_RT Driver API: - Support RSS and hashing configuration over ethtool Netlink - Add dedicated ethtool callbacks for getting and setting hashing fields - Add support for power budget evaluation strategy in PSE / Power-over-Ethernet. Generate Netlink events for overcurrent etc - Support DPLL phase offset monitoring across all device inputs. Support providing clock reference and SYNC over separate DPLL inputs - Support traffic classes in devlink rate API for bandwidth management - Remove rtnl_lock dependency from UDP tunnel port configuration Device drivers: - Add a new Broadcom driver for 800G Ethernet (bnge) - Add a standalone driver for Microchip ZL3073x DPLL - Remove IBM's NETIUCV device driver - Ethernet high-speed NICs: - Broadcom (bnxt): - support zero-copy Tx of DMABUF memory - take page size into account for page pool recycling rings - Intel (100G, ice, idpf): - idpf: XDP and AF_XDP support preparations - idpf: add flow steering - add link_down_events statistic - clean up the TSPLL code - preparations for live VM migration - nVidia/Mellanox: - support zero-copy Rx/Tx interfaces (DMABUF and io_uring) - optimize context memory usage for matchers - expose serial numbers in devlink info - support PCIe congestion metrics - Meta (fbnic): - add 25G, 50G, and 100G link modes to phylink - support dumping FW logs - Marvell/Cavium: - support for CN20K generation of the Octeon chips - Amazon: - add HW clock (without timestamping, just hypervisor time access) - Ethernet virtual: - VirtIO net: - support 
segmentation of UDP-tunnel-encapsulated packets - Google (gve): - support packet timestamping and clock synchronization - Microsoft vNIC: - add handler for device-originated servicing events - allow dynamic MSI-X vector allocation - support Tx bandwidth clamping - Ethernet NICs consumer, and embedded: - AMD: - amd-xgbe: hardware timestamping and PTP clock support - Broadcom integrated MACs (bcmgenet, bcmasp): - use napi_complete_done() return value to support NAPI polling - add support for re-starting auto-negotiation - Broadcom switches (b53): - support BCM5325 switches - add bcm63xx EPHY power control - Synopsys (stmmac): - lots of code refactoring and cleanups - TI: - icssg-prueth: read firmware-names from device tree - icssg: PRP offload support - Microchip: - lan78xx: convert to PHYLINK for improved PHY and MAC management - ksz: add KSZ8463 switch support - Intel: - support similar queue priority scheme in multi-queue and time-sensitive networking (taprio) - support packet pre-emption in both - RealTek (r8169): - enable EEE at 5Gbps on RTL8126 - Airoha: - add PPPoE offload support - MDIO bus controller for Airoha AN7583 - Ethernet PHYs: - support for the IPQ5018 internal GE PHY - micrel KSZ9477 switch-integrated PHYs: - add MDI/MDI-X control support - add RX error counters - add cable test support - add Signal Quality Indicator (SQI) reporting - dp83tg720: improve reset handling and reduce link recovery time - support bcm54811 (and its MII-Lite interface type) - air_en8811h: support resume/suspend - support PHY counters for QCA807x and QCA808x - support WoL for QCA807x - CAN drivers: - rcar_canfd: support for Transceiver Delay Compensation - kvaser: report FW versions via devlink dev info - WiFi: - extended regulatory info support (6 GHz) - add statistics and beacon monitor for Multi-Link Operation (MLO) - support S1G aggregation, improve S1G support - add Radio Measurement action fields - support per-radio RTS threshold - some work around how FIPS affects 
wifi, which was wrong (RC4 is used by TKIP, not only WEP) - improvements for unsolicited probe response handling - WiFi drivers: - RealTek (rtw88): - IBSS mode for SDIO devices - RealTek (rtw89): - BT coexistence for MLO/WiFi7 - concurrent station + P2P support - support for USB devices RTL8851BU/RTL8852BU - Intel (iwlwifi): - use embedded PNVM in (to be released) FW images to fix compatibility issues - many cleanups (unused FW APIs, PCIe code, WoWLAN) - some FIPS interoperability - MediaTek (mt76): - firmware recovery improvements - more MLO work - Qualcomm/Atheros (ath12k): - fix scan on multi-radio devices - more EHT/Wi-Fi 7 features - encapsulation/decapsulation offload - Broadcom (brcm80211): - support SDIO 43751 device - Bluetooth: - hci_event: add support for handling LE BIG Sync Lost event - ISO: add socket option to report packet seqnum via CMSG - ISO: support SCM_TIMESTAMPING for ISO TS - Bluetooth drivers: - intel_pcie: support Function Level Reset - nxpuart: add support for 4M baudrate - nxpuart: implement powerup sequence, reset, FW dump, and FW loading" * tag 'net-next-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1742 commits) dpll: zl3073x: Fix build failure selftests: bpf: fix legacy netfilter options ipv6: annotate data-races around rt->fib6_nsiblings ipv6: fix possible infinite loop in fib6_info_uses_dev() ipv6: prevent infinite loop in rt6_nlmsg_size() ipv6: add a retry logic in net6_rt_notify() vrf: Drop existing dst reference in vrf_ip6_input_dst net/sched: taprio: align entry index attr validation with mqprio net: fsl_pq_mdio: use dev_err_probe selftests: rtnetlink.sh: remove esp4_offload after test vsock: remove unnecessary null check in vsock_getname() igb: xsk: solve negative overflow of nb_pkts in zerocopy mode stmmac: xsk: fix negative overflow of budget in zerocopy mode dt-bindings: ieee802154: Convert at86rf230.txt yaml format net: dsa: microchip: Disable PTP function of KSZ8463 net: dsa: microchip: Setup 
fiber ports for KSZ8463 net: dsa: microchip: Write switch MAC address differently for KSZ8463 net: dsa: microchip: Use different registers for KSZ8463 net: dsa: microchip: Add KSZ8463 switch support to KSZ DSA driver dt-bindings: net: dsa: microchip: Add KSZ8463 switch support ...
Diffstat (limited to 'drivers/pci/controller/pci-hyperv.c')
-rw-r--r-- drivers/pci/controller/pci-hyperv.c 110
1 files changed, 83 insertions, 27 deletions
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 13680363ff19..d2b7e8ea710b 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -44,6 +44,7 @@
#include <linux/delay.h>
#include <linux/semaphore.h>
#include <linux/irq.h>
+#include <linux/irqchip/irq-msi-lib.h>
#include <linux/msi.h>
#include <linux/hyperv.h>
#include <linux/refcount.h>
@@ -508,7 +509,6 @@ struct hv_pcibus_device {
struct list_head children;
struct list_head dr_list;
- struct msi_domain_info msi_info;
struct irq_domain *irq_domain;
struct workqueue_struct *wq;
@@ -576,9 +576,8 @@ struct hv_pci_compl {
static void hv_pci_onchannelcallback(void *context);
#ifdef CONFIG_X86
-#define DELIVERY_MODE APIC_DELIVERY_MODE_FIXED
-#define FLOW_HANDLER handle_edge_irq
-#define FLOW_NAME "edge"
+#define DELIVERY_MODE APIC_DELIVERY_MODE_FIXED
+#define HV_MSI_CHIP_FLAGS MSI_CHIP_FLAG_SET_ACK
static int hv_pci_irqchip_init(void)
{
@@ -737,8 +736,7 @@ static void hv_arch_irq_unmask(struct irq_data *data)
#define HV_PCI_MSI_SPI_START 64
#define HV_PCI_MSI_SPI_NR (1020 - HV_PCI_MSI_SPI_START)
#define DELIVERY_MODE 0
-#define FLOW_HANDLER NULL
-#define FLOW_NAME NULL
+#define HV_MSI_CHIP_FLAGS MSI_CHIP_FLAG_SET_EOI
#define hv_msi_prepare NULL
struct hv_pci_chip_data {
@@ -1701,7 +1699,7 @@ static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
struct msi_desc *msi = irq_data_get_msi_desc(irq_data);
pdev = msi_desc_to_pci_dev(msi);
- hbus = info->data;
+ hbus = domain->host_data;
int_desc = irq_data_get_irq_chip_data(irq_data);
if (!int_desc)
return;
@@ -1719,7 +1717,6 @@ static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
static void hv_irq_mask(struct irq_data *data)
{
- pci_msi_mask_irq(data);
if (data->parent_data->chip->irq_mask)
irq_chip_mask_parent(data);
}
@@ -1730,7 +1727,6 @@ static void hv_irq_unmask(struct irq_data *data)
if (data->parent_data->chip->irq_unmask)
irq_chip_unmask_parent(data);
- pci_msi_unmask_irq(data);
}
struct compose_comp_ctxt {
@@ -2115,24 +2111,87 @@ return_null_message:
msg->data = 0;
}
+static bool hv_pcie_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+ struct irq_chip *chip = info->chip;
+
+ if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+ return false;
+
+ info->ops->msi_prepare = hv_msi_prepare;
+
+ chip->irq_set_affinity = irq_chip_set_affinity_parent;
+
+ if (IS_ENABLED(CONFIG_X86))
+ chip->flags |= IRQCHIP_MOVE_DEFERRED;
+
+ return true;
+}
+
+#define HV_PCIE_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \
+ MSI_FLAG_USE_DEF_CHIP_OPS | \
+ MSI_FLAG_PCI_MSI_MASK_PARENT)
+#define HV_PCIE_MSI_FLAGS_SUPPORTED (MSI_FLAG_MULTI_PCI_MSI | \
+ MSI_FLAG_PCI_MSIX | \
+ MSI_FLAG_PCI_MSIX_ALLOC_DYN | \
+ MSI_GENERIC_FLAGS_MASK)
+
+static const struct msi_parent_ops hv_pcie_msi_parent_ops = {
+ .required_flags = HV_PCIE_MSI_FLAGS_REQUIRED,
+ .supported_flags = HV_PCIE_MSI_FLAGS_SUPPORTED,
+ .bus_select_token = DOMAIN_BUS_PCI_MSI,
+ .chip_flags = HV_MSI_CHIP_FLAGS,
+ .prefix = "HV-",
+ .init_dev_msi_info = hv_pcie_init_dev_msi_info,
+};
+
/* HW Interrupt Chip Descriptor */
static struct irq_chip hv_msi_irq_chip = {
.name = "Hyper-V PCIe MSI",
.irq_compose_msi_msg = hv_compose_msi_msg,
.irq_set_affinity = irq_chip_set_affinity_parent,
-#ifdef CONFIG_X86
.irq_ack = irq_chip_ack_parent,
- .flags = IRQCHIP_MOVE_DEFERRED,
-#elif defined(CONFIG_ARM64)
.irq_eoi = irq_chip_eoi_parent,
-#endif
.irq_mask = hv_irq_mask,
.irq_unmask = hv_irq_unmask,
};
-static struct msi_domain_ops hv_msi_ops = {
- .msi_prepare = hv_msi_prepare,
- .msi_free = hv_msi_free,
+static int hv_pcie_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs,
+ void *arg)
+{
+ /*
+ * TODO: Allocating and populating struct tran_int_desc in hv_compose_msi_msg()
+ * should be moved here.
+ */
+ int ret;
+
+ ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ for (int i = 0; i < nr_irqs; i++) {
+ irq_domain_set_hwirq_and_chip(d, virq + i, 0, &hv_msi_irq_chip, NULL);
+ if (IS_ENABLED(CONFIG_X86))
+ __irq_set_handler(virq + i, handle_edge_irq, 0, "edge");
+ }
+
+ return 0;
+}
+
+static void hv_pcie_domain_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs)
+{
+ struct msi_domain_info *info = d->host_data;
+
+ for (int i = 0; i < nr_irqs; i++)
+ hv_msi_free(d, info, virq + i);
+
+ irq_domain_free_irqs_top(d, virq, nr_irqs);
+}
+
+static const struct irq_domain_ops hv_pcie_domain_ops = {
+ .alloc = hv_pcie_domain_alloc,
+ .free = hv_pcie_domain_free,
};
/**
@@ -2150,17 +2209,14 @@ static struct msi_domain_ops hv_msi_ops = {
*/
static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
{
- hbus->msi_info.chip = &hv_msi_irq_chip;
- hbus->msi_info.ops = &hv_msi_ops;
- hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
- MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
- MSI_FLAG_PCI_MSIX);
- hbus->msi_info.handler = FLOW_HANDLER;
- hbus->msi_info.handler_name = FLOW_NAME;
- hbus->msi_info.data = hbus;
- hbus->irq_domain = pci_msi_create_irq_domain(hbus->fwnode,
- &hbus->msi_info,
- hv_pci_get_root_domain());
+ struct irq_domain_info info = {
+ .fwnode = hbus->fwnode,
+ .ops = &hv_pcie_domain_ops,
+ .host_data = hbus,
+ .parent = hv_pci_get_root_domain(),
+ };
+
+ hbus->irq_domain = msi_create_parent_irq_domain(&info, &hv_pcie_msi_parent_ops);
if (!hbus->irq_domain) {
dev_err(&hbus->hdev->device,
"Failed to build an MSI IRQ domain\n");