Diffstat (limited to 'drivers/pci/controller/pci-hyperv.c')
-rw-r--r--	drivers/pci/controller/pci-hyperv.c	758
1 file changed, 461 insertions(+), 297 deletions(-)
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 084f5313895c..1e237d3538f9 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -44,11 +44,14 @@
 #include <linux/delay.h>
 #include <linux/semaphore.h>
 #include <linux/irq.h>
+#include <linux/irqchip/irq-msi-lib.h>
 #include <linux/msi.h>
 #include <linux/hyperv.h>
 #include <linux/refcount.h>
 #include <linux/irqdomain.h>
 #include <linux/acpi.h>
+#include <linux/sizes.h>
+#include <linux/of_irq.h>
 #include <asm/mshyperv.h>
 
 /*
@@ -308,8 +311,6 @@ struct pci_packet {
	void (*completion_func)(void *context, struct pci_response *resp,
				int resp_packet_size);
	void *compl_ctxt;
-
-	struct pci_message message[];
 };
 
 /*
@@ -465,7 +466,7 @@ struct pci_eject_response {
	u32 status;
 } __packed;
 
-static int pci_ring_size = (4 * PAGE_SIZE);
+static int pci_ring_size = VMBUS_RING_SIZE(SZ_16K);
 
 /*
  * Driver specific state.
  */
@@ -489,7 +490,10 @@ struct hv_pcibus_device {
	struct fwnode_handle *fwnode;
	/* Protocol version negotiated with the host */
	enum pci_protocol_version_t protocol_version;
+
+	struct mutex state_lock;
	enum hv_pcibus_state state;
+
	struct hv_device *hdev;
	resource_size_t low_mmio_space;
	resource_size_t high_mmio_space;
@@ -505,22 +509,13 @@ struct hv_pcibus_device {
	struct list_head children;
	struct list_head dr_list;
 
-	struct msi_domain_info msi_info;
	struct irq_domain *irq_domain;
 
-	spinlock_t retarget_msi_interrupt_lock;
-
	struct workqueue_struct *wq;
 
	/* Highest slot of child device with resources allocated */
	int wslot_res_allocated;
-
-	/* hypercall arg, must not cross page boundary */
-	struct hv_retarget_device_interrupt retarget_msi_interrupt_params;
-
-	/*
-	 * Don't put anything here: retarget_msi_interrupt_params must be last
-	 */
+	bool use_calls; /* Use hypercalls to access mmio cfg space */
 };
 
 /*
@@ -550,22 +545,13 @@ struct hv_pcidev_description {
 struct hv_dr_state {
	struct list_head list_entry;
	u32 device_count;
-	struct hv_pcidev_description func[];
-};
-
-enum hv_pcichild_state {
-	hv_pcichild_init = 0,
-	hv_pcichild_requirements,
-	hv_pcichild_resourced,
-	hv_pcichild_ejecting,
-	hv_pcichild_maximum
+	struct hv_pcidev_description func[] __counted_by(device_count);
 };
 
 struct hv_pci_dev {
	/* List protected by pci_rescan_remove_lock */
	struct list_head list_entry;
	refcount_t refs;
-	enum hv_pcichild_state state;
	struct pci_slot *pci_slot;
	struct hv_pcidev_description desc;
	bool reported_missing;
@@ -590,9 +576,8 @@ struct hv_pci_compl {
 static void hv_pci_onchannelcallback(void *context);
 
 #ifdef CONFIG_X86
-#define DELIVERY_MODE	APIC_DELIVERY_MODE_FIXED
-#define FLOW_HANDLER	handle_edge_irq
-#define FLOW_NAME	"edge"
+#define DELIVERY_MODE		APIC_DELIVERY_MODE_FIXED
+#define HV_MSI_CHIP_FLAGS	MSI_CHIP_FLAG_SET_ACK
 
 static int hv_pci_irqchip_init(void)
 {
@@ -614,7 +599,7 @@ static unsigned int hv_msi_get_int_vector(struct irq_data *data)
 #define hv_msi_prepare		pci_msi_prepare
 
 /**
- * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current
+ * hv_irq_retarget_interrupt() - "Unmask" the IRQ by setting its current
  * affinity.
  * @data:	Describes the IRQ
 *
 * Build new a destination for the MSI and make a hypercall to
 * update the Interrupt Redirection Table. "Device Logical ID"
 * is built out of this PCI bus's instance GUID and the function
 * number of the device.
 */
-static void hv_arch_irq_unmask(struct irq_data *data)
+static void hv_irq_retarget_interrupt(struct irq_data *data)
 {
	struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
	struct hv_retarget_device_interrupt *params;
@@ -643,10 +628,15 @@ static void hv_arch_irq_unmask(struct irq_data *data)
	pbus = pdev->bus;
	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
	int_desc = data->chip_data;
+	if (!int_desc) {
+		dev_warn(&hbus->hdev->device, "%s() can not unmask irq %u\n",
+			 __func__, data->irq);
+		return;
+	}
 
-	spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
+	local_irq_save(flags);
 
-	params = &hbus->retarget_msi_interrupt_params;
+	params = *this_cpu_ptr(hyperv_pcpu_input_arg);
	memset(params, 0, sizeof(*params));
	params->partition_id = HV_PARTITION_ID_SELF;
	params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
@@ -659,13 +649,6 @@ static void hv_arch_irq_unmask(struct irq_data *data)
			   PCI_FUNC(pdev->devfn);
	params->int_target.vector = hv_msi_get_int_vector(data);
 
-	/*
-	 * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by
-	 * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a
-	 * spurious interrupt storm. Not doing so does not seem to have a
-	 * negative effect (yet?).
-	 */
-
	if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
		/*
		 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
@@ -679,7 +662,7 @@ static void hv_arch_irq_unmask(struct irq_data *data)
 
		if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
			res = 1;
-			goto exit_unlock;
+			goto out;
		}
 
		cpumask_and(tmp, dest, cpu_online_mask);
@@ -688,7 +671,7 @@ static void hv_arch_irq_unmask(struct irq_data *data)
 
		if (nr_bank <= 0) {
			res = 1;
-			goto exit_unlock;
+			goto out;
		}
 
		/*
@@ -707,8 +690,8 @@ static void hv_arch_irq_unmask(struct irq_data *data)
	res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
			      params, NULL);
 
-exit_unlock:
-	spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
+out:
+	local_irq_restore(flags);
 
	/*
	 * During hibernation, when a CPU is offlined, the kernel tries
@@ -730,6 +713,20 @@ exit_unlock:
		dev_err(&hbus->hdev->device,
			"%s() failed: %#llx", __func__, res);
 }
+
+static void hv_arch_irq_unmask(struct irq_data *data)
+{
+	if (hv_root_partition())
+		/*
+		 * In case of the nested root partition, the nested hypervisor
+		 * is taking care of interrupt remapping and thus the
+		 * MAP_DEVICE_INTERRUPT hypercall is required instead of
+		 * RETARGET_INTERRUPT.
+		 */
+		(void)hv_map_msi_interrupt(data, NULL);
+	else
+		hv_irq_retarget_interrupt(data);
+}
 #elif defined(CONFIG_ARM64)
 /*
  * SPI vectors to use for vPCI; arch SPIs range is [32, 1019], but leaving a bit
@@ -739,8 +736,7 @@ exit_unlock:
 #define HV_PCI_MSI_SPI_START	64
 #define HV_PCI_MSI_SPI_NR	(1020 - HV_PCI_MSI_SPI_START)
 #define DELIVERY_MODE		0
-#define FLOW_HANDLER		NULL
-#define FLOW_NAME		NULL
+#define HV_MSI_CHIP_FLAGS	MSI_CHIP_FLAG_SET_EOI
 #define hv_msi_prepare		NULL
 
 struct hv_pci_chip_data {
@@ -832,9 +828,17 @@ static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain,
	int ret;
 
	fwspec.fwnode = domain->parent->fwnode;
-	fwspec.param_count = 2;
-	fwspec.param[0] = hwirq;
-	fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
+	if (is_of_node(fwspec.fwnode)) {
+		/* SPI lines for OF translations start at offset 32 */
+		fwspec.param_count = 3;
+		fwspec.param[0] = 0;
+		fwspec.param[1] = hwirq - 32;
+		fwspec.param[2] = IRQ_TYPE_EDGE_RISING;
+	} else {
+		fwspec.param_count = 2;
+		fwspec.param[0] = hwirq;
+		fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
+	}
 
	ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
	if (ret)
@@ -902,10 +906,44 @@ static const struct irq_domain_ops hv_pci_domain_ops = {
	.activate = hv_pci_vec_irq_domain_activate,
 };
 
+#ifdef CONFIG_OF
+
+static struct irq_domain *hv_pci_of_irq_domain_parent(void)
+{
+	struct device_node *parent;
+	struct irq_domain *domain;
+
+	parent = of_irq_find_parent(hv_get_vmbus_root_device()->of_node);
+	if (!parent)
+		return NULL;
+	domain = irq_find_host(parent);
+	of_node_put(parent);
+
+	return domain;
+}
+
+#endif
+
+#ifdef CONFIG_ACPI
+
+static struct irq_domain *hv_pci_acpi_irq_domain_parent(void)
+{
+	acpi_gsi_domain_disp_fn gsi_domain_disp_fn;
+
+	gsi_domain_disp_fn = acpi_get_gsi_dispatcher();
+	if (!gsi_domain_disp_fn)
+		return NULL;
+	return irq_find_matching_fwnode(gsi_domain_disp_fn(0),
+					DOMAIN_BUS_ANY);
+}
+
+#endif
+
 static int hv_pci_irqchip_init(void)
 {
	static struct hv_pci_chip_data *chip_data;
	struct fwnode_handle *fn = NULL;
+	struct irq_domain *irq_domain_parent = NULL;
	int ret = -ENOMEM;
 
	chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL);
@@ -922,9 +960,24 @@ static int hv_pci_irqchip_init(void)
	 * way to ensure that all the corresponding devices are also gone and
	 * no interrupts will be generated.
	 */
-	hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR,
-							  fn, &hv_pci_domain_ops,
-							  chip_data);
+#ifdef CONFIG_ACPI
+	if (!acpi_disabled)
+		irq_domain_parent = hv_pci_acpi_irq_domain_parent();
+#endif
+#ifdef CONFIG_OF
+	if (!irq_domain_parent)
+		irq_domain_parent = hv_pci_of_irq_domain_parent();
+#endif
+	if (!irq_domain_parent) {
+		WARN_ONCE(1, "Invalid firmware configuration for VMBus interrupts\n");
+		ret = -EINVAL;
+		goto free_chip;
+	}
+
+	hv_msi_gic_irq_domain = irq_domain_create_hierarchy(irq_domain_parent, 0,
+							    HV_PCI_MSI_SPI_NR,
+							    fn, &hv_pci_domain_ops,
+							    chip_data);
 
	if (!hv_msi_gic_irq_domain) {
		pr_err("Failed to create Hyper-V arm64 vPCI MSI IRQ domain\n");
@@ -1041,6 +1094,70 @@ static int wslot_to_devfn(u32 wslot)
	return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
 }
 
+static void hv_pci_read_mmio(struct device *dev, phys_addr_t gpa, int size, u32 *val)
+{
+	struct hv_mmio_read_input *in;
+	struct hv_mmio_read_output *out;
+	u64 ret;
+
+	/*
+	 * Must be called with interrupts disabled so it is safe
+	 * to use the per-cpu input argument page.  Use it for
+	 * both input and output.
+	 */
+	in = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	out = *this_cpu_ptr(hyperv_pcpu_input_arg) + sizeof(*in);
+	in->gpa = gpa;
+	in->size = size;
+
+	ret = hv_do_hypercall(HVCALL_MMIO_READ, in, out);
+	if (hv_result_success(ret)) {
+		switch (size) {
+		case 1:
+			*val = *(u8 *)(out->data);
+			break;
+		case 2:
+			*val = *(u16 *)(out->data);
+			break;
+		default:
+			*val = *(u32 *)(out->data);
+			break;
+		}
+	} else
+		dev_err(dev, "MMIO read hypercall error %llx addr %llx size %d\n",
+			ret, gpa, size);
+}
+
+static void hv_pci_write_mmio(struct device *dev, phys_addr_t gpa, int size, u32 val)
+{
+	struct hv_mmio_write_input *in;
+	u64 ret;
+
+	/*
+	 * Must be called with interrupts disabled so it is safe
+	 * to use the per-cpu input argument memory.
+	 */
+	in = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	in->gpa = gpa;
+	in->size = size;
+	switch (size) {
+	case 1:
+		*(u8 *)(in->data) = val;
+		break;
+	case 2:
+		*(u16 *)(in->data) = val;
+		break;
+	default:
+		*(u32 *)(in->data) = val;
+		break;
+	}
+
+	ret = hv_do_hypercall(HVCALL_MMIO_WRITE, in, NULL);
+	if (!hv_result_success(ret))
+		dev_err(dev, "MMIO write hypercall error %llx addr %llx size %d\n",
+			ret, gpa, size);
+}
+
 /*
  * PCI Configuration Space for these root PCI buses is implemented as a pair
  * of pages in memory-mapped I/O space. Writing to the first page chooses
@@ -1059,8 +1176,10 @@ static int wslot_to_devfn(u32 wslot)
 static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
				     int size, u32 *val)
 {
+	struct hv_pcibus_device *hbus = hpdev->hbus;
+	struct device *dev = &hbus->hdev->device;
+	int offset = where + CFG_PAGE_OFFSET;
	unsigned long flags;
-	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
 
	/*
	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
@@ -1079,8 +1198,8 @@ static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
		   PCI_CAPABILITY_LIST) {
		/* ROM BARs are unimplemented */
		*val = 0;
-	} else if (where >= PCI_INTERRUPT_LINE && where + size <=
-		   PCI_INTERRUPT_PIN) {
+	} else if ((where >= PCI_INTERRUPT_LINE && where + size <= PCI_INTERRUPT_PIN) ||
+		   (where >= PCI_INTERRUPT_PIN && where + size <= PCI_MIN_GNT)) {
		/*
		 * Interrupt Line and Interrupt PIN are hard-wired to zero
		 * because this front-end only supports message-signaled
@@ -1088,56 +1207,79 @@ static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
		 */
		*val = 0;
	} else if (where + size <= CFG_PAGE_SIZE) {
-		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
-		/* Choose the function to be read. (See comment above) */
-		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
-		/* Make sure the function was chosen before we start reading. */
-		mb();
-		/* Read from that function's config space. */
-		switch (size) {
-		case 1:
-			*val = readb(addr);
-			break;
-		case 2:
-			*val = readw(addr);
-			break;
-		default:
-			*val = readl(addr);
-			break;
+
+		spin_lock_irqsave(&hbus->config_lock, flags);
+		if (hbus->use_calls) {
+			phys_addr_t addr = hbus->mem_config->start + offset;
+
+			hv_pci_write_mmio(dev, hbus->mem_config->start, 4,
+					  hpdev->desc.win_slot.slot);
+			hv_pci_read_mmio(dev, addr, size, val);
+		} else {
+			void __iomem *addr = hbus->cfg_addr + offset;
+
+			/* Choose the function to be read. (See comment above) */
+			writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
+			/* Make sure the function was chosen before reading. */
+			mb();
+			/* Read from that function's config space. */
+			switch (size) {
+			case 1:
+				*val = readb(addr);
+				break;
+			case 2:
+				*val = readw(addr);
+				break;
+			default:
+				*val = readl(addr);
+				break;
+			}
+			/*
+			 * Make sure the read was done before we release the
+			 * spinlock allowing consecutive reads/writes.
+			 */
+			mb();
		}
-		/*
-		 * Make sure the read was done before we release the spinlock
-		 * allowing consecutive reads/writes.
-		 */
-		mb();
-		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
+		spin_unlock_irqrestore(&hbus->config_lock, flags);
	} else {
-		dev_err(&hpdev->hbus->hdev->device,
-			"Attempt to read beyond a function's config space.\n");
+		dev_err(dev, "Attempt to read beyond a function's config space.\n");
	}
 }
 
 static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
 {
+	struct hv_pcibus_device *hbus = hpdev->hbus;
+	struct device *dev = &hbus->hdev->device;
+	u32 val;
	u16 ret;
	unsigned long flags;
-	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET +
-			     PCI_VENDOR_ID;
 
-	spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
+	spin_lock_irqsave(&hbus->config_lock, flags);
 
-	/* Choose the function to be read. (See comment above) */
-	writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
-	/* Make sure the function was chosen before we start reading. */
-	mb();
-	/* Read from that function's config space. */
-	ret = readw(addr);
-	/*
-	 * mb() is not required here, because the spin_unlock_irqrestore()
-	 * is a barrier.
-	 */
+	if (hbus->use_calls) {
+		phys_addr_t addr = hbus->mem_config->start +
+				   CFG_PAGE_OFFSET + PCI_VENDOR_ID;
 
-	spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
+		hv_pci_write_mmio(dev, hbus->mem_config->start, 4,
+				  hpdev->desc.win_slot.slot);
+		hv_pci_read_mmio(dev, addr, 2, &val);
+		ret = val;  /* Truncates to 16 bits */
+	} else {
+		void __iomem *addr = hbus->cfg_addr + CFG_PAGE_OFFSET +
+				     PCI_VENDOR_ID;
+		/* Choose the function to be read. (See comment above) */
+		writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
+		/* Make sure the function was chosen before we start reading. */
+		mb();
+		/* Read from that function's config space. */
+		ret = readw(addr);
+		/*
+		 * mb() is not required here, because the
+		 * spin_unlock_irqrestore() is a barrier.
+		 */
+	}
+
+	spin_unlock_irqrestore(&hbus->config_lock, flags);
 
	return ret;
 }
@@ -1152,39 +1294,51 @@ static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
 static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
				      int size, u32 val)
 {
+	struct hv_pcibus_device *hbus = hpdev->hbus;
+	struct device *dev = &hbus->hdev->device;
+	int offset = where + CFG_PAGE_OFFSET;
	unsigned long flags;
-	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
 
	if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
	    where + size <= PCI_CAPABILITY_LIST) {
		/* SSIDs and ROM BARs are read-only */
	} else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
-		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
-		/* Choose the function to be written. (See comment above) */
-		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
-		/* Make sure the function was chosen before we start writing. */
-		wmb();
-		/* Write to that function's config space. */
-		switch (size) {
-		case 1:
-			writeb(val, addr);
-			break;
-		case 2:
-			writew(val, addr);
-			break;
-		default:
-			writel(val, addr);
-			break;
+		spin_lock_irqsave(&hbus->config_lock, flags);
+
+		if (hbus->use_calls) {
+			phys_addr_t addr = hbus->mem_config->start + offset;
+
+			hv_pci_write_mmio(dev, hbus->mem_config->start, 4,
+					  hpdev->desc.win_slot.slot);
+			hv_pci_write_mmio(dev, addr, size, val);
+		} else {
+			void __iomem *addr = hbus->cfg_addr + offset;
+
+			/* Choose the function to write. (See comment above) */
+			writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
+			/* Make sure the function was chosen before writing. */
+			wmb();
+			/* Write to that function's config space. */
+			switch (size) {
+			case 1:
+				writeb(val, addr);
+				break;
+			case 2:
+				writew(val, addr);
+				break;
+			default:
+				writel(val, addr);
+				break;
+			}
+			/*
+			 * Make sure the write was done before we release the
+			 * spinlock allowing consecutive reads/writes.
+			 */
+			mb();
		}
-		/*
-		 * Make sure the write was done before we release the spinlock
-		 * allowing consecutive reads/writes.
-		 */
-		mb();
-		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
+		spin_unlock_irqrestore(&hbus->config_lock, flags);
	} else {
-		dev_err(&hpdev->hbus->hdev->device,
-			"Attempt to write beyond a function's config space.\n");
+		dev_err(dev, "Attempt to write beyond a function's config space.\n");
	}
 }
 
@@ -1270,7 +1424,7 @@ static struct pci_ops hv_pcifront_ops = {
  *
  * If the PF driver wishes to initiate communication, it can "invalidate" one or
  * more of the first 64 blocks.  This invalidation is delivered via a callback
- * supplied by the VF driver by this driver.
+ * supplied to the VF driver by this driver.
  *
  * No protocol is implied, except that supplied by the PF and VF drivers.
  */
@@ -1352,7 +1506,7 @@ static int hv_read_config_block(struct pci_dev *pdev, void *buf,
	memset(&pkt, 0, sizeof(pkt));
	pkt.pkt.completion_func = hv_pci_read_config_compl;
	pkt.pkt.compl_ctxt = &comp_pkt;
-	read_blk = (struct pci_read_block *)&pkt.pkt.message;
+	read_blk = (struct pci_read_block *)pkt.buf;
	read_blk->message_type.type = PCI_READ_BLOCK;
	read_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
	read_blk->block_id = block_id;
@@ -1432,7 +1586,7 @@ static int hv_write_config_block(struct pci_dev *pdev, void *buf,
	memset(&pkt, 0, sizeof(pkt));
	pkt.pkt.completion_func = hv_pci_write_config_compl;
	pkt.pkt.compl_ctxt = &comp_pkt;
-	write_blk = (struct pci_write_block *)&pkt.pkt.message;
+	write_blk = (struct pci_write_block *)pkt.buf;
	write_blk->message_type.type = PCI_WRITE_BLOCK;
	write_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
	write_blk->block_id = block_id;
@@ -1513,7 +1667,7 @@ static void hv_int_desc_free(struct hv_pci_dev *hpdev,
		return;
	}
	memset(&ctxt, 0, sizeof(ctxt));
-	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
+	int_pkt = (struct pci_delete_interrupt *)ctxt.buffer;
	int_pkt->message_type.type =
		PCI_DELETE_INTERRUPT_MESSAGE;
	int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
@@ -1526,7 +1680,6 @@ static void hv_int_desc_free(struct hv_pci_dev *hpdev,
 /**
  * hv_msi_free() - Free the MSI.
  * @domain: The interrupt domain pointer
- * @info:	Extra MSI-related context
  * @irq:	Identifies the IRQ.
 *
 * The Hyper-V parent partition and hypervisor are tracking the
 * messages that are in use, keeping the interrupt redirection
 * table up to date.  This callback sends a message that frees
 * the IRT entry and related tracking nonsense.
 */
-static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
-			unsigned int irq)
+static void hv_msi_free(struct irq_domain *domain, unsigned int irq)
 {
	struct hv_pcibus_device *hbus;
	struct hv_pci_dev *hpdev;
@@ -1545,7 +1697,7 @@ static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
	struct msi_desc *msi = irq_data_get_msi_desc(irq_data);
 
	pdev = msi_desc_to_pci_dev(msi);
-	hbus = info->data;
+	hbus = domain->host_data;
	int_desc = irq_data_get_irq_chip_data(irq_data);
	if (!int_desc)
		return;
@@ -1563,7 +1715,6 @@ static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
 
 static void hv_irq_mask(struct irq_data *data)
 {
-	pci_msi_mask_irq(data);
	if (data->parent_data->chip->irq_mask)
		irq_chip_mask_parent(data);
 }
@@ -1574,7 +1725,6 @@ static void hv_irq_unmask(struct irq_data *data)
 
	if (data->parent_data->chip->irq_unmask)
		irq_chip_unmask_parent(data);
-	pci_msi_unmask_irq(data);
 }
 
 struct compose_comp_ctxt {
@@ -1671,8 +1821,7 @@ static int hv_compose_multi_msi_req_get_cpu(void)
 
	spin_lock_irqsave(&multi_msi_cpu_lock, flags);
 
-	cpu_next = cpumask_next_wrap(cpu_next, cpu_online_mask, nr_cpu_ids,
-				     false);
+	cpu_next = cpumask_next_wrap(cpu_next, cpu_online_mask);
	cpu = cpu_next;
	spin_unlock_irqrestore(&multi_msi_cpu_lock, flags);
 
@@ -1911,12 +2060,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
			hv_pci_onchannelcallback(hbus);
			spin_unlock_irqrestore(&channel->sched_lock, flags);
 
-		if (hpdev->state == hv_pcichild_ejecting) {
-			dev_err_once(&hbus->hdev->device,
-				     "the device is being ejected\n");
-			goto enable_tasklet;
-		}
-
		udelay(100);
	}
 
@@ -1966,23 +2109,85 @@ return_null_message:
	msg->data = 0;
 }
 
+static bool hv_pcie_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+				      struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+	struct irq_chip *chip = info->chip;
+
+	if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+		return false;
+
+	info->ops->msi_prepare = hv_msi_prepare;
+
+	chip->irq_set_affinity = irq_chip_set_affinity_parent;
+
+	if (IS_ENABLED(CONFIG_X86))
+		chip->flags |= IRQCHIP_MOVE_DEFERRED;
+
+	return true;
+}
+
+#define HV_PCIE_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS |	\
+				    MSI_FLAG_USE_DEF_CHIP_OPS |	\
+				    MSI_FLAG_PCI_MSI_MASK_PARENT)
+#define HV_PCIE_MSI_FLAGS_SUPPORTED (MSI_FLAG_MULTI_PCI_MSI |	\
+				     MSI_FLAG_PCI_MSIX |	\
+				     MSI_FLAG_PCI_MSIX_ALLOC_DYN |	\
+				     MSI_GENERIC_FLAGS_MASK)
+
+static const struct msi_parent_ops hv_pcie_msi_parent_ops = {
+	.required_flags		= HV_PCIE_MSI_FLAGS_REQUIRED,
+	.supported_flags	= HV_PCIE_MSI_FLAGS_SUPPORTED,
+	.bus_select_token	= DOMAIN_BUS_PCI_MSI,
+	.chip_flags		= HV_MSI_CHIP_FLAGS,
+	.prefix			= "HV-",
+	.init_dev_msi_info	= hv_pcie_init_dev_msi_info,
+};
+
 /* HW Interrupt Chip Descriptor */
 static struct irq_chip hv_msi_irq_chip = {
	.name			= "Hyper-V PCIe MSI",
	.irq_compose_msi_msg	= hv_compose_msi_msg,
	.irq_set_affinity	= irq_chip_set_affinity_parent,
-#ifdef CONFIG_X86
	.irq_ack		= irq_chip_ack_parent,
-#elif defined(CONFIG_ARM64)
	.irq_eoi		= irq_chip_eoi_parent,
-#endif
	.irq_mask		= hv_irq_mask,
	.irq_unmask		= hv_irq_unmask,
 };
 
-static struct msi_domain_ops hv_msi_ops = {
-	.msi_prepare	= hv_msi_prepare,
-	.msi_free	= hv_msi_free,
+static int hv_pcie_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs,
+				void *arg)
+{
+	/*
+	 * TODO: Allocating and populating struct tran_int_desc in hv_compose_msi_msg()
+	 * should be moved here.
+	 */
+	int ret;
+
+	ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, arg);
+	if (ret < 0)
+		return ret;
+
+	for (int i = 0; i < nr_irqs; i++) {
+		irq_domain_set_hwirq_and_chip(d, virq + i, 0, &hv_msi_irq_chip, NULL);
+		if (IS_ENABLED(CONFIG_X86))
+			__irq_set_handler(virq + i, handle_edge_irq, 0, "edge");
+	}
+
+	return 0;
+}
+
+static void hv_pcie_domain_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs)
+{
+	for (int i = 0; i < nr_irqs; i++)
+		hv_msi_free(d, virq + i);
+
+	irq_domain_free_irqs_top(d, virq, nr_irqs);
+}
+
+static const struct irq_domain_ops hv_pcie_domain_ops = {
+	.alloc	= hv_pcie_domain_alloc,
+	.free	= hv_pcie_domain_free,
 };
 
 /**
@@ -2000,17 +2205,14 @@ static struct msi_domain_ops hv_msi_ops = {
  */
 static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
 {
-	hbus->msi_info.chip = &hv_msi_irq_chip;
-	hbus->msi_info.ops = &hv_msi_ops;
-	hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
-		MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
-		MSI_FLAG_PCI_MSIX);
-	hbus->msi_info.handler = FLOW_HANDLER;
-	hbus->msi_info.handler_name = FLOW_NAME;
-	hbus->msi_info.data = hbus;
-	hbus->irq_domain = pci_msi_create_irq_domain(hbus->fwnode,
-						     &hbus->msi_info,
-						     hv_pci_get_root_domain());
+	struct irq_domain_info info = {
+		.fwnode		= hbus->fwnode,
+		.ops		= &hv_pcie_domain_ops,
+		.host_data	= hbus,
+		.parent		= hv_pci_get_root_domain(),
+	};
+
+	hbus->irq_domain = msi_create_parent_irq_domain(&info, &hv_pcie_msi_parent_ops);
	if (!hbus->irq_domain) {
		dev_err(&hbus->hdev->device,
			"Failed to build an MSI IRQ domain\n");
@@ -2402,7 +2604,7 @@ static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
	comp_pkt.hpdev = hpdev;
	pkt.init_packet.compl_ctxt = &comp_pkt;
	pkt.init_packet.completion_func = q_resource_requirements;
-	res_req = (struct pci_child_message *)&pkt.init_packet.message;
+	res_req = (struct pci_child_message *)pkt.buffer;
	res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
	res_req->wslot.slot = desc->win_slot.slot;
 
@@ -2522,6 +2724,8 @@ static void pci_devices_present_work(struct work_struct *work)
	if (!dr)
		return;
 
+	mutex_lock(&hbus->state_lock);
+
	/* First, mark all existing children as reported missing. */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	list_for_each_entry(hpdev, &hbus->children, list_entry) {
@@ -2603,6 +2807,8 @@ static void pci_devices_present_work(struct work_struct *work)
		break;
	}
 
+	mutex_unlock(&hbus->state_lock);
+
	kfree(dr);
 }
 
@@ -2751,7 +2957,7 @@ static void hv_eject_device_work(struct work_struct *work)
	hpdev = container_of(work, struct hv_pci_dev, wrk);
	hbus = hpdev->hbus;
 
-	WARN_ON(hpdev->state != hv_pcichild_ejecting);
+	mutex_lock(&hbus->state_lock);
 
	/*
	 * Ejection can come before or after the PCI bus has been set up, so
@@ -2776,7 +2982,7 @@ static void hv_eject_device_work(struct work_struct *work)
		pci_destroy_slot(hpdev->pci_slot);
 
	memset(&ctxt, 0, sizeof(ctxt));
-	ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
+	ejct_pkt = (struct pci_eject_response *)ctxt.buffer;
	ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
	ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
	vmbus_sendpacket(hbus->hdev->channel, ejct_pkt,
@@ -2789,6 +2995,8 @@ static void hv_eject_device_work(struct work_struct *work)
	put_pcichild(hpdev);
	put_pcichild(hpdev);
	/* hpdev has been freed. Do not use it any more. */
+
+	mutex_unlock(&hbus->state_lock);
 }
 
 /**
@@ -2809,7 +3017,6 @@ static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
		return;
	}
 
-	hpdev->state = hv_pcichild_ejecting;
	get_pcichild(hpdev);
	INIT_WORK(&hpdev->wrk, hv_eject_device_work);
	queue_work(hbus->wq, &hpdev->wrk);
@@ -3033,7 +3240,7 @@ static int hv_pci_protocol_negotiation(struct hv_device *hdev,
	init_completion(&comp_pkt.host_event);
	pkt->completion_func = hv_pci_generic_compl;
	pkt->compl_ctxt = &comp_pkt;
-	version_req = (struct pci_version_request *)&pkt->message;
+	version_req = (struct pci_version_request *)(pkt + 1);
	version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
 
	for (i = 0; i < num_version; i++) {
@@ -3238,8 +3445,10 @@ static int hv_pci_enter_d0(struct hv_device *hdev)
	struct pci_bus_d0_entry *d0_entry;
	struct hv_pci_compl comp_pkt;
	struct pci_packet *pkt;
+	bool retry = true;
	int ret;
 
+enter_d0_retry:
	/*
	 * Tell the host that the bus is ready to use, and moved into the
	 * powered-on state.  This includes telling the host which region
@@ -3253,7 +3462,7 @@ static int hv_pci_enter_d0(struct hv_device *hdev)
	init_completion(&comp_pkt.host_event);
	pkt->completion_func = hv_pci_generic_compl;
	pkt->compl_ctxt = &comp_pkt;
-	d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
+	d0_entry = (struct pci_bus_d0_entry *)(pkt + 1);
	d0_entry->message_type.type = PCI_BUS_D0ENTRY;
	d0_entry->mmio_base = hbus->mem_config->start;
 
@@ -3266,6 +3475,38 @@ static int hv_pci_enter_d0(struct hv_device *hdev)
	if (ret)
		goto exit;
 
+	/*
+	 * In certain case (Kdump) the pci device of interest was
+	 * not cleanly shut down and resource is still held on host
+	 * side, the host could return invalid device status.
+	 * We need to explicitly request host to release the resource
+	 * and try to enter D0 again.
+	 */
+	if (comp_pkt.completion_status < 0 && retry) {
+		retry = false;
+
+		dev_err(&hdev->device, "Retrying D0 Entry\n");
+
+		/*
+		 * Hv_pci_bus_exit() calls hv_send_resource_released()
+		 * to free up resources of its child devices.
+		 * In the kdump kernel we need to set the
+		 * wslot_res_allocated to 255 so it scans all child
+		 * devices to release resources allocated in the
+		 * normal kernel before panic happened.
+		 */
+		hbus->wslot_res_allocated = 255;
+
+		ret = hv_pci_bus_exit(hdev, true);
+
+		if (ret == 0) {
+			kfree(pkt);
+			goto enter_d0_retry;
+		}
+		dev_err(&hdev->device,
+			"Retrying D0 failed with ret %d\n", ret);
+	}
+
	if (comp_pkt.completion_status < 0) {
		dev_err(&hdev->device,
			"PCI Pass-through VSP failed D0 Entry with status %x\n",
@@ -3308,6 +3549,24 @@ static int hv_pci_query_relations(struct hv_device *hdev)
	if (!ret)
		ret = wait_for_response(hdev, &comp);
 
+	/*
+	 * In the case of fast device addition/removal, it's possible that
+	 * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we
+	 * already got a PCI_BUS_RELATIONS* message from the host and the
+	 * channel callback already scheduled a work to hbus->wq, which can be
+	 * running pci_devices_present_work() -> survey_child_resources() ->
+	 * complete(&hbus->survey_event), even after hv_pci_query_relations()
+	 * exits and the stack variable 'comp' is no longer valid; as a result,
+	 * a hang or a page fault may happen when the complete() calls
+	 * raw_spin_lock_irqsave(). Flush hbus->wq before we exit from
+	 * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is
+	 * -ENODEV, there can't be any more work item scheduled to hbus->wq
+	 * after the flush_workqueue(): see vmbus_onoffer_rescind() ->
+	 * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() ->
+	 * channel->rescind = true.
+	 */
+	flush_workqueue(hbus->wq);
+
	return ret;
 }
 
@@ -3361,20 +3620,20 @@ static int hv_send_resources_allocated(struct hv_device *hdev)
 
		if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) {
			res_assigned =
-				(struct pci_resources_assigned *)&pkt->message;
+				(struct pci_resources_assigned *)(pkt + 1);
			res_assigned->message_type.type =
				PCI_RESOURCES_ASSIGNED;
			res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
		} else {
			res_assigned2 =
-				(struct pci_resources_assigned2 *)&pkt->message;
+				(struct pci_resources_assigned2 *)(pkt + 1);
			res_assigned2->message_type.type =
				PCI_RESOURCES_ASSIGNED2;
			res_assigned2->wslot.slot = hpdev->desc.win_slot.slot;
		}
		put_pcichild(hpdev);
 
-		ret = vmbus_sendpacket(hdev->channel, &pkt->message,
+		ret = vmbus_sendpacket(hdev->channel, pkt + 1,
				size_res, (unsigned long)pkt,
				VM_PKT_DATA_INBAND,
				VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
@@ -3437,48 +3696,6 @@ static int hv_send_resources_released(struct hv_device *hdev)
	return 0;
 }
 
-#define HVPCI_DOM_MAP_SIZE	(64 * 1024)
-static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
-
-/*
- * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
- * as invalid for passthrough PCI devices of this driver.
- */
-#define HVPCI_DOM_INVALID 0
-
-/**
- * hv_get_dom_num() - Get a valid PCI domain number
- * Check if the PCI domain number is in use, and return another number if
- * it is in use.
- *
- * @dom: Requested domain number
- *
- * return: domain number on success, HVPCI_DOM_INVALID on failure
- */
-static u16 hv_get_dom_num(u16 dom)
-{
-	unsigned int i;
-
-	if (test_and_set_bit(dom, hvpci_dom_map) == 0)
-		return dom;
-
-	for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
-		if (test_and_set_bit(i, hvpci_dom_map) == 0)
-			return i;
-	}
-
-	return HVPCI_DOM_INVALID;
-}
-
-/**
- * hv_put_dom_num() - Mark the PCI domain number as free
- * @dom: Domain number to be freed
- */
-static void hv_put_dom_num(u16 dom)
-{
-	clear_bit(dom, hvpci_dom_map);
-}
-
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:	VMBus's tracking struct for this root PCI bus
@@ -3491,44 +3708,20 @@ static int hv_pci_probe(struct hv_device *hdev,
 {
	struct pci_host_bridge *bridge;
	struct hv_pcibus_device *hbus;
-	u16 dom_req, dom;
+	int ret, dom;
+	u16 dom_req;
	char *name;
-	bool enter_d0_retry = true;
-	int ret;
-
-	/*
-	 * hv_pcibus_device contains the hypercall arguments for retargeting in
-	 * hv_irq_unmask(). Those must not cross a page boundary.
-	 */
-	BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE);
 
	bridge = devm_pci_alloc_host_bridge(&hdev->device, 0);
	if (!bridge)
		return -ENOMEM;
 
-	/*
-	 * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural
-	 * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate
-	 * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and
-	 * alignment of hbus is important because hbus's field
-	 * retarget_msi_interrupt_params must not cross a 4KB page boundary.
-	 *
-	 * Here we prefer kzalloc to get_zeroed_page(), because a buffer
-	 * allocated by the latter is not tracked and scanned by kmemleak, and
-	 * hence kmemleak reports the pointer contained in the hbus buffer
-	 * (i.e. the hpdev struct, which is created in new_pcichild_device() and
-	 * is tracked by hbus->children) as memory leak (false positive).
-	 *
-	 * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be
-	 * used to allocate the hbus buffer and we can avoid the kmemleak false
-	 * positive by using kmemleak_alloc() and kmemleak_free() to ask
-	 * kmemleak to track and scan the hbus buffer.
-	 */
-	hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+	hbus = kzalloc(sizeof(*hbus), GFP_KERNEL);
	if (!hbus)
		return -ENOMEM;
 
	hbus->bridge = bridge;
+	mutex_init(&hbus->state_lock);
	hbus->state = hv_pcibus_init;
	hbus->wslot_res_allocated = -1;
 
@@ -3544,11 +3737,14 @@ static int hv_pci_probe(struct hv_device *hdev,
	 * PCI bus (which is actually emulated by the hypervisor) is domain 0.
	 * (2) There will be no overlap between domains (after fixing possible
	 * collisions) in the same VM.
+	 *
+	 * Because Gen1 VMs use domain 0, don't allow picking domain 0 here,
+	 * even if bytes 4 and 5 of the instance GUID are both zero. For wider
+	 * userspace compatibility, limit the domain ID to a 16-bit value.
	 */
	dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
-	dom = hv_get_dom_num(dom_req);
-
-	if (dom == HVPCI_DOM_INVALID) {
+	dom = pci_bus_find_emul_domain_nr(dom_req, 1, U16_MAX);
+	if (dom < 0) {
		dev_err(&hdev->device,
			"Unable to use dom# 0x%x or other numbers", dom_req);
		ret = -EINVAL;
@@ -3563,6 +3759,7 @@ static int hv_pci_probe(struct hv_device *hdev,
	hbus->bridge->domain_nr = dom;
 #ifdef CONFIG_X86
	hbus->sysdata.domain = dom;
+	hbus->use_calls = !!(ms_hyperv.hints & HV_X64_USE_MMIO_HYPERCALLS);
 #elif defined(CONFIG_ARM64)
	/*
	 * Set the PCI bus parent to be the corresponding VMbus
@@ -3572,6 +3769,7 @@ static int hv_pci_probe(struct hv_device *hdev,
	 * information to devices created on the bus.
	 */
	hbus->sysdata.parent = hdev->device.parent;
+	hbus->use_calls = false;
 #endif
 
	hbus->hdev = hdev;
	INIT_LIST_HEAD(&hbus->dr_list);
	spin_lock_init(&hbus->config_lock);
	spin_lock_init(&hbus->device_list_lock);
-	spin_lock_init(&hbus->retarget_msi_interrupt_lock);
	hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0,
					   hbus->bridge->domain_nr);
	if (!hbus->wq) {
@@ -3633,49 +3830,15 @@ static int hv_pci_probe(struct hv_device *hdev,
	if (ret)
		goto free_fwnode;
 
-retry:
	ret = hv_pci_query_relations(hdev);
	if (ret)
		goto free_irq_domain;
 
-	ret = hv_pci_enter_d0(hdev);
-	/*
-	 * In certain case (Kdump) the pci device of interest was
-	 * not cleanly shut down and resource is still held on host
-	 * side, the host could return invalid device status.
-	 * We need to explicitly request host to release the resource
-	 * and try to enter D0 again.
-	 * Since the hv_pci_bus_exit() call releases structures
-	 * of all its child devices, we need to start the retry from
-	 * hv_pci_query_relations() call, requesting host to send
-	 * the synchronous child device relations message before this
-	 * information is needed in hv_send_resources_allocated()
-	 * call later.
-	 */
-	if (ret == -EPROTO && enter_d0_retry) {
-		enter_d0_retry = false;
+	mutex_lock(&hbus->state_lock);
 
-		dev_err(&hdev->device, "Retrying D0 Entry\n");
-
-		/*
-		 * Hv_pci_bus_exit() calls hv_send_resources_released()
-		 * to free up resources of its child devices.
-		 * In the kdump kernel we need to set the
-		 * wslot_res_allocated to 255 so it scans all child
-		 * devices to release resources allocated in the
-		 * normal kernel before panic happened.
- */ - hbus->wslot_res_allocated = 255; - ret = hv_pci_bus_exit(hdev, true); - - if (ret == 0) - goto retry; - - dev_err(&hdev->device, - "Retrying D0 failed with ret %d\n", ret); - } + ret = hv_pci_enter_d0(hdev); if (ret) - goto free_irq_domain; + goto release_state_lock; ret = hv_pci_allocate_bridge_windows(hbus); if (ret) @@ -3693,12 +3856,15 @@ retry: if (ret) goto free_windows; + mutex_unlock(&hbus->state_lock); return 0; free_windows: hv_pci_free_bridge_windows(hbus); exit_d0: (void) hv_pci_bus_exit(hdev, true); +release_state_lock: + mutex_unlock(&hbus->state_lock); free_irq_domain: irq_domain_remove(hbus->irq_domain); free_fwnode: @@ -3712,7 +3878,7 @@ close: destroy_wq: destroy_workqueue(hbus->wq); free_dom: - hv_put_dom_num(hbus->bridge->domain_nr); + pci_bus_release_emul_domain_nr(hbus->bridge->domain_nr); free_bus: kfree(hbus); return ret; @@ -3726,6 +3892,7 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) struct pci_packet teardown_packet; u8 buffer[sizeof(struct pci_message)]; } pkt; + struct pci_message *msg; struct hv_pci_compl comp_pkt; struct hv_pci_dev *hpdev, *tmp; unsigned long flags; @@ -3771,10 +3938,10 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) init_completion(&comp_pkt.host_event); pkt.teardown_packet.completion_func = hv_pci_generic_compl; pkt.teardown_packet.compl_ctxt = &comp_pkt; - pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT; + msg = (struct pci_message *)pkt.buffer; + msg->type = PCI_BUS_D0EXIT; - ret = vmbus_sendpacket_getid(chan, &pkt.teardown_packet.message, - sizeof(struct pci_message), + ret = vmbus_sendpacket_getid(chan, msg, sizeof(*msg), (unsigned long)&pkt.teardown_packet, &trans_id, VM_PKT_DATA_INBAND, VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); @@ -3800,13 +3967,10 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) /** * hv_pci_remove() - Remove routine for this VMBus channel * @hdev: VMBus's tracking struct for this root PCI bus - * - * Return: 0 on success, -errno on failure */ -static int hv_pci_remove(struct hv_device *hdev) +static void hv_pci_remove(struct hv_device *hdev) { struct hv_pcibus_device *hbus; - int ret; hbus = hv_get_drvdata(hdev); if (hbus->state == hv_pcibus_installed) { @@ -3829,7 +3993,7 @@ static int hv_pci_remove(struct hv_device *hdev) pci_unlock_rescan_remove(); } - ret = hv_pci_bus_exit(hdev, false); + hv_pci_bus_exit(hdev, false); vmbus_close(hdev->channel); @@ -3839,10 +4003,7 @@ static int hv_pci_remove(struct hv_device *hdev) irq_domain_remove(hbus->irq_domain); irq_domain_free_fwnode(hbus->fwnode); - hv_put_dom_num(hbus->bridge->domain_nr); - kfree(hbus); - return ret; } static int hv_pci_suspend(struct hv_device *hdev) @@ -3896,21 +4057,18 @@ static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg) { struct irq_data *irq_data; struct msi_desc *entry; - int ret = 0; - msi_lock_descs(&pdev->dev); + if (!pdev->msi_enabled && !pdev->msix_enabled) + return 0; + + guard(msi_descs_lock)(&pdev->dev); msi_for_each_desc(entry, &pdev->dev, MSI_DESC_ASSOCIATED) { irq_data = irq_get_irq_data(entry->irq); - if (WARN_ON_ONCE(!irq_data)) { - ret = -EINVAL; - break; - } - + if (WARN_ON_ONCE(!irq_data)) + return -EINVAL; hv_compose_msi_msg(irq_data, &entry->msg); } - msi_unlock_descs(&pdev->dev); - - return ret; + return 0; } /* @@ -3952,20 +4110,26 @@ static int hv_pci_resume(struct hv_device *hdev) if (ret) goto out; + mutex_lock(&hbus->state_lock); + ret = hv_pci_enter_d0(hdev); if (ret) - goto out; + goto release_state_lock; ret = 
	ret = hv_send_resources_allocated(hdev);
	if (ret)
-		goto out;
+		goto release_state_lock;
 
	prepopulate_bars(hbus);
 
	hv_pci_restore_msi_state(hbus);
 
	hbus->state = hv_pcibus_installed;
+	mutex_unlock(&hbus->state_lock);
	return 0;
+
+release_state_lock:
+	mutex_unlock(&hbus->state_lock);
 out:
	vmbus_close(hdev->channel);
	return ret;
@@ -4005,13 +4169,13 @@ static int __init init_hv_pci_drv(void)
	if (!hv_is_hyperv_initialized())
		return -ENODEV;
 
+	if (hv_root_partition() && !hv_nested)
+		return -ENODEV;
+
	ret = hv_pci_irqchip_init();
	if (ret)
		return ret;
 
-	/* Set the invalid domain number's bit, so it will not be used */
-	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
-
	/* Initialize PCI block r/w interface */
	hvpci_block_ops.read_block = hv_read_config_block;
	hvpci_block_ops.write_block = hv_write_config_block;
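
A recurring low-level pattern in this diff is the switch to the pre-allocated per-CPU hypercall input page: hv_irq_retarget_interrupt(), hv_pci_read_mmio() and hv_pci_write_mmio() all replace the old per-bus buffer (the retarget_msi_interrupt_params field that had to sit last in struct hv_pcibus_device and never cross a page boundary) with *this_cpu_ptr(hyperv_pcpu_input_arg). A minimal sketch of the convention follows; hv_example_input is a made-up payload type used only for illustration, not part of the patch:

	/*
	 * Sketch only: the per-CPU argument page may be used only while
	 * interrupts are disabled on the current CPU, which is why the
	 * patch swaps the old spinlock for local_irq_save().
	 */
	static u64 hv_call_with_pcpu_page(u64 control, u32 payload)
	{
		struct hv_example_input *in;	/* hypothetical type */
		unsigned long flags;
		u64 status;

		local_irq_save(flags);
		in = *this_cpu_ptr(hyperv_pcpu_input_arg);
		memset(in, 0, sizeof(*in));
		in->data = payload;
		status = hv_do_hypercall(control, in, NULL);
		local_irq_restore(flags);	/* page may be reused after this */

		return status;
	}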
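The config-space accessors keep the same two-step protocol on both paths: first select the function by writing its wslot to the start of the window (via writel() on the MMIO path, or via HVCALL_MMIO_WRITE to mem_config->start when hbus->use_calls is set), then access the data through the second page at CFG_PAGE_OFFSET. A sketch of the MMIO path's ordering, assuming the caller holds the bus's config_lock as the driver does:

	/* Sketch only: index/data-style access to the paravirtual window. */
	static u32 cfg_window_read(void __iomem *cfg_addr, u32 wslot, int where)
	{
		u32 val;

		writel(wslot, cfg_addr);	/* choose the function */
		mb();		/* select must complete before the data access */
		val = readl(cfg_addr + CFG_PAGE_OFFSET + where);
		mb();		/* finish before the lock is dropped */

		return val;
	}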
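On the irqdomain side, the diff moves from pci_msi_create_irq_domain() with a per-bus msi_domain_info to the MSI parent-domain model: a plain irq_domain_info plus a const msi_parent_ops, as in hv_pcie_init_irq_domain(). The shape of the registration, with my_domain_ops and my_parent_ops standing in for hv_pcie_domain_ops and hv_pcie_msi_parent_ops:

	/* Sketch only: new-style MSI parent-domain registration. */
	static struct irq_domain *create_msi_parent(struct fwnode_handle *fwnode,
						    struct irq_domain *parent,
						    void *host_data)
	{
		struct irq_domain_info info = {
			.fwnode		= fwnode,
			.ops		= &my_domain_ops,
			.host_data	= host_data,
			.parent		= parent,
		};

		return msi_create_parent_irq_domain(&info, &my_parent_ops);
	}

One consequence visible in the diff: hv_msi_free() loses its msi_domain_info parameter and recovers the bus through domain->host_data instead, and the per-arch FLOW_HANDLER/FLOW_NAME defines are replaced by per-arch chip flags (MSI_CHIP_FLAG_SET_ACK on x86, MSI_CHIP_FLAG_SET_EOI on arm64).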
