diff options
Diffstat (limited to 'drivers/iommu/intel/irq_remapping.c')
| -rw-r--r-- | drivers/iommu/intel/irq_remapping.c | 272 |
1 files changed, 179 insertions, 93 deletions
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index f58f5f57af78..4f9b01dc91e8 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -10,6 +10,7 @@ #include <linux/hpet.h> #include <linux/pci.h> #include <linux/irq.h> +#include <linux/irqchip/irq-msi-lib.h> #include <linux/acpi.h> #include <linux/irqdomain.h> #include <linux/crash_dump.h> @@ -19,15 +20,11 @@ #include <asm/cpu.h> #include <asm/irq_remapping.h> #include <asm/pci-direct.h> +#include <asm/posted_intr.h> #include "iommu.h" #include "../irq_remapping.h" -#include "cap_audit.h" - -enum irq_mode { - IRQ_REMAPPING, - IRQ_POSTING, -}; +#include "../iommu-pages.h" struct ioapic_scope { struct intel_iommu *iommu; @@ -48,7 +45,8 @@ struct irq_2_iommu { u16 irte_index; u16 sub_handle; u8 irte_mask; - enum irq_mode mode; + bool posted_msi; + bool posted_vcpu; }; struct intel_ir_data { @@ -82,7 +80,7 @@ static const struct irq_domain_ops intel_ir_domain_ops; static void iommu_disable_irq_remapping(struct intel_iommu *iommu); static int __init parse_ioapics_under_ir(void); -static const struct msi_parent_ops dmar_msi_parent_ops, virt_dmar_msi_parent_ops; +static const struct msi_parent_ops dmar_msi_parent_ops; static bool ir_pre_enabled(struct intel_iommu *iommu) { @@ -136,7 +134,6 @@ static int alloc_irte(struct intel_iommu *iommu, irq_iommu->irte_index = index; irq_iommu->sub_handle = 0; irq_iommu->irte_mask = mask; - irq_iommu->mode = IRQ_REMAPPING; } raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); @@ -175,18 +172,14 @@ static int modify_irte(struct irq_2_iommu *irq_iommu, irte = &iommu->ir_table->base[index]; if ((irte->pst == 1) || (irte_modified->pst == 1)) { - bool ret; - - ret = cmpxchg_double(&irte->low, &irte->high, - irte->low, irte->high, - irte_modified->low, irte_modified->high); /* * We use cmpxchg16 to atomically update the 128-bit IRTE, * and it cannot be updated by the hardware or other processors * behind us, so the return value of cmpxchg16 should be the * same as the old value. */ - WARN_ON(!ret); + u128 old = irte->irte; + WARN_ON(!try_cmpxchg128(&irte->irte, &old, irte_modified->irte)); } else { WRITE_ONCE(irte->low, irte_modified->low); WRITE_ONCE(irte->high, irte_modified->high); @@ -195,8 +188,6 @@ static int modify_irte(struct irq_2_iommu *irq_iommu, rc = qi_flush_iec(iommu, index, 0); - /* Update iommu mode according to the IRTE mode */ - irq_iommu->mode = irte->pst ? IRQ_POSTING : IRQ_REMAPPING; raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); return rc; @@ -311,14 +302,12 @@ static int set_ioapic_sid(struct irte *irte, int apic) if (!irte) return -1; - down_read(&dmar_global_lock); for (i = 0; i < MAX_IO_APICS; i++) { if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) { - sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; + sid = PCI_DEVID(ir_ioapic[i].bus, ir_ioapic[i].devfn); break; } } - up_read(&dmar_global_lock); if (sid == 0) { pr_warn("Failed to set source-id of IOAPIC (%d)\n", apic); @@ -338,14 +327,12 @@ static int set_hpet_sid(struct irte *irte, u8 id) if (!irte) return -1; - down_read(&dmar_global_lock); for (i = 0; i < MAX_HPET_TBS; i++) { if (ir_hpet[i].iommu && ir_hpet[i].id == id) { - sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn; + sid = PCI_DEVID(ir_hpet[i].bus, ir_hpet[i].devfn); break; } } - up_read(&dmar_global_lock); if (sid == 0) { pr_warn("Failed to set source-id of HPET block (%d)\n", id); @@ -532,10 +519,16 @@ static void iommu_enable_irq_remapping(struct intel_iommu *iommu) static int intel_setup_irq_remapping(struct intel_iommu *iommu) { + struct irq_domain_info info = { + .ops = &intel_ir_domain_ops, + .parent = arch_get_ir_parent_domain(), + .domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI, + .size = INTR_REMAP_TABLE_ENTRIES, + .host_data = iommu, + }; struct ir_table *ir_table; - struct fwnode_handle *fn; unsigned long *bitmap; - struct page *pages; + void *ir_table_base; if (iommu->ir_table) return 0; @@ -544,43 +537,31 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) if (!ir_table) return -ENOMEM; - pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO, - INTR_REMAP_PAGE_ORDER); - if (!pages) { - pr_err("IR%d: failed to allocate pages of order %d\n", - iommu->seq_id, INTR_REMAP_PAGE_ORDER); + /* 1MB - maximum possible interrupt remapping table size */ + ir_table_base = + iommu_alloc_pages_node_sz(iommu->node, GFP_KERNEL, SZ_1M); + if (!ir_table_base) { + pr_err("IR%d: failed to allocate 1M of pages\n", iommu->seq_id); goto out_free_table; } - bitmap = bitmap_zalloc(INTR_REMAP_TABLE_ENTRIES, GFP_ATOMIC); + bitmap = bitmap_zalloc(INTR_REMAP_TABLE_ENTRIES, GFP_KERNEL); if (bitmap == NULL) { pr_err("IR%d: failed to allocate bitmap\n", iommu->seq_id); goto out_free_pages; } - fn = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id); - if (!fn) + info.fwnode = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id); + if (!info.fwnode) goto out_free_bitmap; - iommu->ir_domain = - irq_domain_create_hierarchy(arch_get_ir_parent_domain(), - 0, INTR_REMAP_TABLE_ENTRIES, - fn, &intel_ir_domain_ops, - iommu); + iommu->ir_domain = msi_create_parent_irq_domain(&info, &dmar_msi_parent_ops); if (!iommu->ir_domain) { pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id); goto out_free_fwnode; } - irq_domain_update_bus_token(iommu->ir_domain, DOMAIN_BUS_DMAR); - iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; - - if (cap_caching_mode(iommu->cap)) - iommu->ir_domain->msi_parent_ops = &virt_dmar_msi_parent_ops; - else - iommu->ir_domain->msi_parent_ops = &dmar_msi_parent_ops; - - ir_table->base = page_address(pages); + ir_table->base = ir_table_base; ir_table->bitmap = bitmap; iommu->ir_table = ir_table; @@ -605,8 +586,8 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) if (ir_pre_enabled(iommu)) { if (!is_kdump_kernel()) { - pr_warn("IRQ remapping was enabled on %s but we are not in kdump mode\n", - iommu->name); + pr_info_once("IRQ remapping was enabled on %s but we are not in kdump mode\n", + iommu->name); clear_ir_pre_enabled(iommu); iommu_disable_irq_remapping(iommu); } else if (iommu_load_old_irte(iommu)) @@ -625,11 +606,11 @@ out_free_ir_domain: irq_domain_remove(iommu->ir_domain); iommu->ir_domain = NULL; out_free_fwnode: - irq_domain_free_fwnode(fn); + irq_domain_free_fwnode(info.fwnode); out_free_bitmap: bitmap_free(bitmap); out_free_pages: - __free_pages(pages, INTR_REMAP_PAGE_ORDER); + iommu_free_pages(ir_table_base); out_free_table: kfree(ir_table); @@ -650,8 +631,7 @@ static void intel_teardown_irq_remapping(struct intel_iommu *iommu) irq_domain_free_fwnode(fn); iommu->ir_domain = NULL; } - free_pages((unsigned long)iommu->ir_table->base, - INTR_REMAP_PAGE_ORDER); + iommu_free_pages(iommu->ir_table->base); bitmap_free(iommu->ir_table->bitmap); kfree(iommu->ir_table); iommu->ir_table = NULL; @@ -736,9 +716,6 @@ static int __init intel_prepare_irq_remapping(void) if (dmar_table_init() < 0) return -ENODEV; - if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL)) - return -ENODEV; - if (!dmar_ir_support()) return -ENODEV; @@ -1119,12 +1096,20 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest) * irq migration in the presence of interrupt-remapping. */ irte->trigger_mode = 0; - irte->dlvry_mode = apic->delivery_mode; + irte->dlvry_mode = APIC_DELIVERY_MODE_FIXED; irte->vector = vector; irte->dest_id = IRTE_DEST(dest); irte->redir_hint = 1; } +static void prepare_irte_posted(struct irte *irte) +{ + memset(irte, 0, sizeof(*irte)); + + irte->present = 1; + irte->p_pst = 1; +} + struct irq_remap_ops intel_irq_remap_ops = { .prepare = intel_prepare_irq_remapping, .enable = intel_enable_irq_remapping, @@ -1133,7 +1118,67 @@ struct irq_remap_ops intel_irq_remap_ops = { .enable_faulting = enable_drhd_fault_handling, }; -static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) +#ifdef CONFIG_X86_POSTED_MSI + +static phys_addr_t get_pi_desc_addr(struct irq_data *irqd) +{ + int cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd)); + + if (WARN_ON(cpu >= nr_cpu_ids)) + return 0; + + return __pa(per_cpu_ptr(&posted_msi_pi_desc, cpu)); +} + +static void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) +{ + struct intel_ir_data *ir_data = irqd->chip_data; + struct irte *irte = &ir_data->irte_entry; + struct irte irte_pi; + u64 pid_addr; + + pid_addr = get_pi_desc_addr(irqd); + + if (!pid_addr) { + pr_warn("Failed to setup IRQ %d for posted mode", irqd->irq); + return; + } + + memset(&irte_pi, 0, sizeof(irte_pi)); + + /* The shared IRTE already be set up as posted during alloc_irte */ + dmar_copy_shared_irte(&irte_pi, irte); + + irte_pi.pda_l = (pid_addr >> (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT); + irte_pi.pda_h = (pid_addr >> 32) & ~(-1UL << PDA_HIGH_BIT); + + modify_irte(&ir_data->irq_2_iommu, &irte_pi); +} + +#else +static inline void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) {} +#endif + +static void __intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host) +{ + struct intel_ir_data *ir_data = irqd->chip_data; + + /* + * Don't modify IRTEs for IRQs that are being posted to vCPUs if the + * host CPU affinity changes. + */ + if (ir_data->irq_2_iommu.posted_vcpu && !force_host) + return; + + ir_data->irq_2_iommu.posted_vcpu = false; + + if (ir_data->irq_2_iommu.posted_msi) + intel_ir_reconfigure_irte_posted(irqd); + else + modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); +} + +static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host) { struct intel_ir_data *ir_data = irqd->chip_data; struct irte *irte = &ir_data->irte_entry; @@ -1146,9 +1191,7 @@ static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) irte->vector = cfg->vector; irte->dest_id = IRTE_DEST(cfg->dest_apicid); - /* Update the hardware only if the interrupt is in remapped mode. */ - if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING) - modify_irte(&ir_data->irq_2_iommu, irte); + __intel_ir_reconfigure_irte(irqd, force_host); } /* @@ -1183,7 +1226,7 @@ intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask, * at the new destination. So, time to cleanup the previous * vector allocation. */ - send_cleanup_vector(cfg); + vector_schedule_cleanup(cfg); return IRQ_SET_MASK_OK_DONE; } @@ -1199,11 +1242,11 @@ static void intel_ir_compose_msi_msg(struct irq_data *irq_data, static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) { struct intel_ir_data *ir_data = data->chip_data; - struct vcpu_data *vcpu_pi_info = info; + struct intel_iommu_pi_data *pi_data = info; - /* stop posting interrupts, back to remapping mode */ - if (!vcpu_pi_info) { - modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); + /* stop posting interrupts, back to the default mode */ + if (!pi_data) { + __intel_ir_reconfigure_irte(data, true); } else { struct irte irte_pi; @@ -1220,12 +1263,13 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) /* Update the posted mode fields */ irte_pi.p_pst = 1; irte_pi.p_urgent = 0; - irte_pi.p_vector = vcpu_pi_info->vector; - irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >> + irte_pi.p_vector = pi_data->vector; + irte_pi.pda_l = (pi_data->pi_desc_addr >> (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT); - irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) & + irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) & ~(-1UL << PDA_HIGH_BIT); + ir_data->irq_2_iommu.posted_vcpu = true; modify_irte(&ir_data->irq_2_iommu, &irte_pi); } @@ -1240,6 +1284,50 @@ static struct irq_chip intel_ir_chip = { .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, }; +/* + * With posted MSIs, the MSI vectors are multiplexed into a single notification + * vector, and only the notification vector is sent to the APIC IRR. Device + * MSIs are then dispatched in a demux loop that harvests the MSIs from the + * CPU's Posted Interrupt Request bitmap. I.e. Posted MSIs never get sent to + * the APIC IRR, and thus do not need an EOI. The notification handler instead + * performs a single EOI after processing the PIR. + * + * Note! Pending SMP/CPU affinity changes, which are per MSI, must still be + * honored, only the APIC EOI is omitted. + * + * For the example below, 3 MSIs are coalesced into one CPU notification. Only + * one apic_eoi() is needed, but each MSI needs to process pending changes to + * its CPU affinity. + * + * __sysvec_posted_msi_notification() + * irq_enter(); + * handle_edge_irq() + * irq_chip_ack_parent() + * irq_move_irq(); // No EOI + * handle_irq_event() + * driver_handler() + * handle_edge_irq() + * irq_chip_ack_parent() + * irq_move_irq(); // No EOI + * handle_irq_event() + * driver_handler() + * handle_edge_irq() + * irq_chip_ack_parent() + * irq_move_irq(); // No EOI + * handle_irq_event() + * driver_handler() + * apic_eoi() + * irq_exit() + * + */ +static struct irq_chip intel_ir_chip_post_msi = { + .name = "INTEL-IR-POST", + .irq_ack = irq_move_irq, + .irq_set_affinity = intel_ir_set_affinity, + .irq_compose_msi_msg = intel_ir_compose_msi_msg, + .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, +}; + static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle) { memset(msg, 0, sizeof(*msg)); @@ -1268,12 +1356,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, case X86_IRQ_ALLOC_TYPE_IOAPIC: /* Set source-id of interrupt request */ set_ioapic_sid(irte, info->devid); - apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", - info->devid, irte->present, irte->fpd, - irte->dst_mode, irte->redir_hint, - irte->trigger_mode, irte->dlvry_mode, - irte->avail, irte->vector, irte->dest_id, - irte->sid, irte->sq, irte->svt); + apic_pr_verbose("IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", + info->devid, irte->present, irte->fpd, irte->dst_mode, + irte->redir_hint, irte->trigger_mode, irte->dlvry_mode, + irte->avail, irte->vector, irte->dest_id, irte->sid, + irte->sq, irte->svt); sub_handle = info->ioapic.pin; break; case X86_IRQ_ALLOC_TYPE_HPET: @@ -1281,6 +1368,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, break; case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: + if (posted_msi_supported()) { + prepare_irte_posted(irte); + data->irq_2_iommu.posted_msi = 1; + } + set_msi_sid(irte, pci_real_dma_dev(msi_desc_to_pci_dev(info->desc))); break; @@ -1338,9 +1430,7 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain, if (!data) goto out_free_parent; - down_read(&dmar_global_lock); index = alloc_irte(iommu, &data->irq_2_iommu, nr_irqs); - up_read(&dmar_global_lock); if (index < 0) { pr_warn("Failed to allocate IRTE\n"); kfree(data); @@ -1370,9 +1460,13 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain, irq_data->hwirq = (index << 16) + i; irq_data->chip_data = ird; - irq_data->chip = &intel_ir_chip; + if (posted_msi_supported() && + ((info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) || + (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX))) + irq_data->chip = &intel_ir_chip_post_msi; + else + irq_data->chip = &intel_ir_chip; intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i); - irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT); } return 0; @@ -1403,6 +1497,9 @@ static void intel_irq_remapping_deactivate(struct irq_domain *domain, struct intel_ir_data *data = irq_data->chip_data; struct irte entry; + WARN_ON_ONCE(data->irq_2_iommu.posted_vcpu); + data->irq_2_iommu.posted_vcpu = false; + memset(&entry, 0, sizeof(entry)); modify_irte(&data->irq_2_iommu, &entry); } @@ -1430,20 +1527,13 @@ static const struct irq_domain_ops intel_ir_domain_ops = { }; static const struct msi_parent_ops dmar_msi_parent_ops = { - .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | - MSI_FLAG_MULTI_PCI_MSI | - MSI_FLAG_PCI_IMS, + .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI, + .bus_select_token = DOMAIN_BUS_DMAR, + .bus_select_mask = MATCH_PCI_MSI, .prefix = "IR-", .init_dev_msi_info = msi_parent_init_dev_msi_info, }; -static const struct msi_parent_ops virt_dmar_msi_parent_ops = { - .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | - MSI_FLAG_MULTI_PCI_MSI, - .prefix = "vIR-", - .init_dev_msi_info = msi_parent_init_dev_msi_info, -}; - /* * Support of Interrupt Remapping Unit Hotplug */ @@ -1452,10 +1542,6 @@ static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu) int ret; int eim = x2apic_enabled(); - ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_IRQR, iommu); - if (ret) - return ret; - if (eim && !ecap_eim_support(iommu->ecap)) { pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n", iommu->reg_phys, iommu->ecap); |
