summaryrefslogtreecommitdiff
path: root/drivers/iommu/intel/irq_remapping.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/iommu/intel/irq_remapping.c')
-rw-r--r--drivers/iommu/intel/irq_remapping.c272
1 files changed, 179 insertions, 93 deletions
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index f58f5f57af78..4f9b01dc91e8 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -10,6 +10,7 @@
#include <linux/hpet.h>
#include <linux/pci.h>
#include <linux/irq.h>
+#include <linux/irqchip/irq-msi-lib.h>
#include <linux/acpi.h>
#include <linux/irqdomain.h>
#include <linux/crash_dump.h>
@@ -19,15 +20,11 @@
#include <asm/cpu.h>
#include <asm/irq_remapping.h>
#include <asm/pci-direct.h>
+#include <asm/posted_intr.h>
#include "iommu.h"
#include "../irq_remapping.h"
-#include "cap_audit.h"
-
-enum irq_mode {
- IRQ_REMAPPING,
- IRQ_POSTING,
-};
+#include "../iommu-pages.h"
struct ioapic_scope {
struct intel_iommu *iommu;
@@ -48,7 +45,8 @@ struct irq_2_iommu {
u16 irte_index;
u16 sub_handle;
u8 irte_mask;
- enum irq_mode mode;
+ bool posted_msi;
+ bool posted_vcpu;
};
struct intel_ir_data {
@@ -82,7 +80,7 @@ static const struct irq_domain_ops intel_ir_domain_ops;
static void iommu_disable_irq_remapping(struct intel_iommu *iommu);
static int __init parse_ioapics_under_ir(void);
-static const struct msi_parent_ops dmar_msi_parent_ops, virt_dmar_msi_parent_ops;
+static const struct msi_parent_ops dmar_msi_parent_ops;
static bool ir_pre_enabled(struct intel_iommu *iommu)
{
@@ -136,7 +134,6 @@ static int alloc_irte(struct intel_iommu *iommu,
irq_iommu->irte_index = index;
irq_iommu->sub_handle = 0;
irq_iommu->irte_mask = mask;
- irq_iommu->mode = IRQ_REMAPPING;
}
raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
@@ -175,18 +172,14 @@ static int modify_irte(struct irq_2_iommu *irq_iommu,
irte = &iommu->ir_table->base[index];
if ((irte->pst == 1) || (irte_modified->pst == 1)) {
- bool ret;
-
- ret = cmpxchg_double(&irte->low, &irte->high,
- irte->low, irte->high,
- irte_modified->low, irte_modified->high);
/*
* We use cmpxchg16 to atomically update the 128-bit IRTE,
* and it cannot be updated by the hardware or other processors
* behind us, so the return value of cmpxchg16 should be the
* same as the old value.
*/
- WARN_ON(!ret);
+ u128 old = irte->irte;
+ WARN_ON(!try_cmpxchg128(&irte->irte, &old, irte_modified->irte));
} else {
WRITE_ONCE(irte->low, irte_modified->low);
WRITE_ONCE(irte->high, irte_modified->high);
@@ -195,8 +188,6 @@ static int modify_irte(struct irq_2_iommu *irq_iommu,
rc = qi_flush_iec(iommu, index, 0);
- /* Update iommu mode according to the IRTE mode */
- irq_iommu->mode = irte->pst ? IRQ_POSTING : IRQ_REMAPPING;
raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return rc;
@@ -311,14 +302,12 @@ static int set_ioapic_sid(struct irte *irte, int apic)
if (!irte)
return -1;
- down_read(&dmar_global_lock);
for (i = 0; i < MAX_IO_APICS; i++) {
if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) {
- sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
+ sid = PCI_DEVID(ir_ioapic[i].bus, ir_ioapic[i].devfn);
break;
}
}
- up_read(&dmar_global_lock);
if (sid == 0) {
pr_warn("Failed to set source-id of IOAPIC (%d)\n", apic);
@@ -338,14 +327,12 @@ static int set_hpet_sid(struct irte *irte, u8 id)
if (!irte)
return -1;
- down_read(&dmar_global_lock);
for (i = 0; i < MAX_HPET_TBS; i++) {
if (ir_hpet[i].iommu && ir_hpet[i].id == id) {
- sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn;
+ sid = PCI_DEVID(ir_hpet[i].bus, ir_hpet[i].devfn);
break;
}
}
- up_read(&dmar_global_lock);
if (sid == 0) {
pr_warn("Failed to set source-id of HPET block (%d)\n", id);
@@ -532,10 +519,16 @@ static void iommu_enable_irq_remapping(struct intel_iommu *iommu)
static int intel_setup_irq_remapping(struct intel_iommu *iommu)
{
+ struct irq_domain_info info = {
+ .ops = &intel_ir_domain_ops,
+ .parent = arch_get_ir_parent_domain(),
+ .domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI,
+ .size = INTR_REMAP_TABLE_ENTRIES,
+ .host_data = iommu,
+ };
struct ir_table *ir_table;
- struct fwnode_handle *fn;
unsigned long *bitmap;
- struct page *pages;
+ void *ir_table_base;
if (iommu->ir_table)
return 0;
@@ -544,43 +537,31 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
if (!ir_table)
return -ENOMEM;
- pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO,
- INTR_REMAP_PAGE_ORDER);
- if (!pages) {
- pr_err("IR%d: failed to allocate pages of order %d\n",
- iommu->seq_id, INTR_REMAP_PAGE_ORDER);
+ /* 1MB - maximum possible interrupt remapping table size */
+ ir_table_base =
+ iommu_alloc_pages_node_sz(iommu->node, GFP_KERNEL, SZ_1M);
+ if (!ir_table_base) {
+ pr_err("IR%d: failed to allocate 1M of pages\n", iommu->seq_id);
goto out_free_table;
}
- bitmap = bitmap_zalloc(INTR_REMAP_TABLE_ENTRIES, GFP_ATOMIC);
+ bitmap = bitmap_zalloc(INTR_REMAP_TABLE_ENTRIES, GFP_KERNEL);
if (bitmap == NULL) {
pr_err("IR%d: failed to allocate bitmap\n", iommu->seq_id);
goto out_free_pages;
}
- fn = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id);
- if (!fn)
+ info.fwnode = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id);
+ if (!info.fwnode)
goto out_free_bitmap;
- iommu->ir_domain =
- irq_domain_create_hierarchy(arch_get_ir_parent_domain(),
- 0, INTR_REMAP_TABLE_ENTRIES,
- fn, &intel_ir_domain_ops,
- iommu);
+ iommu->ir_domain = msi_create_parent_irq_domain(&info, &dmar_msi_parent_ops);
if (!iommu->ir_domain) {
pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id);
goto out_free_fwnode;
}
- irq_domain_update_bus_token(iommu->ir_domain, DOMAIN_BUS_DMAR);
- iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
-
- if (cap_caching_mode(iommu->cap))
- iommu->ir_domain->msi_parent_ops = &virt_dmar_msi_parent_ops;
- else
- iommu->ir_domain->msi_parent_ops = &dmar_msi_parent_ops;
-
- ir_table->base = page_address(pages);
+ ir_table->base = ir_table_base;
ir_table->bitmap = bitmap;
iommu->ir_table = ir_table;
@@ -605,8 +586,8 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
if (ir_pre_enabled(iommu)) {
if (!is_kdump_kernel()) {
- pr_warn("IRQ remapping was enabled on %s but we are not in kdump mode\n",
- iommu->name);
+ pr_info_once("IRQ remapping was enabled on %s but we are not in kdump mode\n",
+ iommu->name);
clear_ir_pre_enabled(iommu);
iommu_disable_irq_remapping(iommu);
} else if (iommu_load_old_irte(iommu))
@@ -625,11 +606,11 @@ out_free_ir_domain:
irq_domain_remove(iommu->ir_domain);
iommu->ir_domain = NULL;
out_free_fwnode:
- irq_domain_free_fwnode(fn);
+ irq_domain_free_fwnode(info.fwnode);
out_free_bitmap:
bitmap_free(bitmap);
out_free_pages:
- __free_pages(pages, INTR_REMAP_PAGE_ORDER);
+ iommu_free_pages(ir_table_base);
out_free_table:
kfree(ir_table);
@@ -650,8 +631,7 @@ static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
irq_domain_free_fwnode(fn);
iommu->ir_domain = NULL;
}
- free_pages((unsigned long)iommu->ir_table->base,
- INTR_REMAP_PAGE_ORDER);
+ iommu_free_pages(iommu->ir_table->base);
bitmap_free(iommu->ir_table->bitmap);
kfree(iommu->ir_table);
iommu->ir_table = NULL;
@@ -736,9 +716,6 @@ static int __init intel_prepare_irq_remapping(void)
if (dmar_table_init() < 0)
return -ENODEV;
- if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL))
- return -ENODEV;
-
if (!dmar_ir_support())
return -ENODEV;
@@ -1119,12 +1096,20 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
* irq migration in the presence of interrupt-remapping.
*/
irte->trigger_mode = 0;
- irte->dlvry_mode = apic->delivery_mode;
+ irte->dlvry_mode = APIC_DELIVERY_MODE_FIXED;
irte->vector = vector;
irte->dest_id = IRTE_DEST(dest);
irte->redir_hint = 1;
}
+static void prepare_irte_posted(struct irte *irte)
+{
+ memset(irte, 0, sizeof(*irte));
+
+ irte->present = 1;
+ irte->p_pst = 1;
+}
+
struct irq_remap_ops intel_irq_remap_ops = {
.prepare = intel_prepare_irq_remapping,
.enable = intel_enable_irq_remapping,
@@ -1133,7 +1118,67 @@ struct irq_remap_ops intel_irq_remap_ops = {
.enable_faulting = enable_drhd_fault_handling,
};
-static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
+#ifdef CONFIG_X86_POSTED_MSI
+
+static phys_addr_t get_pi_desc_addr(struct irq_data *irqd)
+{
+ int cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+
+ if (WARN_ON(cpu >= nr_cpu_ids))
+ return 0;
+
+ return __pa(per_cpu_ptr(&posted_msi_pi_desc, cpu));
+}
+
+static void intel_ir_reconfigure_irte_posted(struct irq_data *irqd)
+{
+ struct intel_ir_data *ir_data = irqd->chip_data;
+ struct irte *irte = &ir_data->irte_entry;
+ struct irte irte_pi;
+ u64 pid_addr;
+
+ pid_addr = get_pi_desc_addr(irqd);
+
+ if (!pid_addr) {
+ pr_warn("Failed to setup IRQ %d for posted mode", irqd->irq);
+ return;
+ }
+
+ memset(&irte_pi, 0, sizeof(irte_pi));
+
+ /* The shared IRTE already be set up as posted during alloc_irte */
+ dmar_copy_shared_irte(&irte_pi, irte);
+
+ irte_pi.pda_l = (pid_addr >> (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
+ irte_pi.pda_h = (pid_addr >> 32) & ~(-1UL << PDA_HIGH_BIT);
+
+ modify_irte(&ir_data->irq_2_iommu, &irte_pi);
+}
+
+#else
+static inline void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) {}
+#endif
+
+static void __intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host)
+{
+ struct intel_ir_data *ir_data = irqd->chip_data;
+
+ /*
+ * Don't modify IRTEs for IRQs that are being posted to vCPUs if the
+ * host CPU affinity changes.
+ */
+ if (ir_data->irq_2_iommu.posted_vcpu && !force_host)
+ return;
+
+ ir_data->irq_2_iommu.posted_vcpu = false;
+
+ if (ir_data->irq_2_iommu.posted_msi)
+ intel_ir_reconfigure_irte_posted(irqd);
+ else
+ modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
+}
+
+static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host)
{
struct intel_ir_data *ir_data = irqd->chip_data;
struct irte *irte = &ir_data->irte_entry;
@@ -1146,9 +1191,7 @@ static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
irte->vector = cfg->vector;
irte->dest_id = IRTE_DEST(cfg->dest_apicid);
- /* Update the hardware only if the interrupt is in remapped mode. */
- if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
- modify_irte(&ir_data->irq_2_iommu, irte);
+ __intel_ir_reconfigure_irte(irqd, force_host);
}
/*
@@ -1183,7 +1226,7 @@ intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask,
* at the new destination. So, time to cleanup the previous
* vector allocation.
*/
- send_cleanup_vector(cfg);
+ vector_schedule_cleanup(cfg);
return IRQ_SET_MASK_OK_DONE;
}
@@ -1199,11 +1242,11 @@ static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
{
struct intel_ir_data *ir_data = data->chip_data;
- struct vcpu_data *vcpu_pi_info = info;
+ struct intel_iommu_pi_data *pi_data = info;
- /* stop posting interrupts, back to remapping mode */
- if (!vcpu_pi_info) {
- modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
+ /* stop posting interrupts, back to the default mode */
+ if (!pi_data) {
+ __intel_ir_reconfigure_irte(data, true);
} else {
struct irte irte_pi;
@@ -1220,12 +1263,13 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
/* Update the posted mode fields */
irte_pi.p_pst = 1;
irte_pi.p_urgent = 0;
- irte_pi.p_vector = vcpu_pi_info->vector;
- irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >>
+ irte_pi.p_vector = pi_data->vector;
+ irte_pi.pda_l = (pi_data->pi_desc_addr >>
(32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
- irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) &
+ irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) &
~(-1UL << PDA_HIGH_BIT);
+ ir_data->irq_2_iommu.posted_vcpu = true;
modify_irte(&ir_data->irq_2_iommu, &irte_pi);
}
@@ -1240,6 +1284,50 @@ static struct irq_chip intel_ir_chip = {
.irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
};
+/*
+ * With posted MSIs, the MSI vectors are multiplexed into a single notification
+ * vector, and only the notification vector is sent to the APIC IRR. Device
+ * MSIs are then dispatched in a demux loop that harvests the MSIs from the
+ * CPU's Posted Interrupt Request bitmap. I.e. Posted MSIs never get sent to
+ * the APIC IRR, and thus do not need an EOI. The notification handler instead
+ * performs a single EOI after processing the PIR.
+ *
+ * Note! Pending SMP/CPU affinity changes, which are per MSI, must still be
+ * honored, only the APIC EOI is omitted.
+ *
+ * For the example below, 3 MSIs are coalesced into one CPU notification. Only
+ * one apic_eoi() is needed, but each MSI needs to process pending changes to
+ * its CPU affinity.
+ *
+ * __sysvec_posted_msi_notification()
+ * irq_enter();
+ * handle_edge_irq()
+ * irq_chip_ack_parent()
+ * irq_move_irq(); // No EOI
+ * handle_irq_event()
+ * driver_handler()
+ * handle_edge_irq()
+ * irq_chip_ack_parent()
+ * irq_move_irq(); // No EOI
+ * handle_irq_event()
+ * driver_handler()
+ * handle_edge_irq()
+ * irq_chip_ack_parent()
+ * irq_move_irq(); // No EOI
+ * handle_irq_event()
+ * driver_handler()
+ * apic_eoi()
+ * irq_exit()
+ *
+ */
+static struct irq_chip intel_ir_chip_post_msi = {
+ .name = "INTEL-IR-POST",
+ .irq_ack = irq_move_irq,
+ .irq_set_affinity = intel_ir_set_affinity,
+ .irq_compose_msi_msg = intel_ir_compose_msi_msg,
+ .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
+};
+
static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle)
{
memset(msg, 0, sizeof(*msg));
@@ -1268,12 +1356,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
case X86_IRQ_ALLOC_TYPE_IOAPIC:
/* Set source-id of interrupt request */
set_ioapic_sid(irte, info->devid);
- apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n",
- info->devid, irte->present, irte->fpd,
- irte->dst_mode, irte->redir_hint,
- irte->trigger_mode, irte->dlvry_mode,
- irte->avail, irte->vector, irte->dest_id,
- irte->sid, irte->sq, irte->svt);
+ apic_pr_verbose("IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n",
+ info->devid, irte->present, irte->fpd, irte->dst_mode,
+ irte->redir_hint, irte->trigger_mode, irte->dlvry_mode,
+ irte->avail, irte->vector, irte->dest_id, irte->sid,
+ irte->sq, irte->svt);
sub_handle = info->ioapic.pin;
break;
case X86_IRQ_ALLOC_TYPE_HPET:
@@ -1281,6 +1368,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
break;
case X86_IRQ_ALLOC_TYPE_PCI_MSI:
case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
+ if (posted_msi_supported()) {
+ prepare_irte_posted(irte);
+ data->irq_2_iommu.posted_msi = 1;
+ }
+
set_msi_sid(irte,
pci_real_dma_dev(msi_desc_to_pci_dev(info->desc)));
break;
@@ -1338,9 +1430,7 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain,
if (!data)
goto out_free_parent;
- down_read(&dmar_global_lock);
index = alloc_irte(iommu, &data->irq_2_iommu, nr_irqs);
- up_read(&dmar_global_lock);
if (index < 0) {
pr_warn("Failed to allocate IRTE\n");
kfree(data);
@@ -1370,9 +1460,13 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain,
irq_data->hwirq = (index << 16) + i;
irq_data->chip_data = ird;
- irq_data->chip = &intel_ir_chip;
+ if (posted_msi_supported() &&
+ ((info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) ||
+ (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX)))
+ irq_data->chip = &intel_ir_chip_post_msi;
+ else
+ irq_data->chip = &intel_ir_chip;
intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i);
- irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
}
return 0;
@@ -1403,6 +1497,9 @@ static void intel_irq_remapping_deactivate(struct irq_domain *domain,
struct intel_ir_data *data = irq_data->chip_data;
struct irte entry;
+ WARN_ON_ONCE(data->irq_2_iommu.posted_vcpu);
+ data->irq_2_iommu.posted_vcpu = false;
+
memset(&entry, 0, sizeof(entry));
modify_irte(&data->irq_2_iommu, &entry);
}
@@ -1430,20 +1527,13 @@ static const struct irq_domain_ops intel_ir_domain_ops = {
};
static const struct msi_parent_ops dmar_msi_parent_ops = {
- .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED |
- MSI_FLAG_MULTI_PCI_MSI |
- MSI_FLAG_PCI_IMS,
+ .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI,
+ .bus_select_token = DOMAIN_BUS_DMAR,
+ .bus_select_mask = MATCH_PCI_MSI,
.prefix = "IR-",
.init_dev_msi_info = msi_parent_init_dev_msi_info,
};
-static const struct msi_parent_ops virt_dmar_msi_parent_ops = {
- .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED |
- MSI_FLAG_MULTI_PCI_MSI,
- .prefix = "vIR-",
- .init_dev_msi_info = msi_parent_init_dev_msi_info,
-};
-
/*
* Support of Interrupt Remapping Unit Hotplug
*/
@@ -1452,10 +1542,6 @@ static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu)
int ret;
int eim = x2apic_enabled();
- ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_IRQR, iommu);
- if (ret)
- return ret;
-
if (eim && !ecap_eim_support(iommu->ecap)) {
pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n",
iommu->reg_phys, iommu->ecap);