diff options
Diffstat (limited to 'drivers/iommu/intel/iommu.c')
| -rw-r--r-- | drivers/iommu/intel/iommu.c | 4222 |
1 files changed, 4222 insertions, 0 deletions
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c new file mode 100644 index 000000000000..134302fbcd92 --- /dev/null +++ b/drivers/iommu/intel/iommu.c @@ -0,0 +1,4222 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright © 2006-2014 Intel Corporation. + * + * Authors: David Woodhouse <dwmw2@infradead.org>, + * Ashok Raj <ashok.raj@intel.com>, + * Shaohua Li <shaohua.li@intel.com>, + * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + * Joerg Roedel <jroedel@suse.de> + */ + +#define pr_fmt(fmt) "DMAR: " fmt +#define dev_fmt(fmt) pr_fmt(fmt) + +#include <linux/crash_dump.h> +#include <linux/dma-direct.h> +#include <linux/dmi.h> +#include <linux/memory.h> +#include <linux/pci.h> +#include <linux/pci-ats.h> +#include <linux/spinlock.h> +#include <linux/syscore_ops.h> +#include <linux/tboot.h> +#include <uapi/linux/iommufd.h> + +#include "iommu.h" +#include "../dma-iommu.h" +#include "../irq_remapping.h" +#include "../iommu-pages.h" +#include "pasid.h" +#include "perfmon.h" + +#define ROOT_SIZE VTD_PAGE_SIZE +#define CONTEXT_SIZE VTD_PAGE_SIZE + +#define IS_GFX_DEVICE(pdev) pci_is_display(pdev) +#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) +#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) +#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) + +#define IOAPIC_RANGE_START (0xfee00000) +#define IOAPIC_RANGE_END (0xfeefffff) +#define IOVA_START_ADDR (0x1000) + +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 + +static void __init check_tylersburg_isoch(void); +static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable); +static int rwbf_quirk; + +#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap)) + +/* + * set to 1 to panic kernel if can't successfully enable VT-d + * (used when kernel is launched w/ TXT) + */ +static int force_on = 0; +static int intel_iommu_tboot_noforce; +static int no_platform_optin; + +#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) + +/* + * Take a root_entry and return the Lower Context Table Pointer (LCTP) + * if marked present. + */ +static phys_addr_t root_entry_lctp(struct root_entry *re) +{ + if (!(re->lo & 1)) + return 0; + + return re->lo & VTD_PAGE_MASK; +} + +/* + * Take a root_entry and return the Upper Context Table Pointer (UCTP) + * if marked present. + */ +static phys_addr_t root_entry_uctp(struct root_entry *re) +{ + if (!(re->hi & 1)) + return 0; + + return re->hi & VTD_PAGE_MASK; +} + +static int device_rid_cmp_key(const void *key, const struct rb_node *node) +{ + struct device_domain_info *info = + rb_entry(node, struct device_domain_info, node); + const u16 *rid_lhs = key; + + if (*rid_lhs < PCI_DEVID(info->bus, info->devfn)) + return -1; + + if (*rid_lhs > PCI_DEVID(info->bus, info->devfn)) + return 1; + + return 0; +} + +static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs) +{ + struct device_domain_info *info = + rb_entry(lhs, struct device_domain_info, node); + u16 key = PCI_DEVID(info->bus, info->devfn); + + return device_rid_cmp_key(&key, rhs); +} + +/* + * Looks up an IOMMU-probed device using its source ID. + * + * Returns the pointer to the device if there is a match. Otherwise, + * returns NULL. + * + * Note that this helper doesn't guarantee that the device won't be + * released by the iommu subsystem after being returned. The caller + * should use its own synchronization mechanism to avoid the device + * being released during its use if its possibly the case. + */ +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid) +{ + struct device_domain_info *info = NULL; + struct rb_node *node; + unsigned long flags; + + spin_lock_irqsave(&iommu->device_rbtree_lock, flags); + node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key); + if (node) + info = rb_entry(node, struct device_domain_info, node); + spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); + + return info ? info->dev : NULL; +} + +static int device_rbtree_insert(struct intel_iommu *iommu, + struct device_domain_info *info) +{ + struct rb_node *curr; + unsigned long flags; + + spin_lock_irqsave(&iommu->device_rbtree_lock, flags); + curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp); + spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); + if (WARN_ON(curr)) + return -EEXIST; + + return 0; +} + +static void device_rbtree_remove(struct device_domain_info *info) +{ + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + + spin_lock_irqsave(&iommu->device_rbtree_lock, flags); + rb_erase(&info->node, &iommu->device_rbtree); + spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); +} + +struct dmar_rmrr_unit { + struct list_head list; /* list of rmrr units */ + struct acpi_dmar_header *hdr; /* ACPI header */ + u64 base_address; /* reserved base address*/ + u64 end_address; /* reserved end address */ + struct dmar_dev_scope *devices; /* target devices */ + int devices_cnt; /* target device count */ +}; + +struct dmar_atsr_unit { + struct list_head list; /* list of ATSR units */ + struct acpi_dmar_header *hdr; /* ACPI header */ + struct dmar_dev_scope *devices; /* target devices */ + int devices_cnt; /* target device count */ + u8 include_all:1; /* include all ports */ +}; + +struct dmar_satc_unit { + struct list_head list; /* list of SATC units */ + struct acpi_dmar_header *hdr; /* ACPI header */ + struct dmar_dev_scope *devices; /* target devices */ + struct intel_iommu *iommu; /* the corresponding iommu */ + int devices_cnt; /* target device count */ + u8 atc_required:1; /* ATS is required */ +}; + +static LIST_HEAD(dmar_atsr_units); +static LIST_HEAD(dmar_rmrr_units); +static LIST_HEAD(dmar_satc_units); + +#define for_each_rmrr_units(rmrr) \ + list_for_each_entry(rmrr, &dmar_rmrr_units, list) + +static void intel_iommu_domain_free(struct iommu_domain *domain); + +int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); +int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); + +int intel_iommu_enabled = 0; +EXPORT_SYMBOL_GPL(intel_iommu_enabled); + +static int intel_iommu_superpage = 1; +static int iommu_identity_mapping; +static int iommu_skip_te_disable; +static int disable_igfx_iommu; + +#define IDENTMAP_AZALIA 4 + +const struct iommu_ops intel_iommu_ops; + +static bool translation_pre_enabled(struct intel_iommu *iommu) +{ + return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); +} + +static void clear_translation_pre_enabled(struct intel_iommu *iommu) +{ + iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; +} + +static void init_translation_status(struct intel_iommu *iommu) +{ + u32 gsts; + + gsts = readl(iommu->reg + DMAR_GSTS_REG); + if (gsts & DMA_GSTS_TES) + iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; +} + +static int __init intel_iommu_setup(char *str) +{ + if (!str) + return -EINVAL; + + while (*str) { + if (!strncmp(str, "on", 2)) { + dmar_disabled = 0; + pr_info("IOMMU enabled\n"); + } else if (!strncmp(str, "off", 3)) { + dmar_disabled = 1; + no_platform_optin = 1; + pr_info("IOMMU disabled\n"); + } else if (!strncmp(str, "igfx_off", 8)) { + disable_igfx_iommu = 1; + pr_info("Disable GFX device mapping\n"); + } else if (!strncmp(str, "forcedac", 8)) { + pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); + iommu_dma_forcedac = true; + } else if (!strncmp(str, "strict", 6)) { + pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); + iommu_set_dma_strict(); + } else if (!strncmp(str, "sp_off", 6)) { + pr_info("Disable supported super page\n"); + intel_iommu_superpage = 0; + } else if (!strncmp(str, "sm_on", 5)) { + pr_info("Enable scalable mode if hardware supports\n"); + intel_iommu_sm = 1; + } else if (!strncmp(str, "sm_off", 6)) { + pr_info("Scalable mode is disallowed\n"); + intel_iommu_sm = 0; + } else if (!strncmp(str, "tboot_noforce", 13)) { + pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); + intel_iommu_tboot_noforce = 1; + } else { + pr_notice("Unknown option - '%s'\n", str); + } + + str += strcspn(str, ","); + while (*str == ',') + str++; + } + + return 1; +} +__setup("intel_iommu=", intel_iommu_setup); + +/* + * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. + * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of + * the returned SAGAW. + */ +static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) +{ + unsigned long fl_sagaw, sl_sagaw; + + fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); + sl_sagaw = cap_sagaw(iommu->cap); + + /* Second level only. */ + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) + return sl_sagaw; + + /* First level only. */ + if (!ecap_slts(iommu->ecap)) + return fl_sagaw; + + return fl_sagaw & sl_sagaw; +} + +static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) +{ + unsigned long sagaw; + int agaw; + + sagaw = __iommu_calculate_sagaw(iommu); + for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { + if (test_bit(agaw, &sagaw)) + break; + } + + return agaw; +} + +/* + * Calculate max SAGAW for each iommu. + */ +int iommu_calculate_max_sagaw(struct intel_iommu *iommu) +{ + return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); +} + +/* + * calculate agaw for each iommu. + * "SAGAW" may be different across iommus, use a default agaw, and + * get a supported less agaw for iommus that don't support the default agaw. + */ +int iommu_calculate_agaw(struct intel_iommu *iommu) +{ + return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); +} + +static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) +{ + return sm_supported(iommu) ? + ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); +} + +struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, + u8 devfn, int alloc) +{ + struct root_entry *root = &iommu->root_entry[bus]; + struct context_entry *context; + u64 *entry; + + /* + * Except that the caller requested to allocate a new entry, + * returning a copied context entry makes no sense. + */ + if (!alloc && context_copied(iommu, bus, devfn)) + return NULL; + + entry = &root->lo; + if (sm_supported(iommu)) { + if (devfn >= 0x80) { + devfn -= 0x80; + entry = &root->hi; + } + devfn *= 2; + } + if (*entry & 1) + context = phys_to_virt(*entry & VTD_PAGE_MASK); + else { + unsigned long phy_addr; + if (!alloc) + return NULL; + + context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, + SZ_4K); + if (!context) + return NULL; + + __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); + phy_addr = virt_to_phys((void *)context); + *entry = phy_addr | 1; + __iommu_flush_cache(iommu, entry, sizeof(*entry)); + } + return &context[devfn]; +} + +/** + * is_downstream_to_pci_bridge - test if a device belongs to the PCI + * sub-hierarchy of a candidate PCI-PCI bridge + * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy + * @bridge: the candidate PCI-PCI bridge + * + * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. + */ +static bool +is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) +{ + struct pci_dev *pdev, *pbridge; + + if (!dev_is_pci(dev) || !dev_is_pci(bridge)) + return false; + + pdev = to_pci_dev(dev); + pbridge = to_pci_dev(bridge); + + if (pbridge->subordinate && + pbridge->subordinate->number <= pdev->bus->number && + pbridge->subordinate->busn_res.end >= pdev->bus->number) + return true; + + return false; +} + +static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) +{ + struct dmar_drhd_unit *drhd; + u32 vtbar; + int rc; + + /* We know that this device on this chipset has its own IOMMU. + * If we find it under a different IOMMU, then the BIOS is lying + * to us. Hope that the IOMMU for this device is actually + * disabled, and it needs no translation... + */ + rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); + if (rc) { + /* "can't" happen */ + dev_info(&pdev->dev, "failed to run vt-d quirk\n"); + return false; + } + vtbar &= 0xffff0000; + + /* we know that the this iommu should be at offset 0xa000 from vtbar */ + drhd = dmar_find_matched_drhd_unit(pdev); + if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { + pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + return true; + } + + return false; +} + +static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) +{ + if (!iommu || iommu->drhd->ignored) + return true; + + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + + if (pdev->vendor == PCI_VENDOR_ID_INTEL && + pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && + quirk_ioat_snb_local_iommu(pdev)) + return true; + } + + return false; +} + +static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn) +{ + struct dmar_drhd_unit *drhd = NULL; + struct pci_dev *pdev = NULL; + struct intel_iommu *iommu; + struct device *tmp; + u16 segment = 0; + int i; + + if (!dev) + return NULL; + + if (dev_is_pci(dev)) { + struct pci_dev *pf_pdev; + + pdev = pci_real_dma_dev(to_pci_dev(dev)); + + /* VFs aren't listed in scope tables; we need to look up + * the PF instead to find the IOMMU. */ + pf_pdev = pci_physfn(pdev); + dev = &pf_pdev->dev; + segment = pci_domain_nr(pdev->bus); + } else if (has_acpi_companion(dev)) + dev = &ACPI_COMPANION(dev)->dev; + + rcu_read_lock(); + for_each_iommu(iommu, drhd) { + if (pdev && segment != drhd->segment) + continue; + + for_each_active_dev_scope(drhd->devices, + drhd->devices_cnt, i, tmp) { + if (tmp == dev) { + /* For a VF use its original BDF# not that of the PF + * which we used for the IOMMU lookup. Strictly speaking + * we could do this for all PCI devices; we only need to + * get the BDF# from the scope table for ACPI matches. */ + if (pdev && pdev->is_virtfn) + goto got_pdev; + + if (bus && devfn) { + *bus = drhd->devices[i].bus; + *devfn = drhd->devices[i].devfn; + } + goto out; + } + + if (is_downstream_to_pci_bridge(dev, tmp)) + goto got_pdev; + } + + if (pdev && drhd->include_all) { +got_pdev: + if (bus && devfn) { + *bus = pdev->bus->number; + *devfn = pdev->devfn; + } + goto out; + } + } + iommu = NULL; +out: + if (iommu_is_dummy(iommu, dev)) + iommu = NULL; + + rcu_read_unlock(); + + return iommu; +} + +static void free_context_table(struct intel_iommu *iommu) +{ + struct context_entry *context; + int i; + + if (!iommu->root_entry) + return; + + for (i = 0; i < ROOT_ENTRY_NR; i++) { + context = iommu_context_addr(iommu, i, 0, 0); + if (context) + iommu_free_pages(context); + + if (!sm_supported(iommu)) + continue; + + context = iommu_context_addr(iommu, i, 0x80, 0); + if (context) + iommu_free_pages(context); + } + + iommu_free_pages(iommu->root_entry); + iommu->root_entry = NULL; +} + +#ifdef CONFIG_DMAR_DEBUG +static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, + u8 bus, u8 devfn, struct dma_pte *parent, int level) +{ + struct dma_pte *pte; + int offset; + + while (1) { + offset = pfn_level_offset(pfn, level); + pte = &parent[offset]; + + pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); + + if (!dma_pte_present(pte)) { + pr_info("page table not present at level %d\n", level - 1); + break; + } + + if (level == 1 || dma_pte_superpage(pte)) + break; + + parent = phys_to_virt(dma_pte_addr(pte)); + level--; + } +} + +void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, + unsigned long long addr, u32 pasid) +{ + struct pasid_dir_entry *dir, *pde; + struct pasid_entry *entries, *pte; + struct context_entry *ctx_entry; + struct root_entry *rt_entry; + int i, dir_index, index, level; + u8 devfn = source_id & 0xff; + u8 bus = source_id >> 8; + struct dma_pte *pgtable; + + pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); + + /* root entry dump */ + if (!iommu->root_entry) { + pr_info("root table is not present\n"); + return; + } + rt_entry = &iommu->root_entry[bus]; + + if (sm_supported(iommu)) + pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", + rt_entry->hi, rt_entry->lo); + else + pr_info("root entry: 0x%016llx", rt_entry->lo); + + /* context entry dump */ + ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); + if (!ctx_entry) { + pr_info("context table is not present\n"); + return; + } + + pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", + ctx_entry->hi, ctx_entry->lo); + + /* legacy mode does not require PASID entries */ + if (!sm_supported(iommu)) { + if (!context_present(ctx_entry)) { + pr_info("legacy mode page table is not present\n"); + return; + } + level = agaw_to_level(ctx_entry->hi & 7); + pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); + goto pgtable_walk; + } + + if (!context_present(ctx_entry)) { + pr_info("pasid directory table is not present\n"); + return; + } + + /* get the pointer to pasid directory entry */ + dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); + + /* For request-without-pasid, get the pasid from context entry */ + if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) + pasid = IOMMU_NO_PASID; + + dir_index = pasid >> PASID_PDE_SHIFT; + pde = &dir[dir_index]; + pr_info("pasid dir entry: 0x%016llx\n", pde->val); + + /* get the pointer to the pasid table entry */ + entries = get_pasid_table_from_pde(pde); + if (!entries) { + pr_info("pasid table is not present\n"); + return; + } + index = pasid & PASID_PTE_MASK; + pte = &entries[index]; + for (i = 0; i < ARRAY_SIZE(pte->val); i++) + pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); + + if (!pasid_pte_is_present(pte)) { + pr_info("scalable mode page table is not present\n"); + return; + } + + if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { + level = pte->val[2] & BIT_ULL(2) ? 5 : 4; + pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); + } else { + level = agaw_to_level((pte->val[0] >> 2) & 0x7); + pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); + } + +pgtable_walk: + pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); +} +#endif + +/* iommu handling */ +static int iommu_alloc_root_entry(struct intel_iommu *iommu) +{ + struct root_entry *root; + + root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K); + if (!root) { + pr_err("Allocating root entry for %s failed\n", + iommu->name); + return -ENOMEM; + } + + __iommu_flush_cache(iommu, root, ROOT_SIZE); + iommu->root_entry = root; + + return 0; +} + +static void iommu_set_root_entry(struct intel_iommu *iommu) +{ + u64 addr; + u32 sts; + unsigned long flag; + + addr = virt_to_phys(iommu->root_entry); + if (sm_supported(iommu)) + addr |= DMA_RTADDR_SMT; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); + + writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_RTPS), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + + /* + * Hardware invalidates all DMA remapping hardware translation + * caches as part of SRTP flow. + */ + if (cap_esrtps(iommu->cap)) + return; + + iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); + if (sm_supported(iommu)) + qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); +} + +void iommu_flush_write_buffer(struct intel_iommu *iommu) +{ + u32 val; + unsigned long flag; + + if (!rwbf_quirk && !cap_rwbf(iommu->cap)) + return; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (!(val & DMA_GSTS_WBFS)), val); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +/* return value determine if we need a write buffer flush */ +static void __iommu_flush_context(struct intel_iommu *iommu, + u16 did, u16 source_id, u8 function_mask, + u64 type) +{ + u64 val = 0; + unsigned long flag; + + switch (type) { + case DMA_CCMD_GLOBAL_INVL: + val = DMA_CCMD_GLOBAL_INVL; + break; + case DMA_CCMD_DOMAIN_INVL: + val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); + break; + case DMA_CCMD_DEVICE_INVL: + val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) + | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); + break; + default: + pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", + iommu->name, type); + return; + } + val |= DMA_CCMD_ICC; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, + dmar_readq, (!(val & DMA_CCMD_ICC)), val); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type) +{ + int tlb_offset = ecap_iotlb_offset(iommu->ecap); + u64 val = 0, val_iva = 0; + unsigned long flag; + + switch (type) { + case DMA_TLB_GLOBAL_FLUSH: + /* global flush doesn't need set IVA_REG */ + val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; + break; + case DMA_TLB_DSI_FLUSH: + val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); + break; + case DMA_TLB_PSI_FLUSH: + val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); + /* IH bit is passed in as part of address */ + val_iva = size_order | addr; + break; + default: + pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", + iommu->name, type); + return; + } + + if (cap_write_drain(iommu->cap)) + val |= DMA_TLB_WRITE_DRAIN; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + /* Note: Only uses first TLB reg currently */ + if (val_iva) + dmar_writeq(iommu->reg + tlb_offset, val_iva); + dmar_writeq(iommu->reg + tlb_offset + 8, val); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, tlb_offset + 8, + dmar_readq, (!(val & DMA_TLB_IVT)), val); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + + /* check IOTLB invalidation granularity */ + if (DMA_TLB_IAIG(val) == 0) + pr_err("Flush IOTLB failed\n"); + if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) + pr_debug("TLB flush request %Lx, actual %Lx\n", + (unsigned long long)DMA_TLB_IIRG(type), + (unsigned long long)DMA_TLB_IAIG(val)); +} + +static struct device_domain_info * +domain_lookup_dev_info(struct dmar_domain *domain, + struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + struct device_domain_info *info; + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); + list_for_each_entry(info, &domain->devices, link) { + if (info->iommu == iommu && info->bus == bus && + info->devfn == devfn) { + spin_unlock_irqrestore(&domain->lock, flags); + return info; + } + } + spin_unlock_irqrestore(&domain->lock, flags); + + return NULL; +} + +/* + * The extra devTLB flush quirk impacts those QAT devices with PCI device + * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() + * check because it applies only to the built-in QAT devices and it doesn't + * grant additional privileges. + */ +#define BUGGY_QAT_DEVID_MASK 0x4940 +static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) +{ + if (pdev->vendor != PCI_VENDOR_ID_INTEL) + return false; + + if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) + return false; + + return true; +} + +static void iommu_enable_pci_ats(struct device_domain_info *info) +{ + struct pci_dev *pdev; + + if (!info->ats_supported) + return; + + pdev = to_pci_dev(info->dev); + if (!pci_ats_page_aligned(pdev)) + return; + + if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT)) + info->ats_enabled = 1; +} + +static void iommu_disable_pci_ats(struct device_domain_info *info) +{ + if (!info->ats_enabled) + return; + + pci_disable_ats(to_pci_dev(info->dev)); + info->ats_enabled = 0; +} + +static void iommu_enable_pci_pri(struct device_domain_info *info) +{ + struct pci_dev *pdev; + + if (!info->ats_enabled || !info->pri_supported) + return; + + pdev = to_pci_dev(info->dev); + /* PASID is required in PRG Response Message. */ + if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) + return; + + if (pci_reset_pri(pdev)) + return; + + if (!pci_enable_pri(pdev, PRQ_DEPTH)) + info->pri_enabled = 1; +} + +static void iommu_disable_pci_pri(struct device_domain_info *info) +{ + if (!info->pri_enabled) + return; + + if (WARN_ON(info->iopf_refcount)) + iopf_queue_remove_device(info->iommu->iopf_queue, info->dev); + + pci_disable_pri(to_pci_dev(info->dev)); + info->pri_enabled = 0; +} + +static void intel_flush_iotlb_all(struct iommu_domain *domain) +{ + cache_tag_flush_all(to_dmar_domain(domain)); +} + +static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) +{ + u32 pmen; + unsigned long flags; + + if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) + return; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + pmen = readl(iommu->reg + DMAR_PMEN_REG); + pmen &= ~DMA_PMEN_EPM; + writel(pmen, iommu->reg + DMAR_PMEN_REG); + + /* wait for the protected region status bit to clear */ + IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, + readl, !(pmen & DMA_PMEN_PRS), pmen); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +static void iommu_enable_translation(struct intel_iommu *iommu) +{ + u32 sts; + unsigned long flags; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + iommu->gcmd |= DMA_GCMD_TE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_TES), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +static void iommu_disable_translation(struct intel_iommu *iommu) +{ + u32 sts; + unsigned long flag; + + if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && + (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) + return; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + iommu->gcmd &= ~DMA_GCMD_TE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (!(sts & DMA_GSTS_TES)), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +static void disable_dmar_iommu(struct intel_iommu *iommu) +{ + /* + * All iommu domains must have been detached from the devices, + * hence there should be no domain IDs in use. + */ + if (WARN_ON(!ida_is_empty(&iommu->domain_ida))) + return; + + if (iommu->gcmd & DMA_GCMD_TE) + iommu_disable_translation(iommu); +} + +static void free_dmar_iommu(struct intel_iommu *iommu) +{ + if (iommu->copied_tables) { + bitmap_free(iommu->copied_tables); + iommu->copied_tables = NULL; + } + + /* free context mapping */ + free_context_table(iommu); + + if (ecap_prs(iommu->ecap)) + intel_iommu_finish_prq(iommu); +} + +/* + * Check and return whether first level is used by default for + * DMA translation. + */ +static bool first_level_by_default(struct intel_iommu *iommu) +{ + /* Only SL is available in legacy mode */ + if (!sm_supported(iommu)) + return false; + + /* Only level (either FL or SL) is available, just use it */ + if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap)) + return ecap_flts(iommu->ecap); + + return true; +} + +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) +{ + struct iommu_domain_info *info, *curr; + int num, ret = -ENOSPC; + + if (domain->domain.type == IOMMU_DOMAIN_SVA) + return 0; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + guard(mutex)(&iommu->did_lock); + curr = xa_load(&domain->iommu_array, iommu->seq_id); + if (curr) { + curr->refcnt++; + kfree(info); + return 0; + } + + num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID, + cap_ndoms(iommu->cap) - 1, GFP_KERNEL); + if (num < 0) { + pr_err("%s: No free domain ids\n", iommu->name); + goto err_unlock; + } + + info->refcnt = 1; + info->did = num; + info->iommu = iommu; + curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, + NULL, info, GFP_KERNEL); + if (curr) { + ret = xa_err(curr) ? : -EBUSY; + goto err_clear; + } + + return 0; + +err_clear: + ida_free(&iommu->domain_ida, info->did); +err_unlock: + kfree(info); + return ret; +} + +void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) +{ + struct iommu_domain_info *info; + + if (domain->domain.type == IOMMU_DOMAIN_SVA) + return; + + guard(mutex)(&iommu->did_lock); + info = xa_load(&domain->iommu_array, iommu->seq_id); + if (--info->refcnt == 0) { + ida_free(&iommu->domain_ida, info->did); + xa_erase(&domain->iommu_array, iommu->seq_id); + kfree(info); + } +} + +/* + * For kdump cases, old valid entries may be cached due to the + * in-flight DMA and copied pgtable, but there is no unmapping + * behaviour for them, thus we need an explicit cache flush for + * the newly-mapped device. For kdump, at this point, the device + * is supposed to finish reset at its driver probe stage, so no + * in-flight DMA will exist, and we don't need to worry anymore + * hereafter. + */ +static void copied_context_tear_down(struct intel_iommu *iommu, + struct context_entry *context, + u8 bus, u8 devfn) +{ + u16 did_old; + + if (!context_copied(iommu, bus, devfn)) + return; + + assert_spin_locked(&iommu->lock); + + did_old = context_domain_id(context); + context_clear_entry(context); + + if (did_old < cap_ndoms(iommu->cap)) { + iommu->flush.flush_context(iommu, did_old, + PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, did_old, 0, 0, + DMA_TLB_DSI_FLUSH); + } + + clear_context_copied(iommu, bus, devfn); +} + +/* + * It's a non-present to present mapping. If hardware doesn't cache + * non-present entry we only need to flush the write-buffer. If the + * _does_ cache non-present entries, then it does so in the special + * domain #0, which we have to flush: + */ +static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, + u8 bus, u8 devfn) +{ + if (cap_caching_mode(iommu->cap)) { + iommu->flush.flush_context(iommu, 0, + PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + } else { + iommu_flush_write_buffer(iommu); + } +} + +static int domain_context_mapping_one(struct dmar_domain *domain, + struct intel_iommu *iommu, + u8 bus, u8 devfn) +{ + struct device_domain_info *info = + domain_lookup_dev_info(domain, iommu, bus, devfn); + u16 did = domain_id_iommu(domain, iommu); + int translation = CONTEXT_TT_MULTI_LEVEL; + struct pt_iommu_vtdss_hw_info pt_info; + struct context_entry *context; + int ret; + + if (WARN_ON(!intel_domain_is_ss_paging(domain))) + return -EINVAL; + + pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); + + pr_debug("Set context mapping for %02x:%02x.%d\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + + spin_lock(&iommu->lock); + ret = -ENOMEM; + context = iommu_context_addr(iommu, bus, devfn, 1); + if (!context) + goto out_unlock; + + ret = 0; + if (context_present(context) && !context_copied(iommu, bus, devfn)) + goto out_unlock; + + copied_context_tear_down(iommu, context, bus, devfn); + context_clear_entry(context); + context_set_domain_id(context, did); + + if (info && info->ats_supported) + translation = CONTEXT_TT_DEV_IOTLB; + else + translation = CONTEXT_TT_MULTI_LEVEL; + + context_set_address_root(context, pt_info.ssptptr); + context_set_address_width(context, pt_info.aw); + context_set_translation_type(context, translation); + context_set_fault_enable(context); + context_set_present(context); + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(context, sizeof(*context)); + context_present_cache_flush(iommu, did, bus, devfn); + ret = 0; + +out_unlock: + spin_unlock(&iommu->lock); + + return ret; +} + +static int domain_context_mapping_cb(struct pci_dev *pdev, + u16 alias, void *opaque) +{ + struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev); + struct intel_iommu *iommu = info->iommu; + struct dmar_domain *domain = opaque; + + return domain_context_mapping_one(domain, iommu, + PCI_BUS_NUM(alias), alias & 0xff); +} + +static int +domain_context_mapping(struct dmar_domain *domain, struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + u8 bus = info->bus, devfn = info->devfn; + int ret; + + if (!dev_is_pci(dev)) + return domain_context_mapping_one(domain, iommu, bus, devfn); + + ret = pci_for_each_dma_alias(to_pci_dev(dev), + domain_context_mapping_cb, domain); + if (ret) + return ret; + + iommu_enable_pci_ats(info); + + return 0; +} + +static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) +{ + struct intel_iommu *iommu = info->iommu; + struct context_entry *context; + u16 did; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, bus, devfn, 0); + if (!context) { + spin_unlock(&iommu->lock); + return; + } + + did = context_domain_id(context); + context_clear_entry(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); + spin_unlock(&iommu->lock); + intel_context_flush_no_pasid(info, context, did); +} + +int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, + ioasid_t pasid, u16 did, phys_addr_t fsptptr, + int flags, struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, + did, flags); + return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did, + iommu_domain_did(old, iommu), + flags); +} + +static int domain_setup_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_second_level(iommu, domain, + dev, pasid); + return intel_pasid_replace_second_level(iommu, domain, dev, + iommu_domain_did(old, iommu), + pasid); +} + +static int domain_setup_passthrough(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_pass_through(iommu, dev, pasid); + return intel_pasid_replace_pass_through(iommu, dev, + iommu_domain_did(old, iommu), + pasid); +} + +static int domain_setup_first_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, + u32 pasid, struct iommu_domain *old) +{ + struct pt_iommu_x86_64_hw_info pt_info; + unsigned int flags = 0; + + pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info); + if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5)) + return -EINVAL; + + if (pt_info.levels == 5) + flags |= PASID_FLAG_FL5LP; + + if (domain->force_snooping) + flags |= PASID_FLAG_PAGE_SNOOP; + + if (!(domain->fspt.x86_64_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + flags |= PASID_FLAG_PWSNP; + + return __domain_setup_first_level(iommu, dev, pasid, + domain_id_iommu(domain, iommu), + pt_info.gcr3_pt, flags, old); +} + +static int dmar_domain_attach_device(struct dmar_domain *domain, + struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + int ret; + + ret = domain_attach_iommu(domain, iommu); + if (ret) + return ret; + + info->domain = domain; + info->domain_attached = true; + spin_lock_irqsave(&domain->lock, flags); + list_add(&info->link, &domain->devices); + spin_unlock_irqrestore(&domain->lock, flags); + + if (dev_is_real_dma_subdevice(dev)) + return 0; + + if (!sm_supported(iommu)) + ret = domain_context_mapping(domain, dev); + else if (intel_domain_is_fs_paging(domain)) + ret = domain_setup_first_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); + else if (intel_domain_is_ss_paging(domain)) + ret = domain_setup_second_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); + else if (WARN_ON(true)) + ret = -EINVAL; + + if (ret) + goto out_block_translation; + + ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); + if (ret) + goto out_block_translation; + + return 0; + +out_block_translation: + device_block_translation(dev); + return ret; +} + +/** + * device_rmrr_is_relaxable - Test whether the RMRR of this device + * is relaxable (ie. is allowed to be not enforced under some conditions) + * @dev: device handle + * + * We assume that PCI USB devices with RMRRs have them largely + * for historical reasons and that the RMRR space is not actively used post + * boot. This exclusion may change if vendors begin to abuse it. + * + * The same exception is made for graphics devices, with the requirement that + * any use of the RMRR regions will be torn down before assigning the device + * to a guest. + * + * Return: true if the RMRR is relaxable, false otherwise + */ +static bool device_rmrr_is_relaxable(struct device *dev) +{ + struct pci_dev *pdev; + + if (!dev_is_pci(dev)) + return false; + + pdev = to_pci_dev(dev); + if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) + return true; + else + return false; +} + +static int device_def_domain_type(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + /* + * Hardware does not support the passthrough translation mode. + * Always use a dynamaic mapping domain. + */ + if (!ecap_pass_through(iommu->ecap)) + return IOMMU_DOMAIN_DMA; + + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + + if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) + return IOMMU_DOMAIN_IDENTITY; + } + + return 0; +} + +static void intel_iommu_init_qi(struct intel_iommu *iommu) +{ + /* + * Start from the sane iommu hardware state. + * If the queued invalidation is already initialized by us + * (for example, while enabling interrupt-remapping) then + * we got the things already rolling from a sane state. + */ + if (!iommu->qi) { + /* + * Clear any previous faults. + */ + dmar_fault(-1, iommu); + /* + * Disable queued invalidation if supported and already enabled + * before OS handover. + */ + dmar_disable_qi(iommu); + } + + if (dmar_enable_qi(iommu)) { + /* + * Queued Invalidate not enabled, use Register Based Invalidate + */ + iommu->flush.flush_context = __iommu_flush_context; + iommu->flush.flush_iotlb = __iommu_flush_iotlb; + pr_info("%s: Using Register based invalidation\n", + iommu->name); + } else { + iommu->flush.flush_context = qi_flush_context; + iommu->flush.flush_iotlb = qi_flush_iotlb; + pr_info("%s: Using Queued invalidation\n", iommu->name); + } +} + +static int copy_context_table(struct intel_iommu *iommu, + struct root_entry *old_re, + struct context_entry **tbl, + int bus, bool ext) +{ + int tbl_idx, pos = 0, idx, devfn, ret = 0, did; + struct context_entry *new_ce = NULL, ce; + struct context_entry *old_ce = NULL; + struct root_entry re; + phys_addr_t old_ce_phys; + + tbl_idx = ext ? bus * 2 : bus; + memcpy(&re, old_re, sizeof(re)); + + for (devfn = 0; devfn < 256; devfn++) { + /* First calculate the correct index */ + idx = (ext ? devfn * 2 : devfn) % 256; + + if (idx == 0) { + /* First save what we may have and clean up */ + if (new_ce) { + tbl[tbl_idx] = new_ce; + __iommu_flush_cache(iommu, new_ce, + VTD_PAGE_SIZE); + pos = 1; + } + + if (old_ce) + memunmap(old_ce); + + ret = 0; + if (devfn < 0x80) + old_ce_phys = root_entry_lctp(&re); + else + old_ce_phys = root_entry_uctp(&re); + + if (!old_ce_phys) { + if (ext && devfn == 0) { + /* No LCTP, try UCTP */ + devfn = 0x7f; + continue; + } else { + goto out; + } + } + + ret = -ENOMEM; + old_ce = memremap(old_ce_phys, PAGE_SIZE, + MEMREMAP_WB); + if (!old_ce) + goto out; + + new_ce = iommu_alloc_pages_node_sz(iommu->node, + GFP_KERNEL, SZ_4K); + if (!new_ce) + goto out_unmap; + + ret = 0; + } + + /* Now copy the context entry */ + memcpy(&ce, old_ce + idx, sizeof(ce)); + + if (!context_present(&ce)) + continue; + + did = context_domain_id(&ce); + if (did >= 0 && did < cap_ndoms(iommu->cap)) + ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL); + + set_context_copied(iommu, bus, devfn); + new_ce[idx] = ce; + } + + tbl[tbl_idx + pos] = new_ce; + + __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); + +out_unmap: + memunmap(old_ce); + +out: + return ret; +} + +static int copy_translation_tables(struct intel_iommu *iommu) +{ + struct context_entry **ctxt_tbls; + struct root_entry *old_rt; + phys_addr_t old_rt_phys; + int ctxt_table_entries; + u64 rtaddr_reg; + int bus, ret; + bool new_ext, ext; + + rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); + ext = !!(rtaddr_reg & DMA_RTADDR_SMT); + new_ext = !!sm_supported(iommu); + + /* + * The RTT bit can only be changed when translation is disabled, + * but disabling translation means to open a window for data + * corruption. So bail out and don't copy anything if we would + * have to change the bit. + */ + if (new_ext != ext) + return -EINVAL; + + iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); + if (!iommu->copied_tables) + return -ENOMEM; + + old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; + if (!old_rt_phys) + return -EINVAL; + + old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); + if (!old_rt) + return -ENOMEM; + + /* This is too big for the stack - allocate it from slab */ + ctxt_table_entries = ext ? 512 : 256; + ret = -ENOMEM; + ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); + if (!ctxt_tbls) + goto out_unmap; + + for (bus = 0; bus < 256; bus++) { + ret = copy_context_table(iommu, &old_rt[bus], + ctxt_tbls, bus, ext); + if (ret) { + pr_err("%s: Failed to copy context table for bus %d\n", + iommu->name, bus); + continue; + } + } + + spin_lock(&iommu->lock); + + /* Context tables are copied, now write them to the root_entry table */ + for (bus = 0; bus < 256; bus++) { + int idx = ext ? bus * 2 : bus; + u64 val; + + if (ctxt_tbls[idx]) { + val = virt_to_phys(ctxt_tbls[idx]) | 1; + iommu->root_entry[bus].lo = val; + } + + if (!ext || !ctxt_tbls[idx + 1]) + continue; + + val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; + iommu->root_entry[bus].hi = val; + } + + spin_unlock(&iommu->lock); + + kfree(ctxt_tbls); + + __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); + + ret = 0; + +out_unmap: + memunmap(old_rt); + + return ret; +} + +static int __init init_dmars(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + int ret; + + for_each_iommu(iommu, drhd) { + if (drhd->ignored) { + iommu_disable_translation(iommu); + continue; + } + + /* + * Find the max pasid size of all IOMMU's in the system. + * We need to ensure the system pasid table is no bigger + * than the smallest supported. + */ + if (pasid_supported(iommu)) { + u32 temp = 2 << ecap_pss(iommu->ecap); + + intel_pasid_max_id = min_t(u32, temp, + intel_pasid_max_id); + } + + intel_iommu_init_qi(iommu); + init_translation_status(iommu); + + if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { + iommu_disable_translation(iommu); + clear_translation_pre_enabled(iommu); + pr_warn("Translation was enabled for %s but we are not in kdump mode\n", + iommu->name); + } + + /* + * TBD: + * we could share the same root & context tables + * among all IOMMU's. Need to Split it later. + */ + ret = iommu_alloc_root_entry(iommu); + if (ret) + goto free_iommu; + + if (translation_pre_enabled(iommu)) { + pr_info("Translation already enabled - trying to copy translation structures\n"); + + ret = copy_translation_tables(iommu); + if (ret) { + /* + * We found the IOMMU with translation + * enabled - but failed to copy over the + * old root-entry table. Try to proceed + * by disabling translation now and + * allocating a clean root-entry table. + * This might cause DMAR faults, but + * probably the dump will still succeed. + */ + pr_err("Failed to copy translation tables from previous kernel for %s\n", + iommu->name); + iommu_disable_translation(iommu); + clear_translation_pre_enabled(iommu); + } else { + pr_info("Copied translation tables from previous kernel for %s\n", + iommu->name); + } + } + + intel_svm_check(iommu); + } + + /* + * Now that qi is enabled on all iommus, set the root entry and flush + * caches. This is required on some Intel X58 chipsets, otherwise the + * flush_context function will loop forever and the boot hangs. + */ + for_each_active_iommu(iommu, drhd) { + iommu_flush_write_buffer(iommu); + iommu_set_root_entry(iommu); + } + + check_tylersburg_isoch(); + + /* + * for each drhd + * enable fault log + * global invalidate context cache + * global invalidate iotlb + * enable translation + */ + for_each_iommu(iommu, drhd) { + if (drhd->ignored) { + /* + * we always have to disable PMRs or DMA may fail on + * this device + */ + if (force_on) + iommu_disable_protect_mem_regions(iommu); + continue; + } + + iommu_flush_write_buffer(iommu); + + if (ecap_prs(iommu->ecap)) { + /* + * Call dmar_alloc_hwirq() with dmar_global_lock held, + * could cause possible lock race condition. + */ + up_write(&dmar_global_lock); + ret = intel_iommu_enable_prq(iommu); + down_write(&dmar_global_lock); + if (ret) + goto free_iommu; + } + + ret = dmar_set_interrupt(iommu); + if (ret) + goto free_iommu; + } + + return 0; + +free_iommu: + for_each_active_iommu(iommu, drhd) { + disable_dmar_iommu(iommu); + free_dmar_iommu(iommu); + } + + return ret; +} + +static void __init init_no_remapping_devices(void) +{ + struct dmar_drhd_unit *drhd; + struct device *dev; + int i; + + for_each_drhd_unit(drhd) { + if (!drhd->include_all) { + for_each_active_dev_scope(drhd->devices, + drhd->devices_cnt, i, dev) + break; + /* ignore DMAR unit if no devices exist */ + if (i == drhd->devices_cnt) + drhd->ignored = 1; + } + } + + for_each_active_drhd_unit(drhd) { + if (drhd->include_all) + continue; + + for_each_active_dev_scope(drhd->devices, + drhd->devices_cnt, i, dev) + if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) + break; + if (i < drhd->devices_cnt) + continue; + + /* This IOMMU has *only* gfx devices. Either bypass it or + set the gfx_mapped flag, as appropriate */ + drhd->gfx_dedicated = 1; + if (disable_igfx_iommu) + drhd->ignored = 1; + } +} + +#ifdef CONFIG_SUSPEND +static int init_iommu_hw(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + int ret; + + for_each_active_iommu(iommu, drhd) { + if (iommu->qi) { + ret = dmar_reenable_qi(iommu); + if (ret) + return ret; + } + } + + for_each_iommu(iommu, drhd) { + if (drhd->ignored) { + /* + * we always have to disable PMRs or DMA may fail on + * this device + */ + if (force_on) + iommu_disable_protect_mem_regions(iommu); + continue; + } + + iommu_flush_write_buffer(iommu); + iommu_set_root_entry(iommu); + iommu_enable_translation(iommu); + iommu_disable_protect_mem_regions(iommu); + } + + return 0; +} + +static void iommu_flush_all(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + for_each_active_iommu(iommu, drhd) { + iommu->flush.flush_context(iommu, 0, 0, 0, + DMA_CCMD_GLOBAL_INVL); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, + DMA_TLB_GLOBAL_FLUSH); + } +} + +static int iommu_suspend(void *data) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + unsigned long flag; + + iommu_flush_all(); + + for_each_active_iommu(iommu, drhd) { + iommu_disable_translation(iommu); + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + + iommu->iommu_state[SR_DMAR_FECTL_REG] = + readl(iommu->reg + DMAR_FECTL_REG); + iommu->iommu_state[SR_DMAR_FEDATA_REG] = + readl(iommu->reg + DMAR_FEDATA_REG); + iommu->iommu_state[SR_DMAR_FEADDR_REG] = + readl(iommu->reg + DMAR_FEADDR_REG); + iommu->iommu_state[SR_DMAR_FEUADDR_REG] = + readl(iommu->reg + DMAR_FEUADDR_REG); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } + return 0; +} + +static void iommu_resume(void *data) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + unsigned long flag; + + if (init_iommu_hw()) { + if (force_on) + panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); + else + WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); + return; + } + + for_each_active_iommu(iommu, drhd) { + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + + writel(iommu->iommu_state[SR_DMAR_FECTL_REG], + iommu->reg + DMAR_FECTL_REG); + writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], + iommu->reg + DMAR_FEDATA_REG); + writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], + iommu->reg + DMAR_FEADDR_REG); + writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], + iommu->reg + DMAR_FEUADDR_REG); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } +} + +static const struct syscore_ops iommu_syscore_ops = { + .resume = iommu_resume, + .suspend = iommu_suspend, +}; + +static struct syscore iommu_syscore = { + .ops = &iommu_syscore_ops, +}; + +static void __init init_iommu_pm_ops(void) +{ + register_syscore(&iommu_syscore); +} + +#else +static inline void init_iommu_pm_ops(void) {} +#endif /* CONFIG_PM */ + +static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) +{ + if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || + !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || + rmrr->end_address <= rmrr->base_address || + arch_rmrr_sanity_check(rmrr)) + return -EINVAL; + + return 0; +} + +int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) +{ + struct acpi_dmar_reserved_memory *rmrr; + struct dmar_rmrr_unit *rmrru; + + rmrr = (struct acpi_dmar_reserved_memory *)header; + if (rmrr_sanity_check(rmrr)) { + pr_warn(FW_BUG + "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" + "BIOS vendor: %s; Ver: %s; Product Version: %s\n", + rmrr->base_address, rmrr->end_address, + dmi_get_system_info(DMI_BIOS_VENDOR), + dmi_get_system_info(DMI_BIOS_VERSION), + dmi_get_system_info(DMI_PRODUCT_VERSION)); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + } + + rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); + if (!rmrru) + goto out; + + rmrru->hdr = header; + + rmrru->base_address = rmrr->base_address; + rmrru->end_address = rmrr->end_address; + + rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), + ((void *)rmrr) + rmrr->header.length, + &rmrru->devices_cnt); + if (rmrru->devices_cnt && rmrru->devices == NULL) + goto free_rmrru; + + list_add(&rmrru->list, &dmar_rmrr_units); + + return 0; +free_rmrru: + kfree(rmrru); +out: + return -ENOMEM; +} + +static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) +{ + struct dmar_atsr_unit *atsru; + struct acpi_dmar_atsr *tmp; + + list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, + dmar_rcu_check()) { + tmp = (struct acpi_dmar_atsr *)atsru->hdr; + if (atsr->segment != tmp->segment) + continue; + if (atsr->header.length != tmp->header.length) + continue; + if (memcmp(atsr, tmp, atsr->header.length) == 0) + return atsru; + } + + return NULL; +} + +int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) +{ + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; + + if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) + return 0; + + atsr = container_of(hdr, struct acpi_dmar_atsr, header); + atsru = dmar_find_atsr(atsr); + if (atsru) + return 0; + + atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); + if (!atsru) + return -ENOMEM; + + /* + * If memory is allocated from slab by ACPI _DSM method, we need to + * copy the memory content because the memory buffer will be freed + * on return. + */ + atsru->hdr = (void *)(atsru + 1); + memcpy(atsru->hdr, hdr, hdr->length); + atsru->include_all = atsr->flags & 0x1; + if (!atsru->include_all) { + atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), + (void *)atsr + atsr->header.length, + &atsru->devices_cnt); + if (atsru->devices_cnt && atsru->devices == NULL) { + kfree(atsru); + return -ENOMEM; + } + } + + list_add_rcu(&atsru->list, &dmar_atsr_units); + + return 0; +} + +static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) +{ + dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); + kfree(atsru); +} + +int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) +{ + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; + + atsr = container_of(hdr, struct acpi_dmar_atsr, header); + atsru = dmar_find_atsr(atsr); + if (atsru) { + list_del_rcu(&atsru->list); + synchronize_rcu(); + intel_iommu_free_atsr(atsru); + } + + return 0; +} + +int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) +{ + int i; + struct device *dev; + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; + + atsr = container_of(hdr, struct acpi_dmar_atsr, header); + atsru = dmar_find_atsr(atsr); + if (!atsru) + return 0; + + if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { + for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, + i, dev) + return -EBUSY; + } + + return 0; +} + +static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) +{ + struct dmar_satc_unit *satcu; + struct acpi_dmar_satc *tmp; + + list_for_each_entry_rcu(satcu, &dmar_satc_units, list, + dmar_rcu_check()) { + tmp = (struct acpi_dmar_satc *)satcu->hdr; + if (satc->segment != tmp->segment) + continue; + if (satc->header.length != tmp->header.length) + continue; + if (memcmp(satc, tmp, satc->header.length) == 0) + return satcu; + } + + return NULL; +} + +int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) +{ + struct acpi_dmar_satc *satc; + struct dmar_satc_unit *satcu; + + if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) + return 0; + + satc = container_of(hdr, struct acpi_dmar_satc, header); + satcu = dmar_find_satc(satc); + if (satcu) + return 0; + + satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); + if (!satcu) + return -ENOMEM; + + satcu->hdr = (void *)(satcu + 1); + memcpy(satcu->hdr, hdr, hdr->length); + satcu->atc_required = satc->flags & 0x1; + satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), + (void *)satc + satc->header.length, + &satcu->devices_cnt); + if (satcu->devices_cnt && !satcu->devices) { + kfree(satcu); + return -ENOMEM; + } + list_add_rcu(&satcu->list, &dmar_satc_units); + + return 0; +} + +static int intel_iommu_add(struct dmar_drhd_unit *dmaru) +{ + struct intel_iommu *iommu = dmaru->iommu; + int ret; + + /* + * Disable translation if already enabled prior to OS handover. + */ + if (iommu->gcmd & DMA_GCMD_TE) + iommu_disable_translation(iommu); + + ret = iommu_alloc_root_entry(iommu); + if (ret) + goto out; + + intel_svm_check(iommu); + + if (dmaru->ignored) { + /* + * we always have to disable PMRs or DMA may fail on this device + */ + if (force_on) + iommu_disable_protect_mem_regions(iommu); + return 0; + } + + intel_iommu_init_qi(iommu); + iommu_flush_write_buffer(iommu); + + if (ecap_prs(iommu->ecap)) { + ret = intel_iommu_enable_prq(iommu); + if (ret) + goto disable_iommu; + } + + ret = dmar_set_interrupt(iommu); + if (ret) + goto disable_iommu; + + iommu_set_root_entry(iommu); + iommu_enable_translation(iommu); + + iommu_disable_protect_mem_regions(iommu); + return 0; + +disable_iommu: + disable_dmar_iommu(iommu); +out: + free_dmar_iommu(iommu); + return ret; +} + +int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) +{ + int ret = 0; + struct intel_iommu *iommu = dmaru->iommu; + + if (!intel_iommu_enabled) + return 0; + if (iommu == NULL) + return -EINVAL; + + if (insert) { + ret = intel_iommu_add(dmaru); + } else { + disable_dmar_iommu(iommu); + free_dmar_iommu(iommu); + } + + return ret; +} + +static void intel_iommu_free_dmars(void) +{ + struct dmar_rmrr_unit *rmrru, *rmrr_n; + struct dmar_atsr_unit *atsru, *atsr_n; + struct dmar_satc_unit *satcu, *satc_n; + + list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { + list_del(&rmrru->list); + dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); + kfree(rmrru); + } + + list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { + list_del(&atsru->list); + intel_iommu_free_atsr(atsru); + } + list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { + list_del(&satcu->list); + dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); + kfree(satcu); + } +} + +static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) +{ + struct dmar_satc_unit *satcu; + struct acpi_dmar_satc *satc; + struct device *tmp; + int i; + + rcu_read_lock(); + + list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { + satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); + if (satc->segment != pci_domain_nr(dev->bus)) + continue; + for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) + if (to_pci_dev(tmp) == dev) + goto out; + } + satcu = NULL; +out: + rcu_read_unlock(); + return satcu; +} + +static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) +{ + struct pci_dev *bridge = NULL; + struct dmar_atsr_unit *atsru; + struct dmar_satc_unit *satcu; + struct acpi_dmar_atsr *atsr; + bool supported = true; + struct pci_bus *bus; + struct device *tmp; + int i; + + dev = pci_physfn(dev); + satcu = dmar_find_matched_satc_unit(dev); + if (satcu) + /* + * This device supports ATS as it is in SATC table. + * When IOMMU is in legacy mode, enabling ATS is done + * automatically by HW for the device that requires + * ATS, hence OS should not enable this device ATS + * to avoid duplicated TLB invalidation. + */ + return !(satcu->atc_required && !sm_supported(iommu)); + + for (bus = dev->bus; bus; bus = bus->parent) { + bridge = bus->self; + /* If it's an integrated device, allow ATS */ + if (!bridge) + return true; + /* Connected via non-PCIe: no ATS */ + if (!pci_is_pcie(bridge) || + pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) + return false; + /* If we found the root port, look it up in the ATSR */ + if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) + break; + } + + rcu_read_lock(); + list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { + atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); + if (atsr->segment != pci_domain_nr(dev->bus)) + continue; + + for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) + if (tmp == &bridge->dev) + goto out; + + if (atsru->include_all) + goto out; + } + supported = false; +out: + rcu_read_unlock(); + + return supported; +} + +int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) +{ + int ret; + struct dmar_rmrr_unit *rmrru; + struct dmar_atsr_unit *atsru; + struct dmar_satc_unit *satcu; + struct acpi_dmar_atsr *atsr; + struct acpi_dmar_reserved_memory *rmrr; + struct acpi_dmar_satc *satc; + + if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) + return 0; + + list_for_each_entry(rmrru, &dmar_rmrr_units, list) { + rmrr = container_of(rmrru->hdr, + struct acpi_dmar_reserved_memory, header); + if (info->event == BUS_NOTIFY_ADD_DEVICE) { + ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), + ((void *)rmrr) + rmrr->header.length, + rmrr->segment, rmrru->devices, + rmrru->devices_cnt); + if (ret < 0) + return ret; + } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { + dmar_remove_dev_scope(info, rmrr->segment, + rmrru->devices, rmrru->devices_cnt); + } + } + + list_for_each_entry(atsru, &dmar_atsr_units, list) { + if (atsru->include_all) + continue; + + atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); + if (info->event == BUS_NOTIFY_ADD_DEVICE) { + ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), + (void *)atsr + atsr->header.length, + atsr->segment, atsru->devices, + atsru->devices_cnt); + if (ret > 0) + break; + else if (ret < 0) + return ret; + } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { + if (dmar_remove_dev_scope(info, atsr->segment, + atsru->devices, atsru->devices_cnt)) + break; + } + } + list_for_each_entry(satcu, &dmar_satc_units, list) { + satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); + if (info->event == BUS_NOTIFY_ADD_DEVICE) { + ret = dmar_insert_dev_scope(info, (void *)(satc + 1), + (void *)satc + satc->header.length, + satc->segment, satcu->devices, + satcu->devices_cnt); + if (ret > 0) + break; + else if (ret < 0) + return ret; + } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { + if (dmar_remove_dev_scope(info, satc->segment, + satcu->devices, satcu->devices_cnt)) + break; + } + } + + return 0; +} + +static void intel_disable_iommus(void) +{ + struct intel_iommu *iommu = NULL; + struct dmar_drhd_unit *drhd; + + for_each_iommu(iommu, drhd) + iommu_disable_translation(iommu); +} + +void intel_iommu_shutdown(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + + if (no_iommu || dmar_disabled) + return; + + /* + * All other CPUs were brought down, hotplug interrupts were disabled, + * no lock and RCU checking needed anymore + */ + list_for_each_entry(drhd, &dmar_drhd_units, list) { + iommu = drhd->iommu; + + /* Disable PMRs explicitly here. */ + iommu_disable_protect_mem_regions(iommu); + + /* Make sure the IOMMUs are switched off */ + iommu_disable_translation(iommu); + } +} + +static struct intel_iommu *dev_to_intel_iommu(struct device *dev) +{ + struct iommu_device *iommu_dev = dev_to_iommu_device(dev); + + return container_of(iommu_dev, struct intel_iommu, iommu); +} + +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + u32 ver = readl(iommu->reg + DMAR_VER_REG); + return sysfs_emit(buf, "%d:%d\n", + DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); +} +static DEVICE_ATTR_RO(version); + +static ssize_t address_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sysfs_emit(buf, "%llx\n", iommu->reg_phys); +} +static DEVICE_ATTR_RO(address); + +static ssize_t cap_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sysfs_emit(buf, "%llx\n", iommu->cap); +} +static DEVICE_ATTR_RO(cap); + +static ssize_t ecap_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sysfs_emit(buf, "%llx\n", iommu->ecap); +} +static DEVICE_ATTR_RO(ecap); + +static ssize_t domains_supported_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); +} +static DEVICE_ATTR_RO(domains_supported); + +static ssize_t domains_used_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + unsigned int count = 0; + int id; + + for (id = 0; id < cap_ndoms(iommu->cap); id++) + if (ida_exists(&iommu->domain_ida, id)) + count++; + + return sysfs_emit(buf, "%d\n", count); +} +static DEVICE_ATTR_RO(domains_used); + +static struct attribute *intel_iommu_attrs[] = { + &dev_attr_version.attr, + &dev_attr_address.attr, + &dev_attr_cap.attr, + &dev_attr_ecap.attr, + &dev_attr_domains_supported.attr, + &dev_attr_domains_used.attr, + NULL, +}; + +static struct attribute_group intel_iommu_group = { + .name = "intel-iommu", + .attrs = intel_iommu_attrs, +}; + +const struct attribute_group *intel_iommu_groups[] = { + &intel_iommu_group, + NULL, +}; + +static bool has_external_pci(void) +{ + struct pci_dev *pdev = NULL; + + for_each_pci_dev(pdev) + if (pdev->external_facing) { + pci_dev_put(pdev); + return true; + } + + return false; +} + +static int __init platform_optin_force_iommu(void) +{ + if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) + return 0; + + if (no_iommu || dmar_disabled) + pr_info("Intel-IOMMU force enabled due to platform opt in\n"); + + /* + * If Intel-IOMMU is disabled by default, we will apply identity + * map for all devices except those marked as being untrusted. + */ + if (dmar_disabled) + iommu_set_default_passthrough(false); + + dmar_disabled = 0; + no_iommu = 0; + + return 1; +} + +static int __init probe_acpi_namespace_devices(void) +{ + struct dmar_drhd_unit *drhd; + /* To avoid a -Wunused-but-set-variable warning. */ + struct intel_iommu *iommu __maybe_unused; + struct device *dev; + int i, ret = 0; + + for_each_active_iommu(iommu, drhd) { + for_each_active_dev_scope(drhd->devices, + drhd->devices_cnt, i, dev) { + struct acpi_device_physical_node *pn; + struct acpi_device *adev; + + if (dev->bus != &acpi_bus_type) + continue; + + up_read(&dmar_global_lock); + adev = to_acpi_device(dev); + mutex_lock(&adev->physical_node_lock); + list_for_each_entry(pn, + &adev->physical_node_list, node) { + ret = iommu_probe_device(pn->dev); + if (ret) + break; + } + mutex_unlock(&adev->physical_node_lock); + down_read(&dmar_global_lock); + + if (ret) + return ret; + } + } + + return 0; +} + +static __init int tboot_force_iommu(void) +{ + if (!tboot_enabled()) + return 0; + + if (no_iommu || dmar_disabled) + pr_warn("Forcing Intel-IOMMU to enabled\n"); + + dmar_disabled = 0; + no_iommu = 0; + + return 1; +} + +int __init intel_iommu_init(void) +{ + int ret = -ENODEV; + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + /* + * Intel IOMMU is required for a TXT/tboot launch or platform + * opt in, so enforce that. + */ + force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || + platform_optin_force_iommu(); + + down_write(&dmar_global_lock); + if (dmar_table_init()) { + if (force_on) + panic("tboot: Failed to initialize DMAR table\n"); + goto out_free_dmar; + } + + if (dmar_dev_scope_init() < 0) { + if (force_on) + panic("tboot: Failed to initialize DMAR device scope\n"); + goto out_free_dmar; + } + + up_write(&dmar_global_lock); + + /* + * The bus notifier takes the dmar_global_lock, so lockdep will + * complain later when we register it under the lock. + */ + dmar_register_bus_notifier(); + + down_write(&dmar_global_lock); + + if (!no_iommu) + intel_iommu_debugfs_init(); + + if (no_iommu || dmar_disabled) { + /* + * We exit the function here to ensure IOMMU's remapping and + * mempool aren't setup, which means that the IOMMU's PMRs + * won't be disabled via the call to init_dmars(). So disable + * it explicitly here. The PMRs were setup by tboot prior to + * calling SENTER, but the kernel is expected to reset/tear + * down the PMRs. + */ + if (intel_iommu_tboot_noforce) { + for_each_iommu(iommu, drhd) + iommu_disable_protect_mem_regions(iommu); + } + + /* + * Make sure the IOMMUs are switched off, even when we + * boot into a kexec kernel and the previous kernel left + * them enabled + */ + intel_disable_iommus(); + goto out_free_dmar; + } + + if (list_empty(&dmar_rmrr_units)) + pr_info("No RMRR found\n"); + + if (list_empty(&dmar_atsr_units)) + pr_info("No ATSR found\n"); + + if (list_empty(&dmar_satc_units)) + pr_info("No SATC found\n"); + + init_no_remapping_devices(); + + ret = init_dmars(); + if (ret) { + if (force_on) + panic("tboot: Failed to initialize DMARs\n"); + pr_err("Initialization failed\n"); + goto out_free_dmar; + } + up_write(&dmar_global_lock); + + init_iommu_pm_ops(); + + down_read(&dmar_global_lock); + for_each_active_iommu(iommu, drhd) { + /* + * The flush queue implementation does not perform + * page-selective invalidations that are required for efficient + * TLB flushes in virtual environments. The benefit of batching + * is likely to be much lower than the overhead of synchronizing + * the virtual and physical IOMMU page-tables. + */ + if (cap_caching_mode(iommu->cap) && + !first_level_by_default(iommu)) { + pr_info_once("IOMMU batching disallowed due to virtualization\n"); + iommu_set_dma_strict(); + } + iommu_device_sysfs_add(&iommu->iommu, NULL, + intel_iommu_groups, + "%s", iommu->name); + /* + * The iommu device probe is protected by the iommu_probe_device_lock. + * Release the dmar_global_lock before entering the device probe path + * to avoid unnecessary lock order splat. + */ + up_read(&dmar_global_lock); + iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); + down_read(&dmar_global_lock); + + iommu_pmu_register(iommu); + } + + if (probe_acpi_namespace_devices()) + pr_warn("ACPI name space devices didn't probe correctly\n"); + + /* Finally, we enable the DMA remapping hardware. */ + for_each_iommu(iommu, drhd) { + if (!drhd->ignored && !translation_pre_enabled(iommu)) + iommu_enable_translation(iommu); + + iommu_disable_protect_mem_regions(iommu); + } + up_read(&dmar_global_lock); + + pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); + + intel_iommu_enabled = 1; + + return 0; + +out_free_dmar: + intel_iommu_free_dmars(); + up_write(&dmar_global_lock); + return ret; +} + +static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) +{ + struct device_domain_info *info = opaque; + + domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); + return 0; +} + +/* + * NB - intel-iommu lacks any sort of reference counting for the users of + * dependent devices. If multiple endpoints have intersecting dependent + * devices, unbinding the driver from any one of them will possibly leave + * the others unable to operate. + */ +static void domain_context_clear(struct device_domain_info *info) +{ + if (!dev_is_pci(info->dev)) { + domain_context_clear_one(info, info->bus, info->devfn); + return; + } + + pci_for_each_dma_alias(to_pci_dev(info->dev), + &domain_context_clear_one_cb, info); + iommu_disable_pci_ats(info); +} + +/* + * Clear the page table pointer in context or pasid table entries so that + * all DMA requests without PASID from the device are blocked. If the page + * table has been set, clean up the data structures. + */ +void device_block_translation(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + + /* Device in DMA blocking state. Noting to do. */ + if (!info->domain_attached) + return; + + if (info->domain) + cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); + + if (!dev_is_real_dma_subdevice(dev)) { + if (sm_supported(iommu)) + intel_pasid_tear_down_entry(iommu, dev, + IOMMU_NO_PASID, false); + else + domain_context_clear(info); + } + + /* Device now in DMA blocking state. */ + info->domain_attached = false; + + if (!info->domain) + return; + + spin_lock_irqsave(&info->domain->lock, flags); + list_del(&info->link); + spin_unlock_irqrestore(&info->domain->lock, flags); + + domain_detach_iommu(info->domain, iommu); + info->domain = NULL; +} + +static int blocking_domain_attach_dev(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev); + device_block_translation(dev); + return 0; +} + +static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old); + +static struct iommu_domain blocking_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = blocking_domain_attach_dev, + .set_dev_pasid = blocking_domain_set_dev_pasid, + } +}; + +static struct dmar_domain *paging_domain_alloc(void) +{ + struct dmar_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->dev_pasids); + INIT_LIST_HEAD(&domain->cache_tags); + spin_lock_init(&domain->lock); + spin_lock_init(&domain->cache_lock); + xa_init(&domain->iommu_array); + INIT_LIST_HEAD(&domain->s1_domains); + spin_lock_init(&domain->s1_lock); + + return domain; +} + +static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu, + unsigned int *top_level) +{ + unsigned int mgaw = cap_mgaw(iommu->cap); + + /* + * Spec 3.6 First-Stage Translation: + * + * Software must limit addresses to less than the minimum of MGAW + * and the lower canonical address width implied by FSPM (i.e., + * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level). + */ + if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) { + *top_level = 4; + return min(57, mgaw); + } + + /* Four level is always supported */ + *top_level = 3; + return min(48, mgaw); +} + +static struct iommu_domain * +intel_iommu_domain_alloc_first_stage(struct device *dev, + struct intel_iommu *iommu, u32 flags) +{ + struct pt_iommu_x86_64_cfg cfg = {}; + struct dmar_domain *dmar_domain; + int ret; + + if (flags & ~IOMMU_HWPT_ALLOC_PASID) + return ERR_PTR(-EOPNOTSUPP); + + /* Only SL is available in legacy mode */ + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) + return ERR_PTR(-EOPNOTSUPP); + + dmar_domain = paging_domain_alloc(); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + + cfg.common.hw_max_vasz_lg2 = + compute_vasz_lg2_fs(iommu, &cfg.top_level); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) | + BIT(PT_FEAT_FLUSH_RANGE); + /* First stage always uses scalable mode */ + if (!ecap_smpwc(iommu->ecap)) + cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); + dmar_domain->iommu.iommu_device = dev; + dmar_domain->iommu.nid = dev_to_node(dev); + dmar_domain->domain.ops = &intel_fs_paging_domain_ops; + /* + * iotlb sync for map is only needed for legacy implementations that + * explicitly require flushing internal write buffers to ensure memory + * coherence. + */ + if (rwbf_required(iommu)) + dmar_domain->iotlb_sync_map = true; + + ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL); + if (ret) { + kfree(dmar_domain); + return ERR_PTR(ret); + } + + if (!cap_fl1gp_support(iommu->cap)) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; + if (!intel_iommu_superpage) + dmar_domain->domain.pgsize_bitmap = SZ_4K; + + return &dmar_domain->domain; +} + +static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu, + unsigned int *top_level) +{ + unsigned int sagaw = cap_sagaw(iommu->cap); + unsigned int mgaw = cap_mgaw(iommu->cap); + + /* + * Find the largest table size that both the mgaw and sagaw support. + * This sets the valid range of IOVA and the top starting level. + * Some HW may only support a 4 or 5 level walk but must limit IOVA to + * 3 levels. + */ + if (mgaw > 48 && sagaw >= BIT(3)) { + *top_level = 4; + return min(57, mgaw); + } else if (mgaw > 39 && sagaw >= BIT(2)) { + *top_level = 3 + ffs(sagaw >> 3); + return min(48, mgaw); + } else if (mgaw > 30 && sagaw >= BIT(1)) { + *top_level = 2 + ffs(sagaw >> 2); + return min(39, mgaw); + } + return 0; +} + +static const struct iommu_dirty_ops intel_second_stage_dirty_ops = { + IOMMU_PT_DIRTY_OPS(vtdss), + .set_dirty_tracking = intel_iommu_set_dirty_tracking, +}; + +static struct iommu_domain * +intel_iommu_domain_alloc_second_stage(struct device *dev, + struct intel_iommu *iommu, u32 flags) +{ + struct pt_iommu_vtdss_cfg cfg = {}; + struct dmar_domain *dmar_domain; + unsigned int sslps; + int ret; + + if (flags & + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_PASID))) + return ERR_PTR(-EOPNOTSUPP); + + if (((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && + !nested_supported(iommu)) || + ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && + !ssads_supported(iommu))) + return ERR_PTR(-EOPNOTSUPP); + + /* Legacy mode always supports second stage */ + if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) + return ERR_PTR(-EOPNOTSUPP); + + dmar_domain = paging_domain_alloc(); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + + cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE); + + /* + * Read-only mapping is disallowed on the domain which serves as the + * parent in a nested configuration, due to HW errata + * (ERRATA_772415_SPR17) + */ + if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) + cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE); + + if (!iommu_paging_structure_coherency(iommu)) + cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); + dmar_domain->iommu.iommu_device = dev; + dmar_domain->iommu.nid = dev_to_node(dev); + dmar_domain->domain.ops = &intel_ss_paging_domain_ops; + dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; + + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops; + + ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL); + if (ret) { + kfree(dmar_domain); + return ERR_PTR(ret); + } + + /* Adjust the supported page sizes to HW capability */ + sslps = cap_super_page_val(iommu->cap); + if (!(sslps & BIT(0))) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M; + if (!(sslps & BIT(1))) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; + if (!intel_iommu_superpage) + dmar_domain->domain.pgsize_bitmap = SZ_4K; + + /* + * Besides the internal write buffer flush, the caching mode used for + * legacy nested translation (which utilizes shadowing page tables) + * also requires iotlb sync on map. + */ + if (rwbf_required(iommu) || cap_caching_mode(iommu->cap)) + dmar_domain->iotlb_sync_map = true; + + return &dmar_domain->domain; +} + +static struct iommu_domain * +intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct iommu_domain *domain; + + if (user_data) + return ERR_PTR(-EOPNOTSUPP); + + /* Prefer first stage if possible by default. */ + domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags); + if (domain != ERR_PTR(-EOPNOTSUPP)) + return domain; + return intel_iommu_domain_alloc_second_stage(dev, iommu, flags); +} + +static void intel_iommu_domain_free(struct iommu_domain *domain) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + if (WARN_ON(dmar_domain->nested_parent && + !list_empty(&dmar_domain->s1_domains))) + return; + + if (WARN_ON(!list_empty(&dmar_domain->devices))) + return; + + pt_iommu_deinit(&dmar_domain->iommu); + + kfree(dmar_domain->qi_batch); + kfree(dmar_domain); +} + +static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain, + struct intel_iommu *iommu) +{ + if (WARN_ON(dmar_domain->domain.dirty_ops || + dmar_domain->nested_parent)) + return -EINVAL; + + /* Only SL is available in legacy mode */ + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) + return -EINVAL; + + if (!ecap_smpwc(iommu->ecap) && + !(dmar_domain->fspt.x86_64_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + return -EINVAL; + + /* Supports the number of table levels */ + if (!cap_fl5lp_support(iommu->cap) && + dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48) + return -EINVAL; + + /* Same page size support */ + if (!cap_fl1gp_support(iommu->cap) && + (dmar_domain->domain.pgsize_bitmap & SZ_1G)) + return -EINVAL; + + /* iotlb sync on map requirement */ + if ((rwbf_required(iommu)) && !dmar_domain->iotlb_sync_map) + return -EINVAL; + + return 0; +} + +static int +paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, + struct intel_iommu *iommu) +{ + unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2; + unsigned int sslps = cap_super_page_val(iommu->cap); + struct pt_iommu_vtdss_hw_info pt_info; + + pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info); + + if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu)) + return -EINVAL; + if (dmar_domain->nested_parent && !nested_supported(iommu)) + return -EINVAL; + + /* Legacy mode always supports second stage */ + if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) + return -EINVAL; + + if (!iommu_paging_structure_coherency(iommu) && + !(dmar_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + return -EINVAL; + + /* Address width falls within the capability */ + if (cap_mgaw(iommu->cap) < vasz_lg2) + return -EINVAL; + + /* Page table level is supported. */ + if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw))) + return -EINVAL; + + /* Same page size support */ + if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M)) + return -EINVAL; + if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G)) + return -EINVAL; + + /* iotlb sync on map requirement */ + if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) && + !dmar_domain->iotlb_sync_map) + return -EINVAL; + + /* + * FIXME this is locked wrong, it needs to be under the + * dmar_domain->lock + */ + if ((dmar_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) && + !ecap_sc_support(iommu->ecap)) + return -EINVAL; + return 0; +} + +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + int ret = -EINVAL; + + if (intel_domain_is_fs_paging(dmar_domain)) + ret = paging_domain_compatible_first_stage(dmar_domain, iommu); + else if (intel_domain_is_ss_paging(dmar_domain)) + ret = paging_domain_compatible_second_stage(dmar_domain, iommu); + else if (WARN_ON(true)) + ret = -EINVAL; + if (ret) + return ret; + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && + context_copied(iommu, info->bus, info->devfn)) + return intel_pasid_setup_sm_context(dev); + + return 0; +} + +static int intel_iommu_attach_device(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) +{ + int ret; + + device_block_translation(dev); + + ret = paging_domain_compatible(domain, dev); + if (ret) + return ret; + + ret = iopf_for_domain_set(domain, dev); + if (ret) + return ret; + + ret = dmar_domain_attach_device(to_dmar_domain(domain), dev); + if (ret) + iopf_for_domain_remove(domain, dev); + + return ret; +} + +static void intel_iommu_tlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + cache_tag_flush_range(to_dmar_domain(domain), gather->start, + gather->end, + iommu_pages_list_empty(&gather->freelist)); + iommu_put_pages_list(&gather->freelist); +} + +static bool domain_support_force_snooping(struct dmar_domain *domain) +{ + struct device_domain_info *info; + bool support = true; + + assert_spin_locked(&domain->lock); + list_for_each_entry(info, &domain->devices, link) { + if (!ecap_sc_support(info->iommu->ecap)) { + support = false; + break; + } + } + + return support; +} + +static bool intel_iommu_enforce_cache_coherency_fs(struct iommu_domain *domain) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + + guard(spinlock_irqsave)(&dmar_domain->lock); + + if (dmar_domain->force_snooping) + return true; + + if (!domain_support_force_snooping(dmar_domain)) + return false; + + dmar_domain->force_snooping = true; + list_for_each_entry(info, &dmar_domain->devices, link) + intel_pasid_setup_page_snoop_control(info->iommu, info->dev, + IOMMU_NO_PASID); + return true; +} + +static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + guard(spinlock_irqsave)(&dmar_domain->lock); + if (!domain_support_force_snooping(dmar_domain)) + return false; + + /* + * Second level page table supports per-PTE snoop control. The + * iommu_map() interface will handle this by setting SNP bit. + */ + dmar_domain->sspt.vtdss_pt.common.features |= + BIT(PT_FEAT_VTDSS_FORCE_COHERENCE); + dmar_domain->force_snooping = true; + return true; +} + +static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + case IOMMU_CAP_DEFERRED_FLUSH: + return true; + case IOMMU_CAP_PRE_BOOT_PROTECTION: + return dmar_platform_optin(); + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return ecap_sc_support(info->iommu->ecap); + case IOMMU_CAP_DIRTY_TRACKING: + return ssads_supported(info->iommu); + default: + return false; + } +} + +static struct iommu_device *intel_iommu_probe_device(struct device *dev) +{ + struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; + struct device_domain_info *info; + struct intel_iommu *iommu; + u8 bus, devfn; + int ret; + + iommu = device_lookup_iommu(dev, &bus, &devfn); + if (!iommu || !iommu->iommu.ops) + return ERR_PTR(-ENODEV); + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + if (dev_is_real_dma_subdevice(dev)) { + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; + info->segment = pci_domain_nr(pdev->bus); + } else { + info->bus = bus; + info->devfn = devfn; + info->segment = iommu->segment; + } + + info->dev = dev; + info->iommu = iommu; + if (dev_is_pci(dev)) { + if (ecap_dev_iotlb_support(iommu->ecap) && + pci_ats_supported(pdev) && + dmar_ats_supported(pdev, iommu)) { + info->ats_supported = 1; + info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); + + /* + * For IOMMU that supports device IOTLB throttling + * (DIT), we assign PFSID to the invalidation desc + * of a VF such that IOMMU HW can gauge queue depth + * at PF level. If DIT is not set, PFSID will be + * treated as reserved, which should be set to 0. + */ + if (ecap_dit(iommu->ecap)) + info->pfsid = pci_dev_id(pci_physfn(pdev)); + info->ats_qdep = pci_ats_queue_depth(pdev); + } + if (sm_supported(iommu)) { + if (pasid_supported(iommu)) { + int features = pci_pasid_features(pdev); + + if (features >= 0) + info->pasid_supported = features | 1; + } + + if (info->ats_supported && ecap_prs(iommu->ecap) && + ecap_pds(iommu->ecap) && pci_pri_supported(pdev)) + info->pri_supported = 1; + } + } + + dev_iommu_priv_set(dev, info); + if (pdev && pci_ats_supported(pdev)) { + pci_prepare_ats(pdev, VTD_PAGE_SHIFT); + ret = device_rbtree_insert(iommu, info); + if (ret) + goto free; + } + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { + ret = intel_pasid_alloc_table(dev); + if (ret) { + dev_err(dev, "PASID table allocation failed\n"); + goto clear_rbtree; + } + + if (!context_copied(iommu, info->bus, info->devfn)) { + ret = intel_pasid_setup_sm_context(dev); + if (ret) + goto free_table; + } + } + + intel_iommu_debugfs_create_dev(info); + + return &iommu->iommu; +free_table: + intel_pasid_free_table(dev); +clear_rbtree: + device_rbtree_remove(info); +free: + kfree(info); + + return ERR_PTR(ret); +} + +static void intel_iommu_probe_finalize(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + /* + * The PCIe spec, in its wisdom, declares that the behaviour of the + * device is undefined if you enable PASID support after ATS support. + * So always enable PASID support on devices which have it, even if + * we can't yet know if we're ever going to use it. + */ + if (info->pasid_supported && + !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1)) + info->pasid_enabled = 1; + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { + iommu_enable_pci_ats(info); + /* Assign a DEVTLB cache tag to the default domain. */ + if (info->ats_enabled && info->domain) { + u16 did = domain_id_iommu(info->domain, iommu); + + if (cache_tag_assign(info->domain, did, dev, + IOMMU_NO_PASID, CACHE_TAG_DEVTLB)) + iommu_disable_pci_ats(info); + } + } + iommu_enable_pci_pri(info); +} + +static void intel_iommu_release_device(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + iommu_disable_pci_pri(info); + iommu_disable_pci_ats(info); + + if (info->pasid_enabled) { + pci_disable_pasid(to_pci_dev(dev)); + info->pasid_enabled = 0; + } + + mutex_lock(&iommu->iopf_lock); + if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) + device_rbtree_remove(info); + mutex_unlock(&iommu->iopf_lock); + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && + !context_copied(iommu, info->bus, info->devfn)) + intel_pasid_teardown_sm_context(dev); + + intel_pasid_free_table(dev); + intel_iommu_debugfs_remove_dev(info); + kfree(info); +} + +static void intel_iommu_get_resv_regions(struct device *device, + struct list_head *head) +{ + int prot = DMA_PTE_READ | DMA_PTE_WRITE; + struct iommu_resv_region *reg; + struct dmar_rmrr_unit *rmrr; + struct device *i_dev; + int i; + + rcu_read_lock(); + for_each_rmrr_units(rmrr) { + for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, + i, i_dev) { + struct iommu_resv_region *resv; + enum iommu_resv_type type; + size_t length; + + if (i_dev != device && + !is_downstream_to_pci_bridge(device, i_dev)) + continue; + + length = rmrr->end_address - rmrr->base_address + 1; + + type = device_rmrr_is_relaxable(device) ? + IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; + + resv = iommu_alloc_resv_region(rmrr->base_address, + length, prot, type, + GFP_ATOMIC); + if (!resv) + break; + + list_add_tail(&resv->list, head); + } + } + rcu_read_unlock(); + +#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA + if (dev_is_pci(device)) { + struct pci_dev *pdev = to_pci_dev(device); + + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { + reg = iommu_alloc_resv_region(0, 1UL << 24, prot, + IOMMU_RESV_DIRECT_RELAXABLE, + GFP_KERNEL); + if (reg) + list_add_tail(®->list, head); + } + } +#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ + + reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, + IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, + 0, IOMMU_RESV_MSI, GFP_KERNEL); + if (!reg) + return; + list_add_tail(®->list, head); +} + +static struct iommu_group *intel_iommu_device_group(struct device *dev) +{ + if (dev_is_pci(dev)) + return pci_device_group(dev); + return generic_device_group(dev); +} + +int intel_iommu_enable_iopf(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + if (!info->pri_enabled) + return -ENODEV; + + /* pri_enabled is protected by the group mutex. */ + iommu_group_mutex_assert(dev); + if (info->iopf_refcount) { + info->iopf_refcount++; + return 0; + } + + ret = iopf_queue_add_device(iommu->iopf_queue, dev); + if (ret) + return ret; + + info->iopf_refcount = 1; + + return 0; +} + +void intel_iommu_disable_iopf(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + if (WARN_ON(!info->pri_enabled || !info->iopf_refcount)) + return; + + iommu_group_mutex_assert(dev); + if (--info->iopf_refcount) + return; + + iopf_queue_remove_device(iommu->iopf_queue, dev); +} + +static bool intel_iommu_is_attach_deferred(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + return translation_pre_enabled(info->iommu) && !info->domain; +} + +/* + * Check that the device does not live on an external facing PCI port that is + * marked as untrusted. Such devices should not be able to apply quirks and + * thus not be able to bypass the IOMMU restrictions. + */ +static bool risky_device(struct pci_dev *pdev) +{ + if (pdev->untrusted) { + pci_info(pdev, + "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", + pdev->vendor, pdev->device); + pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); + return true; + } + return false; +} + +static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + if (dmar_domain->iotlb_sync_map) + cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1); + + return 0; +} + +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dev_pasid_info *curr, *dev_pasid = NULL; + struct intel_iommu *iommu = info->iommu; + struct dmar_domain *dmar_domain; + unsigned long flags; + + if (!domain) + return; + + /* Identity domain has no meta data for pasid. */ + if (domain->type == IOMMU_DOMAIN_IDENTITY) + return; + + dmar_domain = to_dmar_domain(domain); + spin_lock_irqsave(&dmar_domain->lock, flags); + list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { + if (curr->dev == dev && curr->pasid == pasid) { + list_del(&curr->link_domain); + dev_pasid = curr; + break; + } + } + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + cache_tag_unassign_domain(dmar_domain, dev, pasid); + domain_detach_iommu(dmar_domain, iommu); + if (!WARN_ON_ONCE(!dev_pasid)) { + intel_iommu_debugfs_remove_dev_pasid(dev_pasid); + kfree(dev_pasid); + } +} + +static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); + iopf_for_domain_remove(old, dev); + domain_remove_dev_pasid(old, dev, pasid); + + return 0; +} + +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + unsigned long flags; + int ret; + + dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); + if (!dev_pasid) + return ERR_PTR(-ENOMEM); + + ret = domain_attach_iommu(dmar_domain, iommu); + if (ret) + goto out_free; + + ret = cache_tag_assign_domain(dmar_domain, dev, pasid); + if (ret) + goto out_detach_iommu; + + dev_pasid->dev = dev; + dev_pasid->pasid = pasid; + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + return dev_pasid; +out_detach_iommu: + domain_detach_iommu(dmar_domain, iommu); +out_free: + kfree(dev_pasid); + return ERR_PTR(ret); +} + +static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + int ret; + + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) + return -EINVAL; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + if (domain->dirty_ops) + return -EINVAL; + + if (context_copied(iommu, info->bus, info->devfn)) + return -EBUSY; + + ret = paging_domain_compatible(domain, dev); + if (ret) + return ret; + + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); + + ret = iopf_for_domain_replace(domain, old, dev); + if (ret) + goto out_remove_dev_pasid; + + if (intel_domain_is_fs_paging(dmar_domain)) + ret = domain_setup_first_level(iommu, dmar_domain, + dev, pasid, old); + else if (intel_domain_is_ss_paging(dmar_domain)) + ret = domain_setup_second_level(iommu, dmar_domain, + dev, pasid, old); + else if (WARN_ON(true)) + ret = -EINVAL; + + if (ret) + goto out_unwind_iopf; + + domain_remove_dev_pasid(old, dev, pasid); + + intel_iommu_debugfs_create_dev_pasid(dev_pasid); + + return 0; + +out_unwind_iopf: + iopf_for_domain_replace(old, domain, dev); +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); + return ret; +} + +static void *intel_iommu_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct iommu_hw_info_vtd *vtd; + + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_INTEL_VTD) + return ERR_PTR(-EOPNOTSUPP); + + vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); + if (!vtd) + return ERR_PTR(-ENOMEM); + + vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; + vtd->cap_reg = iommu->cap; + vtd->ecap_reg = iommu->ecap; + *length = sizeof(*vtd); + *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; + return vtd; +} + +/* + * Set dirty tracking for the device list of a domain. The caller must + * hold the domain->lock when calling it. + */ +static int device_set_dirty_tracking(struct list_head *devices, bool enable) +{ + struct device_domain_info *info; + int ret = 0; + + list_for_each_entry(info, devices, link) { + ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, + IOMMU_NO_PASID, enable); + if (ret) + break; + } + + return ret; +} + +static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, + bool enable) +{ + struct dmar_domain *s1_domain; + unsigned long flags; + int ret; + + spin_lock(&domain->s1_lock); + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + spin_lock_irqsave(&s1_domain->lock, flags); + ret = device_set_dirty_tracking(&s1_domain->devices, enable); + spin_unlock_irqrestore(&s1_domain->lock, flags); + if (ret) + goto err_unwind; + } + spin_unlock(&domain->s1_lock); + return 0; + +err_unwind: + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + spin_lock_irqsave(&s1_domain->lock, flags); + device_set_dirty_tracking(&s1_domain->devices, + domain->dirty_tracking); + spin_unlock_irqrestore(&s1_domain->lock, flags); + } + spin_unlock(&domain->s1_lock); + return ret; +} + +static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + int ret; + + spin_lock(&dmar_domain->lock); + if (dmar_domain->dirty_tracking == enable) + goto out_unlock; + + ret = device_set_dirty_tracking(&dmar_domain->devices, enable); + if (ret) + goto err_unwind; + + if (dmar_domain->nested_parent) { + ret = parent_domain_set_dirty_tracking(dmar_domain, enable); + if (ret) + goto err_unwind; + } + + dmar_domain->dirty_tracking = enable; +out_unlock: + spin_unlock(&dmar_domain->lock); + + return 0; + +err_unwind: + device_set_dirty_tracking(&dmar_domain->devices, + dmar_domain->dirty_tracking); + spin_unlock(&dmar_domain->lock); + return ret; +} + +static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct context_entry *context; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, bus, devfn, 1); + if (!context) { + spin_unlock(&iommu->lock); + return -ENOMEM; + } + + if (context_present(context) && !context_copied(iommu, bus, devfn)) { + spin_unlock(&iommu->lock); + return 0; + } + + copied_context_tear_down(iommu, context, bus, devfn); + context_clear_entry(context); + context_set_domain_id(context, FLPT_DEFAULT_DID); + + /* + * In pass through mode, AW must be programmed to indicate the largest + * AGAW value supported by hardware. And ASR is ignored by hardware. + */ + context_set_address_width(context, iommu->msagaw); + context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH); + context_set_fault_enable(context); + context_set_present(context); + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(context, sizeof(*context)); + context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn); + spin_unlock(&iommu->lock); + + return 0; +} + +static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data) +{ + struct device *dev = data; + + return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); +} + +static int device_setup_pass_through(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + if (!dev_is_pci(dev)) + return context_setup_pass_through(dev, info->bus, info->devfn); + + return pci_for_each_dma_alias(to_pci_dev(dev), + context_setup_pass_through_cb, dev); +} + +static int identity_domain_attach_dev(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + device_block_translation(dev); + + if (dev_is_real_dma_subdevice(dev)) + return 0; + + /* + * No PRI support with the global identity domain. No need to enable or + * disable PRI in this path as the iommu has been put in the blocking + * state. + */ + if (sm_supported(iommu)) + ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); + else + ret = device_setup_pass_through(dev); + + if (!ret) + info->domain_attached = true; + + return ret; +} + +static int identity_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + ret = iopf_for_domain_replace(domain, old, dev); + if (ret) + return ret; + + ret = domain_setup_passthrough(iommu, dev, pasid, old); + if (ret) { + iopf_for_domain_replace(old, domain, dev); + return ret; + } + + domain_remove_dev_pasid(old, dev, pasid); + return 0; +} + +static struct iommu_domain identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = identity_domain_attach_dev, + .set_dev_pasid = identity_domain_set_dev_pasid, + }, +}; + +const struct iommu_domain_ops intel_fs_paging_domain_ops = { + IOMMU_PT_DOMAIN_OPS(x86_64), + .attach_dev = intel_iommu_attach_device, + .set_dev_pasid = intel_iommu_set_dev_pasid, + .iotlb_sync_map = intel_iommu_iotlb_sync_map, + .flush_iotlb_all = intel_flush_iotlb_all, + .iotlb_sync = intel_iommu_tlb_sync, + .free = intel_iommu_domain_free, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs, +}; + +const struct iommu_domain_ops intel_ss_paging_domain_ops = { + IOMMU_PT_DOMAIN_OPS(vtdss), + .attach_dev = intel_iommu_attach_device, + .set_dev_pasid = intel_iommu_set_dev_pasid, + .iotlb_sync_map = intel_iommu_iotlb_sync_map, + .flush_iotlb_all = intel_flush_iotlb_all, + .iotlb_sync = intel_iommu_tlb_sync, + .free = intel_iommu_domain_free, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss, +}; + +const struct iommu_ops intel_iommu_ops = { + .blocked_domain = &blocking_domain, + .release_domain = &blocking_domain, + .identity_domain = &identity_domain, + .capable = intel_iommu_capable, + .hw_info = intel_iommu_hw_info, + .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags, + .domain_alloc_sva = intel_svm_domain_alloc, + .domain_alloc_nested = intel_iommu_domain_alloc_nested, + .probe_device = intel_iommu_probe_device, + .probe_finalize = intel_iommu_probe_finalize, + .release_device = intel_iommu_release_device, + .get_resv_regions = intel_iommu_get_resv_regions, + .device_group = intel_iommu_device_group, + .is_attach_deferred = intel_iommu_is_attach_deferred, + .def_domain_type = device_def_domain_type, + .page_response = intel_iommu_page_response, +}; + +static void quirk_iommu_igfx(struct pci_dev *dev) +{ + if (risky_device(dev)) + return; + + pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); + disable_igfx_iommu = 1; +} + +/* G4x/GM45 integrated gfx dmar support is totally busted. */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); + +/* QM57/QS57 integrated gfx malfunctions with dmar */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx); + +/* Broadwell igfx malfunctions with dmar */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); + +static void quirk_iommu_rwbf(struct pci_dev *dev) +{ + if (risky_device(dev)) + return; + + /* + * Mobile 4 Series Chipset neglects to set RWBF capability, + * but needs it. Same seems to hold for the desktop versions. + */ + pci_info(dev, "Forcing write-buffer flush capability\n"); + rwbf_quirk = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); + +#define GGC 0x52 +#define GGC_MEMORY_SIZE_MASK (0xf << 8) +#define GGC_MEMORY_SIZE_NONE (0x0 << 8) +#define GGC_MEMORY_SIZE_1M (0x1 << 8) +#define GGC_MEMORY_SIZE_2M (0x3 << 8) +#define GGC_MEMORY_VT_ENABLED (0x8 << 8) +#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) +#define GGC_MEMORY_SIZE_3M_VT (0xa << 8) +#define GGC_MEMORY_SIZE_4M_VT (0xb << 8) + +static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) +{ + unsigned short ggc; + + if (risky_device(dev)) + return; + + if (pci_read_config_word(dev, GGC, &ggc)) + return; + + if (!(ggc & GGC_MEMORY_VT_ENABLED)) { + pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); + disable_igfx_iommu = 1; + } else if (!disable_igfx_iommu) { + /* we have to ensure the gfx device is idle before we flush */ + pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); + iommu_set_dma_strict(); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); + +static void quirk_igfx_skip_te_disable(struct pci_dev *dev) +{ + unsigned short ver; + + if (!IS_GFX_DEVICE(dev)) + return; + + ver = (dev->device >> 8) & 0xff; + if (ver != 0x45 && ver != 0x46 && ver != 0x4c && + ver != 0x4e && ver != 0x8a && ver != 0x98 && + ver != 0x9a && ver != 0xa7 && ver != 0x7d) + return; + + if (risky_device(dev)) + return; + + pci_info(dev, "Skip IOMMU disabling for graphics\n"); + iommu_skip_te_disable = 1; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); + +/* On Tylersburg chipsets, some BIOSes have been known to enable the + ISOCH DMAR unit for the Azalia sound device, but not give it any + TLB entries, which causes it to deadlock. Check for that. We do + this in a function called from init_dmars(), instead of in a PCI + quirk, because we don't want to print the obnoxious "BIOS broken" + message if VT-d is actually disabled. +*/ +static void __init check_tylersburg_isoch(void) +{ + struct pci_dev *pdev; + uint32_t vtisochctrl; + + /* If there's no Azalia in the system anyway, forget it. */ + pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); + if (!pdev) + return; + + if (risky_device(pdev)) { + pci_dev_put(pdev); + return; + } + + pci_dev_put(pdev); + + /* System Management Registers. Might be hidden, in which case + we can't do the sanity check. But that's OK, because the + known-broken BIOSes _don't_ actually hide it, so far. */ + pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); + if (!pdev) + return; + + if (risky_device(pdev)) { + pci_dev_put(pdev); + return; + } + + if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { + pci_dev_put(pdev); + return; + } + + pci_dev_put(pdev); + + /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ + if (vtisochctrl & 1) + return; + + /* Drop all bits other than the number of TLB entries */ + vtisochctrl &= 0x1c; + + /* If we have the recommended number of TLB entries (16), fine. */ + if (vtisochctrl == 0x10) + return; + + /* Zero TLB entries? You get to ride the short bus to school. */ + if (!vtisochctrl) { + WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" + "BIOS vendor: %s; Ver: %s; Product Version: %s\n", + dmi_get_system_info(DMI_BIOS_VENDOR), + dmi_get_system_info(DMI_BIOS_VERSION), + dmi_get_system_info(DMI_PRODUCT_VERSION)); + iommu_identity_mapping |= IDENTMAP_AZALIA; + return; + } + + pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", + vtisochctrl); +} + +/* + * Here we deal with a device TLB defect where device may inadvertently issue ATS + * invalidation completion before posted writes initiated with translated address + * that utilized translations matching the invalidation address range, violating + * the invalidation completion ordering. + * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is + * vulnerable to this defect. In other words, any dTLB invalidation initiated not + * under the control of the trusted/privileged host device driver must use this + * quirk. + * Device TLBs are invalidated under the following six conditions: + * 1. Device driver does DMA API unmap IOVA + * 2. Device driver unbind a PASID from a process, sva_unbind_device() + * 3. PASID is torn down, after PASID cache is flushed. e.g. process + * exit_mmap() due to crash + * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where + * VM has to free pages that were unmapped + * 5. Userspace driver unmaps a DMA buffer + * 6. Cache invalidation in vSVA usage (upcoming) + * + * For #1 and #2, device drivers are responsible for stopping DMA traffic + * before unmap/unbind. For #3, iommu driver gets mmu_notifier to + * invalidate TLB the same way as normal user unmap which will use this quirk. + * The dTLB invalidation after PASID cache flush does not need this quirk. + * + * As a reminder, #6 will *NEED* this quirk as we enable nested translation. + */ +void quirk_extra_dev_tlb_flush(struct device_domain_info *info, + unsigned long address, unsigned long mask, + u32 pasid, u16 qdep) +{ + u16 sid; + + if (likely(!info->dtlb_extra_inval)) + return; + + sid = PCI_DEVID(info->bus, info->devfn); + if (pasid == IOMMU_NO_PASID) { + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, + qdep, address, mask); + } else { + qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, + pasid, qdep, address, mask); + } +} + +#define ecmd_get_status_code(res) (((res) & 0xff) >> 1) + +/* + * Function to submit a command to the enhanced command interface. The + * valid enhanced command descriptions are defined in Table 47 of the + * VT-d spec. The VT-d hardware implementation may support some but not + * all commands, which can be determined by checking the Enhanced + * Command Capability Register. + * + * Return values: + * - 0: Command successful without any error; + * - Negative: software error value; + * - Nonzero positive: failure status code defined in Table 48. + */ +int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) +{ + unsigned long flags; + u64 res; + int ret; + + if (!cap_ecmds(iommu->cap)) + return -ENODEV; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); + if (res & DMA_ECMD_ECRSP_IP) { + ret = -EBUSY; + goto err; + } + + /* + * Unconditionally write the operand B, because + * - There is no side effect if an ecmd doesn't require an + * operand B, but we set the register to some value. + * - It's not invoked in any critical path. The extra MMIO + * write doesn't bring any performance concerns. + */ + dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); + dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); + + IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, + !(res & DMA_ECMD_ECRSP_IP), res); + + if (res & DMA_ECMD_ECRSP_IP) { + ret = -ETIMEDOUT; + goto err; + } + + ret = ecmd_get_status_code(res); +err: + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + return ret; +} + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); |
