summaryrefslogtreecommitdiff
path: root/drivers/iommu/intel/iommu.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/iommu/intel/iommu.c')
-rw-r--r--drivers/iommu/intel/iommu.c4861
1 files changed, 1641 insertions, 3220 deletions
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index a6a07d985709..134302fbcd92 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -13,47 +13,28 @@
#define pr_fmt(fmt) "DMAR: " fmt
#define dev_fmt(fmt) pr_fmt(fmt)
-#include <linux/init.h>
-#include <linux/bitmap.h>
-#include <linux/debugfs.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/dmar.h>
-#include <linux/dma-map-ops.h>
-#include <linux/mempool.h>
+#include <linux/crash_dump.h>
+#include <linux/dma-direct.h>
+#include <linux/dmi.h>
#include <linux/memory.h>
-#include <linux/cpu.h>
-#include <linux/timer.h>
-#include <linux/io.h>
-#include <linux/iova.h>
-#include <linux/iommu.h>
-#include <linux/dma-iommu.h>
-#include <linux/intel-iommu.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
-#include <linux/dmi.h>
-#include <linux/pci-ats.h>
-#include <linux/memblock.h>
-#include <linux/dma-direct.h>
-#include <linux/crash_dump.h>
-#include <linux/numa.h>
-#include <asm/irq_remapping.h>
-#include <asm/cacheflush.h>
-#include <asm/iommu.h>
+#include <uapi/linux/iommufd.h>
+#include "iommu.h"
+#include "../dma-iommu.h"
#include "../irq_remapping.h"
-#include "../iommu-sva-lib.h"
+#include "../iommu-pages.h"
#include "pasid.h"
-#include "cap_audit.h"
+#include "perfmon.h"
#define ROOT_SIZE VTD_PAGE_SIZE
#define CONTEXT_SIZE VTD_PAGE_SIZE
-#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
+#define IS_GFX_DEVICE(pdev) pci_is_display(pdev)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
@@ -64,116 +45,13 @@
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
-#define MAX_AGAW_WIDTH 64
-#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
-
-#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
-#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
-
-/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
- to match. That way, we can use 'unsigned long' for PFNs with impunity. */
-#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
- __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
-#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
-
-/* IO virtual address start page frame number */
-#define IOVA_START_PFN (1)
-
-#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
-
-/* page table handling */
-#define LEVEL_STRIDE (9)
-#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
-
-/*
- * This bitmap is used to advertise the page sizes our hardware support
- * to the IOMMU core, which will then use this information to split
- * physically contiguous memory regions it is mapping into page sizes
- * that we support.
- *
- * Traditionally the IOMMU core just handed us the mappings directly,
- * after making sure the size is an order of a 4KiB page and that the
- * mapping has natural alignment.
- *
- * To retain this behavior, we currently advertise that we support
- * all page sizes that are an order of 4KiB.
- *
- * If at some point we'd like to utilize the IOMMU core's new behavior,
- * we could change this to advertise the real page sizes we support.
- */
-#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
-
-static inline int agaw_to_level(int agaw)
-{
- return agaw + 2;
-}
-
-static inline int agaw_to_width(int agaw)
-{
- return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
-}
-
-static inline int width_to_agaw(int width)
-{
- return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
-}
-
-static inline unsigned int level_to_offset_bits(int level)
-{
- return (level - 1) * LEVEL_STRIDE;
-}
-
-static inline int pfn_level_offset(u64 pfn, int level)
-{
- return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
-}
-
-static inline u64 level_mask(int level)
-{
- return -1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 level_size(int level)
-{
- return 1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 align_to_level(u64 pfn, int level)
-{
- return (pfn + level_size(level) - 1) & level_mask(level);
-}
-
-static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
-{
- return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
-}
-
-/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
- are never going to work. */
-static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
-{
- return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
-}
-
-static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
-{
- return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
-}
-static inline unsigned long page_to_dma_pfn(struct page *pg)
-{
- return mm_to_dma_pfn(page_to_pfn(pg));
-}
-static inline unsigned long virt_to_dma_pfn(void *p)
-{
- return page_to_dma_pfn(virt_to_page(p));
-}
-
-/* global iommu list, set NULL for ignored DMAR units */
-static struct intel_iommu **g_iommus;
-
static void __init check_tylersburg_isoch(void);
+static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable);
static int rwbf_quirk;
+#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap))
+
/*
* set to 1 to panic kernel if can't successfully enable VT-d
* (used when kernel is launched w/ TXT)
@@ -208,98 +86,81 @@ static phys_addr_t root_entry_uctp(struct root_entry *re)
return re->hi & VTD_PAGE_MASK;
}
-static inline void context_clear_pasid_enable(struct context_entry *context)
+static int device_rid_cmp_key(const void *key, const struct rb_node *node)
{
- context->lo &= ~(1ULL << 11);
-}
+ struct device_domain_info *info =
+ rb_entry(node, struct device_domain_info, node);
+ const u16 *rid_lhs = key;
-static inline bool context_pasid_enabled(struct context_entry *context)
-{
- return !!(context->lo & (1ULL << 11));
-}
+ if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
+ return -1;
-static inline void context_set_copied(struct context_entry *context)
-{
- context->hi |= (1ull << 3);
-}
+ if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
+ return 1;
-static inline bool context_copied(struct context_entry *context)
-{
- return !!(context->hi & (1ULL << 3));
+ return 0;
}
-static inline bool __context_present(struct context_entry *context)
+static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
{
- return (context->lo & 1);
-}
+ struct device_domain_info *info =
+ rb_entry(lhs, struct device_domain_info, node);
+ u16 key = PCI_DEVID(info->bus, info->devfn);
-bool context_present(struct context_entry *context)
-{
- return context_pasid_enabled(context) ?
- __context_present(context) :
- __context_present(context) && !context_copied(context);
+ return device_rid_cmp_key(&key, rhs);
}
-static inline void context_set_present(struct context_entry *context)
+/*
+ * Looks up an IOMMU-probed device using its source ID.
+ *
+ * Returns the pointer to the device if there is a match. Otherwise,
+ * returns NULL.
+ *
+ * Note that this helper doesn't guarantee that the device won't be
+ * released by the iommu subsystem after being returned. The caller
+ * should use its own synchronization mechanism to avoid the device
+ * being released during its use if its possibly the case.
+ */
+struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
{
- context->lo |= 1;
-}
+ struct device_domain_info *info = NULL;
+ struct rb_node *node;
+ unsigned long flags;
-static inline void context_set_fault_enable(struct context_entry *context)
-{
- context->lo &= (((u64)-1) << 2) | 1;
-}
+ spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
+ node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
+ if (node)
+ info = rb_entry(node, struct device_domain_info, node);
+ spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
-static inline void context_set_translation_type(struct context_entry *context,
- unsigned long value)
-{
- context->lo &= (((u64)-1) << 4) | 3;
- context->lo |= (value & 3) << 2;
+ return info ? info->dev : NULL;
}
-static inline void context_set_address_root(struct context_entry *context,
- unsigned long value)
+static int device_rbtree_insert(struct intel_iommu *iommu,
+ struct device_domain_info *info)
{
- context->lo &= ~VTD_PAGE_MASK;
- context->lo |= value & VTD_PAGE_MASK;
-}
+ struct rb_node *curr;
+ unsigned long flags;
-static inline void context_set_address_width(struct context_entry *context,
- unsigned long value)
-{
- context->hi |= value & 7;
-}
+ spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
+ curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
+ spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
+ if (WARN_ON(curr))
+ return -EEXIST;
-static inline void context_set_domain_id(struct context_entry *context,
- unsigned long value)
-{
- context->hi |= (value & ((1 << 16) - 1)) << 8;
+ return 0;
}
-static inline int context_domain_id(struct context_entry *c)
+static void device_rbtree_remove(struct device_domain_info *info)
{
- return((c->hi >> 8) & 0xffff);
-}
+ struct intel_iommu *iommu = info->iommu;
+ unsigned long flags;
-static inline void context_clear_entry(struct context_entry *context)
-{
- context->lo = 0;
- context->hi = 0;
+ spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
+ rb_erase(&info->node, &iommu->device_rbtree);
+ spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
}
-/*
- * This domain is a statically identity mapping domain.
- * 1. This domain creats a static 1:1 mapping to all usable memory.
- * 2. It maps to each iommu if successful.
- * 3. Each iommu mapps to this domain if successful.
- */
-static struct dmar_domain *si_domain;
-static int hw_pass_through = 1;
-
-#define for_each_domain_iommu(idx, domain) \
- for (idx = 0; idx < g_num_of_iommus; idx++) \
- if (domain->iommu_refcnt[idx])
-
struct dmar_rmrr_unit {
struct list_head list; /* list of rmrr units */
struct acpi_dmar_header *hdr; /* ACPI header */
@@ -333,87 +194,21 @@ static LIST_HEAD(dmar_satc_units);
#define for_each_rmrr_units(rmrr) \
list_for_each_entry(rmrr, &dmar_rmrr_units, list)
-/* bitmap for indexing intel_iommus */
-static int g_num_of_iommus;
-
-static void domain_exit(struct dmar_domain *domain);
-static void domain_remove_dev_info(struct dmar_domain *domain);
-static void dmar_remove_one_dev_info(struct device *dev);
-static void __dmar_remove_one_dev_info(struct device_domain_info *info);
-static int intel_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev);
-static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova);
+static void intel_iommu_domain_free(struct iommu_domain *domain);
-#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
-int dmar_disabled = 0;
-#else
-int dmar_disabled = 1;
-#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
-
-#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
-int intel_iommu_sm = 1;
-#else
-int intel_iommu_sm;
-#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
+int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
+int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);
-static int dmar_map_gfx = 1;
-static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;
+static int disable_igfx_iommu;
-#define IDENTMAP_GFX 2
#define IDENTMAP_AZALIA 4
-int intel_iommu_gfx_mapped;
-EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
-
-#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
-struct device_domain_info *get_domain_info(struct device *dev)
-{
- struct device_domain_info *info;
-
- if (!dev)
- return NULL;
-
- info = dev_iommu_priv_get(dev);
- if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
- return NULL;
-
- return info;
-}
-
-DEFINE_SPINLOCK(device_domain_lock);
-static LIST_HEAD(device_domain_list);
-
-/*
- * Iterate over elements in device_domain_list and call the specified
- * callback @fn against each element.
- */
-int for_each_device_domain(int (*fn)(struct device_domain_info *info,
- void *data), void *data)
-{
- int ret = 0;
- unsigned long flags;
- struct device_domain_info *info;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- list_for_each_entry(info, &device_domain_list, global) {
- ret = fn(info, data);
- if (ret) {
- spin_unlock_irqrestore(&device_domain_lock, flags);
- return ret;
- }
- }
- spin_unlock_irqrestore(&device_domain_lock, flags);
-
- return 0;
-}
-
const struct iommu_ops intel_iommu_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
@@ -439,6 +234,7 @@ static int __init intel_iommu_setup(char *str)
{
if (!str)
return -EINVAL;
+
while (*str) {
if (!strncmp(str, "on", 2)) {
dmar_disabled = 0;
@@ -448,118 +244,60 @@ static int __init intel_iommu_setup(char *str)
no_platform_optin = 1;
pr_info("IOMMU disabled\n");
} else if (!strncmp(str, "igfx_off", 8)) {
- dmar_map_gfx = 0;
+ disable_igfx_iommu = 1;
pr_info("Disable GFX device mapping\n");
} else if (!strncmp(str, "forcedac", 8)) {
pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
iommu_dma_forcedac = true;
} else if (!strncmp(str, "strict", 6)) {
- pr_info("Disable batched IOTLB flush\n");
- intel_iommu_strict = 1;
+ pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
+ iommu_set_dma_strict();
} else if (!strncmp(str, "sp_off", 6)) {
pr_info("Disable supported super page\n");
intel_iommu_superpage = 0;
} else if (!strncmp(str, "sm_on", 5)) {
- pr_info("Intel-IOMMU: scalable mode supported\n");
+ pr_info("Enable scalable mode if hardware supports\n");
intel_iommu_sm = 1;
+ } else if (!strncmp(str, "sm_off", 6)) {
+ pr_info("Scalable mode is disallowed\n");
+ intel_iommu_sm = 0;
} else if (!strncmp(str, "tboot_noforce", 13)) {
pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
intel_iommu_tboot_noforce = 1;
+ } else {
+ pr_notice("Unknown option - '%s'\n", str);
}
str += strcspn(str, ",");
while (*str == ',')
str++;
}
- return 0;
-}
-__setup("intel_iommu=", intel_iommu_setup);
-
-static struct kmem_cache *iommu_domain_cache;
-static struct kmem_cache *iommu_devinfo_cache;
-
-static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
-{
- struct dmar_domain **domains;
- int idx = did >> 8;
-
- domains = iommu->domains[idx];
- if (!domains)
- return NULL;
-
- return domains[did & 0xff];
-}
-
-static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
- struct dmar_domain *domain)
-{
- struct dmar_domain **domains;
- int idx = did >> 8;
-
- if (!iommu->domains[idx]) {
- size_t size = 256 * sizeof(struct dmar_domain *);
- iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
- }
- domains = iommu->domains[idx];
- if (WARN_ON(!domains))
- return;
- else
- domains[did & 0xff] = domain;
-}
-
-void *alloc_pgtable_page(int node)
-{
- struct page *page;
- void *vaddr = NULL;
-
- page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
- if (page)
- vaddr = page_address(page);
- return vaddr;
-}
-
-void free_pgtable_page(void *vaddr)
-{
- free_page((unsigned long)vaddr);
-}
-
-static inline void *alloc_domain_mem(void)
-{
- return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
-}
-
-static void free_domain_mem(void *vaddr)
-{
- kmem_cache_free(iommu_domain_cache, vaddr);
-}
-
-static inline void * alloc_devinfo_mem(void)
-{
- return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
+ return 1;
}
+__setup("intel_iommu=", intel_iommu_setup);
-static inline void free_devinfo_mem(void *vaddr)
+/*
+ * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
+ * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
+ * the returned SAGAW.
+ */
+static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
{
- kmem_cache_free(iommu_devinfo_cache, vaddr);
-}
+ unsigned long fl_sagaw, sl_sagaw;
-static inline int domain_type_is_si(struct dmar_domain *domain)
-{
- return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
-}
+ fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
+ sl_sagaw = cap_sagaw(iommu->cap);
-static inline bool domain_use_first_level(struct dmar_domain *domain)
-{
- return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
-}
+ /* Second level only. */
+ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
+ return sl_sagaw;
-static inline int domain_pfn_supported(struct dmar_domain *domain,
- unsigned long pfn)
-{
- int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+ /* First level only. */
+ if (!ecap_slts(iommu->ecap))
+ return fl_sagaw;
- return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
+ return fl_sagaw & sl_sagaw;
}
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
@@ -567,9 +305,8 @@ static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
unsigned long sagaw;
int agaw;
- sagaw = cap_sagaw(iommu->cap);
- for (agaw = width_to_agaw(max_gaw);
- agaw >= 0; agaw--) {
+ sagaw = __iommu_calculate_sagaw(iommu);
+ for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
if (test_bit(agaw, &sagaw))
break;
}
@@ -595,176 +332,12 @@ int iommu_calculate_agaw(struct intel_iommu *iommu)
return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
-/* This functionin only returns single iommu in a domain */
-struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
-{
- int iommu_id;
-
- /* si_domain and vm domain should not get here. */
- if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
- return NULL;
-
- for_each_domain_iommu(iommu_id, domain)
- break;
-
- if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
- return NULL;
-
- return g_iommus[iommu_id];
-}
-
-static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
+static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
{
return sm_supported(iommu) ?
ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}
-static void domain_update_iommu_coherency(struct dmar_domain *domain)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- bool found = false;
- int i;
-
- domain->iommu_coherency = true;
-
- for_each_domain_iommu(i, domain) {
- found = true;
- if (!iommu_paging_structure_coherency(g_iommus[i])) {
- domain->iommu_coherency = false;
- break;
- }
- }
- if (found)
- return;
-
- /* No hardware attached; use lowest common denominator */
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (!iommu_paging_structure_coherency(iommu)) {
- domain->iommu_coherency = false;
- break;
- }
- }
- rcu_read_unlock();
-}
-
-static bool domain_update_iommu_snooping(struct intel_iommu *skip)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- bool ret = true;
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (iommu != skip) {
- /*
- * If the hardware is operating in the scalable mode,
- * the snooping control is always supported since we
- * always set PASID-table-entry.PGSNP bit if the domain
- * is managed outside (UNMANAGED).
- */
- if (!sm_supported(iommu) &&
- !ecap_sc_support(iommu->ecap)) {
- ret = false;
- break;
- }
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static int domain_update_iommu_superpage(struct dmar_domain *domain,
- struct intel_iommu *skip)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- int mask = 0x3;
-
- if (!intel_iommu_superpage)
- return 0;
-
- /* set iommu_superpage to the smallest common denominator */
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (iommu != skip) {
- if (domain && domain_use_first_level(domain)) {
- if (!cap_fl1gp_support(iommu->cap))
- mask = 0x1;
- } else {
- mask &= cap_super_page_val(iommu->cap);
- }
-
- if (!mask)
- break;
- }
- }
- rcu_read_unlock();
-
- return fls(mask);
-}
-
-static int domain_update_device_node(struct dmar_domain *domain)
-{
- struct device_domain_info *info;
- int nid = NUMA_NO_NODE;
-
- assert_spin_locked(&device_domain_lock);
-
- if (list_empty(&domain->devices))
- return NUMA_NO_NODE;
-
- list_for_each_entry(info, &domain->devices, link) {
- if (!info->dev)
- continue;
-
- /*
- * There could possibly be multiple device numa nodes as devices
- * within the same domain may sit behind different IOMMUs. There
- * isn't perfect answer in such situation, so we select first
- * come first served policy.
- */
- nid = dev_to_node(info->dev);
- if (nid != NUMA_NO_NODE)
- break;
- }
-
- return nid;
-}
-
-static void domain_update_iotlb(struct dmar_domain *domain);
-
-/* Some capabilities may be different across iommus */
-static void domain_update_iommu_cap(struct dmar_domain *domain)
-{
- domain_update_iommu_coherency(domain);
- domain->iommu_snooping = domain_update_iommu_snooping(NULL);
- domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
-
- /*
- * If RHSA is missing, we should default to the device numa domain
- * as fall back.
- */
- if (domain->nid == NUMA_NO_NODE)
- domain->nid = domain_update_device_node(domain);
-
- /*
- * First-level translation restricts the input-address to a
- * canonical address (i.e., address bits 63:N have the same
- * value as address bit [N-1], where N is 48-bits with 4-level
- * paging and 57-bits with 5-level paging). Hence, skip bit
- * [N-1].
- */
- if (domain_use_first_level(domain))
- domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
- else
- domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
-
- domain_update_iotlb(domain);
-}
-
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
u8 devfn, int alloc)
{
@@ -772,6 +345,13 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
struct context_entry *context;
u64 *entry;
+ /*
+ * Except that the caller requested to allocate a new entry,
+ * returning a copied context entry makes no sense.
+ */
+ if (!alloc && context_copied(iommu, bus, devfn))
+ return NULL;
+
entry = &root->lo;
if (sm_supported(iommu)) {
if (devfn >= 0x80) {
@@ -787,7 +367,8 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
if (!alloc)
return NULL;
- context = alloc_pgtable_page(iommu->node);
+ context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC,
+ SZ_4K);
if (!context)
return NULL;
@@ -799,11 +380,6 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
return &context[devfn];
}
-static bool attach_deferred(struct device *dev)
-{
- return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
-}
-
/**
* is_downstream_to_pci_bridge - test if a device belongs to the PCI
* sub-hierarchy of a candidate PCI-PCI bridge
@@ -878,7 +454,7 @@ static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
return false;
}
-struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
+static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
struct dmar_drhd_unit *drhd = NULL;
struct pci_dev *pdev = NULL;
@@ -930,7 +506,7 @@ struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
}
if (pdev && drhd->include_all) {
- got_pdev:
+got_pdev:
if (bus && devfn) {
*bus = pdev->bus->number;
*devfn = pdev->devfn;
@@ -939,7 +515,7 @@ struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
}
}
iommu = NULL;
- out:
+out:
if (iommu_is_dummy(iommu, dev))
iommu = NULL;
@@ -948,369 +524,156 @@ struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
return iommu;
}
-static void domain_flush_cache(struct dmar_domain *domain,
- void *addr, int size)
-{
- if (!domain->iommu_coherency)
- clflush_cache_range(addr, size);
-}
-
-static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
-{
- struct context_entry *context;
- int ret = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&iommu->lock, flags);
- context = iommu_context_addr(iommu, bus, devfn, 0);
- if (context)
- ret = context_present(context);
- spin_unlock_irqrestore(&iommu->lock, flags);
- return ret;
-}
-
static void free_context_table(struct intel_iommu *iommu)
{
- int i;
- unsigned long flags;
struct context_entry *context;
+ int i;
+
+ if (!iommu->root_entry)
+ return;
- spin_lock_irqsave(&iommu->lock, flags);
- if (!iommu->root_entry) {
- goto out;
- }
for (i = 0; i < ROOT_ENTRY_NR; i++) {
context = iommu_context_addr(iommu, i, 0, 0);
if (context)
- free_pgtable_page(context);
+ iommu_free_pages(context);
if (!sm_supported(iommu))
continue;
context = iommu_context_addr(iommu, i, 0x80, 0);
if (context)
- free_pgtable_page(context);
-
+ iommu_free_pages(context);
}
- free_pgtable_page(iommu->root_entry);
+
+ iommu_free_pages(iommu->root_entry);
iommu->root_entry = NULL;
-out:
- spin_unlock_irqrestore(&iommu->lock, flags);
}
-static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
- unsigned long pfn, int *target_level)
+#ifdef CONFIG_DMAR_DEBUG
+static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
+ u8 bus, u8 devfn, struct dma_pte *parent, int level)
{
- struct dma_pte *parent, *pte;
- int level = agaw_to_level(domain->agaw);
+ struct dma_pte *pte;
int offset;
- BUG_ON(!domain->pgd);
-
- if (!domain_pfn_supported(domain, pfn))
- /* Address beyond IOMMU's addressing capabilities. */
- return NULL;
-
- parent = domain->pgd;
-
while (1) {
- void *tmp_page;
-
offset = pfn_level_offset(pfn, level);
pte = &parent[offset];
- if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
- break;
- if (level == *target_level)
- break;
-
- if (!dma_pte_present(pte)) {
- uint64_t pteval;
-
- tmp_page = alloc_pgtable_page(domain->nid);
-
- if (!tmp_page)
- return NULL;
-
- domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
- pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
- if (domain_use_first_level(domain)) {
- pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
- if (domain->domain.type == IOMMU_DOMAIN_DMA)
- pteval |= DMA_FL_PTE_ACCESS;
- }
- if (cmpxchg64(&pte->val, 0ULL, pteval))
- /* Someone else set it while we were thinking; use theirs. */
- free_pgtable_page(tmp_page);
- else
- domain_flush_cache(domain, pte, sizeof(*pte));
- }
- if (level == 1)
- break;
-
- parent = phys_to_virt(dma_pte_addr(pte));
- level--;
- }
-
- if (!*target_level)
- *target_level = level;
-
- return pte;
-}
-
-/* return address's pte at specific level */
-static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
- unsigned long pfn,
- int level, int *large_page)
-{
- struct dma_pte *parent, *pte;
- int total = agaw_to_level(domain->agaw);
- int offset;
- parent = domain->pgd;
- while (level <= total) {
- offset = pfn_level_offset(pfn, total);
- pte = &parent[offset];
- if (level == total)
- return pte;
+ pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
if (!dma_pte_present(pte)) {
- *large_page = total;
+ pr_info("page table not present at level %d\n", level - 1);
break;
}
- if (dma_pte_superpage(pte)) {
- *large_page = total;
- return pte;
- }
+ if (level == 1 || dma_pte_superpage(pte))
+ break;
parent = phys_to_virt(dma_pte_addr(pte));
- total--;
+ level--;
}
- return NULL;
}
-/* clear last level pte, a tlb flush should be followed */
-static void dma_pte_clear_range(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn)
+void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
+ unsigned long long addr, u32 pasid)
{
- unsigned int large_page;
- struct dma_pte *first_pte, *pte;
-
- BUG_ON(!domain_pfn_supported(domain, start_pfn));
- BUG_ON(!domain_pfn_supported(domain, last_pfn));
- BUG_ON(start_pfn > last_pfn);
-
- /* we don't need lock here; nobody else touches the iova range */
- do {
- large_page = 1;
- first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
- if (!pte) {
- start_pfn = align_to_level(start_pfn + 1, large_page + 1);
- continue;
- }
- do {
- dma_clear_pte(pte);
- start_pfn += lvl_to_nr_pages(large_page);
- pte++;
- } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
+ struct pasid_dir_entry *dir, *pde;
+ struct pasid_entry *entries, *pte;
+ struct context_entry *ctx_entry;
+ struct root_entry *rt_entry;
+ int i, dir_index, index, level;
+ u8 devfn = source_id & 0xff;
+ u8 bus = source_id >> 8;
+ struct dma_pte *pgtable;
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
-
- } while (start_pfn && start_pfn <= last_pfn);
-}
+ pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
-static void dma_pte_free_level(struct dmar_domain *domain, int level,
- int retain_level, struct dma_pte *pte,
- unsigned long pfn, unsigned long start_pfn,
- unsigned long last_pfn)
-{
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn;
- struct dma_pte *level_pte;
+ /* root entry dump */
+ if (!iommu->root_entry) {
+ pr_info("root table is not present\n");
+ return;
+ }
+ rt_entry = &iommu->root_entry[bus];
- if (!dma_pte_present(pte) || dma_pte_superpage(pte))
- goto next;
+ if (sm_supported(iommu))
+ pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
+ rt_entry->hi, rt_entry->lo);
+ else
+ pr_info("root entry: 0x%016llx", rt_entry->lo);
- level_pfn = pfn & level_mask(level);
- level_pte = phys_to_virt(dma_pte_addr(pte));
+ /* context entry dump */
+ ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
+ if (!ctx_entry) {
+ pr_info("context table is not present\n");
+ return;
+ }
- if (level > 2) {
- dma_pte_free_level(domain, level - 1, retain_level,
- level_pte, level_pfn, start_pfn,
- last_pfn);
- }
+ pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
+ ctx_entry->hi, ctx_entry->lo);
- /*
- * Free the page table if we're below the level we want to
- * retain and the range covers the entire table.
- */
- if (level < retain_level && !(start_pfn > level_pfn ||
- last_pfn < level_pfn + level_size(level) - 1)) {
- dma_clear_pte(pte);
- domain_flush_cache(domain, pte, sizeof(*pte));
- free_pgtable_page(level_pte);
+ /* legacy mode does not require PASID entries */
+ if (!sm_supported(iommu)) {
+ if (!context_present(ctx_entry)) {
+ pr_info("legacy mode page table is not present\n");
+ return;
}
-next:
- pfn += level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-}
-
-/*
- * clear last level (leaf) ptes and free page table pages below the
- * level we wish to keep intact.
- */
-static void dma_pte_free_pagetable(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn,
- int retain_level)
-{
- BUG_ON(!domain_pfn_supported(domain, start_pfn));
- BUG_ON(!domain_pfn_supported(domain, last_pfn));
- BUG_ON(start_pfn > last_pfn);
-
- dma_pte_clear_range(domain, start_pfn, last_pfn);
-
- /* We don't need lock here; nobody else touches the iova range */
- dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
- domain->pgd, 0, start_pfn, last_pfn);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- free_pgtable_page(domain->pgd);
- domain->pgd = NULL;
+ level = agaw_to_level(ctx_entry->hi & 7);
+ pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
+ goto pgtable_walk;
}
-}
-/* When a page at a given level is being unlinked from its parent, we don't
- need to *modify* it at all. All we need to do is make a list of all the
- pages which can be freed just as soon as we've flushed the IOTLB and we
- know the hardware page-walk will no longer touch them.
- The 'pte' argument is the *parent* PTE, pointing to the page that is to
- be freed. */
-static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
- int level, struct dma_pte *pte,
- struct page *freelist)
-{
- struct page *pg;
-
- pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
- pg->freelist = freelist;
- freelist = pg;
-
- if (level == 1)
- return freelist;
-
- pte = page_address(pg);
- do {
- if (dma_pte_present(pte) && !dma_pte_superpage(pte))
- freelist = dma_pte_list_pagetables(domain, level - 1,
- pte, freelist);
- pte++;
- } while (!first_pte_in_page(pte));
-
- return freelist;
-}
-
-static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
- struct dma_pte *pte, unsigned long pfn,
- unsigned long start_pfn,
- unsigned long last_pfn,
- struct page *freelist)
-{
- struct dma_pte *first_pte = NULL, *last_pte = NULL;
-
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn;
-
- if (!dma_pte_present(pte))
- goto next;
-
- level_pfn = pfn & level_mask(level);
-
- /* If range covers entire pagetable, free it */
- if (start_pfn <= level_pfn &&
- last_pfn >= level_pfn + level_size(level) - 1) {
- /* These suborbinate page tables are going away entirely. Don't
- bother to clear them; we're just going to *free* them. */
- if (level > 1 && !dma_pte_superpage(pte))
- freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
-
- dma_clear_pte(pte);
- if (!first_pte)
- first_pte = pte;
- last_pte = pte;
- } else if (level > 1) {
- /* Recurse down into a level that isn't *entirely* obsolete */
- freelist = dma_pte_clear_level(domain, level - 1,
- phys_to_virt(dma_pte_addr(pte)),
- level_pfn, start_pfn, last_pfn,
- freelist);
- }
-next:
- pfn += level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-
- if (first_pte)
- domain_flush_cache(domain, first_pte,
- (void *)++last_pte - (void *)first_pte);
-
- return freelist;
-}
+ if (!context_present(ctx_entry)) {
+ pr_info("pasid directory table is not present\n");
+ return;
+ }
-/* We can't just free the pages because the IOMMU may still be walking
- the page tables, and may have cached the intermediate levels. The
- pages can only be freed after the IOTLB flush has been done. */
-static struct page *domain_unmap(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn,
- struct page *freelist)
-{
- BUG_ON(!domain_pfn_supported(domain, start_pfn));
- BUG_ON(!domain_pfn_supported(domain, last_pfn));
- BUG_ON(start_pfn > last_pfn);
+ /* get the pointer to pasid directory entry */
+ dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
- /* we don't need lock here; nobody else touches the iova range */
- freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
- domain->pgd, 0, start_pfn, last_pfn,
- freelist);
+ /* For request-without-pasid, get the pasid from context entry */
+ if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
+ pasid = IOMMU_NO_PASID;
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- struct page *pgd_page = virt_to_page(domain->pgd);
- pgd_page->freelist = freelist;
- freelist = pgd_page;
+ dir_index = pasid >> PASID_PDE_SHIFT;
+ pde = &dir[dir_index];
+ pr_info("pasid dir entry: 0x%016llx\n", pde->val);
- domain->pgd = NULL;
+ /* get the pointer to the pasid table entry */
+ entries = get_pasid_table_from_pde(pde);
+ if (!entries) {
+ pr_info("pasid table is not present\n");
+ return;
}
+ index = pasid & PASID_PTE_MASK;
+ pte = &entries[index];
+ for (i = 0; i < ARRAY_SIZE(pte->val); i++)
+ pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
- return freelist;
-}
-
-static void dma_free_pagelist(struct page *freelist)
-{
- struct page *pg;
+ if (!pasid_pte_is_present(pte)) {
+ pr_info("scalable mode page table is not present\n");
+ return;
+ }
- while ((pg = freelist)) {
- freelist = pg->freelist;
- free_pgtable_page(page_address(pg));
+ if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
+ level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
+ pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
+ } else {
+ level = agaw_to_level((pte->val[0] >> 2) & 0x7);
+ pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
}
+
+pgtable_walk:
+ pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
}
+#endif
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
struct root_entry *root;
- unsigned long flags;
- root = (struct root_entry *)alloc_pgtable_page(iommu->node);
+ root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K);
if (!root) {
pr_err("Allocating root entry for %s failed\n",
iommu->name);
@@ -1318,10 +681,7 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu)
}
__iommu_flush_cache(iommu, root, ROOT_SIZE);
-
- spin_lock_irqsave(&iommu->lock, flags);
iommu->root_entry = root;
- spin_unlock_irqrestore(&iommu->lock, flags);
return 0;
}
@@ -1347,6 +707,13 @@ static void iommu_set_root_entry(struct intel_iommu *iommu)
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+ /*
+ * Hardware invalidates all DMA remapping hardware translation
+ * caches as part of SRTP flow.
+ */
+ if (cap_esrtps(iommu->cap))
+ return;
+
iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
if (sm_supported(iommu))
qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
@@ -1391,7 +758,9 @@ static void __iommu_flush_context(struct intel_iommu *iommu,
| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
break;
default:
- BUG();
+ pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
+ iommu->name, type);
+ return;
}
val |= DMA_CCMD_ICC;
@@ -1405,9 +774,8 @@ static void __iommu_flush_context(struct intel_iommu *iommu,
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
-/* return value determine if we need a write buffer flush */
-static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
- u64 addr, unsigned int size_order, u64 type)
+void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+ unsigned int size_order, u64 type)
{
int tlb_offset = ecap_iotlb_offset(iommu->ecap);
u64 val = 0, val_iva = 0;
@@ -1427,17 +795,11 @@ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
val_iva = size_order | addr;
break;
default:
- BUG();
+ pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
+ iommu->name, type);
+ return;
}
- /* Note: set drain read/write */
-#if 0
- /*
- * This is probably to be super secure.. Looks like we can
- * ignore it without any impact.
- */
- if (cap_read_drain(iommu->cap))
- val |= DMA_TLB_READ_DRAIN;
-#endif
+
if (cap_write_drain(iommu->cap))
val |= DMA_TLB_WRITE_DRAIN;
@@ -1463,252 +825,101 @@ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
}
static struct device_domain_info *
-iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
- u8 bus, u8 devfn)
+domain_lookup_dev_info(struct dmar_domain *domain,
+ struct intel_iommu *iommu, u8 bus, u8 devfn)
{
struct device_domain_info *info;
+ unsigned long flags;
- assert_spin_locked(&device_domain_lock);
-
- if (!iommu->qi)
- return NULL;
-
- list_for_each_entry(info, &domain->devices, link)
+ spin_lock_irqsave(&domain->lock, flags);
+ list_for_each_entry(info, &domain->devices, link) {
if (info->iommu == iommu && info->bus == bus &&
info->devfn == devfn) {
- if (info->ats_supported && info->dev)
- return info;
- break;
+ spin_unlock_irqrestore(&domain->lock, flags);
+ return info;
}
+ }
+ spin_unlock_irqrestore(&domain->lock, flags);
return NULL;
}
-static void domain_update_iotlb(struct dmar_domain *domain)
+/*
+ * The extra devTLB flush quirk impacts those QAT devices with PCI device
+ * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
+ * check because it applies only to the built-in QAT devices and it doesn't
+ * grant additional privileges.
+ */
+#define BUGGY_QAT_DEVID_MASK 0x4940
+static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
{
- struct device_domain_info *info;
- bool has_iotlb_device = false;
-
- assert_spin_locked(&device_domain_lock);
-
- list_for_each_entry(info, &domain->devices, link)
- if (info->ats_enabled) {
- has_iotlb_device = true;
- break;
- }
-
- if (!has_iotlb_device) {
- struct subdev_domain_info *sinfo;
+ if (pdev->vendor != PCI_VENDOR_ID_INTEL)
+ return false;
- list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
- info = get_domain_info(sinfo->pdev);
- if (info && info->ats_enabled) {
- has_iotlb_device = true;
- break;
- }
- }
- }
+ if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
+ return false;
- domain->has_iotlb_device = has_iotlb_device;
+ return true;
}
-static void iommu_enable_dev_iotlb(struct device_domain_info *info)
+static void iommu_enable_pci_ats(struct device_domain_info *info)
{
struct pci_dev *pdev;
- assert_spin_locked(&device_domain_lock);
-
- if (!info || !dev_is_pci(info->dev))
+ if (!info->ats_supported)
return;
pdev = to_pci_dev(info->dev);
- /* For IOMMU that supports device IOTLB throttling (DIT), we assign
- * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
- * queue depth at PF level. If DIT is not set, PFSID will be treated as
- * reserved, which should be set to 0.
- */
- if (!ecap_dit(info->iommu->ecap))
- info->pfsid = 0;
- else {
- struct pci_dev *pf_pdev;
-
- /* pdev will be returned if device is not a vf */
- pf_pdev = pci_physfn(pdev);
- info->pfsid = pci_dev_id(pf_pdev);
- }
-
-#ifdef CONFIG_INTEL_IOMMU_SVM
- /* The PCIe spec, in its wisdom, declares that the behaviour of
- the device if you enable PASID support after ATS support is
- undefined. So always enable PASID support on devices which
- have it, even if we can't yet know if we're ever going to
- use it. */
- if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
- info->pasid_enabled = 1;
+ if (!pci_ats_page_aligned(pdev))
+ return;
- if (info->pri_supported &&
- (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
- !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
- info->pri_enabled = 1;
-#endif
- if (info->ats_supported && pci_ats_page_aligned(pdev) &&
- !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
+ if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT))
info->ats_enabled = 1;
- domain_update_iotlb(info->domain);
- info->ats_qdep = pci_ats_queue_depth(pdev);
- }
}
-static void iommu_disable_dev_iotlb(struct device_domain_info *info)
+static void iommu_disable_pci_ats(struct device_domain_info *info)
{
- struct pci_dev *pdev;
-
- assert_spin_locked(&device_domain_lock);
-
- if (!dev_is_pci(info->dev))
+ if (!info->ats_enabled)
return;
- pdev = to_pci_dev(info->dev);
-
- if (info->ats_enabled) {
- pci_disable_ats(pdev);
- info->ats_enabled = 0;
- domain_update_iotlb(info->domain);
- }
-#ifdef CONFIG_INTEL_IOMMU_SVM
- if (info->pri_enabled) {
- pci_disable_pri(pdev);
- info->pri_enabled = 0;
- }
- if (info->pasid_enabled) {
- pci_disable_pasid(pdev);
- info->pasid_enabled = 0;
- }
-#endif
+ pci_disable_ats(to_pci_dev(info->dev));
+ info->ats_enabled = 0;
}
-static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
- u64 addr, unsigned int mask)
+static void iommu_enable_pci_pri(struct device_domain_info *info)
{
- u16 sid, qdep;
+ struct pci_dev *pdev;
- if (!info || !info->ats_enabled)
+ if (!info->ats_enabled || !info->pri_supported)
return;
- sid = info->bus << 8 | info->devfn;
- qdep = info->ats_qdep;
- qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
- qdep, addr, mask);
-}
-
-static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
- u64 addr, unsigned mask)
-{
- unsigned long flags;
- struct device_domain_info *info;
- struct subdev_domain_info *sinfo;
-
- if (!domain->has_iotlb_device)
+ pdev = to_pci_dev(info->dev);
+ /* PASID is required in PRG Response Message. */
+ if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
return;
- spin_lock_irqsave(&device_domain_lock, flags);
- list_for_each_entry(info, &domain->devices, link)
- __iommu_flush_dev_iotlb(info, addr, mask);
-
- list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
- info = get_domain_info(sinfo->pdev);
- __iommu_flush_dev_iotlb(info, addr, mask);
- }
- spin_unlock_irqrestore(&device_domain_lock, flags);
-}
-
-static void domain_flush_piotlb(struct intel_iommu *iommu,
- struct dmar_domain *domain,
- u64 addr, unsigned long npages, bool ih)
-{
- u16 did = domain->iommu_did[iommu->seq_id];
-
- if (domain->default_pasid)
- qi_flush_piotlb(iommu, did, domain->default_pasid,
- addr, npages, ih);
+ if (pci_reset_pri(pdev))
+ return;
- if (!list_empty(&domain->devices))
- qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
+ if (!pci_enable_pri(pdev, PRQ_DEPTH))
+ info->pri_enabled = 1;
}
-static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
- struct dmar_domain *domain,
- unsigned long pfn, unsigned int pages,
- int ih, int map)
+static void iommu_disable_pci_pri(struct device_domain_info *info)
{
- unsigned int mask = ilog2(__roundup_pow_of_two(pages));
- uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
- u16 did = domain->iommu_did[iommu->seq_id];
-
- BUG_ON(pages == 0);
-
- if (ih)
- ih = 1 << 6;
-
- if (domain_use_first_level(domain)) {
- domain_flush_piotlb(iommu, domain, addr, pages, ih);
- } else {
- /*
- * Fallback to domain selective flush if no PSI support or
- * the size is too big. PSI requires page size to be 2 ^ x,
- * and the base address is naturally aligned to the size.
- */
- if (!cap_pgsel_inv(iommu->cap) ||
- mask > cap_max_amask_val(iommu->cap))
- iommu->flush.flush_iotlb(iommu, did, 0, 0,
- DMA_TLB_DSI_FLUSH);
- else
- iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
- DMA_TLB_PSI_FLUSH);
- }
+ if (!info->pri_enabled)
+ return;
- /*
- * In caching mode, changes of pages from non-present to present require
- * flush. However, device IOTLB doesn't need to be flushed in this case.
- */
- if (!cap_caching_mode(iommu->cap) || !map)
- iommu_flush_dev_iotlb(domain, addr, mask);
-}
+ if (WARN_ON(info->iopf_refcount))
+ iopf_queue_remove_device(info->iommu->iopf_queue, info->dev);
-/* Notification for newly created mappings */
-static inline void __mapping_notify_one(struct intel_iommu *iommu,
- struct dmar_domain *domain,
- unsigned long pfn, unsigned int pages)
-{
- /*
- * It's a non-present to present mapping. Only flush if caching mode
- * and second level.
- */
- if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
- iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
- else
- iommu_flush_write_buffer(iommu);
+ pci_disable_pri(to_pci_dev(info->dev));
+ info->pri_enabled = 0;
}
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- int idx;
-
- for_each_domain_iommu(idx, dmar_domain) {
- struct intel_iommu *iommu = g_iommus[idx];
- u16 did = dmar_domain->iommu_did[iommu->seq_id];
-
- if (domain_use_first_level(dmar_domain))
- domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
- else
- iommu->flush.flush_iotlb(iommu, did, 0, 0,
- DMA_TLB_DSI_FLUSH);
-
- if (!cap_caching_mode(iommu->cap))
- iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
- 0, MAX_AGAW_PFN_WIDTH);
- }
+ cache_tag_flush_all(to_dmar_domain(domain));
}
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
@@ -1767,1039 +978,381 @@ static void iommu_disable_translation(struct intel_iommu *iommu)
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
-static int iommu_init_domains(struct intel_iommu *iommu)
+static void disable_dmar_iommu(struct intel_iommu *iommu)
{
- u32 ndomains, nlongs;
- size_t size;
-
- ndomains = cap_ndoms(iommu->cap);
- pr_debug("%s: Number of Domains supported <%d>\n",
- iommu->name, ndomains);
- nlongs = BITS_TO_LONGS(ndomains);
-
- spin_lock_init(&iommu->lock);
-
- iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
- if (!iommu->domain_ids) {
- pr_err("%s: Allocating domain id array failed\n",
- iommu->name);
- return -ENOMEM;
- }
-
- size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
- iommu->domains = kzalloc(size, GFP_KERNEL);
-
- if (iommu->domains) {
- size = 256 * sizeof(struct dmar_domain *);
- iommu->domains[0] = kzalloc(size, GFP_KERNEL);
- }
-
- if (!iommu->domains || !iommu->domains[0]) {
- pr_err("%s: Allocating domain array failed\n",
- iommu->name);
- kfree(iommu->domain_ids);
- kfree(iommu->domains);
- iommu->domain_ids = NULL;
- iommu->domains = NULL;
- return -ENOMEM;
- }
-
- /*
- * If Caching mode is set, then invalid translations are tagged
- * with domain-id 0, hence we need to pre-allocate it. We also
- * use domain-id 0 as a marker for non-allocated domain-id, so
- * make sure it is not used for a real domain.
- */
- set_bit(0, iommu->domain_ids);
-
/*
- * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
- * entry for first-level or pass-through translation modes should
- * be programmed with a domain id different from those used for
- * second-level or nested translation. We reserve a domain id for
- * this purpose.
+ * All iommu domains must have been detached from the devices,
+ * hence there should be no domain IDs in use.
*/
- if (sm_supported(iommu))
- set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
-
- return 0;
-}
-
-static void disable_dmar_iommu(struct intel_iommu *iommu)
-{
- struct device_domain_info *info, *tmp;
- unsigned long flags;
-
- if (!iommu->domains || !iommu->domain_ids)
+ if (WARN_ON(!ida_is_empty(&iommu->domain_ida)))
return;
- spin_lock_irqsave(&device_domain_lock, flags);
- list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
- if (info->iommu != iommu)
- continue;
-
- if (!info->dev || !info->domain)
- continue;
-
- __dmar_remove_one_dev_info(info);
- }
- spin_unlock_irqrestore(&device_domain_lock, flags);
-
if (iommu->gcmd & DMA_GCMD_TE)
iommu_disable_translation(iommu);
}
static void free_dmar_iommu(struct intel_iommu *iommu)
{
- if ((iommu->domains) && (iommu->domain_ids)) {
- int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
- int i;
-
- for (i = 0; i < elems; i++)
- kfree(iommu->domains[i]);
- kfree(iommu->domains);
- kfree(iommu->domain_ids);
- iommu->domains = NULL;
- iommu->domain_ids = NULL;
+ if (iommu->copied_tables) {
+ bitmap_free(iommu->copied_tables);
+ iommu->copied_tables = NULL;
}
- g_iommus[iommu->seq_id] = NULL;
-
/* free context mapping */
free_context_table(iommu);
-#ifdef CONFIG_INTEL_IOMMU_SVM
- if (pasid_supported(iommu)) {
- if (ecap_prs(iommu->ecap))
- intel_svm_finish_prq(iommu);
- }
- if (vccap_pasid(iommu->vccap))
- ioasid_unregister_allocator(&iommu->pasid_allocator);
-
-#endif
+ if (ecap_prs(iommu->ecap))
+ intel_iommu_finish_prq(iommu);
}
/*
* Check and return whether first level is used by default for
* DMA translation.
*/
-static bool first_level_by_default(void)
-{
- return scalable_mode_support() && intel_cap_flts_sanity();
-}
-
-static struct dmar_domain *alloc_domain(int flags)
+static bool first_level_by_default(struct intel_iommu *iommu)
{
- struct dmar_domain *domain;
-
- domain = alloc_domain_mem();
- if (!domain)
- return NULL;
+ /* Only SL is available in legacy mode */
+ if (!sm_supported(iommu))
+ return false;
- memset(domain, 0, sizeof(*domain));
- domain->nid = NUMA_NO_NODE;
- domain->flags = flags;
- if (first_level_by_default())
- domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
- domain->has_iotlb_device = false;
- INIT_LIST_HEAD(&domain->devices);
- INIT_LIST_HEAD(&domain->subdevices);
+ /* Only level (either FL or SL) is available, just use it */
+ if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
+ return ecap_flts(iommu->ecap);
- return domain;
+ return true;
}
-/* Must be called with iommu->lock */
-static int domain_attach_iommu(struct dmar_domain *domain,
- struct intel_iommu *iommu)
+int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
- unsigned long ndomains;
- int num;
-
- assert_spin_locked(&device_domain_lock);
- assert_spin_locked(&iommu->lock);
-
- domain->iommu_refcnt[iommu->seq_id] += 1;
- if (domain->iommu_refcnt[iommu->seq_id] == 1) {
- ndomains = cap_ndoms(iommu->cap);
- num = find_first_zero_bit(iommu->domain_ids, ndomains);
-
- if (num >= ndomains) {
- pr_err("%s: No free domain ids\n", iommu->name);
- domain->iommu_refcnt[iommu->seq_id] -= 1;
- return -ENOSPC;
- }
+ struct iommu_domain_info *info, *curr;
+ int num, ret = -ENOSPC;
- set_bit(num, iommu->domain_ids);
- set_iommu_domain(iommu, num, domain);
+ if (domain->domain.type == IOMMU_DOMAIN_SVA)
+ return 0;
- domain->iommu_did[iommu->seq_id] = num;
- domain->nid = iommu->node;
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
- domain_update_iommu_cap(domain);
+ guard(mutex)(&iommu->did_lock);
+ curr = xa_load(&domain->iommu_array, iommu->seq_id);
+ if (curr) {
+ curr->refcnt++;
+ kfree(info);
+ return 0;
}
- return 0;
-}
-
-static void domain_detach_iommu(struct dmar_domain *domain,
- struct intel_iommu *iommu)
-{
- int num;
-
- assert_spin_locked(&device_domain_lock);
- assert_spin_locked(&iommu->lock);
-
- domain->iommu_refcnt[iommu->seq_id] -= 1;
- if (domain->iommu_refcnt[iommu->seq_id] == 0) {
- num = domain->iommu_did[iommu->seq_id];
- clear_bit(num, iommu->domain_ids);
- set_iommu_domain(iommu, num, NULL);
+ num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID,
+ cap_ndoms(iommu->cap) - 1, GFP_KERNEL);
+ if (num < 0) {
+ pr_err("%s: No free domain ids\n", iommu->name);
+ goto err_unlock;
+ }
- domain_update_iommu_cap(domain);
- domain->iommu_did[iommu->seq_id] = 0;
+ info->refcnt = 1;
+ info->did = num;
+ info->iommu = iommu;
+ curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
+ NULL, info, GFP_KERNEL);
+ if (curr) {
+ ret = xa_err(curr) ? : -EBUSY;
+ goto err_clear;
}
-}
-static inline int guestwidth_to_adjustwidth(int gaw)
-{
- int agaw;
- int r = (gaw - 12) % 9;
+ return 0;
- if (r == 0)
- agaw = gaw;
- else
- agaw = gaw + 9 - r;
- if (agaw > 64)
- agaw = 64;
- return agaw;
+err_clear:
+ ida_free(&iommu->domain_ida, info->did);
+err_unlock:
+ kfree(info);
+ return ret;
}
-static void domain_exit(struct dmar_domain *domain)
+void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
+ struct iommu_domain_info *info;
- /* Remove associated devices and clear attached or cached domains */
- domain_remove_dev_info(domain);
-
- /* destroy iovas */
- if (domain->domain.type == IOMMU_DOMAIN_DMA)
- iommu_put_dma_cookie(&domain->domain);
-
- if (domain->pgd) {
- struct page *freelist;
+ if (domain->domain.type == IOMMU_DOMAIN_SVA)
+ return;
- freelist = domain_unmap(domain, 0,
- DOMAIN_MAX_PFN(domain->gaw), NULL);
- dma_free_pagelist(freelist);
+ guard(mutex)(&iommu->did_lock);
+ info = xa_load(&domain->iommu_array, iommu->seq_id);
+ if (--info->refcnt == 0) {
+ ida_free(&iommu->domain_ida, info->did);
+ xa_erase(&domain->iommu_array, iommu->seq_id);
+ kfree(info);
}
-
- free_domain_mem(domain);
}
/*
- * Get the PASID directory size for scalable mode context entry.
- * Value of X in the PDTS field of a scalable mode context entry
- * indicates PASID directory with 2^(X + 7) entries.
+ * For kdump cases, old valid entries may be cached due to the
+ * in-flight DMA and copied pgtable, but there is no unmapping
+ * behaviour for them, thus we need an explicit cache flush for
+ * the newly-mapped device. For kdump, at this point, the device
+ * is supposed to finish reset at its driver probe stage, so no
+ * in-flight DMA will exist, and we don't need to worry anymore
+ * hereafter.
*/
-static inline unsigned long context_get_sm_pds(struct pasid_table *table)
+static void copied_context_tear_down(struct intel_iommu *iommu,
+ struct context_entry *context,
+ u8 bus, u8 devfn)
{
- int pds, max_pde;
+ u16 did_old;
- max_pde = table->max_pasid >> PASID_PDE_SHIFT;
- pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
- if (pds < 7)
- return 0;
+ if (!context_copied(iommu, bus, devfn))
+ return;
- return pds - 7;
-}
+ assert_spin_locked(&iommu->lock);
-/*
- * Set the RID_PASID field of a scalable mode context entry. The
- * IOMMU hardware will use the PASID value set in this field for
- * DMA translations of DMA requests without PASID.
- */
-static inline void
-context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
-{
- context->hi |= pasid & ((1 << 20) - 1);
-}
+ did_old = context_domain_id(context);
+ context_clear_entry(context);
-/*
- * Set the DTE(Device-TLB Enable) field of a scalable mode context
- * entry.
- */
-static inline void context_set_sm_dte(struct context_entry *context)
-{
- context->lo |= (1 << 2);
+ if (did_old < cap_ndoms(iommu->cap)) {
+ iommu->flush.flush_context(iommu, did_old,
+ PCI_DEVID(bus, devfn),
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
+ DMA_TLB_DSI_FLUSH);
+ }
+
+ clear_context_copied(iommu, bus, devfn);
}
/*
- * Set the PRE(Page Request Enable) field of a scalable mode context
- * entry.
+ * It's a non-present to present mapping. If hardware doesn't cache
+ * non-present entry we only need to flush the write-buffer. If the
+ * _does_ cache non-present entries, then it does so in the special
+ * domain #0, which we have to flush:
*/
-static inline void context_set_sm_pre(struct context_entry *context)
+static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
+ u8 bus, u8 devfn)
{
- context->lo |= (1 << 4);
+ if (cap_caching_mode(iommu->cap)) {
+ iommu->flush.flush_context(iommu, 0,
+ PCI_DEVID(bus, devfn),
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+ } else {
+ iommu_flush_write_buffer(iommu);
+ }
}
-/* Convert value to context PASID directory size field coding. */
-#define context_pdts(pds) (((pds) & 0x7) << 9)
-
static int domain_context_mapping_one(struct dmar_domain *domain,
struct intel_iommu *iommu,
- struct pasid_table *table,
u8 bus, u8 devfn)
{
- u16 did = domain->iommu_did[iommu->seq_id];
+ struct device_domain_info *info =
+ domain_lookup_dev_info(domain, iommu, bus, devfn);
+ u16 did = domain_id_iommu(domain, iommu);
int translation = CONTEXT_TT_MULTI_LEVEL;
- struct device_domain_info *info = NULL;
+ struct pt_iommu_vtdss_hw_info pt_info;
struct context_entry *context;
- unsigned long flags;
int ret;
- WARN_ON(did == 0);
+ if (WARN_ON(!intel_domain_is_ss_paging(domain)))
+ return -EINVAL;
- if (hw_pass_through && domain_type_is_si(domain))
- translation = CONTEXT_TT_PASS_THROUGH;
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
pr_debug("Set context mapping for %02x:%02x.%d\n",
bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
- BUG_ON(!domain->pgd);
-
- spin_lock_irqsave(&device_domain_lock, flags);
spin_lock(&iommu->lock);
-
ret = -ENOMEM;
context = iommu_context_addr(iommu, bus, devfn, 1);
if (!context)
goto out_unlock;
ret = 0;
- if (context_present(context))
+ if (context_present(context) && !context_copied(iommu, bus, devfn))
goto out_unlock;
- /*
- * For kdump cases, old valid entries may be cached due to the
- * in-flight DMA and copied pgtable, but there is no unmapping
- * behaviour for them, thus we need an explicit cache flush for
- * the newly-mapped device. For kdump, at this point, the device
- * is supposed to finish reset at its driver probe stage, so no
- * in-flight DMA will exist, and we don't need to worry anymore
- * hereafter.
- */
- if (context_copied(context)) {
- u16 did_old = context_domain_id(context);
-
- if (did_old < cap_ndoms(iommu->cap)) {
- iommu->flush.flush_context(iommu, did_old,
- (((u16)bus) << 8) | devfn,
- DMA_CCMD_MASK_NOBIT,
- DMA_CCMD_DEVICE_INVL);
- iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
- DMA_TLB_DSI_FLUSH);
- }
- }
-
+ copied_context_tear_down(iommu, context, bus, devfn);
context_clear_entry(context);
+ context_set_domain_id(context, did);
- if (sm_supported(iommu)) {
- unsigned long pds;
-
- WARN_ON(!table);
-
- /* Setup the PASID DIR pointer: */
- pds = context_get_sm_pds(table);
- context->lo = (u64)virt_to_phys(table->table) |
- context_pdts(pds);
-
- /* Setup the RID_PASID field: */
- context_set_sm_rid2pasid(context, PASID_RID2PASID);
-
- /*
- * Setup the Device-TLB enable bit and Page request
- * Enable bit:
- */
- info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
- if (info && info->ats_supported)
- context_set_sm_dte(context);
- if (info && info->pri_supported)
- context_set_sm_pre(context);
- } else {
- struct dma_pte *pgd = domain->pgd;
- int agaw;
-
- context_set_domain_id(context, did);
-
- if (translation != CONTEXT_TT_PASS_THROUGH) {
- /*
- * Skip top levels of page tables for iommu which has
- * less agaw than default. Unnecessary for PT mode.
- */
- for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
- ret = -ENOMEM;
- pgd = phys_to_virt(dma_pte_addr(pgd));
- if (!dma_pte_present(pgd))
- goto out_unlock;
- }
-
- info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
- if (info && info->ats_supported)
- translation = CONTEXT_TT_DEV_IOTLB;
- else
- translation = CONTEXT_TT_MULTI_LEVEL;
-
- context_set_address_root(context, virt_to_phys(pgd));
- context_set_address_width(context, agaw);
- } else {
- /*
- * In pass through mode, AW must be programmed to
- * indicate the largest AGAW value supported by
- * hardware. And ASR is ignored by hardware.
- */
- context_set_address_width(context, iommu->msagaw);
- }
-
- context_set_translation_type(context, translation);
- }
+ if (info && info->ats_supported)
+ translation = CONTEXT_TT_DEV_IOTLB;
+ else
+ translation = CONTEXT_TT_MULTI_LEVEL;
+ context_set_address_root(context, pt_info.ssptptr);
+ context_set_address_width(context, pt_info.aw);
+ context_set_translation_type(context, translation);
context_set_fault_enable(context);
context_set_present(context);
if (!ecap_coherent(iommu->ecap))
clflush_cache_range(context, sizeof(*context));
-
- /*
- * It's a non-present to present mapping. If hardware doesn't cache
- * non-present entry we only need to flush the write-buffer. If the
- * _does_ cache non-present entries, then it does so in the special
- * domain #0, which we have to flush:
- */
- if (cap_caching_mode(iommu->cap)) {
- iommu->flush.flush_context(iommu, 0,
- (((u16)bus) << 8) | devfn,
- DMA_CCMD_MASK_NOBIT,
- DMA_CCMD_DEVICE_INVL);
- iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
- } else {
- iommu_flush_write_buffer(iommu);
- }
- iommu_enable_dev_iotlb(info);
-
+ context_present_cache_flush(iommu, did, bus, devfn);
ret = 0;
out_unlock:
spin_unlock(&iommu->lock);
- spin_unlock_irqrestore(&device_domain_lock, flags);
return ret;
}
-struct domain_context_mapping_data {
- struct dmar_domain *domain;
- struct intel_iommu *iommu;
- struct pasid_table *table;
-};
-
static int domain_context_mapping_cb(struct pci_dev *pdev,
u16 alias, void *opaque)
{
- struct domain_context_mapping_data *data = opaque;
+ struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
+ struct intel_iommu *iommu = info->iommu;
+ struct dmar_domain *domain = opaque;
- return domain_context_mapping_one(data->domain, data->iommu,
- data->table, PCI_BUS_NUM(alias),
- alias & 0xff);
+ return domain_context_mapping_one(domain, iommu,
+ PCI_BUS_NUM(alias), alias & 0xff);
}
static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev)
{
- struct domain_context_mapping_data data;
- struct pasid_table *table;
- struct intel_iommu *iommu;
- u8 bus, devfn;
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu)
- return -ENODEV;
-
- table = intel_pasid_get_table(dev);
-
- if (!dev_is_pci(dev))
- return domain_context_mapping_one(domain, iommu, table,
- bus, devfn);
-
- data.domain = domain;
- data.iommu = iommu;
- data.table = table;
-
- return pci_for_each_dma_alias(to_pci_dev(dev),
- &domain_context_mapping_cb, &data);
-}
-
-static int domain_context_mapped_cb(struct pci_dev *pdev,
- u16 alias, void *opaque)
-{
- struct intel_iommu *iommu = opaque;
-
- return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
-}
-
-static int domain_context_mapped(struct device *dev)
-{
- struct intel_iommu *iommu;
- u8 bus, devfn;
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu)
- return -ENODEV;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
+ u8 bus = info->bus, devfn = info->devfn;
+ int ret;
if (!dev_is_pci(dev))
- return device_context_mapped(iommu, bus, devfn);
-
- return !pci_for_each_dma_alias(to_pci_dev(dev),
- domain_context_mapped_cb, iommu);
-}
-
-/* Returns a number of VTD pages, but aligned to MM page size */
-static inline unsigned long aligned_nrpages(unsigned long host_addr,
- size_t size)
-{
- host_addr &= ~PAGE_MASK;
- return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
-}
-
-/* Return largest possible superpage level for a given mapping */
-static inline int hardware_largepage_caps(struct dmar_domain *domain,
- unsigned long iov_pfn,
- unsigned long phy_pfn,
- unsigned long pages)
-{
- int support, level = 1;
- unsigned long pfnmerge;
-
- support = domain->iommu_superpage;
-
- /* To use a large page, the virtual *and* physical addresses
- must be aligned to 2MiB/1GiB/etc. Lower bits set in either
- of them will mean we have to use smaller pages. So just
- merge them and check both at once. */
- pfnmerge = iov_pfn | phy_pfn;
-
- while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
- pages >>= VTD_STRIDE_SHIFT;
- if (!pages)
- break;
- pfnmerge >>= VTD_STRIDE_SHIFT;
- level++;
- support--;
- }
- return level;
-}
-
-/*
- * Ensure that old small page tables are removed to make room for superpage(s).
- * We're going to add new large pages, so make sure we don't remove their parent
- * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
- */
-static void switch_to_super_page(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long end_pfn, int level)
-{
- unsigned long lvl_pages = lvl_to_nr_pages(level);
- struct dma_pte *pte = NULL;
- int i;
-
- while (start_pfn <= end_pfn) {
- if (!pte)
- pte = pfn_to_dma_pte(domain, start_pfn, &level);
-
- if (dma_pte_present(pte)) {
- dma_pte_free_pagetable(domain, start_pfn,
- start_pfn + lvl_pages - 1,
- level + 1);
-
- for_each_domain_iommu(i, domain)
- iommu_flush_iotlb_psi(g_iommus[i], domain,
- start_pfn, lvl_pages,
- 0, 0);
- }
-
- pte++;
- start_pfn += lvl_pages;
- if (first_pte_in_page(pte))
- pte = NULL;
- }
-}
-
-static int
-__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
- unsigned long phys_pfn, unsigned long nr_pages, int prot)
-{
- unsigned int largepage_lvl = 0;
- unsigned long lvl_pages = 0;
- struct dma_pte *pte = NULL;
- phys_addr_t pteval;
- u64 attr;
-
- BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
-
- if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
- return -EINVAL;
-
- attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
- attr |= DMA_FL_PTE_PRESENT;
- if (domain_use_first_level(domain)) {
- attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
-
- if (domain->domain.type == IOMMU_DOMAIN_DMA) {
- attr |= DMA_FL_PTE_ACCESS;
- if (prot & DMA_PTE_WRITE)
- attr |= DMA_FL_PTE_DIRTY;
- }
- }
-
- pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
-
- while (nr_pages > 0) {
- uint64_t tmp;
-
- if (!pte) {
- largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
- phys_pfn, nr_pages);
+ return domain_context_mapping_one(domain, iommu, bus, devfn);
- pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
- if (!pte)
- return -ENOMEM;
- /* It is large page*/
- if (largepage_lvl > 1) {
- unsigned long end_pfn;
-
- pteval |= DMA_PTE_LARGE_PAGE;
- end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1;
- switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
- } else {
- pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
- }
-
- }
- /* We don't need lock here, nobody else
- * touches the iova range
- */
- tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
- if (tmp) {
- static int dumps = 5;
- pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
- iov_pfn, tmp, (unsigned long long)pteval);
- if (dumps) {
- dumps--;
- debug_dma_dump_mappings(NULL);
- }
- WARN_ON(1);
- }
+ ret = pci_for_each_dma_alias(to_pci_dev(dev),
+ domain_context_mapping_cb, domain);
+ if (ret)
+ return ret;
- lvl_pages = lvl_to_nr_pages(largepage_lvl);
-
- BUG_ON(nr_pages < lvl_pages);
-
- nr_pages -= lvl_pages;
- iov_pfn += lvl_pages;
- phys_pfn += lvl_pages;
- pteval += lvl_pages * VTD_PAGE_SIZE;
-
- /* If the next PTE would be the first in a new page, then we
- * need to flush the cache on the entries we've just written.
- * And then we'll need to recalculate 'pte', so clear it and
- * let it get set again in the if (!pte) block above.
- *
- * If we're done (!nr_pages) we need to flush the cache too.
- *
- * Also if we've been setting superpages, we may need to
- * recalculate 'pte' and switch back to smaller pages for the
- * end of the mapping, if the trailing size is not enough to
- * use another superpage (i.e. nr_pages < lvl_pages).
- *
- * We leave clflush for the leaf pte changes to iotlb_sync_map()
- * callback.
- */
- pte++;
- if (!nr_pages || first_pte_in_page(pte) ||
- (largepage_lvl > 1 && nr_pages < lvl_pages))
- pte = NULL;
- }
+ iommu_enable_pci_ats(info);
return 0;
}
-static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
+static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
{
- unsigned long flags;
+ struct intel_iommu *iommu = info->iommu;
struct context_entry *context;
- u16 did_old;
-
- if (!iommu)
- return;
+ u16 did;
- spin_lock_irqsave(&iommu->lock, flags);
+ spin_lock(&iommu->lock);
context = iommu_context_addr(iommu, bus, devfn, 0);
if (!context) {
- spin_unlock_irqrestore(&iommu->lock, flags);
+ spin_unlock(&iommu->lock);
return;
}
- did_old = context_domain_id(context);
+
+ did = context_domain_id(context);
context_clear_entry(context);
__iommu_flush_cache(iommu, context, sizeof(*context));
- spin_unlock_irqrestore(&iommu->lock, flags);
- iommu->flush.flush_context(iommu,
- did_old,
- (((u16)bus) << 8) | devfn,
- DMA_CCMD_MASK_NOBIT,
- DMA_CCMD_DEVICE_INVL);
-
- if (sm_supported(iommu))
- qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
-
- iommu->flush.flush_iotlb(iommu,
- did_old,
- 0,
- 0,
- DMA_TLB_DSI_FLUSH);
+ spin_unlock(&iommu->lock);
+ intel_context_flush_no_pasid(info, context, did);
}
-static inline void unlink_domain_info(struct device_domain_info *info)
+int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev,
+ ioasid_t pasid, u16 did, phys_addr_t fsptptr,
+ int flags, struct iommu_domain *old)
{
- assert_spin_locked(&device_domain_lock);
- list_del(&info->link);
- list_del(&info->global);
- if (info->dev)
- dev_iommu_priv_set(info->dev, NULL);
+ if (!old)
+ return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid,
+ did, flags);
+ return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did,
+ iommu_domain_did(old, iommu),
+ flags);
}
-static void domain_remove_dev_info(struct dmar_domain *domain)
+static int domain_setup_second_level(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
{
- struct device_domain_info *info, *tmp;
- unsigned long flags;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- list_for_each_entry_safe(info, tmp, &domain->devices, link)
- __dmar_remove_one_dev_info(info);
- spin_unlock_irqrestore(&device_domain_lock, flags);
+ if (!old)
+ return intel_pasid_setup_second_level(iommu, domain,
+ dev, pasid);
+ return intel_pasid_replace_second_level(iommu, domain, dev,
+ iommu_domain_did(old, iommu),
+ pasid);
}
-struct dmar_domain *find_domain(struct device *dev)
+static int domain_setup_passthrough(struct intel_iommu *iommu,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
{
- struct device_domain_info *info;
-
- if (unlikely(!dev || !dev->iommu))
- return NULL;
-
- if (unlikely(attach_deferred(dev)))
- return NULL;
-
- /* No lock here, assumes no domain exit in normal case */
- info = get_domain_info(dev);
- if (likely(info))
- return info->domain;
-
- return NULL;
-}
-
-static inline struct device_domain_info *
-dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
-{
- struct device_domain_info *info;
-
- list_for_each_entry(info, &device_domain_list, global)
- if (info->segment == segment && info->bus == bus &&
- info->devfn == devfn)
- return info;
-
- return NULL;
+ if (!old)
+ return intel_pasid_setup_pass_through(iommu, dev, pasid);
+ return intel_pasid_replace_pass_through(iommu, dev,
+ iommu_domain_did(old, iommu),
+ pasid);
}
static int domain_setup_first_level(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev,
- u32 pasid)
+ u32 pasid, struct iommu_domain *old)
{
- struct dma_pte *pgd = domain->pgd;
- int agaw, level;
- int flags = 0;
+ struct pt_iommu_x86_64_hw_info pt_info;
+ unsigned int flags = 0;
- /*
- * Skip top levels of page tables for iommu which has
- * less agaw than default. Unnecessary for PT mode.
- */
- for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
- pgd = phys_to_virt(dma_pte_addr(pgd));
- if (!dma_pte_present(pgd))
- return -ENOMEM;
- }
-
- level = agaw_to_level(agaw);
- if (level != 4 && level != 5)
+ pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info);
+ if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5))
return -EINVAL;
- if (pasid != PASID_RID2PASID)
- flags |= PASID_FLAG_SUPERVISOR_MODE;
- if (level == 5)
+ if (pt_info.levels == 5)
flags |= PASID_FLAG_FL5LP;
- if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
+ if (domain->force_snooping)
flags |= PASID_FLAG_PAGE_SNOOP;
- return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
- domain->iommu_did[iommu->seq_id],
- flags);
-}
+ if (!(domain->fspt.x86_64_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ flags |= PASID_FLAG_PWSNP;
-static bool dev_is_real_dma_subdevice(struct device *dev)
-{
- return dev && dev_is_pci(dev) &&
- pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
+ return __domain_setup_first_level(iommu, dev, pasid,
+ domain_id_iommu(domain, iommu),
+ pt_info.gcr3_pt, flags, old);
}
-static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
- int bus, int devfn,
- struct device *dev,
- struct dmar_domain *domain)
+static int dmar_domain_attach_device(struct dmar_domain *domain,
+ struct device *dev)
{
- struct dmar_domain *found = NULL;
- struct device_domain_info *info;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
unsigned long flags;
int ret;
- info = alloc_devinfo_mem();
- if (!info)
- return NULL;
-
- if (!dev_is_real_dma_subdevice(dev)) {
- info->bus = bus;
- info->devfn = devfn;
- info->segment = iommu->segment;
- } else {
- struct pci_dev *pdev = to_pci_dev(dev);
-
- info->bus = pdev->bus->number;
- info->devfn = pdev->devfn;
- info->segment = pci_domain_nr(pdev->bus);
- }
-
- info->ats_supported = info->pasid_supported = info->pri_supported = 0;
- info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
- info->ats_qdep = 0;
- info->dev = dev;
- info->domain = domain;
- info->iommu = iommu;
- info->pasid_table = NULL;
- info->auxd_enabled = 0;
- INIT_LIST_HEAD(&info->subdevices);
-
- if (dev && dev_is_pci(dev)) {
- struct pci_dev *pdev = to_pci_dev(info->dev);
-
- if (ecap_dev_iotlb_support(iommu->ecap) &&
- pci_ats_supported(pdev) &&
- dmar_find_matched_atsr_unit(pdev))
- info->ats_supported = 1;
-
- if (sm_supported(iommu)) {
- if (pasid_supported(iommu)) {
- int features = pci_pasid_features(pdev);
- if (features >= 0)
- info->pasid_supported = features | 1;
- }
-
- if (info->ats_supported && ecap_prs(iommu->ecap) &&
- pci_pri_supported(pdev))
- info->pri_supported = 1;
- }
- }
-
- spin_lock_irqsave(&device_domain_lock, flags);
- if (dev)
- found = find_domain(dev);
-
- if (!found) {
- struct device_domain_info *info2;
- info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
- info->devfn);
- if (info2) {
- found = info2->domain;
- info2->dev = dev;
- }
- }
-
- if (found) {
- spin_unlock_irqrestore(&device_domain_lock, flags);
- free_devinfo_mem(info);
- /* Caller must free the original domain */
- return found;
- }
-
- spin_lock(&iommu->lock);
ret = domain_attach_iommu(domain, iommu);
- spin_unlock(&iommu->lock);
-
- if (ret) {
- spin_unlock_irqrestore(&device_domain_lock, flags);
- free_devinfo_mem(info);
- return NULL;
- }
+ if (ret)
+ return ret;
+ info->domain = domain;
+ info->domain_attached = true;
+ spin_lock_irqsave(&domain->lock, flags);
list_add(&info->link, &domain->devices);
- list_add(&info->global, &device_domain_list);
- if (dev)
- dev_iommu_priv_set(dev, info);
- spin_unlock_irqrestore(&device_domain_lock, flags);
+ spin_unlock_irqrestore(&domain->lock, flags);
- /* PASID table is mandatory for a PCI device in scalable mode. */
- if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
- ret = intel_pasid_alloc_table(dev);
- if (ret) {
- dev_err(dev, "PASID table allocation failed\n");
- dmar_remove_one_dev_info(dev);
- return NULL;
- }
-
- /* Setup the PASID entry for requests without PASID: */
- spin_lock_irqsave(&iommu->lock, flags);
- if (hw_pass_through && domain_type_is_si(domain))
- ret = intel_pasid_setup_pass_through(iommu, domain,
- dev, PASID_RID2PASID);
- else if (domain_use_first_level(domain))
- ret = domain_setup_first_level(iommu, domain, dev,
- PASID_RID2PASID);
- else
- ret = intel_pasid_setup_second_level(iommu, domain,
- dev, PASID_RID2PASID);
- spin_unlock_irqrestore(&iommu->lock, flags);
- if (ret) {
- dev_err(dev, "Setup RID2PASID failed\n");
- dmar_remove_one_dev_info(dev);
- return NULL;
- }
- }
-
- if (dev && domain_context_mapping(domain, dev)) {
- dev_err(dev, "Domain context map failed\n");
- dmar_remove_one_dev_info(dev);
- return NULL;
- }
-
- return domain;
-}
-
-static int iommu_domain_identity_map(struct dmar_domain *domain,
- unsigned long first_vpfn,
- unsigned long last_vpfn)
-{
- /*
- * RMRR range might have overlap with physical memory range,
- * clear it first
- */
- dma_pte_clear_range(domain, first_vpfn, last_vpfn);
-
- return __domain_mapping(domain, first_vpfn,
- first_vpfn, last_vpfn - first_vpfn + 1,
- DMA_PTE_READ|DMA_PTE_WRITE);
-}
-
-static int md_domain_init(struct dmar_domain *domain, int guest_width);
-
-static int __init si_domain_init(int hw)
-{
- struct dmar_rmrr_unit *rmrr;
- struct device *dev;
- int i, nid, ret;
-
- si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
- if (!si_domain)
- return -EFAULT;
-
- if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
- domain_exit(si_domain);
- return -EFAULT;
- }
-
- if (hw)
+ if (dev_is_real_dma_subdevice(dev))
return 0;
- for_each_online_node(nid) {
- unsigned long start_pfn, end_pfn;
- int i;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- ret = iommu_domain_identity_map(si_domain,
- mm_to_dma_pfn(start_pfn),
- mm_to_dma_pfn(end_pfn));
- if (ret)
- return ret;
- }
- }
-
- /*
- * Identity map the RMRRs so that devices with RMRRs could also use
- * the si_domain.
- */
- for_each_rmrr_units(rmrr) {
- for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
- i, dev) {
- unsigned long long start = rmrr->base_address;
- unsigned long long end = rmrr->end_address;
-
- if (WARN_ON(end < start ||
- end >> agaw_to_width(si_domain->agaw)))
- continue;
-
- ret = iommu_domain_identity_map(si_domain,
- mm_to_dma_pfn(start >> PAGE_SHIFT),
- mm_to_dma_pfn(end >> PAGE_SHIFT));
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
-{
- struct dmar_domain *ndomain;
- struct intel_iommu *iommu;
- u8 bus, devfn;
+ if (!sm_supported(iommu))
+ ret = domain_context_mapping(domain, dev);
+ else if (intel_domain_is_fs_paging(domain))
+ ret = domain_setup_first_level(iommu, domain, dev,
+ IOMMU_NO_PASID, NULL);
+ else if (intel_domain_is_ss_paging(domain))
+ ret = domain_setup_second_level(iommu, domain, dev,
+ IOMMU_NO_PASID, NULL);
+ else if (WARN_ON(true))
+ ret = -EINVAL;
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu)
- return -ENODEV;
+ if (ret)
+ goto out_block_translation;
- ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
- if (ndomain != domain)
- return -EBUSY;
+ ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
+ if (ret)
+ goto out_block_translation;
return 0;
-}
-static bool device_has_rmrr(struct device *dev)
-{
- struct dmar_rmrr_unit *rmrr;
- struct device *tmp;
- int i;
-
- rcu_read_lock();
- for_each_rmrr_units(rmrr) {
- /*
- * Return TRUE if this RMRR contains the device that
- * is passed in.
- */
- for_each_active_dev_scope(rmrr->devices,
- rmrr->devices_cnt, i, tmp)
- if (tmp == dev ||
- is_downstream_to_pci_bridge(dev, tmp)) {
- rcu_read_unlock();
- return true;
- }
- }
- rcu_read_unlock();
- return false;
+out_block_translation:
+ device_block_translation(dev);
+ return ret;
}
/**
@@ -2831,55 +1384,23 @@ static bool device_rmrr_is_relaxable(struct device *dev)
return false;
}
-/*
- * There are a couple cases where we need to restrict the functionality of
- * devices associated with RMRRs. The first is when evaluating a device for
- * identity mapping because problems exist when devices are moved in and out
- * of domains and their respective RMRR information is lost. This means that
- * a device with associated RMRRs will never be in a "passthrough" domain.
- * The second is use of the device through the IOMMU API. This interface
- * expects to have full control of the IOVA space for the device. We cannot
- * satisfy both the requirement that RMRR access is maintained and have an
- * unencumbered IOVA space. We also have no ability to quiesce the device's
- * use of the RMRR space or even inform the IOMMU API user of the restriction.
- * We therefore prevent devices associated with an RMRR from participating in
- * the IOMMU API, which eliminates them from device assignment.
- *
- * In both cases, devices which have relaxable RMRRs are not concerned by this
- * restriction. See device_rmrr_is_relaxable comment.
- */
-static bool device_is_rmrr_locked(struct device *dev)
+static int device_def_domain_type(struct device *dev)
{
- if (!device_has_rmrr(dev))
- return false;
-
- if (device_rmrr_is_relaxable(dev))
- return false;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
- return true;
-}
+ /*
+ * Hardware does not support the passthrough translation mode.
+ * Always use a dynamaic mapping domain.
+ */
+ if (!ecap_pass_through(iommu->ecap))
+ return IOMMU_DOMAIN_DMA;
-/*
- * Return the required default domain type for a specific device.
- *
- * @dev: the device in query
- * @startup: true if this is during early boot
- *
- * Returns:
- * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
- * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
- * - 0: both identity and dynamic domains work for this device
- */
-static int device_def_domain_type(struct device *dev)
-{
if (dev_is_pci(dev)) {
struct pci_dev *pdev = to_pci_dev(dev);
if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
return IOMMU_DOMAIN_IDENTITY;
-
- if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
- return IOMMU_DOMAIN_IDENTITY;
}
return 0;
@@ -2972,7 +1493,8 @@ static int copy_context_table(struct intel_iommu *iommu,
if (!old_ce)
goto out;
- new_ce = alloc_pgtable_page(iommu->node);
+ new_ce = iommu_alloc_pages_node_sz(iommu->node,
+ GFP_KERNEL, SZ_4K);
if (!new_ce)
goto out_unmap;
@@ -2982,32 +1504,14 @@ static int copy_context_table(struct intel_iommu *iommu,
/* Now copy the context entry */
memcpy(&ce, old_ce + idx, sizeof(ce));
- if (!__context_present(&ce))
+ if (!context_present(&ce))
continue;
did = context_domain_id(&ce);
if (did >= 0 && did < cap_ndoms(iommu->cap))
- set_bit(did, iommu->domain_ids);
-
- /*
- * We need a marker for copied context entries. This
- * marker needs to work for the old format as well as
- * for extended context entries.
- *
- * Bit 67 of the context entry is used. In the old
- * format this bit is available to software, in the
- * extended format it is the PGE bit, but PGE is ignored
- * by HW if PASIDs are disabled (and thus still
- * available).
- *
- * So disable PASIDs first and then mark the entry
- * copied. This means that we don't copy PASID
- * translations from the old kernel, but this is fine as
- * faults there are not fatal.
- */
- context_clear_pasid_enable(&ce);
- context_set_copied(&ce);
+ ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL);
+ set_context_copied(iommu, bus, devfn);
new_ce[idx] = ce;
}
@@ -3028,14 +1532,13 @@ static int copy_translation_tables(struct intel_iommu *iommu)
struct root_entry *old_rt;
phys_addr_t old_rt_phys;
int ctxt_table_entries;
- unsigned long flags;
u64 rtaddr_reg;
int bus, ret;
bool new_ext, ext;
rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
- ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
- new_ext = !!ecap_ecs(iommu->ecap);
+ ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
+ new_ext = !!sm_supported(iommu);
/*
* The RTT bit can only be changed when translation is disabled,
@@ -3046,6 +1549,10 @@ static int copy_translation_tables(struct intel_iommu *iommu)
if (new_ext != ext)
return -EINVAL;
+ iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
+ if (!iommu->copied_tables)
+ return -ENOMEM;
+
old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
if (!old_rt_phys)
return -EINVAL;
@@ -3071,7 +1578,7 @@ static int copy_translation_tables(struct intel_iommu *iommu)
}
}
- spin_lock_irqsave(&iommu->lock, flags);
+ spin_lock(&iommu->lock);
/* Context tables are copied, now write them to the root_entry table */
for (bus = 0; bus < 256; bus++) {
@@ -3090,7 +1597,7 @@ static int copy_translation_tables(struct intel_iommu *iommu)
iommu->root_entry[bus].hi = val;
}
- spin_unlock_irqrestore(&iommu->lock, flags);
+ spin_unlock(&iommu->lock);
kfree(ctxt_tbls);
@@ -3104,126 +1611,12 @@ out_unmap:
return ret;
}
-#ifdef CONFIG_INTEL_IOMMU_SVM
-static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
-{
- struct intel_iommu *iommu = data;
- ioasid_t ioasid;
-
- if (!iommu)
- return INVALID_IOASID;
- /*
- * VT-d virtual command interface always uses the full 20 bit
- * PASID range. Host can partition guest PASID range based on
- * policies but it is out of guest's control.
- */
- if (min < PASID_MIN || max > intel_pasid_max_id)
- return INVALID_IOASID;
-
- if (vcmd_alloc_pasid(iommu, &ioasid))
- return INVALID_IOASID;
-
- return ioasid;
-}
-
-static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
-{
- struct intel_iommu *iommu = data;
-
- if (!iommu)
- return;
- /*
- * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
- * We can only free the PASID when all the devices are unbound.
- */
- if (ioasid_find(NULL, ioasid, NULL)) {
- pr_alert("Cannot free active IOASID %d\n", ioasid);
- return;
- }
- vcmd_free_pasid(iommu, ioasid);
-}
-
-static void register_pasid_allocator(struct intel_iommu *iommu)
-{
- /*
- * If we are running in the host, no need for custom allocator
- * in that PASIDs are allocated from the host system-wide.
- */
- if (!cap_caching_mode(iommu->cap))
- return;
-
- if (!sm_supported(iommu)) {
- pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
- return;
- }
-
- /*
- * Register a custom PASID allocator if we are running in a guest,
- * guest PASID must be obtained via virtual command interface.
- * There can be multiple vIOMMUs in each guest but only one allocator
- * is active. All vIOMMU allocators will eventually be calling the same
- * host allocator.
- */
- if (!vccap_pasid(iommu->vccap))
- return;
-
- pr_info("Register custom PASID allocator\n");
- iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
- iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
- iommu->pasid_allocator.pdata = (void *)iommu;
- if (ioasid_register_allocator(&iommu->pasid_allocator)) {
- pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
- /*
- * Disable scalable mode on this IOMMU if there
- * is no custom allocator. Mixing SM capable vIOMMU
- * and non-SM vIOMMU are not supported.
- */
- intel_iommu_sm = 0;
- }
-}
-#endif
-
static int __init init_dmars(void)
{
struct dmar_drhd_unit *drhd;
struct intel_iommu *iommu;
int ret;
- /*
- * for each drhd
- * allocate root
- * initialize and program root entry to not present
- * endfor
- */
- for_each_drhd_unit(drhd) {
- /*
- * lock not needed as this is only incremented in the single
- * threaded kernel __init code path all other access are read
- * only
- */
- if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
- g_num_of_iommus++;
- continue;
- }
- pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
- }
-
- /* Preallocate enough resources for IOMMU hot-addition */
- if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
- g_num_of_iommus = DMAR_UNITS_SUPPORTED;
-
- g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
- GFP_KERNEL);
- if (!g_iommus) {
- pr_err("Allocating global iommu array failed\n");
- ret = -ENOMEM;
- goto error;
- }
-
- ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
- if (ret)
- goto free_iommu;
-
for_each_iommu(iommu, drhd) {
if (drhd->ignored) {
iommu_disable_translation(iommu);
@@ -3242,14 +1635,7 @@ static int __init init_dmars(void)
intel_pasid_max_id);
}
- g_iommus[iommu->seq_id] = iommu;
-
intel_iommu_init_qi(iommu);
-
- ret = iommu_init_domains(iommu);
- if (ret)
- goto free_iommu;
-
init_translation_status(iommu);
if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
@@ -3292,8 +1678,6 @@ static int __init init_dmars(void)
}
}
- if (!ecap_pass_through(iommu->ecap))
- hw_pass_through = 0;
intel_svm_check(iommu);
}
@@ -3304,25 +1688,11 @@ static int __init init_dmars(void)
*/
for_each_active_iommu(iommu, drhd) {
iommu_flush_write_buffer(iommu);
-#ifdef CONFIG_INTEL_IOMMU_SVM
- register_pasid_allocator(iommu);
-#endif
iommu_set_root_entry(iommu);
}
-#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
- dmar_map_gfx = 0;
-#endif
-
- if (!dmar_map_gfx)
- iommu_identity_mapping |= IDENTMAP_GFX;
-
check_tylersburg_isoch();
- ret = si_domain_init(hw_pass_through);
- if (ret)
- goto free_iommu;
-
/*
* for each drhd
* enable fault log
@@ -3343,19 +1713,18 @@ static int __init init_dmars(void)
iommu_flush_write_buffer(iommu);
-#ifdef CONFIG_INTEL_IOMMU_SVM
- if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
+ if (ecap_prs(iommu->ecap)) {
/*
* Call dmar_alloc_hwirq() with dmar_global_lock held,
* could cause possible lock race condition.
*/
up_write(&dmar_global_lock);
- ret = intel_svm_enable_prq(iommu);
+ ret = intel_iommu_enable_prq(iommu);
down_write(&dmar_global_lock);
if (ret)
goto free_iommu;
}
-#endif
+
ret = dmar_set_interrupt(iommu);
if (ret)
goto free_iommu;
@@ -3369,76 +1738,9 @@ free_iommu:
free_dmar_iommu(iommu);
}
- kfree(g_iommus);
-
-error:
- return ret;
-}
-
-static inline int iommu_domain_cache_init(void)
-{
- int ret = 0;
-
- iommu_domain_cache = kmem_cache_create("iommu_domain",
- sizeof(struct dmar_domain),
- 0,
- SLAB_HWCACHE_ALIGN,
-
- NULL);
- if (!iommu_domain_cache) {
- pr_err("Couldn't create iommu_domain cache\n");
- ret = -ENOMEM;
- }
-
- return ret;
-}
-
-static inline int iommu_devinfo_cache_init(void)
-{
- int ret = 0;
-
- iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
- sizeof(struct device_domain_info),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
- if (!iommu_devinfo_cache) {
- pr_err("Couldn't create devinfo cache\n");
- ret = -ENOMEM;
- }
-
return ret;
}
-static int __init iommu_init_mempool(void)
-{
- int ret;
- ret = iova_cache_get();
- if (ret)
- return ret;
-
- ret = iommu_domain_cache_init();
- if (ret)
- goto domain_error;
-
- ret = iommu_devinfo_cache_init();
- if (!ret)
- return ret;
-
- kmem_cache_destroy(iommu_domain_cache);
-domain_error:
- iova_cache_put();
-
- return -ENOMEM;
-}
-
-static void __init iommu_exit_mempool(void)
-{
- kmem_cache_destroy(iommu_devinfo_cache);
- kmem_cache_destroy(iommu_domain_cache);
- iova_cache_put();
-}
-
static void __init init_no_remapping_devices(void)
{
struct dmar_drhd_unit *drhd;
@@ -3470,7 +1772,7 @@ static void __init init_no_remapping_devices(void)
/* This IOMMU has *only* gfx devices. Either bypass it or
set the gfx_mapped flag, as appropriate */
drhd->gfx_dedicated = 1;
- if (!dmar_map_gfx)
+ if (disable_igfx_iommu)
drhd->ignored = 1;
}
}
@@ -3480,10 +1782,15 @@ static int init_iommu_hw(void)
{
struct dmar_drhd_unit *drhd;
struct intel_iommu *iommu = NULL;
+ int ret;
- for_each_active_iommu(iommu, drhd)
- if (iommu->qi)
- dmar_reenable_qi(iommu);
+ for_each_active_iommu(iommu, drhd) {
+ if (iommu->qi) {
+ ret = dmar_reenable_qi(iommu);
+ if (ret)
+ return ret;
+ }
+ }
for_each_iommu(iommu, drhd) {
if (drhd->ignored) {
@@ -3518,19 +1825,12 @@ static void iommu_flush_all(void)
}
}
-static int iommu_suspend(void)
+static int iommu_suspend(void *data)
{
struct dmar_drhd_unit *drhd;
struct intel_iommu *iommu = NULL;
unsigned long flag;
- for_each_active_iommu(iommu, drhd) {
- iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
- GFP_KERNEL);
- if (!iommu->iommu_state)
- goto nomem;
- }
-
iommu_flush_all();
for_each_active_iommu(iommu, drhd) {
@@ -3550,15 +1850,9 @@ static int iommu_suspend(void)
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
return 0;
-
-nomem:
- for_each_active_iommu(iommu, drhd)
- kfree(iommu->iommu_state);
-
- return -ENOMEM;
}
-static void iommu_resume(void)
+static void iommu_resume(void *data)
{
struct dmar_drhd_unit *drhd;
struct intel_iommu *iommu = NULL;
@@ -3587,26 +1881,27 @@ static void iommu_resume(void)
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
-
- for_each_active_iommu(iommu, drhd)
- kfree(iommu->iommu_state);
}
-static struct syscore_ops iommu_syscore_ops = {
+static const struct syscore_ops iommu_syscore_ops = {
.resume = iommu_resume,
.suspend = iommu_suspend,
};
+static struct syscore iommu_syscore = {
+ .ops = &iommu_syscore_ops,
+};
+
static void __init init_iommu_pm_ops(void)
{
- register_syscore_ops(&iommu_syscore_ops);
+ register_syscore(&iommu_syscore);
}
#else
static inline void init_iommu_pm_ops(void) {}
#endif /* CONFIG_PM */
-static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
+static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
{
if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
!IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
@@ -3813,33 +2108,8 @@ int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
- int sp, ret;
struct intel_iommu *iommu = dmaru->iommu;
-
- if (g_iommus[iommu->seq_id])
- return 0;
-
- ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
- if (ret)
- goto out;
-
- if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
- pr_warn("%s: Doesn't support hardware pass through.\n",
- iommu->name);
- return -ENXIO;
- }
- if (!ecap_sc_support(iommu->ecap) &&
- domain_update_iommu_snooping(iommu)) {
- pr_warn("%s: Doesn't support snooping.\n",
- iommu->name);
- return -ENXIO;
- }
- sp = domain_update_iommu_superpage(NULL, iommu) - 1;
- if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
- pr_warn("%s: Doesn't support large page.\n",
- iommu->name);
- return -ENXIO;
- }
+ int ret;
/*
* Disable translation if already enabled prior to OS handover.
@@ -3847,10 +2117,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
if (iommu->gcmd & DMA_GCMD_TE)
iommu_disable_translation(iommu);
- g_iommus[iommu->seq_id] = iommu;
- ret = iommu_init_domains(iommu);
- if (ret == 0)
- ret = iommu_alloc_root_entry(iommu);
+ ret = iommu_alloc_root_entry(iommu);
if (ret)
goto out;
@@ -3868,13 +2135,12 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
intel_iommu_init_qi(iommu);
iommu_flush_write_buffer(iommu);
-#ifdef CONFIG_INTEL_IOMMU_SVM
- if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
- ret = intel_svm_enable_prq(iommu);
+ if (ecap_prs(iommu->ecap)) {
+ ret = intel_iommu_enable_prq(iommu);
if (ret)
goto disable_iommu;
}
-#endif
+
ret = dmar_set_interrupt(iommu);
if (ret)
goto disable_iommu;
@@ -3935,25 +2201,61 @@ static void intel_iommu_free_dmars(void)
}
}
-int dmar_find_matched_atsr_unit(struct pci_dev *dev)
+static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
{
- int i, ret = 1;
- struct pci_bus *bus;
- struct pci_dev *bridge = NULL;
+ struct dmar_satc_unit *satcu;
+ struct acpi_dmar_satc *satc;
struct device *tmp;
- struct acpi_dmar_atsr *atsr;
+ int i;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
+ satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
+ if (satc->segment != pci_domain_nr(dev->bus))
+ continue;
+ for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
+ if (to_pci_dev(tmp) == dev)
+ goto out;
+ }
+ satcu = NULL;
+out:
+ rcu_read_unlock();
+ return satcu;
+}
+
+static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
+{
+ struct pci_dev *bridge = NULL;
struct dmar_atsr_unit *atsru;
+ struct dmar_satc_unit *satcu;
+ struct acpi_dmar_atsr *atsr;
+ bool supported = true;
+ struct pci_bus *bus;
+ struct device *tmp;
+ int i;
dev = pci_physfn(dev);
+ satcu = dmar_find_matched_satc_unit(dev);
+ if (satcu)
+ /*
+ * This device supports ATS as it is in SATC table.
+ * When IOMMU is in legacy mode, enabling ATS is done
+ * automatically by HW for the device that requires
+ * ATS, hence OS should not enable this device ATS
+ * to avoid duplicated TLB invalidation.
+ */
+ return !(satcu->atc_required && !sm_supported(iommu));
+
for (bus = dev->bus; bus; bus = bus->parent) {
bridge = bus->self;
/* If it's an integrated device, allow ATS */
if (!bridge)
- return 1;
+ return true;
/* Connected via non-PCIe: no ATS */
if (!pci_is_pcie(bridge) ||
pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
- return 0;
+ return false;
/* If we found the root port, look it up in the ATSR */
if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
break;
@@ -3972,11 +2274,11 @@ int dmar_find_matched_atsr_unit(struct pci_dev *dev)
if (atsru->include_all)
goto out;
}
- ret = 0;
+ supported = false;
out:
rcu_read_unlock();
- return ret;
+ return supported;
}
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
@@ -4049,54 +2351,6 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
return 0;
}
-static int intel_iommu_memory_notifier(struct notifier_block *nb,
- unsigned long val, void *v)
-{
- struct memory_notify *mhp = v;
- unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
- unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
- mhp->nr_pages - 1);
-
- switch (val) {
- case MEM_GOING_ONLINE:
- if (iommu_domain_identity_map(si_domain,
- start_vpfn, last_vpfn)) {
- pr_warn("Failed to build identity map for [%lx-%lx]\n",
- start_vpfn, last_vpfn);
- return NOTIFY_BAD;
- }
- break;
-
- case MEM_OFFLINE:
- case MEM_CANCEL_ONLINE:
- {
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- struct page *freelist;
-
- freelist = domain_unmap(si_domain,
- start_vpfn, last_vpfn,
- NULL);
-
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd)
- iommu_flush_iotlb_psi(iommu, si_domain,
- start_vpfn, mhp->nr_pages,
- !freelist, 0);
- rcu_read_unlock();
- dma_free_pagelist(freelist);
- }
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block intel_iommu_memory_nb = {
- .notifier_call = intel_iommu_memory_notifier,
- .priority = 0
-};
-
static void intel_disable_iommus(void)
{
struct intel_iommu *iommu = NULL;
@@ -4114,19 +2368,22 @@ void intel_iommu_shutdown(void)
if (no_iommu || dmar_disabled)
return;
- down_write(&dmar_global_lock);
+ /*
+ * All other CPUs were brought down, hotplug interrupts were disabled,
+ * no lock and RCU checking needed anymore
+ */
+ list_for_each_entry(drhd, &dmar_drhd_units, list) {
+ iommu = drhd->iommu;
- /* Disable PMRs explicitly here. */
- for_each_iommu(iommu, drhd)
+ /* Disable PMRs explicitly here. */
iommu_disable_protect_mem_regions(iommu);
- /* Make sure the IOMMUs are switched off */
- intel_disable_iommus();
-
- up_write(&dmar_global_lock);
+ /* Make sure the IOMMUs are switched off */
+ iommu_disable_translation(iommu);
+ }
}
-static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
+static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
{
struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
@@ -4138,8 +2395,8 @@ static ssize_t version_show(struct device *dev,
{
struct intel_iommu *iommu = dev_to_intel_iommu(dev);
u32 ver = readl(iommu->reg + DMAR_VER_REG);
- return sprintf(buf, "%d:%d\n",
- DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
+ return sysfs_emit(buf, "%d:%d\n",
+ DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
}
static DEVICE_ATTR_RO(version);
@@ -4147,7 +2404,7 @@ static ssize_t address_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct intel_iommu *iommu = dev_to_intel_iommu(dev);
- return sprintf(buf, "%llx\n", iommu->reg_phys);
+ return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
}
static DEVICE_ATTR_RO(address);
@@ -4155,7 +2412,7 @@ static ssize_t cap_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct intel_iommu *iommu = dev_to_intel_iommu(dev);
- return sprintf(buf, "%llx\n", iommu->cap);
+ return sysfs_emit(buf, "%llx\n", iommu->cap);
}
static DEVICE_ATTR_RO(cap);
@@ -4163,7 +2420,7 @@ static ssize_t ecap_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct intel_iommu *iommu = dev_to_intel_iommu(dev);
- return sprintf(buf, "%llx\n", iommu->ecap);
+ return sysfs_emit(buf, "%llx\n", iommu->ecap);
}
static DEVICE_ATTR_RO(ecap);
@@ -4171,7 +2428,7 @@ static ssize_t domains_supported_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct intel_iommu *iommu = dev_to_intel_iommu(dev);
- return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
+ return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
}
static DEVICE_ATTR_RO(domains_supported);
@@ -4179,8 +2436,14 @@ static ssize_t domains_used_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct intel_iommu *iommu = dev_to_intel_iommu(dev);
- return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
- cap_ndoms(iommu->cap)));
+ unsigned int count = 0;
+ int id;
+
+ for (id = 0; id < cap_ndoms(iommu->cap); id++)
+ if (ida_exists(&iommu->domain_ida, id))
+ count++;
+
+ return sysfs_emit(buf, "%d\n", count);
}
static DEVICE_ATTR_RO(domains_used);
@@ -4204,13 +2467,15 @@ const struct attribute_group *intel_iommu_groups[] = {
NULL,
};
-static inline bool has_external_pci(void)
+static bool has_external_pci(void)
{
struct pci_dev *pdev = NULL;
for_each_pci_dev(pdev)
- if (pdev->external_facing)
+ if (pdev->external_facing) {
+ pci_dev_put(pdev);
return true;
+ }
return false;
}
@@ -4248,28 +2513,22 @@ static int __init probe_acpi_namespace_devices(void)
for_each_active_dev_scope(drhd->devices,
drhd->devices_cnt, i, dev) {
struct acpi_device_physical_node *pn;
- struct iommu_group *group;
struct acpi_device *adev;
if (dev->bus != &acpi_bus_type)
continue;
+ up_read(&dmar_global_lock);
adev = to_acpi_device(dev);
mutex_lock(&adev->physical_node_lock);
list_for_each_entry(pn,
&adev->physical_node_list, node) {
- group = iommu_group_get(pn->dev);
- if (group) {
- iommu_group_put(group);
- continue;
- }
-
- pn->dev->bus->iommu_ops = &intel_iommu_ops;
ret = iommu_probe_device(pn->dev);
if (ret)
break;
}
mutex_unlock(&adev->physical_node_lock);
+ down_read(&dmar_global_lock);
if (ret)
return ret;
@@ -4279,6 +2538,20 @@ static int __init probe_acpi_namespace_devices(void)
return 0;
}
+static __init int tboot_force_iommu(void)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ if (no_iommu || dmar_disabled)
+ pr_warn("Forcing Intel-IOMMU to enabled\n");
+
+ dmar_disabled = 0;
+ no_iommu = 0;
+
+ return 1;
+}
+
int __init intel_iommu_init(void)
{
int ret = -ENODEV;
@@ -4292,12 +2565,6 @@ int __init intel_iommu_init(void)
force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
platform_optin_force_iommu();
- if (iommu_init_mempool()) {
- if (force_on)
- panic("tboot: Failed to initialize iommu memory\n");
- return -ENOMEM;
- }
-
down_write(&dmar_global_lock);
if (dmar_table_init()) {
if (force_on)
@@ -4356,9 +2623,6 @@ int __init intel_iommu_init(void)
if (list_empty(&dmar_satc_units))
pr_info("No SATC found\n");
- if (dmar_map_gfx)
- intel_iommu_gfx_mapped = 1;
-
init_no_remapping_devices();
ret = init_dmars();
@@ -4381,23 +2645,26 @@ int __init intel_iommu_init(void)
* is likely to be much lower than the overhead of synchronizing
* the virtual and physical IOMMU page-tables.
*/
- if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
- pr_warn("IOMMU batching is disabled due to virtualization");
- intel_iommu_strict = 1;
+ if (cap_caching_mode(iommu->cap) &&
+ !first_level_by_default(iommu)) {
+ pr_info_once("IOMMU batching disallowed due to virtualization\n");
+ iommu_set_dma_strict();
}
iommu_device_sysfs_add(&iommu->iommu, NULL,
intel_iommu_groups,
"%s", iommu->name);
+ /*
+ * The iommu device probe is protected by the iommu_probe_device_lock.
+ * Release the dmar_global_lock before entering the device probe path
+ * to avoid unnecessary lock order splat.
+ */
+ up_read(&dmar_global_lock);
iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
- }
- up_read(&dmar_global_lock);
+ down_read(&dmar_global_lock);
- iommu_set_dma_strict(intel_iommu_strict);
- bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
- if (si_domain && !hw_pass_through)
- register_memory_notifier(&intel_iommu_memory_nb);
+ iommu_pmu_register(iommu);
+ }
- down_read(&dmar_global_lock);
if (probe_acpi_namespace_devices())
pr_warn("ACPI name space devices didn't probe correctly\n");
@@ -4419,15 +2686,14 @@ int __init intel_iommu_init(void)
out_free_dmar:
intel_iommu_free_dmars();
up_write(&dmar_global_lock);
- iommu_exit_mempool();
return ret;
}
static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
{
- struct intel_iommu *iommu = opaque;
+ struct device_domain_info *info = opaque;
- domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
+ domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
return 0;
}
@@ -4437,735 +2703,680 @@ static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *op
* devices, unbinding the driver from any one of them will possibly leave
* the others unable to operate.
*/
-static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
+static void domain_context_clear(struct device_domain_info *info)
{
- if (!iommu || !dev || !dev_is_pci(dev))
+ if (!dev_is_pci(info->dev)) {
+ domain_context_clear_one(info, info->bus, info->devfn);
return;
+ }
- pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
+ pci_for_each_dma_alias(to_pci_dev(info->dev),
+ &domain_context_clear_one_cb, info);
+ iommu_disable_pci_ats(info);
}
-static void __dmar_remove_one_dev_info(struct device_domain_info *info)
+/*
+ * Clear the page table pointer in context or pasid table entries so that
+ * all DMA requests without PASID from the device are blocked. If the page
+ * table has been set, clean up the data structures.
+ */
+void device_block_translation(struct device *dev)
{
- struct dmar_domain *domain;
- struct intel_iommu *iommu;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
unsigned long flags;
- assert_spin_locked(&device_domain_lock);
-
- if (WARN_ON(!info))
+ /* Device in DMA blocking state. Noting to do. */
+ if (!info->domain_attached)
return;
- iommu = info->iommu;
- domain = info->domain;
-
- if (info->dev) {
- if (dev_is_pci(info->dev) && sm_supported(iommu))
- intel_pasid_tear_down_entry(iommu, info->dev,
- PASID_RID2PASID, false);
+ if (info->domain)
+ cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
- iommu_disable_dev_iotlb(info);
- if (!dev_is_real_dma_subdevice(info->dev))
- domain_context_clear(iommu, info->dev);
- intel_pasid_free_table(info->dev);
+ if (!dev_is_real_dma_subdevice(dev)) {
+ if (sm_supported(iommu))
+ intel_pasid_tear_down_entry(iommu, dev,
+ IOMMU_NO_PASID, false);
+ else
+ domain_context_clear(info);
}
- unlink_domain_info(info);
+ /* Device now in DMA blocking state. */
+ info->domain_attached = false;
- spin_lock_irqsave(&iommu->lock, flags);
- domain_detach_iommu(domain, iommu);
- spin_unlock_irqrestore(&iommu->lock, flags);
-
- free_devinfo_mem(info);
-}
+ if (!info->domain)
+ return;
-static void dmar_remove_one_dev_info(struct device *dev)
-{
- struct device_domain_info *info;
- unsigned long flags;
+ spin_lock_irqsave(&info->domain->lock, flags);
+ list_del(&info->link);
+ spin_unlock_irqrestore(&info->domain->lock, flags);
- spin_lock_irqsave(&device_domain_lock, flags);
- info = get_domain_info(dev);
- if (info)
- __dmar_remove_one_dev_info(info);
- spin_unlock_irqrestore(&device_domain_lock, flags);
+ domain_detach_iommu(info->domain, iommu);
+ info->domain = NULL;
}
-static int md_domain_init(struct dmar_domain *domain, int guest_width)
+static int blocking_domain_attach_dev(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
- int adjust_width;
-
- /* calculate AGAW */
- domain->gaw = guest_width;
- adjust_width = guestwidth_to_adjustwidth(guest_width);
- domain->agaw = width_to_agaw(adjust_width);
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
- domain->iommu_coherency = false;
- domain->iommu_snooping = false;
- domain->iommu_superpage = 0;
- domain->max_addr = 0;
-
- /* always allocate the top pgd */
- domain->pgd = alloc_pgtable_page(domain->nid);
- if (!domain->pgd)
- return -ENOMEM;
- domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
+ iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev);
+ device_block_translation(dev);
return 0;
}
-static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
-{
- struct dmar_domain *dmar_domain;
- struct iommu_domain *domain;
-
- switch (type) {
- case IOMMU_DOMAIN_DMA:
- case IOMMU_DOMAIN_UNMANAGED:
- dmar_domain = alloc_domain(0);
- if (!dmar_domain) {
- pr_err("Can't allocate dmar_domain\n");
- return NULL;
- }
- if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
- pr_err("Domain initialization failed\n");
- domain_exit(dmar_domain);
- return NULL;
- }
-
- if (type == IOMMU_DOMAIN_DMA &&
- iommu_get_dma_cookie(&dmar_domain->domain))
- return NULL;
-
- domain = &dmar_domain->domain;
- domain->geometry.aperture_start = 0;
- domain->geometry.aperture_end =
- __DOMAIN_MAX_ADDR(dmar_domain->gaw);
- domain->geometry.force_aperture = true;
+static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old);
- return domain;
- case IOMMU_DOMAIN_IDENTITY:
- return &si_domain->domain;
- default:
- return NULL;
+static struct iommu_domain blocking_domain = {
+ .type = IOMMU_DOMAIN_BLOCKED,
+ .ops = &(const struct iommu_domain_ops) {
+ .attach_dev = blocking_domain_attach_dev,
+ .set_dev_pasid = blocking_domain_set_dev_pasid,
}
+};
- return NULL;
-}
-
-static void intel_iommu_domain_free(struct iommu_domain *domain)
+static struct dmar_domain *paging_domain_alloc(void)
{
- if (domain != &si_domain->domain)
- domain_exit(to_dmar_domain(domain));
-}
+ struct dmar_domain *domain;
-/*
- * Check whether a @domain could be attached to the @dev through the
- * aux-domain attach/detach APIs.
- */
-static inline bool
-is_aux_domain(struct device *dev, struct iommu_domain *domain)
-{
- struct device_domain_info *info = get_domain_info(dev);
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&domain->devices);
+ INIT_LIST_HEAD(&domain->dev_pasids);
+ INIT_LIST_HEAD(&domain->cache_tags);
+ spin_lock_init(&domain->lock);
+ spin_lock_init(&domain->cache_lock);
+ xa_init(&domain->iommu_array);
+ INIT_LIST_HEAD(&domain->s1_domains);
+ spin_lock_init(&domain->s1_lock);
- return info && info->auxd_enabled &&
- domain->type == IOMMU_DOMAIN_UNMANAGED;
+ return domain;
}
-static inline struct subdev_domain_info *
-lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
+static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu,
+ unsigned int *top_level)
{
- struct subdev_domain_info *sinfo;
+ unsigned int mgaw = cap_mgaw(iommu->cap);
- if (!list_empty(&domain->subdevices)) {
- list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
- if (sinfo->pdev == dev)
- return sinfo;
- }
+ /*
+ * Spec 3.6 First-Stage Translation:
+ *
+ * Software must limit addresses to less than the minimum of MGAW
+ * and the lower canonical address width implied by FSPM (i.e.,
+ * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level).
+ */
+ if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) {
+ *top_level = 4;
+ return min(57, mgaw);
}
- return NULL;
+ /* Four level is always supported */
+ *top_level = 3;
+ return min(48, mgaw);
}
-static int auxiliary_link_device(struct dmar_domain *domain,
- struct device *dev)
+static struct iommu_domain *
+intel_iommu_domain_alloc_first_stage(struct device *dev,
+ struct intel_iommu *iommu, u32 flags)
{
- struct device_domain_info *info = get_domain_info(dev);
- struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
+ struct pt_iommu_x86_64_cfg cfg = {};
+ struct dmar_domain *dmar_domain;
+ int ret;
- assert_spin_locked(&device_domain_lock);
- if (WARN_ON(!info))
- return -EINVAL;
+ if (flags & ~IOMMU_HWPT_ALLOC_PASID)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /* Only SL is available in legacy mode */
+ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ dmar_domain = paging_domain_alloc();
+ if (IS_ERR(dmar_domain))
+ return ERR_CAST(dmar_domain);
+
+ cfg.common.hw_max_vasz_lg2 =
+ compute_vasz_lg2_fs(iommu, &cfg.top_level);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
+ BIT(PT_FEAT_FLUSH_RANGE);
+ /* First stage always uses scalable mode */
+ if (!ecap_smpwc(iommu->ecap))
+ cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+ dmar_domain->iommu.iommu_device = dev;
+ dmar_domain->iommu.nid = dev_to_node(dev);
+ dmar_domain->domain.ops = &intel_fs_paging_domain_ops;
+ /*
+ * iotlb sync for map is only needed for legacy implementations that
+ * explicitly require flushing internal write buffers to ensure memory
+ * coherence.
+ */
+ if (rwbf_required(iommu))
+ dmar_domain->iotlb_sync_map = true;
- if (!sinfo) {
- sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
- if (!sinfo)
- return -ENOMEM;
- sinfo->domain = domain;
- sinfo->pdev = dev;
- list_add(&sinfo->link_phys, &info->subdevices);
- list_add(&sinfo->link_domain, &domain->subdevices);
+ ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL);
+ if (ret) {
+ kfree(dmar_domain);
+ return ERR_PTR(ret);
}
- return ++sinfo->users;
+ if (!cap_fl1gp_support(iommu->cap))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+ if (!intel_iommu_superpage)
+ dmar_domain->domain.pgsize_bitmap = SZ_4K;
+
+ return &dmar_domain->domain;
}
-static int auxiliary_unlink_device(struct dmar_domain *domain,
- struct device *dev)
+static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu,
+ unsigned int *top_level)
{
- struct device_domain_info *info = get_domain_info(dev);
- struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
- int ret;
-
- assert_spin_locked(&device_domain_lock);
- if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
- return -EINVAL;
+ unsigned int sagaw = cap_sagaw(iommu->cap);
+ unsigned int mgaw = cap_mgaw(iommu->cap);
- ret = --sinfo->users;
- if (!ret) {
- list_del(&sinfo->link_phys);
- list_del(&sinfo->link_domain);
- kfree(sinfo);
+ /*
+ * Find the largest table size that both the mgaw and sagaw support.
+ * This sets the valid range of IOVA and the top starting level.
+ * Some HW may only support a 4 or 5 level walk but must limit IOVA to
+ * 3 levels.
+ */
+ if (mgaw > 48 && sagaw >= BIT(3)) {
+ *top_level = 4;
+ return min(57, mgaw);
+ } else if (mgaw > 39 && sagaw >= BIT(2)) {
+ *top_level = 3 + ffs(sagaw >> 3);
+ return min(48, mgaw);
+ } else if (mgaw > 30 && sagaw >= BIT(1)) {
+ *top_level = 2 + ffs(sagaw >> 2);
+ return min(39, mgaw);
}
-
- return ret;
+ return 0;
}
-static int aux_domain_add_dev(struct dmar_domain *domain,
- struct device *dev)
+static const struct iommu_dirty_ops intel_second_stage_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(vtdss),
+ .set_dirty_tracking = intel_iommu_set_dirty_tracking,
+};
+
+static struct iommu_domain *
+intel_iommu_domain_alloc_second_stage(struct device *dev,
+ struct intel_iommu *iommu, u32 flags)
{
+ struct pt_iommu_vtdss_cfg cfg = {};
+ struct dmar_domain *dmar_domain;
+ unsigned int sslps;
int ret;
- unsigned long flags;
- struct intel_iommu *iommu;
- iommu = device_to_iommu(dev, NULL, NULL);
- if (!iommu)
- return -ENODEV;
+ if (flags &
+ (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_PASID)))
+ return ERR_PTR(-EOPNOTSUPP);
- if (domain->default_pasid <= 0) {
- u32 pasid;
+ if (((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) &&
+ !nested_supported(iommu)) ||
+ ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) &&
+ !ssads_supported(iommu)))
+ return ERR_PTR(-EOPNOTSUPP);
- /* No private data needed for the default pasid */
- pasid = ioasid_alloc(NULL, PASID_MIN,
- pci_max_pasids(to_pci_dev(dev)) - 1,
- NULL);
- if (pasid == INVALID_IOASID) {
- pr_err("Can't allocate default pasid\n");
- return -ENODEV;
- }
- domain->default_pasid = pasid;
- }
+ /* Legacy mode always supports second stage */
+ if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
+ return ERR_PTR(-EOPNOTSUPP);
- spin_lock_irqsave(&device_domain_lock, flags);
- ret = auxiliary_link_device(domain, dev);
- if (ret <= 0)
- goto link_failed;
+ dmar_domain = paging_domain_alloc();
+ if (IS_ERR(dmar_domain))
+ return ERR_CAST(dmar_domain);
+
+ cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE);
/*
- * Subdevices from the same physical device can be attached to the
- * same domain. For such cases, only the first subdevice attachment
- * needs to go through the full steps in this function. So if ret >
- * 1, just goto out.
+ * Read-only mapping is disallowed on the domain which serves as the
+ * parent in a nested configuration, due to HW errata
+ * (ERRATA_772415_SPR17)
*/
- if (ret > 1)
- goto out;
+ if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)
+ cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE);
+
+ if (!iommu_paging_structure_coherency(iommu))
+ cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+ dmar_domain->iommu.iommu_device = dev;
+ dmar_domain->iommu.nid = dev_to_node(dev);
+ dmar_domain->domain.ops = &intel_ss_paging_domain_ops;
+ dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
+
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops;
+
+ ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL);
+ if (ret) {
+ kfree(dmar_domain);
+ return ERR_PTR(ret);
+ }
+
+ /* Adjust the supported page sizes to HW capability */
+ sslps = cap_super_page_val(iommu->cap);
+ if (!(sslps & BIT(0)))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M;
+ if (!(sslps & BIT(1)))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+ if (!intel_iommu_superpage)
+ dmar_domain->domain.pgsize_bitmap = SZ_4K;
/*
- * iommu->lock must be held to attach domain to iommu and setup the
- * pasid entry for second level translation.
+ * Besides the internal write buffer flush, the caching mode used for
+ * legacy nested translation (which utilizes shadowing page tables)
+ * also requires iotlb sync on map.
*/
- spin_lock(&iommu->lock);
- ret = domain_attach_iommu(domain, iommu);
- if (ret)
- goto attach_failed;
+ if (rwbf_required(iommu) || cap_caching_mode(iommu->cap))
+ dmar_domain->iotlb_sync_map = true;
- /* Setup the PASID entry for mediated devices: */
- if (domain_use_first_level(domain))
- ret = domain_setup_first_level(iommu, domain, dev,
- domain->default_pasid);
- else
- ret = intel_pasid_setup_second_level(iommu, domain, dev,
- domain->default_pasid);
- if (ret)
- goto table_failed;
-
- spin_unlock(&iommu->lock);
-out:
- spin_unlock_irqrestore(&device_domain_lock, flags);
+ return &dmar_domain->domain;
+}
- return 0;
+static struct iommu_domain *
+intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
+ struct iommu_domain *domain;
-table_failed:
- domain_detach_iommu(domain, iommu);
-attach_failed:
- spin_unlock(&iommu->lock);
- auxiliary_unlink_device(domain, dev);
-link_failed:
- spin_unlock_irqrestore(&device_domain_lock, flags);
- if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
- ioasid_put(domain->default_pasid);
+ if (user_data)
+ return ERR_PTR(-EOPNOTSUPP);
- return ret;
+ /* Prefer first stage if possible by default. */
+ domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags);
+ if (domain != ERR_PTR(-EOPNOTSUPP))
+ return domain;
+ return intel_iommu_domain_alloc_second_stage(dev, iommu, flags);
}
-static void aux_domain_remove_dev(struct dmar_domain *domain,
- struct device *dev)
+static void intel_iommu_domain_free(struct iommu_domain *domain)
{
- struct device_domain_info *info;
- struct intel_iommu *iommu;
- unsigned long flags;
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- if (!is_aux_domain(dev, &domain->domain))
+ if (WARN_ON(dmar_domain->nested_parent &&
+ !list_empty(&dmar_domain->s1_domains)))
return;
- spin_lock_irqsave(&device_domain_lock, flags);
- info = get_domain_info(dev);
- iommu = info->iommu;
-
- if (!auxiliary_unlink_device(domain, dev)) {
- spin_lock(&iommu->lock);
- intel_pasid_tear_down_entry(iommu, dev,
- domain->default_pasid, false);
- domain_detach_iommu(domain, iommu);
- spin_unlock(&iommu->lock);
- }
+ if (WARN_ON(!list_empty(&dmar_domain->devices)))
+ return;
- spin_unlock_irqrestore(&device_domain_lock, flags);
+ pt_iommu_deinit(&dmar_domain->iommu);
- if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
- ioasid_put(domain->default_pasid);
+ kfree(dmar_domain->qi_batch);
+ kfree(dmar_domain);
}
-static int prepare_domain_attach_device(struct iommu_domain *domain,
- struct device *dev)
+static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain,
+ struct intel_iommu *iommu)
{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct intel_iommu *iommu;
- int addr_width;
-
- iommu = device_to_iommu(dev, NULL, NULL);
- if (!iommu)
- return -ENODEV;
+ if (WARN_ON(dmar_domain->domain.dirty_ops ||
+ dmar_domain->nested_parent))
+ return -EINVAL;
- if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
- !ecap_nest(iommu->ecap)) {
- dev_err(dev, "%s: iommu not support nested translation\n",
- iommu->name);
+ /* Only SL is available in legacy mode */
+ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
return -EINVAL;
- }
- /* check if this iommu agaw is sufficient for max mapped address */
- addr_width = agaw_to_width(iommu->agaw);
- if (addr_width > cap_mgaw(iommu->cap))
- addr_width = cap_mgaw(iommu->cap);
+ if (!ecap_smpwc(iommu->ecap) &&
+ !(dmar_domain->fspt.x86_64_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ return -EINVAL;
- if (dmar_domain->max_addr > (1LL << addr_width)) {
- dev_err(dev, "%s: iommu width (%d) is not "
- "sufficient for the mapped address (%llx)\n",
- __func__, addr_width, dmar_domain->max_addr);
- return -EFAULT;
- }
- dmar_domain->gaw = addr_width;
+ /* Supports the number of table levels */
+ if (!cap_fl5lp_support(iommu->cap) &&
+ dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48)
+ return -EINVAL;
- /*
- * Knock out extra levels of page tables if necessary
- */
- while (iommu->agaw < dmar_domain->agaw) {
- struct dma_pte *pte;
+ /* Same page size support */
+ if (!cap_fl1gp_support(iommu->cap) &&
+ (dmar_domain->domain.pgsize_bitmap & SZ_1G))
+ return -EINVAL;
- pte = dmar_domain->pgd;
- if (dma_pte_present(pte)) {
- dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
- free_pgtable_page(pte);
- }
- dmar_domain->agaw--;
- }
+ /* iotlb sync on map requirement */
+ if ((rwbf_required(iommu)) && !dmar_domain->iotlb_sync_map)
+ return -EINVAL;
return 0;
}
-static int intel_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+static int
+paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
+ struct intel_iommu *iommu)
{
- int ret;
+ unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2;
+ unsigned int sslps = cap_super_page_val(iommu->cap);
+ struct pt_iommu_vtdss_hw_info pt_info;
- if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
- device_is_rmrr_locked(dev)) {
- dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
- return -EPERM;
- }
+ pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info);
+
+ if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu))
+ return -EINVAL;
+ if (dmar_domain->nested_parent && !nested_supported(iommu))
+ return -EINVAL;
- if (is_aux_domain(dev, domain))
- return -EPERM;
+ /* Legacy mode always supports second stage */
+ if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
+ return -EINVAL;
- /* normally dev is not mapped */
- if (unlikely(domain_context_mapped(dev))) {
- struct dmar_domain *old_domain;
+ if (!iommu_paging_structure_coherency(iommu) &&
+ !(dmar_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ return -EINVAL;
- old_domain = find_domain(dev);
- if (old_domain)
- dmar_remove_one_dev_info(dev);
- }
+ /* Address width falls within the capability */
+ if (cap_mgaw(iommu->cap) < vasz_lg2)
+ return -EINVAL;
- ret = prepare_domain_attach_device(domain, dev);
+ /* Page table level is supported. */
+ if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw)))
+ return -EINVAL;
+
+ /* Same page size support */
+ if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M))
+ return -EINVAL;
+ if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G))
+ return -EINVAL;
+
+ /* iotlb sync on map requirement */
+ if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) &&
+ !dmar_domain->iotlb_sync_map)
+ return -EINVAL;
+
+ /*
+ * FIXME this is locked wrong, it needs to be under the
+ * dmar_domain->lock
+ */
+ if ((dmar_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) &&
+ !ecap_sc_support(iommu->ecap))
+ return -EINVAL;
+ return 0;
+}
+
+int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct intel_iommu *iommu = info->iommu;
+ int ret = -EINVAL;
+
+ if (intel_domain_is_fs_paging(dmar_domain))
+ ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
+ else if (intel_domain_is_ss_paging(dmar_domain))
+ ret = paging_domain_compatible_second_stage(dmar_domain, iommu);
+ else if (WARN_ON(true))
+ ret = -EINVAL;
if (ret)
return ret;
- return domain_add_dev_info(to_dmar_domain(domain), dev);
+ if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
+ context_copied(iommu, info->bus, info->devfn))
+ return intel_pasid_setup_sm_context(dev);
+
+ return 0;
}
-static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
- struct device *dev)
+static int intel_iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
int ret;
- if (!is_aux_domain(dev, domain))
- return -EPERM;
+ device_block_translation(dev);
- ret = prepare_domain_attach_device(domain, dev);
+ ret = paging_domain_compatible(domain, dev);
if (ret)
return ret;
- return aux_domain_add_dev(to_dmar_domain(domain), dev);
-}
+ ret = iopf_for_domain_set(domain, dev);
+ if (ret)
+ return ret;
-static void intel_iommu_detach_device(struct iommu_domain *domain,
- struct device *dev)
-{
- dmar_remove_one_dev_info(dev);
-}
+ ret = dmar_domain_attach_device(to_dmar_domain(domain), dev);
+ if (ret)
+ iopf_for_domain_remove(domain, dev);
-static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
- struct device *dev)
-{
- aux_domain_remove_dev(to_dmar_domain(domain), dev);
+ return ret;
}
-#ifdef CONFIG_INTEL_IOMMU_SVM
-/*
- * 2D array for converting and sanitizing IOMMU generic TLB granularity to
- * VT-d granularity. Invalidation is typically included in the unmap operation
- * as a result of DMA or VFIO unmap. However, for assigned devices guest
- * owns the first level page tables. Invalidations of translation caches in the
- * guest are trapped and passed down to the host.
- *
- * vIOMMU in the guest will only expose first level page tables, therefore
- * we do not support IOTLB granularity for request without PASID (second level).
- *
- * For example, to find the VT-d granularity encoding for IOTLB
- * type and page selective granularity within PASID:
- * X: indexed by iommu cache type
- * Y: indexed by enum iommu_inv_granularity
- * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
- */
-
-static const int
-inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
- /*
- * PASID based IOTLB invalidation: PASID selective (per PASID),
- * page selective (address granularity)
- */
- {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
- /* PASID based dev TLBs */
- {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
- /* PASID cache */
- {-EINVAL, -EINVAL, -EINVAL}
-};
-
-static inline int to_vtd_granularity(int type, int granu)
+static void intel_iommu_tlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
{
- return inv_type_granu_table[type][granu];
+ cache_tag_flush_range(to_dmar_domain(domain), gather->start,
+ gather->end,
+ iommu_pages_list_empty(&gather->freelist));
+ iommu_put_pages_list(&gather->freelist);
}
-static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
+static bool domain_support_force_snooping(struct dmar_domain *domain)
{
- u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
+ struct device_domain_info *info;
+ bool support = true;
- /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
- * IOMMU cache invalidate API passes granu_size in bytes, and number of
- * granu size in contiguous memory.
- */
- return order_base_2(nr_pages);
+ assert_spin_locked(&domain->lock);
+ list_for_each_entry(info, &domain->devices, link) {
+ if (!ecap_sc_support(info->iommu->ecap)) {
+ support = false;
+ break;
+ }
+ }
+
+ return support;
}
-static int
-intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
- struct iommu_cache_invalidate_info *inv_info)
+static bool intel_iommu_enforce_cache_coherency_fs(struct iommu_domain *domain)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct device_domain_info *info;
- struct intel_iommu *iommu;
- unsigned long flags;
- int cache_type;
- u8 bus, devfn;
- u16 did, sid;
- int ret = 0;
- u64 size = 0;
- if (!inv_info || !dmar_domain)
- return -EINVAL;
+ guard(spinlock_irqsave)(&dmar_domain->lock);
- if (!dev || !dev_is_pci(dev))
- return -ENODEV;
-
- iommu = device_to_iommu(dev, &bus, &devfn);
- if (!iommu)
- return -ENODEV;
-
- if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
- return -EINVAL;
-
- spin_lock_irqsave(&device_domain_lock, flags);
- spin_lock(&iommu->lock);
- info = get_domain_info(dev);
- if (!info) {
- ret = -EINVAL;
- goto out_unlock;
- }
- did = dmar_domain->iommu_did[iommu->seq_id];
- sid = PCI_DEVID(bus, devfn);
-
- /* Size is only valid in address selective invalidation */
- if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
- size = to_vtd_size(inv_info->granu.addr_info.granule_size,
- inv_info->granu.addr_info.nb_granules);
-
- for_each_set_bit(cache_type,
- (unsigned long *)&inv_info->cache,
- IOMMU_CACHE_INV_TYPE_NR) {
- int granu = 0;
- u64 pasid = 0;
- u64 addr = 0;
-
- granu = to_vtd_granularity(cache_type, inv_info->granularity);
- if (granu == -EINVAL) {
- pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
- cache_type, inv_info->granularity);
- break;
- }
+ if (dmar_domain->force_snooping)
+ return true;
- /*
- * PASID is stored in different locations based on the
- * granularity.
- */
- if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
- (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
- pasid = inv_info->granu.pasid_info.pasid;
- else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
- (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
- pasid = inv_info->granu.addr_info.pasid;
-
- switch (BIT(cache_type)) {
- case IOMMU_CACHE_INV_TYPE_IOTLB:
- /* HW will ignore LSB bits based on address mask */
- if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
- size &&
- (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
- pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
- inv_info->granu.addr_info.addr, size);
- }
+ if (!domain_support_force_snooping(dmar_domain))
+ return false;
- /*
- * If granu is PASID-selective, address is ignored.
- * We use npages = -1 to indicate that.
- */
- qi_flush_piotlb(iommu, did, pasid,
- mm_to_dma_pfn(inv_info->granu.addr_info.addr),
- (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
- inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
+ dmar_domain->force_snooping = true;
+ list_for_each_entry(info, &dmar_domain->devices, link)
+ intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
+ IOMMU_NO_PASID);
+ return true;
+}
- if (!info->ats_enabled)
- break;
- /*
- * Always flush device IOTLB if ATS is enabled. vIOMMU
- * in the guest may assume IOTLB flush is inclusive,
- * which is more efficient.
- */
- fallthrough;
- case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
- /*
- * PASID based device TLB invalidation does not support
- * IOMMU_INV_GRANU_PASID granularity but only supports
- * IOMMU_INV_GRANU_ADDR.
- * The equivalent of that is we set the size to be the
- * entire range of 64 bit. User only provides PASID info
- * without address info. So we set addr to 0.
- */
- if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
- size = 64 - VTD_PAGE_SHIFT;
- addr = 0;
- } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
- addr = inv_info->granu.addr_info.addr;
- }
+static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- if (info->ats_enabled)
- qi_flush_dev_iotlb_pasid(iommu, sid,
- info->pfsid, pasid,
- info->ats_qdep, addr,
- size);
- else
- pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
- break;
- default:
- dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
- cache_type);
- ret = -EINVAL;
- }
- }
-out_unlock:
- spin_unlock(&iommu->lock);
- spin_unlock_irqrestore(&device_domain_lock, flags);
+ guard(spinlock_irqsave)(&dmar_domain->lock);
+ if (!domain_support_force_snooping(dmar_domain))
+ return false;
- return ret;
+ /*
+ * Second level page table supports per-PTE snoop control. The
+ * iommu_map() interface will handle this by setting SNP bit.
+ */
+ dmar_domain->sspt.vtdss_pt.common.features |=
+ BIT(PT_FEAT_VTDSS_FORCE_COHERENCE);
+ dmar_domain->force_snooping = true;
+ return true;
}
-#endif
-static int intel_iommu_map(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t hpa,
- size_t size, int iommu_prot, gfp_t gfp)
+static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- u64 max_addr;
- int prot = 0;
-
- if (iommu_prot & IOMMU_READ)
- prot |= DMA_PTE_READ;
- if (iommu_prot & IOMMU_WRITE)
- prot |= DMA_PTE_WRITE;
- if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
- prot |= DMA_PTE_SNP;
-
- max_addr = iova + size;
- if (dmar_domain->max_addr < max_addr) {
- u64 end;
-
- /* check if minimum agaw is sufficient for mapped address */
- end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
- if (end < max_addr) {
- pr_err("%s: iommu width (%d) is not "
- "sufficient for the mapped address (%llx)\n",
- __func__, dmar_domain->gaw, max_addr);
- return -EFAULT;
- }
- dmar_domain->max_addr = max_addr;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ case IOMMU_CAP_DEFERRED_FLUSH:
+ return true;
+ case IOMMU_CAP_PRE_BOOT_PROTECTION:
+ return dmar_platform_optin();
+ case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
+ return ecap_sc_support(info->iommu->ecap);
+ case IOMMU_CAP_DIRTY_TRACKING:
+ return ssads_supported(info->iommu);
+ default:
+ return false;
}
- /* Round up size to next multiple of PAGE_SIZE, if it and
- the low bits of hpa would take us onto the next page */
- size = aligned_nrpages(hpa, size);
- return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
- hpa >> VTD_PAGE_SHIFT, size, prot);
}
-static size_t intel_iommu_unmap(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- struct iommu_iotlb_gather *gather)
+static struct iommu_device *intel_iommu_probe_device(struct device *dev)
{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long start_pfn, last_pfn;
- int level = 0;
-
- /* Cope with horrid API which requires us to unmap more than the
- size argument if it happens to be a large-page mapping. */
- BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
-
- if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
- size = VTD_PAGE_SIZE << level_to_offset_bits(level);
-
- start_pfn = iova >> VTD_PAGE_SHIFT;
- last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
+ struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
+ struct device_domain_info *info;
+ struct intel_iommu *iommu;
+ u8 bus, devfn;
+ int ret;
- gather->freelist = domain_unmap(dmar_domain, start_pfn,
- last_pfn, gather->freelist);
+ iommu = device_lookup_iommu(dev, &bus, &devfn);
+ if (!iommu || !iommu->iommu.ops)
+ return ERR_PTR(-ENODEV);
- if (dmar_domain->max_addr == iova + size)
- dmar_domain->max_addr = iova;
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
- iommu_iotlb_gather_add_page(domain, gather, iova, size);
+ if (dev_is_real_dma_subdevice(dev)) {
+ info->bus = pdev->bus->number;
+ info->devfn = pdev->devfn;
+ info->segment = pci_domain_nr(pdev->bus);
+ } else {
+ info->bus = bus;
+ info->devfn = devfn;
+ info->segment = iommu->segment;
+ }
- return size;
-}
+ info->dev = dev;
+ info->iommu = iommu;
+ if (dev_is_pci(dev)) {
+ if (ecap_dev_iotlb_support(iommu->ecap) &&
+ pci_ats_supported(pdev) &&
+ dmar_ats_supported(pdev, iommu)) {
+ info->ats_supported = 1;
+ info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
-static void intel_iommu_tlb_sync(struct iommu_domain *domain,
- struct iommu_iotlb_gather *gather)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long iova_pfn = IOVA_PFN(gather->start);
- size_t size = gather->end - gather->start;
- unsigned long start_pfn;
- unsigned long nrpages;
- int iommu_id;
+ /*
+ * For IOMMU that supports device IOTLB throttling
+ * (DIT), we assign PFSID to the invalidation desc
+ * of a VF such that IOMMU HW can gauge queue depth
+ * at PF level. If DIT is not set, PFSID will be
+ * treated as reserved, which should be set to 0.
+ */
+ if (ecap_dit(iommu->ecap))
+ info->pfsid = pci_dev_id(pci_physfn(pdev));
+ info->ats_qdep = pci_ats_queue_depth(pdev);
+ }
+ if (sm_supported(iommu)) {
+ if (pasid_supported(iommu)) {
+ int features = pci_pasid_features(pdev);
- nrpages = aligned_nrpages(gather->start, size);
- start_pfn = mm_to_dma_pfn(iova_pfn);
+ if (features >= 0)
+ info->pasid_supported = features | 1;
+ }
- for_each_domain_iommu(iommu_id, dmar_domain)
- iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
- start_pfn, nrpages, !gather->freelist, 0);
+ if (info->ats_supported && ecap_prs(iommu->ecap) &&
+ ecap_pds(iommu->ecap) && pci_pri_supported(pdev))
+ info->pri_supported = 1;
+ }
+ }
- dma_free_pagelist(gather->freelist);
-}
+ dev_iommu_priv_set(dev, info);
+ if (pdev && pci_ats_supported(pdev)) {
+ pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
+ ret = device_rbtree_insert(iommu, info);
+ if (ret)
+ goto free;
+ }
-static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct dma_pte *pte;
- int level = 0;
- u64 phys = 0;
+ if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
+ ret = intel_pasid_alloc_table(dev);
+ if (ret) {
+ dev_err(dev, "PASID table allocation failed\n");
+ goto clear_rbtree;
+ }
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
- if (pte && dma_pte_present(pte))
- phys = dma_pte_addr(pte) +
- (iova & (BIT_MASK(level_to_offset_bits(level) +
- VTD_PAGE_SHIFT) - 1));
+ if (!context_copied(iommu, info->bus, info->devfn)) {
+ ret = intel_pasid_setup_sm_context(dev);
+ if (ret)
+ goto free_table;
+ }
+ }
- return phys;
-}
+ intel_iommu_debugfs_create_dev(info);
-static bool intel_iommu_capable(enum iommu_cap cap)
-{
- if (cap == IOMMU_CAP_CACHE_COHERENCY)
- return domain_update_iommu_snooping(NULL);
- if (cap == IOMMU_CAP_INTR_REMAP)
- return irq_remapping_enabled == 1;
+ return &iommu->iommu;
+free_table:
+ intel_pasid_free_table(dev);
+clear_rbtree:
+ device_rbtree_remove(info);
+free:
+ kfree(info);
- return false;
+ return ERR_PTR(ret);
}
-static struct iommu_device *intel_iommu_probe_device(struct device *dev)
+static void intel_iommu_probe_finalize(struct device *dev)
{
- struct intel_iommu *iommu;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
- iommu = device_to_iommu(dev, NULL, NULL);
- if (!iommu)
- return ERR_PTR(-ENODEV);
+ /*
+ * The PCIe spec, in its wisdom, declares that the behaviour of the
+ * device is undefined if you enable PASID support after ATS support.
+ * So always enable PASID support on devices which have it, even if
+ * we can't yet know if we're ever going to use it.
+ */
+ if (info->pasid_supported &&
+ !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1))
+ info->pasid_enabled = 1;
- if (translation_pre_enabled(iommu))
- dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
+ if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
+ iommu_enable_pci_ats(info);
+ /* Assign a DEVTLB cache tag to the default domain. */
+ if (info->ats_enabled && info->domain) {
+ u16 did = domain_id_iommu(info->domain, iommu);
- return &iommu->iommu;
+ if (cache_tag_assign(info->domain, did, dev,
+ IOMMU_NO_PASID, CACHE_TAG_DEVTLB))
+ iommu_disable_pci_ats(info);
+ }
+ }
+ iommu_enable_pci_pri(info);
}
static void intel_iommu_release_device(struct device *dev)
{
- struct intel_iommu *iommu;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
- iommu = device_to_iommu(dev, NULL, NULL);
- if (!iommu)
- return;
+ iommu_disable_pci_pri(info);
+ iommu_disable_pci_ats(info);
- dmar_remove_one_dev_info(dev);
+ if (info->pasid_enabled) {
+ pci_disable_pasid(to_pci_dev(dev));
+ info->pasid_enabled = 0;
+ }
- set_dma_ops(dev, NULL);
-}
+ mutex_lock(&iommu->iopf_lock);
+ if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
+ device_rbtree_remove(info);
+ mutex_unlock(&iommu->iopf_lock);
-static void intel_iommu_probe_finalize(struct device *dev)
-{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+ if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
+ !context_copied(iommu, info->bus, info->devfn))
+ intel_pasid_teardown_sm_context(dev);
- if (domain && domain->type == IOMMU_DOMAIN_DMA)
- iommu_setup_dma_ops(dev, 0, U64_MAX);
- else
- set_dma_ops(dev, NULL);
+ intel_pasid_free_table(dev);
+ intel_iommu_debugfs_remove_dev(info);
+ kfree(info);
}
static void intel_iommu_get_resv_regions(struct device *device,
@@ -5177,7 +3388,7 @@ static void intel_iommu_get_resv_regions(struct device *device,
struct device *i_dev;
int i;
- down_read(&dmar_global_lock);
+ rcu_read_lock();
for_each_rmrr_units(rmrr) {
for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
i, i_dev) {
@@ -5195,14 +3406,15 @@ static void intel_iommu_get_resv_regions(struct device *device,
IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
resv = iommu_alloc_resv_region(rmrr->base_address,
- length, prot, type);
+ length, prot, type,
+ GFP_ATOMIC);
if (!resv)
break;
list_add_tail(&resv->list, head);
}
}
- up_read(&dmar_global_lock);
+ rcu_read_unlock();
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
if (dev_is_pci(device)) {
@@ -5210,7 +3422,8 @@ static void intel_iommu_get_resv_regions(struct device *device,
if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
- IOMMU_RESV_DIRECT_RELAXABLE);
+ IOMMU_RESV_DIRECT_RELAXABLE,
+ GFP_KERNEL);
if (reg)
list_add_tail(&reg->list, head);
}
@@ -5219,396 +3432,499 @@ static void intel_iommu_get_resv_regions(struct device *device,
reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
- 0, IOMMU_RESV_MSI);
+ 0, IOMMU_RESV_MSI, GFP_KERNEL);
if (!reg)
return;
list_add_tail(&reg->list, head);
}
-int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
+static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
- struct device_domain_info *info;
- struct context_entry *context;
- struct dmar_domain *domain;
- unsigned long flags;
- u64 ctx_lo;
+ if (dev_is_pci(dev))
+ return pci_device_group(dev);
+ return generic_device_group(dev);
+}
+
+int intel_iommu_enable_iopf(struct device *dev)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
int ret;
- domain = find_domain(dev);
- if (!domain)
- return -EINVAL;
+ if (!info->pri_enabled)
+ return -ENODEV;
- spin_lock_irqsave(&device_domain_lock, flags);
- spin_lock(&iommu->lock);
+ /* pri_enabled is protected by the group mutex. */
+ iommu_group_mutex_assert(dev);
+ if (info->iopf_refcount) {
+ info->iopf_refcount++;
+ return 0;
+ }
- ret = -EINVAL;
- info = get_domain_info(dev);
- if (!info || !info->pasid_supported)
- goto out;
+ ret = iopf_queue_add_device(iommu->iopf_queue, dev);
+ if (ret)
+ return ret;
- context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
- if (WARN_ON(!context))
- goto out;
+ info->iopf_refcount = 1;
- ctx_lo = context[0].lo;
+ return 0;
+}
- if (!(ctx_lo & CONTEXT_PASIDE)) {
- ctx_lo |= CONTEXT_PASIDE;
- context[0].lo = ctx_lo;
- wmb();
- iommu->flush.flush_context(iommu,
- domain->iommu_did[iommu->seq_id],
- PCI_DEVID(info->bus, info->devfn),
- DMA_CCMD_MASK_NOBIT,
- DMA_CCMD_DEVICE_INVL);
- }
+void intel_iommu_disable_iopf(struct device *dev)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
- /* Enable PASID support in the device, if it wasn't already */
- if (!info->pasid_enabled)
- iommu_enable_dev_iotlb(info);
+ if (WARN_ON(!info->pri_enabled || !info->iopf_refcount))
+ return;
- ret = 0;
+ iommu_group_mutex_assert(dev);
+ if (--info->iopf_refcount)
+ return;
- out:
- spin_unlock(&iommu->lock);
- spin_unlock_irqrestore(&device_domain_lock, flags);
+ iopf_queue_remove_device(iommu->iopf_queue, dev);
+}
- return ret;
+static bool intel_iommu_is_attach_deferred(struct device *dev)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+
+ return translation_pre_enabled(info->iommu) && !info->domain;
}
-static struct iommu_group *intel_iommu_device_group(struct device *dev)
+/*
+ * Check that the device does not live on an external facing PCI port that is
+ * marked as untrusted. Such devices should not be able to apply quirks and
+ * thus not be able to bypass the IOMMU restrictions.
+ */
+static bool risky_device(struct pci_dev *pdev)
{
- if (dev_is_pci(dev))
- return pci_device_group(dev);
- return generic_device_group(dev);
+ if (pdev->untrusted) {
+ pci_info(pdev,
+ "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
+ pdev->vendor, pdev->device);
+ pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
+ return true;
+ }
+ return false;
}
-static int intel_iommu_enable_auxd(struct device *dev)
+static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
+ unsigned long iova, size_t size)
{
- struct device_domain_info *info;
- struct intel_iommu *iommu;
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+
+ if (dmar_domain->iotlb_sync_map)
+ cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1);
+
+ return 0;
+}
+
+void domain_remove_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct dev_pasid_info *curr, *dev_pasid = NULL;
+ struct intel_iommu *iommu = info->iommu;
+ struct dmar_domain *dmar_domain;
unsigned long flags;
- int ret;
- iommu = device_to_iommu(dev, NULL, NULL);
- if (!iommu || dmar_disabled)
- return -EINVAL;
+ if (!domain)
+ return;
- if (!sm_supported(iommu) || !pasid_supported(iommu))
- return -EINVAL;
+ /* Identity domain has no meta data for pasid. */
+ if (domain->type == IOMMU_DOMAIN_IDENTITY)
+ return;
- ret = intel_iommu_enable_pasid(iommu, dev);
- if (ret)
- return -ENODEV;
+ dmar_domain = to_dmar_domain(domain);
+ spin_lock_irqsave(&dmar_domain->lock, flags);
+ list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
+ if (curr->dev == dev && curr->pasid == pasid) {
+ list_del(&curr->link_domain);
+ dev_pasid = curr;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&dmar_domain->lock, flags);
+
+ cache_tag_unassign_domain(dmar_domain, dev, pasid);
+ domain_detach_iommu(dmar_domain, iommu);
+ if (!WARN_ON_ONCE(!dev_pasid)) {
+ intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
+ kfree(dev_pasid);
+ }
+}
+
+static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
- spin_lock_irqsave(&device_domain_lock, flags);
- info = get_domain_info(dev);
- info->auxd_enabled = 1;
- spin_unlock_irqrestore(&device_domain_lock, flags);
+ intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
+ iopf_for_domain_remove(old, dev);
+ domain_remove_dev_pasid(old, dev, pasid);
return 0;
}
-static int intel_iommu_disable_auxd(struct device *dev)
+struct dev_pasid_info *
+domain_add_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid)
{
- struct device_domain_info *info;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct intel_iommu *iommu = info->iommu;
+ struct dev_pasid_info *dev_pasid;
unsigned long flags;
+ int ret;
- spin_lock_irqsave(&device_domain_lock, flags);
- info = get_domain_info(dev);
- if (!WARN_ON(!info))
- info->auxd_enabled = 0;
- spin_unlock_irqrestore(&device_domain_lock, flags);
+ dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
+ if (!dev_pasid)
+ return ERR_PTR(-ENOMEM);
- return 0;
+ ret = domain_attach_iommu(dmar_domain, iommu);
+ if (ret)
+ goto out_free;
+
+ ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
+ if (ret)
+ goto out_detach_iommu;
+
+ dev_pasid->dev = dev;
+ dev_pasid->pasid = pasid;
+ spin_lock_irqsave(&dmar_domain->lock, flags);
+ list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
+ spin_unlock_irqrestore(&dmar_domain->lock, flags);
+
+ return dev_pasid;
+out_detach_iommu:
+ domain_detach_iommu(dmar_domain, iommu);
+out_free:
+ kfree(dev_pasid);
+ return ERR_PTR(ret);
}
-static int intel_iommu_enable_sva(struct device *dev)
+static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
{
- struct device_domain_info *info = get_domain_info(dev);
- struct intel_iommu *iommu;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct intel_iommu *iommu = info->iommu;
+ struct dev_pasid_info *dev_pasid;
int ret;
- if (!info || dmar_disabled)
+ if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
return -EINVAL;
- iommu = info->iommu;
- if (!iommu)
+ if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
+ return -EOPNOTSUPP;
+
+ if (domain->dirty_ops)
return -EINVAL;
- if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
- return -ENODEV;
+ if (context_copied(iommu, info->bus, info->devfn))
+ return -EBUSY;
- if (intel_iommu_enable_pasid(iommu, dev))
- return -ENODEV;
+ ret = paging_domain_compatible(domain, dev);
+ if (ret)
+ return ret;
- if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
- return -EINVAL;
+ dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
+ if (IS_ERR(dev_pasid))
+ return PTR_ERR(dev_pasid);
- ret = iopf_queue_add_device(iommu->iopf_queue, dev);
- if (!ret)
- ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
+ ret = iopf_for_domain_replace(domain, old, dev);
+ if (ret)
+ goto out_remove_dev_pasid;
+
+ if (intel_domain_is_fs_paging(dmar_domain))
+ ret = domain_setup_first_level(iommu, dmar_domain,
+ dev, pasid, old);
+ else if (intel_domain_is_ss_paging(dmar_domain))
+ ret = domain_setup_second_level(iommu, dmar_domain,
+ dev, pasid, old);
+ else if (WARN_ON(true))
+ ret = -EINVAL;
+
+ if (ret)
+ goto out_unwind_iopf;
+
+ domain_remove_dev_pasid(old, dev, pasid);
+ intel_iommu_debugfs_create_dev_pasid(dev_pasid);
+
+ return 0;
+
+out_unwind_iopf:
+ iopf_for_domain_replace(old, domain, dev);
+out_remove_dev_pasid:
+ domain_remove_dev_pasid(domain, dev, pasid);
return ret;
}
-static int intel_iommu_disable_sva(struct device *dev)
+static void *intel_iommu_hw_info(struct device *dev, u32 *length,
+ enum iommu_hw_info_type *type)
{
- struct device_domain_info *info = get_domain_info(dev);
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
- int ret;
+ struct iommu_hw_info_vtd *vtd;
- ret = iommu_unregister_device_fault_handler(dev);
- if (!ret)
- ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
+ if (*type != IOMMU_HW_INFO_TYPE_DEFAULT &&
+ *type != IOMMU_HW_INFO_TYPE_INTEL_VTD)
+ return ERR_PTR(-EOPNOTSUPP);
- return ret;
+ vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
+ if (!vtd)
+ return ERR_PTR(-ENOMEM);
+
+ vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
+ vtd->cap_reg = iommu->cap;
+ vtd->ecap_reg = iommu->ecap;
+ *length = sizeof(*vtd);
+ *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
+ return vtd;
}
/*
- * A PCI express designated vendor specific extended capability is defined
- * in the section 3.7 of Intel scalable I/O virtualization technical spec
- * for system software and tools to detect endpoint devices supporting the
- * Intel scalable IO virtualization without host driver dependency.
- *
- * Returns the address of the matching extended capability structure within
- * the device's PCI configuration space or 0 if the device does not support
- * it.
+ * Set dirty tracking for the device list of a domain. The caller must
+ * hold the domain->lock when calling it.
*/
-static int siov_find_pci_dvsec(struct pci_dev *pdev)
+static int device_set_dirty_tracking(struct list_head *devices, bool enable)
{
- int pos;
- u16 vendor, id;
-
- pos = pci_find_next_ext_capability(pdev, 0, 0x23);
- while (pos) {
- pci_read_config_word(pdev, pos + 4, &vendor);
- pci_read_config_word(pdev, pos + 8, &id);
- if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
- return pos;
+ struct device_domain_info *info;
+ int ret = 0;
- pos = pci_find_next_ext_capability(pdev, pos, 0x23);
+ list_for_each_entry(info, devices, link) {
+ ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
+ IOMMU_NO_PASID, enable);
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
-static bool
-intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
+static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
+ bool enable)
{
- struct device_domain_info *info = get_domain_info(dev);
-
- if (feat == IOMMU_DEV_FEAT_AUX) {
- int ret;
-
- if (!dev_is_pci(dev) || dmar_disabled ||
- !scalable_mode_support() || !pasid_mode_support())
- return false;
-
- ret = pci_pasid_features(to_pci_dev(dev));
- if (ret < 0)
- return false;
+ struct dmar_domain *s1_domain;
+ unsigned long flags;
+ int ret;
- return !!siov_find_pci_dvsec(to_pci_dev(dev));
+ spin_lock(&domain->s1_lock);
+ list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
+ spin_lock_irqsave(&s1_domain->lock, flags);
+ ret = device_set_dirty_tracking(&s1_domain->devices, enable);
+ spin_unlock_irqrestore(&s1_domain->lock, flags);
+ if (ret)
+ goto err_unwind;
}
+ spin_unlock(&domain->s1_lock);
+ return 0;
- if (feat == IOMMU_DEV_FEAT_IOPF)
- return info && info->pri_supported;
-
- if (feat == IOMMU_DEV_FEAT_SVA)
- return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
- info->pasid_supported && info->pri_supported &&
- info->ats_supported;
-
- return false;
+err_unwind:
+ list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
+ spin_lock_irqsave(&s1_domain->lock, flags);
+ device_set_dirty_tracking(&s1_domain->devices,
+ domain->dirty_tracking);
+ spin_unlock_irqrestore(&s1_domain->lock, flags);
+ }
+ spin_unlock(&domain->s1_lock);
+ return ret;
}
-static int
-intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
+static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable)
{
- switch (feat) {
- case IOMMU_DEV_FEAT_AUX:
- return intel_iommu_enable_auxd(dev);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ int ret;
- case IOMMU_DEV_FEAT_IOPF:
- return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;
+ spin_lock(&dmar_domain->lock);
+ if (dmar_domain->dirty_tracking == enable)
+ goto out_unlock;
- case IOMMU_DEV_FEAT_SVA:
- return intel_iommu_enable_sva(dev);
+ ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
+ if (ret)
+ goto err_unwind;
- default:
- return -ENODEV;
+ if (dmar_domain->nested_parent) {
+ ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
+ if (ret)
+ goto err_unwind;
}
-}
-static int
-intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
-{
- switch (feat) {
- case IOMMU_DEV_FEAT_AUX:
- return intel_iommu_disable_auxd(dev);
-
- case IOMMU_DEV_FEAT_IOPF:
- return 0;
+ dmar_domain->dirty_tracking = enable;
+out_unlock:
+ spin_unlock(&dmar_domain->lock);
- case IOMMU_DEV_FEAT_SVA:
- return intel_iommu_disable_sva(dev);
+ return 0;
- default:
- return -ENODEV;
- }
+err_unwind:
+ device_set_dirty_tracking(&dmar_domain->devices,
+ dmar_domain->dirty_tracking);
+ spin_unlock(&dmar_domain->lock);
+ return ret;
}
-static bool
-intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
+static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
{
- struct device_domain_info *info = get_domain_info(dev);
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
+ struct context_entry *context;
- if (feat == IOMMU_DEV_FEAT_AUX)
- return scalable_mode_support() && info && info->auxd_enabled;
+ spin_lock(&iommu->lock);
+ context = iommu_context_addr(iommu, bus, devfn, 1);
+ if (!context) {
+ spin_unlock(&iommu->lock);
+ return -ENOMEM;
+ }
- return false;
-}
+ if (context_present(context) && !context_copied(iommu, bus, devfn)) {
+ spin_unlock(&iommu->lock);
+ return 0;
+ }
-static int
-intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ copied_context_tear_down(iommu, context, bus, devfn);
+ context_clear_entry(context);
+ context_set_domain_id(context, FLPT_DEFAULT_DID);
+
+ /*
+ * In pass through mode, AW must be programmed to indicate the largest
+ * AGAW value supported by hardware. And ASR is ignored by hardware.
+ */
+ context_set_address_width(context, iommu->msagaw);
+ context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
+ context_set_fault_enable(context);
+ context_set_present(context);
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(context, sizeof(*context));
+ context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
+ spin_unlock(&iommu->lock);
- return dmar_domain->default_pasid > 0 ?
- dmar_domain->default_pasid : -EINVAL;
+ return 0;
}
-static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
- struct device *dev)
+static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
{
- return attach_deferred(dev);
+ struct device *dev = data;
+
+ return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
}
-static int
-intel_iommu_enable_nesting(struct iommu_domain *domain)
+static int device_setup_pass_through(struct device *dev)
{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long flags;
- int ret = -ENODEV;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
- spin_lock_irqsave(&device_domain_lock, flags);
- if (list_empty(&dmar_domain->devices)) {
- dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
- dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
- ret = 0;
- }
- spin_unlock_irqrestore(&device_domain_lock, flags);
+ if (!dev_is_pci(dev))
+ return context_setup_pass_through(dev, info->bus, info->devfn);
- return ret;
+ return pci_for_each_dma_alias(to_pci_dev(dev),
+ context_setup_pass_through_cb, dev);
}
-/*
- * Check that the device does not live on an external facing PCI port that is
- * marked as untrusted. Such devices should not be able to apply quirks and
- * thus not be able to bypass the IOMMU restrictions.
- */
-static bool risky_device(struct pci_dev *pdev)
+static int identity_domain_attach_dev(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
- if (pdev->untrusted) {
- pci_info(pdev,
- "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
- pdev->vendor, pdev->device);
- pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
- return true;
- }
- return false;
-}
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
+ int ret;
-static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn,
- unsigned long clf_pages)
-{
- struct dma_pte *first_pte = NULL, *pte = NULL;
- unsigned long lvl_pages = 0;
- int level = 0;
+ device_block_translation(dev);
- while (clf_pages > 0) {
- if (!pte) {
- level = 0;
- pte = pfn_to_dma_pte(domain, clf_pfn, &level);
- if (WARN_ON(!pte))
- return;
- first_pte = pte;
- lvl_pages = lvl_to_nr_pages(level);
- }
+ if (dev_is_real_dma_subdevice(dev))
+ return 0;
- if (WARN_ON(!lvl_pages || clf_pages < lvl_pages))
- return;
+ /*
+ * No PRI support with the global identity domain. No need to enable or
+ * disable PRI in this path as the iommu has been put in the blocking
+ * state.
+ */
+ if (sm_supported(iommu))
+ ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
+ else
+ ret = device_setup_pass_through(dev);
- clf_pages -= lvl_pages;
- clf_pfn += lvl_pages;
- pte++;
+ if (!ret)
+ info->domain_attached = true;
- if (!clf_pages || first_pte_in_page(pte) ||
- (level > 1 && clf_pages < lvl_pages)) {
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
- pte = NULL;
- }
- }
+ return ret;
}
-static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
- unsigned long iova, size_t size)
+static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long pages = aligned_nrpages(iova, size);
- unsigned long pfn = iova >> VTD_PAGE_SHIFT;
- struct intel_iommu *iommu;
- int iommu_id;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
+ int ret;
+
+ if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
+ return -EOPNOTSUPP;
- if (!dmar_domain->iommu_coherency)
- clflush_sync_map(dmar_domain, pfn, pages);
+ ret = iopf_for_domain_replace(domain, old, dev);
+ if (ret)
+ return ret;
- for_each_domain_iommu(iommu_id, dmar_domain) {
- iommu = g_iommus[iommu_id];
- __mapping_notify_one(iommu, dmar_domain, pfn, pages);
+ ret = domain_setup_passthrough(iommu, dev, pasid, old);
+ if (ret) {
+ iopf_for_domain_replace(old, domain, dev);
+ return ret;
}
+
+ domain_remove_dev_pasid(old, dev, pasid);
+ return 0;
}
+static struct iommu_domain identity_domain = {
+ .type = IOMMU_DOMAIN_IDENTITY,
+ .ops = &(const struct iommu_domain_ops) {
+ .attach_dev = identity_domain_attach_dev,
+ .set_dev_pasid = identity_domain_set_dev_pasid,
+ },
+};
+
+const struct iommu_domain_ops intel_fs_paging_domain_ops = {
+ IOMMU_PT_DOMAIN_OPS(x86_64),
+ .attach_dev = intel_iommu_attach_device,
+ .set_dev_pasid = intel_iommu_set_dev_pasid,
+ .iotlb_sync_map = intel_iommu_iotlb_sync_map,
+ .flush_iotlb_all = intel_flush_iotlb_all,
+ .iotlb_sync = intel_iommu_tlb_sync,
+ .free = intel_iommu_domain_free,
+ .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs,
+};
+
+const struct iommu_domain_ops intel_ss_paging_domain_ops = {
+ IOMMU_PT_DOMAIN_OPS(vtdss),
+ .attach_dev = intel_iommu_attach_device,
+ .set_dev_pasid = intel_iommu_set_dev_pasid,
+ .iotlb_sync_map = intel_iommu_iotlb_sync_map,
+ .flush_iotlb_all = intel_flush_iotlb_all,
+ .iotlb_sync = intel_iommu_tlb_sync,
+ .free = intel_iommu_domain_free,
+ .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss,
+};
+
const struct iommu_ops intel_iommu_ops = {
+ .blocked_domain = &blocking_domain,
+ .release_domain = &blocking_domain,
+ .identity_domain = &identity_domain,
.capable = intel_iommu_capable,
- .domain_alloc = intel_iommu_domain_alloc,
- .domain_free = intel_iommu_domain_free,
- .enable_nesting = intel_iommu_enable_nesting,
- .attach_dev = intel_iommu_attach_device,
- .detach_dev = intel_iommu_detach_device,
- .aux_attach_dev = intel_iommu_aux_attach_device,
- .aux_detach_dev = intel_iommu_aux_detach_device,
- .aux_get_pasid = intel_iommu_aux_get_pasid,
- .map = intel_iommu_map,
- .iotlb_sync_map = intel_iommu_iotlb_sync_map,
- .unmap = intel_iommu_unmap,
- .flush_iotlb_all = intel_flush_iotlb_all,
- .iotlb_sync = intel_iommu_tlb_sync,
- .iova_to_phys = intel_iommu_iova_to_phys,
+ .hw_info = intel_iommu_hw_info,
+ .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
+ .domain_alloc_sva = intel_svm_domain_alloc,
+ .domain_alloc_nested = intel_iommu_domain_alloc_nested,
.probe_device = intel_iommu_probe_device,
.probe_finalize = intel_iommu_probe_finalize,
.release_device = intel_iommu_release_device,
.get_resv_regions = intel_iommu_get_resv_regions,
- .put_resv_regions = generic_iommu_put_resv_regions,
.device_group = intel_iommu_device_group,
- .dev_has_feat = intel_iommu_dev_has_feat,
- .dev_feat_enabled = intel_iommu_dev_feat_enabled,
- .dev_enable_feat = intel_iommu_dev_enable_feat,
- .dev_disable_feat = intel_iommu_dev_disable_feat,
.is_attach_deferred = intel_iommu_is_attach_deferred,
.def_domain_type = device_def_domain_type,
- .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
-#ifdef CONFIG_INTEL_IOMMU_SVM
- .cache_invalidate = intel_iommu_sva_invalidate,
- .sva_bind_gpasid = intel_svm_bind_gpasid,
- .sva_unbind_gpasid = intel_svm_unbind_gpasid,
- .sva_bind = intel_svm_bind,
- .sva_unbind = intel_svm_unbind,
- .sva_get_pasid = intel_svm_get_pasid,
- .page_response = intel_svm_page_response,
-#endif
+ .page_response = intel_iommu_page_response,
};
static void quirk_iommu_igfx(struct pci_dev *dev)
@@ -5617,7 +3933,7 @@ static void quirk_iommu_igfx(struct pci_dev *dev)
return;
pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
- dmar_map_gfx = 0;
+ disable_igfx_iommu = 1;
}
/* G4x/GM45 integrated gfx dmar support is totally busted. */
@@ -5629,6 +3945,9 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
+/* QM57/QS57 integrated gfx malfunctions with dmar */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx);
+
/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
@@ -5698,15 +4017,14 @@ static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
- dmar_map_gfx = 0;
- } else if (dmar_map_gfx) {
+ disable_igfx_iommu = 1;
+ } else if (!disable_igfx_iommu) {
/* we have to ensure the gfx device is idle before we flush */
pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
- intel_iommu_strict = 1;
- }
+ iommu_set_dma_strict();
+ }
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
@@ -5720,7 +4038,7 @@ static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
ver = (dev->device >> 8) & 0xff;
if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
ver != 0x4e && ver != 0x8a && ver != 0x98 &&
- ver != 0x9a)
+ ver != 0x9a && ver != 0xa7 && ver != 0x7d)
return;
if (risky_device(dev))
@@ -5799,3 +4117,106 @@ static void __init check_tylersburg_isoch(void)
pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
vtisochctrl);
}
+
+/*
+ * Here we deal with a device TLB defect where device may inadvertently issue ATS
+ * invalidation completion before posted writes initiated with translated address
+ * that utilized translations matching the invalidation address range, violating
+ * the invalidation completion ordering.
+ * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
+ * vulnerable to this defect. In other words, any dTLB invalidation initiated not
+ * under the control of the trusted/privileged host device driver must use this
+ * quirk.
+ * Device TLBs are invalidated under the following six conditions:
+ * 1. Device driver does DMA API unmap IOVA
+ * 2. Device driver unbind a PASID from a process, sva_unbind_device()
+ * 3. PASID is torn down, after PASID cache is flushed. e.g. process
+ * exit_mmap() due to crash
+ * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
+ * VM has to free pages that were unmapped
+ * 5. Userspace driver unmaps a DMA buffer
+ * 6. Cache invalidation in vSVA usage (upcoming)
+ *
+ * For #1 and #2, device drivers are responsible for stopping DMA traffic
+ * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
+ * invalidate TLB the same way as normal user unmap which will use this quirk.
+ * The dTLB invalidation after PASID cache flush does not need this quirk.
+ *
+ * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
+ */
+void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
+ unsigned long address, unsigned long mask,
+ u32 pasid, u16 qdep)
+{
+ u16 sid;
+
+ if (likely(!info->dtlb_extra_inval))
+ return;
+
+ sid = PCI_DEVID(info->bus, info->devfn);
+ if (pasid == IOMMU_NO_PASID) {
+ qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
+ qdep, address, mask);
+ } else {
+ qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
+ pasid, qdep, address, mask);
+ }
+}
+
+#define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
+
+/*
+ * Function to submit a command to the enhanced command interface. The
+ * valid enhanced command descriptions are defined in Table 47 of the
+ * VT-d spec. The VT-d hardware implementation may support some but not
+ * all commands, which can be determined by checking the Enhanced
+ * Command Capability Register.
+ *
+ * Return values:
+ * - 0: Command successful without any error;
+ * - Negative: software error value;
+ * - Nonzero positive: failure status code defined in Table 48.
+ */
+int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
+{
+ unsigned long flags;
+ u64 res;
+ int ret;
+
+ if (!cap_ecmds(iommu->cap))
+ return -ENODEV;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flags);
+
+ res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
+ if (res & DMA_ECMD_ECRSP_IP) {
+ ret = -EBUSY;
+ goto err;
+ }
+
+ /*
+ * Unconditionally write the operand B, because
+ * - There is no side effect if an ecmd doesn't require an
+ * operand B, but we set the register to some value.
+ * - It's not invoked in any critical path. The extra MMIO
+ * write doesn't bring any performance concerns.
+ */
+ dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
+ dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
+
+ IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
+ !(res & DMA_ECMD_ECRSP_IP), res);
+
+ if (res & DMA_ECMD_ECRSP_IP) {
+ ret = -ETIMEDOUT;
+ goto err;
+ }
+
+ ret = ecmd_get_status_code(res);
+err:
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+ return ret;
+}
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");