Diffstat (limited to 'drivers/iommu/intel')
| -rw-r--r-- | drivers/iommu/intel/Kconfig | 9 |
| -rw-r--r-- | drivers/iommu/intel/Makefile | 7 |
| -rw-r--r-- | drivers/iommu/intel/cache.c | 285 |
| -rw-r--r-- | drivers/iommu/intel/cap_audit.c | 217 |
| -rw-r--r-- | drivers/iommu/intel/cap_audit.h | 131 |
| -rw-r--r-- | drivers/iommu/intel/debugfs.c | 29 |
| -rw-r--r-- | drivers/iommu/intel/dmar.c | 144 |
| -rw-r--r-- | drivers/iommu/intel/iommu.c | 2495 |
| -rw-r--r-- | drivers/iommu/intel/iommu.h | 407 |
| -rw-r--r-- | drivers/iommu/intel/irq_remapping.c | 149 |
| -rw-r--r-- | drivers/iommu/intel/nested.c | 96 |
| -rw-r--r-- | drivers/iommu/intel/pasid.c | 558 |
| -rw-r--r-- | drivers/iommu/intel/pasid.h | 45 |
| -rw-r--r-- | drivers/iommu/intel/perf.c | 10 |
| -rw-r--r-- | drivers/iommu/intel/perf.h | 5 |
| -rw-r--r-- | drivers/iommu/intel/perfmon.c | 111 |
| -rw-r--r-- | drivers/iommu/intel/prq.c | 396 |
| -rw-r--r-- | drivers/iommu/intel/svm.c | 480 |
| -rw-r--r-- | drivers/iommu/intel/trace.h | 5 |
19 files changed, 2493 insertions, 3086 deletions
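For orientation before the diff body: the cache.c changes below replace one-at-a-time qi_submit_sync() calls with a per-domain batch of invalidation descriptors (struct qi_batch) that is flushed when the fixed-size buffer fills or when the cache-tag walk moves on to a different IOMMU; cache tags of the same IOMMU are linked adjacently so a single pass can batch per unit. The following user-space sketch only models that pattern; the struct layout, the batch size, and the submit_sync() helper are simplified stand-ins, not the kernel's definitions.

/*
 * Illustrative sketch of the descriptor-batching pattern added in cache.c
 * (struct qi_batch, qi_batch_increment_index(), qi_batch_flush_descs()).
 * All names below are simplified stand-ins for the real kernel symbols.
 */
#include <stdio.h>
#include <string.h>

#define MAX_BATCHED_DESC_COUNT 16	/* stand-in for QI_MAX_BATCHED_DESC_COUNT */

struct desc { unsigned long long qw0, qw1; };	/* stand-in for struct qi_desc */

struct batch {
	struct desc descs[MAX_BATCHED_DESC_COUNT];
	unsigned int index;
};

/* Stand-in for qi_submit_sync(): submit all queued descriptors at once. */
static void submit_sync(int iommu_id, struct desc *descs, unsigned int count)
{
	printf("iommu%d: submitting %u descriptor(s)\n", iommu_id, count);
}

static void batch_flush(int iommu_id, struct batch *batch)
{
	if (!batch->index)
		return;
	submit_sync(iommu_id, batch->descs, batch->index);
	/* Reset the index value and clean the whole batch buffer. */
	memset(batch, 0, sizeof(*batch));
}

static void batch_add(int iommu_id, struct batch *batch, struct desc d)
{
	batch->descs[batch->index] = d;
	/* Submit as soon as the fixed-size buffer is full. */
	if (++batch->index == MAX_BATCHED_DESC_COUNT)
		batch_flush(iommu_id, batch);
}

int main(void)
{
	struct batch batch = { 0 };
	/* Cache tags sorted so tags belonging to the same IOMMU are adjacent. */
	int tag_iommu[] = { 0, 0, 0, 1, 1 };
	int cur = -1;

	for (unsigned int i = 0; i < 5; i++) {
		/* Flush the pending batch when the walk moves to another IOMMU. */
		if (cur >= 0 && cur != tag_iommu[i])
			batch_flush(cur, &batch);
		cur = tag_iommu[i];
		batch_add(cur, &batch, (struct desc){ .qw0 = i });
	}
	batch_flush(cur, &batch);	/* Final flush after the walk completes. */
	return 0;
}

The same two triggers appear in the real code paths below: qi_batch_increment_index() flushes on a full buffer, and cache_tag_flush_range()/cache_tag_flush_range_np() flush whenever the tag list walk crosses an IOMMU boundary and once more before dropping the cache lock.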
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index f52fb39c968e..5471f814e073 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -12,9 +12,13 @@ config DMAR_DEBUG config INTEL_IOMMU bool "Support for Intel IOMMU using DMA Remapping Devices" depends on PCI_MSI && ACPI && X86 - select DMA_OPS select IOMMU_API + select GENERIC_PT + select IOMMU_PT + select IOMMU_PT_X86_64 + select IOMMU_PT_VTDSS select IOMMU_IOVA + select IOMMU_IOPF select IOMMUFD_DRIVER if IOMMUFD select NEED_DMA_MAP_STATE select DMAR_TABLE @@ -51,7 +55,6 @@ config INTEL_IOMMU_SVM depends on X86_64 select MMU_NOTIFIER select IOMMU_SVA - select IOMMU_IOPF help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address space by @@ -67,7 +70,7 @@ config INTEL_IOMMU_DEFAULT_ON config INTEL_IOMMU_FLOPPY_WA def_bool y - depends on X86 + depends on X86 && BLK_DEV_FD help Floppy disk drivers are known to bypass DMA API calls thereby failing to work when IOMMU is enabled. This diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index c8beb0281559..ada651c4a01b 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,11 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o cache.o -obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o +obj-y += iommu.o pasid.o nested.o cache.o prq.o +obj-$(CONFIG_DMAR_TABLE) += dmar.o trace.o obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o -ifdef CONFIG_INTEL_IOMMU obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o -endif obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index e8418cdd8331..265e7290256b 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -40,13 +40,13 @@ static bool cache_tage_match(struct cache_tag *tag, u16 domain_id, } /* Assign a cache tag with specified type to domain. */ -static int cache_tag_assign(struct dmar_domain *domain, u16 did, - struct device *dev, ioasid_t pasid, - enum cache_tag_type type) +int cache_tag_assign(struct dmar_domain *domain, u16 did, struct device *dev, + ioasid_t pasid, enum cache_tag_type type) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct cache_tag *tag, *temp; + struct list_head *prev; unsigned long flags; tag = kzalloc(sizeof(*tag), GFP_KERNEL); @@ -65,6 +65,7 @@ static int cache_tag_assign(struct dmar_domain *domain, u16 did, tag->dev = iommu->iommu.dev; spin_lock_irqsave(&domain->cache_lock, flags); + prev = &domain->cache_tags; list_for_each_entry(temp, &domain->cache_tags, node) { if (cache_tage_match(temp, did, iommu, dev, pasid, type)) { temp->users++; @@ -73,8 +74,15 @@ static int cache_tag_assign(struct dmar_domain *domain, u16 did, trace_cache_tag_assign(temp); return 0; } + if (temp->iommu == iommu) + prev = &temp->node; } - list_add_tail(&tag->node, &domain->cache_tags); + /* + * Link cache tags of same iommu unit together, so corresponding + * flush ops can be batched for iommu unit. 
+ */ + list_add(&tag->node, prev); + spin_unlock_irqrestore(&domain->cache_lock, flags); trace_cache_tag_assign(tag); @@ -105,12 +113,35 @@ static void cache_tag_unassign(struct dmar_domain *domain, u16 did, spin_unlock_irqrestore(&domain->cache_lock, flags); } +/* domain->qi_batch will be freed in iommu_free_domain() path. */ +static int domain_qi_batch_alloc(struct dmar_domain *domain) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&domain->cache_lock, flags); + if (domain->qi_batch) + goto out_unlock; + + domain->qi_batch = kzalloc(sizeof(*domain->qi_batch), GFP_ATOMIC); + if (!domain->qi_batch) + ret = -ENOMEM; +out_unlock: + spin_unlock_irqrestore(&domain->cache_lock, flags); + + return ret; +} + static int __cache_tag_assign_domain(struct dmar_domain *domain, u16 did, struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); int ret; + ret = domain_qi_batch_alloc(domain); + if (ret) + return ret; + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_IOTLB); if (ret || !info->ats_enabled) return ret; @@ -139,6 +170,10 @@ static int __cache_tag_assign_parent_domain(struct dmar_domain *domain, u16 did, struct device_domain_info *info = dev_iommu_priv_get(dev); int ret; + ret = domain_qi_batch_alloc(domain); + if (ret) + return ret; + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); if (ret || !info->ats_enabled) return ret; @@ -245,7 +280,8 @@ static unsigned long calculate_psi_aligned_address(unsigned long start, * shared_bits are all equal in both pfn and end_pfn. */ shared_bits = ~(pfn ^ end_pfn) & ~bitmask; - mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; + mask = shared_bits ? __ffs(shared_bits) : MAX_AGAW_PFN_WIDTH; + aligned_pages = 1UL << mask; } *_pages = aligned_pages; @@ -254,6 +290,138 @@ static unsigned long calculate_psi_aligned_address(unsigned long start, return ALIGN_DOWN(start, VTD_PAGE_SIZE << mask); } +static void qi_batch_flush_descs(struct intel_iommu *iommu, struct qi_batch *batch) +{ + if (!iommu || !batch->index) + return; + + qi_submit_sync(iommu, batch->descs, batch->index, 0); + + /* Reset the index value and clean the whole batch buffer. */ + memset(batch, 0, sizeof(*batch)); +} + +static void qi_batch_increment_index(struct intel_iommu *iommu, struct qi_batch *batch) +{ + if (++batch->index == QI_MAX_BATCHED_DESC_COUNT) + qi_batch_flush_descs(iommu, batch); +} + +static void qi_batch_add_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type, + struct qi_batch *batch) +{ + qi_desc_iotlb(iommu, did, addr, size_order, type, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u16 qdep, u64 addr, unsigned int mask, + struct qi_batch *batch) +{ + /* + * According to VT-d spec, software is recommended to not submit any Device-TLB + * invalidation requests while address remapping hardware is disabled. + */ + if (!(iommu->gcmd & DMA_GCMD_TE)) + return; + + qi_desc_dev_iotlb(sid, pfsid, qdep, addr, mask, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, + u64 addr, unsigned long npages, bool ih, + struct qi_batch *batch) +{ + /* + * npages == -1 means a PASID-selective invalidation, otherwise, + * a positive value for Page-selective-within-PASID invalidation. + * 0 is not a valid input. 
+ */ + if (!npages) + return; + + qi_desc_piotlb(did, pasid, addr, npages, ih, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_pasid_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, struct qi_batch *batch) +{ + /* + * According to VT-d spec, software is recommended to not submit any + * Device-TLB invalidation requests while address remapping hardware + * is disabled. + */ + if (!(iommu->gcmd & DMA_GCMD_TE)) + return; + + qi_desc_dev_iotlb_pasid(sid, pfsid, pasid, qdep, addr, size_order, + &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag *tag, + unsigned long addr, unsigned long pages, + unsigned long mask, int ih) +{ + struct intel_iommu *iommu = tag->iommu; + u64 type = DMA_TLB_PSI_FLUSH; + + if (intel_domain_is_fs_paging(domain)) { + qi_batch_add_piotlb(iommu, tag->domain_id, tag->pasid, addr, + pages, ih, domain->qi_batch); + return; + } + + /* + * Fallback to domain selective flush if no PSI support or the size + * is too big. + */ + if (!cap_pgsel_inv(iommu->cap) || + mask > cap_max_amask_val(iommu->cap) || pages == -1) { + addr = 0; + mask = 0; + ih = 0; + type = DMA_TLB_DSI_FLUSH; + } + + if (ecap_qis(iommu->ecap)) + qi_batch_add_iotlb(iommu, tag->domain_id, addr | ih, mask, type, + domain->qi_batch); + else + __iommu_flush_iotlb(iommu, tag->domain_id, addr | ih, mask, type); +} + +static void cache_tag_flush_devtlb_psi(struct dmar_domain *domain, struct cache_tag *tag, + unsigned long addr, unsigned long mask) +{ + struct intel_iommu *iommu = tag->iommu; + struct device_domain_info *info; + u16 sid; + + info = dev_iommu_priv_get(tag->dev); + sid = PCI_DEVID(info->bus, info->devfn); + + if (tag->pasid == IOMMU_NO_PASID) { + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, + addr, mask, domain->qi_batch); + if (info->dtlb_extra_inval) + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, + addr, mask, domain->qi_batch); + return; + } + + qi_batch_add_pasid_dev_iotlb(iommu, sid, info->pfsid, tag->pasid, + info->ats_qdep, addr, mask, domain->qi_batch); + if (info->dtlb_extra_inval) + qi_batch_add_pasid_dev_iotlb(iommu, sid, info->pfsid, tag->pasid, + info->ats_qdep, addr, mask, + domain->qi_batch); +} + /* * Invalidates a range of IOVA from @start (inclusive) to @end (inclusive) * when the memory mappings in the target domain have been modified. 
@@ -261,38 +429,29 @@ static unsigned long calculate_psi_aligned_address(unsigned long start, void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, unsigned long end, int ih) { + struct intel_iommu *iommu = NULL; unsigned long pages, mask, addr; struct cache_tag *tag; unsigned long flags; - addr = calculate_psi_aligned_address(start, end, &pages, &mask); + if (start == 0 && end == ULONG_MAX) { + addr = 0; + pages = -1; + mask = MAX_AGAW_PFN_WIDTH; + } else { + addr = calculate_psi_aligned_address(start, end, &pages, &mask); + } spin_lock_irqsave(&domain->cache_lock, flags); list_for_each_entry(tag, &domain->cache_tags, node) { - struct intel_iommu *iommu = tag->iommu; - struct device_domain_info *info; - u16 sid; + if (iommu && iommu != tag->iommu) + qi_batch_flush_descs(iommu, domain->qi_batch); + iommu = tag->iommu; switch (tag->type) { case CACHE_TAG_IOTLB: case CACHE_TAG_NESTING_IOTLB: - if (domain->use_first_level) { - qi_flush_piotlb(iommu, tag->domain_id, - tag->pasid, addr, pages, ih); - } else { - /* - * Fallback to domain selective flush if no - * PSI support or the size is too big. - */ - if (!cap_pgsel_inv(iommu->cap) || - mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, tag->domain_id, - 0, 0, DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, tag->domain_id, - addr | ih, mask, - DMA_TLB_PSI_FLUSH); - } + cache_tag_flush_iotlb(domain, tag, addr, pages, mask, ih); break; case CACHE_TAG_NESTING_DEVTLB: /* @@ -306,23 +465,13 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, mask = MAX_AGAW_PFN_WIDTH; fallthrough; case CACHE_TAG_DEVTLB: - info = dev_iommu_priv_get(tag->dev); - sid = PCI_DEVID(info->bus, info->devfn); - - if (tag->pasid == IOMMU_NO_PASID) - qi_flush_dev_iotlb(iommu, sid, info->pfsid, - info->ats_qdep, addr, mask); - else - qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid, - tag->pasid, info->ats_qdep, - addr, mask); - - quirk_extra_dev_tlb_flush(info, addr, mask, tag->pasid, info->ats_qdep); + cache_tag_flush_devtlb_psi(domain, tag, addr, mask); break; } trace_cache_tag_flush_range(tag, start, end, addr, pages, mask); } + qi_batch_flush_descs(iommu, domain->qi_batch); spin_unlock_irqrestore(&domain->cache_lock, flags); } @@ -332,40 +481,7 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, */ void cache_tag_flush_all(struct dmar_domain *domain) { - struct cache_tag *tag; - unsigned long flags; - - spin_lock_irqsave(&domain->cache_lock, flags); - list_for_each_entry(tag, &domain->cache_tags, node) { - struct intel_iommu *iommu = tag->iommu; - struct device_domain_info *info; - u16 sid; - - switch (tag->type) { - case CACHE_TAG_IOTLB: - case CACHE_TAG_NESTING_IOTLB: - if (domain->use_first_level) - qi_flush_piotlb(iommu, tag->domain_id, - tag->pasid, 0, -1, 0); - else - iommu->flush.flush_iotlb(iommu, tag->domain_id, - 0, 0, DMA_TLB_DSI_FLUSH); - break; - case CACHE_TAG_DEVTLB: - case CACHE_TAG_NESTING_DEVTLB: - info = dev_iommu_priv_get(tag->dev); - sid = PCI_DEVID(info->bus, info->devfn); - - qi_flush_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, - 0, MAX_AGAW_PFN_WIDTH); - quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH, - IOMMU_NO_PASID, info->ats_qdep); - break; - } - - trace_cache_tag_flush_all(tag); - } - spin_unlock_irqrestore(&domain->cache_lock, flags); + cache_tag_flush_range(domain, 0, ULONG_MAX, 0); } /* @@ -382,6 +498,7 @@ void cache_tag_flush_all(struct dmar_domain *domain) void cache_tag_flush_range_np(struct dmar_domain *domain, 
unsigned long start, unsigned long end) { + struct intel_iommu *iommu = NULL; unsigned long pages, mask, addr; struct cache_tag *tag; unsigned long flags; @@ -390,30 +507,22 @@ void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, spin_lock_irqsave(&domain->cache_lock, flags); list_for_each_entry(tag, &domain->cache_tags, node) { - struct intel_iommu *iommu = tag->iommu; + if (iommu && iommu != tag->iommu) + qi_batch_flush_descs(iommu, domain->qi_batch); + iommu = tag->iommu; - if (!cap_caching_mode(iommu->cap) || domain->use_first_level) { + if (!cap_caching_mode(iommu->cap) || + intel_domain_is_fs_paging(domain)) { iommu_flush_write_buffer(iommu); continue; } if (tag->type == CACHE_TAG_IOTLB || - tag->type == CACHE_TAG_NESTING_IOTLB) { - /* - * Fallback to domain selective flush if no - * PSI support or the size is too big. - */ - if (!cap_pgsel_inv(iommu->cap) || - mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, tag->domain_id, - 0, 0, DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, tag->domain_id, - addr, mask, - DMA_TLB_PSI_FLUSH); - } + tag->type == CACHE_TAG_NESTING_IOTLB) + cache_tag_flush_iotlb(domain, tag, addr, pages, mask, 0); trace_cache_tag_flush_range_np(tag, start, end, addr, pages, mask); } + qi_batch_flush_descs(iommu, domain->qi_batch); spin_unlock_irqrestore(&domain->cache_lock, flags); } diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c deleted file mode 100644 index 9862dc20b35e..000000000000 --- a/drivers/iommu/intel/cap_audit.c +++ /dev/null @@ -1,217 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * cap_audit.c - audit iommu capabilities for boot time and hot plug - * - * Copyright (C) 2021 Intel Corporation - * - * Author: Kyung Min Park <kyung.min.park@intel.com> - * Lu Baolu <baolu.lu@linux.intel.com> - */ - -#define pr_fmt(fmt) "DMAR: " fmt - -#include "iommu.h" -#include "cap_audit.h" - -static u64 intel_iommu_cap_sanity; -static u64 intel_iommu_ecap_sanity; - -static inline void check_irq_capabilities(struct intel_iommu *a, - struct intel_iommu *b) -{ - CHECK_FEATURE_MISMATCH(a, b, cap, pi_support, CAP_PI_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, eim_support, ECAP_EIM_MASK); -} - -static inline void check_dmar_capabilities(struct intel_iommu *a, - struct intel_iommu *b) -{ - MINIMAL_FEATURE_IOMMU(b, cap, CAP_MAMV_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_NFR_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_SLLPS_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_FRO_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_MGAW_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_SAGAW_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_NDOMS_MASK); - MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_PSS_MASK); - MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_MHMV_MASK); - MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_IRO_MASK); - - CHECK_FEATURE_MISMATCH(a, b, cap, fl5lp_support, CAP_FL5LP_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, fl1gp_support, CAP_FL1GP_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, read_drain, CAP_RD_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, write_drain, CAP_WD_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, pgsel_inv, CAP_PSI_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, zlr, CAP_ZLR_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, caching_mode, CAP_CM_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, phmr, CAP_PHMR_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, plmr, CAP_PLMR_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, rwbf, CAP_RWBF_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, afl, CAP_AFL_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, rps, 
ECAP_RPS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, smpwc, ECAP_SMPWC_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, flts, ECAP_FLTS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, slts, ECAP_SLTS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, nwfs, ECAP_NWFS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, slads, ECAP_SLADS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, smts, ECAP_SMTS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, pds, ECAP_PDS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, dit, ECAP_DIT_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, pasid, ECAP_PASID_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, eafs, ECAP_EAFS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, srs, ECAP_SRS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, ers, ECAP_ERS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, prs, ECAP_PRS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, nest, ECAP_NEST_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, mts, ECAP_MTS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, sc_support, ECAP_SC_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, pass_through, ECAP_PT_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, dev_iotlb_support, ECAP_DT_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, qis, ECAP_QI_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, coherent, ECAP_C_MASK); -} - -static int cap_audit_hotplug(struct intel_iommu *iommu, enum cap_audit_type type) -{ - bool mismatch = false; - u64 old_cap = intel_iommu_cap_sanity; - u64 old_ecap = intel_iommu_ecap_sanity; - - if (type == CAP_AUDIT_HOTPLUG_IRQR) { - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pi_support, CAP_PI_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eim_support, ECAP_EIM_MASK); - goto out; - } - - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, fl5lp_support, CAP_FL5LP_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, fl1gp_support, CAP_FL1GP_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, read_drain, CAP_RD_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, write_drain, CAP_WD_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pgsel_inv, CAP_PSI_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, zlr, CAP_ZLR_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, caching_mode, CAP_CM_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, phmr, CAP_PHMR_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, plmr, CAP_PLMR_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, rwbf, CAP_RWBF_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, afl, CAP_AFL_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, rps, ECAP_RPS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smpwc, ECAP_SMPWC_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, flts, ECAP_FLTS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slts, ECAP_SLTS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nwfs, ECAP_NWFS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slads, ECAP_SLADS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smts, ECAP_SMTS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pds, ECAP_PDS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dit, ECAP_DIT_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pasid, ECAP_PASID_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eafs, ECAP_EAFS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, srs, ECAP_SRS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, ers, ECAP_ERS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, prs, ECAP_PRS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nest, ECAP_NEST_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, mts, ECAP_MTS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, 
sc_support, ECAP_SC_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pass_through, ECAP_PT_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dev_iotlb_support, ECAP_DT_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, qis, ECAP_QI_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, coherent, ECAP_C_MASK); - - /* Abort hot plug if the hot plug iommu feature is smaller than global */ - MINIMAL_FEATURE_HOTPLUG(iommu, cap, max_amask_val, CAP_MAMV_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, num_fault_regs, CAP_NFR_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, super_page_val, CAP_SLLPS_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, fault_reg_offset, CAP_FRO_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, mgaw, CAP_MGAW_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, sagaw, CAP_SAGAW_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, ndoms, CAP_NDOMS_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, ecap, pss, ECAP_PSS_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, ecap, max_handle_mask, ECAP_MHMV_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, ecap, iotlb_offset, ECAP_IRO_MASK, mismatch); - -out: - if (mismatch) { - intel_iommu_cap_sanity = old_cap; - intel_iommu_ecap_sanity = old_ecap; - return -EFAULT; - } - - return 0; -} - -static int cap_audit_static(struct intel_iommu *iommu, enum cap_audit_type type) -{ - struct dmar_drhd_unit *d; - struct intel_iommu *i; - int rc = 0; - - rcu_read_lock(); - if (list_empty(&dmar_drhd_units)) - goto out; - - for_each_active_iommu(i, d) { - if (!iommu) { - intel_iommu_ecap_sanity = i->ecap; - intel_iommu_cap_sanity = i->cap; - iommu = i; - continue; - } - - if (type == CAP_AUDIT_STATIC_DMAR) - check_dmar_capabilities(iommu, i); - else - check_irq_capabilities(iommu, i); - } - - /* - * If the system is sane to support scalable mode, either SL or FL - * should be sane. 
- */ - if (intel_cap_smts_sanity() && - !intel_cap_flts_sanity() && !intel_cap_slts_sanity()) - rc = -EOPNOTSUPP; - -out: - rcu_read_unlock(); - return rc; -} - -int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu) -{ - switch (type) { - case CAP_AUDIT_STATIC_DMAR: - case CAP_AUDIT_STATIC_IRQR: - return cap_audit_static(iommu, type); - case CAP_AUDIT_HOTPLUG_DMAR: - case CAP_AUDIT_HOTPLUG_IRQR: - return cap_audit_hotplug(iommu, type); - default: - break; - } - - return -EFAULT; -} - -bool intel_cap_smts_sanity(void) -{ - return ecap_smts(intel_iommu_ecap_sanity); -} - -bool intel_cap_pasid_sanity(void) -{ - return ecap_pasid(intel_iommu_ecap_sanity); -} - -bool intel_cap_nest_sanity(void) -{ - return ecap_nest(intel_iommu_ecap_sanity); -} - -bool intel_cap_flts_sanity(void) -{ - return ecap_flts(intel_iommu_ecap_sanity); -} - -bool intel_cap_slts_sanity(void) -{ - return ecap_slts(intel_iommu_ecap_sanity); -} diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h deleted file mode 100644 index d07b75938961..000000000000 --- a/drivers/iommu/intel/cap_audit.h +++ /dev/null @@ -1,131 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * cap_audit.h - audit iommu capabilities header - * - * Copyright (C) 2021 Intel Corporation - * - * Author: Kyung Min Park <kyung.min.park@intel.com> - */ - -/* - * Capability Register Mask - */ -#define CAP_FL5LP_MASK BIT_ULL(60) -#define CAP_PI_MASK BIT_ULL(59) -#define CAP_FL1GP_MASK BIT_ULL(56) -#define CAP_RD_MASK BIT_ULL(55) -#define CAP_WD_MASK BIT_ULL(54) -#define CAP_MAMV_MASK GENMASK_ULL(53, 48) -#define CAP_NFR_MASK GENMASK_ULL(47, 40) -#define CAP_PSI_MASK BIT_ULL(39) -#define CAP_SLLPS_MASK GENMASK_ULL(37, 34) -#define CAP_FRO_MASK GENMASK_ULL(33, 24) -#define CAP_ZLR_MASK BIT_ULL(22) -#define CAP_MGAW_MASK GENMASK_ULL(21, 16) -#define CAP_SAGAW_MASK GENMASK_ULL(12, 8) -#define CAP_CM_MASK BIT_ULL(7) -#define CAP_PHMR_MASK BIT_ULL(6) -#define CAP_PLMR_MASK BIT_ULL(5) -#define CAP_RWBF_MASK BIT_ULL(4) -#define CAP_AFL_MASK BIT_ULL(3) -#define CAP_NDOMS_MASK GENMASK_ULL(2, 0) - -/* - * Extended Capability Register Mask - */ -#define ECAP_RPS_MASK BIT_ULL(49) -#define ECAP_SMPWC_MASK BIT_ULL(48) -#define ECAP_FLTS_MASK BIT_ULL(47) -#define ECAP_SLTS_MASK BIT_ULL(46) -#define ECAP_SLADS_MASK BIT_ULL(45) -#define ECAP_VCS_MASK BIT_ULL(44) -#define ECAP_SMTS_MASK BIT_ULL(43) -#define ECAP_PDS_MASK BIT_ULL(42) -#define ECAP_DIT_MASK BIT_ULL(41) -#define ECAP_PASID_MASK BIT_ULL(40) -#define ECAP_PSS_MASK GENMASK_ULL(39, 35) -#define ECAP_EAFS_MASK BIT_ULL(34) -#define ECAP_NWFS_MASK BIT_ULL(33) -#define ECAP_SRS_MASK BIT_ULL(31) -#define ECAP_ERS_MASK BIT_ULL(30) -#define ECAP_PRS_MASK BIT_ULL(29) -#define ECAP_NEST_MASK BIT_ULL(26) -#define ECAP_MTS_MASK BIT_ULL(25) -#define ECAP_MHMV_MASK GENMASK_ULL(23, 20) -#define ECAP_IRO_MASK GENMASK_ULL(17, 8) -#define ECAP_SC_MASK BIT_ULL(7) -#define ECAP_PT_MASK BIT_ULL(6) -#define ECAP_EIM_MASK BIT_ULL(4) -#define ECAP_DT_MASK BIT_ULL(2) -#define ECAP_QI_MASK BIT_ULL(1) -#define ECAP_C_MASK BIT_ULL(0) - -/* - * u64 intel_iommu_cap_sanity, intel_iommu_ecap_sanity will be adjusted as each - * IOMMU gets audited. 
- */ -#define DO_CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \ -do { \ - if (cap##_##feature(a) != cap##_##feature(b)) { \ - intel_iommu_##cap##_sanity &= ~(MASK); \ - pr_info("IOMMU feature %s inconsistent", #feature); \ - } \ -} while (0) - -#define CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \ - DO_CHECK_FEATURE_MISMATCH((a)->cap, (b)->cap, cap, feature, MASK) - -#define CHECK_FEATURE_MISMATCH_HOTPLUG(b, cap, feature, MASK) \ -do { \ - if (cap##_##feature(intel_iommu_##cap##_sanity)) \ - DO_CHECK_FEATURE_MISMATCH(intel_iommu_##cap##_sanity, \ - (b)->cap, cap, feature, MASK); \ -} while (0) - -#define MINIMAL_FEATURE_IOMMU(iommu, cap, MASK) \ -do { \ - u64 min_feature = intel_iommu_##cap##_sanity & (MASK); \ - min_feature = min_t(u64, min_feature, (iommu)->cap & (MASK)); \ - intel_iommu_##cap##_sanity = (intel_iommu_##cap##_sanity & ~(MASK)) | \ - min_feature; \ -} while (0) - -#define MINIMAL_FEATURE_HOTPLUG(iommu, cap, feature, MASK, mismatch) \ -do { \ - if ((intel_iommu_##cap##_sanity & (MASK)) > \ - (cap##_##feature((iommu)->cap))) \ - mismatch = true; \ - else \ - (iommu)->cap = ((iommu)->cap & ~(MASK)) | \ - (intel_iommu_##cap##_sanity & (MASK)); \ -} while (0) - -enum cap_audit_type { - CAP_AUDIT_STATIC_DMAR, - CAP_AUDIT_STATIC_IRQR, - CAP_AUDIT_HOTPLUG_DMAR, - CAP_AUDIT_HOTPLUG_IRQR, -}; - -bool intel_cap_smts_sanity(void); -bool intel_cap_pasid_sanity(void); -bool intel_cap_nest_sanity(void); -bool intel_cap_flts_sanity(void); -bool intel_cap_slts_sanity(void); - -static inline bool scalable_mode_support(void) -{ - return (intel_iommu_sm && intel_cap_smts_sanity()); -} - -static inline bool pasid_mode_support(void) -{ - return scalable_mode_support() && intel_cap_pasid_sanity(); -} - -static inline bool nested_mode_support(void) -{ - return scalable_mode_support() && intel_cap_nest_sanity(); -} - -int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index affbf4a1558d..617fd81a80f0 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -62,8 +62,6 @@ static const struct iommu_regset iommu_regs_64[] = { IOMMU_REGSET_ENTRY(CAP), IOMMU_REGSET_ENTRY(ECAP), IOMMU_REGSET_ENTRY(RTADDR), - IOMMU_REGSET_ENTRY(CCMD), - IOMMU_REGSET_ENTRY(AFLOG), IOMMU_REGSET_ENTRY(PHMBASE), IOMMU_REGSET_ENTRY(PHMLIMIT), IOMMU_REGSET_ENTRY(IQH), @@ -435,8 +433,21 @@ static int domain_translation_struct_show(struct seq_file *m, } pgd &= VTD_PAGE_MASK; } else { /* legacy mode */ - pgd = context->lo & VTD_PAGE_MASK; - agaw = context->hi & 7; + u8 tt = (u8)(context->lo & GENMASK_ULL(3, 2)) >> 2; + + /* + * According to Translation Type(TT), + * get the page table pointer(SSPTPTR). 
+ */ + switch (tt) { + case CONTEXT_TT_MULTI_LEVEL: + case CONTEXT_TT_DEV_IOTLB: + pgd = context->lo & VTD_PAGE_MASK; + agaw = context->hi & 7; + break; + default: + goto iommu_unlock; + } } seq_printf(m, "Device %04x:%02x:%02x.%x ", @@ -648,17 +659,11 @@ DEFINE_SHOW_ATTRIBUTE(ir_translation_struct); static void latency_show_one(struct seq_file *m, struct intel_iommu *iommu, struct dmar_drhd_unit *drhd) { - int ret; - seq_printf(m, "IOMMU: %s Register Base Address: %llx\n", iommu->name, drhd->reg_base_addr); - ret = dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE); - if (ret < 0) - seq_puts(m, "Failed to get latency snapshot"); - else - seq_puts(m, debug_buf); - seq_puts(m, "\n"); + dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE); + seq_printf(m, "%s\n", debug_buf); } static int latency_show(struct seq_file *m, void *v) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 304e84949ca7..ec975c73cfe6 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -935,14 +935,11 @@ void __init detect_intel_iommu(void) pci_request_acs(); } -#ifdef CONFIG_X86 if (!ret) { x86_init.iommu.iommu_init = intel_iommu_init; x86_platform.iommu_shutdown = intel_iommu_shutdown; } -#endif - if (dmar_tbl) { acpi_put_table(dmar_tbl); dmar_tbl = NULL; @@ -1060,7 +1057,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) err = iommu->seq_id; goto error; } - sprintf(iommu->name, "dmar%d", iommu->seq_id); + snprintf(iommu->name, sizeof(iommu->name), "dmar%d", iommu->seq_id); err = map_iommu(iommu, drhd); if (err) { @@ -1099,6 +1096,9 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) spin_lock_init(&iommu->device_rbtree_lock); mutex_init(&iommu->iopf_lock); iommu->node = NUMA_NO_NODE; + spin_lock_init(&iommu->lock); + ida_init(&iommu->domain_ida); + mutex_init(&iommu->did_lock); ver = readl(iommu->reg + DMAR_VER_REG); pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n", @@ -1187,7 +1187,7 @@ static void free_iommu(struct intel_iommu *iommu) } if (iommu->qi) { - iommu_free_page(iommu->qi->desc); + iommu_free_pages(iommu->qi->desc); kfree(iommu->qi->desc_status); kfree(iommu->qi); } @@ -1195,6 +1195,7 @@ static void free_iommu(struct intel_iommu *iommu) if (iommu->reg) unmap_iommu(iommu); + ida_destroy(&iommu->domain_ida); ida_free(&dmar_seq_ids, iommu->seq_id); kfree(iommu); } @@ -1204,9 +1205,7 @@ static void free_iommu(struct intel_iommu *iommu) */ static inline void reclaim_free_desc(struct q_inval *qi) { - while (qi->desc_status[qi->free_tail] == QI_DONE || - qi->desc_status[qi->free_tail] == QI_ABORT) { - qi->desc_status[qi->free_tail] = QI_FREE; + while (qi->desc_status[qi->free_tail] == QI_FREE && qi->free_tail != qi->free_head) { qi->free_tail = (qi->free_tail + 1) % QI_LENGTH; qi->free_cnt++; } @@ -1446,7 +1445,7 @@ restart: */ writel(qi->free_head << shift, iommu->reg + DMAR_IQT_REG); - while (qi->desc_status[wait_index] != QI_DONE) { + while (READ_ONCE(qi->desc_status[wait_index]) != QI_DONE) { /* * We will leave the interrupts disabled, to prevent interrupt * context to queue another cmd while a cmd is already submitted @@ -1463,8 +1462,16 @@ restart: raw_spin_lock(&qi->q_lock); } - for (i = 0; i < count; i++) - qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE; + /* + * The reclaim code can free descriptors from multiple submissions + * starting from the tail of the queue. 
When count == 0, the + * status of the standalone wait descriptor at the tail of the queue + * must be set to QI_FREE to allow the reclaim code to proceed. + * It is also possible that descriptors from one of the previous + * submissions has to be reclaimed by a subsequent submission. + */ + for (i = 0; i <= count; i++) + qi->desc_status[(index + i) % QI_LENGTH] = QI_FREE; reclaim_free_desc(qi); raw_spin_unlock_irqrestore(&qi->q_lock, flags); @@ -1520,24 +1527,9 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type) { - u8 dw = 0, dr = 0; - struct qi_desc desc; - int ih = 0; - - if (cap_write_drain(iommu->cap)) - dw = 1; - - if (cap_read_drain(iommu->cap)) - dr = 1; - - desc.qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) - | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; - desc.qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) - | QI_IOTLB_AM(size_order); - desc.qw2 = 0; - desc.qw3 = 0; + qi_desc_iotlb(iommu, did, addr, size_order, type, &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1555,20 +1547,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (!(iommu->gcmd & DMA_GCMD_TE)) return; - if (mask) { - addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1; - desc.qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; - } else - desc.qw1 = QI_DEV_IOTLB_ADDR(addr); - - if (qdep >= QI_DEV_IOTLB_MAX_INVS) - qdep = 0; - - desc.qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | - QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); - desc.qw2 = 0; - desc.qw3 = 0; - + qi_desc_dev_iotlb(sid, pfsid, qdep, addr, mask, &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1588,28 +1567,7 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, return; } - if (npages == -1) { - desc.qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc.qw1 = 0; - } else { - int mask = ilog2(__roundup_pow_of_two(npages)); - unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); - - if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) - addr = ALIGN_DOWN(addr, align); - - desc.qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | - QI_EIOTLB_TYPE; - desc.qw1 = QI_EIOTLB_ADDR(addr) | - QI_EIOTLB_IH(ih) | - QI_EIOTLB_AM(mask); - } - + qi_desc_piotlb(did, pasid, addr, npages, ih, &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1617,7 +1575,6 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, u32 pasid, u16 qdep, u64 addr, unsigned int size_order) { - unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; /* @@ -1629,40 +1586,9 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (!(iommu->gcmd & DMA_GCMD_TE)) return; - desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | - QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | - QI_DEV_IOTLB_PFSID(pfsid); - - /* - * If S bit is 0, we only flush a single page. If S bit is set, - * The least significant zero bit indicates the invalidation address - * range. VT-d spec 6.5.2.6. - * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB. - * size order = 0 is PAGE_SIZE 4KB - * Max Invs Pending (MIP) is set to 0 for now until we have DIT in - * ECAP. 
- */ - if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order)) - pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n", - addr, size_order); - - /* Take page address */ - desc.qw1 = QI_DEV_EIOTLB_ADDR(addr); - - if (size_order) { - /* - * Existing 0s in address below size_order may be the least - * significant bit, we must set them to 1s to avoid having - * smaller size than desired. - */ - desc.qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1, - VTD_PAGE_SHIFT); - /* Clear size_order bit to indicate size */ - desc.qw1 &= ~mask; - /* Set the S bit to indicate flushing more than 1 page */ - desc.qw1 |= QI_DEV_EIOTLB_SIZE; - } - + qi_desc_dev_iotlb_pasid(sid, pfsid, pasid, + qdep, addr, size_order, + &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1756,7 +1682,6 @@ int dmar_enable_qi(struct intel_iommu *iommu) { struct q_inval *qi; void *desc; - int order; if (!ecap_qis(iommu->ecap)) return -ENOENT; @@ -1777,8 +1702,9 @@ int dmar_enable_qi(struct intel_iommu *iommu) * Need two pages to accommodate 256 descriptors of 256 bits each * if the remapping hardware supports scalable mode translation. */ - order = ecap_smts(iommu->ecap) ? 1 : 0; - desc = iommu_alloc_pages_node(iommu->node, GFP_ATOMIC, order); + desc = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, + ecap_smts(iommu->ecap) ? SZ_8K : + SZ_4K); if (!desc) { kfree(qi); iommu->qi = NULL; @@ -1789,7 +1715,7 @@ int dmar_enable_qi(struct intel_iommu *iommu) qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC); if (!qi->desc_status) { - iommu_free_page(qi->desc); + iommu_free_pages(qi->desc); kfree(qi); iommu->qi = NULL; return -ENOMEM; @@ -1970,19 +1896,6 @@ void dmar_msi_write(int irq, struct msi_msg *msg) raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -void dmar_msi_read(int irq, struct msi_msg *msg) -{ - struct intel_iommu *iommu = irq_get_handler_data(irq); - int reg = dmar_msi_reg(iommu, irq); - unsigned long flag; - - raw_spin_lock_irqsave(&iommu->register_lock, flag); - msg->data = readl(iommu->reg + reg + 4); - msg->address_lo = readl(iommu->reg + reg + 8); - msg->address_hi = readl(iommu->reg + reg + 12); - raw_spin_unlock_irqrestore(&iommu->register_lock, flag); -} - static int dmar_fault_do_one(struct intel_iommu *iommu, int type, u8 fault_reason, u32 pasid, u16 source_id, unsigned long long addr) @@ -2131,6 +2044,7 @@ int enable_drhd_fault_handling(unsigned int cpu) /* * Enable fault control interrupt. 
*/ + guard(rwsem_read)(&dmar_global_lock); for_each_iommu(iommu, drhd) { u32 fault_status; int ret; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index fd11a080380c..134302fbcd92 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -29,13 +29,12 @@ #include "../irq_remapping.h" #include "../iommu-pages.h" #include "pasid.h" -#include "cap_audit.h" #include "perfmon.h" #define ROOT_SIZE VTD_PAGE_SIZE #define CONTEXT_SIZE VTD_PAGE_SIZE -#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) +#define IS_GFX_DEVICE(pdev) pci_is_display(pdev) #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) @@ -46,18 +45,13 @@ #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 -#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) -#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) - -/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR - to match. That way, we can use 'unsigned long' for PFNs with impunity. */ -#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ - __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) -#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) - static void __init check_tylersburg_isoch(void); +static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable); static int rwbf_quirk; +#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap)) + /* * set to 1 to panic kernel if can't successfully enable VT-d * (used when kernel is launched w/ TXT) @@ -167,15 +161,6 @@ static void device_rbtree_remove(struct device_domain_info *info) spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); } -/* - * This domain is a statically identity mapping domain. - * 1. This domain creats a static 1:1 mapping to all usable memory. - * 2. It maps to each iommu if successful. - * 3. Each iommu mapps to this domain if successful. - */ -static struct dmar_domain *si_domain; -static int hw_pass_through = 1; - struct dmar_rmrr_unit { struct list_head list; /* list of rmrr units */ struct acpi_dmar_header *hdr; /* ACPI header */ @@ -225,7 +210,6 @@ static int disable_igfx_iommu; #define IDENTMAP_AZALIA 4 const struct iommu_ops intel_iommu_ops; -static const struct iommu_dirty_ops intel_dirty_ops; static bool translation_pre_enabled(struct intel_iommu *iommu) { @@ -293,18 +277,6 @@ static int __init intel_iommu_setup(char *str) } __setup("intel_iommu=", intel_iommu_setup); -static int domain_type_is_si(struct dmar_domain *domain) -{ - return domain->domain.type == IOMMU_DOMAIN_IDENTITY; -} - -static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) -{ - int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; - - return !(addr_width < BITS_PER_LONG && pfn >> addr_width); -} - /* * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 
* Refer to 11.4.2 of the VT-d spec for the encoding of each bit of @@ -366,135 +338,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } -static void domain_update_iommu_coherency(struct dmar_domain *domain) -{ - struct iommu_domain_info *info; - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - bool found = false; - unsigned long i; - - domain->iommu_coherency = true; - xa_for_each(&domain->iommu_array, i, info) { - found = true; - if (!iommu_paging_structure_coherency(info->iommu)) { - domain->iommu_coherency = false; - break; - } - } - if (found) - return; - - /* No hardware attached; use lowest common denominator */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!iommu_paging_structure_coherency(iommu)) { - domain->iommu_coherency = false; - break; - } - } - rcu_read_unlock(); -} - -static int domain_update_iommu_superpage(struct dmar_domain *domain, - struct intel_iommu *skip) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - int mask = 0x3; - - if (!intel_iommu_superpage) - return 0; - - /* set iommu_superpage to the smallest common denominator */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (iommu != skip) { - if (domain && domain->use_first_level) { - if (!cap_fl1gp_support(iommu->cap)) - mask = 0x1; - } else { - mask &= cap_super_page_val(iommu->cap); - } - - if (!mask) - break; - } - } - rcu_read_unlock(); - - return fls(mask); -} - -static int domain_update_device_node(struct dmar_domain *domain) -{ - struct device_domain_info *info; - int nid = NUMA_NO_NODE; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) { - /* - * There could possibly be multiple device numa nodes as devices - * within the same domain may sit behind different IOMMUs. There - * isn't perfect answer in such situation, so we select first - * come first served policy. - */ - nid = dev_to_node(info->dev); - if (nid != NUMA_NO_NODE) - break; - } - spin_unlock_irqrestore(&domain->lock, flags); - - return nid; -} - -/* Return the super pagesize bitmap if supported. */ -static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) -{ - unsigned long bitmap = 0; - - /* - * 1-level super page supports page size of 2MiB, 2-level super page - * supports page size of both 2MiB and 1GiB. - */ - if (domain->iommu_superpage == 1) - bitmap |= SZ_2M; - else if (domain->iommu_superpage == 2) - bitmap |= SZ_2M | SZ_1G; - - return bitmap; -} - -/* Some capabilities may be different across iommus */ -void domain_update_iommu_cap(struct dmar_domain *domain) -{ - domain_update_iommu_coherency(domain); - domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); - - /* - * If RHSA is missing, we should default to the device numa domain - * as fall back. - */ - if (domain->nid == NUMA_NO_NODE) - domain->nid = domain_update_device_node(domain); - - /* - * First-level translation restricts the input-address to a - * canonical address (i.e., address bits 63:N have the same - * value as address bit [N-1], where N is 48-bits with 4-level - * paging and 57-bits with 5-level paging). Hence, skip bit - * [N-1]. 
- */ - if (domain->use_first_level) - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); - else - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); - - domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); - domain_update_iotlb(domain); -} - struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc) { @@ -524,7 +367,8 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, if (!alloc) return NULL; - context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); + context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, + SZ_4K); if (!context) return NULL; @@ -680,13 +524,6 @@ out: return iommu; } -static void domain_flush_cache(struct dmar_domain *domain, - void *addr, int size) -{ - if (!domain->iommu_coherency) - clflush_cache_range(addr, size); -} - static void free_context_table(struct intel_iommu *iommu) { struct context_entry *context; @@ -698,17 +535,17 @@ static void free_context_table(struct intel_iommu *iommu) for (i = 0; i < ROOT_ENTRY_NR; i++) { context = iommu_context_addr(iommu, i, 0, 0); if (context) - iommu_free_page(context); + iommu_free_pages(context); if (!sm_supported(iommu)) continue; context = iommu_context_addr(iommu, i, 0x80, 0); if (context) - iommu_free_page(context); + iommu_free_pages(context); } - iommu_free_page(iommu->root_entry); + iommu_free_pages(iommu->root_entry); iommu->root_entry = NULL; } @@ -722,14 +559,15 @@ static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, while (1) { offset = pfn_level_offset(pfn, level); pte = &parent[offset]; - if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { - pr_info("PTE not present at level %d\n", level); - break; - } pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); - if (level == 1) + if (!dma_pte_present(pte)) { + pr_info("page table not present at level %d\n", level - 1); + break; + } + + if (level == 1 || dma_pte_superpage(pte)) break; parent = phys_to_virt(dma_pte_addr(pte)); @@ -752,11 +590,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); /* root entry dump */ - rt_entry = &iommu->root_entry[bus]; - if (!rt_entry) { - pr_info("root table entry is not present\n"); + if (!iommu->root_entry) { + pr_info("root table is not present\n"); return; } + rt_entry = &iommu->root_entry[bus]; if (sm_supported(iommu)) pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", @@ -767,7 +605,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* context entry dump */ ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); if (!ctx_entry) { - pr_info("context table entry is not present\n"); + pr_info("context table is not present\n"); return; } @@ -776,17 +614,23 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* legacy mode does not require PASID entries */ if (!sm_supported(iommu)) { + if (!context_present(ctx_entry)) { + pr_info("legacy mode page table is not present\n"); + return; + } level = agaw_to_level(ctx_entry->hi & 7); pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); goto pgtable_walk; } - /* get the pointer to pasid directory entry */ - dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); - if (!dir) { - pr_info("pasid directory entry is not present\n"); + if (!context_present(ctx_entry)) { + pr_info("pasid directory table is not present\n"); return; } + + /* get the pointer to pasid directory entry */ + 
dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); + /* For request-without-pasid, get the pasid from context entry */ if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) pasid = IOMMU_NO_PASID; @@ -798,7 +642,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* get the pointer to the pasid table entry */ entries = get_pasid_table_from_pde(pde); if (!entries) { - pr_info("pasid table entry is not present\n"); + pr_info("pasid table is not present\n"); return; } index = pasid & PASID_PTE_MASK; @@ -806,6 +650,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, for (i = 0; i < ARRAY_SIZE(pte->val); i++) pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); + if (!pasid_pte_is_present(pte)) { + pr_info("scalable mode page table is not present\n"); + return; + } + if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { level = pte->val[2] & BIT_ULL(2) ? 5 : 4; pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); @@ -819,286 +668,12 @@ pgtable_walk: } #endif -static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, - unsigned long pfn, int *target_level, - gfp_t gfp) -{ - struct dma_pte *parent, *pte; - int level = agaw_to_level(domain->agaw); - int offset; - - if (!domain_pfn_supported(domain, pfn)) - /* Address beyond IOMMU's addressing capabilities. */ - return NULL; - - parent = domain->pgd; - - while (1) { - void *tmp_page; - - offset = pfn_level_offset(pfn, level); - pte = &parent[offset]; - if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) - break; - if (level == *target_level) - break; - - if (!dma_pte_present(pte)) { - uint64_t pteval, tmp; - - tmp_page = iommu_alloc_page_node(domain->nid, gfp); - - if (!tmp_page) - return NULL; - - domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); - pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; - if (domain->use_first_level) - pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; - - tmp = 0ULL; - if (!try_cmpxchg64(&pte->val, &tmp, pteval)) - /* Someone else set it while we were thinking; use theirs. 
*/ - iommu_free_page(tmp_page); - else - domain_flush_cache(domain, pte, sizeof(*pte)); - } - if (level == 1) - break; - - parent = phys_to_virt(dma_pte_addr(pte)); - level--; - } - - if (!*target_level) - *target_level = level; - - return pte; -} - -/* return address's pte at specific level */ -static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, - unsigned long pfn, - int level, int *large_page) -{ - struct dma_pte *parent, *pte; - int total = agaw_to_level(domain->agaw); - int offset; - - parent = domain->pgd; - while (level <= total) { - offset = pfn_level_offset(pfn, total); - pte = &parent[offset]; - if (level == total) - return pte; - - if (!dma_pte_present(pte)) { - *large_page = total; - break; - } - - if (dma_pte_superpage(pte)) { - *large_page = total; - return pte; - } - - parent = phys_to_virt(dma_pte_addr(pte)); - total--; - } - return NULL; -} - -/* clear last level pte, a tlb flush should be followed */ -static void dma_pte_clear_range(struct dmar_domain *domain, - unsigned long start_pfn, - unsigned long last_pfn) -{ - unsigned int large_page; - struct dma_pte *first_pte, *pte; - - if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || - WARN_ON(start_pfn > last_pfn)) - return; - - /* we don't need lock here; nobody else touches the iova range */ - do { - large_page = 1; - first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); - if (!pte) { - start_pfn = align_to_level(start_pfn + 1, large_page + 1); - continue; - } - do { - dma_clear_pte(pte); - start_pfn += lvl_to_nr_pages(large_page); - pte++; - } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); - - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); - - } while (start_pfn && start_pfn <= last_pfn); -} - -static void dma_pte_free_level(struct dmar_domain *domain, int level, - int retain_level, struct dma_pte *pte, - unsigned long pfn, unsigned long start_pfn, - unsigned long last_pfn) -{ - pfn = max(start_pfn, pfn); - pte = &pte[pfn_level_offset(pfn, level)]; - - do { - unsigned long level_pfn; - struct dma_pte *level_pte; - - if (!dma_pte_present(pte) || dma_pte_superpage(pte)) - goto next; - - level_pfn = pfn & level_mask(level); - level_pte = phys_to_virt(dma_pte_addr(pte)); - - if (level > 2) { - dma_pte_free_level(domain, level - 1, retain_level, - level_pte, level_pfn, start_pfn, - last_pfn); - } - - /* - * Free the page table if we're below the level we want to - * retain and the range covers the entire table. - */ - if (level < retain_level && !(start_pfn > level_pfn || - last_pfn < level_pfn + level_size(level) - 1)) { - dma_clear_pte(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); - iommu_free_page(level_pte); - } -next: - pfn += level_size(level); - } while (!first_pte_in_page(++pte) && pfn <= last_pfn); -} - -/* - * clear last level (leaf) ptes and free page table pages below the - * level we wish to keep intact. - */ -static void dma_pte_free_pagetable(struct dmar_domain *domain, - unsigned long start_pfn, - unsigned long last_pfn, - int retain_level) -{ - dma_pte_clear_range(domain, start_pfn, last_pfn); - - /* We don't need lock here; nobody else touches the iova range */ - dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, - domain->pgd, 0, start_pfn, last_pfn); - - /* free pgd */ - if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { - iommu_free_page(domain->pgd); - domain->pgd = NULL; - } -} - -/* When a page at a given level is being unlinked from its parent, we don't - need to *modify* it at all. 
All we need to do is make a list of all the - pages which can be freed just as soon as we've flushed the IOTLB and we - know the hardware page-walk will no longer touch them. - The 'pte' argument is the *parent* PTE, pointing to the page that is to - be freed. */ -static void dma_pte_list_pagetables(struct dmar_domain *domain, - int level, struct dma_pte *pte, - struct list_head *freelist) -{ - struct page *pg; - - pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); - list_add_tail(&pg->lru, freelist); - - if (level == 1) - return; - - pte = page_address(pg); - do { - if (dma_pte_present(pte) && !dma_pte_superpage(pte)) - dma_pte_list_pagetables(domain, level - 1, pte, freelist); - pte++; - } while (!first_pte_in_page(pte)); -} - -static void dma_pte_clear_level(struct dmar_domain *domain, int level, - struct dma_pte *pte, unsigned long pfn, - unsigned long start_pfn, unsigned long last_pfn, - struct list_head *freelist) -{ - struct dma_pte *first_pte = NULL, *last_pte = NULL; - - pfn = max(start_pfn, pfn); - pte = &pte[pfn_level_offset(pfn, level)]; - - do { - unsigned long level_pfn = pfn & level_mask(level); - - if (!dma_pte_present(pte)) - goto next; - - /* If range covers entire pagetable, free it */ - if (start_pfn <= level_pfn && - last_pfn >= level_pfn + level_size(level) - 1) { - /* These suborbinate page tables are going away entirely. Don't - bother to clear them; we're just going to *free* them. */ - if (level > 1 && !dma_pte_superpage(pte)) - dma_pte_list_pagetables(domain, level - 1, pte, freelist); - - dma_clear_pte(pte); - if (!first_pte) - first_pte = pte; - last_pte = pte; - } else if (level > 1) { - /* Recurse down into a level that isn't *entirely* obsolete */ - dma_pte_clear_level(domain, level - 1, - phys_to_virt(dma_pte_addr(pte)), - level_pfn, start_pfn, last_pfn, - freelist); - } -next: - pfn = level_pfn + level_size(level); - } while (!first_pte_in_page(++pte) && pfn <= last_pfn); - - if (first_pte) - domain_flush_cache(domain, first_pte, - (void *)++last_pte - (void *)first_pte); -} - -/* We can't just free the pages because the IOMMU may still be walking - the page tables, and may have cached the intermediate levels. The - pages can only be freed after the IOTLB flush has been done. 
*/ -static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, - unsigned long last_pfn, struct list_head *freelist) -{ - if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || - WARN_ON(start_pfn > last_pfn)) - return; - - /* we don't need lock here; nobody else touches the iova range */ - dma_pte_clear_level(domain, agaw_to_level(domain->agaw), - domain->pgd, 0, start_pfn, last_pfn, freelist); - - /* free pgd */ - if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { - struct page *pgd_page = virt_to_page(domain->pgd); - list_add_tail(&pgd_page->lru, freelist); - domain->pgd = NULL; - } -} - /* iommu handling */ static int iommu_alloc_root_entry(struct intel_iommu *iommu) { struct root_entry *root; - root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); + root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K); if (!root) { pr_err("Allocating root entry for %s failed\n", iommu->name); @@ -1199,9 +774,8 @@ static void __iommu_flush_context(struct intel_iommu *iommu, raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -/* return value determine if we need a write buffer flush */ -static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int size_order, u64 type) +void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type) { int tlb_offset = ecap_iotlb_offset(iommu->ecap); u64 val = 0, val_iva = 0; @@ -1270,32 +844,6 @@ domain_lookup_dev_info(struct dmar_domain *domain, return NULL; } -void domain_update_iotlb(struct dmar_domain *domain) -{ - struct dev_pasid_info *dev_pasid; - struct device_domain_info *info; - bool has_iotlb_device = false; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) { - if (info->ats_enabled) { - has_iotlb_device = true; - break; - } - } - - list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { - info = dev_iommu_priv_get(dev_pasid->dev); - if (info->ats_enabled) { - has_iotlb_device = true; - break; - } - } - domain->has_iotlb_device = has_iotlb_device; - spin_unlock_irqrestore(&domain->lock, flags); -} - /* * The extra devTLB flush quirk impacts those QAT devices with PCI device * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() @@ -1314,64 +862,59 @@ static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) return true; } -static void iommu_enable_pci_caps(struct device_domain_info *info) +static void iommu_enable_pci_ats(struct device_domain_info *info) { struct pci_dev *pdev; - if (!dev_is_pci(info->dev)) + if (!info->ats_supported) return; pdev = to_pci_dev(info->dev); + if (!pci_ats_page_aligned(pdev)) + return; - /* The PCIe spec, in its wisdom, declares that the behaviour of - the device if you enable PASID support after ATS support is - undefined. So always enable PASID support on devices which - have it, even if we can't yet know if we're ever going to - use it. 
*/ - if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) - info->pasid_enabled = 1; - - if (info->ats_supported && pci_ats_page_aligned(pdev) && - !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { + if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT)) info->ats_enabled = 1; - domain_update_iotlb(info->domain); - } } -static void iommu_disable_pci_caps(struct device_domain_info *info) +static void iommu_disable_pci_ats(struct device_domain_info *info) +{ + if (!info->ats_enabled) + return; + + pci_disable_ats(to_pci_dev(info->dev)); + info->ats_enabled = 0; +} + +static void iommu_enable_pci_pri(struct device_domain_info *info) { struct pci_dev *pdev; - if (!dev_is_pci(info->dev)) + if (!info->ats_enabled || !info->pri_supported) return; pdev = to_pci_dev(info->dev); + /* PASID is required in PRG Response Message. */ + if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) + return; - if (info->ats_enabled) { - pci_disable_ats(pdev); - info->ats_enabled = 0; - domain_update_iotlb(info->domain); - } + if (pci_reset_pri(pdev)) + return; - if (info->pasid_enabled) { - pci_disable_pasid(pdev); - info->pasid_enabled = 0; - } + if (!pci_enable_pri(pdev, PRQ_DEPTH)) + info->pri_enabled = 1; } -static void __iommu_flush_dev_iotlb(struct device_domain_info *info, - u64 addr, unsigned int mask) +static void iommu_disable_pci_pri(struct device_domain_info *info) { - u16 sid, qdep; - - if (!info || !info->ats_enabled) + if (!info->pri_enabled) return; - sid = info->bus << 8 | info->devfn; - qdep = info->ats_qdep; - qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, - qdep, addr, mask); - quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep); + if (WARN_ON(info->iopf_refcount)) + iopf_queue_remove_device(info->iommu->iopf_queue, info->dev); + + pci_disable_pri(to_pci_dev(info->dev)); + info->pri_enabled = 0; } static void intel_flush_iotlb_all(struct iommu_domain *domain) @@ -1435,52 +978,13 @@ static void iommu_disable_translation(struct intel_iommu *iommu) raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -static int iommu_init_domains(struct intel_iommu *iommu) -{ - u32 ndomains; - - ndomains = cap_ndoms(iommu->cap); - pr_debug("%s: Number of Domains supported <%d>\n", - iommu->name, ndomains); - - spin_lock_init(&iommu->lock); - - iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); - if (!iommu->domain_ids) - return -ENOMEM; - - /* - * If Caching mode is set, then invalid translations are tagged - * with domain-id 0, hence we need to pre-allocate it. We also - * use domain-id 0 as a marker for non-allocated domain-id, so - * make sure it is not used for a real domain. - */ - set_bit(0, iommu->domain_ids); - - /* - * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid - * entry for first-level or pass-through translation modes should - * be programmed with a domain id different from those used for - * second-level or nested translation. We reserve a domain id for - * this purpose. - */ - if (sm_supported(iommu)) - set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); - - return 0; -} - static void disable_dmar_iommu(struct intel_iommu *iommu) { - if (!iommu->domain_ids) - return; - /* * All iommu domains must have been detached from the devices, * hence there should be no domain IDs in use. 
*/ - if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) - > NUM_RESERVED_DID)) + if (WARN_ON(!ida_is_empty(&iommu->domain_ida))) return; if (iommu->gcmd & DMA_GCMD_TE) @@ -1489,11 +993,6 @@ static void disable_dmar_iommu(struct intel_iommu *iommu) static void free_dmar_iommu(struct intel_iommu *iommu) { - if (iommu->domain_ids) { - bitmap_free(iommu->domain_ids); - iommu->domain_ids = NULL; - } - if (iommu->copied_tables) { bitmap_free(iommu->copied_tables); iommu->copied_tables = NULL; @@ -1502,58 +1001,30 @@ static void free_dmar_iommu(struct intel_iommu *iommu) /* free context mapping */ free_context_table(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu)) { - if (ecap_prs(iommu->ecap)) - intel_svm_finish_prq(iommu); - } -#endif + if (ecap_prs(iommu->ecap)) + intel_iommu_finish_prq(iommu); } /* * Check and return whether first level is used by default for * DMA translation. */ -static bool first_level_by_default(unsigned int type) +static bool first_level_by_default(struct intel_iommu *iommu) { /* Only SL is available in legacy mode */ - if (!scalable_mode_support()) + if (!sm_supported(iommu)) return false; /* Only level (either FL or SL) is available, just use it */ - if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) - return intel_cap_flts_sanity(); - - /* Both levels are available, decide it based on domain type */ - return type != IOMMU_DOMAIN_UNMANAGED; -} - -static struct dmar_domain *alloc_domain(unsigned int type) -{ - struct dmar_domain *domain; + if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap)) + return ecap_flts(iommu->ecap); - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - - domain->nid = NUMA_NO_NODE; - if (first_level_by_default(type)) - domain->use_first_level = true; - domain->has_iotlb_device = false; - INIT_LIST_HEAD(&domain->devices); - INIT_LIST_HEAD(&domain->dev_pasids); - INIT_LIST_HEAD(&domain->cache_tags); - spin_lock_init(&domain->lock); - spin_lock_init(&domain->cache_lock); - xa_init(&domain->iommu_array); - - return domain; + return true; } int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info, *curr; - unsigned long ndomains; int num, ret = -ENOSPC; if (domain->domain.type == IOMMU_DOMAIN_SVA) @@ -1563,41 +1034,36 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) if (!info) return -ENOMEM; - spin_lock(&iommu->lock); + guard(mutex)(&iommu->did_lock); curr = xa_load(&domain->iommu_array, iommu->seq_id); if (curr) { curr->refcnt++; - spin_unlock(&iommu->lock); kfree(info); return 0; } - ndomains = cap_ndoms(iommu->cap); - num = find_first_zero_bit(iommu->domain_ids, ndomains); - if (num >= ndomains) { + num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID, + cap_ndoms(iommu->cap) - 1, GFP_KERNEL); + if (num < 0) { pr_err("%s: No free domain ids\n", iommu->name); goto err_unlock; } - set_bit(num, iommu->domain_ids); info->refcnt = 1; info->did = num; info->iommu = iommu; curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, - NULL, info, GFP_ATOMIC); + NULL, info, GFP_KERNEL); if (curr) { ret = xa_err(curr) ? 
: -EBUSY; goto err_clear; } - domain_update_iommu_cap(domain); - spin_unlock(&iommu->lock); return 0; err_clear: - clear_bit(info->did, iommu->domain_ids); + ida_free(&iommu->domain_ida, info->did); err_unlock: - spin_unlock(&iommu->lock); kfree(info); return ret; } @@ -1609,45 +1075,68 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) if (domain->domain.type == IOMMU_DOMAIN_SVA) return; - spin_lock(&iommu->lock); + guard(mutex)(&iommu->did_lock); info = xa_load(&domain->iommu_array, iommu->seq_id); if (--info->refcnt == 0) { - clear_bit(info->did, iommu->domain_ids); + ida_free(&iommu->domain_ida, info->did); xa_erase(&domain->iommu_array, iommu->seq_id); - domain->nid = NUMA_NO_NODE; - domain_update_iommu_cap(domain); kfree(info); } - spin_unlock(&iommu->lock); } -static int guestwidth_to_adjustwidth(int gaw) +/* + * For kdump cases, old valid entries may be cached due to the + * in-flight DMA and copied pgtable, but there is no unmapping + * behaviour for them, thus we need an explicit cache flush for + * the newly-mapped device. For kdump, at this point, the device + * is supposed to finish reset at its driver probe stage, so no + * in-flight DMA will exist, and we don't need to worry anymore + * hereafter. + */ +static void copied_context_tear_down(struct intel_iommu *iommu, + struct context_entry *context, + u8 bus, u8 devfn) { - int agaw; - int r = (gaw - 12) % 9; + u16 did_old; - if (r == 0) - agaw = gaw; - else - agaw = gaw + 9 - r; - if (agaw > 64) - agaw = 64; - return agaw; -} + if (!context_copied(iommu, bus, devfn)) + return; -static void domain_exit(struct dmar_domain *domain) -{ - if (domain->pgd) { - LIST_HEAD(freelist); + assert_spin_locked(&iommu->lock); + + did_old = context_domain_id(context); + context_clear_entry(context); - domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); - iommu_put_pages_list(&freelist); + if (did_old < cap_ndoms(iommu->cap)) { + iommu->flush.flush_context(iommu, did_old, + PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, did_old, 0, 0, + DMA_TLB_DSI_FLUSH); } - if (WARN_ON(!list_empty(&domain->devices))) - return; + clear_context_copied(iommu, bus, devfn); +} - kfree(domain); +/* + * It's a non-present to present mapping. If hardware doesn't cache + * non-present entry we only need to flush the write-buffer. 
If the + * _does_ cache non-present entries, then it does so in the special + * domain #0, which we have to flush: + */ +static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, + u8 bus, u8 devfn) +{ + if (cap_caching_mode(iommu->cap)) { + iommu->flush.flush_context(iommu, 0, + PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + } else { + iommu_flush_write_buffer(iommu); + } } static int domain_context_mapping_one(struct dmar_domain *domain, @@ -1658,12 +1147,14 @@ static int domain_context_mapping_one(struct dmar_domain *domain, domain_lookup_dev_info(domain, iommu, bus, devfn); u16 did = domain_id_iommu(domain, iommu); int translation = CONTEXT_TT_MULTI_LEVEL; - struct dma_pte *pgd = domain->pgd; + struct pt_iommu_vtdss_hw_info pt_info; struct context_entry *context; - int agaw, ret; + int ret; - if (hw_pass_through && domain_type_is_si(domain)) - translation = CONTEXT_TT_PASS_THROUGH; + if (WARN_ON(!intel_domain_is_ss_paging(domain))) + return -EINVAL; + + pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1678,83 +1169,23 @@ static int domain_context_mapping_one(struct dmar_domain *domain, if (context_present(context) && !context_copied(iommu, bus, devfn)) goto out_unlock; - /* - * For kdump cases, old valid entries may be cached due to the - * in-flight DMA and copied pgtable, but there is no unmapping - * behaviour for them, thus we need an explicit cache flush for - * the newly-mapped device. For kdump, at this point, the device - * is supposed to finish reset at its driver probe stage, so no - * in-flight DMA will exist, and we don't need to worry anymore - * hereafter. - */ - if (context_copied(iommu, bus, devfn)) { - u16 did_old = context_domain_id(context); - - if (did_old < cap_ndoms(iommu->cap)) { - iommu->flush.flush_context(iommu, did_old, - (((u16)bus) << 8) | devfn, - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - iommu->flush.flush_iotlb(iommu, did_old, 0, 0, - DMA_TLB_DSI_FLUSH); - } - - clear_context_copied(iommu, bus, devfn); - } - + copied_context_tear_down(iommu, context, bus, devfn); context_clear_entry(context); context_set_domain_id(context, did); - if (translation != CONTEXT_TT_PASS_THROUGH) { - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. - */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - ret = -ENOMEM; - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - goto out_unlock; - } - - if (info && info->ats_supported) - translation = CONTEXT_TT_DEV_IOTLB; - else - translation = CONTEXT_TT_MULTI_LEVEL; - - context_set_address_root(context, virt_to_phys(pgd)); - context_set_address_width(context, agaw); - } else { - /* - * In pass through mode, AW must be programmed to - * indicate the largest AGAW value supported by - * hardware. And ASR is ignored by hardware. 
- */ - context_set_address_width(context, iommu->msagaw); - } + if (info && info->ats_supported) + translation = CONTEXT_TT_DEV_IOTLB; + else + translation = CONTEXT_TT_MULTI_LEVEL; + context_set_address_root(context, pt_info.ssptptr); + context_set_address_width(context, pt_info.aw); context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(context, sizeof(*context)); - - /* - * It's a non-present to present mapping. If hardware doesn't cache - * non-present entry we only need to flush the write-buffer. If the - * _does_ cache non-present entries, then it does so in the special - * domain #0, which we have to flush: - */ - if (cap_caching_mode(iommu->cap)) { - iommu->flush.flush_context(iommu, 0, - (((u16)bus) << 8) | devfn, - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - } else { - iommu_flush_write_buffer(iommu); - } - + context_present_cache_flush(iommu, did, bus, devfn); ret = 0; out_unlock: @@ -1780,177 +1211,17 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; + int ret; if (!dev_is_pci(dev)) return domain_context_mapping_one(domain, iommu, bus, devfn); - return pci_for_each_dma_alias(to_pci_dev(dev), - domain_context_mapping_cb, domain); -} - -/* Return largest possible superpage level for a given mapping */ -static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phy_pfn, unsigned long pages) -{ - int support, level = 1; - unsigned long pfnmerge; - - support = domain->iommu_superpage; - - /* To use a large page, the virtual *and* physical addresses - must be aligned to 2MiB/1GiB/etc. Lower bits set in either - of them will mean we have to use smaller pages. So just - merge them and check both at once. */ - pfnmerge = iov_pfn | phy_pfn; - - while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { - pages >>= VTD_STRIDE_SHIFT; - if (!pages) - break; - pfnmerge >>= VTD_STRIDE_SHIFT; - level++; - support--; - } - return level; -} - -/* - * Ensure that old small page tables are removed to make room for superpage(s). - * We're going to add new large pages, so make sure we don't remove their parent - * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 
- */ -static void switch_to_super_page(struct dmar_domain *domain, - unsigned long start_pfn, - unsigned long end_pfn, int level) -{ - unsigned long lvl_pages = lvl_to_nr_pages(level); - struct dma_pte *pte = NULL; - - while (start_pfn <= end_pfn) { - if (!pte) - pte = pfn_to_dma_pte(domain, start_pfn, &level, - GFP_ATOMIC); - - if (dma_pte_present(pte)) { - dma_pte_free_pagetable(domain, start_pfn, - start_pfn + lvl_pages - 1, - level + 1); - - cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT, - end_pfn << VTD_PAGE_SHIFT, 0); - } - - pte++; - start_pfn += lvl_pages; - if (first_pte_in_page(pte)) - pte = NULL; - } -} - -static int -__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phys_pfn, unsigned long nr_pages, int prot, - gfp_t gfp) -{ - struct dma_pte *first_pte = NULL, *pte = NULL; - unsigned int largepage_lvl = 0; - unsigned long lvl_pages = 0; - phys_addr_t pteval; - u64 attr; - - if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) - return -EINVAL; - - if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) - return -EINVAL; - - if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { - pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); - return -EINVAL; - } - - attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); - attr |= DMA_FL_PTE_PRESENT; - if (domain->use_first_level) { - attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; - if (prot & DMA_PTE_WRITE) - attr |= DMA_FL_PTE_DIRTY; - } - - domain->has_mappings = true; - - pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; - - while (nr_pages > 0) { - uint64_t tmp; - - if (!pte) { - largepage_lvl = hardware_largepage_caps(domain, iov_pfn, - phys_pfn, nr_pages); - - pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, - gfp); - if (!pte) - return -ENOMEM; - first_pte = pte; - - lvl_pages = lvl_to_nr_pages(largepage_lvl); - - /* It is large page*/ - if (largepage_lvl > 1) { - unsigned long end_pfn; - unsigned long pages_to_remove; - - pteval |= DMA_PTE_LARGE_PAGE; - pages_to_remove = min_t(unsigned long, nr_pages, - nr_pte_to_next_page(pte) * lvl_pages); - end_pfn = iov_pfn + pages_to_remove - 1; - switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); - } else { - pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; - } - - } - /* We don't need lock here, nobody else - * touches the iova range - */ - tmp = 0ULL; - if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) { - static int dumps = 5; - pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", - iov_pfn, tmp, (unsigned long long)pteval); - if (dumps) { - dumps--; - debug_dma_dump_mappings(NULL); - } - WARN_ON(1); - } + ret = pci_for_each_dma_alias(to_pci_dev(dev), + domain_context_mapping_cb, domain); + if (ret) + return ret; - nr_pages -= lvl_pages; - iov_pfn += lvl_pages; - phys_pfn += lvl_pages; - pteval += lvl_pages * VTD_PAGE_SIZE; - - /* If the next PTE would be the first in a new page, then we - * need to flush the cache on the entries we've just written. - * And then we'll need to recalculate 'pte', so clear it and - * let it get set again in the if (!pte) block above. - * - * If we're done (!nr_pages) we need to flush the cache too. - * - * Also if we've been setting superpages, we may need to - * recalculate 'pte' and switch back to smaller pages for the - * end of the mapping, if the trailing size is not enough to - * use another superpage (i.e. nr_pages < lvl_pages). 
- */ - pte++; - if (!nr_pages || first_pte_in_page(pte) || - (largepage_lvl > 1 && nr_pages < lvl_pages)) { - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); - pte = NULL; - } - } + iommu_enable_pci_ats(info); return 0; } @@ -1959,7 +1230,7 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 { struct intel_iommu *iommu = info->iommu; struct context_entry *context; - u16 did_old; + u16 did; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, 0); @@ -1968,138 +1239,74 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 return; } - did_old = context_domain_id(context); - + did = context_domain_id(context); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - iommu->flush.flush_context(iommu, - did_old, - (((u16)bus) << 8) | devfn, - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - - iommu->flush.flush_iotlb(iommu, - did_old, - 0, - 0, - DMA_TLB_DSI_FLUSH); - - __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); + intel_context_flush_no_pasid(info, context, did); } -static int domain_setup_first_level(struct intel_iommu *iommu, - struct dmar_domain *domain, - struct device *dev, - u32 pasid) +int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, + ioasid_t pasid, u16 did, phys_addr_t fsptptr, + int flags, struct iommu_domain *old) { - struct dma_pte *pgd = domain->pgd; - int agaw, level; - int flags = 0; - - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. - */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - return -ENOMEM; - } - - level = agaw_to_level(agaw); - if (level != 4 && level != 5) - return -EINVAL; - - if (level == 5) - flags |= PASID_FLAG_FL5LP; - - if (domain->force_snooping) - flags |= PASID_FLAG_PAGE_SNOOP; - - return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, - domain_id_iommu(domain, iommu), - flags); + if (!old) + return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, + did, flags); + return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did, + iommu_domain_did(old, iommu), + flags); } -static bool dev_is_real_dma_subdevice(struct device *dev) +static int domain_setup_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { - return dev && dev_is_pci(dev) && - pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); + if (!old) + return intel_pasid_setup_second_level(iommu, domain, + dev, pasid); + return intel_pasid_replace_second_level(iommu, domain, dev, + iommu_domain_did(old, iommu), + pasid); } -static int iommu_domain_identity_map(struct dmar_domain *domain, - unsigned long first_vpfn, - unsigned long last_vpfn) +static int domain_setup_passthrough(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { - /* - * RMRR range might have overlap with physical memory range, - * clear it first - */ - dma_pte_clear_range(domain, first_vpfn, last_vpfn); - - return __domain_mapping(domain, first_vpfn, - first_vpfn, last_vpfn - first_vpfn + 1, - DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); + if (!old) + return intel_pasid_setup_pass_through(iommu, dev, pasid); + return intel_pasid_replace_pass_through(iommu, dev, + iommu_domain_did(old, iommu), + pasid); } -static int md_domain_init(struct 
dmar_domain *domain, int guest_width); - -static int __init si_domain_init(int hw) +static int domain_setup_first_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, + u32 pasid, struct iommu_domain *old) { - struct dmar_rmrr_unit *rmrr; - struct device *dev; - int i, nid, ret; - - si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); - if (!si_domain) - return -EFAULT; + struct pt_iommu_x86_64_hw_info pt_info; + unsigned int flags = 0; - if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { - domain_exit(si_domain); - si_domain = NULL; - return -EFAULT; - } - - if (hw) - return 0; - - for_each_online_node(nid) { - unsigned long start_pfn, end_pfn; - int i; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - ret = iommu_domain_identity_map(si_domain, - mm_to_dma_pfn_start(start_pfn), - mm_to_dma_pfn_end(end_pfn)); - if (ret) - return ret; - } - } + pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info); + if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5)) + return -EINVAL; - /* - * Identity map the RMRRs so that devices with RMRRs could also use - * the si_domain. - */ - for_each_rmrr_units(rmrr) { - for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, - i, dev) { - unsigned long long start = rmrr->base_address; - unsigned long long end = rmrr->end_address; + if (pt_info.levels == 5) + flags |= PASID_FLAG_FL5LP; - if (WARN_ON(end < start || - end >> agaw_to_width(si_domain->agaw))) - continue; + if (domain->force_snooping) + flags |= PASID_FLAG_PAGE_SNOOP; - ret = iommu_domain_identity_map(si_domain, - mm_to_dma_pfn_start(start >> PAGE_SHIFT), - mm_to_dma_pfn_end(end >> PAGE_SHIFT)); - if (ret) - return ret; - } - } + if (!(domain->fspt.x86_64_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + flags |= PASID_FLAG_PWSNP; - return 0; + return __domain_setup_first_level(iommu, dev, pasid, + domain_id_iommu(domain, iommu), + pt_info.gcr3_pt, flags, old); } static int dmar_domain_attach_device(struct dmar_domain *domain, @@ -2115,6 +1322,7 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, return ret; info->domain = domain; + info->domain_attached = true; spin_lock_irqsave(&domain->lock, flags); list_add(&info->link, &domain->devices); spin_unlock_irqrestore(&domain->lock, flags); @@ -2124,19 +1332,18 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (!sm_supported(iommu)) ret = domain_context_mapping(domain, dev); - else if (hw_pass_through && domain_type_is_si(domain)) - ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); - else if (domain->use_first_level) - ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID); - else - ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID); + else if (intel_domain_is_fs_paging(domain)) + ret = domain_setup_first_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); + else if (intel_domain_is_ss_paging(domain)) + ret = domain_setup_second_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); + else if (WARN_ON(true)) + ret = -EINVAL; if (ret) goto out_block_translation; - if (sm_supported(info->iommu) || !domain_type_is_si(info->domain)) - iommu_enable_pci_caps(info); - ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); if (ret) goto out_block_translation; @@ -2177,19 +1384,18 @@ static bool device_rmrr_is_relaxable(struct device *dev) return false; } -/* - * Return the required default domain type for a specific device. 
- * - * @dev: the device in query - * @startup: true if this is during early boot - * - * Returns: - * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain - * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain - * - 0: both identity and dynamic domains work for this device - */ static int device_def_domain_type(struct device *dev) { + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + /* + * Hardware does not support the passthrough translation mode. + * Always use a dynamaic mapping domain. + */ + if (!ecap_pass_through(iommu->ecap)) + return IOMMU_DOMAIN_DMA; + if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); @@ -2287,7 +1493,8 @@ static int copy_context_table(struct intel_iommu *iommu, if (!old_ce) goto out; - new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL); + new_ce = iommu_alloc_pages_node_sz(iommu->node, + GFP_KERNEL, SZ_4K); if (!new_ce) goto out_unmap; @@ -2302,7 +1509,7 @@ static int copy_context_table(struct intel_iommu *iommu, did = context_domain_id(&ce); if (did >= 0 && did < cap_ndoms(iommu->cap)) - set_bit(did, iommu->domain_ids); + ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL); set_context_copied(iommu, bus, devfn); new_ce[idx] = ce; @@ -2410,10 +1617,6 @@ static int __init init_dmars(void) struct intel_iommu *iommu; int ret; - ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL); - if (ret) - goto free_iommu; - for_each_iommu(iommu, drhd) { if (drhd->ignored) { iommu_disable_translation(iommu); @@ -2433,11 +1636,6 @@ static int __init init_dmars(void) } intel_iommu_init_qi(iommu); - - ret = iommu_init_domains(iommu); - if (ret) - goto free_iommu; - init_translation_status(iommu); if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { @@ -2480,8 +1678,6 @@ static int __init init_dmars(void) } } - if (!ecap_pass_through(iommu->ecap)) - hw_pass_through = 0; intel_svm_check(iommu); } @@ -2497,10 +1693,6 @@ static int __init init_dmars(void) check_tylersburg_isoch(); - ret = si_domain_init(hw_pass_through); - if (ret) - goto free_iommu; - /* * for each drhd * enable fault log @@ -2521,19 +1713,18 @@ static int __init init_dmars(void) iommu_flush_write_buffer(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { + if (ecap_prs(iommu->ecap)) { /* * Call dmar_alloc_hwirq() with dmar_global_lock held, * could cause possible lock race condition. 
*/ up_write(&dmar_global_lock); - ret = intel_svm_enable_prq(iommu); + ret = intel_iommu_enable_prq(iommu); down_write(&dmar_global_lock); if (ret) goto free_iommu; } -#endif + ret = dmar_set_interrupt(iommu); if (ret) goto free_iommu; @@ -2546,10 +1737,6 @@ free_iommu: disable_dmar_iommu(iommu); free_dmar_iommu(iommu); } - if (si_domain) { - domain_exit(si_domain); - si_domain = NULL; - } return ret; } @@ -2638,7 +1825,7 @@ static void iommu_flush_all(void) } } -static int iommu_suspend(void) +static int iommu_suspend(void *data) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; @@ -2665,7 +1852,7 @@ static int iommu_suspend(void) return 0; } -static void iommu_resume(void) +static void iommu_resume(void *data) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; @@ -2696,14 +1883,18 @@ static void iommu_resume(void) } } -static struct syscore_ops iommu_syscore_ops = { +static const struct syscore_ops iommu_syscore_ops = { .resume = iommu_resume, .suspend = iommu_suspend, }; +static struct syscore iommu_syscore = { + .ops = &iommu_syscore_ops, +}; + static void __init init_iommu_pm_ops(void) { - register_syscore_ops(&iommu_syscore_ops); + register_syscore(&iommu_syscore); } #else @@ -2917,25 +2108,8 @@ int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) static int intel_iommu_add(struct dmar_drhd_unit *dmaru) { - int sp, ret; struct intel_iommu *iommu = dmaru->iommu; - - ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); - if (ret) - goto out; - - if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { - pr_warn("%s: Doesn't support hardware pass through.\n", - iommu->name); - return -ENXIO; - } - - sp = domain_update_iommu_superpage(NULL, iommu) - 1; - if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { - pr_warn("%s: Doesn't support large page.\n", - iommu->name); - return -ENXIO; - } + int ret; /* * Disable translation if already enabled prior to OS handover. 
@@ -2943,9 +2117,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) if (iommu->gcmd & DMA_GCMD_TE) iommu_disable_translation(iommu); - ret = iommu_init_domains(iommu); - if (ret == 0) - ret = iommu_alloc_root_entry(iommu); + ret = iommu_alloc_root_entry(iommu); if (ret) goto out; @@ -2963,13 +2135,12 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) intel_iommu_init_qi(iommu); iommu_flush_write_buffer(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { - ret = intel_svm_enable_prq(iommu); + if (ecap_prs(iommu->ecap)) { + ret = intel_iommu_enable_prq(iommu); if (ret) goto disable_iommu; } -#endif + ret = dmar_set_interrupt(iommu); if (ret) goto disable_iommu; @@ -3037,7 +2208,6 @@ static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) struct device *tmp; int i; - dev = pci_physfn(dev); rcu_read_lock(); list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { @@ -3054,15 +2224,16 @@ out: return satcu; } -static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) +static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) { - int i, ret = 1; - struct pci_bus *bus; struct pci_dev *bridge = NULL; - struct device *tmp; - struct acpi_dmar_atsr *atsr; struct dmar_atsr_unit *atsru; struct dmar_satc_unit *satcu; + struct acpi_dmar_atsr *atsr; + bool supported = true; + struct pci_bus *bus; + struct device *tmp; + int i; dev = pci_physfn(dev); satcu = dmar_find_matched_satc_unit(dev); @@ -3080,11 +2251,11 @@ static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) bridge = bus->self; /* If it's an integrated device, allow ATS */ if (!bridge) - return 1; + return true; /* Connected via non-PCIe: no ATS */ if (!pci_is_pcie(bridge) || pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) - return 0; + return false; /* If we found the root port, look it up in the ATSR */ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) break; @@ -3103,11 +2274,11 @@ static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) if (atsru->include_all) goto out; } - ret = 0; + supported = false; out: rcu_read_unlock(); - return ret; + return supported; } int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) @@ -3180,43 +2351,6 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) return 0; } -static int intel_iommu_memory_notifier(struct notifier_block *nb, - unsigned long val, void *v) -{ - struct memory_notify *mhp = v; - unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn); - unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn + - mhp->nr_pages - 1); - - switch (val) { - case MEM_GOING_ONLINE: - if (iommu_domain_identity_map(si_domain, - start_vpfn, last_vpfn)) { - pr_warn("Failed to build identity map for [%lx-%lx]\n", - start_vpfn, last_vpfn); - return NOTIFY_BAD; - } - break; - - case MEM_OFFLINE: - case MEM_CANCEL_ONLINE: - { - LIST_HEAD(freelist); - - domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); - iommu_put_pages_list(&freelist); - } - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block intel_iommu_memory_nb = { - .notifier_call = intel_iommu_memory_notifier, - .priority = 0 -}; - static void intel_disable_iommus(void) { struct intel_iommu *iommu = NULL; @@ -3234,16 +2368,19 @@ void intel_iommu_shutdown(void) if (no_iommu || dmar_disabled) return; - down_write(&dmar_global_lock); + /* + * All other CPUs were brought down, hotplug interrupts were disabled, + * no lock and RCU 
checking needed anymore + */ + list_for_each_entry(drhd, &dmar_drhd_units, list) { + iommu = drhd->iommu; - /* Disable PMRs explicitly here. */ - for_each_iommu(iommu, drhd) + /* Disable PMRs explicitly here. */ iommu_disable_protect_mem_regions(iommu); - /* Make sure the IOMMUs are switched off */ - intel_disable_iommus(); - - up_write(&dmar_global_lock); + /* Make sure the IOMMUs are switched off */ + iommu_disable_translation(iommu); + } } static struct intel_iommu *dev_to_intel_iommu(struct device *dev) @@ -3299,9 +2436,14 @@ static ssize_t domains_used_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); - return sysfs_emit(buf, "%d\n", - bitmap_weight(iommu->domain_ids, - cap_ndoms(iommu->cap))); + unsigned int count = 0; + int id; + + for (id = 0; id < cap_ndoms(iommu->cap); id++) + if (ida_exists(&iommu->domain_ida, id)) + count++; + + return sysfs_emit(buf, "%d\n", count); } static DEVICE_ATTR_RO(domains_used); @@ -3376,6 +2518,7 @@ static int __init probe_acpi_namespace_devices(void) if (dev->bus != &acpi_bus_type) continue; + up_read(&dmar_global_lock); adev = to_acpi_device(dev); mutex_lock(&adev->physical_node_lock); list_for_each_entry(pn, @@ -3385,6 +2528,7 @@ static int __init probe_acpi_namespace_devices(void) break; } mutex_unlock(&adev->physical_node_lock); + down_read(&dmar_global_lock); if (ret) return ret; @@ -3502,23 +2646,25 @@ int __init intel_iommu_init(void) * the virtual and physical IOMMU page-tables. */ if (cap_caching_mode(iommu->cap) && - !first_level_by_default(IOMMU_DOMAIN_DMA)) { + !first_level_by_default(iommu)) { pr_info_once("IOMMU batching disallowed due to virtualization\n"); iommu_set_dma_strict(); } iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, "%s", iommu->name); + /* + * The iommu device probe is protected by the iommu_probe_device_lock. + * Release the dmar_global_lock before entering the device probe path + * to avoid unnecessary lock order splat. + */ + up_read(&dmar_global_lock); iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); + down_read(&dmar_global_lock); iommu_pmu_register(iommu); } - up_read(&dmar_global_lock); - - if (si_domain && !hw_pass_through) - register_memory_notifier(&intel_iommu_memory_nb); - down_read(&dmar_global_lock); if (probe_acpi_namespace_devices()) pr_warn("ACPI name space devices didn't probe correctly\n"); @@ -3559,11 +2705,14 @@ static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *op */ static void domain_context_clear(struct device_domain_info *info) { - if (!dev_is_pci(info->dev)) + if (!dev_is_pci(info->dev)) { domain_context_clear_one(info, info->bus, info->devfn); + return; + } pci_for_each_dma_alias(to_pci_dev(info->dev), &domain_context_clear_one_cb, info); + iommu_disable_pci_ats(info); } /* @@ -3577,7 +2726,13 @@ void device_block_translation(struct device *dev) struct intel_iommu *iommu = info->iommu; unsigned long flags; - iommu_disable_pci_caps(info); + /* Device in DMA blocking state. Noting to do. */ + if (!info->domain_attached) + return; + + if (info->domain) + cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); + if (!dev_is_real_dma_subdevice(dev)) { if (sm_supported(iommu)) intel_pasid_tear_down_entry(iommu, dev, @@ -3586,6 +2741,9 @@ void device_block_translation(struct device *dev) domain_context_clear(info); } + /* Device now in DMA blocking state. 
*/ + info->domain_attached = false; + if (!info->domain) return; @@ -3593,335 +2751,410 @@ void device_block_translation(struct device *dev) list_del(&info->link); spin_unlock_irqrestore(&info->domain->lock, flags); - cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); domain_detach_iommu(info->domain, iommu); info->domain = NULL; } -static int md_domain_init(struct dmar_domain *domain, int guest_width) -{ - int adjust_width; - - /* calculate AGAW */ - domain->gaw = guest_width; - adjust_width = guestwidth_to_adjustwidth(guest_width); - domain->agaw = width_to_agaw(adjust_width); - - domain->iommu_coherency = false; - domain->iommu_superpage = 0; - domain->max_addr = 0; - - /* always allocate the top pgd */ - domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC); - if (!domain->pgd) - return -ENOMEM; - domain_flush_cache(domain, domain->pgd, PAGE_SIZE); - return 0; -} - static int blocking_domain_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { + struct device_domain_info *info = dev_iommu_priv_get(dev); + + iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev); device_block_translation(dev); return 0; } +static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old); + static struct iommu_domain blocking_domain = { .type = IOMMU_DOMAIN_BLOCKED, .ops = &(const struct iommu_domain_ops) { .attach_dev = blocking_domain_attach_dev, + .set_dev_pasid = blocking_domain_set_dev_pasid, } }; -static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) +static struct dmar_domain *paging_domain_alloc(void) { - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; + struct dmar_domain *domain; - switch (type) { - case IOMMU_DOMAIN_DMA: - case IOMMU_DOMAIN_UNMANAGED: - dmar_domain = alloc_domain(type); - if (!dmar_domain) { - pr_err("Can't allocate dmar_domain\n"); - return NULL; - } - if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { - pr_err("Domain initialization failed\n"); - domain_exit(dmar_domain); - return NULL; - } + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return ERR_PTR(-ENOMEM); - domain = &dmar_domain->domain; - domain->geometry.aperture_start = 0; - domain->geometry.aperture_end = - __DOMAIN_MAX_ADDR(dmar_domain->gaw); - domain->geometry.force_aperture = true; + INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->dev_pasids); + INIT_LIST_HEAD(&domain->cache_tags); + spin_lock_init(&domain->lock); + spin_lock_init(&domain->cache_lock); + xa_init(&domain->iommu_array); + INIT_LIST_HEAD(&domain->s1_domains); + spin_lock_init(&domain->s1_lock); - return domain; - case IOMMU_DOMAIN_IDENTITY: - return &si_domain->domain; - default: - return NULL; + return domain; +} + +static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu, + unsigned int *top_level) +{ + unsigned int mgaw = cap_mgaw(iommu->cap); + + /* + * Spec 3.6 First-Stage Translation: + * + * Software must limit addresses to less than the minimum of MGAW + * and the lower canonical address width implied by FSPM (i.e., + * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level). 
+ */ + if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) { + *top_level = 4; + return min(57, mgaw); } - return NULL; + /* Four level is always supported */ + *top_level = 3; + return min(48, mgaw); } static struct iommu_domain * -intel_iommu_domain_alloc_user(struct device *dev, u32 flags, - struct iommu_domain *parent, - const struct iommu_user_data *user_data) +intel_iommu_domain_alloc_first_stage(struct device *dev, + struct intel_iommu *iommu, u32 flags) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; - struct intel_iommu *iommu = info->iommu; + struct pt_iommu_x86_64_cfg cfg = {}; struct dmar_domain *dmar_domain; - struct iommu_domain *domain; - - /* Must be NESTING domain */ - if (parent) { - if (!nested_supported(iommu) || flags) - return ERR_PTR(-EOPNOTSUPP); - return intel_nested_domain_alloc(parent, user_data); - } + int ret; - if (flags & - (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) - return ERR_PTR(-EOPNOTSUPP); - if (nested_parent && !nested_supported(iommu)) + if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); - if (user_data || (dirty_tracking && !ssads_supported(iommu))) + + /* Only SL is available in legacy mode */ + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return ERR_PTR(-EOPNOTSUPP); + dmar_domain = paging_domain_alloc(); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + + cfg.common.hw_max_vasz_lg2 = + compute_vasz_lg2_fs(iommu, &cfg.top_level); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) | + BIT(PT_FEAT_FLUSH_RANGE); + /* First stage always uses scalable mode */ + if (!ecap_smpwc(iommu->ecap)) + cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); + dmar_domain->iommu.iommu_device = dev; + dmar_domain->iommu.nid = dev_to_node(dev); + dmar_domain->domain.ops = &intel_fs_paging_domain_ops; /* - * domain_alloc_user op needs to fully initialize a domain before - * return, so uses iommu_domain_alloc() here for simple. + * iotlb sync for map is only needed for legacy implementations that + * explicitly require flushing internal write buffers to ensure memory + * coherence. 
*/ - domain = iommu_domain_alloc(dev->bus); - if (!domain) - return ERR_PTR(-ENOMEM); + if (rwbf_required(iommu)) + dmar_domain->iotlb_sync_map = true; - dmar_domain = to_dmar_domain(domain); - - if (nested_parent) { - dmar_domain->nested_parent = true; - INIT_LIST_HEAD(&dmar_domain->s1_domains); - spin_lock_init(&dmar_domain->s1_lock); + ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL); + if (ret) { + kfree(dmar_domain); + return ERR_PTR(ret); } - if (dirty_tracking) { - if (dmar_domain->use_first_level) { - iommu_domain_free(domain); - return ERR_PTR(-EOPNOTSUPP); - } - domain->dirty_ops = &intel_dirty_ops; - } + if (!cap_fl1gp_support(iommu->cap)) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; + if (!intel_iommu_superpage) + dmar_domain->domain.pgsize_bitmap = SZ_4K; - return domain; + return &dmar_domain->domain; } -static void intel_iommu_domain_free(struct iommu_domain *domain) +static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu, + unsigned int *top_level) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned int sagaw = cap_sagaw(iommu->cap); + unsigned int mgaw = cap_mgaw(iommu->cap); - WARN_ON(dmar_domain->nested_parent && - !list_empty(&dmar_domain->s1_domains)); - if (domain != &si_domain->domain) - domain_exit(dmar_domain); + /* + * Find the largest table size that both the mgaw and sagaw support. + * This sets the valid range of IOVA and the top starting level. + * Some HW may only support a 4 or 5 level walk but must limit IOVA to + * 3 levels. + */ + if (mgaw > 48 && sagaw >= BIT(3)) { + *top_level = 4; + return min(57, mgaw); + } else if (mgaw > 39 && sagaw >= BIT(2)) { + *top_level = 3 + ffs(sagaw >> 3); + return min(48, mgaw); + } else if (mgaw > 30 && sagaw >= BIT(1)) { + *top_level = 2 + ffs(sagaw >> 2); + return min(39, mgaw); + } + return 0; } -int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev) +static const struct iommu_dirty_ops intel_second_stage_dirty_ops = { + IOMMU_PT_DIRTY_OPS(vtdss), + .set_dirty_tracking = intel_iommu_set_dirty_tracking, +}; + +static struct iommu_domain * +intel_iommu_domain_alloc_second_stage(struct device *dev, + struct intel_iommu *iommu, u32 flags) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct intel_iommu *iommu = info->iommu; - int addr_width; + struct pt_iommu_vtdss_cfg cfg = {}; + struct dmar_domain *dmar_domain; + unsigned int sslps; + int ret; - if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) - return -EINVAL; + if (flags & + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_PASID))) + return ERR_PTR(-EOPNOTSUPP); - if (domain->dirty_ops && !ssads_supported(iommu)) - return -EINVAL; + if (((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && + !nested_supported(iommu)) || + ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && + !ssads_supported(iommu))) + return ERR_PTR(-EOPNOTSUPP); - /* check if this iommu agaw is sufficient for max mapped address */ - addr_width = agaw_to_width(iommu->agaw); - if (addr_width > cap_mgaw(iommu->cap)) - addr_width = cap_mgaw(iommu->cap); + /* Legacy mode always supports second stage */ + if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) + return ERR_PTR(-EOPNOTSUPP); - if (dmar_domain->max_addr > (1LL << addr_width)) - return -EINVAL; - dmar_domain->gaw = addr_width; + dmar_domain = paging_domain_alloc(); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + + 
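+	/*
+	 * Size the generic second-stage page table from the hardware
+	 * capabilities: compute_vasz_lg2_ss() picks the widest IOVA width
+	 * that both MGAW and SAGAW support and reports the matching top
+	 * table level; the output (host physical) address width is capped
+	 * at 52 bits.
+	 */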
cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE); /* - * Knock out extra levels of page tables if necessary + * Read-only mapping is disallowed on the domain which serves as the + * parent in a nested configuration, due to HW errata + * (ERRATA_772415_SPR17) */ - while (iommu->agaw < dmar_domain->agaw) { - struct dma_pte *pte; + if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) + cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE); - pte = dmar_domain->pgd; - if (dma_pte_present(pte)) { - dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); - iommu_free_page(pte); - } - dmar_domain->agaw--; + if (!iommu_paging_structure_coherency(iommu)) + cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); + dmar_domain->iommu.iommu_device = dev; + dmar_domain->iommu.nid = dev_to_node(dev); + dmar_domain->domain.ops = &intel_ss_paging_domain_ops; + dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; + + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops; + + ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL); + if (ret) { + kfree(dmar_domain); + return ERR_PTR(ret); } - if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && - context_copied(iommu, info->bus, info->devfn)) - return intel_pasid_setup_sm_context(dev); + /* Adjust the supported page sizes to HW capability */ + sslps = cap_super_page_val(iommu->cap); + if (!(sslps & BIT(0))) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M; + if (!(sslps & BIT(1))) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; + if (!intel_iommu_superpage) + dmar_domain->domain.pgsize_bitmap = SZ_4K; - return 0; + /* + * Besides the internal write buffer flush, the caching mode used for + * legacy nested translation (which utilizes shadowing page tables) + * also requires iotlb sync on map. + */ + if (rwbf_required(iommu) || cap_caching_mode(iommu->cap)) + dmar_domain->iotlb_sync_map = true; + + return &dmar_domain->domain; } -static int intel_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) +static struct iommu_domain * +intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) { struct device_domain_info *info = dev_iommu_priv_get(dev); - int ret; - - if (info->domain) - device_block_translation(dev); + struct intel_iommu *iommu = info->iommu; + struct iommu_domain *domain; - ret = prepare_domain_attach_device(domain, dev); - if (ret) - return ret; + if (user_data) + return ERR_PTR(-EOPNOTSUPP); - return dmar_domain_attach_device(to_dmar_domain(domain), dev); + /* Prefer first stage if possible by default. 
*/ + domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags); + if (domain != ERR_PTR(-EOPNOTSUPP)) + return domain; + return intel_iommu_domain_alloc_second_stage(dev, iommu, flags); } -static int intel_iommu_map(struct iommu_domain *domain, - unsigned long iova, phys_addr_t hpa, - size_t size, int iommu_prot, gfp_t gfp) +static void intel_iommu_domain_free(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - u64 max_addr; - int prot = 0; - - if (iommu_prot & IOMMU_READ) - prot |= DMA_PTE_READ; - if (iommu_prot & IOMMU_WRITE) - prot |= DMA_PTE_WRITE; - if (dmar_domain->set_pte_snp) - prot |= DMA_PTE_SNP; - - max_addr = iova + size; - if (dmar_domain->max_addr < max_addr) { - u64 end; - - /* check if minimum agaw is sufficient for mapped address */ - end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; - if (end < max_addr) { - pr_err("%s: iommu width (%d) is not " - "sufficient for the mapped address (%llx)\n", - __func__, dmar_domain->gaw, max_addr); - return -EFAULT; - } - dmar_domain->max_addr = max_addr; - } - /* Round up size to next multiple of PAGE_SIZE, if it and - the low bits of hpa would take us onto the next page */ - size = aligned_nrpages(hpa, size); - return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, - hpa >> VTD_PAGE_SHIFT, size, prot, gfp); + + if (WARN_ON(dmar_domain->nested_parent && + !list_empty(&dmar_domain->s1_domains))) + return; + + if (WARN_ON(!list_empty(&dmar_domain->devices))) + return; + + pt_iommu_deinit(&dmar_domain->iommu); + + kfree(dmar_domain->qi_batch); + kfree(dmar_domain); } -static int intel_iommu_map_pages(struct iommu_domain *domain, - unsigned long iova, phys_addr_t paddr, - size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) +static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain, + struct intel_iommu *iommu) { - unsigned long pgshift = __ffs(pgsize); - size_t size = pgcount << pgshift; - int ret; + if (WARN_ON(dmar_domain->domain.dirty_ops || + dmar_domain->nested_parent)) + return -EINVAL; - if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) + /* Only SL is available in legacy mode */ + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return -EINVAL; - if (!IS_ALIGNED(iova | paddr, pgsize)) + if (!ecap_smpwc(iommu->ecap) && + !(dmar_domain->fspt.x86_64_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) return -EINVAL; - ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); - if (!ret && mapped) - *mapped = size; + /* Supports the number of table levels */ + if (!cap_fl5lp_support(iommu->cap) && + dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48) + return -EINVAL; - return ret; + /* Same page size support */ + if (!cap_fl1gp_support(iommu->cap) && + (dmar_domain->domain.pgsize_bitmap & SZ_1G)) + return -EINVAL; + + /* iotlb sync on map requirement */ + if ((rwbf_required(iommu)) && !dmar_domain->iotlb_sync_map) + return -EINVAL; + + return 0; } -static size_t intel_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size, - struct iommu_iotlb_gather *gather) +static int +paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, + struct intel_iommu *iommu) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long start_pfn, last_pfn; - int level = 0; + unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2; + unsigned int sslps = cap_super_page_val(iommu->cap); + struct pt_iommu_vtdss_hw_info pt_info; - /* Cope with horrid API which requires us to unmap more 
than the - size argument if it happens to be a large-page mapping. */ - if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, - &level, GFP_ATOMIC))) - return 0; + pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info); - if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) - size = VTD_PAGE_SIZE << level_to_offset_bits(level); + if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu)) + return -EINVAL; + if (dmar_domain->nested_parent && !nested_supported(iommu)) + return -EINVAL; + + /* Legacy mode always supports second stage */ + if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) + return -EINVAL; - start_pfn = iova >> VTD_PAGE_SHIFT; - last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; + if (!iommu_paging_structure_coherency(iommu) && + !(dmar_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + return -EINVAL; - domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); + /* Address width falls within the capability */ + if (cap_mgaw(iommu->cap) < vasz_lg2) + return -EINVAL; - if (dmar_domain->max_addr == iova + size) - dmar_domain->max_addr = iova; + /* Page table level is supported. */ + if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw))) + return -EINVAL; + + /* Same page size support */ + if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M)) + return -EINVAL; + if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G)) + return -EINVAL; + + /* iotlb sync on map requirement */ + if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) && + !dmar_domain->iotlb_sync_map) + return -EINVAL; /* - * We do not use page-selective IOTLB invalidation in flush queue, - * so there is no need to track page and sync iotlb. + * FIXME this is locked wrong, it needs to be under the + * dmar_domain->lock */ - if (!iommu_iotlb_gather_queued(gather)) - iommu_iotlb_gather_add_page(domain, gather, iova, size); + if ((dmar_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) && + !ecap_sc_support(iommu->ecap)) + return -EINVAL; + return 0; +} - return size; +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + int ret = -EINVAL; + + if (intel_domain_is_fs_paging(dmar_domain)) + ret = paging_domain_compatible_first_stage(dmar_domain, iommu); + else if (intel_domain_is_ss_paging(dmar_domain)) + ret = paging_domain_compatible_second_stage(dmar_domain, iommu); + else if (WARN_ON(true)) + ret = -EINVAL; + if (ret) + return ret; + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && + context_copied(iommu, info->bus, info->devfn)) + return intel_pasid_setup_sm_context(dev); + + return 0; } -static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) +static int intel_iommu_attach_device(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { - unsigned long pgshift = __ffs(pgsize); - size_t size = pgcount << pgshift; + int ret; + + device_block_translation(dev); - return intel_iommu_unmap(domain, iova, size, gather); + ret = paging_domain_compatible(domain, dev); + if (ret) + return ret; + + ret = iopf_for_domain_set(domain, dev); + if (ret) + return ret; + + ret = dmar_domain_attach_device(to_dmar_domain(domain), dev); + if (ret) + iopf_for_domain_remove(domain, dev); + + return ret; } 
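The allocation path above prefers the first-stage table and only falls back to the second stage when intel_iommu_domain_alloc_first_stage() returns -EOPNOTSUPP, which it does for any flag other than IOMMU_HWPT_ALLOC_PASID. The sketch below is illustrative only, not part of this patch: it shows that dispatch from a caller's point of view, assumes the core iommu_paging_domain_alloc_flags() helper, and the wrapper name is made up.

#include <linux/iommu.h>
#include <uapi/linux/iommufd.h>

/*
 * Hypothetical caller: request a nesting-parent paging domain for @dev.
 * The Intel domain_alloc_paging_flags op tries the first stage, which
 * rejects IOMMU_HWPT_ALLOC_NEST_PARENT, so the domain is allocated by
 * intel_iommu_domain_alloc_second_stage(), or the call fails with
 * -EOPNOTSUPP if the hardware lacks nested/second-stage support.
 */
static struct iommu_domain *alloc_nest_parent_domain(struct device *dev)
{
	return iommu_paging_domain_alloc_flags(dev,
					       IOMMU_HWPT_ALLOC_NEST_PARENT);
}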
static void intel_iommu_tlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { cache_tag_flush_range(to_dmar_domain(domain), gather->start, - gather->end, list_empty(&gather->freelist)); + gather->end, + iommu_pages_list_empty(&gather->freelist)); iommu_put_pages_list(&gather->freelist); } -static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, - dma_addr_t iova) -{ - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct dma_pte *pte; - int level = 0; - u64 phys = 0; - - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, - GFP_ATOMIC); - if (pte && dma_pte_present(pte)) - phys = dma_pte_addr(pte) + - (iova & (BIT_MASK(level_to_offset_bits(level) + - VTD_PAGE_SHIFT) - 1)); - - return phys; -} - static bool domain_support_force_snooping(struct dmar_domain *domain) { struct device_domain_info *info; @@ -3938,44 +3171,41 @@ static bool domain_support_force_snooping(struct dmar_domain *domain) return support; } -static void domain_set_force_snooping(struct dmar_domain *domain) +static bool intel_iommu_enforce_cache_coherency_fs(struct iommu_domain *domain) { + struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct device_domain_info *info; - assert_spin_locked(&domain->lock); - /* - * Second level page table supports per-PTE snoop control. The - * iommu_map() interface will handle this by setting SNP bit. - */ - if (!domain->use_first_level) { - domain->set_pte_snp = true; - return; - } + guard(spinlock_irqsave)(&dmar_domain->lock); + + if (dmar_domain->force_snooping) + return true; - list_for_each_entry(info, &domain->devices, link) + if (!domain_support_force_snooping(dmar_domain)) + return false; + + dmar_domain->force_snooping = true; + list_for_each_entry(info, &dmar_domain->devices, link) intel_pasid_setup_page_snoop_control(info->iommu, info->dev, IOMMU_NO_PASID); + return true; } -static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) +static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long flags; - - if (dmar_domain->force_snooping) - return true; - spin_lock_irqsave(&dmar_domain->lock, flags); - if (!domain_support_force_snooping(dmar_domain) || - (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { - spin_unlock_irqrestore(&dmar_domain->lock, flags); + guard(spinlock_irqsave)(&dmar_domain->lock); + if (!domain_support_force_snooping(dmar_domain)) return false; - } - domain_set_force_snooping(dmar_domain); + /* + * Second level page table supports per-PTE snoop control. The + * iommu_map() interface will handle this by setting SNP bit. 
+ */ + dmar_domain->sspt.vtdss_pt.common.features |= + BIT(PT_FEAT_VTDSS_FORCE_COHERENCE); dmar_domain->force_snooping = true; - spin_unlock_irqrestore(&dmar_domain->lock, flags); - return true; } @@ -4053,13 +3283,14 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) } if (info->ats_supported && ecap_prs(iommu->ecap) && - pci_pri_supported(pdev)) + ecap_pds(iommu->ecap) && pci_pri_supported(pdev)) info->pri_supported = 1; } } dev_iommu_priv_set(dev, info); if (pdev && pci_ats_supported(pdev)) { + pci_prepare_ats(pdev, VTD_PAGE_SHIFT); ret = device_rbtree_insert(iommu, info); if (ret) goto free; @@ -4092,11 +3323,48 @@ free: return ERR_PTR(ret); } +static void intel_iommu_probe_finalize(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + /* + * The PCIe spec, in its wisdom, declares that the behaviour of the + * device is undefined if you enable PASID support after ATS support. + * So always enable PASID support on devices which have it, even if + * we can't yet know if we're ever going to use it. + */ + if (info->pasid_supported && + !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1)) + info->pasid_enabled = 1; + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { + iommu_enable_pci_ats(info); + /* Assign a DEVTLB cache tag to the default domain. */ + if (info->ats_enabled && info->domain) { + u16 did = domain_id_iommu(info->domain, iommu); + + if (cache_tag_assign(info->domain, did, dev, + IOMMU_NO_PASID, CACHE_TAG_DEVTLB)) + iommu_disable_pci_ats(info); + } + } + iommu_enable_pci_pri(info); +} + static void intel_iommu_release_device(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + iommu_disable_pci_pri(info); + iommu_disable_pci_ats(info); + + if (info->pasid_enabled) { + pci_disable_pasid(to_pci_dev(dev)); + info->pasid_enabled = 0; + } + mutex_lock(&iommu->iopf_lock); if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) device_rbtree_remove(info); @@ -4109,7 +3377,6 @@ static void intel_iommu_release_device(struct device *dev) intel_pasid_free_table(dev); intel_iommu_debugfs_remove_dev(info); kfree(info); - set_dma_ops(dev, NULL); } static void intel_iommu_get_resv_regions(struct device *device, @@ -4178,132 +3445,44 @@ static struct iommu_group *intel_iommu_device_group(struct device *dev) return generic_device_group(dev); } -static int intel_iommu_enable_sva(struct device *dev) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu; - - if (!info || dmar_disabled) - return -EINVAL; - - iommu = info->iommu; - if (!iommu) - return -EINVAL; - - if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) - return -ENODEV; - - if (!info->pasid_enabled || !info->ats_enabled) - return -EINVAL; - - /* - * Devices having device-specific I/O fault handling should not - * support PCI/PRI. The IOMMU side has no means to check the - * capability of device-specific IOPF. Therefore, IOMMU can only - * default that if the device driver enables SVA on a non-PRI - * device, it will handle IOPF in its own way. - */ - if (!info->pri_supported) - return 0; - - /* Devices supporting PRI should have it enabled. */ - if (!info->pri_enabled) - return -EINVAL; - - return 0; -} - -static int intel_iommu_enable_iopf(struct device *dev) +int intel_iommu_enable_iopf(struct device *dev) { - struct pci_dev *pdev = dev_is_pci(dev) ? 
to_pci_dev(dev) : NULL; struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu; + struct intel_iommu *iommu = info->iommu; int ret; - if (!pdev || !info || !info->ats_enabled || !info->pri_supported) + if (!info->pri_enabled) return -ENODEV; - if (info->pri_enabled) - return -EBUSY; - - iommu = info->iommu; - if (!iommu) - return -EINVAL; - - /* PASID is required in PRG Response Message. */ - if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) - return -EINVAL; - - ret = pci_reset_pri(pdev); - if (ret) - return ret; + /* pri_enabled is protected by the group mutex. */ + iommu_group_mutex_assert(dev); + if (info->iopf_refcount) { + info->iopf_refcount++; + return 0; + } ret = iopf_queue_add_device(iommu->iopf_queue, dev); if (ret) return ret; - ret = pci_enable_pri(pdev, PRQ_DEPTH); - if (ret) { - iopf_queue_remove_device(iommu->iopf_queue, dev); - return ret; - } - - info->pri_enabled = 1; + info->iopf_refcount = 1; return 0; } -static int intel_iommu_disable_iopf(struct device *dev) +void intel_iommu_disable_iopf(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; - if (!info->pri_enabled) - return -EINVAL; - - /* - * PCIe spec states that by clearing PRI enable bit, the Page - * Request Interface will not issue new page requests, but has - * outstanding page requests that have been transmitted or are - * queued for transmission. This is supposed to be called after - * the device driver has stopped DMA, all PASIDs have been - * unbound and the outstanding PRQs have been drained. - */ - pci_disable_pri(to_pci_dev(dev)); - info->pri_enabled = 0; - iopf_queue_remove_device(iommu->iopf_queue, dev); - - return 0; -} - -static int -intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) -{ - switch (feat) { - case IOMMU_DEV_FEAT_IOPF: - return intel_iommu_enable_iopf(dev); - - case IOMMU_DEV_FEAT_SVA: - return intel_iommu_enable_sva(dev); - - default: - return -ENODEV; - } -} - -static int -intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) -{ - switch (feat) { - case IOMMU_DEV_FEAT_IOPF: - return intel_iommu_disable_iopf(dev); + if (WARN_ON(!info->pri_enabled || !info->iopf_refcount)) + return; - case IOMMU_DEV_FEAT_SVA: - return 0; + iommu_group_mutex_assert(dev); + if (--info->iopf_refcount) + return; - default: - return -ENODEV; - } + iopf_queue_remove_device(iommu->iopf_queue, dev); } static bool intel_iommu_is_attach_deferred(struct device *dev) @@ -4333,20 +3512,31 @@ static bool risky_device(struct pci_dev *pdev) static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { - cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + if (dmar_domain->iotlb_sync_map) + cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1); return 0; } -static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, - struct iommu_domain *domain) +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct dev_pasid_info *curr, *dev_pasid = NULL; struct intel_iommu *iommu = info->iommu; + struct dmar_domain *dmar_domain; unsigned long flags; + if (!domain) + return; + + /* Identity domain has no meta data for pasid. 
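+	 * set_dev_pasid() for the identity domain never allocates a
+	 * dev_pasid_info or cache tags, so there is nothing to tear down
+	 * when it is passed in here as the old domain.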
*/ + if (domain->type == IOMMU_DOMAIN_IDENTITY) + return; + + dmar_domain = to_dmar_domain(domain); spin_lock_irqsave(&dmar_domain->lock, flags); list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { if (curr->dev == dev && curr->pasid == pasid) { @@ -4355,27 +3545,79 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, break; } } - WARN_ON_ONCE(!dev_pasid); spin_unlock_irqrestore(&dmar_domain->lock, flags); cache_tag_unassign_domain(dmar_domain, dev, pasid); domain_detach_iommu(dmar_domain, iommu); - intel_iommu_debugfs_remove_dev_pasid(dev_pasid); + if (!WARN_ON_ONCE(!dev_pasid)) { + intel_iommu_debugfs_remove_dev_pasid(dev_pasid); + kfree(dev_pasid); + } +} + +static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); + iopf_for_domain_remove(old, dev); + domain_remove_dev_pasid(old, dev, pasid); + + return 0; +} + +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + unsigned long flags; + int ret; + + dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); + if (!dev_pasid) + return ERR_PTR(-ENOMEM); + + ret = domain_attach_iommu(dmar_domain, iommu); + if (ret) + goto out_free; + + ret = cache_tag_assign_domain(dmar_domain, dev, pasid); + if (ret) + goto out_detach_iommu; + + dev_pasid->dev = dev; + dev_pasid->pasid = pasid; + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + return dev_pasid; +out_detach_iommu: + domain_detach_iommu(dmar_domain, iommu); +out_free: kfree(dev_pasid); - intel_pasid_tear_down_entry(iommu, dev, pasid, false); - intel_drain_pasid_prq(dev, pasid); + return ERR_PTR(ret); } static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct dev_pasid_info *dev_pasid; - unsigned long flags; int ret; + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) + return -EINVAL; + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; @@ -4385,58 +3627,54 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (context_copied(iommu, info->bus, info->devfn)) return -EBUSY; - ret = prepare_domain_attach_device(domain, dev); + ret = paging_domain_compatible(domain, dev); if (ret) return ret; - dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); - if (!dev_pasid) - return -ENOMEM; + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); - ret = domain_attach_iommu(dmar_domain, iommu); + ret = iopf_for_domain_replace(domain, old, dev); if (ret) - goto out_free; + goto out_remove_dev_pasid; - ret = cache_tag_assign_domain(dmar_domain, dev, pasid); - if (ret) - goto out_detach_iommu; - - if (domain_type_is_si(dmar_domain)) - ret = intel_pasid_setup_pass_through(iommu, dev, pasid); - else if 
(dmar_domain->use_first_level) + if (intel_domain_is_fs_paging(dmar_domain)) ret = domain_setup_first_level(iommu, dmar_domain, - dev, pasid); - else - ret = intel_pasid_setup_second_level(iommu, dmar_domain, - dev, pasid); + dev, pasid, old); + else if (intel_domain_is_ss_paging(dmar_domain)) + ret = domain_setup_second_level(iommu, dmar_domain, + dev, pasid, old); + else if (WARN_ON(true)) + ret = -EINVAL; + if (ret) - goto out_unassign_tag; + goto out_unwind_iopf; - dev_pasid->dev = dev; - dev_pasid->pasid = pasid; - spin_lock_irqsave(&dmar_domain->lock, flags); - list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); - spin_unlock_irqrestore(&dmar_domain->lock, flags); + domain_remove_dev_pasid(old, dev, pasid); - if (domain->type & __IOMMU_DOMAIN_PAGING) - intel_iommu_debugfs_create_dev_pasid(dev_pasid); + intel_iommu_debugfs_create_dev_pasid(dev_pasid); return 0; -out_unassign_tag: - cache_tag_unassign_domain(dmar_domain, dev, pasid); -out_detach_iommu: - domain_detach_iommu(dmar_domain, iommu); -out_free: - kfree(dev_pasid); + +out_unwind_iopf: + iopf_for_domain_replace(old, domain, dev); +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); return ret; } -static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) +static void *intel_iommu_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct iommu_hw_info_vtd *vtd; + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_INTEL_VTD) + return ERR_PTR(-EOPNOTSUPP); + vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); if (!vtd) return ERR_PTR(-ENOMEM); @@ -4530,82 +3768,163 @@ err_unwind: return ret; } -static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) +static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long end = iova + size - 1; - unsigned long pgsize; + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct context_entry *context; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, bus, devfn, 1); + if (!context) { + spin_unlock(&iommu->lock); + return -ENOMEM; + } + + if (context_present(context) && !context_copied(iommu, bus, devfn)) { + spin_unlock(&iommu->lock); + return 0; + } + + copied_context_tear_down(iommu, context, bus, devfn); + context_clear_entry(context); + context_set_domain_id(context, FLPT_DEFAULT_DID); /* - * IOMMUFD core calls into a dirty tracking disabled domain without an - * IOVA bitmap set in order to clean dirty bits in all PTEs that might - * have occurred when we stopped dirty tracking. This ensures that we - * never inherit dirtied bits from a previous cycle. + * In pass through mode, AW must be programmed to indicate the largest + * AGAW value supported by hardware. And ASR is ignored by hardware. 
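+	 * (ASR is the context entry's second-stage page-table pointer, which
+	 * hardware does not reference when the translation type is
+	 * pass-through.)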
*/ - if (!dmar_domain->dirty_tracking && dirty->bitmap) - return -EINVAL; + context_set_address_width(context, iommu->msagaw); + context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH); + context_set_fault_enable(context); + context_set_present(context); + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(context, sizeof(*context)); + context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn); + spin_unlock(&iommu->lock); + + return 0; +} - do { - struct dma_pte *pte; - int lvl = 0; +static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data) +{ + struct device *dev = data; - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, - GFP_ATOMIC); - pgsize = level_size(lvl) << VTD_PAGE_SHIFT; - if (!pte || !dma_pte_present(pte)) { - iova += pgsize; - continue; - } + return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); +} + +static int device_setup_pass_through(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); - if (dma_sl_pte_test_and_clear_dirty(pte, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); + if (!dev_is_pci(dev)) + return context_setup_pass_through(dev, info->bus, info->devfn); + + return pci_for_each_dma_alias(to_pci_dev(dev), + context_setup_pass_through_cb, dev); +} +static int identity_domain_attach_dev(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + device_block_translation(dev); + + if (dev_is_real_dma_subdevice(dev)) + return 0; + + /* + * No PRI support with the global identity domain. No need to enable or + * disable PRI in this path as the iommu has been put in the blocking + * state. 
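+	 * The identity domain also has no iopf_handler, so the
+	 * iopf_for_domain_*() helpers treat it as a no-op.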
+ */ + if (sm_supported(iommu)) + ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); + else + ret = device_setup_pass_through(dev); + + if (!ret) + info->domain_attached = true; + + return ret; +} + +static int identity_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + ret = iopf_for_domain_replace(domain, old, dev); + if (ret) + return ret; + + ret = domain_setup_passthrough(iommu, dev, pasid, old); + if (ret) { + iopf_for_domain_replace(old, domain, dev); + return ret; + } + + domain_remove_dev_pasid(old, dev, pasid); return 0; } -static const struct iommu_dirty_ops intel_dirty_ops = { - .set_dirty_tracking = intel_iommu_set_dirty_tracking, - .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, +static struct iommu_domain identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = identity_domain_attach_dev, + .set_dev_pasid = identity_domain_set_dev_pasid, + }, +}; + +const struct iommu_domain_ops intel_fs_paging_domain_ops = { + IOMMU_PT_DOMAIN_OPS(x86_64), + .attach_dev = intel_iommu_attach_device, + .set_dev_pasid = intel_iommu_set_dev_pasid, + .iotlb_sync_map = intel_iommu_iotlb_sync_map, + .flush_iotlb_all = intel_flush_iotlb_all, + .iotlb_sync = intel_iommu_tlb_sync, + .free = intel_iommu_domain_free, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs, +}; + +const struct iommu_domain_ops intel_ss_paging_domain_ops = { + IOMMU_PT_DOMAIN_OPS(vtdss), + .attach_dev = intel_iommu_attach_device, + .set_dev_pasid = intel_iommu_set_dev_pasid, + .iotlb_sync_map = intel_iommu_iotlb_sync_map, + .flush_iotlb_all = intel_flush_iotlb_all, + .iotlb_sync = intel_iommu_tlb_sync, + .free = intel_iommu_domain_free, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss, }; const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, .release_domain = &blocking_domain, + .identity_domain = &identity_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, - .domain_alloc = intel_iommu_domain_alloc, - .domain_alloc_user = intel_iommu_domain_alloc_user, + .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags, .domain_alloc_sva = intel_svm_domain_alloc, + .domain_alloc_nested = intel_iommu_domain_alloc_nested, .probe_device = intel_iommu_probe_device, + .probe_finalize = intel_iommu_probe_finalize, .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, .device_group = intel_iommu_device_group, - .dev_enable_feat = intel_iommu_dev_enable_feat, - .dev_disable_feat = intel_iommu_dev_disable_feat, .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, - .remove_dev_pasid = intel_iommu_remove_dev_pasid, - .pgsize_bitmap = SZ_4K, -#ifdef CONFIG_INTEL_IOMMU_SVM - .page_response = intel_svm_page_response, -#endif - .default_domain_ops = &(const struct iommu_domain_ops) { - .attach_dev = intel_iommu_attach_device, - .set_dev_pasid = intel_iommu_set_dev_pasid, - .map_pages = intel_iommu_map_pages, - .unmap_pages = intel_iommu_unmap_pages, - .iotlb_sync_map = intel_iommu_iotlb_sync_map, - .flush_iotlb_all = intel_flush_iotlb_all, - .iotlb_sync = intel_iommu_tlb_sync, - .iova_to_phys = intel_iommu_iova_to_phys, - 
.free = intel_iommu_domain_free, - .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, - } + .page_response = intel_iommu_page_response, }; static void quirk_iommu_igfx(struct pci_dev *dev) @@ -4626,6 +3945,9 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); +/* QM57/QS57 integrated gfx malfunctions with dmar */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx); + /* Broadwell igfx malfunctions with dmar */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); @@ -4703,7 +4025,6 @@ static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); @@ -4897,3 +4218,5 @@ err: return ret; } + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index eaf015b4353b..25c5e22096d4 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -22,8 +22,9 @@ #include <linux/bitfield.h> #include <linux/xarray.h> #include <linux/perf_event.h> +#include <linux/pci.h> +#include <linux/generic_pt/iommu.h> -#include <asm/cacheflush.h> #include <asm/iommu.h> #include <uapi/linux/iommufd.h> @@ -49,7 +50,6 @@ #define DMA_FL_PTE_US BIT_ULL(2) #define DMA_FL_PTE_ACCESS BIT_ULL(5) #define DMA_FL_PTE_DIRTY BIT_ULL(6) -#define DMA_FL_PTE_XD BIT_ULL(63) #define DMA_SL_PTE_DIRTY_BIT 9 #define DMA_SL_PTE_DIRTY BIT_ULL(DMA_SL_PTE_DIRTY_BIT) @@ -77,7 +77,6 @@ #define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ #define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ #define DMAR_FEUADDR_REG 0x44 /* Upper address register */ -#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ #define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ #define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ #define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ @@ -173,8 +172,6 @@ #define cap_pgsel_inv(c) (((c) >> 39) & 1) #define cap_super_page_val(c) (((c) >> 34) & 0xf) -#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \ - * OFFSET_STRIDE) + 21) #define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) #define cap_max_fault_reg_offset(c) \ @@ -462,7 +459,6 @@ enum { #define QI_PGRP_PASID(pasid) (((u64)(pasid)) << 32) /* Page group response descriptor QW1 */ -#define QI_PGRP_LPIG(x) (((u64)(x)) << 2) #define QI_PGRP_IDX(idx) (((u64)(idx)) << 3) @@ -493,14 +489,13 @@ struct q_inval { /* Page Request Queue depth */ #define PRQ_ORDER 4 -#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) -#define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) +#define PRQ_SIZE (SZ_4K << PRQ_ORDER) +#define PRQ_RING_MASK (PRQ_SIZE - 0x20) +#define PRQ_DEPTH (PRQ_SIZE >> 5) struct dmar_pci_notify_info; #ifdef CONFIG_IRQ_REMAP -/* 1MB - maximum possible interrupt remapping table size */ -#define INTR_REMAP_PAGE_ORDER 8 #define INTR_REMAP_TABLE_REG_SIZE 0xf #define INTR_REMAP_TABLE_REG_SIZE_MASK 0xf @@ -542,7 +537,8 @@ enum { #define pasid_supported(iommu) (sm_supported(iommu) && \ 
ecap_pasid((iommu)->ecap)) #define ssads_supported(iommu) (sm_supported(iommu) && \ - ecap_slads((iommu)->ecap)) + ecap_slads((iommu)->ecap) && \ + ecap_smpwc(iommu->ecap)) #define nested_supported(iommu) (sm_supported(iommu) && \ ecap_nest((iommu)->ecap)) @@ -585,23 +581,36 @@ struct iommu_domain_info { * to VT-d spec, section 9.3 */ }; +/* + * We start simply by using a fixed size for the batched descriptors. This + * size is currently sufficient for our needs. Future improvements could + * involve dynamically allocating the batch buffer based on actual demand, + * allowing us to adjust the batch size for optimal performance in different + * scenarios. + */ +#define QI_MAX_BATCHED_DESC_COUNT 16 +struct qi_batch { + struct qi_desc descs[QI_MAX_BATCHED_DESC_COUNT]; + unsigned int index; +}; + struct dmar_domain { - int nid; /* node id */ + union { + struct iommu_domain domain; + struct pt_iommu iommu; + /* First stage page table */ + struct pt_iommu_x86_64 fspt; + /* Second stage page table */ + struct pt_iommu_vtdss sspt; + }; + struct xarray iommu_array; /* Attached IOMMU array */ - u8 has_iotlb_device: 1; - u8 iommu_coherency: 1; /* indicate coherency of iommu access */ - u8 force_snooping : 1; /* Create IOPTEs with snoop control */ - u8 set_pte_snp:1; - u8 use_first_level:1; /* DMA translation for the domain goes - * through the first level page table, - * otherwise, goes through the second - * level. - */ + u8 force_snooping:1; /* Create PASID entry with snoop control */ u8 dirty_tracking:1; /* Dirty tracking is enabled */ u8 nested_parent:1; /* Has other domains nested on it */ - u8 has_mappings:1; /* Has mappings configured through - * iommu_map() interface. + u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write + * buffer when creating mappings. */ spinlock_t lock; /* Protect device tracking lists */ @@ -610,27 +619,11 @@ struct dmar_domain { spinlock_t cache_lock; /* Protect the cache tag list */ struct list_head cache_tags; /* Cache tag list */ + struct qi_batch *qi_batch; /* Batched QI descriptors */ - int iommu_superpage;/* Level of superpages supported: - 0 == 4KiB (no superpages), 1 == 2MiB, - 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ union { /* DMA remapping domain */ struct { - /* virtual address */ - struct dma_pte *pgd; - /* max guest address width */ - int gaw; - /* - * adjusted guest address width: - * 0: level 2 30-bit - * 1: level 3 39-bit - * 2: level 4 48-bit - * 3: level 5 57-bit - */ - int agaw; - /* maximum mapped address */ - u64 max_addr; /* Protect the s1_domains list */ spinlock_t s1_lock; /* Track s1_domains nested on this domain */ @@ -641,8 +634,6 @@ struct dmar_domain { struct { /* parent page table which the user domain is nested on */ struct dmar_domain *s2_domain; - /* user page table pointer (in GPA) */ - unsigned long s1_pgtbl; /* page table attributes */ struct iommu_hwpt_vtd_s1 s1_cfg; /* link to parent domain siblings */ @@ -654,10 +645,10 @@ struct dmar_domain { struct mmu_notifier notifier; }; }; - - struct iommu_domain domain; /* generic domain data structure for - iommu core */ }; +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, sspt.iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, fspt.iommu, domain); /* * In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters. 
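struct dmar_domain now overlays the generic iommu_domain and the GENERIC_PT page-table variants in one anonymous union, with the PT_IOMMU_CHECK_DOMAIN() lines presumably asserting that every variant exposes the core iommu_domain at the same offset so to_dmar_domain() keeps working; the real generic_pt definitions are not part of this diff. The following self-contained sketch uses simplified stand-in structs and plain static_assert()s to illustrate the layout rule being checked.

#include <stddef.h>
#include <assert.h>

struct iommu_domain { int type; };
struct pt_iommu { struct iommu_domain domain; };	/* stand-in */
struct pt_iommu_x86_64 { struct pt_iommu iommu; };	/* stand-in */
struct pt_iommu_vtdss { struct pt_iommu iommu; };	/* stand-in */

struct dmar_domain {
	union {
		struct iommu_domain domain;
		struct pt_iommu iommu;
		struct pt_iommu_x86_64 fspt;
		struct pt_iommu_vtdss sspt;
	};
	/* ... driver-private fields follow ... */
};

/* Every variant must place the generic iommu_domain at the same offset. */
static_assert(offsetof(struct dmar_domain, domain) ==
	      offsetof(struct dmar_domain, iommu.domain), "layout");
static_assert(offsetof(struct dmar_domain, domain) ==
	      offsetof(struct dmar_domain, fspt.iommu.domain), "layout");
static_assert(offsetof(struct dmar_domain, domain) ==
	      offsetof(struct dmar_domain, sspt.iommu.domain), "layout");

/* container_of()-style conversion, as to_dmar_domain() does in the driver. */
#define to_dmar_domain(dom) \
	((struct dmar_domain *)((char *)(dom) - offsetof(struct dmar_domain, domain)))

int main(void)
{
	struct dmar_domain d = { .domain = { .type = 0 } };

	assert(to_dmar_domain(&d.domain) == &d);
	return 0;
}

Which union member is live is decided when the domain is allocated; the ops-pointer helpers added to iommu.h, intel_domain_is_fs_paging() and intel_domain_is_ss_paging(), are then used to tell the two paging flavors apart.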
@@ -688,8 +679,6 @@ struct iommu_pmu { DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX); struct perf_event *event_list[IOMMU_PMU_IDX_MAX]; unsigned char irq_name[16]; - struct hlist_node cpuhp_node; - int cpu; }; #define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED) @@ -710,22 +699,22 @@ struct intel_iommu { int msagaw; /* max sagaw of this iommu */ unsigned int irq, pr_irq, perf_irq; u16 segment; /* PCI segment# */ - unsigned char name[13]; /* Device Name */ + unsigned char name[16]; /* Device Name */ #ifdef CONFIG_INTEL_IOMMU - unsigned long *domain_ids; /* bitmap of domains */ + /* mutex to protect domain_ida */ + struct mutex did_lock; + struct ida domain_ida; /* domain id allocator */ unsigned long *copied_tables; /* bitmap of copied tables */ spinlock_t lock; /* protect context, domain ids */ struct root_entry *root_entry; /* virtual address */ struct iommu_flush flush; #endif -#ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ unsigned long prq_seq_number; struct completion prq_complete; -#endif struct iopf_queue *iopf_queue; unsigned char iopfq_name[16]; /* Synchronization between fault report and iommu device release. */ @@ -766,7 +755,9 @@ struct device_domain_info { u8 ats_supported:1; u8 ats_enabled:1; u8 dtlb_extra_inval:1; /* Quirk for devices need extra flush */ + u8 domain_attached:1; /* Device has domain attached */ u8 ats_qdep; + unsigned int iopf_refcount; struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ @@ -800,6 +791,24 @@ static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) return container_of(dom, struct dmar_domain, domain); } +/* + * Domain ID 0 and 1 are reserved: + * + * If Caching mode is set, then invalid translations are tagged + * with domain-id 0, hence we need to pre-allocate it. We also + * use domain-id 0 as a marker for non-allocated domain-id, so + * make sure it is not used for a real domain. + * + * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid + * entry for first-level or pass-through translation modes should + * be programmed with a domain id different from those used for + * second-level or nested translation. We reserve a domain id for + * this purpose. This domain id is also used for identity domain + * in legacy mode. 
+ */ +#define FLPT_DEFAULT_DID 1 +#define IDA_START_DID 2 + /* Retrieve the domain ID which has allocated to the domain */ static inline u16 domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) @@ -810,6 +819,21 @@ domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) return info->did; } +static inline u16 +iommu_domain_did(struct iommu_domain *domain, struct intel_iommu *iommu) +{ + if (domain->type == IOMMU_DOMAIN_SVA || + domain->type == IOMMU_DOMAIN_IDENTITY) + return FLPT_DEFAULT_DID; + return domain_id_iommu(to_dmar_domain(domain), iommu); +} + +static inline bool dev_is_real_dma_subdevice(struct device *dev) +{ + return dev && dev_is_pci(dev) && + pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); +} + /* * 0: readable * 1: writable @@ -823,19 +847,13 @@ struct dma_pte { u64 val; }; -static inline void dma_clear_pte(struct dma_pte *pte) -{ - pte->val = 0; -} - static inline u64 dma_pte_addr(struct dma_pte *pte) { #ifdef CONFIG_64BIT - return pte->val & VTD_PAGE_MASK & (~DMA_FL_PTE_XD); + return pte->val & VTD_PAGE_MASK; #else /* Must have a full atomic 64-bit read */ - return __cmpxchg64(&pte->val, 0ULL, 0ULL) & - VTD_PAGE_MASK & (~DMA_FL_PTE_XD); + return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK; #endif } @@ -844,32 +862,11 @@ static inline bool dma_pte_present(struct dma_pte *pte) return (pte->val & 3) != 0; } -static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte, - unsigned long flags) -{ - if (flags & IOMMU_DIRTY_NO_CLEAR) - return (pte->val & DMA_SL_PTE_DIRTY) != 0; - - return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT, - (unsigned long *)&pte->val); -} - static inline bool dma_pte_superpage(struct dma_pte *pte) { return (pte->val & DMA_PTE_LARGE_PAGE); } -static inline bool first_pte_in_page(struct dma_pte *pte) -{ - return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE); -} - -static inline int nr_pte_to_next_page(struct dma_pte *pte) -{ - return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) : - (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte; -} - static inline bool context_present(struct context_entry *context) { return (context->lo & 1); @@ -885,11 +882,6 @@ static inline int agaw_to_level(int agaw) return agaw + 2; } -static inline int agaw_to_width(int agaw) -{ - return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); -} - static inline int width_to_agaw(int width) { return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); @@ -905,44 +897,6 @@ static inline int pfn_level_offset(u64 pfn, int level) return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; } -static inline u64 level_mask(int level) -{ - return -1ULL << level_to_offset_bits(level); -} - -static inline u64 level_size(int level) -{ - return 1ULL << level_to_offset_bits(level); -} - -static inline u64 align_to_level(u64 pfn, int level) -{ - return (pfn + level_size(level) - 1) & level_mask(level); -} - -static inline unsigned long lvl_to_nr_pages(unsigned int lvl) -{ - return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); -} - -/* VT-d pages must always be _smaller_ than MM pages. Otherwise things - are never going to work. 
*/ -static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn) -{ - return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); -} -static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn) -{ - return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1; -} -static inline unsigned long page_to_dma_pfn(struct page *pg) -{ - return mm_to_dma_pfn_start(page_to_pfn(pg)); -} -static inline unsigned long virt_to_dma_pfn(void *p) -{ - return page_to_dma_pfn(virt_to_page(p)); -} static inline void context_set_present(struct context_entry *context) { @@ -1047,6 +1001,15 @@ static inline void context_set_sm_pre(struct context_entry *context) context->lo |= BIT_ULL(4); } +/* + * Clear the PRE(Page Request Enable) field of a scalable mode context + * entry. + */ +static inline void context_clear_sm_pre(struct context_entry *context) +{ + context->lo &= ~BIT_ULL(4); +} + /* Returns a number of VTD pages, but aligned to MM page size */ static inline unsigned long aligned_nrpages(unsigned long host_addr, size_t size) { @@ -1060,6 +1023,115 @@ static inline unsigned long nrpages_to_size(unsigned long npages) return npages << VTD_PAGE_SHIFT; } +static inline void qi_desc_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type, + struct qi_desc *desc) +{ + u8 dw = 0, dr = 0; + int ih = addr & 1; + + if (cap_write_drain(iommu->cap)) + dw = 1; + + if (cap_read_drain(iommu->cap)) + dr = 1; + + desc->qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) + | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; + desc->qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) + | QI_IOTLB_AM(size_order); + desc->qw2 = 0; + desc->qw3 = 0; +} + +static inline void qi_desc_dev_iotlb(u16 sid, u16 pfsid, u16 qdep, u64 addr, + unsigned int mask, struct qi_desc *desc) +{ + if (mask) { + addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1; + desc->qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; + } else { + desc->qw1 = QI_DEV_IOTLB_ADDR(addr); + } + + if (qdep >= QI_DEV_IOTLB_MAX_INVS) + qdep = 0; + + desc->qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | + QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); + desc->qw2 = 0; + desc->qw3 = 0; +} + +static inline void qi_desc_piotlb(u16 did, u32 pasid, u64 addr, + unsigned long npages, bool ih, + struct qi_desc *desc) +{ + if (npages == -1) { + desc->qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc->qw1 = 0; + } else { + int mask = ilog2(__roundup_pow_of_two(npages)); + unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); + + if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) + addr = ALIGN_DOWN(addr, align); + + desc->qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | + QI_EIOTLB_TYPE; + desc->qw1 = QI_EIOTLB_ADDR(addr) | + QI_EIOTLB_IH(ih) | + QI_EIOTLB_AM(mask); + } +} + +static inline void qi_desc_dev_iotlb_pasid(u16 sid, u16 pfsid, u32 pasid, + u16 qdep, u64 addr, + unsigned int size_order, + struct qi_desc *desc) +{ + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); + + desc->qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(pfsid); + + /* + * If S bit is 0, we only flush a single page. If S bit is set, + * The least significant zero bit indicates the invalidation address + * range. VT-d spec 6.5.2.6. + * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB. 
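+ * For example, flushing 16KB at address 0x200000 (size_order = 2): bits
+ * 13:12 are first set (giving 0x203000), then bit 13, the size_order bit,
+ * is cleared, leaving 0x201000 with the S bit set. The least significant
+ * zero bit is then bit 13, which encodes the 16KB invalidation range.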
+ * size order = 0 is PAGE_SIZE 4KB + * Max Invs Pending (MIP) is set to 0 for now until we have DIT in + * ECAP. + */ + if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order)) + pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n", + addr, size_order); + + /* Take page address */ + desc->qw1 = QI_DEV_EIOTLB_ADDR(addr); + + if (size_order) { + /* + * Existing 0s in address below size_order may be the least + * significant bit, we must set them to 1s to avoid having + * smaller size than desired. + */ + desc->qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1, + VTD_PAGE_SHIFT); + /* Clear size_order bit to indicate size */ + desc->qw1 &= ~mask; + /* Set the S bit to indicate flushing more than 1 page */ + desc->qw1 |= QI_DEV_EIOTLB_SIZE; + } +} + /* Convert value to context PASID directory size field coding. */ #define context_pdts(pds) (((pds) & 0x7) << 9) @@ -1091,25 +1163,37 @@ void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, unsigned int count, unsigned long options); + +void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type); /* * Options used in qi_submit_sync: * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8. */ #define QI_OPT_WAIT_DRAIN BIT(0) -void domain_update_iotlb(struct dmar_domain *domain); int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); -int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev); -void domain_update_iommu_cap(struct dmar_domain *domain); +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); + +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); + +int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, + ioasid_t pasid, u16 did, phys_addr_t fsptptr, + int flags, struct iommu_domain *old); int dmar_ir_support(void); void iommu_flush_write_buffer(struct intel_iommu *iommu); -struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, - const struct iommu_user_data *user_data); +struct iommu_domain * +intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, + u32 flags, + const struct iommu_user_data *user_data); struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid); enum cache_tag_type { @@ -1135,6 +1219,8 @@ struct cache_tag { unsigned int users; }; +int cache_tag_assign(struct dmar_domain *domain, u16 did, struct device *dev, + ioasid_t pasid, enum cache_tag_type type); int cache_tag_assign_domain(struct dmar_domain *domain, struct device *dev, ioasid_t pasid); void cache_tag_unassign_domain(struct dmar_domain *domain, @@ -1145,18 +1231,57 @@ void cache_tag_flush_all(struct dmar_domain *domain); void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, unsigned long end); +void intel_context_flush_no_pasid(struct device_domain_info *info, + struct context_entry *context, u16 did); + +int intel_iommu_enable_prq(struct intel_iommu *iommu); +int intel_iommu_finish_prq(struct intel_iommu *iommu); +void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg); +void 
intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid); + +int intel_iommu_enable_iopf(struct device *dev); +void intel_iommu_disable_iopf(struct device *dev); + +static inline int iopf_for_domain_set(struct iommu_domain *domain, + struct device *dev) +{ + if (!domain || !domain->iopf_handler) + return 0; + + return intel_iommu_enable_iopf(dev); +} + +static inline void iopf_for_domain_remove(struct iommu_domain *domain, + struct device *dev) +{ + if (!domain || !domain->iopf_handler) + return; + + intel_iommu_disable_iopf(dev); +} + +static inline int iopf_for_domain_replace(struct iommu_domain *new, + struct iommu_domain *old, + struct device *dev) +{ + int ret; + + ret = iopf_for_domain_set(new, dev); + if (ret) + return ret; + + iopf_for_domain_remove(old, dev); + + return 0; +} + #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); -int intel_svm_enable_prq(struct intel_iommu *iommu); -int intel_svm_finish_prq(struct intel_iommu *iommu); -void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, - struct iommu_page_response *msg); struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct mm_struct *mm); -void intel_drain_pasid_prq(struct device *dev, u32 pasid); #else static inline void intel_svm_check(struct intel_iommu *iommu) {} -static inline void intel_drain_pasid_prq(struct device *dev, u32 pasid) {} static inline struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct mm_struct *mm) { @@ -1183,6 +1308,18 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc); extern const struct iommu_ops intel_iommu_ops; +extern const struct iommu_domain_ops intel_fs_paging_domain_ops; +extern const struct iommu_domain_ops intel_ss_paging_domain_ops; + +static inline bool intel_domain_is_fs_paging(struct dmar_domain *domain) +{ + return domain->domain.ops == &intel_fs_paging_domain_ops; +} + +static inline bool intel_domain_is_ss_paging(struct dmar_domain *domain) +{ + return domain->domain.ops == &intel_ss_paging_domain_ops; +} #ifdef CONFIG_INTEL_IOMMU extern int intel_iommu_sm; diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index e4a70886678c..4f9b01dc91e8 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -10,6 +10,7 @@ #include <linux/hpet.h> #include <linux/pci.h> #include <linux/irq.h> +#include <linux/irqchip/irq-msi-lib.h> #include <linux/acpi.h> #include <linux/irqdomain.h> #include <linux/crash_dump.h> @@ -24,12 +25,6 @@ #include "iommu.h" #include "../irq_remapping.h" #include "../iommu-pages.h" -#include "cap_audit.h" - -enum irq_mode { - IRQ_REMAPPING, - IRQ_POSTING, -}; struct ioapic_scope { struct intel_iommu *iommu; @@ -50,8 +45,8 @@ struct irq_2_iommu { u16 irte_index; u16 sub_handle; u8 irte_mask; - enum irq_mode mode; bool posted_msi; + bool posted_vcpu; }; struct intel_ir_data { @@ -139,7 +134,6 @@ static int alloc_irte(struct intel_iommu *iommu, irq_iommu->irte_index = index; irq_iommu->sub_handle = 0; irq_iommu->irte_mask = mask; - irq_iommu->mode = IRQ_REMAPPING; } raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); @@ -194,8 +188,6 @@ static int modify_irte(struct irq_2_iommu *irq_iommu, rc = qi_flush_iec(iommu, index, 0); - /* Update iommu mode according to the IRTE mode */ - irq_iommu->mode = irte->pst ? 
IRQ_POSTING : IRQ_REMAPPING; raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); return rc; @@ -312,7 +304,7 @@ static int set_ioapic_sid(struct irte *irte, int apic) for (i = 0; i < MAX_IO_APICS; i++) { if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) { - sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; + sid = PCI_DEVID(ir_ioapic[i].bus, ir_ioapic[i].devfn); break; } } @@ -337,7 +329,7 @@ static int set_hpet_sid(struct irte *irte, u8 id) for (i = 0; i < MAX_HPET_TBS; i++) { if (ir_hpet[i].iommu && ir_hpet[i].id == id) { - sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn; + sid = PCI_DEVID(ir_hpet[i].bus, ir_hpet[i].devfn); break; } } @@ -527,8 +519,14 @@ static void iommu_enable_irq_remapping(struct intel_iommu *iommu) static int intel_setup_irq_remapping(struct intel_iommu *iommu) { + struct irq_domain_info info = { + .ops = &intel_ir_domain_ops, + .parent = arch_get_ir_parent_domain(), + .domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI, + .size = INTR_REMAP_TABLE_ENTRIES, + .host_data = iommu, + }; struct ir_table *ir_table; - struct fwnode_handle *fn; unsigned long *bitmap; void *ir_table_base; @@ -539,11 +537,11 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) if (!ir_table) return -ENOMEM; - ir_table_base = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, - INTR_REMAP_PAGE_ORDER); + /* 1MB - maximum possible interrupt remapping table size */ + ir_table_base = + iommu_alloc_pages_node_sz(iommu->node, GFP_KERNEL, SZ_1M); if (!ir_table_base) { - pr_err("IR%d: failed to allocate pages of order %d\n", - iommu->seq_id, INTR_REMAP_PAGE_ORDER); + pr_err("IR%d: failed to allocate 1M of pages\n", iommu->seq_id); goto out_free_table; } @@ -553,25 +551,16 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) goto out_free_pages; } - fn = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id); - if (!fn) + info.fwnode = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id); + if (!info.fwnode) goto out_free_bitmap; - iommu->ir_domain = - irq_domain_create_hierarchy(arch_get_ir_parent_domain(), - 0, INTR_REMAP_TABLE_ENTRIES, - fn, &intel_ir_domain_ops, - iommu); + iommu->ir_domain = msi_create_parent_irq_domain(&info, &dmar_msi_parent_ops); if (!iommu->ir_domain) { pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id); goto out_free_fwnode; } - irq_domain_update_bus_token(iommu->ir_domain, DOMAIN_BUS_DMAR); - iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT | - IRQ_DOMAIN_FLAG_ISOLATED_MSI; - iommu->ir_domain->msi_parent_ops = &dmar_msi_parent_ops; - ir_table->base = ir_table_base; ir_table->bitmap = bitmap; iommu->ir_table = ir_table; @@ -597,8 +586,8 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) if (ir_pre_enabled(iommu)) { if (!is_kdump_kernel()) { - pr_warn("IRQ remapping was enabled on %s but we are not in kdump mode\n", - iommu->name); + pr_info_once("IRQ remapping was enabled on %s but we are not in kdump mode\n", + iommu->name); clear_ir_pre_enabled(iommu); iommu_disable_irq_remapping(iommu); } else if (iommu_load_old_irte(iommu)) @@ -617,11 +606,11 @@ out_free_ir_domain: irq_domain_remove(iommu->ir_domain); iommu->ir_domain = NULL; out_free_fwnode: - irq_domain_free_fwnode(fn); + irq_domain_free_fwnode(info.fwnode); out_free_bitmap: bitmap_free(bitmap); out_free_pages: - iommu_free_pages(ir_table_base, INTR_REMAP_PAGE_ORDER); + iommu_free_pages(ir_table_base); out_free_table: kfree(ir_table); @@ -642,7 +631,7 @@ static void intel_teardown_irq_remapping(struct intel_iommu *iommu) irq_domain_free_fwnode(fn); 
iommu->ir_domain = NULL; } - iommu_free_pages(iommu->ir_table->base, INTR_REMAP_PAGE_ORDER); + iommu_free_pages(iommu->ir_table->base); bitmap_free(iommu->ir_table->bitmap); kfree(iommu->ir_table); iommu->ir_table = NULL; @@ -727,9 +716,6 @@ static int __init intel_prepare_irq_remapping(void) if (dmar_table_init() < 0) return -ENODEV; - if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL)) - return -ENODEV; - if (!dmar_ir_support()) return -ENODEV; @@ -1173,7 +1159,26 @@ static void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) static inline void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) {} #endif -static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) +static void __intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host) +{ + struct intel_ir_data *ir_data = irqd->chip_data; + + /* + * Don't modify IRTEs for IRQs that are being posted to vCPUs if the + * host CPU affinity changes. + */ + if (ir_data->irq_2_iommu.posted_vcpu && !force_host) + return; + + ir_data->irq_2_iommu.posted_vcpu = false; + + if (ir_data->irq_2_iommu.posted_msi) + intel_ir_reconfigure_irte_posted(irqd); + else + modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); +} + +static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host) { struct intel_ir_data *ir_data = irqd->chip_data; struct irte *irte = &ir_data->irte_entry; @@ -1186,10 +1191,7 @@ static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) irte->vector = cfg->vector; irte->dest_id = IRTE_DEST(cfg->dest_apicid); - if (ir_data->irq_2_iommu.posted_msi) - intel_ir_reconfigure_irte_posted(irqd); - else if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING) - modify_irte(&ir_data->irq_2_iommu, irte); + __intel_ir_reconfigure_irte(irqd, force_host); } /* @@ -1240,11 +1242,11 @@ static void intel_ir_compose_msi_msg(struct irq_data *irq_data, static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) { struct intel_ir_data *ir_data = data->chip_data; - struct vcpu_data *vcpu_pi_info = info; + struct intel_iommu_pi_data *pi_data = info; /* stop posting interrupts, back to the default mode */ - if (!vcpu_pi_info) { - modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); + if (!pi_data) { + __intel_ir_reconfigure_irte(data, true); } else { struct irte irte_pi; @@ -1261,12 +1263,13 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) /* Update the posted mode fields */ irte_pi.p_pst = 1; irte_pi.p_urgent = 0; - irte_pi.p_vector = vcpu_pi_info->vector; - irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >> + irte_pi.p_vector = pi_data->vector; + irte_pi.pda_l = (pi_data->pi_desc_addr >> (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT); - irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) & + irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) & ~(-1UL << PDA_HIGH_BIT); + ir_data->irq_2_iommu.posted_vcpu = true; modify_irte(&ir_data->irq_2_iommu, &irte_pi); } @@ -1282,43 +1285,44 @@ static struct irq_chip intel_ir_chip = { }; /* - * With posted MSIs, all vectors are multiplexed into a single notification - * vector. Devices MSIs are then dispatched in a demux loop where - * EOIs can be coalesced as well. + * With posted MSIs, the MSI vectors are multiplexed into a single notification + * vector, and only the notification vector is sent to the APIC IRR. Device + * MSIs are then dispatched in a demux loop that harvests the MSIs from the + * CPU's Posted Interrupt Request bitmap. I.e. Posted MSIs never get sent to + * the APIC IRR, and thus do not need an EOI. 
The notification handler instead + * performs a single EOI after processing the PIR. * - * "INTEL-IR-POST" IRQ chip does not do EOI on ACK, thus the dummy irq_ack() - * function. Instead EOI is performed by the posted interrupt notification - * handler. + * Note! Pending SMP/CPU affinity changes, which are per MSI, must still be + * honored, only the APIC EOI is omitted. * * For the example below, 3 MSIs are coalesced into one CPU notification. Only - * one apic_eoi() is needed. + * one apic_eoi() is needed, but each MSI needs to process pending changes to + * its CPU affinity. * * __sysvec_posted_msi_notification() * irq_enter(); * handle_edge_irq() * irq_chip_ack_parent() - * dummy(); // No EOI + * irq_move_irq(); // No EOI * handle_irq_event() * driver_handler() * handle_edge_irq() * irq_chip_ack_parent() - * dummy(); // No EOI + * irq_move_irq(); // No EOI * handle_irq_event() * driver_handler() * handle_edge_irq() * irq_chip_ack_parent() - * dummy(); // No EOI + * irq_move_irq(); // No EOI * handle_irq_event() * driver_handler() * apic_eoi() * irq_exit() + * */ - -static void dummy_ack(struct irq_data *d) { } - static struct irq_chip intel_ir_chip_post_msi = { .name = "INTEL-IR-POST", - .irq_ack = dummy_ack, + .irq_ack = irq_move_irq, .irq_set_affinity = intel_ir_set_affinity, .irq_compose_msi_msg = intel_ir_compose_msi_msg, .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, @@ -1352,12 +1356,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, case X86_IRQ_ALLOC_TYPE_IOAPIC: /* Set source-id of interrupt request */ set_ioapic_sid(irte, info->devid); - apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", - info->devid, irte->present, irte->fpd, - irte->dst_mode, irte->redir_hint, - irte->trigger_mode, irte->dlvry_mode, - irte->avail, irte->vector, irte->dest_id, - irte->sid, irte->sq, irte->svt); + apic_pr_verbose("IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", + info->devid, irte->present, irte->fpd, irte->dst_mode, + irte->redir_hint, irte->trigger_mode, irte->dlvry_mode, + irte->avail, irte->vector, irte->dest_id, irte->sid, + irte->sq, irte->svt); sub_handle = info->ioapic.pin; break; case X86_IRQ_ALLOC_TYPE_HPET: @@ -1464,7 +1467,6 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain, else irq_data->chip = &intel_ir_chip; intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i); - irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT); } return 0; @@ -1495,6 +1497,9 @@ static void intel_irq_remapping_deactivate(struct irq_domain *domain, struct intel_ir_data *data = irq_data->chip_data; struct irte entry; + WARN_ON_ONCE(data->irq_2_iommu.posted_vcpu); + data->irq_2_iommu.posted_vcpu = false; + memset(&entry, 0, sizeof(entry)); modify_irte(&data->irq_2_iommu, &entry); } @@ -1523,6 +1528,8 @@ static const struct irq_domain_ops intel_ir_domain_ops = { static const struct msi_parent_ops dmar_msi_parent_ops = { .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI, + .bus_select_token = DOMAIN_BUS_DMAR, + .bus_select_mask = MATCH_PCI_MSI, .prefix = "IR-", .init_dev_msi_info = msi_parent_init_dev_msi_info, }; @@ -1535,10 +1542,6 @@ static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu) int ret; int eim = x2apic_enabled(); - ret = 
intel_cap_audit(CAP_AUDIT_HOTPLUG_IRQR, iommu); - if (ret) - return ret; - if (eim && !ecap_eim_support(iommu->ecap)) { pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n", iommu->reg_phys, iommu->ecap); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 16a2bcf5cfeb..a3fb8c193ca6 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -19,7 +19,7 @@ #include "pasid.h" static int intel_nested_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); @@ -27,20 +27,14 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, unsigned long flags; int ret = 0; - if (info->domain) - device_block_translation(dev); - - if (iommu->agaw < dmar_domain->s2_domain->agaw) { - dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n"); - return -ENODEV; - } + device_block_translation(dev); /* * Stage-1 domain cannot work alone, it is nested on a s2_domain. * The s2_domain will be used in nested translation, hence needs * to ensure the s2_domain is compatible with this IOMMU. */ - ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev); + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); if (ret) { dev_err_ratelimited(dev, "s2 domain is not compatible\n"); return ret; @@ -56,19 +50,24 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, if (ret) goto detach_iommu; + ret = iopf_for_domain_set(domain, dev); + if (ret) + goto unassign_tag; + ret = intel_pasid_setup_nested(iommu, dev, IOMMU_NO_PASID, dmar_domain); if (ret) - goto unassign_tag; + goto disable_iopf; info->domain = dmar_domain; + info->domain_attached = true; spin_lock_irqsave(&dmar_domain->lock, flags); list_add(&info->link, &dmar_domain->devices); spin_unlock_irqrestore(&dmar_domain->lock, flags); - domain_update_iotlb(dmar_domain); - return 0; +disable_iopf: + iopf_for_domain_remove(domain, dev); unassign_tag: cache_tag_unassign_domain(dmar_domain, dev, IOMMU_NO_PASID); detach_iommu: @@ -85,6 +84,7 @@ static void intel_nested_domain_free(struct iommu_domain *domain) spin_lock(&s2_domain->s1_lock); list_del(&dmar_domain->s2_link); spin_unlock(&s2_domain->s1_lock); + kfree(dmar_domain->qi_batch); kfree(dmar_domain); } @@ -131,25 +131,87 @@ out: return ret; } +static int domain_setup_nested(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_nested(iommu, dev, pasid, domain); + return intel_pasid_replace_nested(iommu, dev, pasid, + iommu_domain_did(old, iommu), + domain); +} + +static int intel_nested_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + int ret; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + if (context_copied(iommu, info->bus, info->devfn)) + return -EBUSY; + + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); + if (ret) + return ret; + + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); + + ret = iopf_for_domain_replace(domain, old, dev); 
+ if (ret) + goto out_remove_dev_pasid; + + ret = domain_setup_nested(iommu, dmar_domain, dev, pasid, old); + if (ret) + goto out_unwind_iopf; + + domain_remove_dev_pasid(old, dev, pasid); + + return 0; + +out_unwind_iopf: + iopf_for_domain_replace(old, domain, dev); +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); + return ret; +} + static const struct iommu_domain_ops intel_nested_domain_ops = { .attach_dev = intel_nested_attach_dev, + .set_dev_pasid = intel_nested_set_dev_pasid, .free = intel_nested_domain_free, .cache_invalidate_user = intel_nested_cache_invalidate_user, }; -struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, - const struct iommu_user_data *user_data) +struct iommu_domain * +intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, + u32 flags, + const struct iommu_user_data *user_data) { + struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *s2_domain = to_dmar_domain(parent); + struct intel_iommu *iommu = info->iommu; struct iommu_hwpt_vtd_s1 vtd; struct dmar_domain *domain; int ret; + if (!nested_supported(iommu) || flags & ~IOMMU_HWPT_ALLOC_PASID) + return ERR_PTR(-EOPNOTSUPP); + /* Must be nested domain */ if (user_data->type != IOMMU_HWPT_DATA_VTD_S1) return ERR_PTR(-EOPNOTSUPP); - if (parent->ops != intel_iommu_ops.default_domain_ops || - !s2_domain->nested_parent) + if (!intel_domain_is_ss_paging(s2_domain) || !s2_domain->nested_parent) return ERR_PTR(-EINVAL); ret = iommu_copy_struct_from_user(&vtd, user_data, @@ -161,9 +223,7 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, if (!domain) return ERR_PTR(-ENOMEM); - domain->use_first_level = true; domain->s2_domain = s2_domain; - domain->s1_pgtbl = vtd.pgtbl_addr; domain->s1_cfg = vtd; domain->domain.ops = &intel_nested_domain_ops; domain->domain.type = IOMMU_DOMAIN_NESTED; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index abce19e2ad6f..3e2255057079 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -60,14 +60,14 @@ int intel_pasid_alloc_table(struct device *dev) size = max_pasid >> (PASID_PDE_SHIFT - 3); order = size ? get_order(size) : 0; - dir = iommu_alloc_pages_node(info->iommu->node, GFP_KERNEL, order); + dir = iommu_alloc_pages_node_sz(info->iommu->node, GFP_KERNEL, + 1 << (order + PAGE_SHIFT)); if (!dir) { kfree(pasid_table); return -ENOMEM; } pasid_table->table = dir; - pasid_table->order = order; pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3); info->pasid_table = pasid_table; @@ -97,10 +97,10 @@ void intel_pasid_free_table(struct device *dev) max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT; for (i = 0; i < max_pde; i++) { table = get_pasid_table_from_pde(&dir[i]); - iommu_free_page(table); + iommu_free_pages(table); } - iommu_free_pages(pasid_table->table, pasid_table->order); + iommu_free_pages(pasid_table->table); kfree(pasid_table); } @@ -146,7 +146,10 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) retry: entries = get_pasid_table_from_pde(&dir[dir_index]); if (!entries) { - entries = iommu_alloc_page_node(info->iommu->node, GFP_ATOMIC); + u64 tmp; + + entries = iommu_alloc_pages_node_sz(info->iommu->node, + GFP_ATOMIC, SZ_4K); if (!entries) return NULL; @@ -156,9 +159,10 @@ retry: * clear. However, this entry might be populated by others * while we are preparing it. Use theirs with a retry. 
*/ - if (cmpxchg64(&dir[dir_index].val, 0ULL, - (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) { - iommu_free_page(entries); + tmp = 0ULL; + if (!try_cmpxchg64(&dir[dir_index].val, &tmp, + (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) { + iommu_free_pages(entries); goto retry; } if (!ecap_coherent(info->iommu->ecap)) { @@ -217,7 +221,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, if (pci_dev_is_disconnected(to_pci_dev(dev))) return; - sid = info->bus << 8 | info->devfn; + sid = PCI_DEVID(info->bus, info->devfn); qdep = info->ats_qdep; pfsid = info->pfsid; @@ -241,11 +245,31 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, spin_lock(&iommu->lock); pte = intel_pasid_get_entry(dev, pasid); - if (WARN_ON(!pte) || !pasid_pte_is_present(pte)) { + if (WARN_ON(!pte)) { spin_unlock(&iommu->lock); return; } + if (!pasid_pte_is_present(pte)) { + if (!pasid_pte_is_fault_disabled(pte)) { + WARN_ON(READ_ONCE(pte->val[0]) != 0); + spin_unlock(&iommu->lock); + return; + } + + /* + * When a PASID is used for SVA by a device, it's possible + * that the pasid entry is non-present with the Fault + * Processing Disabled bit set. Clear the pasid entry and + * drain the PRQ for the PASID before return. + */ + pasid_clear_entry(pte); + spin_unlock(&iommu->lock); + intel_iommu_drain_pasid_prq(dev, pasid); + + return; + } + did = pasid_get_domain_id(pte); pgtt = pasid_pte_get_pgtt(pte); intel_pasid_clear_entry(dev, pasid, fault_ignore); @@ -261,9 +285,9 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, else iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + devtlb_invalidation_with_pasid(iommu, dev, pasid); + if (!fault_ignore) + intel_iommu_drain_pasid_prq(dev, pasid); } /* @@ -286,12 +310,72 @@ static void pasid_flush_caches(struct intel_iommu *iommu, } /* + * This function is supposed to be used after caller updates the fields + * except for the SSADE and P bit of a pasid table entry. It does the + * below: + * - Flush cacheline if needed + * - Flush the caches per Table 28 ”Guidance to Software for Invalidations“ + * of VT-d spec 5.0. + */ +static void intel_pasid_flush_present(struct intel_iommu *iommu, + struct device *dev, + u32 pasid, u16 did, + struct pasid_entry *pte) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + /* + * VT-d spec 5.0 table28 states guides for cache invalidation: + * + * - PASID-selective-within-Domain PASID-cache invalidation + * - PASID-selective PASID-based IOTLB invalidation + * - If (pasid is RID_PASID) + * - Global Device-TLB invalidation to affected functions + * Else + * - PASID-based Device-TLB invalidation (with S=1 and + * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions + */ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + + devtlb_invalidation_with_pasid(iommu, dev, pasid); +} + +/* * Set up the scalable mode pasid table entry for first only * translation type. 
*/ -int intel_pasid_setup_first_level(struct intel_iommu *iommu, - struct device *dev, pgd_t *pgd, - u32 pasid, u16 did, int flags) +static void pasid_pte_config_first_level(struct intel_iommu *iommu, + struct pasid_entry *pte, + phys_addr_t fsptptr, u16 did, + int flags) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + + /* Setup the first level page table pointer: */ + pasid_set_flptr(pte, fsptptr); + + if (flags & PASID_FLAG_FL5LP) + pasid_set_flpm(pte, 1); + + if (flags & PASID_FLAG_PAGE_SNOOP) + pasid_set_pgsnp(pte); + + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_page_snoop(pte, flags & PASID_FLAG_PWSNP); + + /* Setup Present and PASID Granular Transfer Type: */ + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); + pasid_set_present(pte); +} + +int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, + phys_addr_t fsptptr, u32 pasid, u16 did, + int flags) { struct pasid_entry *pte; @@ -319,64 +403,93 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return -EBUSY; } - pasid_clear_entry(pte); + pasid_pte_config_first_level(iommu, pte, fsptptr, did, flags); - /* Setup the first level page table pointer: */ - pasid_set_flptr(pte, (u64)__pa(pgd)); + spin_unlock(&iommu->lock); - if (flags & PASID_FLAG_FL5LP) - pasid_set_flpm(pte, 1); + pasid_flush_caches(iommu, pte, pasid, did); - if (flags & PASID_FLAG_PAGE_SNOOP) - pasid_set_pgsnp(pte); + return 0; +} - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, iommu->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - pasid_set_nxe(pte); +int intel_pasid_replace_first_level(struct intel_iommu *iommu, + struct device *dev, phys_addr_t fsptptr, + u32 pasid, u16 did, u16 old_did, + int flags) +{ + struct pasid_entry *pte, new_pte; - /* Setup Present and PASID Granular Transfer Type: */ - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); - pasid_set_present(pte); + if (!ecap_flts(iommu->ecap)) { + pr_err("No first level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) { + pr_err("No 5-level paging support for first-level on %s\n", + iommu->name); + return -EINVAL; + } + + pasid_pte_config_first_level(iommu, &new_pte, fsptptr, did, flags); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; spin_unlock(&iommu->lock); - pasid_flush_caches(iommu, pte, pasid, did); + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); return 0; } /* - * Skip top levels of page tables for iommu which has less agaw - * than default. Unnecessary for PT mode. + * Set up the scalable mode pasid entry for second only translation type. 
*/ -static int iommu_skip_agaw(struct dmar_domain *domain, - struct intel_iommu *iommu, - struct dma_pte **pgd) +static void pasid_pte_config_second_level(struct intel_iommu *iommu, + struct pasid_entry *pte, + struct dmar_domain *domain, u16 did) { - int agaw; + struct pt_iommu_vtdss_hw_info pt_info; - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - *pgd = phys_to_virt(dma_pte_addr(*pgd)); - if (!dma_pte_present(*pgd)) - return -EINVAL; - } + lockdep_assert_held(&iommu->lock); - return agaw; + pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_slptr(pte, pt_info.ssptptr); + pasid_set_address_width(pte, pt_info.aw); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !(domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))); + if (domain->dirty_tracking) + pasid_set_ssade(pte); + + pasid_set_present(pte); } -/* - * Set up the scalable mode pasid entry for second only translation type. - */ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid) { struct pasid_entry *pte; - struct dma_pte *pgd; - u64 pgd_val; - int agaw; u16 did; + /* * If hardware advertises no support for second level * translation, return directly. @@ -387,16 +500,50 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -EINVAL; } - pgd = domain->pgd; - agaw = iommu_skip_agaw(domain, iommu, &pgd); - if (agaw < 0) { - dev_err(dev, "Invalid domain page table\n"); + did = domain_id_iommu(domain, iommu); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EBUSY; + } + + pasid_pte_config_second_level(iommu, pte, domain, did); + spin_unlock(&iommu->lock); + + pasid_flush_caches(iommu, pte, pasid, did); + + return 0; +} + +int intel_pasid_replace_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u16 old_did, + u32 pasid) +{ + struct pasid_entry *pte, new_pte; + u16 did; + + /* + * If hardware advertises no support for second level + * translation, return directly. 
+ */ + if (!ecap_slts(iommu->ecap)) { + pr_err("No second level translation support on %s\n", + iommu->name); return -EINVAL; } - pgd_val = virt_to_phys(pgd); did = domain_id_iommu(domain, iommu); + pasid_pte_config_second_level(iommu, &new_pte, domain, did); + spin_lock(&iommu->lock); pte = intel_pasid_get_entry(dev, pasid); if (!pte) { @@ -404,25 +551,18 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -ENODEV; } - if (pasid_pte_is_present(pte)) { + if (!pasid_pte_is_present(pte)) { spin_unlock(&iommu->lock); - return -EBUSY; + return -EINVAL; } - pasid_clear_entry(pte); - pasid_set_domain_id(pte, did); - pasid_set_slptr(pte, pgd_val); - pasid_set_address_width(pte, agaw); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); - pasid_set_fault_enable(pte); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - if (domain->dirty_tracking) - pasid_set_ssade(pte); + WARN_ON(old_did != pasid_get_domain_id(pte)); - pasid_set_present(pte); + *pte = new_pte; spin_unlock(&iommu->lock); - pasid_flush_caches(iommu, pte, pasid, did); + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); return 0; } @@ -491,9 +631,7 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + devtlb_invalidation_with_pasid(iommu, dev, pasid); return 0; } @@ -501,6 +639,20 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, /* * Set up the scalable mode pasid entry for passthrough translation type. */ +static void pasid_pte_config_pass_through(struct intel_iommu *iommu, + struct pasid_entry *pte, u16 did) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_present(pte); +} + int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct device *dev, u32 pasid) { @@ -519,13 +671,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return -EBUSY; } - pasid_clear_entry(pte); - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, iommu->agaw); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); - pasid_set_fault_enable(pte); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - pasid_set_present(pte); + pasid_pte_config_pass_through(iommu, pte, did); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); @@ -533,6 +679,38 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return 0; } +int intel_pasid_replace_pass_through(struct intel_iommu *iommu, + struct device *dev, u16 old_did, + u32 pasid) +{ + struct pasid_entry *pte, new_pte; + u16 did = FLPT_DEFAULT_DID; + + pasid_pte_config_pass_through(iommu, &new_pte, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + /* * Set the page snoop control 
for a pasid entry which has been set up. */ @@ -553,26 +731,50 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, did = pasid_get_domain_id(pte); spin_unlock(&iommu->lock); - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); + intel_pasid_flush_present(iommu, dev, pasid, did, pte); +} - /* - * VT-d spec 3.4 table23 states guides for cache invalidation: - * - * - PASID-selective-within-Domain PASID-cache invalidation - * - PASID-selective PASID-based IOTLB invalidation - * - If (pasid is RID_PASID) - * - Global Device-TLB invalidation to affected functions - * Else - * - PASID-based Device-TLB invalidation (with S=1 and - * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions - */ - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); +static void pasid_pte_config_nestd(struct intel_iommu *iommu, + struct pasid_entry *pte, + struct iommu_hwpt_vtd_s1 *s1_cfg, + struct dmar_domain *s2_domain, + u16 did) +{ + struct pt_iommu_vtdss_hw_info pt_info; + + lockdep_assert_held(&iommu->lock); + + pt_iommu_vtdss_hw_info(&s2_domain->sspt, &pt_info); + + pasid_clear_entry(pte); - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) + pasid_set_flpm(pte, 1); + + pasid_set_flptr(pte, s1_cfg->pgtbl_addr); + + if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { + pasid_set_sre(pte); + if (s1_cfg->flags & IOMMU_VTD_S1_WPE) + pasid_set_wpe(pte); + } + + if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) + pasid_set_eafe(pte); + + if (s2_domain->force_snooping) + pasid_set_pgsnp(pte); + + pasid_set_slptr(pte, pt_info.ssptptr); + pasid_set_fault_enable(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, pt_info.aw); + pasid_set_page_snoop(pte, !(s2_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))); + if (s2_domain->dirty_tracking) + pasid_set_ssade(pte); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); } /** @@ -590,10 +792,8 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain) { struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; - pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl; struct dmar_domain *s2_domain = domain->s2_domain; u16 did = domain_id_iommu(domain, iommu); - struct dma_pte *pgd = s2_domain->pgd; struct pasid_entry *pte; /* Address width should match the address width supported by hardware */ @@ -636,37 +836,73 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, return -EBUSY; } - pasid_clear_entry(pte); + pasid_pte_config_nestd(iommu, pte, s1_cfg, s2_domain, did); + spin_unlock(&iommu->lock); - if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) - pasid_set_flpm(pte, 1); + pasid_flush_caches(iommu, pte, pasid, did); - pasid_set_flptr(pte, (uintptr_t)s1_gpgd); + return 0; +} - if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { - pasid_set_sre(pte); - if (s1_cfg->flags & IOMMU_VTD_S1_WPE) - pasid_set_wpe(pte); +int intel_pasid_replace_nested(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + u16 old_did, struct dmar_domain *domain) +{ + struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; + struct dmar_domain *s2_domain = domain->s2_domain; + u16 did = domain_id_iommu(domain, iommu); + struct pasid_entry *pte, new_pte; + + /* Address width should match the address width supported by hardware */ + switch 
(s1_cfg->addr_width) { + case ADDR_WIDTH_4LEVEL: + break; + case ADDR_WIDTH_5LEVEL: + if (!cap_fl5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + break; + default: + dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", + s1_cfg->addr_width); + return -EINVAL; } - if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) - pasid_set_eafe(pte); + if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } - if (s2_domain->force_snooping) - pasid_set_pgsnp(pte); + if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } - pasid_set_slptr(pte, virt_to_phys(pgd)); - pasid_set_fault_enable(pte); - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, s2_domain->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - if (s2_domain->dirty_tracking) - pasid_set_ssade(pte); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); - pasid_set_present(pte); + pasid_pte_config_nestd(iommu, &new_pte, s1_cfg, s2_domain, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; spin_unlock(&iommu->lock); - pasid_flush_caches(iommu, pte, pasid, did); + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); return 0; } @@ -681,6 +917,7 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct context_entry *context; + u16 did; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, false); @@ -689,28 +926,11 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) return; } + did = context_domain_id(context); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - - /* - * Cache invalidation for changes to a scalable-mode context table - * entry. - * - * Section 6.5.3.3 of the VT-d spec: - * - Device-selective context-cache invalidation; - * - Domain-selective PASID-cache invalidation to affected domains - * (can be skipped if all PASID entries were not-present); - * - Domain-selective IOTLB invalidation to affected domains; - * - Global Device-TLB invalidation to affected functions. - * - * The iommu has been parked in the blocking state. All domains have - * been detached from the device or PASID. The PASID and IOTLB caches - * have been invalidated during the domain detach path. 
- */ - iommu->flush.flush_context(iommu, 0, PCI_DEVID(bus, devfn), - DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); - devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID); + intel_context_flush_no_pasid(info, context, did); } static int pci_pasid_table_teardown(struct pci_dev *pdev, u16 alias, void *data) @@ -768,10 +988,10 @@ static int context_entry_set_pasid_table(struct context_entry *context, if (info->ats_supported) context_set_sm_dte(context); - if (info->pri_supported) - context_set_sm_pre(context); if (info->pasid_supported) context_set_pasid(context); + if (info->pri_supported) + context_set_sm_pre(context); context_set_fault_enable(context); context_set_present(context); @@ -872,3 +1092,61 @@ int intel_pasid_setup_sm_context(struct device *dev) return pci_for_each_dma_alias(to_pci_dev(dev), pci_pasid_table_setup, dev); } + +/* + * Global Device-TLB invalidation following changes in a context entry which + * was present. + */ +static void __context_flush_dev_iotlb(struct device_domain_info *info) +{ + if (!info->ats_enabled) + return; + + qi_flush_dev_iotlb(info->iommu, PCI_DEVID(info->bus, info->devfn), + info->pfsid, info->ats_qdep, 0, MAX_AGAW_PFN_WIDTH); + + /* + * There is no guarantee that the device DMA is stopped when it reaches + * here. Therefore, always attempt the extra device TLB invalidation + * quirk. The impact on performance is acceptable since this is not a + * performance-critical path. + */ + quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH, IOMMU_NO_PASID, + info->ats_qdep); +} + +/* + * Cache invalidations after change in a context table entry that was present + * according to the Spec 6.5.3.3 (Guidance to Software for Invalidations). + * This helper can only be used when IOMMU is working in the legacy mode or + * IOMMU is in scalable mode but all PASID table entries of the device are + * non-present. + */ +void intel_context_flush_no_pasid(struct device_domain_info *info, + struct context_entry *context, u16 did) +{ + struct intel_iommu *iommu = info->iommu; + + /* + * Device-selective context-cache invalidation. The Domain-ID field + * of the Context-cache Invalidate Descriptor is ignored by hardware + * when operating in scalable mode. Therefore the @did value doesn't + * matter in scalable mode. + */ + iommu->flush.flush_context(iommu, did, PCI_DEVID(info->bus, info->devfn), + DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); + + /* + * For legacy mode: + * - Domain-selective IOTLB invalidation + * - Global Device-TLB invalidation to all affected functions + */ + if (!sm_supported(iommu)) { + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + __context_flush_dev_iotlb(info); + + return; + } + + __context_flush_dev_iotlb(info); +} diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index da9978fef7ac..b4c85242dc79 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -22,15 +22,9 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) -/* - * Domain ID reserved for pasid entries programmed for first-level - * only and pass-through transfer modes. 
- */ -#define FLPT_DEFAULT_DID 1 -#define NUM_RESERVED_DID 2 - #define PASID_FLAG_NESTED BIT(1) #define PASID_FLAG_PAGE_SNOOP BIT(2) +#define PASID_FLAG_PWSNP BIT(2) /* * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- @@ -54,7 +48,6 @@ struct pasid_entry { /* The representative of a PASID table */ struct pasid_table { void *table; /* pasid table pointer */ - int order; /* page order of pasid table */ u32 max_pasid; /* max pasid */ }; @@ -80,6 +73,12 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte) return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT; } +/* Get FPD(Fault Processing Disable) bit of a PASID table entry */ +static inline bool pasid_pte_is_fault_disabled(struct pasid_entry *pte) +{ + return READ_ONCE(pte->val[0]) & PASID_PTE_FPD; +} + /* Get PGTT field of a PASID table entry */ static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) { @@ -248,16 +247,6 @@ static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) } /* - * Setup No Execute Enable bit (Bit 133) of a scalable mode PASID - * entry. It is required when XD bit of the first level page table - * entry is about to be set. - */ -static inline void pasid_set_nxe(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 5, 1 << 5); -} - -/* * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode * PASID entry. */ @@ -300,9 +289,9 @@ extern unsigned int intel_pasid_max_id; int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); struct pasid_table *intel_pasid_get_table(struct device *dev); -int intel_pasid_setup_first_level(struct intel_iommu *iommu, - struct device *dev, pgd_t *pgd, - u32 pasid, u16 did, int flags); +int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, + phys_addr_t fsptptr, u32 pasid, u16 did, + int flags); int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid); @@ -313,6 +302,20 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct device *dev, u32 pasid); int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain); +int intel_pasid_replace_first_level(struct intel_iommu *iommu, + struct device *dev, phys_addr_t fsptptr, + u32 pasid, u16 did, u16 old_did, int flags); +int intel_pasid_replace_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u16 old_did, + u32 pasid); +int intel_pasid_replace_pass_through(struct intel_iommu *iommu, + struct device *dev, u16 old_did, + u32 pasid); +int intel_pasid_replace_nested(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + u16 old_did, struct dmar_domain *domain); + void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore); diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c index adc4de6bbd88..dceeadc3ee7c 100644 --- a/drivers/iommu/intel/perf.c +++ b/drivers/iommu/intel/perf.c @@ -113,7 +113,7 @@ static char *latency_type_names[] = { " svm_prq" }; -int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) +void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) { struct latency_statistic *lstat = iommu->perf_statistic; unsigned long flags; @@ -122,7 +122,7 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) memset(str, 0, size); for (i = 0; i < COUNTS_NUM; i++) - bytes += snprintf(str + bytes, size - 
bytes, + bytes += scnprintf(str + bytes, size - bytes, "%s", latency_counter_names[i]); spin_lock_irqsave(&latency_lock, flags); @@ -130,7 +130,7 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) if (!dmar_latency_enabled(iommu, i)) continue; - bytes += snprintf(str + bytes, size - bytes, + bytes += scnprintf(str + bytes, size - bytes, "\n%s", latency_type_names[i]); for (j = 0; j < COUNTS_NUM; j++) { @@ -156,11 +156,9 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) break; } - bytes += snprintf(str + bytes, size - bytes, + bytes += scnprintf(str + bytes, size - bytes, "%12lld", val); } } spin_unlock_irqrestore(&latency_lock, flags); - - return bytes; } diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h index df9a36942d64..1d4baad7e852 100644 --- a/drivers/iommu/intel/perf.h +++ b/drivers/iommu/intel/perf.h @@ -40,7 +40,7 @@ void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type); bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type); void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency); -int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size); +void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size); #else static inline int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) @@ -64,9 +64,8 @@ dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 laten { } -static inline int +static inline void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) { - return 0; } #endif /* CONFIG_DMAR_PERF */ diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c index 44083d01852d..75f493bcb353 100644 --- a/drivers/iommu/intel/perfmon.c +++ b/drivers/iommu/intel/perfmon.c @@ -34,28 +34,9 @@ static struct attribute_group iommu_pmu_events_attr_group = { .attrs = attrs_empty, }; -static cpumask_t iommu_pmu_cpu_mask; - -static ssize_t -cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask); -} -static DEVICE_ATTR_RO(cpumask); - -static struct attribute *iommu_pmu_cpumask_attrs[] = { - &dev_attr_cpumask.attr, - NULL -}; - -static struct attribute_group iommu_pmu_cpumask_attr_group = { - .attrs = iommu_pmu_cpumask_attrs, -}; - static const struct attribute_group *iommu_pmu_attr_groups[] = { &iommu_pmu_format_attr_group, &iommu_pmu_events_attr_group, - &iommu_pmu_cpumask_attr_group, NULL }; @@ -565,6 +546,7 @@ static int __iommu_pmu_register(struct intel_iommu *iommu) iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups; iommu_pmu->pmu.attr_update = iommu_pmu_attr_update; iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE; iommu_pmu->pmu.module = THIS_MODULE; return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1); @@ -773,89 +755,6 @@ static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu) iommu->perf_irq = 0; } -static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) -{ - struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); - - if (cpumask_empty(&iommu_pmu_cpu_mask)) - cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask); - - if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask)) - iommu_pmu->cpu = cpu; - - return 0; -} - -static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node) -{ - struct iommu_pmu *iommu_pmu = 
hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); - int target = cpumask_first(&iommu_pmu_cpu_mask); - - /* - * The iommu_pmu_cpu_mask has been updated when offline the CPU - * for the first iommu_pmu. Migrate the other iommu_pmu to the - * new target. - */ - if (target < nr_cpu_ids && target != iommu_pmu->cpu) { - perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); - iommu_pmu->cpu = target; - return 0; - } - - if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask)) - return 0; - - target = cpumask_any_but(cpu_online_mask, cpu); - - if (target < nr_cpu_ids) - cpumask_set_cpu(target, &iommu_pmu_cpu_mask); - else - return 0; - - perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); - iommu_pmu->cpu = target; - - return 0; -} - -static int nr_iommu_pmu; -static enum cpuhp_state iommu_cpuhp_slot; - -static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu) -{ - int ret; - - if (!nr_iommu_pmu) { - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, - "driver/iommu/intel/perfmon:online", - iommu_pmu_cpu_online, - iommu_pmu_cpu_offline); - if (ret < 0) - return ret; - iommu_cpuhp_slot = ret; - } - - ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); - if (ret) { - if (!nr_iommu_pmu) - cpuhp_remove_multi_state(iommu_cpuhp_slot); - return ret; - } - nr_iommu_pmu++; - - return 0; -} - -static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu) -{ - cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); - - if (--nr_iommu_pmu) - return; - - cpuhp_remove_multi_state(iommu_cpuhp_slot); -} - void iommu_pmu_register(struct intel_iommu *iommu) { struct iommu_pmu *iommu_pmu = iommu->pmu; @@ -866,17 +765,12 @@ void iommu_pmu_register(struct intel_iommu *iommu) if (__iommu_pmu_register(iommu)) goto err; - if (iommu_pmu_cpuhp_setup(iommu_pmu)) - goto unregister; - /* Set interrupt for overflow */ if (iommu_pmu_set_interrupt(iommu)) - goto cpuhp_free; + goto unregister; return; -cpuhp_free: - iommu_pmu_cpuhp_free(iommu_pmu); unregister: perf_pmu_unregister(&iommu_pmu->pmu); err: @@ -892,6 +786,5 @@ void iommu_pmu_unregister(struct intel_iommu *iommu) return; iommu_pmu_unset_interrupt(iommu); - iommu_pmu_cpuhp_free(iommu_pmu); perf_pmu_unregister(&iommu_pmu->pmu); } diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c new file mode 100644 index 000000000000..ff63c228e6e1 --- /dev/null +++ b/drivers/iommu/intel/prq.c @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 Intel Corporation + * + * Originally split from drivers/iommu/intel/svm.c + */ + +#include <linux/pci.h> +#include <linux/pci-ats.h> + +#include "iommu.h" +#include "pasid.h" +#include "../iommu-pages.h" +#include "trace.h" + +/* Page request queue descriptor */ +struct page_req_dsc { + union { + struct { + u64 type:8; + u64 pasid_present:1; + u64 rsvd:7; + u64 rid:16; + u64 pasid:20; + u64 exe_req:1; + u64 pm_req:1; + u64 rsvd2:10; + }; + u64 qw_0; + }; + union { + struct { + u64 rd_req:1; + u64 wr_req:1; + u64 lpig:1; + u64 prg_index:9; + u64 addr:52; + }; + u64 qw_1; + }; + u64 qw_2; + u64 qw_3; +}; + +/** + * intel_iommu_drain_pasid_prq - Drain page requests and responses for a pasid + * @dev: target device + * @pasid: pasid for draining + * + * Drain all pending page requests and responses related to @pasid in both + * software and hardware. This is supposed to be called after the device + * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB + * and DevTLB have been invalidated. 
+ * + * It waits until all pending page requests for @pasid in the page fault + * queue are completed by the prq handling thread. Then follow the steps + * described in VT-d spec CH7.10 to drain all page requests and page + * responses pending in the hardware. + */ +void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) +{ + struct device_domain_info *info; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct qi_desc desc[3]; + int head, tail; + u16 sid, did; + + info = dev_iommu_priv_get(dev); + if (!info->iopf_refcount) + return; + + iommu = info->iommu; + domain = info->domain; + sid = PCI_DEVID(info->bus, info->devfn); + did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; + + /* + * Check and wait until all pending page requests in the queue are + * handled by the prq handling thread. + */ +prq_retry: + reinit_completion(&iommu->prq_complete); + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + + req = &iommu->prq[head / sizeof(*req)]; + if (req->rid != sid || + (req->pasid_present && pasid != req->pasid) || + (!req->pasid_present && pasid != IOMMU_NO_PASID)) { + head = (head + sizeof(*req)) & PRQ_RING_MASK; + continue; + } + + wait_for_completion(&iommu->prq_complete); + goto prq_retry; + } + + iopf_queue_flush_dev(dev); + + /* + * Perform steps described in VT-d spec CH7.10 to drain page + * requests and responses in hardware. + */ + memset(desc, 0, sizeof(desc)); + desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | + QI_IWD_FENCE | + QI_IWD_TYPE; + if (pasid == IOMMU_NO_PASID) { + qi_desc_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH, &desc[1]); + qi_desc_dev_iotlb(sid, info->pfsid, info->ats_qdep, 0, + MAX_AGAW_PFN_WIDTH, &desc[2]); + } else { + qi_desc_piotlb(did, pasid, 0, -1, 0, &desc[1]); + qi_desc_dev_iotlb_pasid(sid, info->pfsid, pasid, info->ats_qdep, + 0, MAX_AGAW_PFN_WIDTH, &desc[2]); + } +qi_retry: + reinit_completion(&iommu->prq_complete); + qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + wait_for_completion(&iommu->prq_complete); + goto qi_retry; + } +} + +static bool is_canonical_address(u64 addr) +{ + int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); + long saddr = (long)addr; + + return (((saddr << shift) >> shift) == saddr); +} + +static void handle_bad_prq_event(struct intel_iommu *iommu, + struct page_req_dsc *req, int result) +{ + struct qi_desc desc = { }; + + pr_err("%s: Invalid page request: %08llx %08llx\n", + iommu->name, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + + if (!req->lpig) + return; + + desc.qw0 = QI_PGRP_PASID(req->pasid) | + QI_PGRP_DID(req->rid) | + QI_PGRP_PASID_P(req->pasid_present) | + QI_PGRP_RESP_CODE(result) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(req->prg_index); + + qi_submit_sync(iommu, &desc, 1, 0); +} + +static int prq_to_iommu_prot(struct page_req_dsc *req) +{ + int prot = 0; + + if (req->rd_req) + prot |= IOMMU_FAULT_PERM_READ; + if (req->wr_req) + prot |= IOMMU_FAULT_PERM_WRITE; + if (req->exe_req) + prot |= IOMMU_FAULT_PERM_EXEC; + if (req->pm_req) + prot |= IOMMU_FAULT_PERM_PRIV; + + return prot; +} + +static void intel_prq_report(struct intel_iommu *iommu, struct device *dev, + struct page_req_dsc *desc) +{ + struct iopf_fault event = { }; + + /* Fill in event data for device specific processing */ + event.fault.type = IOMMU_FAULT_PAGE_REQ; + event.fault.prm.addr = (u64)desc->addr << 
VTD_PAGE_SHIFT; + event.fault.prm.pasid = desc->pasid; + event.fault.prm.grpid = desc->prg_index; + event.fault.prm.perm = prq_to_iommu_prot(desc); + + if (desc->lpig) + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + if (desc->pasid_present) { + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; + } + + iommu_report_device_fault(dev, &event); +} + +static irqreturn_t prq_event_thread(int irq, void *d) +{ + struct intel_iommu *iommu = d; + struct page_req_dsc *req; + int head, tail, handled; + struct device *dev; + u64 address; + + /* + * Clear PPR bit before reading head/tail registers, to ensure that + * we get a new interrupt if needed. + */ + writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); + + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + handled = (head != tail); + while (head != tail) { + req = &iommu->prq[head / sizeof(*req)]; + address = (u64)req->addr << VTD_PAGE_SHIFT; + + if (unlikely(!is_canonical_address(address))) { + pr_err("IOMMU: %s: Address is not canonical\n", + iommu->name); +bad_req: + handle_bad_prq_event(iommu, req, QI_RESP_INVALID); + goto prq_advance; + } + + if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { + pr_err("IOMMU: %s: Page request in Privilege Mode\n", + iommu->name); + goto bad_req; + } + + if (unlikely(req->exe_req && req->rd_req)) { + pr_err("IOMMU: %s: Execution request not supported\n", + iommu->name); + goto bad_req; + } + + /* Drop Stop Marker message. No need for a response. */ + if (unlikely(req->lpig && !req->rd_req && !req->wr_req)) + goto prq_advance; + + /* + * If prq is to be handled outside iommu driver via receiver of + * the fault notifiers, we skip the page response here. + */ + mutex_lock(&iommu->iopf_lock); + dev = device_rbtree_find(iommu, req->rid); + if (!dev) { + mutex_unlock(&iommu->iopf_lock); + goto bad_req; + } + + intel_prq_report(iommu, dev, req); + trace_prq_report(iommu, dev, req->qw_0, req->qw_1, + req->qw_2, req->qw_3, + iommu->prq_seq_number++); + mutex_unlock(&iommu->iopf_lock); +prq_advance: + head = (head + sizeof(*req)) & PRQ_RING_MASK; + } + + dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. 
+ */ + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", + iommu->name); + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + if (head == tail) { + iopf_queue_discard_partial(iommu->iopf_queue); + writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", + iommu->name); + } + } + + if (!completion_done(&iommu->prq_complete)) + complete(&iommu->prq_complete); + + return IRQ_RETVAL(handled); +} + +int intel_iommu_enable_prq(struct intel_iommu *iommu) +{ + struct iopf_queue *iopfq; + int irq, ret; + + iommu->prq = + iommu_alloc_pages_node_sz(iommu->node, GFP_KERNEL, PRQ_SIZE); + if (!iommu->prq) { + pr_warn("IOMMU: %s: Failed to allocate page request queue\n", + iommu->name); + return -ENOMEM; + } + + irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); + if (irq <= 0) { + pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", + iommu->name); + ret = -EINVAL; + goto free_prq; + } + iommu->pr_irq = irq; + + snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), + "dmar%d-iopfq", iommu->seq_id); + iopfq = iopf_queue_alloc(iommu->iopfq_name); + if (!iopfq) { + pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); + ret = -ENOMEM; + goto free_hwirq; + } + iommu->iopf_queue = iopfq; + + snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); + + ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, + iommu->prq_name, iommu); + if (ret) { + pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", + iommu->name); + goto free_iopfq; + } + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + + init_completion(&iommu->prq_complete); + + return 0; + +free_iopfq: + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; +free_hwirq: + dmar_free_hwirq(irq); + iommu->pr_irq = 0; +free_prq: + iommu_free_pages(iommu->prq); + iommu->prq = NULL; + + return ret; +} + +int intel_iommu_finish_prq(struct intel_iommu *iommu) +{ + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); + + if (iommu->pr_irq) { + free_irq(iommu->pr_irq, iommu); + dmar_free_hwirq(iommu->pr_irq); + iommu->pr_irq = 0; + } + + if (iommu->iopf_queue) { + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; + } + + iommu_free_pages(iommu->prq); + iommu->prq = NULL; + + return 0; +} + +void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + u8 bus = info->bus, devfn = info->devfn; + struct iommu_fault_page_request *prm; + struct qi_desc desc; + bool pasid_present; + u16 sid; + + prm = &evt->fault.prm; + sid = PCI_DEVID(bus, devfn); + pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + + desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | + QI_PGRP_PASID_P(pasid_present) | + QI_PGRP_RESP_CODE(msg->code) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(prm->grpid); + desc.qw2 = 0; + desc.qw3 = 0; + + qi_submit_sync(iommu, &desc, 1, 0); +} diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 0e3a9b38bef2..71de7947971f 
100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -25,92 +25,6 @@ #include "../iommu-pages.h" #include "trace.h" -static irqreturn_t prq_event_thread(int irq, void *d); - -int intel_svm_enable_prq(struct intel_iommu *iommu) -{ - struct iopf_queue *iopfq; - int irq, ret; - - iommu->prq = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, PRQ_ORDER); - if (!iommu->prq) { - pr_warn("IOMMU: %s: Failed to allocate page request queue\n", - iommu->name); - return -ENOMEM; - } - - irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); - if (irq <= 0) { - pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", - iommu->name); - ret = -EINVAL; - goto free_prq; - } - iommu->pr_irq = irq; - - snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), - "dmar%d-iopfq", iommu->seq_id); - iopfq = iopf_queue_alloc(iommu->iopfq_name); - if (!iopfq) { - pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); - ret = -ENOMEM; - goto free_hwirq; - } - iommu->iopf_queue = iopfq; - - snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); - - ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, - iommu->prq_name, iommu); - if (ret) { - pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", - iommu->name); - goto free_iopfq; - } - dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); - - init_completion(&iommu->prq_complete); - - return 0; - -free_iopfq: - iopf_queue_free(iommu->iopf_queue); - iommu->iopf_queue = NULL; -free_hwirq: - dmar_free_hwirq(irq); - iommu->pr_irq = 0; -free_prq: - iommu_free_pages(iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - - return ret; -} - -int intel_svm_finish_prq(struct intel_iommu *iommu) -{ - dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); - - if (iommu->pr_irq) { - free_irq(iommu->pr_irq, iommu); - dmar_free_hwirq(iommu->pr_irq); - iommu->pr_irq = 0; - } - - if (iommu->iopf_queue) { - iopf_queue_free(iommu->iopf_queue); - iommu->iopf_queue = NULL; - } - - iommu_free_pages(iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - - return 0; -} - void intel_svm_check(struct intel_iommu *iommu) { if (!pasid_supported(iommu)) @@ -184,7 +98,10 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) static void intel_mm_free_notifier(struct mmu_notifier *mn) { - kfree(container_of(mn, struct dmar_domain, notifier)); + struct dmar_domain *domain = container_of(mn, struct dmar_domain, notifier); + + kfree(domain->qi_batch); + kfree(domain); } static const struct mmu_notifier_ops intel_mmuops = { @@ -193,359 +110,81 @@ static const struct mmu_notifier_ops intel_mmuops = { .free_notifier = intel_mm_free_notifier, }; -static int intel_svm_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) +static int intel_iommu_sva_supported(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct intel_iommu *iommu = info->iommu; - struct mm_struct *mm = domain->mm; - struct dev_pasid_info *dev_pasid; - unsigned long sflags; - unsigned long flags; - int ret = 0; - - dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); - if (!dev_pasid) - return -ENOMEM; - - dev_pasid->dev = dev; - dev_pasid->pasid = pasid; - - ret = 
cache_tag_assign_domain(to_dmar_domain(domain), dev, pasid); - if (ret) - goto free_dev_pasid; - - /* Setup the pasid table: */ - sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; - ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid, - FLPT_DEFAULT_DID, sflags); - if (ret) - goto unassign_tag; - - spin_lock_irqsave(&dmar_domain->lock, flags); - list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); - spin_unlock_irqrestore(&dmar_domain->lock, flags); - - return 0; - -unassign_tag: - cache_tag_unassign_domain(to_dmar_domain(domain), dev, pasid); -free_dev_pasid: - kfree(dev_pasid); - - return ret; -} - -/* Page request queue descriptor */ -struct page_req_dsc { - union { - struct { - u64 type:8; - u64 pasid_present:1; - u64 rsvd:7; - u64 rid:16; - u64 pasid:20; - u64 exe_req:1; - u64 pm_req:1; - u64 rsvd2:10; - }; - u64 qw_0; - }; - union { - struct { - u64 rd_req:1; - u64 wr_req:1; - u64 lpig:1; - u64 prg_index:9; - u64 addr:52; - }; - u64 qw_1; - }; - u64 qw_2; - u64 qw_3; -}; - -static bool is_canonical_address(u64 addr) -{ - int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); - long saddr = (long) addr; - - return (((saddr << shift) >> shift) == saddr); -} - -/** - * intel_drain_pasid_prq - Drain page requests and responses for a pasid - * @dev: target device - * @pasid: pasid for draining - * - * Drain all pending page requests and responses related to @pasid in both - * software and hardware. This is supposed to be called after the device - * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB - * and DevTLB have been invalidated. - * - * It waits until all pending page requests for @pasid in the page fault - * queue are completed by the prq handling thread. Then follow the steps - * described in VT-d spec CH7.10 to drain all page requests and page - * responses pending in the hardware. - */ -void intel_drain_pasid_prq(struct device *dev, u32 pasid) -{ - struct device_domain_info *info; - struct dmar_domain *domain; struct intel_iommu *iommu; - struct qi_desc desc[3]; - struct pci_dev *pdev; - int head, tail; - u16 sid, did; - int qdep; - - info = dev_iommu_priv_get(dev); - if (WARN_ON(!info || !dev_is_pci(dev))) - return; - if (!info->pri_enabled) - return; + if (!info || dmar_disabled) + return -EINVAL; iommu = info->iommu; - domain = info->domain; - pdev = to_pci_dev(dev); - sid = PCI_DEVID(info->bus, info->devfn); - did = domain_id_iommu(domain, iommu); - qdep = pci_ats_queue_depth(pdev); + if (!iommu) + return -EINVAL; - /* - * Check and wait until all pending page requests in the queue are - * handled by the prq handling thread. - */ -prq_retry: - reinit_completion(&iommu->prq_complete); - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - while (head != tail) { - struct page_req_dsc *req; - - req = &iommu->prq[head / sizeof(*req)]; - if (!req->pasid_present || req->pasid != pasid) { - head = (head + sizeof(*req)) & PRQ_RING_MASK; - continue; - } - - wait_for_completion(&iommu->prq_complete); - goto prq_retry; - } + if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) + return -ENODEV; - iopf_queue_flush_dev(dev); + if (!info->pasid_enabled || !info->ats_enabled) + return -EINVAL; /* - * Perform steps described in VT-d spec CH7.10 to drain page - * requests and responses in hardware. + * Devices having device-specific I/O fault handling should not + * support PCI/PRI. The IOMMU side has no means to check the + * capability of device-specific IOPF. 
Therefore, IOMMU can only + * default that if the device driver enables SVA on a non-PRI + * device, it will handle IOPF in its own way. */ - memset(desc, 0, sizeof(desc)); - desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | - QI_IWD_FENCE | - QI_IWD_TYPE; - desc[1].qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | - QI_DEV_EIOTLB_SID(sid) | - QI_DEV_EIOTLB_QDEP(qdep) | - QI_DEIOTLB_TYPE | - QI_DEV_IOTLB_PFSID(info->pfsid); -qi_retry: - reinit_completion(&iommu->prq_complete); - qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); - if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { - wait_for_completion(&iommu->prq_complete); - goto qi_retry; - } -} - -static int prq_to_iommu_prot(struct page_req_dsc *req) -{ - int prot = 0; - - if (req->rd_req) - prot |= IOMMU_FAULT_PERM_READ; - if (req->wr_req) - prot |= IOMMU_FAULT_PERM_WRITE; - if (req->exe_req) - prot |= IOMMU_FAULT_PERM_EXEC; - if (req->pm_req) - prot |= IOMMU_FAULT_PERM_PRIV; - - return prot; -} - -static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, - struct page_req_dsc *desc) -{ - struct iopf_fault event = { }; - - /* Fill in event data for device specific processing */ - event.fault.type = IOMMU_FAULT_PAGE_REQ; - event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; - event.fault.prm.pasid = desc->pasid; - event.fault.prm.grpid = desc->prg_index; - event.fault.prm.perm = prq_to_iommu_prot(desc); - - if (desc->lpig) - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - if (desc->pasid_present) { - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; - } - - iommu_report_device_fault(dev, &event); -} - -static void handle_bad_prq_event(struct intel_iommu *iommu, - struct page_req_dsc *req, int result) -{ - struct qi_desc desc = { }; - - pr_err("%s: Invalid page request: %08llx %08llx\n", - iommu->name, ((unsigned long long *)req)[0], - ((unsigned long long *)req)[1]); - - if (!req->lpig) - return; + if (!info->pri_supported) + return 0; - desc.qw0 = QI_PGRP_PASID(req->pasid) | - QI_PGRP_DID(req->rid) | - QI_PGRP_PASID_P(req->pasid_present) | - QI_PGRP_RESP_CODE(result) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(req->prg_index) | - QI_PGRP_LPIG(req->lpig); + /* Devices supporting PRI should have it enabled. */ + if (!info->pri_enabled) + return -EINVAL; - qi_submit_sync(iommu, &desc, 1, 0); + return 0; } -static irqreturn_t prq_event_thread(int irq, void *d) +static int intel_svm_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { - struct intel_iommu *iommu = d; - struct page_req_dsc *req; - int head, tail, handled; - struct device *dev; - u64 address; + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct mm_struct *mm = domain->mm; + struct dev_pasid_info *dev_pasid; + unsigned long sflags; + int ret = 0; - /* - * Clear PPR bit before reading head/tail registers, to ensure that - * we get a new interrupt if needed. 
- */ - writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); - - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - handled = (head != tail); - while (head != tail) { - req = &iommu->prq[head / sizeof(*req)]; - address = (u64)req->addr << VTD_PAGE_SHIFT; - - if (unlikely(!req->pasid_present)) { - pr_err("IOMMU: %s: Page request without PASID\n", - iommu->name); -bad_req: - handle_bad_prq_event(iommu, req, QI_RESP_INVALID); - goto prq_advance; - } - - if (unlikely(!is_canonical_address(address))) { - pr_err("IOMMU: %s: Address is not canonical\n", - iommu->name); - goto bad_req; - } - - if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { - pr_err("IOMMU: %s: Page request in Privilege Mode\n", - iommu->name); - goto bad_req; - } - - if (unlikely(req->exe_req && req->rd_req)) { - pr_err("IOMMU: %s: Execution request not supported\n", - iommu->name); - goto bad_req; - } - - /* Drop Stop Marker message. No need for a response. */ - if (unlikely(req->lpig && !req->rd_req && !req->wr_req)) - goto prq_advance; - - /* - * If prq is to be handled outside iommu driver via receiver of - * the fault notifiers, we skip the page response here. - */ - mutex_lock(&iommu->iopf_lock); - dev = device_rbtree_find(iommu, req->rid); - if (!dev) { - mutex_unlock(&iommu->iopf_lock); - goto bad_req; - } - - intel_svm_prq_report(iommu, dev, req); - trace_prq_report(iommu, dev, req->qw_0, req->qw_1, - req->qw_2, req->qw_3, - iommu->prq_seq_number++); - mutex_unlock(&iommu->iopf_lock); -prq_advance: - head = (head + sizeof(*req)) & PRQ_RING_MASK; - } + ret = intel_iommu_sva_supported(dev); + if (ret) + return ret; - dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); - /* - * Clear the page request overflow bit and wake up all threads that - * are waiting for the completion of this handling. - */ - if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { - pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", - iommu->name); - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - if (head == tail) { - iopf_queue_discard_partial(iommu->iopf_queue); - writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); - pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", - iommu->name); - } - } + ret = iopf_for_domain_replace(domain, old, dev); + if (ret) + goto out_remove_dev_pasid; - if (!completion_done(&iommu->prq_complete)) - complete(&iommu->prq_complete); + /* Setup the pasid table: */ + sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? 
PASID_FLAG_FL5LP : 0; + sflags |= PASID_FLAG_PWSNP; + ret = __domain_setup_first_level(iommu, dev, pasid, + FLPT_DEFAULT_DID, __pa(mm->pgd), + sflags, old); + if (ret) + goto out_unwind_iopf; - return IRQ_RETVAL(handled); -} + domain_remove_dev_pasid(old, dev, pasid); -void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, - struct iommu_page_response *msg) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; - u8 bus = info->bus, devfn = info->devfn; - struct iommu_fault_page_request *prm; - struct qi_desc desc; - bool pasid_present; - bool last_page; - u16 sid; - - prm = &evt->fault.prm; - sid = PCI_DEVID(bus, devfn); - pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - - desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | - QI_PGRP_PASID_P(pasid_present) | - QI_PGRP_RESP_CODE(msg->code) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); - desc.qw2 = 0; - desc.qw3 = 0; - - qi_submit_sync(iommu, &desc, 1, 0); + return 0; +out_unwind_iopf: + iopf_for_domain_replace(old, domain, dev); +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); + return ret; } static void intel_svm_domain_free(struct iommu_domain *domain) @@ -567,12 +206,15 @@ struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct dmar_domain *domain; int ret; + ret = intel_iommu_sva_supported(dev); + if (ret) + return ERR_PTR(ret); + domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return ERR_PTR(-ENOMEM); domain->domain.ops = &intel_svm_domain_ops; - domain->use_first_level = true; INIT_LIST_HEAD(&domain->dev_pasids); INIT_LIST_HEAD(&domain->cache_tags); spin_lock_init(&domain->cache_lock); diff --git a/drivers/iommu/intel/trace.h b/drivers/iommu/intel/trace.h index 9defdae6ebae..6311ba3f1691 100644 --- a/drivers/iommu/intel/trace.h +++ b/drivers/iommu/intel/trace.h @@ -130,11 +130,6 @@ DEFINE_EVENT(cache_tag_log, cache_tag_unassign, TP_ARGS(tag) ); -DEFINE_EVENT(cache_tag_log, cache_tag_flush_all, - TP_PROTO(struct cache_tag *tag), - TP_ARGS(tag) -); - DECLARE_EVENT_CLASS(cache_tag_flush, TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, unsigned long addr, unsigned long pages, unsigned long mask), |
